summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig106
-rw-r--r--fs/Makefile3
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/affs/inode.c3
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/autofs/inode.c2
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/dev-ioctl.c75
-rw-r--r--fs/autofs4/expire.c4
-rw-r--r--fs/autofs4/inode.c18
-rw-r--r--fs/autofs4/waitq.c8
-rw-r--r--fs/bfs/inode.c45
-rw-r--r--fs/binfmt_elf.c12
-rw-r--r--fs/binfmt_elf_fdpic.c35
-rw-r--r--fs/binfmt_flat.c34
-rw-r--r--fs/binfmt_misc.c5
-rw-r--r--fs/bio.c36
-rw-r--r--fs/block_dev.c31
-rw-r--r--fs/btrfs/Makefile25
-rw-r--r--fs/btrfs/acl.c351
-rw-r--r--fs/btrfs/async-thread.c419
-rw-r--r--fs/btrfs/async-thread.h101
-rw-r--r--fs/btrfs/btrfs_inode.h131
-rw-r--r--fs/btrfs/compat.h7
-rw-r--r--fs/btrfs/compression.c709
-rw-r--r--fs/btrfs/compression.h47
-rw-r--r--fs/btrfs/crc32c.h29
-rw-r--r--fs/btrfs/ctree.c3953
-rw-r--r--fs/btrfs/ctree.h2129
-rw-r--r--fs/btrfs/dir-item.c386
-rw-r--r--fs/btrfs/disk-io.c2343
-rw-r--r--fs/btrfs/disk-io.h102
-rw-r--r--fs/btrfs/export.c203
-rw-r--r--fs/btrfs/export.h19
-rw-r--r--fs/btrfs/extent-tree.c5986
-rw-r--r--fs/btrfs/extent_io.c3717
-rw-r--r--fs/btrfs/extent_io.h269
-rw-r--r--fs/btrfs/extent_map.c351
-rw-r--r--fs/btrfs/extent_map.h62
-rw-r--r--fs/btrfs/file-item.c831
-rw-r--r--fs/btrfs/file.c1288
-rw-r--r--fs/btrfs/free-space-cache.c495
-rw-r--r--fs/btrfs/hash.h27
-rw-r--r--fs/btrfs/inode-item.c206
-rw-r--r--fs/btrfs/inode-map.c144
-rw-r--r--fs/btrfs/inode.c5035
-rw-r--r--fs/btrfs/ioctl.c1132
-rw-r--r--fs/btrfs/ioctl.h67
-rw-r--r--fs/btrfs/locking.c88
-rw-r--r--fs/btrfs/locking.h27
-rw-r--r--fs/btrfs/ordered-data.c730
-rw-r--r--fs/btrfs/ordered-data.h158
-rw-r--r--fs/btrfs/orphan.c67
-rw-r--r--fs/btrfs/print-tree.c216
-rw-r--r--fs/btrfs/print-tree.h23
-rw-r--r--fs/btrfs/ref-cache.c230
-rw-r--r--fs/btrfs/ref-cache.h77
-rw-r--r--fs/btrfs/root-tree.c366
-rw-r--r--fs/btrfs/struct-funcs.c139
-rw-r--r--fs/btrfs/super.c720
-rw-r--r--fs/btrfs/sysfs.c269
-rw-r--r--fs/btrfs/transaction.c1097
-rw-r--r--fs/btrfs/transaction.h106
-rw-r--r--fs/btrfs/tree-defrag.c147
-rw-r--r--fs/btrfs/tree-log.c2898
-rw-r--r--fs/btrfs/tree-log.h41
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/version.sh43
-rw-r--r--fs/btrfs/volumes.c3218
-rw-r--r--fs/btrfs/volumes.h162
-rw-r--r--fs/btrfs/xattr.c322
-rw-r--r--fs/btrfs/xattr.h39
-rw-r--r--fs/btrfs/zlib.c632
-rw-r--r--fs/buffer.c5
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/cifs/file.c2
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/coda/file.c12
-rw-r--r--fs/coda/sysctl.c5
-rw-r--r--fs/compat.c6
-rw-r--r--fs/configfs/inode.c3
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/dcache.c14
-rw-r--r--fs/debugfs/file.c32
-rw-r--r--fs/debugfs/inode.c3
-rw-r--r--fs/devpts/inode.c4
-rw-r--r--fs/direct-io.c13
-rw-r--r--fs/dlm/ast.c56
-rw-r--r--fs/dlm/ast.h4
-rw-r--r--fs/dlm/debug_fs.c310
-rw-r--r--fs/dlm/dir.c18
-rw-r--r--fs/dlm/dlm_internal.h4
-rw-r--r--fs/dlm/lock.c31
-rw-r--r--fs/dlm/lowcomms.c8
-rw-r--r--fs/dlm/memory.c6
-rw-r--r--fs/dlm/midcomms.c2
-rw-r--r--fs/dlm/netlink.c1
-rw-r--r--fs/dlm/user.c4
-rw-r--r--fs/dlm/user.h2
-rw-r--r--fs/dquot.c438
-rw-r--r--fs/ecryptfs/crypto.c514
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h105
-rw-r--r--fs/ecryptfs/file.c45
-rw-r--r--fs/ecryptfs/inode.c300
-rw-r--r--fs/ecryptfs/keystore.c651
-rw-r--r--fs/ecryptfs/main.c126
-rw-r--r--fs/ecryptfs/messaging.c4
-rw-r--r--fs/ecryptfs/miscdev.c18
-rw-r--r--fs/ecryptfs/mmap.c2
-rw-r--r--fs/exec.c34
-rw-r--r--fs/ext2/ialloc.c8
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/ioctl.c3
-rw-r--r--fs/ext2/super.c10
-rw-r--r--fs/ext3/hash.c77
-rw-r--r--fs/ext3/ialloc.c8
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext3/ioctl.c3
-rw-r--r--fs/ext3/namei.c18
-rw-r--r--fs/ext3/super.c59
-rw-r--r--fs/ext4/balloc.c293
-rw-r--r--fs/ext4/bitmap.c5
-rw-r--r--fs/ext4/dir.c10
-rw-r--r--fs/ext4/ext4.h158
-rw-r--r--fs/ext4/ext4_extents.h5
-rw-r--r--fs/ext4/ext4_i.h16
-rw-r--r--fs/ext4/ext4_jbd2.c83
-rw-r--r--fs/ext4/ext4_jbd2.h87
-rw-r--r--fs/ext4/ext4_sb.h12
-rw-r--r--fs/ext4/extents.c62
-rw-r--r--fs/ext4/file.c3
-rw-r--r--fs/ext4/hash.c77
-rw-r--r--fs/ext4/ialloc.c324
-rw-r--r--fs/ext4/inode.c315
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/mballoc.c629
-rw-r--r--fs/ext4/mballoc.h71
-rw-r--r--fs/ext4/migrate.c19
-rw-r--r--fs/ext4/namei.c99
-rw-r--r--fs/ext4/resize.c113
-rw-r--r--fs/ext4/super.c637
-rw-r--r--fs/ext4/xattr.c25
-rw-r--r--fs/filesystems.c23
-rw-r--r--fs/fs-writeback.c92
-rw-r--r--fs/fuse/control.c6
-rw-r--r--fs/fuse/dev.c113
-rw-r--r--fs/fuse/dir.c48
-rw-r--r--fs/fuse/file.c461
-rw-r--r--fs/fuse/fuse_i.h83
-rw-r--r--fs/fuse/inode.c157
-rw-r--r--fs/gfs2/Kconfig2
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c2
-rw-r--r--fs/gfs2/bmap.c77
-rw-r--r--fs/gfs2/bmap.h34
-rw-r--r--fs/gfs2/daemon.c136
-rw-r--r--fs/gfs2/daemon.h17
-rw-r--r--fs/gfs2/dir.c62
-rw-r--r--fs/gfs2/dir.h1
-rw-r--r--fs/gfs2/eattr.c40
-rw-r--r--fs/gfs2/glock.c303
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c56
-rw-r--r--fs/gfs2/incore.h55
-rw-r--r--fs/gfs2/inode.c53
-rw-r--r--fs/gfs2/inode.h13
-rw-r--r--fs/gfs2/locking/dlm/mount.c12
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c16
-rw-r--r--fs/gfs2/main.c15
-rw-r--r--fs/gfs2/mount.c29
-rw-r--r--fs/gfs2/ops_address.c35
-rw-r--r--fs/gfs2/ops_dentry.c2
-rw-r--r--fs/gfs2/ops_dentry.h17
-rw-r--r--fs/gfs2/ops_export.c5
-rw-r--r--fs/gfs2/ops_file.c24
-rw-r--r--fs/gfs2/ops_fstype.c125
-rw-r--r--fs/gfs2/ops_fstype.h19
-rw-r--r--fs/gfs2/ops_inode.c75
-rw-r--r--fs/gfs2/ops_inode.h25
-rw-r--r--fs/gfs2/ops_super.c149
-rw-r--r--fs/gfs2/ops_super.h17
-rw-r--r--fs/gfs2/quota.c113
-rw-r--r--fs/gfs2/quota.h24
-rw-r--r--fs/gfs2/recovery.c48
-rw-r--r--fs/gfs2/recovery.h14
-rw-r--r--fs/gfs2/rgrp.c58
-rw-r--r--fs/gfs2/super.c246
-rw-r--r--fs/gfs2/super.h13
-rw-r--r--fs/gfs2/sys.c66
-rw-r--r--fs/gfs2/sys.h4
-rw-r--r--fs/gfs2/util.c1
-rw-r--r--fs/gfs2/util.h1
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hugetlbfs/inode.c13
-rw-r--r--fs/inode.c15
-rw-r--r--fs/ioctl.c44
-rw-r--r--fs/ioprio.c3
-rw-r--r--fs/isofs/inode.c6
-rw-r--r--fs/jbd/commit.c15
-rw-r--r--fs/jbd/transaction.c39
-rw-r--r--fs/jbd2/checkpoint.c24
-rw-r--r--fs/jbd2/commit.c67
-rw-r--r--fs/jbd2/journal.c143
-rw-r--r--fs/jbd2/transaction.c107
-rw-r--r--fs/jffs2/compr_rubin.c120
-rw-r--r--fs/jffs2/erase.c5
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jffs2/nodelist.h3
-rw-r--r--fs/jfs/jfs_imap.c10
-rw-r--r--fs/libfs.c7
-rw-r--r--fs/lockd/clntproc.c7
-rw-r--r--fs/lockd/host.c170
-rw-r--r--fs/lockd/mon.c569
-rw-r--r--fs/lockd/svc.c72
-rw-r--r--fs/lockd/svc4proc.c13
-rw-r--r--fs/lockd/svcproc.c13
-rw-r--r--fs/lockd/svcsubs.c1
-rw-r--r--fs/lockd/xdr.c5
-rw-r--r--fs/lockd/xdr4.c5
-rw-r--r--fs/minix/dir.c2
-rw-r--r--fs/mpage.c6
-rw-r--r--fs/namei.c45
-rw-r--r--fs/ncpfs/getopt.c1
-rw-r--r--fs/ncpfs/ioctl.c2
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfsd/auth.c4
-rw-r--r--fs/nfsd/nfs4callback.c3
-rw-r--r--fs/nfsd/nfs4proc.c5
-rw-r--r--fs/nfsd/nfs4recover.c2
-rw-r--r--fs/nfsd/nfs4state.c79
-rw-r--r--fs/nfsd/nfs4xdr.c2
-rw-r--r--fs/nfsd/nfsctl.c479
-rw-r--r--fs/nfsd/nfsfh.c36
-rw-r--r--fs/nfsd/nfsproc.c1
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/ntfs/inode.c3
-rw-r--r--fs/ocfs2/Makefile7
-rw-r--r--fs/ocfs2/acl.c479
-rw-r--r--fs/ocfs2/acl.h58
-rw-r--r--fs/ocfs2/alloc.c712
-rw-r--r--fs/ocfs2/alloc.h30
-rw-r--r--fs/ocfs2/aops.c59
-rw-r--r--fs/ocfs2/blockcheck.c477
-rw-r--r--fs/ocfs2/blockcheck.h82
-rw-r--r--fs/ocfs2/buffer_head_io.c32
-rw-r--r--fs/ocfs2/buffer_head_io.h27
-rw-r--r--fs/ocfs2/cluster/heartbeat.c2
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/dir.c399
-rw-r--r--fs/ocfs2/dir.h2
-rw-r--r--fs/ocfs2/dlm/dlmast.c52
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h3
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c53
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1
-rw-r--r--fs/ocfs2/dlm/dlmfs.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c42
-rw-r--r--fs/ocfs2/dlm/dlmthread.c3
-rw-r--r--fs/ocfs2/dlmglue.c172
-rw-r--r--fs/ocfs2/dlmglue.h19
-rw-r--r--fs/ocfs2/extent_map.c96
-rw-r--r--fs/ocfs2/extent_map.h24
-rw-r--r--fs/ocfs2/file.c211
-rw-r--r--fs/ocfs2/file.h3
-rw-r--r--fs/ocfs2/inode.c175
-rw-r--r--fs/ocfs2/inode.h18
-rw-r--r--fs/ocfs2/journal.c364
-rw-r--r--fs/ocfs2/journal.h128
-rw-r--r--fs/ocfs2/localalloc.c26
-rw-r--r--fs/ocfs2/namei.c318
-rw-r--r--fs/ocfs2/ocfs2.h46
-rw-r--r--fs/ocfs2/ocfs2_fs.h213
-rw-r--r--fs/ocfs2/ocfs2_jbd_compat.h82
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/quota.h119
-rw-r--r--fs/ocfs2/quota_global.c1025
-rw-r--r--fs/ocfs2/quota_local.c1253
-rw-r--r--fs/ocfs2/resize.c76
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/suballoc.c363
-rw-r--r--fs/ocfs2/suballoc.h18
-rw-r--r--fs/ocfs2/super.c328
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/xattr.c2984
-rw-r--r--fs/ocfs2/xattr.h45
-rw-r--r--fs/omfs/inode.c1
-rw-r--r--fs/open.c2
-rw-r--r--fs/openpromfs/inode.c3
-rw-r--r--fs/partitions/check.c12
-rw-r--r--fs/pipe.c7
-rw-r--r--fs/proc/base.c235
-rw-r--r--fs/proc/generic.c8
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/meminfo.c6
-rw-r--r--fs/proc/nommu.c71
-rw-r--r--fs/proc/proc_net.c2
-rw-r--r--fs/proc/proc_sysctl.c1
-rw-r--r--fs/proc/root.c8
-rw-r--r--fs/proc/task_mmu.c8
-rw-r--r--fs/proc/task_nommu.c122
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/quota.c11
-rw-r--r--fs/quota_tree.c645
-rw-r--r--fs/quota_tree.h25
-rw-r--r--fs/quota_v1.c28
-rw-r--r--fs/quota_v2.c631
-rw-r--r--fs/quotaio_v1.h33
-rw-r--r--fs/quotaio_v2.h60
-rw-r--r--fs/ramfs/file-nommu.c21
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/read_write.c13
-rw-r--r--fs/reiserfs/inode.c15
-rw-r--r--fs/reiserfs/super.c10
-rw-r--r--fs/romfs/inode.c13
-rw-r--r--fs/select.c76
-rw-r--r--fs/smbfs/file.c2
-rw-r--r--fs/splice.c1
-rw-r--r--fs/squashfs/Makefile8
-rw-r--r--fs/squashfs/block.c274
-rw-r--r--fs/squashfs/cache.c412
-rw-r--r--fs/squashfs/dir.c235
-rw-r--r--fs/squashfs/export.c155
-rw-r--r--fs/squashfs/file.c502
-rw-r--r--fs/squashfs/fragment.c98
-rw-r--r--fs/squashfs/id.c94
-rw-r--r--fs/squashfs/inode.c346
-rw-r--r--fs/squashfs/namei.c242
-rw-r--r--fs/squashfs/squashfs.h90
-rw-r--r--fs/squashfs/squashfs_fs.h381
-rw-r--r--fs/squashfs/squashfs_fs_i.h45
-rw-r--r--fs/squashfs/squashfs_fs_sb.h76
-rw-r--r--fs/squashfs/super.c440
-rw-r--r--fs/squashfs/symlink.c118
-rw-r--r--fs/stat.c2
-rw-r--r--fs/super.c12
-rw-r--r--fs/sync.c50
-rw-r--r--fs/sysfs/inode.c3
-rw-r--r--fs/ubifs/Kconfig2
-rw-r--r--fs/ubifs/budget.c4
-rw-r--r--fs/ubifs/file.c9
-rw-r--r--fs/ubifs/gc.c2
-rw-r--r--fs/ubifs/journal.c2
-rw-r--r--fs/ubifs/shrinker.c2
-rw-r--r--fs/ubifs/super.c9
-rw-r--r--fs/xattr.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c2
348 files changed, 63099 insertions, 7220 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index ff0e8198020..51307b0fdf0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -189,6 +189,8 @@ config OCFS2_FS
select CONFIGFS_FS
select JBD2
select CRC32
+ select QUOTA
+ select QUOTA_TREE
help
OCFS2 is a general purpose extent based shared disk cluster file
system with many similarities to ext3. It supports 64 bit inode
@@ -258,15 +260,33 @@ config OCFS2_DEBUG_FS
this option for debugging only as it is likely to decrease
performance of the filesystem.
-config OCFS2_COMPAT_JBD
- bool "Use JBD for compatibility"
+config OCFS2_FS_POSIX_ACL
+ bool "OCFS2 POSIX Access Control Lists"
depends on OCFS2_FS
+ select FS_POSIX_ACL
default n
- select JBD
help
- The ocfs2 filesystem now uses JBD2 for its journalling. JBD2
- is backwards compatible with JBD. It is safe to say N here.
- However, if you really want to use the original JBD, say Y here.
+ Posix Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+config BTRFS_FS
+ tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
+ depends on EXPERIMENTAL
+ select LIBCRC32C
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
+ help
+ Btrfs is a new filesystem with extents, writable snapshotting,
+ support for multiple devices and many more features.
+
+ Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
+ FINALIZED. You should say N here unless you are interested in
+ testing Btrfs with non-critical data.
+
+ To compile this file system support as a module, choose M here. The
+ module will be called btrfs.
+
+ If unsure, say N.
endif # BLOCK
@@ -303,6 +323,10 @@ config PRINT_QUOTA_WARNING
Note that this behavior is currently deprecated and may go away in
future. Please use notification via netlink socket instead.
+# Generic support for tree structured quota files. Seleted when needed.
+config QUOTA_TREE
+ tristate
+
config QFMT_V1
tristate "Old quota format support"
depends on QUOTA
@@ -314,6 +338,7 @@ config QFMT_V1
config QFMT_V2
tristate "Quota format v2 support"
depends on QUOTA
+ select QUOTA_TREE
help
This quota format allows using quotas with 32-bit UIDs/GIDs. If you
need this functionality say Y here.
@@ -715,7 +740,20 @@ config CONFIGFS_FS
endmenu
-menu "Miscellaneous filesystems"
+menuconfig MISC_FILESYSTEMS
+ bool "Miscellaneous filesystems"
+ default y
+ ---help---
+ Say Y here to get to see options for various miscellaneous
+ filesystems, such as filesystems that came from other
+ operating systems.
+
+ This option alone does not add any kernel code.
+
+ If you say N, all options in this submenu will be skipped and
+ disabled; if unsure, say Y here.
+
+if MISC_FILESYSTEMS
config ADFS_FS
tristate "ADFS file system support (EXPERIMENTAL)"
@@ -894,6 +932,58 @@ config CRAMFS
If unsure, say N.
+config SQUASHFS
+ tristate "SquashFS 4.0 - Squashed file system support"
+ depends on BLOCK
+ select ZLIB_INFLATE
+ help
+ Saying Y here includes support for SquashFS 4.0 (a Compressed
+ Read-Only File System). Squashfs is a highly compressed read-only
+ filesystem for Linux. It uses zlib compression to compress both
+ files, inodes and directories. Inodes in the system are very small
+ and all blocks are packed to minimise data overhead. Block sizes
+ greater than 4K are supported up to a maximum of 1 Mbytes (default
+ block size 128K). SquashFS 4.0 supports 64 bit filesystems and files
+ (larger than 4GB), full uid/gid information, hard links and
+ timestamps.
+
+ Squashfs is intended for general read-only filesystem use, for
+ archival use (i.e. in cases where a .tar.gz file may be used), and in
+ embedded systems where low overhead is needed. Further information
+ and tools are available from http://squashfs.sourceforge.net.
+
+ If you want to compile this as a module ( = code which can be
+ inserted in and removed from the running kernel whenever you want),
+ say M here and read <file:Documentation/modules.txt>. The module
+ will be called squashfs. Note that the root file system (the one
+ containing the directory /) cannot be compiled as a module.
+
+ If unsure, say N.
+
+config SQUASHFS_EMBEDDED
+
+ bool "Additional option for memory-constrained systems"
+ depends on SQUASHFS
+ default n
+ help
+ Saying Y here allows you to specify cache size.
+
+ If unsure, say N.
+
+config SQUASHFS_FRAGMENT_CACHE_SIZE
+ int "Number of fragments cached" if SQUASHFS_EMBEDDED
+ depends on SQUASHFS
+ default "3"
+ help
+ By default SquashFS caches the last 3 fragments read from
+ the filesystem. Increasing this amount may mean SquashFS
+ has to re-read fragments less often from disk, at the expense
+ of extra system memory. Decreasing this amount will mean
+ SquashFS uses less memory at the expense of extra reads from disk.
+
+ Note there must be at least one cached fragment. Anything
+ much more than three will probably not make much difference.
+
config VXFS_FS
tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
depends on BLOCK
@@ -1085,7 +1175,7 @@ config UFS_DEBUG
Y here. This will result in _many_ additional debugging messages to be
written to the system log.
-endmenu
+endif # MISC_FILESYSTEMS
menuconfig NETWORK_FILESYSTEMS
bool "Network File Systems"
diff --git a/fs/Makefile b/fs/Makefile
index e6f423d1d22..38bc735c67a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
obj-$(CONFIG_QUOTA) += dquot.o
obj-$(CONFIG_QFMT_V1) += quota_v1.o
obj-$(CONFIG_QFMT_V2) += quota_v2.o
+obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
obj-$(CONFIG_QUOTACTL) += quota.o
obj-$(CONFIG_PROC_FS) += proc/
@@ -73,6 +74,7 @@ obj-$(CONFIG_JBD) += jbd/
obj-$(CONFIG_JBD2) += jbd2/
obj-$(CONFIG_EXT2_FS) += ext2/
obj-$(CONFIG_CRAMFS) += cramfs/
+obj-$(CONFIG_SQUASHFS) += squashfs/
obj-y += ramfs/
obj-$(CONFIG_HUGETLBFS) += hugetlbfs/
obj-$(CONFIG_CODA_FS) += coda/
@@ -118,4 +120,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/
obj-$(CONFIG_HPPFS) += hppfs/
obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
+obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 1377b1240b6..9246cb4aa01 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,7 +628,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
}
index = pos >> PAGE_CACHE_SHIFT;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 415d9c67ac1..3c4ec7d864c 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
goto bad_inode;
#else
inode->i_mode |= S_IFDIR;
- inode->i_op = NULL;
- inode->i_fop = NULL;
+ /* ... and leave ->i_op and ->i_fop pointing to empty */
break;
#endif
case ST_LINKFILE:
diff --git a/fs/afs/write.c b/fs/afs/write.c
index d6b85dab35f..3fb36d43362 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -144,7 +144,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
candidate->state = AFS_WBACK_PENDING;
init_waitqueue_head(&candidate->waitq);
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
kfree(candidate);
return -ENOMEM;
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index c773680d5c6..e1734f2d6e2 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
inode->i_nlink = 2;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- inode->i_blocks = 0;
if (ino == AUTOFS_ROOT_INO) {
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
inode->i_op = &autofs_root_inode_operations;
inode->i_fop = &autofs_root_operations;
- inode->i_uid = inode->i_gid = 0; /* Changed in read_super */
goto done;
}
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index e0f16da00e5..a76803108d0 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -25,8 +25,6 @@
#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION)
#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11)
-#define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
-
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/time.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 63b7c7afe8d..025e105bffe 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
/*
* Check sanity of parameter control fields and if a path is present
- * check that it has a "/" and is terminated.
+ * check that it is terminated and contains at least one "/".
*/
static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
{
@@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
}
if (param->size > sizeof(*param)) {
- err = check_name(param->path);
+ err = invalid_str(param->path,
+ (void *) ((size_t) param + param->size));
if (err) {
- AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
- cmd);
+ AUTOFS_WARN(
+ "path string terminator missing for cmd(0x%08x)",
+ cmd);
goto out;
}
- err = invalid_str(param->path,
- (void *) ((size_t) param + param->size));
+ err = check_name(param->path);
if (err) {
AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
cmd);
@@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp,
struct autofs_sb_info *sbi,
struct autofs_dev_ioctl *param)
{
- param->arg1 = sbi->version;
+ param->protover.version = sbi->version;
return 0;
}
@@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
struct autofs_sb_info *sbi,
struct autofs_dev_ioctl *param)
{
- param->arg1 = sbi->sub_version;
+ param->protosubver.sub_version = sbi->sub_version;
return 0;
}
@@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
int err, fd;
/* param->path has already been checked */
- if (!param->arg1)
+ if (!param->openmount.devid)
return -EINVAL;
param->ioctlfd = -1;
path = param->path;
- devid = param->arg1;
+ devid = param->openmount.devid;
err = 0;
fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp,
{
autofs_wqt_t token;
- token = (autofs_wqt_t) param->arg1;
+ token = (autofs_wqt_t) param->ready.token;
return autofs4_wait_release(sbi, token, 0);
}
@@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp,
autofs_wqt_t token;
int status;
- token = (autofs_wqt_t) param->arg1;
- status = param->arg2 ? param->arg2 : -ENOENT;
+ token = (autofs_wqt_t) param->fail.token;
+ status = param->fail.status ? param->fail.status : -ENOENT;
return autofs4_wait_release(sbi, token, status);
}
@@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
int pipefd;
int err = 0;
- if (param->arg1 == -1)
+ if (param->setpipefd.pipefd == -1)
return -EINVAL;
- pipefd = param->arg1;
+ pipefd = param->setpipefd.pipefd;
mutex_lock(&sbi->wq_mutex);
if (!sbi->catatonic) {
@@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
{
unsigned long timeout;
- timeout = param->arg1;
- param->arg1 = sbi->exp_timeout / HZ;
+ timeout = param->timeout.timeout;
+ param->timeout.timeout = sbi->exp_timeout / HZ;
sbi->exp_timeout = timeout * HZ;
return 0;
}
@@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
path = param->path;
devid = sbi->sb->s_dev;
- param->arg1 = param->arg2 = -1;
+ param->requester.uid = param->requester.gid = -1;
/* Get nameidata of the parent directory */
err = path_lookup(path, LOOKUP_PARENT, &nd);
@@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
err = 0;
autofs4_expire_wait(nd.path.dentry);
spin_lock(&sbi->fs_lock);
- param->arg1 = ino->uid;
- param->arg2 = ino->gid;
+ param->requester.uid = ino->uid;
+ param->requester.gid = ino->gid;
spin_unlock(&sbi->fs_lock);
}
@@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp,
int err = -EAGAIN;
int how;
- how = param->arg1;
+ how = param->expire.how;
mnt = fp->f_path.mnt;
- if (sbi->type & AUTOFS_TYPE_TRIGGER)
+ if (autofs_type_trigger(sbi->type))
dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
else
dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
@@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
struct autofs_sb_info *sbi,
struct autofs_dev_ioctl *param)
{
- param->arg1 = 0;
+ param->askumount.may_umount = 0;
if (may_umount(fp->f_path.mnt))
- param->arg1 = 1;
+ param->askumount.may_umount = 1;
return 0;
}
@@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
struct nameidata nd;
const char *path;
unsigned int type;
+ unsigned int devid, magic;
int err = -ENOENT;
if (param->size <= sizeof(*param)) {
@@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
}
path = param->path;
- type = param->arg1;
+ type = param->ismountpoint.in.type;
- param->arg1 = 0;
- param->arg2 = 0;
+ param->ismountpoint.out.devid = devid = 0;
+ param->ismountpoint.out.magic = magic = 0;
if (!fp || param->ioctlfd == -1) {
- if (type == AUTOFS_TYPE_ANY) {
+ if (autofs_type_any(type)) {
struct super_block *sb;
err = path_lookup(path, LOOKUP_FOLLOW, &nd);
@@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
goto out;
sb = nd.path.dentry->d_sb;
- param->arg1 = new_encode_dev(sb->s_dev);
+ devid = new_encode_dev(sb->s_dev);
} else {
struct autofs_info *ino;
@@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
goto out_release;
ino = autofs4_dentry_ino(nd.path.dentry);
- param->arg1 = autofs4_get_dev(ino->sbi);
+ devid = autofs4_get_dev(ino->sbi);
}
err = 0;
if (nd.path.dentry->d_inode &&
nd.path.mnt->mnt_root == nd.path.dentry) {
err = 1;
- param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic;
+ magic = nd.path.dentry->d_inode->i_sb->s_magic;
}
} else {
- dev_t devid = new_encode_dev(sbi->sb->s_dev);
+ dev_t dev = autofs4_get_dev(sbi);
err = path_lookup(path, LOOKUP_PARENT, &nd);
if (err)
goto out;
- err = autofs_dev_ioctl_find_super(&nd, devid);
+ err = autofs_dev_ioctl_find_super(&nd, dev);
if (err)
goto out_release;
- param->arg1 = autofs4_get_dev(sbi);
+ devid = dev;
err = have_submounts(nd.path.dentry);
if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
struct inode *inode = nd.path.dentry->d_inode;
- param->arg2 = inode->i_sb->s_magic;
+ magic = inode->i_sb->s_magic;
}
}
}
+ param->ismountpoint.out.devid = devid;
+ param->ismountpoint.out.magic = magic;
+
out_release:
path_put(&nd.path);
out:
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4b6fb3f628c..e3bd50776f9 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
/* This is an autofs submount, we can't expire it */
- if (sbi->type == AUTOFS_TYPE_INDIRECT)
+ if (autofs_type_indirect(sbi->type))
goto done;
/*
@@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
if (arg && get_user(do_now, arg))
return -EFAULT;
- if (sbi->type & AUTOFS_TYPE_TRIGGER)
+ if (autofs_type_trigger(sbi->type))
dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
else
dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7b19802cfef..716e12b627b 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, ",minproto=%d", sbi->min_proto);
seq_printf(m, ",maxproto=%d", sbi->max_proto);
- if (sbi->type & AUTOFS_TYPE_OFFSET)
+ if (autofs_type_offset(sbi->type))
seq_printf(m, ",offset");
- else if (sbi->type & AUTOFS_TYPE_DIRECT)
+ else if (autofs_type_direct(sbi->type))
seq_printf(m, ",direct");
else
seq_printf(m, ",indirect");
@@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
*maxproto = option;
break;
case Opt_indirect:
- *type = AUTOFS_TYPE_INDIRECT;
+ set_autofs_type_indirect(type);
break;
case Opt_direct:
- *type = AUTOFS_TYPE_DIRECT;
+ set_autofs_type_direct(type);
break;
case Opt_offset:
- *type = AUTOFS_TYPE_OFFSET;
+ set_autofs_type_offset(type);
break;
default:
return 1;
@@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->sb = s;
sbi->version = 0;
sbi->sub_version = 0;
- sbi->type = AUTOFS_TYPE_INDIRECT;
+ set_autofs_type_indirect(&sbi->type);
sbi->min_proto = 0;
sbi->max_proto = 0;
mutex_init(&sbi->wq_mutex);
@@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
}
root_inode->i_fop = &autofs4_root_operations;
- root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ?
+ root_inode->i_op = autofs_type_trigger(sbi->type) ?
&autofs4_direct_root_inode_operations :
&autofs4_indirect_root_inode_operations;
@@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
if (sb->s_root) {
inode->i_uid = sb->s_root->d_inode->i_uid;
inode->i_gid = sb->s_root->d_inode->i_gid;
- } else {
- inode->i_uid = 0;
- inode->i_gid = 0;
}
- inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
if (S_ISDIR(inf->mode)) {
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e02cc8ae5eb..eeb24684590 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
* is very similar for indirect mounts except only dentrys
* in the root of the autofs file system may be negative.
*/
- if (sbi->type & AUTOFS_TYPE_TRIGGER)
+ if (autofs_type_trigger(sbi->type))
return -ENOENT;
else if (!IS_ROOT(dentry->d_parent))
return -ENOENT;
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
return -ENOMEM;
/* If this is a direct mount request create a dummy name */
- if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER)
+ if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
qstr.len = sprintf(name, "%p", dentry);
else {
qstr.len = autofs4_getpath(sbi, dentry, &name);
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
type = autofs_ptype_expire_multi;
} else {
if (notify == NFY_MOUNT)
- type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+ type = autofs_type_trigger(sbi->type) ?
autofs_ptype_missing_direct :
autofs_ptype_missing_indirect;
else
- type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+ type = autofs_type_trigger(sbi->type) ?
autofs_ptype_expire_direct :
autofs_ptype_expire_indirect;
}
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 0ed57b5ee01..cc4062d12ca 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s)
{
struct bfs_sb_info *info = BFS_SB(s);
+ if (!info)
+ return;
+
brelse(info->si_sbh);
mutex_destroy(&info->bfs_lock);
kfree(info->si_imap);
@@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
unsigned i, imap_len;
struct bfs_sb_info *info;
long ret = -EINVAL;
+ unsigned long i_sblock, i_eblock, i_eoff, s_size;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
@@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
s->s_magic = BFS_MAGIC;
info->si_sbh = bh;
+
+ if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
+ printf("Superblock is corrupted\n");
+ goto out;
+ }
+
info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
sizeof(struct bfs_inode)
+ BFS_ROOT_INO - 1;
@@ -380,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
- le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
info->si_freei = 0;
info->si_lf_eblk = 0;
+
+ /* can we read the last block? */
+ bh = sb_bread(s, info->si_blocks - 1);
+ if (!bh) {
+ printf("Last block not available: %lu\n", info->si_blocks - 1);
+ iput(inode);
+ ret = -EIO;
+ kfree(info->si_imap);
+ goto out;
+ }
+ brelse(bh);
+
bh = NULL;
for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
struct bfs_inode *di;
@@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
di = (struct bfs_inode *)bh->b_data + off;
+ /* test if filesystem is not corrupted */
+
+ i_eoff = le32_to_cpu(di->i_eoffset);
+ i_sblock = le32_to_cpu(di->i_sblock);
+ i_eblock = le32_to_cpu(di->i_eblock);
+ s_size = le32_to_cpu(bfs_sb->s_end);
+
+ if (i_sblock > info->si_blocks ||
+ i_eblock > info->si_blocks ||
+ i_sblock > i_eblock ||
+ i_eoff > s_size ||
+ i_sblock * BFS_BSIZE > i_eoff) {
+
+ printf("Inode 0x%08x corrupted\n", i);
+
+ brelse(bh);
+ s->s_root = NULL;
+ kfree(info->si_imap);
+ kfree(info);
+ s->s_fs_info = NULL;
+ return -EIO;
+ }
+
if (!di->i_ino) {
info->si_freei++;
continue;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af767..e3ff2b9e602 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
elf_addr_t __user *sp;
elf_addr_t __user *u_platform;
elf_addr_t __user *u_base_platform;
+ elf_addr_t __user *u_rand_bytes;
const char *k_platform = ELF_PLATFORM;
const char *k_base_platform = ELF_BASE_PLATFORM;
+ unsigned char k_rand_bytes[16];
int items;
elf_addr_t *elf_info;
int ei_index = 0;
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
return -EFAULT;
}
+ /*
+ * Generate 16 random bytes for userspace PRNG seeding.
+ */
+ get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+ u_rand_bytes = (elf_addr_t __user *)
+ STACK_ALLOC(p, sizeof(k_rand_bytes));
+ if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
+ return -EFAULT;
+
/* Create the ELF interpreter info */
elf_info = (elf_addr_t *)current->mm->saved_auxv;
/* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
NEW_AUX_ENT(AT_GID, cred->gid);
NEW_AUX_ENT(AT_EGID, cred->egid);
NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+ NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
if (k_platform) {
NEW_AUX_ENT(AT_PLATFORM,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index aa5b43205e3..f3e72c5c19f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -168,9 +168,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
struct elf_fdpic_params exec_params, interp_params;
struct elf_phdr *phdr;
unsigned long stack_size, entryaddr;
-#ifndef CONFIG_MMU
- unsigned long fullsize;
-#endif
#ifdef ELF_FDPIC_PLAT_INIT
unsigned long dynaddr;
#endif
@@ -390,11 +387,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
goto error_kill;
}
- /* expand the stack mapping to use up the entire allocation granule */
- fullsize = kobjsize((char *) current->mm->start_brk);
- if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size,
- fullsize, 0, 0)))
- stack_size = fullsize;
up_write(&current->mm->mmap_sem);
current->mm->brk = current->mm->start_brk;
@@ -1567,11 +1559,9 @@ end_coredump:
static int elf_fdpic_dump_segments(struct file *file, size_t *size,
unsigned long *limit, unsigned long mm_flags)
{
- struct vm_list_struct *vml;
-
- for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
- struct vm_area_struct *vma = vml->vma;
+ struct vm_area_struct *vma;
+ for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
if (!maydump(vma, mm_flags))
continue;
@@ -1617,9 +1607,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
elf_fpxregset_t *xfpu = NULL;
#endif
int thread_status_size = 0;
-#ifndef CONFIG_MMU
- struct vm_list_struct *vml;
-#endif
elf_addr_t *auxv;
unsigned long mm_flags;
@@ -1685,13 +1672,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
fill_prstatus(prstatus, current, signr);
elf_core_copy_regs(&prstatus->pr_reg, regs);
-#ifdef CONFIG_MMU
segs = current->mm->map_count;
-#else
- segs = 0;
- for (vml = current->mm->context.vmlist; vml; vml = vml->next)
- segs++;
-#endif
#ifdef ELF_CORE_EXTRA_PHDRS
segs += ELF_CORE_EXTRA_PHDRS;
#endif
@@ -1766,20 +1747,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
mm_flags = current->mm->flags;
/* write program headers for segments dump */
- for (
-#ifdef CONFIG_MMU
- vma = current->mm->mmap; vma; vma = vma->vm_next
-#else
- vml = current->mm->context.vmlist; vml; vml = vml->next
-#endif
- ) {
+ for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
struct elf_phdr phdr;
size_t sz;
-#ifndef CONFIG_MMU
- vma = vml->vma;
-#endif
-
sz = vma->vm_end - vma->vm_start;
phdr.p_type = PT_LOAD;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7bbd5c6b372..5cebf0b3779 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -417,8 +417,8 @@ static int load_flat_file(struct linux_binprm * bprm,
unsigned long textpos = 0, datapos = 0, result;
unsigned long realdatastart = 0;
unsigned long text_len, data_len, bss_len, stack_len, flags;
- unsigned long len, reallen, memp = 0;
- unsigned long extra, rlim;
+ unsigned long len, memp = 0;
+ unsigned long memp_size, extra, rlim;
unsigned long *reloc = 0, *rp;
struct inode *inode;
int i, rev, relocs = 0;
@@ -543,17 +543,10 @@ static int load_flat_file(struct linux_binprm * bprm,
}
len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+ len = PAGE_ALIGN(len);
down_write(&current->mm->mmap_sem);
realdatastart = do_mmap(0, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
- /* Remap to use all availabe slack region space */
- if (realdatastart && (realdatastart < (unsigned long)-4096)) {
- reallen = kobjsize((void *)realdatastart);
- if (reallen > len) {
- realdatastart = do_mremap(realdatastart, len,
- reallen, MREMAP_FIXED, realdatastart);
- }
- }
up_write(&current->mm->mmap_sem);
if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
@@ -591,21 +584,14 @@ static int load_flat_file(struct linux_binprm * bprm,
reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
memp = realdatastart;
-
+ memp_size = len;
} else {
len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+ len = PAGE_ALIGN(len);
down_write(&current->mm->mmap_sem);
textpos = do_mmap(0, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
- /* Remap to use all availabe slack region space */
- if (textpos && (textpos < (unsigned long) -4096)) {
- reallen = kobjsize((void *)textpos);
- if (reallen > len) {
- textpos = do_mremap(textpos, len, reallen,
- MREMAP_FIXED, textpos);
- }
- }
up_write(&current->mm->mmap_sem);
if (!textpos || textpos >= (unsigned long) -4096) {
@@ -622,7 +608,7 @@ static int load_flat_file(struct linux_binprm * bprm,
reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) +
MAX_SHARED_LIBS * sizeof(unsigned long));
memp = textpos;
-
+ memp_size = len;
#ifdef CONFIG_BINFMT_ZFLAT
/*
* load it all in and treat it like a RAM load from now on
@@ -680,10 +666,12 @@ static int load_flat_file(struct linux_binprm * bprm,
* set up the brk stuff, uses any slack left in data/bss/stack
* allocation. We put the brk after the bss (between the bss
* and stack) like other platforms.
+ * Userspace code relies on the stack pointer starting out at
+ * an address right at the end of a page.
*/
current->mm->start_brk = datapos + data_len + bss_len;
current->mm->brk = (current->mm->start_brk + 3) & ~3;
- current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len;
+ current->mm->context.end_brk = memp + memp_size - stack_len;
}
if (flags & FLAT_FLAG_KTRACE)
@@ -790,8 +778,8 @@ static int load_flat_file(struct linux_binprm * bprm,
/* zero the BSS, BRK and stack areas */
memset((void*)(datapos + data_len), 0, bss_len +
- (memp + kobjsize((void *) memp) - stack_len - /* end brk */
- libinfo->lib_list[id].start_brk) + /* start brk */
+ (memp + memp_size - stack_len - /* end brk */
+ libinfo->lib_list[id].start_brk) + /* start brk */
stack_len);
return 0;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index f2744ab4e5b..c4e83537ead 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
if (inode) {
inode->i_mode = mode;
- inode->i_uid = 0;
- inode->i_gid = 0;
- inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime =
current_fs_time(inode->i_sb);
}
@@ -652,7 +649,7 @@ static const struct file_operations bm_register_operations = {
static ssize_t
bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
{
- char *s = enabled ? "enabled" : "disabled";
+ char *s = enabled ? "enabled\n" : "disabled\n";
return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
}
diff --git a/fs/bio.c b/fs/bio.c
index 711cee10360..062299acbcc 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -788,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
int i, ret;
int nr_pages = 0;
unsigned int len = 0;
+ unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
for (i = 0; i < iov_count; i++) {
unsigned long uaddr;
@@ -814,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
bio->bi_rw |= (!write_to_vm << BIO_RW);
ret = 0;
- i = 0;
+
+ if (map_data) {
+ nr_pages = 1 << map_data->page_order;
+ i = map_data->offset / PAGE_SIZE;
+ }
while (len) {
- unsigned int bytes;
+ unsigned int bytes = PAGE_SIZE;
- if (map_data)
- bytes = 1U << (PAGE_SHIFT + map_data->page_order);
- else
- bytes = PAGE_SIZE;
+ bytes -= offset;
if (bytes > len)
bytes = len;
if (map_data) {
- if (i == map_data->nr_entries) {
+ if (i == map_data->nr_entries * nr_pages) {
ret = -ENOMEM;
break;
}
- page = map_data->pages[i++];
- } else
+
+ page = map_data->pages[i / nr_pages];
+ page += (i % nr_pages);
+
+ i++;
+ } else {
page = alloc_page(q->bounce_gfp | gfp_mask);
- if (!page) {
- ret = -ENOMEM;
- break;
+ if (!page) {
+ ret = -ENOMEM;
+ break;
+ }
}
- if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
+ if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
break;
len -= bytes;
+ offset = 0;
}
if (ret)
@@ -851,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
/*
* success
*/
- if (!write_to_vm) {
+ if (!write_to_vm && (!map_data || !map_data->null_mapped)) {
ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
if (ret)
goto cleanup;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 349a26c1000..ac7031f12ea 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1005,6 +1005,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
lock_kernel();
+ restart:
ret = -ENXIO;
disk = get_gendisk(bdev->bd_dev, &partno);
@@ -1025,6 +1026,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
+ if (ret == -ERESTARTSYS) {
+ /* Lost a race with 'disk' being
+ * deleted, try again.
+ * See md.c
+ */
+ disk_put_part(bdev->bd_part);
+ bdev->bd_part = NULL;
+ module_put(disk->fops->owner);
+ put_disk(disk);
+ bdev->bd_disk = NULL;
+ mutex_unlock(&bdev->bd_mutex);
+ goto restart;
+ }
if (ret)
goto out_clear;
}
@@ -1220,6 +1234,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return blkdev_ioctl(bdev, mode, cmd, arg);
}
+/*
+ * Try to release a page associated with block device when the system
+ * is under memory pressure.
+ */
+static int blkdev_releasepage(struct page *page, gfp_t wait)
+{
+ struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
+
+ if (super && super->s_op->bdev_try_to_free_page)
+ return super->s_op->bdev_try_to_free_page(super, page, wait);
+
+ return try_to_free_buffers(page);
+}
+
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
.writepage = blkdev_writepage,
@@ -1227,6 +1255,7 @@ static const struct address_space_operations def_blk_aops = {
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
.writepages = generic_writepages,
+ .releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
};
@@ -1262,7 +1291,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
/**
* lookup_bdev - lookup a struct block_device by name
- * @path: special file representing the block device
+ * @pathname: special file representing the block device
*
* Get a reference to the blockdevice at @pathname in the current
* namespace if possible and return it. Return ERR_PTR(error)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 00000000000..d2cf5a54a4b
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,25 @@
+ifneq ($(KERNELRELEASE),)
+# kbuild part of makefile
+
+obj-$(CONFIG_BTRFS_FS) := btrfs.o
+btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+ file-item.o inode-item.o inode-map.o disk-io.o \
+ transaction.o inode.o file.o tree-defrag.o \
+ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+ compression.o
+else
+
+# Normal Makefile
+
+KERNELDIR := /lib/modules/`uname -r`/build
+all:
+ $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
+
+modules_install:
+ $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
+clean:
+ $(MAKE) -C $(KERNELDIR) M=`pwd` clean
+
+endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 00000000000..1d53b62dbba
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "xattr.h"
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+static void btrfs_update_cached_acl(struct inode *inode,
+ struct posix_acl **p_acl,
+ struct posix_acl *acl)
+{
+ spin_lock(&inode->i_lock);
+ if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
+ posix_acl_release(*p_acl);
+ *p_acl = posix_acl_dup(acl);
+ spin_unlock(&inode->i_lock);
+}
+
+static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+{
+ int size;
+ const char *name;
+ char *value = NULL;
+ struct posix_acl *acl = NULL, **p_acl;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = POSIX_ACL_XATTR_ACCESS;
+ p_acl = &BTRFS_I(inode)->i_acl;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = POSIX_ACL_XATTR_DEFAULT;
+ p_acl = &BTRFS_I(inode)->i_default_acl;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ spin_lock(&inode->i_lock);
+ if (*p_acl != BTRFS_ACL_NOT_CACHED)
+ acl = posix_acl_dup(*p_acl);
+ spin_unlock(&inode->i_lock);
+
+ if (acl)
+ return acl;
+
+
+ size = __btrfs_getxattr(inode, name, "", 0);
+ if (size > 0) {
+ value = kzalloc(size, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ size = __btrfs_getxattr(inode, name, value, size);
+ if (size > 0) {
+ acl = posix_acl_from_xattr(value, size);
+ btrfs_update_cached_acl(inode, p_acl, acl);
+ }
+ kfree(value);
+ } else if (size == -ENOENT) {
+ acl = NULL;
+ btrfs_update_cached_acl(inode, p_acl, acl);
+ }
+
+ return acl;
+}
+
+static int btrfs_xattr_get_acl(struct inode *inode, int type,
+ void *value, size_t size)
+{
+ struct posix_acl *acl;
+ int ret = 0;
+
+ acl = btrfs_get_acl(inode, type);
+
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl == NULL)
+ return -ENODATA;
+ ret = posix_acl_to_xattr(acl, value, size);
+ posix_acl_release(acl);
+
+ return ret;
+}
+
+/*
+ * Needs to be called with fs_mutex held
+ */
+static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ int ret, size = 0;
+ const char *name;
+ struct posix_acl **p_acl;
+ char *value = NULL;
+ mode_t mode;
+
+ if (acl) {
+ ret = posix_acl_valid(acl);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+ }
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ mode = inode->i_mode;
+ ret = posix_acl_equiv_mode(acl, &mode);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+ inode->i_mode = mode;
+ name = POSIX_ACL_XATTR_ACCESS;
+ p_acl = &BTRFS_I(inode)->i_acl;
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EINVAL : 0;
+ name = POSIX_ACL_XATTR_DEFAULT;
+ p_acl = &BTRFS_I(inode)->i_default_acl;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmalloc(size, GFP_NOFS);
+ if (!value) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = posix_acl_to_xattr(acl, value, size);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = __btrfs_setxattr(inode, name, value, size, 0);
+
+out:
+ kfree(value);
+
+ if (!ret)
+ btrfs_update_cached_acl(inode, p_acl, acl);
+
+ return ret;
+}
+
+static int btrfs_xattr_set_acl(struct inode *inode, int type,
+ const void *value, size_t size)
+{
+ int ret = 0;
+ struct posix_acl *acl = NULL;
+
+ if (value) {
+ acl = posix_acl_from_xattr(value, size);
+ if (acl == NULL) {
+ value = NULL;
+ size = 0;
+ } else if (IS_ERR(acl)) {
+ return PTR_ERR(acl);
+ }
+ }
+
+ ret = btrfs_set_acl(inode, acl, type);
+
+ posix_acl_release(acl);
+
+ return ret;
+}
+
+
+static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
+ void *value, size_t size)
+{
+ return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
+ void *value, size_t size)
+{
+ return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+int btrfs_check_acl(struct inode *inode, int mask)
+{
+ struct posix_acl *acl;
+ int error = -EAGAIN;
+
+ acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl) {
+ error = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ }
+
+ return error;
+}
+
+/*
+ * btrfs_init_acl is already generally called under fs_mutex, so the locking
+ * stuff has been fixed to work with that. If the locking stuff changes, we
+ * need to re-evaluate the acl locking stuff.
+ */
+int btrfs_init_acl(struct inode *inode, struct inode *dir)
+{
+ struct posix_acl *acl = NULL;
+ int ret = 0;
+
+ /* this happens with subvols */
+ if (!dir)
+ return 0;
+
+ if (!S_ISLNK(inode->i_mode)) {
+ if (IS_POSIXACL(dir)) {
+ acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+
+ if (!acl)
+ inode->i_mode &= ~current->fs->umask;
+ }
+
+ if (IS_POSIXACL(dir) && acl) {
+ struct posix_acl *clone;
+ mode_t mode;
+
+ if (S_ISDIR(inode->i_mode)) {
+ ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
+ if (ret)
+ goto failed;
+ }
+ clone = posix_acl_clone(acl, GFP_NOFS);
+ ret = -ENOMEM;
+ if (!clone)
+ goto failed;
+
+ mode = inode->i_mode;
+ ret = posix_acl_create_masq(clone, &mode);
+ if (ret >= 0) {
+ inode->i_mode = mode;
+ if (ret > 0) {
+ /* we need an acl */
+ ret = btrfs_set_acl(inode, clone,
+ ACL_TYPE_ACCESS);
+ }
+ }
+ }
+failed:
+ posix_acl_release(acl);
+
+ return ret;
+}
+
+int btrfs_acl_chmod(struct inode *inode)
+{
+ struct posix_acl *acl, *clone;
+ int ret = 0;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (!IS_POSIXACL(inode))
+ return 0;
+
+ acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+
+ clone = posix_acl_clone(acl, GFP_KERNEL);
+ posix_acl_release(acl);
+ if (!clone)
+ return -ENOMEM;
+
+ ret = posix_acl_chmod_masq(clone, inode->i_mode);
+ if (!ret)
+ ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
+
+ posix_acl_release(clone);
+
+ return ret;
+}
+
+struct xattr_handler btrfs_xattr_acl_default_handler = {
+ .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .get = btrfs_xattr_acl_default_get,
+ .set = btrfs_xattr_acl_default_set,
+};
+
+struct xattr_handler btrfs_xattr_acl_access_handler = {
+ .prefix = POSIX_ACL_XATTR_ACCESS,
+ .get = btrfs_xattr_acl_access_get,
+ .set = btrfs_xattr_acl_access_set,
+};
+
+#else /* CONFIG_FS_POSIX_ACL */
+
+int btrfs_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+
+int btrfs_init_acl(struct inode *inode, struct inode *dir)
+{
+ return 0;
+}
+
+int btrfs_check_acl(struct inode *inode, int mask)
+{
+ return 0;
+}
+
+#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 00000000000..8e2fec05dbe
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/version.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+# include <linux/freezer.h>
+#include "async-thread.h"
+
+#define WORK_QUEUED_BIT 0
+#define WORK_DONE_BIT 1
+#define WORK_ORDER_DONE_BIT 2
+
+/*
+ * container for the kthread task pointer and the list of pending work
+ * One of these is allocated per thread.
+ */
+struct btrfs_worker_thread {
+ /* pool we belong to */
+ struct btrfs_workers *workers;
+
+ /* list of struct btrfs_work that are waiting for service */
+ struct list_head pending;
+
+ /* list of worker threads from struct btrfs_workers */
+ struct list_head worker_list;
+
+ /* kthread */
+ struct task_struct *task;
+
+ /* number of things on the pending list */
+ atomic_t num_pending;
+
+ unsigned long sequence;
+
+ /* protects the pending list. */
+ spinlock_t lock;
+
+ /* set to non-zero when this thread is already awake and kicking */
+ int working;
+
+ /* are we currently idle */
+ int idle;
+};
+
+/*
+ * helper function to move a thread onto the idle list after it
+ * has finished some requests.
+ */
+static void check_idle_worker(struct btrfs_worker_thread *worker)
+{
+ if (!worker->idle && atomic_read(&worker->num_pending) <
+ worker->workers->idle_thresh / 2) {
+ unsigned long flags;
+ spin_lock_irqsave(&worker->workers->lock, flags);
+ worker->idle = 1;
+ list_move(&worker->worker_list, &worker->workers->idle_list);
+ spin_unlock_irqrestore(&worker->workers->lock, flags);
+ }
+}
+
+/*
+ * helper function to move a thread off the idle list after new
+ * pending work is added.
+ */
+static void check_busy_worker(struct btrfs_worker_thread *worker)
+{
+ if (worker->idle && atomic_read(&worker->num_pending) >=
+ worker->workers->idle_thresh) {
+ unsigned long flags;
+ spin_lock_irqsave(&worker->workers->lock, flags);
+ worker->idle = 0;
+ list_move_tail(&worker->worker_list,
+ &worker->workers->worker_list);
+ spin_unlock_irqrestore(&worker->workers->lock, flags);
+ }
+}
+
+static noinline int run_ordered_completions(struct btrfs_workers *workers,
+ struct btrfs_work *work)
+{
+ unsigned long flags;
+
+ if (!workers->ordered)
+ return 0;
+
+ set_bit(WORK_DONE_BIT, &work->flags);
+
+ spin_lock_irqsave(&workers->lock, flags);
+
+ while (!list_empty(&workers->order_list)) {
+ work = list_entry(workers->order_list.next,
+ struct btrfs_work, order_list);
+
+ if (!test_bit(WORK_DONE_BIT, &work->flags))
+ break;
+
+ /* we are going to call the ordered done function, but
+ * we leave the work item on the list as a barrier so
+ * that later work items that are done don't have their
+ * functions called before this one returns
+ */
+ if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
+ break;
+
+ spin_unlock_irqrestore(&workers->lock, flags);
+
+ work->ordered_func(work);
+
+ /* now take the lock again and call the freeing code */
+ spin_lock_irqsave(&workers->lock, flags);
+ list_del(&work->order_list);
+ work->ordered_free(work);
+ }
+
+ spin_unlock_irqrestore(&workers->lock, flags);
+ return 0;
+}
+
+/*
+ * main loop for servicing work items
+ */
+static int worker_loop(void *arg)
+{
+ struct btrfs_worker_thread *worker = arg;
+ struct list_head *cur;
+ struct btrfs_work *work;
+ do {
+ spin_lock_irq(&worker->lock);
+ while (!list_empty(&worker->pending)) {
+ cur = worker->pending.next;
+ work = list_entry(cur, struct btrfs_work, list);
+ list_del(&work->list);
+ clear_bit(WORK_QUEUED_BIT, &work->flags);
+
+ work->worker = worker;
+ spin_unlock_irq(&worker->lock);
+
+ work->func(work);
+
+ atomic_dec(&worker->num_pending);
+ /*
+ * unless this is an ordered work queue,
+ * 'work' was probably freed by func above.
+ */
+ run_ordered_completions(worker->workers, work);
+
+ spin_lock_irq(&worker->lock);
+ check_idle_worker(worker);
+
+ }
+ worker->working = 0;
+ if (freezing(current)) {
+ refrigerator();
+ } else {
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&worker->lock);
+ if (!kthread_should_stop())
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+/*
+ * this will wait for all the worker threads to shutdown
+ */
+int btrfs_stop_workers(struct btrfs_workers *workers)
+{
+ struct list_head *cur;
+ struct btrfs_worker_thread *worker;
+
+ list_splice_init(&workers->idle_list, &workers->worker_list);
+ while (!list_empty(&workers->worker_list)) {
+ cur = workers->worker_list.next;
+ worker = list_entry(cur, struct btrfs_worker_thread,
+ worker_list);
+ kthread_stop(worker->task);
+ list_del(&worker->worker_list);
+ kfree(worker);
+ }
+ return 0;
+}
+
+/*
+ * simple init on struct btrfs_workers
+ */
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+{
+ workers->num_workers = 0;
+ INIT_LIST_HEAD(&workers->worker_list);
+ INIT_LIST_HEAD(&workers->idle_list);
+ INIT_LIST_HEAD(&workers->order_list);
+ spin_lock_init(&workers->lock);
+ workers->max_workers = max;
+ workers->idle_thresh = 32;
+ workers->name = name;
+ workers->ordered = 0;
+}
+
+/*
+ * starts new worker threads. This does not enforce the max worker
+ * count in case you need to temporarily go past it.
+ */
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+ struct btrfs_worker_thread *worker;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < num_workers; i++) {
+ worker = kzalloc(sizeof(*worker), GFP_NOFS);
+ if (!worker) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ INIT_LIST_HEAD(&worker->pending);
+ INIT_LIST_HEAD(&worker->worker_list);
+ spin_lock_init(&worker->lock);
+ atomic_set(&worker->num_pending, 0);
+ worker->task = kthread_run(worker_loop, worker,
+ "btrfs-%s-%d", workers->name,
+ workers->num_workers + i);
+ worker->workers = workers;
+ if (IS_ERR(worker->task)) {
+ kfree(worker);
+ ret = PTR_ERR(worker->task);
+ goto fail;
+ }
+
+ spin_lock_irq(&workers->lock);
+ list_add_tail(&worker->worker_list, &workers->idle_list);
+ worker->idle = 1;
+ workers->num_workers++;
+ spin_unlock_irq(&workers->lock);
+ }
+ return 0;
+fail:
+ btrfs_stop_workers(workers);
+ return ret;
+}
+
+/*
+ * run through the list and find a worker thread that doesn't have a lot
+ * to do right now. This can return null if we aren't yet at the thread
+ * count limit and all of the threads are busy.
+ */
+static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+{
+ struct btrfs_worker_thread *worker;
+ struct list_head *next;
+ int enforce_min = workers->num_workers < workers->max_workers;
+
+ /*
+ * if we find an idle thread, don't move it to the end of the
+ * idle list. This improves the chance that the next submission
+ * will reuse the same thread, and maybe catch it while it is still
+ * working
+ */
+ if (!list_empty(&workers->idle_list)) {
+ next = workers->idle_list.next;
+ worker = list_entry(next, struct btrfs_worker_thread,
+ worker_list);
+ return worker;
+ }
+ if (enforce_min || list_empty(&workers->worker_list))
+ return NULL;
+
+ /*
+ * if we pick a busy task, move the task to the end of the list.
+ * hopefully this will keep things somewhat evenly balanced.
+ * Do the move in batches based on the sequence number. This groups
+ * requests submitted at roughly the same time onto the same worker.
+ */
+ next = workers->worker_list.next;
+ worker = list_entry(next, struct btrfs_worker_thread, worker_list);
+ atomic_inc(&worker->num_pending);
+ worker->sequence++;
+
+ if (worker->sequence % workers->idle_thresh == 0)
+ list_move_tail(next, &workers->worker_list);
+ return worker;
+}
+
+/*
+ * selects a worker thread to take the next job. This will either find
+ * an idle worker, start a new worker up to the max count, or just return
+ * one of the existing busy workers.
+ */
+static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+{
+ struct btrfs_worker_thread *worker;
+ unsigned long flags;
+
+again:
+ spin_lock_irqsave(&workers->lock, flags);
+ worker = next_worker(workers);
+ spin_unlock_irqrestore(&workers->lock, flags);
+
+ if (!worker) {
+ spin_lock_irqsave(&workers->lock, flags);
+ if (workers->num_workers >= workers->max_workers) {
+ struct list_head *fallback = NULL;
+ /*
+ * we have failed to find any workers, just
+ * return the force one
+ */
+ if (!list_empty(&workers->worker_list))
+ fallback = workers->worker_list.next;
+ if (!list_empty(&workers->idle_list))
+ fallback = workers->idle_list.next;
+ BUG_ON(!fallback);
+ worker = list_entry(fallback,
+ struct btrfs_worker_thread, worker_list);
+ spin_unlock_irqrestore(&workers->lock, flags);
+ } else {
+ spin_unlock_irqrestore(&workers->lock, flags);
+ /* we're below the limit, start another worker */
+ btrfs_start_workers(workers, 1);
+ goto again;
+ }
+ }
+ return worker;
+}
+
+/*
+ * btrfs_requeue_work just puts the work item back on the tail of the list
+ * it was taken from. It is intended for use with long running work functions
+ * that make some progress and want to give the cpu up for others.
+ */
+int btrfs_requeue_work(struct btrfs_work *work)
+{
+ struct btrfs_worker_thread *worker = work->worker;
+ unsigned long flags;
+
+ if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
+ goto out;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ atomic_inc(&worker->num_pending);
+ list_add_tail(&work->list, &worker->pending);
+
+ /* by definition we're busy, take ourselves off the idle
+ * list
+ */
+ if (worker->idle) {
+ spin_lock_irqsave(&worker->workers->lock, flags);
+ worker->idle = 0;
+ list_move_tail(&worker->worker_list,
+ &worker->workers->worker_list);
+ spin_unlock_irqrestore(&worker->workers->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&worker->lock, flags);
+
+out:
+ return 0;
+}
+
+/*
+ * places a struct btrfs_work into the pending queue of one of the kthreads
+ */
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+{
+ struct btrfs_worker_thread *worker;
+ unsigned long flags;
+ int wake = 0;
+
+ /* don't requeue something already on a list */
+ if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
+ goto out;
+
+ worker = find_worker(workers);
+ if (workers->ordered) {
+ spin_lock_irqsave(&workers->lock, flags);
+ list_add_tail(&work->order_list, &workers->order_list);
+ spin_unlock_irqrestore(&workers->lock, flags);
+ } else {
+ INIT_LIST_HEAD(&work->order_list);
+ }
+
+ spin_lock_irqsave(&worker->lock, flags);
+ atomic_inc(&worker->num_pending);
+ check_busy_worker(worker);
+ list_add_tail(&work->list, &worker->pending);
+
+ /*
+ * avoid calling into wake_up_process if this thread has already
+ * been kicked
+ */
+ if (!worker->working)
+ wake = 1;
+ worker->working = 1;
+
+ spin_unlock_irqrestore(&worker->lock, flags);
+
+ if (wake)
+ wake_up_process(worker->task);
+out:
+ return 0;
+}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 00000000000..31be4ed8b63
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ASYNC_THREAD_
+#define __BTRFS_ASYNC_THREAD_
+
+struct btrfs_worker_thread;
+
+/*
+ * This is similar to a workqueue, but it is meant to spread the operations
+ * across all available cpus instead of just the CPU that was used to
+ * queue the work. There is also some batching introduced to try and
+ * cut down on context switches.
+ *
+ * By default threads are added on demand up to 2 * the number of cpus.
+ * Changing struct btrfs_workers->max_workers is one way to prevent
+ * demand creation of kthreads.
+ *
+ * the basic model of these worker threads is to embed a btrfs_work
+ * structure in your own data struct, and use container_of in a
+ * work function to get back to your data struct.
+ */
+struct btrfs_work {
+ /*
+ * func should be set to the function you want called
+ * your work struct is passed as the only arg
+ *
+ * ordered_func must be set for work sent to an ordered work queue,
+ * and it is called to complete a given work item in the same
+ * order they were sent to the queue.
+ */
+ void (*func)(struct btrfs_work *work);
+ void (*ordered_func)(struct btrfs_work *work);
+ void (*ordered_free)(struct btrfs_work *work);
+
+ /*
+ * flags should be set to zero. It is used to make sure the
+ * struct is only inserted once into the list.
+ */
+ unsigned long flags;
+
+ /* don't touch these */
+ struct btrfs_worker_thread *worker;
+ struct list_head list;
+ struct list_head order_list;
+};
+
+struct btrfs_workers {
+ /* current number of running workers */
+ int num_workers;
+
+ /* max number of workers allowed. changed by btrfs_start_workers */
+ int max_workers;
+
+ /* once a worker has this many requests or fewer, it is idle */
+ int idle_thresh;
+
+ /* force completions in the order they were queued */
+ int ordered;
+
+ /* list with all the work threads. The workers on the idle thread
+ * may be actively servicing jobs, but they haven't yet hit the
+ * idle thresh limit above.
+ */
+ struct list_head worker_list;
+ struct list_head idle_list;
+
+ /*
+ * when operating in ordered mode, this maintains the list
+ * of work items waiting for completion
+ */
+ struct list_head order_list;
+
+ /* lock for finding the next worker thread to queue on */
+ spinlock_t lock;
+
+ /* extra name for this worker, used for current->name */
+ char *name;
+};
+
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+int btrfs_stop_workers(struct btrfs_workers *workers);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+int btrfs_requeue_work(struct btrfs_work *work);
+#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 00000000000..a8c9693b75a
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_I__
+#define __BTRFS_I__
+
+#include "extent_map.h"
+#include "extent_io.h"
+#include "ordered-data.h"
+
+/* in memory btrfs inode */
+struct btrfs_inode {
+ /* which subvolume this inode belongs to */
+ struct btrfs_root *root;
+
+ /* key used to find this inode on disk. This is used by the code
+ * to read in roots of subvolumes
+ */
+ struct btrfs_key location;
+
+ /* the extent_tree has caches of all the extent mappings to disk */
+ struct extent_map_tree extent_tree;
+
+ /* the io_tree does range state (DIRTY, LOCKED etc) */
+ struct extent_io_tree io_tree;
+
+ /* special utility tree used to record which mirrors have already been
+ * tried when checksums fail for a given block
+ */
+ struct extent_io_tree io_failure_tree;
+
+ /* held while inesrting or deleting extents from files */
+ struct mutex extent_mutex;
+
+ /* held while logging the inode in tree-log.c */
+ struct mutex log_mutex;
+
+ /* used to order data wrt metadata */
+ struct btrfs_ordered_inode_tree ordered_tree;
+
+ /* standard acl pointers */
+ struct posix_acl *i_acl;
+ struct posix_acl *i_default_acl;
+
+ /* for keeping track of orphaned inodes */
+ struct list_head i_orphan;
+
+ /* list of all the delalloc inodes in the FS. There are times we need
+ * to write all the delalloc pages to disk, and this list is used
+ * to walk them all.
+ */
+ struct list_head delalloc_inodes;
+
+ /* full 64 bit generation number, struct vfs_inode doesn't have a big
+ * enough field for this.
+ */
+ u64 generation;
+
+ /* sequence number for NFS changes */
+ u64 sequence;
+
+ /*
+ * transid of the trans_handle that last modified this inode
+ */
+ u64 last_trans;
+ /*
+ * transid that last logged this inode
+ */
+ u64 logged_trans;
+
+ /*
+ * trans that last made a change that should be fully fsync'd. This
+ * gets reset to zero each time the inode is logged
+ */
+ u64 log_dirty_trans;
+
+ /* total number of bytes pending delalloc, used by stat to calc the
+ * real block usage of the file
+ */
+ u64 delalloc_bytes;
+
+ /*
+ * the size of the file stored in the metadata on disk. data=ordered
+ * means the in-memory i_size might be larger than the size on disk
+ * because not all the blocks are written yet.
+ */
+ u64 disk_i_size;
+
+ /* flags field from the on disk inode */
+ u32 flags;
+
+ /*
+ * if this is a directory then index_cnt is the counter for the index
+ * number for new files that are created
+ */
+ u64 index_cnt;
+
+ /* the start of block group preferred for allocations. */
+ u64 block_group;
+
+ struct inode vfs_inode;
+};
+
+static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
+{
+ return container_of(inode, struct btrfs_inode, vfs_inode);
+}
+
+static inline void btrfs_i_size_write(struct inode *inode, u64 size)
+{
+ inode->i_size = size;
+ BTRFS_I(inode)->disk_i_size = size;
+}
+
+
+#endif
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 00000000000..7c4503ef6ef
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,7 @@
+#ifndef _COMPAT_H_
+#define _COMPAT_H_
+
+#define btrfs_drop_nlink(inode) drop_nlink(inode)
+#define btrfs_inc_nlink(inode) inc_nlink(inode)
+
+#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 00000000000..ee848d8585d
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,709 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include <linux/pagevec.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+ /* number of bios pending for this compressed extent */
+ atomic_t pending_bios;
+
+ /* the pages with the compressed data on them */
+ struct page **compressed_pages;
+
+ /* inode that owns this data */
+ struct inode *inode;
+
+ /* starting offset in the inode for our pages */
+ u64 start;
+
+ /* number of bytes in the inode we're working on */
+ unsigned long len;
+
+ /* number of bytes on disk */
+ unsigned long compressed_len;
+
+ /* number of compressed pages in the array */
+ unsigned long nr_pages;
+
+ /* IO errors */
+ int errors;
+ int mirror_num;
+
+ /* for reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+
+ /*
+ * the start of a variable length array of checksums only
+ * used by reads
+ */
+ u32 sums;
+};
+
+static inline int compressed_bio_size(struct btrfs_root *root,
+ unsigned long disk_size)
+{
+ u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+ return sizeof(struct compressed_bio) +
+ ((disk_size + root->sectorsize - 1) / root->sectorsize) *
+ csum_size;
+}
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+ u64 first_byte, gfp_t gfp_flags)
+{
+ struct bio *bio;
+ int nr_vecs;
+
+ nr_vecs = bio_get_nr_vecs(bdev);
+ bio = bio_alloc(gfp_flags, nr_vecs);
+
+ if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (nr_vecs /= 2))
+ bio = bio_alloc(gfp_flags, nr_vecs);
+ }
+
+ if (bio) {
+ bio->bi_size = 0;
+ bio->bi_bdev = bdev;
+ bio->bi_sector = first_byte >> 9;
+ }
+ return bio;
+}
+
+static int check_compressed_csum(struct inode *inode,
+ struct compressed_bio *cb,
+ u64 disk_start)
+{
+ int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct page *page;
+ unsigned long i;
+ char *kaddr;
+ u32 csum;
+ u32 *cb_sum = &cb->sums;
+
+ if (btrfs_test_flag(inode, NODATASUM))
+ return 0;
+
+ for (i = 0; i < cb->nr_pages; i++) {
+ page = cb->compressed_pages[i];
+ csum = ~(u32)0;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
+ btrfs_csum_final(csum, (char *)&csum);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ if (csum != *cb_sum) {
+ printk(KERN_INFO "btrfs csum failed ino %lu "
+ "extent %llu csum %u "
+ "wanted %u mirror %d\n", inode->i_ino,
+ (unsigned long long)disk_start,
+ csum, *cb_sum, cb->mirror_num);
+ ret = -EIO;
+ goto fail;
+ }
+ cb_sum++;
+
+ }
+ ret = 0;
+fail:
+ return ret;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+ struct extent_io_tree *tree;
+ struct compressed_bio *cb = bio->bi_private;
+ struct inode *inode;
+ struct page *page;
+ unsigned long index;
+ int ret;
+
+ if (err)
+ cb->errors = 1;
+
+ /* if there are more bios still pending for this compressed
+ * extent, just exit
+ */
+ if (!atomic_dec_and_test(&cb->pending_bios))
+ goto out;
+
+ inode = cb->inode;
+ ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
+ if (ret)
+ goto csum_failed;
+
+ /* ok, we're the last bio for this extent, lets start
+ * the decompression.
+ */
+ tree = &BTRFS_I(inode)->io_tree;
+ ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+ cb->start,
+ cb->orig_bio->bi_io_vec,
+ cb->orig_bio->bi_vcnt,
+ cb->compressed_len);
+csum_failed:
+ if (ret)
+ cb->errors = 1;
+
+ /* release the compressed pages */
+ index = 0;
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ page_cache_release(page);
+ }
+
+ /* do io completion on the original bio */
+ if (cb->errors) {
+ bio_io_error(cb->orig_bio);
+ } else {
+ int bio_index = 0;
+ struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
+
+ /*
+ * we have verified the checksum already, set page
+ * checked so the end_io handlers know about it
+ */
+ while (bio_index < cb->orig_bio->bi_vcnt) {
+ SetPageChecked(bvec->bv_page);
+ bvec++;
+ bio_index++;
+ }
+ bio_endio(cb->orig_bio, 0);
+ }
+
+ /* finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+out:
+ bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+ unsigned long ram_size)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+ struct page *pages[16];
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+ int ret;
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long,
+ nr_pages, ARRAY_SIZE(pages)), pages);
+ if (ret == 0) {
+ nr_pages -= 1;
+ index += 1;
+ continue;
+ }
+ for (i = 0; i < ret; i++) {
+ end_page_writeback(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ }
+ /* the inode may be gone now */
+ return 0;
+}
+
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+ struct extent_io_tree *tree;
+ struct compressed_bio *cb = bio->bi_private;
+ struct inode *inode;
+ struct page *page;
+ unsigned long index;
+
+ if (err)
+ cb->errors = 1;
+
+ /* if there are more bios still pending for this compressed
+ * extent, just exit
+ */
+ if (!atomic_dec_and_test(&cb->pending_bios))
+ goto out;
+
+ /* ok, we're the last bio for this extent, step one is to
+ * call back into the FS and do all the end_io operations
+ */
+ inode = cb->inode;
+ tree = &BTRFS_I(inode)->io_tree;
+ cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
+ tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+ cb->start,
+ cb->start + cb->len - 1,
+ NULL, 1);
+ cb->compressed_pages[0]->mapping = NULL;
+
+ end_compressed_writeback(inode, cb->start, cb->len);
+ /* note, our inode could be gone now */
+
+ /*
+ * release the compressed pages, these came from alloc_page and
+ * are not attached to the inode at all
+ */
+ index = 0;
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ page_cache_release(page);
+ }
+
+ /* finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+out:
+ bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+ unsigned long len, u64 disk_start,
+ unsigned long compressed_len,
+ struct page **compressed_pages,
+ unsigned long nr_pages)
+{
+ struct bio *bio = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct compressed_bio *cb;
+ unsigned long bytes_left;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int page_index = 0;
+ struct page *page;
+ u64 first_byte = disk_start;
+ struct block_device *bdev;
+ int ret;
+
+ WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+ cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+ atomic_set(&cb->pending_bios, 0);
+ cb->errors = 0;
+ cb->inode = inode;
+ cb->start = start;
+ cb->len = len;
+ cb->mirror_num = 0;
+ cb->compressed_pages = compressed_pages;
+ cb->compressed_len = compressed_len;
+ cb->orig_bio = NULL;
+ cb->nr_pages = nr_pages;
+
+ bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ atomic_inc(&cb->pending_bios);
+
+ /* create and submit bios for the compressed pages */
+ bytes_left = compressed_len;
+ for (page_index = 0; page_index < cb->nr_pages; page_index++) {
+ page = compressed_pages[page_index];
+ page->mapping = inode->i_mapping;
+ if (bio->bi_size)
+ ret = io_tree->ops->merge_bio_hook(page, 0,
+ PAGE_CACHE_SIZE,
+ bio, 0);
+ else
+ ret = 0;
+
+ page->mapping = NULL;
+ if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ bio_get(bio);
+
+ /*
+ * inc the count before we submit the bio so
+ * we know the end IO handler won't happen before
+ * we inc the count. Otherwise, the cb might get
+ * freed before we're done setting it up
+ */
+ atomic_inc(&cb->pending_bios);
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+ BUG_ON(ret);
+
+ bio_put(bio);
+
+ bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ bio->bi_private = cb;
+ bio->bi_end_io = end_compressed_bio_write;
+ bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ }
+ if (bytes_left < PAGE_CACHE_SIZE) {
+ printk("bytes left %lu compress len %lu nr %lu\n",
+ bytes_left, cb->compressed_len, cb->nr_pages);
+ }
+ bytes_left -= PAGE_CACHE_SIZE;
+ first_byte += PAGE_CACHE_SIZE;
+ cond_resched();
+ }
+ bio_get(bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ BUG_ON(ret);
+
+ ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+ BUG_ON(ret);
+
+ ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+ BUG_ON(ret);
+
+ bio_put(bio);
+ return 0;
+}
+
+static noinline int add_ra_bio_pages(struct inode *inode,
+ u64 compressed_end,
+ struct compressed_bio *cb)
+{
+ unsigned long end_index;
+ unsigned long page_index;
+ u64 last_offset;
+ u64 isize = i_size_read(inode);
+ int ret;
+ struct page *page;
+ unsigned long nr_pages = 0;
+ struct extent_map *em;
+ struct address_space *mapping = inode->i_mapping;
+ struct pagevec pvec;
+ struct extent_map_tree *em_tree;
+ struct extent_io_tree *tree;
+ u64 end;
+ int misses = 0;
+
+ page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
+ last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ tree = &BTRFS_I(inode)->io_tree;
+
+ if (isize == 0)
+ return 0;
+
+ end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+
+ pagevec_init(&pvec, 0);
+ while (last_offset < compressed_end) {
+ page_index = last_offset >> PAGE_CACHE_SHIFT;
+
+ if (page_index > end_index)
+ break;
+
+ rcu_read_lock();
+ page = radix_tree_lookup(&mapping->page_tree, page_index);
+ rcu_read_unlock();
+ if (page) {
+ misses++;
+ if (misses > 4)
+ break;
+ goto next;
+ }
+
+ page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+ if (!page)
+ break;
+
+ page->index = page_index;
+ /*
+ * what we want to do here is call add_to_page_cache_lru,
+ * but that isn't exported, so we reproduce it here
+ */
+ if (add_to_page_cache(page, mapping,
+ page->index, GFP_NOFS)) {
+ page_cache_release(page);
+ goto next;
+ }
+
+ /* open coding of lru_cache_add, also not exported */
+ page_cache_get(page);
+ if (!pagevec_add(&pvec, page))
+ __pagevec_lru_add_file(&pvec);
+
+ end = last_offset + PAGE_CACHE_SIZE - 1;
+ /*
+ * at this point, we have a locked page in the page cache
+ * for these bytes in the file. But, we have to make
+ * sure they map to this compressed extent on disk.
+ */
+ set_page_extent_mapped(page);
+ lock_extent(tree, last_offset, end, GFP_NOFS);
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, last_offset,
+ PAGE_CACHE_SIZE);
+ spin_unlock(&em_tree->lock);
+
+ if (!em || last_offset < em->start ||
+ (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
+ (em->block_start >> 9) != cb->orig_bio->bi_sector) {
+ free_extent_map(em);
+ unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ break;
+ }
+ free_extent_map(em);
+
+ if (page->index == end_index) {
+ char *userpage;
+ size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
+
+ if (zero_offset) {
+ int zeros;
+ zeros = PAGE_CACHE_SIZE - zero_offset;
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + zero_offset, 0, zeros);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
+ }
+ }
+
+ ret = bio_add_page(cb->orig_bio, page,
+ PAGE_CACHE_SIZE, 0);
+
+ if (ret == PAGE_CACHE_SIZE) {
+ nr_pages++;
+ page_cache_release(page);
+ } else {
+ unlock_extent(tree, last_offset, end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ break;
+ }
+next:
+ last_offset += PAGE_CACHE_SIZE;
+ }
+ if (pagevec_count(&pvec))
+ __pagevec_lru_add_file(&pvec);
+ return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it. We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ struct extent_io_tree *tree;
+ struct extent_map_tree *em_tree;
+ struct compressed_bio *cb;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ unsigned long compressed_len;
+ unsigned long nr_pages;
+ unsigned long page_index;
+ struct page *page;
+ struct block_device *bdev;
+ struct bio *comp_bio;
+ u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+ u64 em_len;
+ u64 em_start;
+ struct extent_map *em;
+ int ret;
+ u32 *sums;
+
+ tree = &BTRFS_I(inode)->io_tree;
+ em_tree = &BTRFS_I(inode)->extent_tree;
+
+ /* we need the actual starting offset of this extent in the file */
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree,
+ page_offset(bio->bi_io_vec->bv_page),
+ PAGE_CACHE_SIZE);
+ spin_unlock(&em_tree->lock);
+
+ compressed_len = em->block_len;
+ cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+ atomic_set(&cb->pending_bios, 0);
+ cb->errors = 0;
+ cb->inode = inode;
+ cb->mirror_num = mirror_num;
+ sums = &cb->sums;
+
+ cb->start = em->orig_start;
+ em_len = em->len;
+ em_start = em->start;
+
+ free_extent_map(em);
+ em = NULL;
+
+ cb->len = uncompressed_len;
+ cb->compressed_len = compressed_len;
+ cb->orig_bio = bio;
+
+ nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE;
+ cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+ GFP_NOFS);
+ bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ for (page_index = 0; page_index < nr_pages; page_index++) {
+ cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+ __GFP_HIGHMEM);
+ }
+ cb->nr_pages = nr_pages;
+
+ add_ra_bio_pages(inode, em_start + em_len, cb);
+
+ /* include any pages we added in add_ra-bio_pages */
+ uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ cb->len = uncompressed_len;
+
+ comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+ comp_bio->bi_private = cb;
+ comp_bio->bi_end_io = end_compressed_bio_read;
+ atomic_inc(&cb->pending_bios);
+
+ for (page_index = 0; page_index < nr_pages; page_index++) {
+ page = cb->compressed_pages[page_index];
+ page->mapping = inode->i_mapping;
+ page->index = em_start >> PAGE_CACHE_SHIFT;
+
+ if (comp_bio->bi_size)
+ ret = tree->ops->merge_bio_hook(page, 0,
+ PAGE_CACHE_SIZE,
+ comp_bio, 0);
+ else
+ ret = 0;
+
+ page->mapping = NULL;
+ if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ bio_get(comp_bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ BUG_ON(ret);
+
+ /*
+ * inc the count before we submit the bio so
+ * we know the end IO handler won't happen before
+ * we inc the count. Otherwise, the cb might get
+ * freed before we're done setting it up
+ */
+ atomic_inc(&cb->pending_bios);
+
+ if (!btrfs_test_flag(inode, NODATASUM)) {
+ btrfs_lookup_bio_sums(root, inode, comp_bio,
+ sums);
+ }
+ sums += (comp_bio->bi_size + root->sectorsize - 1) /
+ root->sectorsize;
+
+ ret = btrfs_map_bio(root, READ, comp_bio,
+ mirror_num, 0);
+ BUG_ON(ret);
+
+ bio_put(comp_bio);
+
+ comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+ GFP_NOFS);
+ comp_bio->bi_private = cb;
+ comp_bio->bi_end_io = end_compressed_bio_read;
+
+ bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+ }
+ cur_disk_byte += PAGE_CACHE_SIZE;
+ }
+ bio_get(comp_bio);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+ BUG_ON(ret);
+
+ if (!btrfs_test_flag(inode, NODATASUM))
+ btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+
+ ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
+ BUG_ON(ret);
+
+ bio_put(comp_bio);
+ return 0;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 00000000000..421f5b4aa71
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+ unsigned long len, u64 disk_start,
+ unsigned long compressed_len,
+ struct page **compressed_pages,
+ unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+#endif
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 00000000000..6e1b3de3670
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_CRC32C__
+#define __BTRFS_CRC32C__
+#include <linux/crc32c.h>
+
+/*
+ * this file used to do more for selecting the HW version of crc32c,
+ * perhaps it will one day again soon.
+ */
+#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
+#endif
+
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 00000000000..9e46c077681
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,3953 @@
+/*
+ * Copyright (C) 2007,2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "locking.h"
+
+static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int level);
+static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *ins_key,
+ struct btrfs_path *path, int data_size, int extend);
+static int push_node_left(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *dst,
+ struct extent_buffer *src, int empty);
+static int balance_node_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *dst_buf,
+ struct extent_buffer *src_buf);
+static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int level, int slot);
+
+inline void btrfs_init_path(struct btrfs_path *p)
+{
+ memset(p, 0, sizeof(*p));
+}
+
+struct btrfs_path *btrfs_alloc_path(void)
+{
+ struct btrfs_path *path;
+ path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
+ if (path) {
+ btrfs_init_path(path);
+ path->reada = 1;
+ }
+ return path;
+}
+
+/* this also releases the path */
+void btrfs_free_path(struct btrfs_path *p)
+{
+ btrfs_release_path(NULL, p);
+ kmem_cache_free(btrfs_path_cachep, p);
+}
+
+/*
+ * path release drops references on the extent buffers in the path
+ * and it drops any locks held by this path
+ *
+ * It is safe to call this on paths that no locks or extent buffers held.
+ */
+noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ p->slots[i] = 0;
+ if (!p->nodes[i])
+ continue;
+ if (p->locks[i]) {
+ btrfs_tree_unlock(p->nodes[i]);
+ p->locks[i] = 0;
+ }
+ free_extent_buffer(p->nodes[i]);
+ p->nodes[i] = NULL;
+ }
+}
+
+/*
+ * safely gets a reference on the root node of a tree. A lock
+ * is not taken, so a concurrent writer may put a different node
+ * at the root of the tree. See btrfs_lock_root_node for the
+ * looping required.
+ *
+ * The extent buffer returned by this has a reference taken, so
+ * it won't disappear. It may stop being the root of the tree
+ * at any time because there are no locks held.
+ */
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+ spin_lock(&root->node_lock);
+ eb = root->node;
+ extent_buffer_get(eb);
+ spin_unlock(&root->node_lock);
+ return eb;
+}
+
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root. A locked buffer
+ * is returned, with a reference held.
+ */
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ btrfs_tree_lock(eb);
+
+ spin_lock(&root->node_lock);
+ if (eb == root->node) {
+ spin_unlock(&root->node_lock);
+ break;
+ }
+ spin_unlock(&root->node_lock);
+
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/* cowonly root (everything not a reference counted cow subvolume), just get
+ * put onto a simple dirty list. transaction.c walks this to make sure they
+ * get properly updated on disk.
+ */
+static void add_root_to_dirty_list(struct btrfs_root *root)
+{
+ if (root->track_dirty && list_empty(&root->dirty_list)) {
+ list_add(&root->dirty_list,
+ &root->fs_info->dirty_cowonly_roots);
+ }
+}
+
+/*
+ * used by snapshot creation to make a copy of a root for a tree with
+ * a given objectid. The buffer with the new root node is returned in
+ * cow_ret, and this func returns zero on success or a negative error code.
+ */
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer **cow_ret, u64 new_root_objectid)
+{
+ struct extent_buffer *cow;
+ u32 nritems;
+ int ret = 0;
+ int level;
+ struct btrfs_root *new_root;
+
+ new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
+ if (!new_root)
+ return -ENOMEM;
+
+ memcpy(new_root, root, sizeof(*new_root));
+ new_root->root_key.objectid = new_root_objectid;
+
+ WARN_ON(root->ref_cows && trans->transid !=
+ root->fs_info->running_transaction->transid);
+ WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+
+ level = btrfs_header_level(buf);
+ nritems = btrfs_header_nritems(buf);
+
+ cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
+ new_root_objectid, trans->transid,
+ level, buf->start, 0);
+ if (IS_ERR(cow)) {
+ kfree(new_root);
+ return PTR_ERR(cow);
+ }
+
+ copy_extent_buffer(cow, buf, 0, 0, cow->len);
+ btrfs_set_header_bytenr(cow, cow->start);
+ btrfs_set_header_generation(cow, trans->transid);
+ btrfs_set_header_owner(cow, new_root_objectid);
+ btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
+
+ write_extent_buffer(cow, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(cow),
+ BTRFS_FSID_SIZE);
+
+ WARN_ON(btrfs_header_generation(buf) > trans->transid);
+ ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
+ kfree(new_root);
+
+ if (ret)
+ return ret;
+
+ btrfs_mark_buffer_dirty(cow);
+ *cow_ret = cow;
+ return 0;
+}
+
+/*
+ * does the dirty work in cow of a single block. The parent block (if
+ * supplied) is updated to point to the new cow copy. The new buffer is marked
+ * dirty and returned locked. If you modify the block it needs to be marked
+ * dirty again.
+ *
+ * search_start -- an allocation hint for the new block
+ *
+ * empty_size -- a hint that you plan on doing more cow. This is the size in
+ * bytes the allocator should try to find free next to the block it returns.
+ * This is just a hint and may be ignored by the allocator.
+ *
+ * prealloc_dest -- if you have already reserved a destination for the cow,
+ * this uses that block instead of allocating a new one.
+ * btrfs_alloc_reserved_extent is used to finish the allocation.
+ */
+static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size,
+ u64 prealloc_dest)
+{
+ u64 parent_start;
+ struct extent_buffer *cow;
+ u32 nritems;
+ int ret = 0;
+ int level;
+ int unlock_orig = 0;
+
+ if (*cow_ret == buf)
+ unlock_orig = 1;
+
+ WARN_ON(!btrfs_tree_locked(buf));
+
+ if (parent)
+ parent_start = parent->start;
+ else
+ parent_start = 0;
+
+ WARN_ON(root->ref_cows && trans->transid !=
+ root->fs_info->running_transaction->transid);
+ WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+
+ level = btrfs_header_level(buf);
+ nritems = btrfs_header_nritems(buf);
+
+ if (prealloc_dest) {
+ struct btrfs_key ins;
+
+ ins.objectid = prealloc_dest;
+ ins.offset = buf->len;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+ ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
+ root->root_key.objectid,
+ trans->transid, level, &ins);
+ BUG_ON(ret);
+ cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
+ buf->len);
+ } else {
+ cow = btrfs_alloc_free_block(trans, root, buf->len,
+ parent_start,
+ root->root_key.objectid,
+ trans->transid, level,
+ search_start, empty_size);
+ }
+ if (IS_ERR(cow))
+ return PTR_ERR(cow);
+
+ copy_extent_buffer(cow, buf, 0, 0, cow->len);
+ btrfs_set_header_bytenr(cow, cow->start);
+ btrfs_set_header_generation(cow, trans->transid);
+ btrfs_set_header_owner(cow, root->root_key.objectid);
+ btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
+
+ write_extent_buffer(cow, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(cow),
+ BTRFS_FSID_SIZE);
+
+ WARN_ON(btrfs_header_generation(buf) > trans->transid);
+ if (btrfs_header_generation(buf) != trans->transid) {
+ u32 nr_extents;
+ ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
+ if (ret)
+ return ret;
+
+ ret = btrfs_cache_ref(trans, root, buf, nr_extents);
+ WARN_ON(ret);
+ } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
+ /*
+ * There are only two places that can drop reference to
+ * tree blocks owned by living reloc trees, one is here,
+ * the other place is btrfs_drop_subtree. In both places,
+ * we check reference count while tree block is locked.
+ * Furthermore, if reference count is one, it won't get
+ * increased by someone else.
+ */
+ u32 refs;
+ ret = btrfs_lookup_extent_ref(trans, root, buf->start,
+ buf->len, &refs);
+ BUG_ON(ret);
+ if (refs == 1) {
+ ret = btrfs_update_ref(trans, root, buf, cow,
+ 0, nritems);
+ clean_tree_block(trans, root, buf);
+ } else {
+ ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
+ }
+ BUG_ON(ret);
+ } else {
+ ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
+ if (ret)
+ return ret;
+ clean_tree_block(trans, root, buf);
+ }
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
+ WARN_ON(ret);
+ }
+
+ if (buf == root->node) {
+ WARN_ON(parent && parent != buf);
+
+ spin_lock(&root->node_lock);
+ root->node = cow;
+ extent_buffer_get(cow);
+ spin_unlock(&root->node_lock);
+
+ if (buf != root->commit_root) {
+ btrfs_free_extent(trans, root, buf->start,
+ buf->len, buf->start,
+ root->root_key.objectid,
+ btrfs_header_generation(buf),
+ level, 1);
+ }
+ free_extent_buffer(buf);
+ add_root_to_dirty_list(root);
+ } else {
+ btrfs_set_node_blockptr(parent, parent_slot,
+ cow->start);
+ WARN_ON(trans->transid == 0);
+ btrfs_set_node_ptr_generation(parent, parent_slot,
+ trans->transid);
+ btrfs_mark_buffer_dirty(parent);
+ WARN_ON(btrfs_header_generation(parent) != trans->transid);
+ btrfs_free_extent(trans, root, buf->start, buf->len,
+ parent_start, btrfs_header_owner(parent),
+ btrfs_header_generation(parent), level, 1);
+ }
+ if (unlock_orig)
+ btrfs_tree_unlock(buf);
+ free_extent_buffer(buf);
+ btrfs_mark_buffer_dirty(cow);
+ *cow_ret = cow;
+ return 0;
+}
+
+/*
+ * cows a single block, see __btrfs_cow_block for the real work.
+ * This version of it has extra checks so that a block isn't cow'd more than
+ * once per transaction, as long as it hasn't been written yet
+ */
+noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret, u64 prealloc_dest)
+{
+ u64 search_start;
+ int ret;
+
+ if (trans->transaction != root->fs_info->running_transaction) {
+ printk(KERN_CRIT "trans %llu running %llu\n",
+ (unsigned long long)trans->transid,
+ (unsigned long long)
+ root->fs_info->running_transaction->transid);
+ WARN_ON(1);
+ }
+ if (trans->transid != root->fs_info->generation) {
+ printk(KERN_CRIT "trans %llu running %llu\n",
+ (unsigned long long)trans->transid,
+ (unsigned long long)root->fs_info->generation);
+ WARN_ON(1);
+ }
+
+ spin_lock(&root->fs_info->hash_lock);
+ if (btrfs_header_generation(buf) == trans->transid &&
+ btrfs_header_owner(buf) == root->root_key.objectid &&
+ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ *cow_ret = buf;
+ spin_unlock(&root->fs_info->hash_lock);
+ WARN_ON(prealloc_dest);
+ return 0;
+ }
+ spin_unlock(&root->fs_info->hash_lock);
+ search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+ ret = __btrfs_cow_block(trans, root, buf, parent,
+ parent_slot, cow_ret, search_start, 0,
+ prealloc_dest);
+ return ret;
+}
+
+/*
+ * helper function for defrag to decide if two blocks pointed to by a
+ * node are actually close by
+ */
+static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
+{
+ if (blocknr < other && other - (blocknr + blocksize) < 32768)
+ return 1;
+ if (blocknr > other && blocknr - (other + blocksize) < 32768)
+ return 1;
+ return 0;
+}
+
+/*
+ * compare two keys in a memcmp fashion
+ */
+static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
+{
+ struct btrfs_key k1;
+
+ btrfs_disk_key_to_cpu(&k1, disk);
+
+ if (k1.objectid > k2->objectid)
+ return 1;
+ if (k1.objectid < k2->objectid)
+ return -1;
+ if (k1.type > k2->type)
+ return 1;
+ if (k1.type < k2->type)
+ return -1;
+ if (k1.offset > k2->offset)
+ return 1;
+ if (k1.offset < k2->offset)
+ return -1;
+ return 0;
+}
+
+/*
+ * same as comp_keys only with two btrfs_key's
+ */
+static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+{
+ if (k1->objectid > k2->objectid)
+ return 1;
+ if (k1->objectid < k2->objectid)
+ return -1;
+ if (k1->type > k2->type)
+ return 1;
+ if (k1->type < k2->type)
+ return -1;
+ if (k1->offset > k2->offset)
+ return 1;
+ if (k1->offset < k2->offset)
+ return -1;
+ return 0;
+}
+
+/*
+ * this is used by the defrag code to go through all the
+ * leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order
+ */
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *parent,
+ int start_slot, int cache_only, u64 *last_ret,
+ struct btrfs_key *progress)
+{
+ struct extent_buffer *cur;
+ u64 blocknr;
+ u64 gen;
+ u64 search_start = *last_ret;
+ u64 last_block = 0;
+ u64 other;
+ u32 parent_nritems;
+ int end_slot;
+ int i;
+ int err = 0;
+ int parent_level;
+ int uptodate;
+ u32 blocksize;
+ int progress_passed = 0;
+ struct btrfs_disk_key disk_key;
+
+ parent_level = btrfs_header_level(parent);
+ if (cache_only && parent_level != 1)
+ return 0;
+
+ if (trans->transaction != root->fs_info->running_transaction)
+ WARN_ON(1);
+ if (trans->transid != root->fs_info->generation)
+ WARN_ON(1);
+
+ parent_nritems = btrfs_header_nritems(parent);
+ blocksize = btrfs_level_size(root, parent_level - 1);
+ end_slot = parent_nritems;
+
+ if (parent_nritems == 1)
+ return 0;
+
+ for (i = start_slot; i < end_slot; i++) {
+ int close = 1;
+
+ if (!parent->map_token) {
+ map_extent_buffer(parent,
+ btrfs_node_key_ptr_offset(i),
+ sizeof(struct btrfs_key_ptr),
+ &parent->map_token, &parent->kaddr,
+ &parent->map_start, &parent->map_len,
+ KM_USER1);
+ }
+ btrfs_node_key(parent, &disk_key, i);
+ if (!progress_passed && comp_keys(&disk_key, progress) < 0)
+ continue;
+
+ progress_passed = 1;
+ blocknr = btrfs_node_blockptr(parent, i);
+ gen = btrfs_node_ptr_generation(parent, i);
+ if (last_block == 0)
+ last_block = blocknr;
+
+ if (i > 0) {
+ other = btrfs_node_blockptr(parent, i - 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (!close && i < end_slot - 2) {
+ other = btrfs_node_blockptr(parent, i + 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (close) {
+ last_block = blocknr;
+ continue;
+ }
+ if (parent->map_token) {
+ unmap_extent_buffer(parent, parent->map_token,
+ KM_USER1);
+ parent->map_token = NULL;
+ }
+
+ cur = btrfs_find_tree_block(root, blocknr, blocksize);
+ if (cur)
+ uptodate = btrfs_buffer_uptodate(cur, gen);
+ else
+ uptodate = 0;
+ if (!cur || !uptodate) {
+ if (cache_only) {
+ free_extent_buffer(cur);
+ continue;
+ }
+ if (!cur) {
+ cur = read_tree_block(root, blocknr,
+ blocksize, gen);
+ } else if (!uptodate) {
+ btrfs_read_buffer(cur, gen);
+ }
+ }
+ if (search_start == 0)
+ search_start = last_block;
+
+ btrfs_tree_lock(cur);
+ err = __btrfs_cow_block(trans, root, cur, parent, i,
+ &cur, search_start,
+ min(16 * blocksize,
+ (end_slot - i) * blocksize), 0);
+ if (err) {
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ break;
+ }
+ search_start = cur->start;
+ last_block = cur->start;
+ *last_ret = search_start;
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ }
+ if (parent->map_token) {
+ unmap_extent_buffer(parent, parent->map_token,
+ KM_USER1);
+ parent->map_token = NULL;
+ }
+ return err;
+}
+
+/*
+ * The leaf data grows from end-to-front in the node.
+ * this returns the address of the start of the last item,
+ * which is the stop of the leaf data stack
+ */
+static inline unsigned int leaf_data_end(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ u32 nr = btrfs_header_nritems(leaf);
+ if (nr == 0)
+ return BTRFS_LEAF_DATA_SIZE(root);
+ return btrfs_item_offset_nr(leaf, nr - 1);
+}
+
+/*
+ * extra debugging checks to make sure all the items in a key are
+ * well formed and in the proper order
+ */
+static int check_node(struct btrfs_root *root, struct btrfs_path *path,
+ int level)
+{
+ struct extent_buffer *parent = NULL;
+ struct extent_buffer *node = path->nodes[level];
+ struct btrfs_disk_key parent_key;
+ struct btrfs_disk_key node_key;
+ int parent_slot;
+ int slot;
+ struct btrfs_key cpukey;
+ u32 nritems = btrfs_header_nritems(node);
+
+ if (path->nodes[level + 1])
+ parent = path->nodes[level + 1];
+
+ slot = path->slots[level];
+ BUG_ON(nritems == 0);
+ if (parent) {
+ parent_slot = path->slots[level + 1];
+ btrfs_node_key(parent, &parent_key, parent_slot);
+ btrfs_node_key(node, &node_key, 0);
+ BUG_ON(memcmp(&parent_key, &node_key,
+ sizeof(struct btrfs_disk_key)));
+ BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
+ btrfs_header_bytenr(node));
+ }
+ BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
+ if (slot != 0) {
+ btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
+ btrfs_node_key(node, &node_key, slot);
+ BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
+ }
+ if (slot < nritems - 1) {
+ btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
+ btrfs_node_key(node, &node_key, slot);
+ BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
+ }
+ return 0;
+}
+
+/*
+ * extra checking to make sure all the items in a leaf are
+ * well formed and in the proper order
+ */
+static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ int level)
+{
+ struct extent_buffer *leaf = path->nodes[level];
+ struct extent_buffer *parent = NULL;
+ int parent_slot;
+ struct btrfs_key cpukey;
+ struct btrfs_disk_key parent_key;
+ struct btrfs_disk_key leaf_key;
+ int slot = path->slots[0];
+
+ u32 nritems = btrfs_header_nritems(leaf);
+
+ if (path->nodes[level + 1])
+ parent = path->nodes[level + 1];
+
+ if (nritems == 0)
+ return 0;
+
+ if (parent) {
+ parent_slot = path->slots[level + 1];
+ btrfs_node_key(parent, &parent_key, parent_slot);
+ btrfs_item_key(leaf, &leaf_key, 0);
+
+ BUG_ON(memcmp(&parent_key, &leaf_key,
+ sizeof(struct btrfs_disk_key)));
+ BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
+ btrfs_header_bytenr(leaf));
+ }
+ if (slot != 0 && slot < nritems - 1) {
+ btrfs_item_key(leaf, &leaf_key, slot);
+ btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
+ if (comp_keys(&leaf_key, &cpukey) <= 0) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d offset bad key\n", slot);
+ BUG_ON(1);
+ }
+ if (btrfs_item_offset_nr(leaf, slot - 1) !=
+ btrfs_item_end_nr(leaf, slot)) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d offset bad\n", slot);
+ BUG_ON(1);
+ }
+ }
+ if (slot < nritems - 1) {
+ btrfs_item_key(leaf, &leaf_key, slot);
+ btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
+ BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
+ if (btrfs_item_offset_nr(leaf, slot) !=
+ btrfs_item_end_nr(leaf, slot + 1)) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d offset bad\n", slot);
+ BUG_ON(1);
+ }
+ }
+ BUG_ON(btrfs_item_offset_nr(leaf, 0) +
+ btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
+ return 0;
+}
+
+static noinline int check_block(struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ return 0;
+ if (level == 0)
+ return check_leaf(root, path, level);
+ return check_node(root, path, level);
+}
+
+/*
+ * search for key in the extent_buffer. The items start at offset p,
+ * and they are item_size apart. There are 'max' items in p.
+ *
+ * the slot in the array is returned via slot, and it points to
+ * the place where you would insert key if it is not found in
+ * the array.
+ *
+ * slot may point to max if the key is bigger than all of the keys
+ */
+static noinline int generic_bin_search(struct extent_buffer *eb,
+ unsigned long p,
+ int item_size, struct btrfs_key *key,
+ int max, int *slot)
+{
+ int low = 0;
+ int high = max;
+ int mid;
+ int ret;
+ struct btrfs_disk_key *tmp = NULL;
+ struct btrfs_disk_key unaligned;
+ unsigned long offset;
+ char *map_token = NULL;
+ char *kaddr = NULL;
+ unsigned long map_start = 0;
+ unsigned long map_len = 0;
+ int err;
+
+ while (low < high) {
+ mid = (low + high) / 2;
+ offset = p + mid * item_size;
+
+ if (!map_token || offset < map_start ||
+ (offset + sizeof(struct btrfs_disk_key)) >
+ map_start + map_len) {
+ if (map_token) {
+ unmap_extent_buffer(eb, map_token, KM_USER0);
+ map_token = NULL;
+ }
+
+ err = map_private_extent_buffer(eb, offset,
+ sizeof(struct btrfs_disk_key),
+ &map_token, &kaddr,
+ &map_start, &map_len, KM_USER0);
+
+ if (!err) {
+ tmp = (struct btrfs_disk_key *)(kaddr + offset -
+ map_start);
+ } else {
+ read_extent_buffer(eb, &unaligned,
+ offset, sizeof(unaligned));
+ tmp = &unaligned;
+ }
+
+ } else {
+ tmp = (struct btrfs_disk_key *)(kaddr + offset -
+ map_start);
+ }
+ ret = comp_keys(tmp, key);
+
+ if (ret < 0)
+ low = mid + 1;
+ else if (ret > 0)
+ high = mid;
+ else {
+ *slot = mid;
+ if (map_token)
+ unmap_extent_buffer(eb, map_token, KM_USER0);
+ return 0;
+ }
+ }
+ *slot = low;
+ if (map_token)
+ unmap_extent_buffer(eb, map_token, KM_USER0);
+ return 1;
+}
+
+/*
+ * simple bin_search frontend that does the right thing for
+ * leaves vs nodes
+ */
+static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+ int level, int *slot)
+{
+ if (level == 0) {
+ return generic_bin_search(eb,
+ offsetof(struct btrfs_leaf, items),
+ sizeof(struct btrfs_item),
+ key, btrfs_header_nritems(eb),
+ slot);
+ } else {
+ return generic_bin_search(eb,
+ offsetof(struct btrfs_node, ptrs),
+ sizeof(struct btrfs_key_ptr),
+ key, btrfs_header_nritems(eb),
+ slot);
+ }
+ return -1;
+}
+
+/* given a node and slot number, this reads the blocks it points to. The
+ * extent buffer is returned with a reference taken (but unlocked).
+ * NULL is returned on error.
+ */
+static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
+ struct extent_buffer *parent, int slot)
+{
+ int level = btrfs_header_level(parent);
+ if (slot < 0)
+ return NULL;
+ if (slot >= btrfs_header_nritems(parent))
+ return NULL;
+
+ BUG_ON(level == 0);
+
+ return read_tree_block(root, btrfs_node_blockptr(parent, slot),
+ btrfs_level_size(root, level - 1),
+ btrfs_node_ptr_generation(parent, slot));
+}
+
+/*
+ * node level balancing, used to make sure nodes are in proper order for
+ * item deletion. We balance from the top down, so we have to make sure
+ * that a deletion won't leave an node completely empty later on.
+ */
+static noinline int balance_level(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct extent_buffer *right = NULL;
+ struct extent_buffer *mid;
+ struct extent_buffer *left = NULL;
+ struct extent_buffer *parent = NULL;
+ int ret = 0;
+ int wret;
+ int pslot;
+ int orig_slot = path->slots[level];
+ int err_on_enospc = 0;
+ u64 orig_ptr;
+
+ if (level == 0)
+ return 0;
+
+ mid = path->nodes[level];
+ WARN_ON(!path->locks[level]);
+ WARN_ON(btrfs_header_generation(mid) != trans->transid);
+
+ orig_ptr = btrfs_node_blockptr(mid, orig_slot);
+
+ if (level < BTRFS_MAX_LEVEL - 1)
+ parent = path->nodes[level + 1];
+ pslot = path->slots[level + 1];
+
+ /*
+ * deal with the case where there is only one pointer in the root
+ * by promoting the node below to a root
+ */
+ if (!parent) {
+ struct extent_buffer *child;
+
+ if (btrfs_header_nritems(mid) != 1)
+ return 0;
+
+ /* promote the child to a root */
+ child = read_node_slot(root, mid, 0);
+ btrfs_tree_lock(child);
+ BUG_ON(!child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
+ BUG_ON(ret);
+
+ spin_lock(&root->node_lock);
+ root->node = child;
+ spin_unlock(&root->node_lock);
+
+ ret = btrfs_update_extent_ref(trans, root, child->start,
+ mid->start, child->start,
+ root->root_key.objectid,
+ trans->transid, level - 1);
+ BUG_ON(ret);
+
+ add_root_to_dirty_list(root);
+ btrfs_tree_unlock(child);
+ path->locks[level] = 0;
+ path->nodes[level] = NULL;
+ clean_tree_block(trans, root, mid);
+ btrfs_tree_unlock(mid);
+ /* once for the path */
+ free_extent_buffer(mid);
+ ret = btrfs_free_extent(trans, root, mid->start, mid->len,
+ mid->start, root->root_key.objectid,
+ btrfs_header_generation(mid),
+ level, 1);
+ /* once for the root ptr */
+ free_extent_buffer(mid);
+ return ret;
+ }
+ if (btrfs_header_nritems(mid) >
+ BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
+ return 0;
+
+ if (btrfs_header_nritems(mid) < 2)
+ err_on_enospc = 1;
+
+ left = read_node_slot(root, parent, pslot - 1);
+ if (left) {
+ btrfs_tree_lock(left);
+ wret = btrfs_cow_block(trans, root, left,
+ parent, pslot - 1, &left, 0);
+ if (wret) {
+ ret = wret;
+ goto enospc;
+ }
+ }
+ right = read_node_slot(root, parent, pslot + 1);
+ if (right) {
+ btrfs_tree_lock(right);
+ wret = btrfs_cow_block(trans, root, right,
+ parent, pslot + 1, &right, 0);
+ if (wret) {
+ ret = wret;
+ goto enospc;
+ }
+ }
+
+ /* first, try to make some room in the middle buffer */
+ if (left) {
+ orig_slot += btrfs_header_nritems(left);
+ wret = push_node_left(trans, root, left, mid, 1);
+ if (wret < 0)
+ ret = wret;
+ if (btrfs_header_nritems(mid) < 2)
+ err_on_enospc = 1;
+ }
+
+ /*
+ * then try to empty the right most buffer into the middle
+ */
+ if (right) {
+ wret = push_node_left(trans, root, mid, right, 1);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+ if (btrfs_header_nritems(right) == 0) {
+ u64 bytenr = right->start;
+ u64 generation = btrfs_header_generation(parent);
+ u32 blocksize = right->len;
+
+ clean_tree_block(trans, root, right);
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ right = NULL;
+ wret = del_ptr(trans, root, path, level + 1, pslot +
+ 1);
+ if (wret)
+ ret = wret;
+ wret = btrfs_free_extent(trans, root, bytenr,
+ blocksize, parent->start,
+ btrfs_header_owner(parent),
+ generation, level, 1);
+ if (wret)
+ ret = wret;
+ } else {
+ struct btrfs_disk_key right_key;
+ btrfs_node_key(right, &right_key, 0);
+ btrfs_set_node_key(parent, &right_key, pslot + 1);
+ btrfs_mark_buffer_dirty(parent);
+ }
+ }
+ if (btrfs_header_nritems(mid) == 1) {
+ /*
+ * we're not allowed to leave a node with one item in the
+ * tree during a delete. A deletion from lower in the tree
+ * could try to delete the only pointer in this node.
+ * So, pull some keys from the left.
+ * There has to be a left pointer at this point because
+ * otherwise we would have pulled some pointers from the
+ * right
+ */
+ BUG_ON(!left);
+ wret = balance_node_right(trans, root, mid, left);
+ if (wret < 0) {
+ ret = wret;
+ goto enospc;
+ }
+ if (wret == 1) {
+ wret = push_node_left(trans, root, left, mid, 1);
+ if (wret < 0)
+ ret = wret;
+ }
+ BUG_ON(wret == 1);
+ }
+ if (btrfs_header_nritems(mid) == 0) {
+ /* we've managed to empty the middle node, drop it */
+ u64 root_gen = btrfs_header_generation(parent);
+ u64 bytenr = mid->start;
+ u32 blocksize = mid->len;
+
+ clean_tree_block(trans, root, mid);
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ mid = NULL;
+ wret = del_ptr(trans, root, path, level + 1, pslot);
+ if (wret)
+ ret = wret;
+ wret = btrfs_free_extent(trans, root, bytenr, blocksize,
+ parent->start,
+ btrfs_header_owner(parent),
+ root_gen, level, 1);
+ if (wret)
+ ret = wret;
+ } else {
+ /* update the parent key to reflect our changes */
+ struct btrfs_disk_key mid_key;
+ btrfs_node_key(mid, &mid_key, 0);
+ btrfs_set_node_key(parent, &mid_key, pslot);
+ btrfs_mark_buffer_dirty(parent);
+ }
+
+ /* update the path */
+ if (left) {
+ if (btrfs_header_nritems(left) > orig_slot) {
+ extent_buffer_get(left);
+ /* left was locked after cow */
+ path->nodes[level] = left;
+ path->slots[level + 1] -= 1;
+ path->slots[level] = orig_slot;
+ if (mid) {
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ }
+ } else {
+ orig_slot -= btrfs_header_nritems(left);
+ path->slots[level] = orig_slot;
+ }
+ }
+ /* double check we haven't messed things up */
+ check_block(root, path, level);
+ if (orig_ptr !=
+ btrfs_node_blockptr(path->nodes[level], path->slots[level]))
+ BUG();
+enospc:
+ if (right) {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ if (left) {
+ if (path->nodes[level] != left)
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ return ret;
+}
+
+/* Node balancing for insertion. Here we only split or push nodes around
+ * when they are completely full. This is also done top down, so we
+ * have to be pessimistic.
+ */
+static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct extent_buffer *right = NULL;
+ struct extent_buffer *mid;
+ struct extent_buffer *left = NULL;
+ struct extent_buffer *parent = NULL;
+ int ret = 0;
+ int wret;
+ int pslot;
+ int orig_slot = path->slots[level];
+ u64 orig_ptr;
+
+ if (level == 0)
+ return 1;
+
+ mid = path->nodes[level];
+ WARN_ON(btrfs_header_generation(mid) != trans->transid);
+ orig_ptr = btrfs_node_blockptr(mid, orig_slot);
+
+ if (level < BTRFS_MAX_LEVEL - 1)
+ parent = path->nodes[level + 1];
+ pslot = path->slots[level + 1];
+
+ if (!parent)
+ return 1;
+
+ left = read_node_slot(root, parent, pslot - 1);
+
+ /* first, try to make some room in the middle buffer */
+ if (left) {
+ u32 left_nr;
+
+ btrfs_tree_lock(left);
+ left_nr = btrfs_header_nritems(left);
+ if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+ wret = 1;
+ } else {
+ ret = btrfs_cow_block(trans, root, left, parent,
+ pslot - 1, &left, 0);
+ if (ret)
+ wret = 1;
+ else {
+ wret = push_node_left(trans, root,
+ left, mid, 0);
+ }
+ }
+ if (wret < 0)
+ ret = wret;
+ if (wret == 0) {
+ struct btrfs_disk_key disk_key;
+ orig_slot += left_nr;
+ btrfs_node_key(mid, &disk_key, 0);
+ btrfs_set_node_key(parent, &disk_key, pslot);
+ btrfs_mark_buffer_dirty(parent);
+ if (btrfs_header_nritems(left) > orig_slot) {
+ path->nodes[level] = left;
+ path->slots[level + 1] -= 1;
+ path->slots[level] = orig_slot;
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ } else {
+ orig_slot -=
+ btrfs_header_nritems(left);
+ path->slots[level] = orig_slot;
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ return 0;
+ }
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ }
+ right = read_node_slot(root, parent, pslot + 1);
+
+ /*
+ * then try to empty the right most buffer into the middle
+ */
+ if (right) {
+ u32 right_nr;
+ btrfs_tree_lock(right);
+ right_nr = btrfs_header_nritems(right);
+ if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+ wret = 1;
+ } else {
+ ret = btrfs_cow_block(trans, root, right,
+ parent, pslot + 1,
+ &right, 0);
+ if (ret)
+ wret = 1;
+ else {
+ wret = balance_node_right(trans, root,
+ right, mid);
+ }
+ }
+ if (wret < 0)
+ ret = wret;
+ if (wret == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_node_key(right, &disk_key, 0);
+ btrfs_set_node_key(parent, &disk_key, pslot + 1);
+ btrfs_mark_buffer_dirty(parent);
+
+ if (btrfs_header_nritems(mid) <= orig_slot) {
+ path->nodes[level] = right;
+ path->slots[level + 1] += 1;
+ path->slots[level] = orig_slot -
+ btrfs_header_nritems(mid);
+ btrfs_tree_unlock(mid);
+ free_extent_buffer(mid);
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 0;
+ }
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 1;
+}
+
+/*
+ * readahead one full node of leaves, finding things that are close
+ * to the block in 'slot', and triggering ra on them.
+ */
+static noinline void reada_for_search(struct btrfs_root *root,
+ struct btrfs_path *path,
+ int level, int slot, u64 objectid)
+{
+ struct extent_buffer *node;
+ struct btrfs_disk_key disk_key;
+ u32 nritems;
+ u64 search;
+ u64 lowest_read;
+ u64 highest_read;
+ u64 nread = 0;
+ int direction = path->reada;
+ struct extent_buffer *eb;
+ u32 nr;
+ u32 blocksize;
+ u32 nscan = 0;
+
+ if (level != 1)
+ return;
+
+ if (!path->nodes[level])
+ return;
+
+ node = path->nodes[level];
+
+ search = btrfs_node_blockptr(node, slot);
+ blocksize = btrfs_level_size(root, level - 1);
+ eb = btrfs_find_tree_block(root, search, blocksize);
+ if (eb) {
+ free_extent_buffer(eb);
+ return;
+ }
+
+ highest_read = search;
+ lowest_read = search;
+
+ nritems = btrfs_header_nritems(node);
+ nr = slot;
+ while (1) {
+ if (direction < 0) {
+ if (nr == 0)
+ break;
+ nr--;
+ } else if (direction > 0) {
+ nr++;
+ if (nr >= nritems)
+ break;
+ }
+ if (path->reada < 0 && objectid) {
+ btrfs_node_key(node, &disk_key, nr);
+ if (btrfs_disk_key_objectid(&disk_key) != objectid)
+ break;
+ }
+ search = btrfs_node_blockptr(node, nr);
+ if ((search >= lowest_read && search <= highest_read) ||
+ (search < lowest_read && lowest_read - search <= 16384) ||
+ (search > highest_read && search - highest_read <= 16384)) {
+ readahead_tree_block(root, search, blocksize,
+ btrfs_node_ptr_generation(node, nr));
+ nread += blocksize;
+ }
+ nscan++;
+ if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
+ break;
+
+ if (nread > (256 * 1024) || nscan > 128)
+ break;
+
+ if (search < lowest_read)
+ lowest_read = search;
+ if (search > highest_read)
+ highest_read = search;
+ }
+}
+
+/*
+ * when we walk down the tree, it is usually safe to unlock the higher layers
+ * in the tree. The exceptions are when our path goes through slot 0, because
+ * operations on the tree might require changing key pointers higher up in the
+ * tree.
+ *
+ * callers might also have set path->keep_locks, which tells this code to keep
+ * the lock if the path points to the last slot in the block. This is part of
+ * walking through the tree, and selecting the next slot in the higher block.
+ *
+ * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
+ * if lowest_unlock is 1, level 0 won't be unlocked
+ */
+static noinline void unlock_up(struct btrfs_path *path, int level,
+ int lowest_unlock)
+{
+ int i;
+ int skip_level = level;
+ int no_skips = 0;
+ struct extent_buffer *t;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ if (!path->nodes[i])
+ break;
+ if (!path->locks[i])
+ break;
+ if (!no_skips && path->slots[i] == 0) {
+ skip_level = i + 1;
+ continue;
+ }
+ if (!no_skips && path->keep_locks) {
+ u32 nritems;
+ t = path->nodes[i];
+ nritems = btrfs_header_nritems(t);
+ if (nritems < 1 || path->slots[i] >= nritems - 1) {
+ skip_level = i + 1;
+ continue;
+ }
+ }
+ if (skip_level < i && i >= lowest_unlock)
+ no_skips = 1;
+
+ t = path->nodes[i];
+ if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+ btrfs_tree_unlock(t);
+ path->locks[i] = 0;
+ }
+ }
+}
+
+/*
+ * look for key in the tree. path is filled in with nodes along the way
+ * if key is found, we return zero and you can find the item in the leaf
+ * level of the path (level 0)
+ *
+ * If the key isn't found, the path points to the slot where it should
+ * be inserted, and 1 is returned. If there are other errors during the
+ * search a negative error number is returned.
+ *
+ * if ins_len > 0, nodes and leaves will be split as we walk down the
+ * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
+ * possible)
+ */
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_path *p, int
+ ins_len, int cow)
+{
+ struct extent_buffer *b;
+ struct extent_buffer *tmp;
+ int slot;
+ int ret;
+ int level;
+ int should_reada = p->reada;
+ int lowest_unlock = 1;
+ int blocksize;
+ u8 lowest_level = 0;
+ u64 blocknr;
+ u64 gen;
+ struct btrfs_key prealloc_block;
+
+ lowest_level = p->lowest_level;
+ WARN_ON(lowest_level && ins_len > 0);
+ WARN_ON(p->nodes[0] != NULL);
+
+ if (ins_len < 0)
+ lowest_unlock = 2;
+
+ prealloc_block.objectid = 0;
+
+again:
+ if (p->skip_locking)
+ b = btrfs_root_node(root);
+ else
+ b = btrfs_lock_root_node(root);
+
+ while (b) {
+ level = btrfs_header_level(b);
+
+ /*
+ * setup the path here so we can release it under lock
+ * contention with the cow code
+ */
+ p->nodes[level] = b;
+ if (!p->skip_locking)
+ p->locks[level] = 1;
+
+ if (cow) {
+ int wret;
+
+ /* is a cow on this block not required */
+ spin_lock(&root->fs_info->hash_lock);
+ if (btrfs_header_generation(b) == trans->transid &&
+ btrfs_header_owner(b) == root->root_key.objectid &&
+ !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
+ spin_unlock(&root->fs_info->hash_lock);
+ goto cow_done;
+ }
+ spin_unlock(&root->fs_info->hash_lock);
+
+ /* ok, we have to cow, is our old prealloc the right
+ * size?
+ */
+ if (prealloc_block.objectid &&
+ prealloc_block.offset != b->len) {
+ btrfs_free_reserved_extent(root,
+ prealloc_block.objectid,
+ prealloc_block.offset);
+ prealloc_block.objectid = 0;
+ }
+
+ /*
+ * for higher level blocks, try not to allocate blocks
+ * with the block and the parent locks held.
+ */
+ if (level > 1 && !prealloc_block.objectid &&
+ btrfs_path_lock_waiting(p, level)) {
+ u32 size = b->len;
+ u64 hint = b->start;
+
+ btrfs_release_path(root, p);
+ ret = btrfs_reserve_extent(trans, root,
+ size, size, 0,
+ hint, (u64)-1,
+ &prealloc_block, 0);
+ BUG_ON(ret);
+ goto again;
+ }
+
+ wret = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1],
+ &b, prealloc_block.objectid);
+ prealloc_block.objectid = 0;
+ if (wret) {
+ free_extent_buffer(b);
+ ret = wret;
+ goto done;
+ }
+ }
+cow_done:
+ BUG_ON(!cow && ins_len);
+ if (level != btrfs_header_level(b))
+ WARN_ON(1);
+ level = btrfs_header_level(b);
+
+ p->nodes[level] = b;
+ if (!p->skip_locking)
+ p->locks[level] = 1;
+
+ ret = check_block(root, p, level);
+ if (ret) {
+ ret = -1;
+ goto done;
+ }
+
+ ret = bin_search(b, key, level, &slot);
+ if (level != 0) {
+ if (ret && slot > 0)
+ slot -= 1;
+ p->slots[level] = slot;
+ if ((p->search_for_split || ins_len > 0) &&
+ btrfs_header_nritems(b) >=
+ BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
+ int sret = split_node(trans, root, p, level);
+ BUG_ON(sret > 0);
+ if (sret) {
+ ret = sret;
+ goto done;
+ }
+ b = p->nodes[level];
+ slot = p->slots[level];
+ } else if (ins_len < 0) {
+ int sret = balance_level(trans, root, p,
+ level);
+ if (sret) {
+ ret = sret;
+ goto done;
+ }
+ b = p->nodes[level];
+ if (!b) {
+ btrfs_release_path(NULL, p);
+ goto again;
+ }
+ slot = p->slots[level];
+ BUG_ON(btrfs_header_nritems(b) == 1);
+ }
+ unlock_up(p, level, lowest_unlock);
+
+ /* this is only true while dropping a snapshot */
+ if (level == lowest_level) {
+ ret = 0;
+ goto done;
+ }
+
+ blocknr = btrfs_node_blockptr(b, slot);
+ gen = btrfs_node_ptr_generation(b, slot);
+ blocksize = btrfs_level_size(root, level - 1);
+
+ tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+ if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+ b = tmp;
+ } else {
+ /*
+ * reduce lock contention at high levels
+ * of the btree by dropping locks before
+ * we read.
+ */
+ if (level > 1) {
+ btrfs_release_path(NULL, p);
+ if (tmp)
+ free_extent_buffer(tmp);
+ if (should_reada)
+ reada_for_search(root, p,
+ level, slot,
+ key->objectid);
+
+ tmp = read_tree_block(root, blocknr,
+ blocksize, gen);
+ if (tmp)
+ free_extent_buffer(tmp);
+ goto again;
+ } else {
+ if (tmp)
+ free_extent_buffer(tmp);
+ if (should_reada)
+ reada_for_search(root, p,
+ level, slot,
+ key->objectid);
+ b = read_node_slot(root, b, slot);
+ }
+ }
+ if (!p->skip_locking)
+ btrfs_tree_lock(b);
+ } else {
+ p->slots[level] = slot;
+ if (ins_len > 0 &&
+ btrfs_leaf_free_space(root, b) < ins_len) {
+ int sret = split_leaf(trans, root, key,
+ p, ins_len, ret == 0);
+ BUG_ON(sret > 0);
+ if (sret) {
+ ret = sret;
+ goto done;
+ }
+ }
+ if (!p->search_for_split)
+ unlock_up(p, level, lowest_unlock);
+ goto done;
+ }
+ }
+ ret = 1;
+done:
+ if (prealloc_block.objectid) {
+ btrfs_free_reserved_extent(root,
+ prealloc_block.objectid,
+ prealloc_block.offset);
+ }
+
+ return ret;
+}
+
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *node_keys,
+ u64 *nodes, int lowest_level)
+{
+ struct extent_buffer *eb;
+ struct extent_buffer *parent;
+ struct btrfs_key key;
+ u64 bytenr;
+ u64 generation;
+ u32 blocksize;
+ int level;
+ int slot;
+ int key_match;
+ int ret;
+
+ eb = btrfs_lock_root_node(root);
+ ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+ BUG_ON(ret);
+
+ parent = eb;
+ while (1) {
+ level = btrfs_header_level(parent);
+ if (level == 0 || level <= lowest_level)
+ break;
+
+ ret = bin_search(parent, &node_keys[lowest_level], level,
+ &slot);
+ if (ret && slot > 0)
+ slot--;
+
+ bytenr = btrfs_node_blockptr(parent, slot);
+ if (nodes[level - 1] == bytenr)
+ break;
+
+ blocksize = btrfs_level_size(root, level - 1);
+ generation = btrfs_node_ptr_generation(parent, slot);
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
+
+ if (generation == trans->transid) {
+ eb = read_tree_block(root, bytenr, blocksize,
+ generation);
+ btrfs_tree_lock(eb);
+ }
+
+ /*
+ * if node keys match and node pointer hasn't been modified
+ * in the running transaction, we can merge the path. for
+ * blocks owened by reloc trees, the node pointer check is
+ * skipped, this is because these blocks are fully controlled
+ * by the space balance code, no one else can modify them.
+ */
+ if (!nodes[level - 1] || !key_match ||
+ (generation == trans->transid &&
+ btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
+ if (level == 1 || level == lowest_level + 1) {
+ if (generation == trans->transid) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ break;
+ }
+
+ if (generation != trans->transid) {
+ eb = read_tree_block(root, bytenr, blocksize,
+ generation);
+ btrfs_tree_lock(eb);
+ }
+
+ ret = btrfs_cow_block(trans, root, eb, parent, slot,
+ &eb, 0);
+ BUG_ON(ret);
+
+ if (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID) {
+ if (!nodes[level - 1]) {
+ nodes[level - 1] = eb->start;
+ memcpy(&node_keys[level - 1], &key,
+ sizeof(node_keys[0]));
+ } else {
+ WARN_ON(1);
+ }
+ }
+
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ parent = eb;
+ continue;
+ }
+
+ btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
+ btrfs_set_node_ptr_generation(parent, slot, trans->transid);
+ btrfs_mark_buffer_dirty(parent);
+
+ ret = btrfs_inc_extent_ref(trans, root,
+ nodes[level - 1],
+ blocksize, parent->start,
+ btrfs_header_owner(parent),
+ btrfs_header_generation(parent),
+ level - 1);
+ BUG_ON(ret);
+
+ /*
+ * If the block was created in the running transaction,
+ * it's possible this is the last reference to it, so we
+ * should drop the subtree.
+ */
+ if (generation == trans->transid) {
+ ret = btrfs_drop_subtree(trans, root, eb, parent);
+ BUG_ON(ret);
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ } else {
+ ret = btrfs_free_extent(trans, root, bytenr,
+ blocksize, parent->start,
+ btrfs_header_owner(parent),
+ btrfs_header_generation(parent),
+ level - 1, 1);
+ BUG_ON(ret);
+ }
+ break;
+ }
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ return 0;
+}
+
+/*
+ * adjust the pointers going up the tree, starting at level
+ * making sure the right key of each node is points to 'key'.
+ * This is used after shifting pointers to the left, so it stops
+ * fixing up pointers when a given leaf/node is not in slot 0 of the
+ * higher levels
+ *
+ * If this fails to write a tree block, it returns -1, but continues
+ * fixing up the blocks in ram so the tree is consistent.
+ */
+static int fixup_low_keys(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_disk_key *key, int level)
+{
+ int i;
+ int ret = 0;
+ struct extent_buffer *t;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ int tslot = path->slots[i];
+ if (!path->nodes[i])
+ break;
+ t = path->nodes[i];
+ btrfs_set_node_key(t, key, tslot);
+ btrfs_mark_buffer_dirty(path->nodes[i]);
+ if (tslot != 0)
+ break;
+ }
+ return ret;
+}
+
+/*
+ * update item key.
+ *
+ * This function isn't completely safe. It's the caller's responsibility
+ * that the new key won't break the order
+ */
+int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *eb;
+ int slot;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot > 0) {
+ btrfs_item_key(eb, &disk_key, slot - 1);
+ if (comp_keys(&disk_key, new_key) >= 0)
+ return -1;
+ }
+ if (slot < btrfs_header_nritems(eb) - 1) {
+ btrfs_item_key(eb, &disk_key, slot + 1);
+ if (comp_keys(&disk_key, new_key) <= 0)
+ return -1;
+ }
+
+ btrfs_cpu_key_to_disk(&disk_key, new_key);
+ btrfs_set_item_key(eb, &disk_key, slot);
+ btrfs_mark_buffer_dirty(eb);
+ if (slot == 0)
+ fixup_low_keys(trans, root, path, &disk_key, 1);
+ return 0;
+}
+
+/*
+ * try to push data from one node into the next node left in the
+ * tree.
+ *
+ * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the left hand block.
+ */
+static int push_node_left(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *dst,
+ struct extent_buffer *src, int empty)
+{
+ int push_items = 0;
+ int src_nritems;
+ int dst_nritems;
+ int ret = 0;
+
+ src_nritems = btrfs_header_nritems(src);
+ dst_nritems = btrfs_header_nritems(dst);
+ push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+ WARN_ON(btrfs_header_generation(src) != trans->transid);
+ WARN_ON(btrfs_header_generation(dst) != trans->transid);
+
+ if (!empty && src_nritems <= 8)
+ return 1;
+
+ if (push_items <= 0)
+ return 1;
+
+ if (empty) {
+ push_items = min(src_nritems, push_items);
+ if (push_items < src_nritems) {
+ /* leave at least 8 pointers in the node if
+ * we aren't going to empty it
+ */
+ if (src_nritems - push_items < 8) {
+ if (push_items <= 8)
+ return 1;
+ push_items -= 8;
+ }
+ }
+ } else
+ push_items = min(src_nritems - 8, push_items);
+
+ copy_extent_buffer(dst, src,
+ btrfs_node_key_ptr_offset(dst_nritems),
+ btrfs_node_key_ptr_offset(0),
+ push_items * sizeof(struct btrfs_key_ptr));
+
+ if (push_items < src_nritems) {
+ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(push_items),
+ (src_nritems - push_items) *
+ sizeof(struct btrfs_key_ptr));
+ }
+ btrfs_set_header_nritems(src, src_nritems - push_items);
+ btrfs_set_header_nritems(dst, dst_nritems + push_items);
+ btrfs_mark_buffer_dirty(src);
+ btrfs_mark_buffer_dirty(dst);
+
+ ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
+ BUG_ON(ret);
+
+ return ret;
+}
+
+/*
+ * try to push data from one node into the next node right in the
+ * tree.
+ *
+ * returns 0 if some ptrs were pushed, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the right hand block.
+ *
+ * this will only push up to 1/2 the contents of the left node over
+ */
+static int balance_node_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *dst,
+ struct extent_buffer *src)
+{
+ int push_items = 0;
+ int max_push;
+ int src_nritems;
+ int dst_nritems;
+ int ret = 0;
+
+ WARN_ON(btrfs_header_generation(src) != trans->transid);
+ WARN_ON(btrfs_header_generation(dst) != trans->transid);
+
+ src_nritems = btrfs_header_nritems(src);
+ dst_nritems = btrfs_header_nritems(dst);
+ push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+ if (push_items <= 0)
+ return 1;
+
+ if (src_nritems < 4)
+ return 1;
+
+ max_push = src_nritems / 2 + 1;
+ /* don't try to empty the node */
+ if (max_push >= src_nritems)
+ return 1;
+
+ if (max_push < push_items)
+ push_items = max_push;
+
+ memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
+ btrfs_node_key_ptr_offset(0),
+ (dst_nritems) *
+ sizeof(struct btrfs_key_ptr));
+
+ copy_extent_buffer(dst, src,
+ btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(src_nritems - push_items),
+ push_items * sizeof(struct btrfs_key_ptr));
+
+ btrfs_set_header_nritems(src, src_nritems - push_items);
+ btrfs_set_header_nritems(dst, dst_nritems + push_items);
+
+ btrfs_mark_buffer_dirty(src);
+ btrfs_mark_buffer_dirty(dst);
+
+ ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
+ BUG_ON(ret);
+
+ return ret;
+}
+
+/*
+ * helper function to insert a new root level in the tree.
+ * A new node is allocated, and a single item is inserted to
+ * point to the existing root
+ *
+ * returns zero on success or < 0 on failure.
+ */
+static noinline int insert_new_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ u64 lower_gen;
+ struct extent_buffer *lower;
+ struct extent_buffer *c;
+ struct extent_buffer *old;
+ struct btrfs_disk_key lower_key;
+ int ret;
+
+ BUG_ON(path->nodes[level]);
+ BUG_ON(path->nodes[level-1] != root->node);
+
+ lower = path->nodes[level-1];
+ if (level == 1)
+ btrfs_item_key(lower, &lower_key, 0);
+ else
+ btrfs_node_key(lower, &lower_key, 0);
+
+ c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
+ root->root_key.objectid, trans->transid,
+ level, root->node->start, 0);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+
+ memset_extent_buffer(c, 0, 0, root->nodesize);
+ btrfs_set_header_nritems(c, 1);
+ btrfs_set_header_level(c, level);
+ btrfs_set_header_bytenr(c, c->start);
+ btrfs_set_header_generation(c, trans->transid);
+ btrfs_set_header_owner(c, root->root_key.objectid);
+
+ write_extent_buffer(c, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(c),
+ BTRFS_FSID_SIZE);
+
+ write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_header_chunk_tree_uuid(c),
+ BTRFS_UUID_SIZE);
+
+ btrfs_set_node_key(c, &lower_key, 0);
+ btrfs_set_node_blockptr(c, 0, lower->start);
+ lower_gen = btrfs_header_generation(lower);
+ WARN_ON(lower_gen != trans->transid);
+
+ btrfs_set_node_ptr_generation(c, 0, lower_gen);
+
+ btrfs_mark_buffer_dirty(c);
+
+ spin_lock(&root->node_lock);
+ old = root->node;
+ root->node = c;
+ spin_unlock(&root->node_lock);
+
+ ret = btrfs_update_extent_ref(trans, root, lower->start,
+ lower->start, c->start,
+ root->root_key.objectid,
+ trans->transid, level - 1);
+ BUG_ON(ret);
+
+ /* the super has an extra ref to root->node */
+ free_extent_buffer(old);
+
+ add_root_to_dirty_list(root);
+ extent_buffer_get(c);
+ path->nodes[level] = c;
+ path->locks[level] = 1;
+ path->slots[level] = 0;
+ return 0;
+}
+
+/*
+ * worker function to insert a single pointer in a node.
+ * the node should have enough room for the pointer already
+ *
+ * slot and level indicate where you want the key to go, and
+ * blocknr is the block the key points to.
+ *
+ * returns zero on success and < 0 on any error
+ */
+static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, struct btrfs_disk_key
+ *key, u64 bytenr, int slot, int level)
+{
+ struct extent_buffer *lower;
+ int nritems;
+
+ BUG_ON(!path->nodes[level]);
+ lower = path->nodes[level];
+ nritems = btrfs_header_nritems(lower);
+ if (slot > nritems)
+ BUG();
+ if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
+ BUG();
+ if (slot != nritems) {
+ memmove_extent_buffer(lower,
+ btrfs_node_key_ptr_offset(slot + 1),
+ btrfs_node_key_ptr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_key_ptr));
+ }
+ btrfs_set_node_key(lower, key, slot);
+ btrfs_set_node_blockptr(lower, slot, bytenr);
+ WARN_ON(trans->transid == 0);
+ btrfs_set_node_ptr_generation(lower, slot, trans->transid);
+ btrfs_set_header_nritems(lower, nritems + 1);
+ btrfs_mark_buffer_dirty(lower);
+ return 0;
+}
+
+/*
+ * split the node at the specified level in path in two.
+ * The path is corrected to point to the appropriate node after the split
+ *
+ * Before splitting this tries to make some room in the node by pushing
+ * left and right, if either one works, it returns right away.
+ *
+ * returns 0 on success and < 0 on failure
+ */
+static noinline int split_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int level)
+{
+ struct extent_buffer *c;
+ struct extent_buffer *split;
+ struct btrfs_disk_key disk_key;
+ int mid;
+ int ret;
+ int wret;
+ u32 c_nritems;
+
+ c = path->nodes[level];
+ WARN_ON(btrfs_header_generation(c) != trans->transid);
+ if (c == root->node) {
+ /* trying to split the root, lets make a new one */
+ ret = insert_new_root(trans, root, path, level + 1);
+ if (ret)
+ return ret;
+ } else {
+ ret = push_nodes_for_insert(trans, root, path, level);
+ c = path->nodes[level];
+ if (!ret && btrfs_header_nritems(c) <
+ BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
+ return 0;
+ if (ret < 0)
+ return ret;
+ }
+
+ c_nritems = btrfs_header_nritems(c);
+
+ split = btrfs_alloc_free_block(trans, root, root->nodesize,
+ path->nodes[level + 1]->start,
+ root->root_key.objectid,
+ trans->transid, level, c->start, 0);
+ if (IS_ERR(split))
+ return PTR_ERR(split);
+
+ btrfs_set_header_flags(split, btrfs_header_flags(c));
+ btrfs_set_header_level(split, btrfs_header_level(c));
+ btrfs_set_header_bytenr(split, split->start);
+ btrfs_set_header_generation(split, trans->transid);
+ btrfs_set_header_owner(split, root->root_key.objectid);
+ btrfs_set_header_flags(split, 0);
+ write_extent_buffer(split, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(split),
+ BTRFS_FSID_SIZE);
+ write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_header_chunk_tree_uuid(split),
+ BTRFS_UUID_SIZE);
+
+ mid = (c_nritems + 1) / 2;
+
+ copy_extent_buffer(split, c,
+ btrfs_node_key_ptr_offset(0),
+ btrfs_node_key_ptr_offset(mid),
+ (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
+ btrfs_set_header_nritems(split, c_nritems - mid);
+ btrfs_set_header_nritems(c, mid);
+ ret = 0;
+
+ btrfs_mark_buffer_dirty(c);
+ btrfs_mark_buffer_dirty(split);
+
+ btrfs_node_key(split, &disk_key, 0);
+ wret = insert_ptr(trans, root, path, &disk_key, split->start,
+ path->slots[level + 1] + 1,
+ level + 1);
+ if (wret)
+ ret = wret;
+
+ ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
+ BUG_ON(ret);
+
+ if (path->slots[level] >= mid) {
+ path->slots[level] -= mid;
+ btrfs_tree_unlock(c);
+ free_extent_buffer(c);
+ path->nodes[level] = split;
+ path->slots[level + 1] += 1;
+ } else {
+ btrfs_tree_unlock(split);
+ free_extent_buffer(split);
+ }
+ return ret;
+}
+
+/*
+ * how many bytes are required to store the items in a leaf. start
+ * and nr indicate which items in the leaf to check. This totals up the
+ * space used both by the item structs and the item data
+ */
+static int leaf_space_used(struct extent_buffer *l, int start, int nr)
+{
+ int data_len;
+ int nritems = btrfs_header_nritems(l);
+ int end = min(nritems, start + nr) - 1;
+
+ if (!nr)
+ return 0;
+ data_len = btrfs_item_end_nr(l, start);
+ data_len = data_len - btrfs_item_offset_nr(l, end);
+ data_len += sizeof(struct btrfs_item) * nr;
+ WARN_ON(data_len < 0);
+ return data_len;
+}
+
+/*
+ * The space between the end of the leaf items and
+ * the start of the leaf data. IOW, how much room
+ * the leaf has left for both items and data
+ */
+noinline int btrfs_leaf_free_space(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ int nritems = btrfs_header_nritems(leaf);
+ int ret;
+ ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
+ if (ret < 0) {
+ printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
+ "used %d nritems %d\n",
+ ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
+ leaf_space_used(leaf, 0, nritems), nritems);
+ }
+ return ret;
+}
+
+/*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int data_size,
+ int empty)
+{
+ struct extent_buffer *left = path->nodes[0];
+ struct extent_buffer *right;
+ struct extent_buffer *upper;
+ struct btrfs_disk_key disk_key;
+ int slot;
+ u32 i;
+ int free_space;
+ int push_space = 0;
+ int push_items = 0;
+ struct btrfs_item *item;
+ u32 left_nritems;
+ u32 nr;
+ u32 right_nritems;
+ u32 data_end;
+ u32 this_item_size;
+ int ret;
+
+ slot = path->slots[1];
+ if (!path->nodes[1])
+ return 1;
+
+ upper = path->nodes[1];
+ if (slot >= btrfs_header_nritems(upper) - 1)
+ return 1;
+
+ WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+
+ right = read_node_slot(root, upper, slot + 1);
+ btrfs_tree_lock(right);
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, right, upper,
+ slot + 1, &right, 0);
+ if (ret)
+ goto out_unlock;
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ left_nritems = btrfs_header_nritems(left);
+ if (left_nritems == 0)
+ goto out_unlock;
+
+ if (empty)
+ nr = 0;
+ else
+ nr = 1;
+
+ if (path->slots[0] >= left_nritems)
+ push_space += data_size;
+
+ i = left_nritems - 1;
+ while (i >= nr) {
+ item = btrfs_item_nr(left, i);
+
+ if (!empty && push_items > 0) {
+ if (path->slots[0] > i)
+ break;
+ if (path->slots[0] == i) {
+ int space = btrfs_leaf_free_space(root, left);
+ if (space + push_space * 2 > free_space)
+ break;
+ }
+ }
+
+ if (path->slots[0] == i)
+ push_space += data_size;
+
+ if (!left->map_token) {
+ map_extent_buffer(left, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &left->map_token, &left->kaddr,
+ &left->map_start, &left->map_len,
+ KM_USER1);
+ }
+
+ this_item_size = btrfs_item_size(left, item);
+ if (this_item_size + sizeof(*item) + push_space > free_space)
+ break;
+
+ push_items++;
+ push_space += this_item_size + sizeof(*item);
+ if (i == 0)
+ break;
+ i--;
+ }
+ if (left->map_token) {
+ unmap_extent_buffer(left, left->map_token, KM_USER1);
+ left->map_token = NULL;
+ }
+
+ if (push_items == 0)
+ goto out_unlock;
+
+ if (!empty && push_items == left_nritems)
+ WARN_ON(1);
+
+ /* push left to right */
+ right_nritems = btrfs_header_nritems(right);
+
+ push_space = btrfs_item_end_nr(left, left_nritems - push_items);
+ push_space -= leaf_data_end(root, left);
+
+ /* make room in the right data area */
+ data_end = leaf_data_end(root, right);
+ memmove_extent_buffer(right,
+ btrfs_leaf_data(right) + data_end - push_space,
+ btrfs_leaf_data(right) + data_end,
+ BTRFS_LEAF_DATA_SIZE(root) - data_end);
+
+ /* copy from the left data area */
+ copy_extent_buffer(right, left, btrfs_leaf_data(right) +
+ BTRFS_LEAF_DATA_SIZE(root) - push_space,
+ btrfs_leaf_data(left) + leaf_data_end(root, left),
+ push_space);
+
+ memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
+ btrfs_item_nr_offset(0),
+ right_nritems * sizeof(struct btrfs_item));
+
+ /* copy the items from left to right */
+ copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(left_nritems - push_items),
+ push_items * sizeof(struct btrfs_item));
+
+ /* update the item pointers */
+ right_nritems += push_items;
+ btrfs_set_header_nritems(right, right_nritems);
+ push_space = BTRFS_LEAF_DATA_SIZE(root);
+ for (i = 0; i < right_nritems; i++) {
+ item = btrfs_item_nr(right, i);
+ if (!right->map_token) {
+ map_extent_buffer(right, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &right->map_token, &right->kaddr,
+ &right->map_start, &right->map_len,
+ KM_USER1);
+ }
+ push_space -= btrfs_item_size(right, item);
+ btrfs_set_item_offset(right, item, push_space);
+ }
+
+ if (right->map_token) {
+ unmap_extent_buffer(right, right->map_token, KM_USER1);
+ right->map_token = NULL;
+ }
+ left_nritems -= push_items;
+ btrfs_set_header_nritems(left, left_nritems);
+
+ if (left_nritems)
+ btrfs_mark_buffer_dirty(left);
+ btrfs_mark_buffer_dirty(right);
+
+ ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
+ BUG_ON(ret);
+
+ btrfs_item_key(right, &disk_key, 0);
+ btrfs_set_node_key(upper, &disk_key, slot + 1);
+ btrfs_mark_buffer_dirty(upper);
+
+ /* then fixup the leaf pointer in the path */
+ if (path->slots[0] >= left_nritems) {
+ path->slots[0] -= left_nritems;
+ if (btrfs_header_nritems(path->nodes[0]) == 0)
+ clean_tree_block(trans, root, path->nodes[0]);
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[1] += 1;
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+ return 0;
+
+out_unlock:
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return 1;
+}
+
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int data_size,
+ int empty)
+{
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *right = path->nodes[0];
+ struct extent_buffer *left;
+ int slot;
+ int i;
+ int free_space;
+ int push_space = 0;
+ int push_items = 0;
+ struct btrfs_item *item;
+ u32 old_left_nritems;
+ u32 right_nritems;
+ u32 nr;
+ int ret = 0;
+ int wret;
+ u32 this_item_size;
+ u32 old_left_item_size;
+
+ slot = path->slots[1];
+ if (slot == 0)
+ return 1;
+ if (!path->nodes[1])
+ return 1;
+
+ right_nritems = btrfs_header_nritems(right);
+ if (right_nritems == 0)
+ return 1;
+
+ WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+
+ left = read_node_slot(root, path->nodes[1], slot - 1);
+ btrfs_tree_lock(left);
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, left,
+ path->nodes[1], slot - 1, &left, 0);
+ if (ret) {
+ /* we hit -ENOSPC, but it isn't fatal here */
+ ret = 1;
+ goto out;
+ }
+
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ if (empty)
+ nr = right_nritems;
+ else
+ nr = right_nritems - 1;
+
+ for (i = 0; i < nr; i++) {
+ item = btrfs_item_nr(right, i);
+ if (!right->map_token) {
+ map_extent_buffer(right, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &right->map_token, &right->kaddr,
+ &right->map_start, &right->map_len,
+ KM_USER1);
+ }
+
+ if (!empty && push_items > 0) {
+ if (path->slots[0] < i)
+ break;
+ if (path->slots[0] == i) {
+ int space = btrfs_leaf_free_space(root, right);
+ if (space + push_space * 2 > free_space)
+ break;
+ }
+ }
+
+ if (path->slots[0] == i)
+ push_space += data_size;
+
+ this_item_size = btrfs_item_size(right, item);
+ if (this_item_size + sizeof(*item) + push_space > free_space)
+ break;
+
+ push_items++;
+ push_space += this_item_size + sizeof(*item);
+ }
+
+ if (right->map_token) {
+ unmap_extent_buffer(right, right->map_token, KM_USER1);
+ right->map_token = NULL;
+ }
+
+ if (push_items == 0) {
+ ret = 1;
+ goto out;
+ }
+ if (!empty && push_items == btrfs_header_nritems(right))
+ WARN_ON(1);
+
+ /* push data from right to left */
+ copy_extent_buffer(left, right,
+ btrfs_item_nr_offset(btrfs_header_nritems(left)),
+ btrfs_item_nr_offset(0),
+ push_items * sizeof(struct btrfs_item));
+
+ push_space = BTRFS_LEAF_DATA_SIZE(root) -
+ btrfs_item_offset_nr(right, push_items - 1);
+
+ copy_extent_buffer(left, right, btrfs_leaf_data(left) +
+ leaf_data_end(root, left) - push_space,
+ btrfs_leaf_data(right) +
+ btrfs_item_offset_nr(right, push_items - 1),
+ push_space);
+ old_left_nritems = btrfs_header_nritems(left);
+ BUG_ON(old_left_nritems <= 0);
+
+ old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
+ for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
+ u32 ioff;
+
+ item = btrfs_item_nr(left, i);
+ if (!left->map_token) {
+ map_extent_buffer(left, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &left->map_token, &left->kaddr,
+ &left->map_start, &left->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(left, item);
+ btrfs_set_item_offset(left, item,
+ ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
+ }
+ btrfs_set_header_nritems(left, old_left_nritems + push_items);
+ if (left->map_token) {
+ unmap_extent_buffer(left, left->map_token, KM_USER1);
+ left->map_token = NULL;
+ }
+
+ /* fixup right node */
+ if (push_items > right_nritems) {
+ printk(KERN_CRIT "push items %d nr %u\n", push_items,
+ right_nritems);
+ WARN_ON(1);
+ }
+
+ if (push_items < right_nritems) {
+ push_space = btrfs_item_offset_nr(right, push_items - 1) -
+ leaf_data_end(root, right);
+ memmove_extent_buffer(right, btrfs_leaf_data(right) +
+ BTRFS_LEAF_DATA_SIZE(root) - push_space,
+ btrfs_leaf_data(right) +
+ leaf_data_end(root, right), push_space);
+
+ memmove_extent_buffer(right, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(push_items),
+ (btrfs_header_nritems(right) - push_items) *
+ sizeof(struct btrfs_item));
+ }
+ right_nritems -= push_items;
+ btrfs_set_header_nritems(right, right_nritems);
+ push_space = BTRFS_LEAF_DATA_SIZE(root);
+ for (i = 0; i < right_nritems; i++) {
+ item = btrfs_item_nr(right, i);
+
+ if (!right->map_token) {
+ map_extent_buffer(right, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &right->map_token, &right->kaddr,
+ &right->map_start, &right->map_len,
+ KM_USER1);
+ }
+
+ push_space = push_space - btrfs_item_size(right, item);
+ btrfs_set_item_offset(right, item, push_space);
+ }
+ if (right->map_token) {
+ unmap_extent_buffer(right, right->map_token, KM_USER1);
+ right->map_token = NULL;
+ }
+
+ btrfs_mark_buffer_dirty(left);
+ if (right_nritems)
+ btrfs_mark_buffer_dirty(right);
+
+ ret = btrfs_update_ref(trans, root, right, left,
+ old_left_nritems, push_items);
+ BUG_ON(ret);
+
+ btrfs_item_key(right, &disk_key, 0);
+ wret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ if (wret)
+ ret = wret;
+
+ /* then fixup the leaf pointer in the path */
+ if (path->slots[0] < push_items) {
+ path->slots[0] += old_left_nritems;
+ if (btrfs_header_nritems(path->nodes[0]) == 0)
+ clean_tree_block(trans, root, path->nodes[0]);
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = left;
+ path->slots[1] -= 1;
+ } else {
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ path->slots[0] -= push_items;
+ }
+ BUG_ON(path->slots[0] < 0);
+ return ret;
+out:
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int split_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *ins_key,
+ struct btrfs_path *path, int data_size,
+ int extend)
+{
+ struct extent_buffer *l;
+ u32 nritems;
+ int mid;
+ int slot;
+ struct extent_buffer *right;
+ int data_copy_size;
+ int rt_data_off;
+ int i;
+ int ret = 0;
+ int wret;
+ int double_split;
+ int num_doubles = 0;
+ struct btrfs_disk_key disk_key;
+
+ /* first try to make some room by pushing left and right */
+ if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+ wret = push_leaf_right(trans, root, path, data_size, 0);
+ if (wret < 0)
+ return wret;
+ if (wret) {
+ wret = push_leaf_left(trans, root, path, data_size, 0);
+ if (wret < 0)
+ return wret;
+ }
+ l = path->nodes[0];
+
+ /* did the pushes work? */
+ if (btrfs_leaf_free_space(root, l) >= data_size)
+ return 0;
+ }
+
+ if (!path->nodes[1]) {
+ ret = insert_new_root(trans, root, path, 1);
+ if (ret)
+ return ret;
+ }
+again:
+ double_split = 0;
+ l = path->nodes[0];
+ slot = path->slots[0];
+ nritems = btrfs_header_nritems(l);
+ mid = (nritems + 1) / 2;
+
+ right = btrfs_alloc_free_block(trans, root, root->leafsize,
+ path->nodes[1]->start,
+ root->root_key.objectid,
+ trans->transid, 0, l->start, 0);
+ if (IS_ERR(right)) {
+ BUG_ON(1);
+ return PTR_ERR(right);
+ }
+
+ memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
+ btrfs_set_header_bytenr(right, right->start);
+ btrfs_set_header_generation(right, trans->transid);
+ btrfs_set_header_owner(right, root->root_key.objectid);
+ btrfs_set_header_level(right, 0);
+ write_extent_buffer(right, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(right),
+ BTRFS_FSID_SIZE);
+
+ write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_header_chunk_tree_uuid(right),
+ BTRFS_UUID_SIZE);
+ if (mid <= slot) {
+ if (nritems == 1 ||
+ leaf_space_used(l, mid, nritems - mid) + data_size >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ if (slot >= nritems) {
+ btrfs_cpu_key_to_disk(&disk_key, ins_key);
+ btrfs_set_header_nritems(right, 0);
+ wret = insert_ptr(trans, root, path,
+ &disk_key, right->start,
+ path->slots[1] + 1, 1);
+ if (wret)
+ ret = wret;
+
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ path->slots[1] += 1;
+ btrfs_mark_buffer_dirty(right);
+ return ret;
+ }
+ mid = slot;
+ if (mid != nritems &&
+ leaf_space_used(l, mid, nritems - mid) +
+ data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ double_split = 1;
+ }
+ }
+ } else {
+ if (leaf_space_used(l, 0, mid) + data_size >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ if (!extend && data_size && slot == 0) {
+ btrfs_cpu_key_to_disk(&disk_key, ins_key);
+ btrfs_set_header_nritems(right, 0);
+ wret = insert_ptr(trans, root, path,
+ &disk_key,
+ right->start,
+ path->slots[1], 1);
+ if (wret)
+ ret = wret;
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ if (path->slots[1] == 0) {
+ wret = fixup_low_keys(trans, root,
+ path, &disk_key, 1);
+ if (wret)
+ ret = wret;
+ }
+ btrfs_mark_buffer_dirty(right);
+ return ret;
+ } else if ((extend || !data_size) && slot == 0) {
+ mid = 1;
+ } else {
+ mid = slot;
+ if (mid != nritems &&
+ leaf_space_used(l, mid, nritems - mid) +
+ data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ double_split = 1;
+ }
+ }
+ }
+ }
+ nritems = nritems - mid;
+ btrfs_set_header_nritems(right, nritems);
+ data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+
+ copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(mid),
+ nritems * sizeof(struct btrfs_item));
+
+ copy_extent_buffer(right, l,
+ btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+ data_copy_size, btrfs_leaf_data(l) +
+ leaf_data_end(root, l), data_copy_size);
+
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+ btrfs_item_end_nr(l, mid);
+
+ for (i = 0; i < nritems; i++) {
+ struct btrfs_item *item = btrfs_item_nr(right, i);
+ u32 ioff;
+
+ if (!right->map_token) {
+ map_extent_buffer(right, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &right->map_token, &right->kaddr,
+ &right->map_start, &right->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(right, item);
+ btrfs_set_item_offset(right, item, ioff + rt_data_off);
+ }
+
+ if (right->map_token) {
+ unmap_extent_buffer(right, right->map_token, KM_USER1);
+ right->map_token = NULL;
+ }
+
+ btrfs_set_header_nritems(l, mid);
+ ret = 0;
+ btrfs_item_key(right, &disk_key, 0);
+ wret = insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
+ if (wret)
+ ret = wret;
+
+ btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(l);
+ BUG_ON(path->slots[0] != slot);
+
+ ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+ BUG_ON(ret);
+
+ if (mid <= slot) {
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] -= mid;
+ path->slots[1] += 1;
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+
+ BUG_ON(path->slots[0] < 0);
+
+ if (double_split) {
+ BUG_ON(num_doubles != 0);
+ num_doubles++;
+ goto again;
+ }
+ return ret;
+}
+
+/*
+ * This function splits a single item into two items,
+ * giving 'new_key' to the new item and splitting the
+ * old one at split_offset (from the start of the item).
+ *
+ * The path may be released by this operation. After
+ * the split, the path is pointing to the old item. The
+ * new item is going to be in the same node as the old one.
+ *
+ * Note, the item being split must be smaller enough to live alone on
+ * a tree block with room for one extra struct btrfs_item
+ *
+ * This allows us to split the item in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ unsigned long split_offset)
+{
+ u32 item_size;
+ struct extent_buffer *leaf;
+ struct btrfs_key orig_key;
+ struct btrfs_item *item;
+ struct btrfs_item *new_item;
+ int ret = 0;
+ int slot;
+ u32 nritems;
+ u32 orig_offset;
+ struct btrfs_disk_key disk_key;
+ char *buf;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
+ if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
+ goto split;
+
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ btrfs_release_path(root, path);
+
+ path->search_for_split = 1;
+ path->keep_locks = 1;
+
+ ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
+ path->search_for_split = 0;
+
+ /* if our item isn't there or got smaller, return now */
+ if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
+ path->slots[0])) {
+ path->keep_locks = 0;
+ return -EAGAIN;
+ }
+
+ ret = split_leaf(trans, root, &orig_key, path,
+ sizeof(struct btrfs_item), 1);
+ path->keep_locks = 0;
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+
+split:
+ item = btrfs_item_nr(leaf, path->slots[0]);
+ orig_offset = btrfs_item_offset(leaf, item);
+ item_size = btrfs_item_size(leaf, item);
+
+
+ buf = kmalloc(item_size, GFP_NOFS);
+ read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
+ path->slots[0]), item_size);
+ slot = path->slots[0] + 1;
+ leaf = path->nodes[0];
+
+ nritems = btrfs_header_nritems(leaf);
+
+ if (slot != nritems) {
+ /* shift the items */
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
+ btrfs_item_nr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_item));
+
+ }
+
+ btrfs_cpu_key_to_disk(&disk_key, new_key);
+ btrfs_set_item_key(leaf, &disk_key, slot);
+
+ new_item = btrfs_item_nr(leaf, slot);
+
+ btrfs_set_item_offset(leaf, new_item, orig_offset);
+ btrfs_set_item_size(leaf, new_item, item_size - split_offset);
+
+ btrfs_set_item_offset(leaf, item,
+ orig_offset + item_size - split_offset);
+ btrfs_set_item_size(leaf, item, split_offset);
+
+ btrfs_set_header_nritems(leaf, nritems + 1);
+
+ /* write the data for the start of the original item */
+ write_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ split_offset);
+
+ /* write the data for the new item */
+ write_extent_buffer(leaf, buf + split_offset,
+ btrfs_item_ptr_offset(leaf, slot),
+ item_size - split_offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+ kfree(buf);
+ return ret;
+}
+
+/*
+ * make the item pointed to by the path smaller. new_size indicates
+ * how small to make it, and from_end tells us if we just chop bytes
+ * off the end of the item or if we shift the item to chop bytes off
+ * the front.
+ */
+int btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u32 new_size, int from_end)
+{
+ int ret = 0;
+ int slot;
+ int slot_orig;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ u32 nritems;
+ unsigned int data_end;
+ unsigned int old_data_start;
+ unsigned int old_size;
+ unsigned int size_diff;
+ int i;
+
+ slot_orig = path->slots[0];
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ old_size = btrfs_item_size_nr(leaf, slot);
+ if (old_size == new_size)
+ return 0;
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(root, leaf);
+
+ old_data_start = btrfs_item_offset_nr(leaf, slot);
+
+ size_diff = old_size - new_size;
+
+ BUG_ON(slot < 0);
+ BUG_ON(slot >= nritems);
+
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+ item = btrfs_item_nr(leaf, i);
+
+ if (!leaf->map_token) {
+ map_extent_buffer(leaf, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(leaf, item);
+ btrfs_set_item_offset(leaf, item, ioff + size_diff);
+ }
+
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
+
+ /* shift the data */
+ if (from_end) {
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end + size_diff, btrfs_leaf_data(leaf) +
+ data_end, old_data_start + new_size - data_end);
+ } else {
+ struct btrfs_disk_key disk_key;
+ u64 offset;
+
+ btrfs_item_key(leaf, &disk_key, slot);
+
+ if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
+ unsigned long ptr;
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ fi = (struct btrfs_file_extent_item *)(
+ (unsigned long)fi - size_diff);
+
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ memmove_extent_buffer(leaf, ptr,
+ (unsigned long)fi,
+ offsetof(struct btrfs_file_extent_item,
+ disk_bytenr));
+ }
+ }
+
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end + size_diff, btrfs_leaf_data(leaf) +
+ data_end, old_data_start - data_end);
+
+ offset = btrfs_disk_key_offset(&disk_key);
+ btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
+ btrfs_set_item_key(leaf, &disk_key, slot);
+ if (slot == 0)
+ fixup_low_keys(trans, root, path, &disk_key, 1);
+ }
+
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_set_item_size(leaf, item, new_size);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+ return ret;
+}
+
+/*
+ * make the item pointed to by the path bigger, data_size is the new size.
+ */
+int btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ u32 data_size)
+{
+ int ret = 0;
+ int slot;
+ int slot_orig;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ u32 nritems;
+ unsigned int data_end;
+ unsigned int old_data;
+ unsigned int old_size;
+ int i;
+
+ slot_orig = path->slots[0];
+ leaf = path->nodes[0];
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(root, leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < data_size) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+ slot = path->slots[0];
+ old_data = btrfs_item_end_nr(leaf, slot);
+
+ BUG_ON(slot < 0);
+ if (slot >= nritems) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d too large, nritems %d\n",
+ slot, nritems);
+ BUG_ON(1);
+ }
+
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+ item = btrfs_item_nr(leaf, i);
+
+ if (!leaf->map_token) {
+ map_extent_buffer(leaf, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
+ }
+ ioff = btrfs_item_offset(leaf, item);
+ btrfs_set_item_offset(leaf, item, ioff - data_size);
+ }
+
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
+
+ /* shift the data */
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end - data_size, btrfs_leaf_data(leaf) +
+ data_end, old_data - data_end);
+
+ data_end = old_data;
+ old_size = btrfs_item_size_nr(leaf, slot);
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_set_item_size(leaf, item, old_size + data_size);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+ return ret;
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ * Returns the number of keys that were inserted.
+ */
+int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ int nr)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ int ret = 0;
+ int slot;
+ int i;
+ u32 nritems;
+ u32 total_data = 0;
+ u32 total_size = 0;
+ unsigned int data_end;
+ struct btrfs_disk_key disk_key;
+ struct btrfs_key found_key;
+
+ for (i = 0; i < nr; i++) {
+ if (total_size + data_size[i] + sizeof(struct btrfs_item) >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ break;
+ nr = i;
+ }
+ total_data += data_size[i];
+ total_size += data_size[i] + sizeof(struct btrfs_item);
+ }
+ BUG_ON(nr == 0);
+
+ ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ if (ret == 0)
+ return -EEXIST;
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(root, leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < total_size) {
+ for (i = nr; i >= 0; i--) {
+ total_data -= data_size[i];
+ total_size -= data_size[i] + sizeof(struct btrfs_item);
+ if (total_size < btrfs_leaf_free_space(root, leaf))
+ break;
+ }
+ nr = i;
+ }
+
+ slot = path->slots[0];
+ BUG_ON(slot < 0);
+
+ if (slot != nritems) {
+ unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /* figure out how many keys we can insert in here */
+ total_data = data_size[0];
+ for (i = 1; i < nr; i++) {
+ if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
+ break;
+ total_data += data_size[i];
+ }
+ nr = i;
+
+ if (old_data < data_end) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
+ slot, old_data, data_end);
+ BUG_ON(1);
+ }
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ WARN_ON(leaf->map_token);
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+
+ item = btrfs_item_nr(leaf, i);
+ if (!leaf->map_token) {
+ map_extent_buffer(leaf, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(leaf, item);
+ btrfs_set_item_offset(leaf, item, ioff - total_data);
+ }
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
+
+ /* shift the items */
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ btrfs_item_nr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_item));
+
+ /* shift the data */
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end - total_data, btrfs_leaf_data(leaf) +
+ data_end, old_data - data_end);
+ data_end = old_data;
+ } else {
+ /*
+ * this sucks but it has to be done, if we are inserting at
+ * the end of the leaf only insert 1 of the items, since we
+ * have no way of knowing whats on the next leaf and we'd have
+ * to drop our current locks to figure it out
+ */
+ nr = 1;
+ }
+
+ /* setup the item for the new data */
+ for (i = 0; i < nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ btrfs_set_item_key(leaf, &disk_key, slot + i);
+ item = btrfs_item_nr(leaf, slot + i);
+ btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+ data_end -= data_size[i];
+ btrfs_set_item_size(leaf, item, data_size[i]);
+ }
+ btrfs_set_header_nritems(leaf, nritems + nr);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+ if (slot == 0) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ }
+
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+out:
+ if (!ret)
+ ret = nr;
+ return ret;
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ int nr)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ int ret = 0;
+ int slot;
+ int slot_orig;
+ int i;
+ u32 nritems;
+ u32 total_size = 0;
+ u32 total_data = 0;
+ unsigned int data_end;
+ struct btrfs_disk_key disk_key;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ if (ret == 0)
+ return -EEXIST;
+ if (ret < 0)
+ goto out;
+
+ slot_orig = path->slots[0];
+ leaf = path->nodes[0];
+
+ nritems = btrfs_header_nritems(leaf);
+ data_end = leaf_data_end(root, leaf);
+
+ if (btrfs_leaf_free_space(root, leaf) < total_size) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "not enough freespace need %u have %d\n",
+ total_size, btrfs_leaf_free_space(root, leaf));
+ BUG();
+ }
+
+ slot = path->slots[0];
+ BUG_ON(slot < 0);
+
+ if (slot != nritems) {
+ unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+
+ if (old_data < data_end) {
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
+ slot, old_data, data_end);
+ BUG_ON(1);
+ }
+ /*
+ * item0..itemN ... dataN.offset..dataN.size .. data0.size
+ */
+ /* first correct the data pointers */
+ WARN_ON(leaf->map_token);
+ for (i = slot; i < nritems; i++) {
+ u32 ioff;
+
+ item = btrfs_item_nr(leaf, i);
+ if (!leaf->map_token) {
+ map_extent_buffer(leaf, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(leaf, item);
+ btrfs_set_item_offset(leaf, item, ioff - total_data);
+ }
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
+
+ /* shift the items */
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ btrfs_item_nr_offset(slot),
+ (nritems - slot) * sizeof(struct btrfs_item));
+
+ /* shift the data */
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end - total_data, btrfs_leaf_data(leaf) +
+ data_end, old_data - data_end);
+ data_end = old_data;
+ }
+
+ /* setup the item for the new data */
+ for (i = 0; i < nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ btrfs_set_item_key(leaf, &disk_key, slot + i);
+ item = btrfs_item_nr(leaf, slot + i);
+ btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+ data_end -= data_size[i];
+ btrfs_set_item_size(leaf, item, data_size[i]);
+ }
+ btrfs_set_header_nritems(leaf, nritems + nr);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+ if (slot == 0) {
+ btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+ }
+
+ if (btrfs_leaf_free_space(root, leaf) < 0) {
+ btrfs_print_leaf(root, leaf);
+ BUG();
+ }
+out:
+ return ret;
+}
+
+/*
+ * Given a key and some data, insert an item into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *cpu_key, void *data, u32
+ data_size)
+{
+ int ret = 0;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+ if (!ret) {
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ write_extent_buffer(leaf, data, ptr, data_size);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * delete the pointer from a given node.
+ *
+ * the tree should have been previously balanced so the deletion does not
+ * empty a node.
+ */
+static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int level, int slot)
+{
+ struct extent_buffer *parent = path->nodes[level];
+ u32 nritems;
+ int ret = 0;
+ int wret;
+
+ nritems = btrfs_header_nritems(parent);
+ if (slot != nritems - 1) {
+ memmove_extent_buffer(parent,
+ btrfs_node_key_ptr_offset(slot),
+ btrfs_node_key_ptr_offset(slot + 1),
+ sizeof(struct btrfs_key_ptr) *
+ (nritems - slot - 1));
+ }
+ nritems--;
+ btrfs_set_header_nritems(parent, nritems);
+ if (nritems == 0 && parent == root->node) {
+ BUG_ON(btrfs_header_level(root->node) != 1);
+ /* just turn the root into a leaf and break */
+ btrfs_set_header_level(root->node, 0);
+ } else if (slot == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_node_key(parent, &disk_key, 0);
+ wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
+ if (wret)
+ ret = wret;
+ }
+ btrfs_mark_buffer_dirty(parent);
+ return ret;
+}
+
+/*
+ * a helper function to delete the leaf pointed to by path->slots[1] and
+ * path->nodes[1]. bytenr is the node block pointer, but since the callers
+ * already know it, it is faster to have them pass it down than to
+ * read it out of the node again.
+ *
+ * This deletes the pointer in path->nodes[1] and frees the leaf
+ * block extent. zero is returned if it all worked out, < 0 otherwise.
+ *
+ * The path must have already been setup for deleting the leaf, including
+ * all the proper balancing. path->nodes[1] must be locked.
+ */
+noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 bytenr)
+{
+ int ret;
+ u64 root_gen = btrfs_header_generation(path->nodes[1]);
+
+ ret = del_ptr(trans, root, path, 1, path->slots[1]);
+ if (ret)
+ return ret;
+
+ ret = btrfs_free_extent(trans, root, bytenr,
+ btrfs_level_size(root, 0),
+ path->nodes[1]->start,
+ btrfs_header_owner(path->nodes[1]),
+ root_gen, 0, 1);
+ return ret;
+}
+/*
+ * delete the item at the leaf level in path. If that empties
+ * the leaf, remove it from the tree
+ */
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int slot, int nr)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ int last_off;
+ int dsize = 0;
+ int ret = 0;
+ int wret;
+ int i;
+ u32 nritems;
+
+ leaf = path->nodes[0];
+ last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
+
+ for (i = 0; i < nr; i++)
+ dsize += btrfs_item_size_nr(leaf, slot + i);
+
+ nritems = btrfs_header_nritems(leaf);
+
+ if (slot + nr != nritems) {
+ int data_end = leaf_data_end(root, leaf);
+
+ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+ data_end + dsize,
+ btrfs_leaf_data(leaf) + data_end,
+ last_off - data_end);
+
+ for (i = slot + nr; i < nritems; i++) {
+ u32 ioff;
+
+ item = btrfs_item_nr(leaf, i);
+ if (!leaf->map_token) {
+ map_extent_buffer(leaf, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
+ }
+ ioff = btrfs_item_offset(leaf, item);
+ btrfs_set_item_offset(leaf, item, ioff + dsize);
+ }
+
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
+
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
+ btrfs_item_nr_offset(slot + nr),
+ sizeof(struct btrfs_item) *
+ (nritems - slot - nr));
+ }
+ btrfs_set_header_nritems(leaf, nritems - nr);
+ nritems -= nr;
+
+ /* delete the leaf if we've emptied it */
+ if (nritems == 0) {
+ if (leaf == root->node) {
+ btrfs_set_header_level(leaf, 0);
+ } else {
+ ret = btrfs_del_leaf(trans, root, path, leaf->start);
+ BUG_ON(ret);
+ }
+ } else {
+ int used = leaf_space_used(leaf, 0, nritems);
+ if (slot == 0) {
+ struct btrfs_disk_key disk_key;
+
+ btrfs_item_key(leaf, &disk_key, 0);
+ wret = fixup_low_keys(trans, root, path,
+ &disk_key, 1);
+ if (wret)
+ ret = wret;
+ }
+
+ /* delete the leaf if it is mostly empty */
+ if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+ /* push_leaf_left fixes the path.
+ * make sure the path still points to our leaf
+ * for possible call to del_ptr below
+ */
+ slot = path->slots[1];
+ extent_buffer_get(leaf);
+
+ wret = push_leaf_left(trans, root, path, 1, 1);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+
+ if (path->nodes[0] == leaf &&
+ btrfs_header_nritems(leaf)) {
+ wret = push_leaf_right(trans, root, path, 1, 1);
+ if (wret < 0 && wret != -ENOSPC)
+ ret = wret;
+ }
+
+ if (btrfs_header_nritems(leaf) == 0) {
+ path->slots[1] = slot;
+ ret = btrfs_del_leaf(trans, root, path,
+ leaf->start);
+ BUG_ON(ret);
+ free_extent_buffer(leaf);
+ } else {
+ /* if we're still in the path, make sure
+ * we're dirty. Otherwise, one of the
+ * push_leaf functions must have already
+ * dirtied this buffer
+ */
+ if (path->nodes[0] == leaf)
+ btrfs_mark_buffer_dirty(leaf);
+ free_extent_buffer(leaf);
+ }
+ } else {
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ }
+ return ret;
+}
+
+/*
+ * search the tree again to find a leaf with lesser keys
+ * returns 0 if it found something or 1 if there are no lesser leaves.
+ * returns < 0 on io errors.
+ *
+ * This may release the path, and so you may lose any locks held at the
+ * time you call it.
+ */
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ struct btrfs_disk_key found_key;
+ int ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
+
+ if (key.offset > 0)
+ key.offset--;
+ else if (key.type > 0)
+ key.type--;
+ else if (key.objectid > 0)
+ key.objectid--;
+ else
+ return 1;
+
+ btrfs_release_path(root, path);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ btrfs_item_key(path->nodes[0], &found_key, 0);
+ ret = comp_keys(&found_key, &key);
+ if (ret < 0)
+ return 0;
+ return 1;
+}
+
+/*
+ * A helper function to walk down the tree starting at min_key, and looking
+ * for nodes or leaves that are either in cache or have a minimum
+ * transaction id. This is used by the btree defrag code, and tree logging
+ *
+ * This does not cow, but it does stuff the starting key it finds back
+ * into min_key, so you can call btrfs_search_slot with cow=1 on the
+ * key and get a writable path.
+ *
+ * This does lock as it descends, and path->keep_locks should be set
+ * to 1 by the caller.
+ *
+ * This honors path->lowest_level to prevent descent past a given level
+ * of the tree.
+ *
+ * min_trans indicates the oldest transaction that you are interested
+ * in walking through. Any nodes or leaves older than min_trans are
+ * skipped over (without reading them).
+ *
+ * returns zero if something useful was found, < 0 on error and 1 if there
+ * was nothing in the tree that matched the search criteria.
+ */
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+ struct btrfs_key *max_key,
+ struct btrfs_path *path, int cache_only,
+ u64 min_trans)
+{
+ struct extent_buffer *cur;
+ struct btrfs_key found_key;
+ int slot;
+ int sret;
+ u32 nritems;
+ int level;
+ int ret = 1;
+
+ WARN_ON(!path->keep_locks);
+again:
+ cur = btrfs_lock_root_node(root);
+ level = btrfs_header_level(cur);
+ WARN_ON(path->nodes[level]);
+ path->nodes[level] = cur;
+ path->locks[level] = 1;
+
+ if (btrfs_header_generation(cur) < min_trans) {
+ ret = 1;
+ goto out;
+ }
+ while (1) {
+ nritems = btrfs_header_nritems(cur);
+ level = btrfs_header_level(cur);
+ sret = bin_search(cur, min_key, level, &slot);
+
+ /* at the lowest level, we're done, setup the path and exit */
+ if (level == path->lowest_level) {
+ if (slot >= nritems)
+ goto find_next_key;
+ ret = 0;
+ path->slots[level] = slot;
+ btrfs_item_key_to_cpu(cur, &found_key, slot);
+ goto out;
+ }
+ if (sret && slot > 0)
+ slot--;
+ /*
+ * check this node pointer against the cache_only and
+ * min_trans parameters. If it isn't in cache or is too
+ * old, skip to the next one.
+ */
+ while (slot < nritems) {
+ u64 blockptr;
+ u64 gen;
+ struct extent_buffer *tmp;
+ struct btrfs_disk_key disk_key;
+
+ blockptr = btrfs_node_blockptr(cur, slot);
+ gen = btrfs_node_ptr_generation(cur, slot);
+ if (gen < min_trans) {
+ slot++;
+ continue;
+ }
+ if (!cache_only)
+ break;
+
+ if (max_key) {
+ btrfs_node_key(cur, &disk_key, slot);
+ if (comp_keys(&disk_key, max_key) >= 0) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+ tmp = btrfs_find_tree_block(root, blockptr,
+ btrfs_level_size(root, level - 1));
+
+ if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+ free_extent_buffer(tmp);
+ break;
+ }
+ if (tmp)
+ free_extent_buffer(tmp);
+ slot++;
+ }
+find_next_key:
+ /*
+ * we didn't find a candidate key in this node, walk forward
+ * and find another one
+ */
+ if (slot >= nritems) {
+ path->slots[level] = slot;
+ sret = btrfs_find_next_key(root, path, min_key, level,
+ cache_only, min_trans);
+ if (sret == 0) {
+ btrfs_release_path(root, path);
+ goto again;
+ } else {
+ goto out;
+ }
+ }
+ /* save our key for returning back */
+ btrfs_node_key_to_cpu(cur, &found_key, slot);
+ path->slots[level] = slot;
+ if (level == path->lowest_level) {
+ ret = 0;
+ unlock_up(path, level, 1);
+ goto out;
+ }
+ cur = read_node_slot(root, cur, slot);
+
+ btrfs_tree_lock(cur);
+ path->locks[level - 1] = 1;
+ path->nodes[level - 1] = cur;
+ unlock_up(path, level, 1);
+ }
+out:
+ if (ret == 0)
+ memcpy(min_key, &found_key, sizeof(found_key));
+ return ret;
+}
+
+/*
+ * this is similar to btrfs_next_leaf, but does not try to preserve
+ * and fixup the path. It looks for and returns the next key in the
+ * tree based on the current path and the cache_only and min_trans
+ * parameters.
+ *
+ * 0 is returned if another key is found, < 0 if there are any errors
+ * and 1 is returned if there are no higher keys in the tree
+ *
+ * path->keep_locks should be set to 1 on the search made before
+ * calling this function.
+ */
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, int lowest_level,
+ int cache_only, u64 min_trans)
+{
+ int level = lowest_level;
+ int slot;
+ struct extent_buffer *c;
+
+ WARN_ON(!path->keep_locks);
+ while (level < BTRFS_MAX_LEVEL) {
+ if (!path->nodes[level])
+ return 1;
+
+ slot = path->slots[level] + 1;
+ c = path->nodes[level];
+next:
+ if (slot >= btrfs_header_nritems(c)) {
+ level++;
+ if (level == BTRFS_MAX_LEVEL)
+ return 1;
+ continue;
+ }
+ if (level == 0)
+ btrfs_item_key_to_cpu(c, key, slot);
+ else {
+ u64 blockptr = btrfs_node_blockptr(c, slot);
+ u64 gen = btrfs_node_ptr_generation(c, slot);
+
+ if (cache_only) {
+ struct extent_buffer *cur;
+ cur = btrfs_find_tree_block(root, blockptr,
+ btrfs_level_size(root, level - 1));
+ if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
+ slot++;
+ if (cur)
+ free_extent_buffer(cur);
+ goto next;
+ }
+ free_extent_buffer(cur);
+ }
+ if (gen < min_trans) {
+ slot++;
+ goto next;
+ }
+ btrfs_node_key_to_cpu(c, key, slot);
+ }
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * search the tree again to find a leaf with greater keys
+ * returns 0 if it found something or 1 if there are no greater leaves.
+ * returns < 0 on io errors.
+ */
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ int slot;
+ int level = 1;
+ struct extent_buffer *c;
+ struct extent_buffer *next = NULL;
+ struct btrfs_key key;
+ u32 nritems;
+ int ret;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (nritems == 0)
+ return 1;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+
+ btrfs_release_path(root, path);
+ path->keep_locks = 1;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ path->keep_locks = 0;
+
+ if (ret < 0)
+ return ret;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ /*
+ * by releasing the path above we dropped all our locks. A balance
+ * could have added more items next to the key that used to be
+ * at the very end of the block. So, check again here and
+ * advance the path if there are now more items available.
+ */
+ if (nritems > 0 && path->slots[0] < nritems - 1) {
+ path->slots[0]++;
+ goto done;
+ }
+
+ while (level < BTRFS_MAX_LEVEL) {
+ if (!path->nodes[level])
+ return 1;
+
+ slot = path->slots[level] + 1;
+ c = path->nodes[level];
+ if (slot >= btrfs_header_nritems(c)) {
+ level++;
+ if (level == BTRFS_MAX_LEVEL)
+ return 1;
+ continue;
+ }
+
+ if (next) {
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ }
+
+ if (level == 1 && (path->locks[1] || path->skip_locking) &&
+ path->reada)
+ reada_for_search(root, path, level, slot, 0);
+
+ next = read_node_slot(root, c, slot);
+ if (!path->skip_locking) {
+ WARN_ON(!btrfs_tree_locked(c));
+ btrfs_tree_lock(next);
+ }
+ break;
+ }
+ path->slots[level] = slot;
+ while (1) {
+ level--;
+ c = path->nodes[level];
+ if (path->locks[level])
+ btrfs_tree_unlock(c);
+ free_extent_buffer(c);
+ path->nodes[level] = next;
+ path->slots[level] = 0;
+ if (!path->skip_locking)
+ path->locks[level] = 1;
+ if (!level)
+ break;
+ if (level == 1 && path->locks[1] && path->reada)
+ reada_for_search(root, path, level, slot, 0);
+ next = read_node_slot(root, next, 0);
+ if (!path->skip_locking) {
+ WARN_ON(!btrfs_tree_locked(path->nodes[level]));
+ btrfs_tree_lock(next);
+ }
+ }
+done:
+ unlock_up(path, 0, 1);
+ return 0;
+}
+
+/*
+ * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
+ * searching until it gets past min_objectid or finds an item of 'type'
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid,
+ int type)
+{
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int ret;
+
+ while (1) {
+ if (path->slots[0] == 0) {
+ ret = btrfs_prev_leaf(root, path);
+ if (ret != 0)
+ return ret;
+ } else {
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (nritems == 0)
+ return 1;
+ if (path->slots[0] == nritems)
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.type == type)
+ return 0;
+ if (found_key.objectid < min_objectid)
+ break;
+ if (found_key.objectid == min_objectid &&
+ found_key.type < type)
+ break;
+ }
+ return 1;
+}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 00000000000..eee060f8811
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,2129 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_CTREE__
+#define __BTRFS_CTREE__
+
+#include <linux/version.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <asm/kmap_types.h>
+#include "extent_io.h"
+#include "extent_map.h"
+#include "async-thread.h"
+
+struct btrfs_trans_handle;
+struct btrfs_transaction;
+extern struct kmem_cache *btrfs_trans_handle_cachep;
+extern struct kmem_cache *btrfs_transaction_cachep;
+extern struct kmem_cache *btrfs_bit_radix_cachep;
+extern struct kmem_cache *btrfs_path_cachep;
+struct btrfs_ordered_sum;
+
+#define BTRFS_MAGIC "_BHRfS_M"
+
+#define BTRFS_ACL_NOT_CACHED ((void *)-1)
+
+#ifdef CONFIG_LOCKDEP
+# define BTRFS_MAX_LEVEL 7
+#else
+# define BTRFS_MAX_LEVEL 8
+#endif
+
+/* holds pointers to all of the tree roots */
+#define BTRFS_ROOT_TREE_OBJECTID 1ULL
+
+/* stores information about which extents are in use, and reference counts */
+#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
+
+/*
+ * chunk tree stores translations from logical -> physical block numbering
+ * the super block points to the chunk tree
+ */
+#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
+
+/*
+ * stores information about which areas of a given device are in use.
+ * one per device. The tree of tree roots points to the device tree
+ */
+#define BTRFS_DEV_TREE_OBJECTID 4ULL
+
+/* one per subvolume, storing files and directories */
+#define BTRFS_FS_TREE_OBJECTID 5ULL
+
+/* directory objectid inside the root tree */
+#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
+
+/* holds checksums of all the data extents */
+#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+
+/* orhpan objectid for tracking unlinked/truncated files */
+#define BTRFS_ORPHAN_OBJECTID -5ULL
+
+/* does write ahead logging to speed up fsyncs */
+#define BTRFS_TREE_LOG_OBJECTID -6ULL
+#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
+
+/* for space balancing */
+#define BTRFS_TREE_RELOC_OBJECTID -8ULL
+#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
+
+/*
+ * extent checksums all have this objectid
+ * this allows them to share the logging tree
+ * for fsyncs
+ */
+#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
+
+/* dummy objectid represents multiple objectids */
+#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
+
+/*
+ * All files have objectids in this range.
+ */
+#define BTRFS_FIRST_FREE_OBJECTID 256ULL
+#define BTRFS_LAST_FREE_OBJECTID -256ULL
+#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
+
+
+/*
+ * the device items go into the chunk tree. The key is in the form
+ * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
+ */
+#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+
+/*
+ * we can actually store much bigger names, but lets not confuse the rest
+ * of linux
+ */
+#define BTRFS_NAME_LEN 255
+
+/* 32 bytes in various csum fields */
+#define BTRFS_CSUM_SIZE 32
+
+/* csum types */
+#define BTRFS_CSUM_TYPE_CRC32 0
+
+static int btrfs_csum_sizes[] = { 4, 0 };
+
+/* four bytes for CRC32 */
+#define BTRFS_EMPTY_DIR_SIZE 0
+
+#define BTRFS_FT_UNKNOWN 0
+#define BTRFS_FT_REG_FILE 1
+#define BTRFS_FT_DIR 2
+#define BTRFS_FT_CHRDEV 3
+#define BTRFS_FT_BLKDEV 4
+#define BTRFS_FT_FIFO 5
+#define BTRFS_FT_SOCK 6
+#define BTRFS_FT_SYMLINK 7
+#define BTRFS_FT_XATTR 8
+#define BTRFS_FT_MAX 9
+
+/*
+ * the key defines the order in the tree, and so it also defines (optimal)
+ * block layout. objectid corresonds to the inode number. The flags
+ * tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with flags of 1 might refer to the inode
+ * data, flags of 2 may point to file data in the btree and flags == 3
+ * may point to extents.
+ *
+ * offset is the starting byte offset for this key in the stream.
+ *
+ * btrfs_disk_key is in disk byte order. struct btrfs_key is always
+ * in cpu native order. Otherwise they are identical and their sizes
+ * should be the same (ie both packed)
+ */
+struct btrfs_disk_key {
+ __le64 objectid;
+ u8 type;
+ __le64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_key {
+ u64 objectid;
+ u8 type;
+ u64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_mapping_tree {
+ struct extent_map_tree map_tree;
+};
+
+#define BTRFS_UUID_SIZE 16
+struct btrfs_dev_item {
+ /* the internal btrfs device id */
+ __le64 devid;
+
+ /* size of the device */
+ __le64 total_bytes;
+
+ /* bytes used */
+ __le64 bytes_used;
+
+ /* optimal io alignment for this device */
+ __le32 io_align;
+
+ /* optimal io width for this device */
+ __le32 io_width;
+
+ /* minimal io size for this device */
+ __le32 sector_size;
+
+ /* type and info about this device */
+ __le64 type;
+
+ /* expected generation for this device */
+ __le64 generation;
+
+ /*
+ * starting byte of this partition on the device,
+ * to allowr for stripe alignment in the future
+ */
+ __le64 start_offset;
+
+ /* grouping information for allocation decisions */
+ __le32 dev_group;
+
+ /* seek speed 0-100 where 100 is fastest */
+ u8 seek_speed;
+
+ /* bandwidth 0-100 where 100 is fastest */
+ u8 bandwidth;
+
+ /* btrfs generated uuid for this device */
+ u8 uuid[BTRFS_UUID_SIZE];
+
+ /* uuid of FS who owns this device */
+ u8 fsid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_stripe {
+ __le64 devid;
+ __le64 offset;
+ u8 dev_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_chunk {
+ /* size of this chunk in bytes */
+ __le64 length;
+
+ /* objectid of the root referencing this chunk */
+ __le64 owner;
+
+ __le64 stripe_len;
+ __le64 type;
+
+ /* optimal io alignment for this chunk */
+ __le32 io_align;
+
+ /* optimal io width for this chunk */
+ __le32 io_width;
+
+ /* minimal io size for this chunk */
+ __le32 sector_size;
+
+ /* 2^16 stripes is quite a lot, a second limit is the size of a single
+ * item in the btree
+ */
+ __le16 num_stripes;
+
+ /* sub stripes only matter for raid10 */
+ __le16 sub_stripes;
+ struct btrfs_stripe stripe;
+ /* additional stripes go here */
+} __attribute__ ((__packed__));
+
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+ BUG_ON(num_stripes == 0);
+ return sizeof(struct btrfs_chunk) +
+ sizeof(struct btrfs_stripe) * (num_stripes - 1);
+}
+
+#define BTRFS_FSID_SIZE 16
+#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
+
+/*
+ * every tree block (leaf or node) starts with this header.
+ */
+struct btrfs_header {
+ /* these first four must match the super block */
+ u8 csum[BTRFS_CSUM_SIZE];
+ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ __le64 bytenr; /* which block this node is supposed to live in */
+ __le64 flags;
+
+ /* allowed to be different from the super from here on down */
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+ __le64 generation;
+ __le64 owner;
+ __le32 nritems;
+ u8 level;
+} __attribute__ ((__packed__));
+
+#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
+ sizeof(struct btrfs_header)) / \
+ sizeof(struct btrfs_key_ptr))
+#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
+#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
+ sizeof(struct btrfs_item) - \
+ sizeof(struct btrfs_file_extent_item))
+
+#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
+
+/*
+ * this is a very generous portion of the super block, giving us
+ * room to translate 14 chunks with 3 stripes each.
+ */
+#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+#define BTRFS_LABEL_SIZE 256
+
+/*
+ * the super block basically lists the main trees of the FS
+ * it currently lacks any block count etc etc
+ */
+struct btrfs_super_block {
+ u8 csum[BTRFS_CSUM_SIZE];
+ /* the first 4 fields must match struct btrfs_header */
+ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+ __le64 bytenr; /* this block number */
+ __le64 flags;
+
+ /* allowed to be different from the btrfs_header from here own down */
+ __le64 magic;
+ __le64 generation;
+ __le64 root;
+ __le64 chunk_root;
+ __le64 log_root;
+
+ /* this will help find the new super based on the log root */
+ __le64 log_root_transid;
+ __le64 total_bytes;
+ __le64 bytes_used;
+ __le64 root_dir_objectid;
+ __le64 num_devices;
+ __le32 sectorsize;
+ __le32 nodesize;
+ __le32 leafsize;
+ __le32 stripesize;
+ __le32 sys_chunk_array_size;
+ __le64 chunk_root_generation;
+ __le64 compat_flags;
+ __le64 compat_ro_flags;
+ __le64 incompat_flags;
+ __le16 csum_type;
+ u8 root_level;
+ u8 chunk_root_level;
+ u8 log_root_level;
+ struct btrfs_dev_item dev_item;
+
+ char label[BTRFS_LABEL_SIZE];
+
+ /* future expansion */
+ __le64 reserved[32];
+ u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+} __attribute__ ((__packed__));
+
+/*
+ * Compat flags that we support. If any incompat flags are set other than the
+ * ones specified below then we will fail to mount
+ */
+#define BTRFS_FEATURE_COMPAT_SUPP 0x0
+#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0
+#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0
+
+/*
+ * A leaf is full of items. offset and size tell us where to find
+ * the item in the leaf (relative to the start of the data area)
+ */
+struct btrfs_item {
+ struct btrfs_disk_key key;
+ __le32 offset;
+ __le32 size;
+} __attribute__ ((__packed__));
+
+/*
+ * leaves have an item area and a data area:
+ * [item0, item1....itemN] [free space] [dataN...data1, data0]
+ *
+ * The data is separate from the items to get the keys closer together
+ * during searches.
+ */
+struct btrfs_leaf {
+ struct btrfs_header header;
+ struct btrfs_item items[];
+} __attribute__ ((__packed__));
+
+/*
+ * all non-leaf blocks are nodes, they hold only keys and pointers to
+ * other blocks
+ */
+struct btrfs_key_ptr {
+ struct btrfs_disk_key key;
+ __le64 blockptr;
+ __le64 generation;
+} __attribute__ ((__packed__));
+
+struct btrfs_node {
+ struct btrfs_header header;
+ struct btrfs_key_ptr ptrs[];
+} __attribute__ ((__packed__));
+
+/*
+ * btrfs_paths remember the path taken from the root down to the leaf.
+ * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
+ * to any other levels that are present.
+ *
+ * The slots array records the index of the item or block pointer
+ * used while walking the tree.
+ */
+struct btrfs_path {
+ struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
+ int slots[BTRFS_MAX_LEVEL];
+ /* if there is real range locking, this locks field will change */
+ int locks[BTRFS_MAX_LEVEL];
+ int reada;
+ /* keep some upper locks as we walk down */
+ int keep_locks;
+ int skip_locking;
+ int lowest_level;
+
+ /*
+ * set by btrfs_split_item, tells search_slot to keep all locks
+ * and to force calls to keep space in the nodes
+ */
+ int search_for_split;
+};
+
+/*
+ * items in the extent btree are used to record the objectid of the
+ * owner of the block and the number of references
+ */
+struct btrfs_extent_item {
+ __le32 refs;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_ref {
+ __le64 root;
+ __le64 generation;
+ __le64 objectid;
+ __le32 num_refs;
+} __attribute__ ((__packed__));
+
+/* dev extents record free space on individual devices. The owner
+ * field points back to the chunk allocation mapping tree that allocated
+ * the extent. The chunk tree uuid field is a way to double check the owner
+ */
+struct btrfs_dev_extent {
+ __le64 chunk_tree;
+ __le64 chunk_objectid;
+ __le64 chunk_offset;
+ __le64 length;
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+
+struct btrfs_inode_ref {
+ __le64 index;
+ __le16 name_len;
+ /* name goes here */
+} __attribute__ ((__packed__));
+
+struct btrfs_timespec {
+ __le64 sec;
+ __le32 nsec;
+} __attribute__ ((__packed__));
+
+typedef enum {
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LAST = 2,
+} btrfs_compression_type;
+
+/* we don't understand any encryption methods right now */
+typedef enum {
+ BTRFS_ENCRYPTION_NONE = 0,
+ BTRFS_ENCRYPTION_LAST = 1,
+} btrfs_encryption_type;
+
+struct btrfs_inode_item {
+ /* nfs style generation number */
+ __le64 generation;
+ /* transid that last touched this inode */
+ __le64 transid;
+ __le64 size;
+ __le64 nbytes;
+ __le64 block_group;
+ __le32 nlink;
+ __le32 uid;
+ __le32 gid;
+ __le32 mode;
+ __le64 rdev;
+ __le64 flags;
+
+ /* modification sequence number for NFS */
+ __le64 sequence;
+
+ /*
+ * a little future expansion, for more than this we can
+ * just grow the inode item and version it
+ */
+ __le64 reserved[4];
+ struct btrfs_timespec atime;
+ struct btrfs_timespec ctime;
+ struct btrfs_timespec mtime;
+ struct btrfs_timespec otime;
+} __attribute__ ((__packed__));
+
+struct btrfs_dir_log_item {
+ __le64 end;
+} __attribute__ ((__packed__));
+
+struct btrfs_dir_item {
+ struct btrfs_disk_key location;
+ __le64 transid;
+ __le16 data_len;
+ __le16 name_len;
+ u8 type;
+} __attribute__ ((__packed__));
+
+struct btrfs_root_item {
+ struct btrfs_inode_item inode;
+ __le64 generation;
+ __le64 root_dirid;
+ __le64 bytenr;
+ __le64 byte_limit;
+ __le64 bytes_used;
+ __le64 last_snapshot;
+ __le64 flags;
+ __le32 refs;
+ struct btrfs_disk_key drop_progress;
+ u8 drop_level;
+ u8 level;
+} __attribute__ ((__packed__));
+
+/*
+ * this is used for both forward and backward root refs
+ */
+struct btrfs_root_ref {
+ __le64 dirid;
+ __le64 sequence;
+ __le16 name_len;
+} __attribute__ ((__packed__));
+
+#define BTRFS_FILE_EXTENT_INLINE 0
+#define BTRFS_FILE_EXTENT_REG 1
+#define BTRFS_FILE_EXTENT_PREALLOC 2
+
+struct btrfs_file_extent_item {
+ /*
+ * transaction id that created this extent
+ */
+ __le64 generation;
+ /*
+ * max number of bytes to hold this extent in ram
+ * when we split a compressed extent we can't know how big
+ * each of the resulting pieces will be. So, this is
+ * an upper limit on the size of the extent in ram instead of
+ * an exact limit.
+ */
+ __le64 ram_bytes;
+
+ /*
+ * 32 bits for the various ways we might encode the data,
+ * including compression and encryption. If any of these
+ * are set to something a given disk format doesn't understand
+ * it is treated like an incompat flag for reading and writing,
+ * but not for stat.
+ */
+ u8 compression;
+ u8 encryption;
+ __le16 other_encoding; /* spare for later use */
+
+ /* are we inline data or a real extent? */
+ u8 type;
+
+ /*
+ * disk space consumed by the extent, checksum blocks are included
+ * in these numbers
+ */
+ __le64 disk_bytenr;
+ __le64 disk_num_bytes;
+ /*
+ * the logical offset in file blocks (no csums)
+ * this extent record is for. This allows a file extent to point
+ * into the middle of an existing extent on disk, sharing it
+ * between two snapshots (useful if some bytes in the middle of the
+ * extent have changed
+ */
+ __le64 offset;
+ /*
+ * the logical number of file blocks (no csums included). This
+ * always reflects the size uncompressed and without encoding.
+ */
+ __le64 num_bytes;
+
+} __attribute__ ((__packed__));
+
+struct btrfs_csum_item {
+ u8 csum;
+} __attribute__ ((__packed__));
+
+/* different types of block groups (and chunks) */
+#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
+#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
+#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
+#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
+#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
+
+struct btrfs_block_group_item {
+ __le64 used;
+ __le64 chunk_objectid;
+ __le64 flags;
+} __attribute__ ((__packed__));
+
+struct btrfs_space_info {
+ u64 flags;
+ u64 total_bytes;
+ u64 bytes_used;
+ u64 bytes_pinned;
+ u64 bytes_reserved;
+ u64 bytes_readonly;
+ int full;
+ int force_alloc;
+ struct list_head list;
+
+ /* for block groups in our same type */
+ struct list_head block_groups;
+ spinlock_t lock;
+ struct rw_semaphore groups_sem;
+};
+
+struct btrfs_free_space {
+ struct rb_node bytes_index;
+ struct rb_node offset_index;
+ u64 offset;
+ u64 bytes;
+};
+
+struct btrfs_block_group_cache {
+ struct btrfs_key key;
+ struct btrfs_block_group_item item;
+ spinlock_t lock;
+ struct mutex alloc_mutex;
+ struct mutex cache_mutex;
+ u64 pinned;
+ u64 reserved;
+ u64 flags;
+ int cached;
+ int ro;
+ int dirty;
+
+ struct btrfs_space_info *space_info;
+
+ /* free space cache stuff */
+ struct rb_root free_space_bytes;
+ struct rb_root free_space_offset;
+
+ /* block group cache stuff */
+ struct rb_node cache_node;
+
+ /* for block groups in the same raid type */
+ struct list_head list;
+
+ /* usage count */
+ atomic_t count;
+};
+
+struct btrfs_leaf_ref_tree {
+ struct rb_root root;
+ struct list_head list;
+ spinlock_t lock;
+};
+
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_fs_info {
+ u8 fsid[BTRFS_FSID_SIZE];
+ u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+ struct btrfs_root *extent_root;
+ struct btrfs_root *tree_root;
+ struct btrfs_root *chunk_root;
+ struct btrfs_root *dev_root;
+ struct btrfs_root *fs_root;
+ struct btrfs_root *csum_root;
+
+ /* the log root tree is a directory of all the other log roots */
+ struct btrfs_root *log_root_tree;
+ struct radix_tree_root fs_roots_radix;
+
+ /* block group cache stuff */
+ spinlock_t block_group_cache_lock;
+ struct rb_root block_group_cache_tree;
+
+ struct extent_io_tree pinned_extents;
+ struct extent_io_tree pending_del;
+ struct extent_io_tree extent_ins;
+
+ /* logical->physical extent mapping */
+ struct btrfs_mapping_tree mapping_tree;
+
+ u64 generation;
+ u64 last_trans_committed;
+ u64 last_trans_new_blockgroup;
+ u64 open_ioctl_trans;
+ unsigned long mount_opt;
+ u64 max_extent;
+ u64 max_inline;
+ u64 alloc_start;
+ struct btrfs_transaction *running_transaction;
+ wait_queue_head_t transaction_throttle;
+ wait_queue_head_t transaction_wait;
+
+ wait_queue_head_t async_submit_wait;
+ wait_queue_head_t tree_log_wait;
+
+ struct btrfs_super_block super_copy;
+ struct btrfs_super_block super_for_commit;
+ struct block_device *__bdev;
+ struct super_block *sb;
+ struct inode *btree_inode;
+ struct backing_dev_info bdi;
+ spinlock_t hash_lock;
+ struct mutex trans_mutex;
+ struct mutex tree_log_mutex;
+ struct mutex transaction_kthread_mutex;
+ struct mutex cleaner_mutex;
+ struct mutex extent_ins_mutex;
+ struct mutex pinned_mutex;
+ struct mutex chunk_mutex;
+ struct mutex drop_mutex;
+ struct mutex volume_mutex;
+ struct mutex tree_reloc_mutex;
+ struct list_head trans_list;
+ struct list_head hashers;
+ struct list_head dead_roots;
+
+ atomic_t nr_async_submits;
+ atomic_t async_submit_draining;
+ atomic_t nr_async_bios;
+ atomic_t async_delalloc_pages;
+ atomic_t tree_log_writers;
+ atomic_t tree_log_commit;
+ unsigned long tree_log_batch;
+ u64 tree_log_transid;
+
+ /*
+ * this is used by the balancing code to wait for all the pending
+ * ordered extents
+ */
+ spinlock_t ordered_extent_lock;
+ struct list_head ordered_extents;
+ struct list_head delalloc_inodes;
+
+ /*
+ * there is a pool of worker threads for checksumming during writes
+ * and a pool for checksumming after reads. This is because readers
+ * can run with FS locks held, and the writers may be waiting for
+ * those locks. We don't want ordering in the pending list to cause
+ * deadlocks, and so the two are serviced separately.
+ *
+ * A third pool does submit_bio to avoid deadlocking with the other
+ * two
+ */
+ struct btrfs_workers workers;
+ struct btrfs_workers delalloc_workers;
+ struct btrfs_workers endio_workers;
+ struct btrfs_workers endio_meta_workers;
+ struct btrfs_workers endio_meta_write_workers;
+ struct btrfs_workers endio_write_workers;
+ struct btrfs_workers submit_workers;
+ /*
+ * fixup workers take dirty pages that didn't properly go through
+ * the cow mechanism and make them safe to write. It happens
+ * for the sys_munmap function call path
+ */
+ struct btrfs_workers fixup_workers;
+ struct task_struct *transaction_kthread;
+ struct task_struct *cleaner_kthread;
+ int thread_pool_size;
+
+ /* tree relocation relocated fields */
+ struct list_head dead_reloc_roots;
+ struct btrfs_leaf_ref_tree reloc_ref_tree;
+ struct btrfs_leaf_ref_tree shared_ref_tree;
+
+ struct kobject super_kobj;
+ struct completion kobj_unregister;
+ int do_barriers;
+ int closing;
+ int log_root_recovering;
+ atomic_t throttles;
+ atomic_t throttle_gen;
+
+ u64 total_pinned;
+ struct list_head dirty_cowonly_roots;
+
+ struct btrfs_fs_devices *fs_devices;
+ struct list_head space_info;
+ spinlock_t delalloc_lock;
+ spinlock_t new_trans_lock;
+ u64 delalloc_bytes;
+ u64 last_alloc;
+ u64 last_data_alloc;
+
+ spinlock_t ref_cache_lock;
+ u64 total_ref_cache_size;
+
+ u64 avail_data_alloc_bits;
+ u64 avail_metadata_alloc_bits;
+ u64 avail_system_alloc_bits;
+ u64 data_alloc_profile;
+ u64 metadata_alloc_profile;
+ u64 system_alloc_profile;
+
+ void *bdev_holder;
+};
+
+/*
+ * in ram representation of the tree. extent_root is used for all allocations
+ * and for the extent tree extent_root root.
+ */
+struct btrfs_dirty_root;
+struct btrfs_root {
+ struct extent_buffer *node;
+
+ /* the node lock is held while changing the node pointer */
+ spinlock_t node_lock;
+
+ struct extent_buffer *commit_root;
+ struct btrfs_leaf_ref_tree *ref_tree;
+ struct btrfs_leaf_ref_tree ref_tree_struct;
+ struct btrfs_dirty_root *dirty_root;
+ struct btrfs_root *log_root;
+ struct btrfs_root *reloc_root;
+
+ struct btrfs_root_item root_item;
+ struct btrfs_key root_key;
+ struct btrfs_fs_info *fs_info;
+ struct extent_io_tree dirty_log_pages;
+
+ struct kobject root_kobj;
+ struct completion kobj_unregister;
+ struct mutex objectid_mutex;
+ struct mutex log_mutex;
+
+ u64 objectid;
+ u64 last_trans;
+
+ /* data allocations are done in sectorsize units */
+ u32 sectorsize;
+
+ /* node allocations are done in nodesize units */
+ u32 nodesize;
+
+ /* leaf allocations are done in leafsize units */
+ u32 leafsize;
+
+ u32 stripesize;
+
+ u32 type;
+ u64 highest_inode;
+ u64 last_inode_alloc;
+ int ref_cows;
+ int track_dirty;
+ u64 defrag_trans_start;
+ struct btrfs_key defrag_progress;
+ struct btrfs_key defrag_max;
+ int defrag_running;
+ int defrag_level;
+ char *name;
+ int in_sysfs;
+
+ /* the dirty list is only used by non-reference counted roots */
+ struct list_head dirty_list;
+
+ spinlock_t list_lock;
+ struct list_head dead_list;
+ struct list_head orphan_list;
+
+ /*
+ * right now this just gets used so that a root has its own devid
+ * for stat. It may be used for more later
+ */
+ struct super_block anon_super;
+};
+
+/*
+
+ * inode items have the data typically returned from stat and store other
+ * info about object characteristics. There is one for every file and dir in
+ * the FS
+ */
+#define BTRFS_INODE_ITEM_KEY 1
+#define BTRFS_INODE_REF_KEY 12
+#define BTRFS_XATTR_ITEM_KEY 24
+#define BTRFS_ORPHAN_ITEM_KEY 48
+/* reserve 2-15 close to the inode for later flexibility */
+
+/*
+ * dir items are the name -> inode pointers in a directory. There is one
+ * for every name in a directory.
+ */
+#define BTRFS_DIR_LOG_ITEM_KEY 60
+#define BTRFS_DIR_LOG_INDEX_KEY 72
+#define BTRFS_DIR_ITEM_KEY 84
+#define BTRFS_DIR_INDEX_KEY 96
+/*
+ * extent data is for file data
+ */
+#define BTRFS_EXTENT_DATA_KEY 108
+
+/*
+ * extent csums are stored in a separate tree and hold csums for
+ * an entire extent on disk.
+ */
+#define BTRFS_EXTENT_CSUM_KEY 128
+
+/*
+ * root items point to tree roots. There are typically in the root
+ * tree used by the super block to find all the other trees
+ */
+#define BTRFS_ROOT_ITEM_KEY 132
+
+/*
+ * root backrefs tie subvols and snapshots to the directory entries that
+ * reference them
+ */
+#define BTRFS_ROOT_BACKREF_KEY 144
+
+/*
+ * root refs make a fast index for listing all of the snapshots and
+ * subvolumes referenced by a given root. They point directly to the
+ * directory item in the root that references the subvol
+ */
+#define BTRFS_ROOT_REF_KEY 156
+
+/*
+ * extent items are in the extent map tree. These record which blocks
+ * are used, and how many references there are to each block
+ */
+#define BTRFS_EXTENT_ITEM_KEY 168
+#define BTRFS_EXTENT_REF_KEY 180
+
+/*
+ * block groups give us hints into the extent allocation trees. Which
+ * blocks are free etc etc
+ */
+#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
+
+#define BTRFS_DEV_EXTENT_KEY 204
+#define BTRFS_DEV_ITEM_KEY 216
+#define BTRFS_CHUNK_ITEM_KEY 228
+
+/*
+ * string items are for debugging. They just store a short string of
+ * data in the FS
+ */
+#define BTRFS_STRING_ITEM_KEY 253
+
+#define BTRFS_MOUNT_NODATASUM (1 << 0)
+#define BTRFS_MOUNT_NODATACOW (1 << 1)
+#define BTRFS_MOUNT_NOBARRIER (1 << 2)
+#define BTRFS_MOUNT_SSD (1 << 3)
+#define BTRFS_MOUNT_DEGRADED (1 << 4)
+#define BTRFS_MOUNT_COMPRESS (1 << 5)
+
+#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
+#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
+ BTRFS_MOUNT_##opt)
+/*
+ * Inode flags
+ */
+#define BTRFS_INODE_NODATASUM (1 << 0)
+#define BTRFS_INODE_NODATACOW (1 << 1)
+#define BTRFS_INODE_READONLY (1 << 2)
+#define BTRFS_INODE_NOCOMPRESS (1 << 3)
+#define BTRFS_INODE_PREALLOC (1 << 4)
+#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
+ ~BTRFS_INODE_##flag)
+#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
+ BTRFS_INODE_##flag)
+#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \
+ BTRFS_INODE_##flag)
+/* some macros to generate set/get funcs for the struct fields. This
+ * assumes there is a lefoo_to_cpu for every type, so lets make a simple
+ * one for u8:
+ */
+#define le8_to_cpu(v) (v)
+#define cpu_to_le8(v) (v)
+#define __le8 u8
+
+#define read_eb_member(eb, ptr, type, member, result) ( \
+ read_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#define write_eb_member(eb, ptr, type, member, result) ( \
+ write_extent_buffer(eb, (char *)(result), \
+ ((unsigned long)(ptr)) + \
+ offsetof(type, member), \
+ sizeof(((type *)0)->member)))
+
+#ifndef BTRFS_SETGET_FUNCS
+#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
+u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
+void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
+#endif
+
+#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(struct extent_buffer *eb) \
+{ \
+ type *p = kmap_atomic(eb->first_page, KM_USER0); \
+ u##bits res = le##bits##_to_cpu(p->member); \
+ kunmap_atomic(p, KM_USER0); \
+ return res; \
+} \
+static inline void btrfs_set_##name(struct extent_buffer *eb, \
+ u##bits val) \
+{ \
+ type *p = kmap_atomic(eb->first_page, KM_USER0); \
+ p->member = cpu_to_le##bits(val); \
+ kunmap_atomic(p, KM_USER0); \
+}
+
+#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
+static inline u##bits btrfs_##name(type *s) \
+{ \
+ return le##bits##_to_cpu(s->member); \
+} \
+static inline void btrfs_set_##name(type *s, u##bits val) \
+{ \
+ s->member = cpu_to_le##bits(val); \
+}
+
+BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
+BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
+BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
+BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
+ start_offset, 64);
+BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
+BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
+BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
+BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
+ io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
+ io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
+ dev_group, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
+ seek_speed, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
+ bandwidth, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
+ generation, 64);
+
+static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
+{
+ return (char *)d + offsetof(struct btrfs_dev_item, uuid);
+}
+
+static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
+{
+ return (char *)d + offsetof(struct btrfs_dev_item, fsid);
+}
+
+BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
+BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
+BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
+{
+ return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
+}
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
+ stripe_len, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
+ io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
+ io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
+ sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
+ num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
+ sub_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
+ int nr)
+{
+ unsigned long offset = (unsigned long)c;
+ offset += offsetof(struct btrfs_chunk, stripe);
+ offset += nr * sizeof(struct btrfs_stripe);
+ return (struct btrfs_stripe *)offset;
+}
+
+static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
+}
+
+static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr,
+ u64 val)
+{
+ btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
+}
+
+static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr)
+{
+ return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
+ struct btrfs_chunk *c, int nr,
+ u64 val)
+{
+ btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
+}
+
+/* struct btrfs_block_group_item */
+BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
+ used, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
+ used, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
+ struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_flags,
+ struct btrfs_block_group_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_flags,
+ struct btrfs_block_group_item, flags, 64);
+
+/* struct btrfs_inode_ref */
+BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+
+/* struct btrfs_inode_item */
+BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
+BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
+BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
+BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
+BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+
+static inline struct btrfs_timespec *
+btrfs_inode_atime(struct btrfs_inode_item *inode_item)
+{
+ unsigned long ptr = (unsigned long)inode_item;
+ ptr += offsetof(struct btrfs_inode_item, atime);
+ return (struct btrfs_timespec *)ptr;
+}
+
+static inline struct btrfs_timespec *
+btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
+{
+ unsigned long ptr = (unsigned long)inode_item;
+ ptr += offsetof(struct btrfs_inode_item, mtime);
+ return (struct btrfs_timespec *)ptr;
+}
+
+static inline struct btrfs_timespec *
+btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
+{
+ unsigned long ptr = (unsigned long)inode_item;
+ ptr += offsetof(struct btrfs_inode_item, ctime);
+ return (struct btrfs_timespec *)ptr;
+}
+
+static inline struct btrfs_timespec *
+btrfs_inode_otime(struct btrfs_inode_item *inode_item)
+{
+ unsigned long ptr = (unsigned long)inode_item;
+ ptr += offsetof(struct btrfs_inode_item, otime);
+ return (struct btrfs_timespec *)ptr;
+}
+
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
+
+/* struct btrfs_dev_extent */
+BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
+ chunk_tree, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
+ chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
+ chunk_offset, 64);
+BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
+
+static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
+{
+ unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
+ return (u8 *)((unsigned long)dev + ptr);
+}
+
+/* struct btrfs_extent_ref */
+BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
+BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
+BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
+BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
+
+BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
+ objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
+ num_refs, 32);
+
+/* struct btrfs_extent_item */
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
+ refs, 32);
+
+/* struct btrfs_node */
+BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
+BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
+
+static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
+}
+
+static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
+ int nr, u64 val)
+{
+ unsigned long ptr;
+ ptr = offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+ btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
+}
+
+static inline unsigned long btrfs_node_key_ptr_offset(int nr)
+{
+ return offsetof(struct btrfs_node, ptrs) +
+ sizeof(struct btrfs_key_ptr) * nr;
+}
+
+void btrfs_node_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr);
+
+static inline void btrfs_set_node_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ unsigned long ptr;
+ ptr = btrfs_node_key_ptr_offset(nr);
+ write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+ struct btrfs_key_ptr, key, disk_key);
+}
+
+/* struct btrfs_item */
+BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+
+static inline unsigned long btrfs_item_nr_offset(int nr)
+{
+ return offsetof(struct btrfs_leaf, items) +
+ sizeof(struct btrfs_item) * nr;
+}
+
+static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
+ int nr)
+{
+ return (struct btrfs_item *)btrfs_item_nr_offset(nr);
+}
+
+static inline u32 btrfs_item_end(struct extent_buffer *eb,
+ struct btrfs_item *item)
+{
+ return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
+}
+
+static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
+}
+
+static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
+}
+
+static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
+{
+ return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
+}
+
+static inline void btrfs_item_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(eb, nr);
+ read_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+static inline void btrfs_set_item_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ struct btrfs_item *item = btrfs_item_nr(eb, nr);
+ write_eb_member(eb, item, struct btrfs_item, key, disk_key);
+}
+
+BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
+
+/*
+ * struct btrfs_root_ref
+ */
+BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
+BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
+BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
+
+/* struct btrfs_dir_item */
+BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
+BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
+
+static inline void btrfs_dir_item_key(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+
+/* struct btrfs_disk_key */
+BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
+ objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
+ struct btrfs_disk_key *disk)
+{
+ cpu->offset = le64_to_cpu(disk->offset);
+ cpu->type = disk->type;
+ cpu->objectid = le64_to_cpu(disk->objectid);
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
+ struct btrfs_key *cpu)
+{
+ disk->offset = cpu_to_le64(cpu->offset);
+ disk->type = cpu->type;
+ disk->objectid = cpu_to_le64(cpu->objectid);
+}
+
+static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_node_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
+ struct btrfs_key *key, int nr)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_item_key(eb, &disk_key, nr);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
+ struct btrfs_dir_item *item,
+ struct btrfs_key *key)
+{
+ struct btrfs_disk_key disk_key;
+ btrfs_dir_item_key(eb, item, &disk_key);
+ btrfs_disk_key_to_cpu(key, &disk_key);
+}
+
+
+static inline u8 btrfs_key_type(struct btrfs_key *key)
+{
+ return key->type;
+}
+
+static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
+{
+ key->type = val;
+}
+
+/* struct btrfs_header */
+BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
+ generation, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
+BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
+
+static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ return (btrfs_header_flags(eb) & flag) == flag;
+}
+
+static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+ btrfs_set_header_flags(eb, flags | flag);
+ return (flags & flag) == flag;
+}
+
+static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
+{
+ u64 flags = btrfs_header_flags(eb);
+ btrfs_set_header_flags(eb, flags & ~flag);
+ return (flags & flag) == flag;
+}
+
+static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
+{
+ unsigned long ptr = offsetof(struct btrfs_header, fsid);
+ return (u8 *)ptr;
+}
+
+static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+{
+ unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
+ return (u8 *)ptr;
+}
+
+static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
+{
+ unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
+ return (u8 *)ptr;
+}
+
+static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
+{
+ unsigned long ptr = offsetof(struct btrfs_header, csum);
+ return (u8 *)ptr;
+}
+
+static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
+{
+ return NULL;
+}
+
+static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
+{
+ return NULL;
+}
+
+static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
+{
+ return NULL;
+}
+
+static inline int btrfs_is_leaf(struct extent_buffer *eb)
+{
+ return btrfs_header_level(eb) == 0;
+}
+
+/* struct btrfs_root_item */
+BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
+BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
+ last_snapshot, 64);
+
+/* struct btrfs_super_block */
+
+BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
+ struct btrfs_super_block, sys_chunk_array_size, 32);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
+ struct btrfs_super_block, chunk_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
+ root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
+ chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
+ chunk_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
+ log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
+ log_root_transid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
+ log_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
+ total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
+ bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
+ sectorsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
+ nodesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
+ leafsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
+ stripesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
+ root_dir_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
+ num_devices, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
+ compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
+ compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
+ incompat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
+ csum_type, 16);
+
+static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
+{
+ int t = btrfs_super_csum_type(s);
+ BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
+ return btrfs_csum_sizes[t];
+}
+
+static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
+{
+ return offsetof(struct btrfs_leaf, items);
+}
+
+/* struct btrfs_file_extent_item */
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+
+static inline unsigned long
+btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
+{
+ unsigned long offset = (unsigned long)e;
+ offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
+ return offset;
+}
+
+static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
+{
+ return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
+}
+
+BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
+ disk_bytenr, 64);
+BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
+ generation, 64);
+BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
+ disk_num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
+ offset, 64);
+BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
+ num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+ ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+ compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+ other_encoding, 16);
+
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+ struct btrfs_file_extent_item *e)
+{
+ return btrfs_file_extent_ram_bytes(eb, e);
+}
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers. If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+ struct btrfs_item *e)
+{
+ unsigned long offset;
+ offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+ return btrfs_item_size(eb, e) - offset;
+}
+
+static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+static inline int btrfs_set_root_name(struct btrfs_root *root,
+ const char *name, int len)
+{
+ /* if we already have a name just free it */
+ kfree(root->name);
+
+ root->name = kmalloc(len+1, GFP_KERNEL);
+ if (!root->name)
+ return -ENOMEM;
+
+ memcpy(root->name, name, len);
+ root->name[len] = '\0';
+
+ return 0;
+}
+
+static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
+{
+ if (level == 0)
+ return root->leafsize;
+ return root->nodesize;
+}
+
+/* helper function to cast into the data area of the leaf. */
+#define btrfs_item_ptr(leaf, slot, type) \
+ ((type *)(btrfs_leaf_data(leaf) + \
+ btrfs_item_offset_nr(leaf, slot)))
+
+#define btrfs_item_ptr_offset(leaf, slot) \
+ ((unsigned long)(btrfs_leaf_data(leaf) + \
+ btrfs_item_offset_nr(leaf, slot)))
+
+static inline struct dentry *fdentry(struct file *file)
+{
+ return file->f_path.dentry;
+}
+
+/* extent-tree.c */
+int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u32 *refs);
+int btrfs_update_pinned_extents(struct btrfs_root *root,
+ u64 bytenr, u64 num, int pin);
+int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *leaf);
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid, u64 bytenr);
+int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+ struct btrfs_fs_info *info,
+ u64 bytenr);
+u64 btrfs_find_block_group(struct btrfs_root *root,
+ u64 search_start, u64 search_hint, int owner);
+struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u32 blocksize, u64 parent,
+ u64 root_objectid,
+ u64 ref_generation,
+ int level,
+ u64 hint,
+ u64 empty_size);
+struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u32 blocksize);
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 num_bytes, u64 parent, u64 min_bytes,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner, u64 empty_size, u64 hint_byte,
+ u64 search_end, struct btrfs_key *ins, u64 data);
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner, struct btrfs_key *ins);
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner, struct btrfs_key *ins);
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 num_bytes, u64 min_alloc_size,
+ u64 empty_size, u64 hint_byte,
+ u64 search_end, struct btrfs_key *ins,
+ u64 data);
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *orig_buf, struct extent_buffer *buf,
+ u32 *nr_extents);
+int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, u32 nr_extents);
+int btrfs_update_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *orig_buf,
+ struct extent_buffer *buf, int start_slot, int nr);
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid, int pin);
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_io_tree *unpin);
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid);
+int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 orig_parent, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid);
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
+int btrfs_free_block_groups(struct btrfs_fs_info *info);
+int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytes_used,
+ u64 type, u64 chunk_objectid, u64 chunk_offset,
+ u64 size);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 group_start);
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf, u64 orig_start);
+int btrfs_add_dead_reloc_root(struct btrfs_root *root);
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+/* ctree.c */
+int btrfs_previous_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid,
+ int type);
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *node_keys,
+ u64 *nodes, int lowest_level);
+int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *new_key);
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, int lowest_level,
+ int cache_only, u64 min_trans);
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+ struct btrfs_key *max_key,
+ struct btrfs_path *path, int cache_only,
+ u64 min_trans);
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret, u64 prealloc_dest);
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer **cow_ret, u64 new_root_objectid);
+int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, u32 data_size);
+int btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u32 new_size, int from_end);
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ unsigned long split_offset);
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_path *p, int
+ ins_len, int cow);
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *parent,
+ int start_slot, int cache_only, u64 *last_ret,
+ struct btrfs_key *progress);
+void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
+struct btrfs_path *btrfs_alloc_path(void);
+void btrfs_free_path(struct btrfs_path *p);
+void btrfs_init_path(struct btrfs_path *p);
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, int slot, int nr);
+int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 bytenr);
+static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ return btrfs_del_items(trans, root, path, path->slots[0], 1);
+}
+
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, void *data, u32 data_size);
+int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ int nr);
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size, int nr);
+
+static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u32 data_size)
+{
+ return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+}
+
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
+int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root);
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *node,
+ struct extent_buffer *parent);
+/* root-item.c */
+int btrfs_find_root_ref(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+ u64 root_id, u64 ref_id);
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u8 type, u64 ref_id,
+ u64 dirid, u64 sequence,
+ const char *name, int name_len);
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key);
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_root_item
+ *item);
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_root_item
+ *item);
+int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
+ btrfs_root_item *item, struct btrfs_key *key);
+int btrfs_search_root(struct btrfs_root *root, u64 search_start,
+ u64 *found_objectid);
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
+ struct btrfs_root *latest_root);
+/* dir-item.c */
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *name,
+ int name_len, u64 dir,
+ struct btrfs_key *location, u8 type, u64 index);
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, int name_len,
+ int mod);
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ u64 objectid, const char *name, int name_len,
+ int mod);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len);
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *di);
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *name,
+ u16 name_len, const void *data, u16 data_len,
+ u64 dir);
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, u16 name_len,
+ int mod);
+
+/* orphan.c */
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset);
+
+/* inode-map.c */
+int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
+ struct btrfs_root *fs_root,
+ u64 dirid, u64 *objectid);
+int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
+
+/* inode-item.c */
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index);
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index);
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ struct btrfs_key *location, int mod);
+
+/* file-item.c */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 len);
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u32 *dst);
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 pos,
+ u64 disk_offset, u64 disk_num_bytes,
+ u64 num_bytes, u64 offset, u64 ram_bytes,
+ u8 compression, u8 encryption, u16 other_encoding);
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ u64 bytenr, int mod);
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_ordered_sum *sums);
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 file_start, int contig);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+ u64 start, unsigned long len);
+struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, int cow);
+int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ u64 isize);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
+ u64 end, struct list_head *list);
+/* inode.c */
+
+/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
+#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
+#define ClearPageChecked ClearPageFsMisc
+#define SetPageChecked SetPageFsMisc
+#define PageChecked PageFsMisc
+#endif
+
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
+int btrfs_set_inode_index(struct inode *dir, u64 *index);
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, struct inode *inode,
+ const char *name, int name_len);
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct inode *parent_inode, struct inode *inode,
+ const char *name, int name_len, int add_backref, u64 index);
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode, u64 new_size,
+ u32 min_type);
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+int btrfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc);
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *new_root, struct dentry *dentry,
+ u64 new_dirid, u64 alloc_hint);
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+ size_t size, struct bio *bio, unsigned long bio_flags);
+
+unsigned long btrfs_force_ra(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *file,
+ pgoff_t offset, pgoff_t last_index);
+int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
+ int for_del);
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+int btrfs_readpage(struct file *file, struct page *page);
+void btrfs_delete_inode(struct inode *inode);
+void btrfs_put_inode(struct inode *inode);
+void btrfs_read_locked_inode(struct inode *inode);
+int btrfs_write_inode(struct inode *inode, int wait);
+void btrfs_dirty_inode(struct inode *inode);
+struct inode *btrfs_alloc_inode(struct super_block *sb);
+void btrfs_destroy_inode(struct inode *inode);
+int btrfs_init_cachep(void);
+void btrfs_destroy_cachep(void);
+long btrfs_ioctl_trans_end(struct file *file);
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+ struct btrfs_root *root, int wait);
+struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
+ struct btrfs_root *root);
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root, int *is_new);
+int btrfs_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to);
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+ size_t page_offset, u64 start, u64 end,
+ int create);
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode);
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
+int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
+void btrfs_orphan_cleanup(struct btrfs_root *root);
+int btrfs_cont_expand(struct inode *inode, loff_t size);
+
+/* ioctl.c */
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/* file.c */
+int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned);
+int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
+extern struct file_operations btrfs_file_operations;
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ u64 start, u64 end, u64 inline_limit, u64 *hint_block);
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode, u64 start, u64 end);
+int btrfs_release_file(struct inode *inode, struct file *file);
+
+/* tree-defrag.c */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int cache_only);
+
+/* sysfs.c */
+int btrfs_init_sysfs(void);
+void btrfs_exit_sysfs(void);
+int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
+int btrfs_sysfs_add_root(struct btrfs_root *root);
+void btrfs_sysfs_del_root(struct btrfs_root *root);
+void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
+
+/* xattr.c */
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+
+/* super.c */
+u64 btrfs_parse_size(char *str);
+int btrfs_parse_options(struct btrfs_root *root, char *options);
+int btrfs_sync_fs(struct super_block *sb, int wait);
+
+/* acl.c */
+int btrfs_check_acl(struct inode *inode, int mask);
+int btrfs_init_acl(struct inode *inode, struct inode *dir);
+int btrfs_acl_chmod(struct inode *inode);
+
+/* free-space-cache.c */
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytenr, u64 size);
+int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes);
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytenr, u64 size);
+int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes);
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
+ *block_group);
+struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
+ *block_group, u64 offset,
+ u64 bytes);
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytes);
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 00000000000..926a0b287a7
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,386 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+
+/*
+ * insert a name into a directory, doing overflow properly if there is a hash
+ * collision. data_size indicates how big the item inserted should be. On
+ * success a struct btrfs_dir_item pointer is returned, otherwise it is
+ * an ERR_PTR.
+ *
+ * The name is not copied into the dir item, you have to do that yourself.
+ */
+static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
+ *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key,
+ u32 data_size,
+ const char *name,
+ int name_len)
+{
+ int ret;
+ char *ptr;
+ struct btrfs_item *item;
+ struct extent_buffer *leaf;
+
+ ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+ if (ret == -EEXIST) {
+ struct btrfs_dir_item *di;
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di)
+ return ERR_PTR(-EEXIST);
+ ret = btrfs_extend_item(trans, root, path, data_size);
+ WARN_ON(ret > 0);
+ }
+ if (ret < 0)
+ return ERR_PTR(ret);
+ WARN_ON(ret > 0);
+ leaf = path->nodes[0];
+ item = btrfs_item_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+ BUG_ON(data_size > btrfs_item_size(leaf, item));
+ ptr += btrfs_item_size(leaf, item) - data_size;
+ return (struct btrfs_dir_item *)ptr;
+}
+
+/*
+ * xattrs work a lot like directories, this inserts an xattr item
+ * into the tree
+ */
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *name,
+ u16 name_len, const void *data, u16 data_len,
+ u64 dir)
+{
+ int ret = 0;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *dir_item;
+ unsigned long name_ptr, data_ptr;
+ struct btrfs_key key, location;
+ struct btrfs_disk_key disk_key;
+ struct extent_buffer *leaf;
+ u32 data_size;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.offset = btrfs_name_hash(name, name_len);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ if (name_len + data_len + sizeof(struct btrfs_dir_item) >
+ BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
+ return -ENOSPC;
+
+ data_size = sizeof(*dir_item) + name_len + data_len;
+ dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+ name, name_len);
+ /*
+ * FIXME: at some point we should handle xattr's that are larger than
+ * what we can fit in our leaf. We set location to NULL b/c we arent
+ * pointing at anything else, that will change if we store the xattr
+ * data in a separate inode.
+ */
+ BUG_ON(IS_ERR(dir_item));
+ memset(&location, 0, sizeof(location));
+
+ leaf = path->nodes[0];
+ btrfs_cpu_key_to_disk(&disk_key, &location);
+ btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+ btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
+ btrfs_set_dir_name_len(leaf, dir_item, name_len);
+ btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+ btrfs_set_dir_data_len(leaf, dir_item, data_len);
+ name_ptr = (unsigned long)(dir_item + 1);
+ data_ptr = (unsigned long)((char *)name_ptr + name_len);
+
+ write_extent_buffer(leaf, name, name_ptr, name_len);
+ write_extent_buffer(leaf, data, data_ptr, data_len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * insert a directory item in the tree, doing all the magic for
+ * both indexes. 'dir' indicates which objectid to insert it into,
+ * 'location' is the key to stuff into the directory item, 'type' is the
+ * type of the inode we're pointing to, and 'index' is the sequence number
+ * to use for the second index (if one is created).
+ */
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, const char *name, int name_len, u64 dir,
+ struct btrfs_key *location, u8 type, u64 index)
+{
+ int ret = 0;
+ int ret2 = 0;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *dir_item;
+ struct extent_buffer *leaf;
+ unsigned long name_ptr;
+ struct btrfs_key key;
+ struct btrfs_disk_key disk_key;
+ u32 data_size;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+ key.offset = btrfs_name_hash(name, name_len);
+ path = btrfs_alloc_path();
+ data_size = sizeof(*dir_item) + name_len;
+ dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+ name, name_len);
+ if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ if (ret == -EEXIST)
+ goto second_insert;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_cpu_key_to_disk(&disk_key, location);
+ btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+ btrfs_set_dir_type(leaf, dir_item, type);
+ btrfs_set_dir_data_len(leaf, dir_item, 0);
+ btrfs_set_dir_name_len(leaf, dir_item, name_len);
+ btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+ name_ptr = (unsigned long)(dir_item + 1);
+
+ write_extent_buffer(leaf, name, name_ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+
+second_insert:
+ /* FIXME, use some real flag for selecting the extra index */
+ if (root == root->fs_info->tree_root) {
+ ret = 0;
+ goto out;
+ }
+ btrfs_release_path(root, path);
+
+ btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.offset = index;
+ dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+ name, name_len);
+ if (IS_ERR(dir_item)) {
+ ret2 = PTR_ERR(dir_item);
+ goto out;
+ }
+ leaf = path->nodes[0];
+ btrfs_cpu_key_to_disk(&disk_key, location);
+ btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+ btrfs_set_dir_type(leaf, dir_item, type);
+ btrfs_set_dir_data_len(leaf, dir_item, 0);
+ btrfs_set_dir_name_len(leaf, dir_item, name_len);
+ btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+ name_ptr = (unsigned long)(dir_item + 1);
+ write_extent_buffer(leaf, name, name_ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ if (ret)
+ return ret;
+ if (ret2)
+ return ret2;
+ return 0;
+}
+
+/*
+ * lookup a directory item based on name. 'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ */
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, int name_len,
+ int mod)
+{
+ int ret;
+ struct btrfs_key key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+
+ key.offset = btrfs_name_hash(name, name_len);
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ return NULL;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid != dir ||
+ btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
+ found_key.offset != key.offset)
+ return NULL;
+
+ return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+/*
+ * lookup a directory item based on index. 'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ *
+ * The name is used to make sure the index really points to the name you were
+ * looking for.
+ */
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ u64 objectid, const char *name, int name_len,
+ int mod)
+{
+ int ret;
+ struct btrfs_key key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.offset = objectid;
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return ERR_PTR(-ENOENT);
+ return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 dir,
+ const char *name, u16 name_len,
+ int mod)
+{
+ int ret;
+ struct btrfs_key key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+
+ key.objectid = dir;
+ btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.offset = btrfs_name_hash(name, name_len);
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ return NULL;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid != dir ||
+ btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
+ found_key.offset != key.offset)
+ return NULL;
+
+ return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+/*
+ * helper function to look at the directory item pointed to by 'path'
+ * this walks through all the entries in a dir item and finds one
+ * for a specific name.
+ */
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len)
+{
+ struct btrfs_dir_item *dir_item;
+ unsigned long name_ptr;
+ u32 total_len;
+ u32 cur = 0;
+ u32 this_len;
+ struct extent_buffer *leaf;
+
+ leaf = path->nodes[0];
+ dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+ total_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ while (cur < total_len) {
+ this_len = sizeof(*dir_item) +
+ btrfs_dir_name_len(leaf, dir_item) +
+ btrfs_dir_data_len(leaf, dir_item);
+ name_ptr = (unsigned long)(dir_item + 1);
+
+ if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
+ memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
+ return dir_item;
+
+ cur += this_len;
+ dir_item = (struct btrfs_dir_item *)((char *)dir_item +
+ this_len);
+ }
+ return NULL;
+}
+
+/*
+ * given a pointer into a directory item, delete it. This
+ * handles items that have more than one entry in them.
+ */
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *di)
+{
+
+ struct extent_buffer *leaf;
+ u32 sub_item_len;
+ u32 item_len;
+ int ret = 0;
+
+ leaf = path->nodes[0];
+ sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
+ btrfs_dir_data_len(leaf, di);
+ item_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (sub_item_len == item_len) {
+ ret = btrfs_del_item(trans, root, path);
+ } else {
+ /* MARKER */
+ unsigned long ptr = (unsigned long)di;
+ unsigned long start;
+
+ start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+ item_len - (ptr + sub_item_len - start));
+ ret = btrfs_truncate_item(trans, root, path,
+ item_len - sub_item_len, 1);
+ }
+ return 0;
+}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 00000000000..81a313874ae
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2343 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+#include <linux/swap.h>
+#include <linux/radix-tree.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include "compat.h"
+#include "crc32c.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "async-thread.h"
+#include "locking.h"
+#include "ref-cache.h"
+#include "tree-log.h"
+
+static struct extent_io_ops btree_extent_io_ops;
+static void end_workqueue_fn(struct btrfs_work *work);
+
+/*
+ * end_io_wq structs are used to do processing in task context when an IO is
+ * complete. This is used during reads to verify checksums, and it is used
+ * by writes to insert metadata for new file extents after IO is complete.
+ */
+struct end_io_wq {
+ struct bio *bio;
+ bio_end_io_t *end_io;
+ void *private;
+ struct btrfs_fs_info *info;
+ int error;
+ int metadata;
+ struct list_head list;
+ struct btrfs_work work;
+};
+
+/*
+ * async submit bios are used to offload expensive checksumming
+ * onto the worker threads. They checksum file and metadata bios
+ * just before they are sent down the IO stack.
+ */
+struct async_submit_bio {
+ struct inode *inode;
+ struct bio *bio;
+ struct list_head list;
+ extent_submit_bio_hook_t *submit_bio_start;
+ extent_submit_bio_hook_t *submit_bio_done;
+ int rw;
+ int mirror_num;
+ unsigned long bio_flags;
+ struct btrfs_work work;
+};
+
+/*
+ * extents on the btree inode are pretty simple, there's one extent
+ * that covers the entire device
+ */
+static struct extent_map *btree_get_extent(struct inode *inode,
+ struct page *page, size_t page_offset, u64 start, u64 len,
+ int create)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret;
+
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em) {
+ em->bdev =
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ spin_unlock(&em_tree->lock);
+ goto out;
+ }
+ spin_unlock(&em_tree->lock);
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em) {
+ em = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ em->start = 0;
+ em->len = (u64)-1;
+ em->block_len = (u64)-1;
+ em->block_start = 0;
+ em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+ spin_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ if (ret == -EEXIST) {
+ u64 failed_start = em->start;
+ u64 failed_len = em->len;
+
+ free_extent_map(em);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em) {
+ ret = 0;
+ } else {
+ em = lookup_extent_mapping(em_tree, failed_start,
+ failed_len);
+ ret = -EIO;
+ }
+ } else if (ret) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ spin_unlock(&em_tree->lock);
+
+ if (ret)
+ em = ERR_PTR(ret);
+out:
+ return em;
+}
+
+u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
+{
+ return btrfs_crc32c(seed, data, len);
+}
+
+void btrfs_csum_final(u32 crc, char *result)
+{
+ *(__le32 *)result = ~cpu_to_le32(crc);
+}
+
+/*
+ * compute the csum for a btree block, and either verify it or write it
+ * into the csum field of the block.
+ */
+static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
+ int verify)
+{
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+ char *result = NULL;
+ unsigned long len;
+ unsigned long cur_len;
+ unsigned long offset = BTRFS_CSUM_SIZE;
+ char *map_token = NULL;
+ char *kaddr;
+ unsigned long map_start;
+ unsigned long map_len;
+ int err;
+ u32 crc = ~(u32)0;
+ unsigned long inline_result;
+
+ len = buf->len - offset;
+ while (len > 0) {
+ err = map_private_extent_buffer(buf, offset, 32,
+ &map_token, &kaddr,
+ &map_start, &map_len, KM_USER0);
+ if (err)
+ return 1;
+ cur_len = min(len, map_len - (offset - map_start));
+ crc = btrfs_csum_data(root, kaddr + offset - map_start,
+ crc, cur_len);
+ len -= cur_len;
+ offset += cur_len;
+ unmap_extent_buffer(buf, map_token, KM_USER0);
+ }
+ if (csum_size > sizeof(inline_result)) {
+ result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
+ if (!result)
+ return 1;
+ } else {
+ result = (char *)&inline_result;
+ }
+
+ btrfs_csum_final(crc, result);
+
+ if (verify) {
+ if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
+ u32 val;
+ u32 found = 0;
+ memcpy(&found, result, csum_size);
+
+ read_extent_buffer(buf, &val, 0, csum_size);
+ printk(KERN_INFO "btrfs: %s checksum verify failed "
+ "on %llu wanted %X found %X level %d\n",
+ root->fs_info->sb->s_id,
+ buf->start, val, found, btrfs_header_level(buf));
+ if (result != (char *)&inline_result)
+ kfree(result);
+ return 1;
+ }
+ } else {
+ write_extent_buffer(buf, result, 0, csum_size);
+ }
+ if (result != (char *)&inline_result)
+ kfree(result);
+ return 0;
+}
+
+/*
+ * we can't consider a given block up to date unless the transid of the
+ * block matches the transid in the parent node's pointer. This is how we
+ * detect blocks that either didn't get written at all or got written
+ * in the wrong place.
+ */
+static int verify_parent_transid(struct extent_io_tree *io_tree,
+ struct extent_buffer *eb, u64 parent_transid)
+{
+ int ret;
+
+ if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
+ return 0;
+
+ lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
+ if (extent_buffer_uptodate(io_tree, eb) &&
+ btrfs_header_generation(eb) == parent_transid) {
+ ret = 0;
+ goto out;
+ }
+ printk("parent transid verify failed on %llu wanted %llu found %llu\n",
+ (unsigned long long)eb->start,
+ (unsigned long long)parent_transid,
+ (unsigned long long)btrfs_header_generation(eb));
+ ret = 1;
+ clear_extent_buffer_uptodate(io_tree, eb);
+out:
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ GFP_NOFS);
+ return ret;
+}
+
+/*
+ * helper to read a given tree block, doing retries as required when
+ * the checksums don't match and we have alternate mirrors to try.
+ */
+static int btree_read_extent_buffer_pages(struct btrfs_root *root,
+ struct extent_buffer *eb,
+ u64 start, u64 parent_transid)
+{
+ struct extent_io_tree *io_tree;
+ int ret;
+ int num_copies = 0;
+ int mirror_num = 0;
+
+ io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+ while (1) {
+ ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+ btree_get_extent, mirror_num);
+ if (!ret &&
+ !verify_parent_transid(io_tree, eb, parent_transid))
+ return ret;
+
+ num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+ eb->start, eb->len);
+ if (num_copies == 1)
+ return ret;
+
+ mirror_num++;
+ if (mirror_num > num_copies)
+ return ret;
+ }
+ return -EIO;
+}
+
+/*
+ * checksum a dirty tree block before IO. This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block
+ */
+
+static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+{
+ struct extent_io_tree *tree;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 found_start;
+ int found_level;
+ unsigned long len;
+ struct extent_buffer *eb;
+ int ret;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+ if (!page->private)
+ goto out;
+ len = page->private >> 2;
+ WARN_ON(len == 0);
+
+ eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
+ btrfs_header_generation(eb));
+ BUG_ON(ret);
+ found_start = btrfs_header_bytenr(eb);
+ if (found_start != start) {
+ WARN_ON(1);
+ goto err;
+ }
+ if (eb->first_page != page) {
+ WARN_ON(1);
+ goto err;
+ }
+ if (!PageUptodate(page)) {
+ WARN_ON(1);
+ goto err;
+ }
+ found_level = btrfs_header_level(eb);
+
+ csum_tree_block(root, eb, 0);
+err:
+ free_extent_buffer(eb);
+out:
+ return 0;
+}
+
+static int check_tree_block_fsid(struct btrfs_root *root,
+ struct extent_buffer *eb)
+{
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ u8 fsid[BTRFS_UUID_SIZE];
+ int ret = 1;
+
+ read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
+ BTRFS_FSID_SIZE);
+ while (fs_devices) {
+ if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
+ ret = 0;
+ break;
+ }
+ fs_devices = fs_devices->seed;
+ }
+ return ret;
+}
+
+static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+ struct extent_state *state)
+{
+ struct extent_io_tree *tree;
+ u64 found_start;
+ int found_level;
+ unsigned long len;
+ struct extent_buffer *eb;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ int ret = 0;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+ if (!page->private)
+ goto out;
+
+ len = page->private >> 2;
+ WARN_ON(len == 0);
+
+ eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+
+ found_start = btrfs_header_bytenr(eb);
+ if (found_start != start) {
+ printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
+ (unsigned long long)found_start,
+ (unsigned long long)eb->start);
+ ret = -EIO;
+ goto err;
+ }
+ if (eb->first_page != page) {
+ printk(KERN_INFO "btrfs bad first page %lu %lu\n",
+ eb->first_page->index, page->index);
+ WARN_ON(1);
+ ret = -EIO;
+ goto err;
+ }
+ if (check_tree_block_fsid(root, eb)) {
+ printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+ (unsigned long long)eb->start);
+ ret = -EIO;
+ goto err;
+ }
+ found_level = btrfs_header_level(eb);
+
+ ret = csum_tree_block(root, eb, 1);
+ if (ret)
+ ret = -EIO;
+
+ end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
+ end = eb->start + end - 1;
+err:
+ free_extent_buffer(eb);
+out:
+ return ret;
+}
+
+static void end_workqueue_bio(struct bio *bio, int err)
+{
+ struct end_io_wq *end_io_wq = bio->bi_private;
+ struct btrfs_fs_info *fs_info;
+
+ fs_info = end_io_wq->info;
+ end_io_wq->error = err;
+ end_io_wq->work.func = end_workqueue_fn;
+ end_io_wq->work.flags = 0;
+
+ if (bio->bi_rw & (1 << BIO_RW)) {
+ if (end_io_wq->metadata)
+ btrfs_queue_worker(&fs_info->endio_meta_write_workers,
+ &end_io_wq->work);
+ else
+ btrfs_queue_worker(&fs_info->endio_write_workers,
+ &end_io_wq->work);
+ } else {
+ if (end_io_wq->metadata)
+ btrfs_queue_worker(&fs_info->endio_meta_workers,
+ &end_io_wq->work);
+ else
+ btrfs_queue_worker(&fs_info->endio_workers,
+ &end_io_wq->work);
+ }
+}
+
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+ int metadata)
+{
+ struct end_io_wq *end_io_wq;
+ end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
+ if (!end_io_wq)
+ return -ENOMEM;
+
+ end_io_wq->private = bio->bi_private;
+ end_io_wq->end_io = bio->bi_end_io;
+ end_io_wq->info = info;
+ end_io_wq->error = 0;
+ end_io_wq->bio = bio;
+ end_io_wq->metadata = metadata;
+
+ bio->bi_private = end_io_wq;
+ bio->bi_end_io = end_workqueue_bio;
+ return 0;
+}
+
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
+{
+ unsigned long limit = min_t(unsigned long,
+ info->workers.max_workers,
+ info->fs_devices->open_devices);
+ return 256 * limit;
+}
+
+int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
+{
+ return atomic_read(&info->nr_async_bios) >
+ btrfs_async_submit_limit(info);
+}
+
+static void run_one_async_start(struct btrfs_work *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct async_submit_bio *async;
+
+ async = container_of(work, struct async_submit_bio, work);
+ fs_info = BTRFS_I(async->inode)->root->fs_info;
+ async->submit_bio_start(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags);
+}
+
+static void run_one_async_done(struct btrfs_work *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct async_submit_bio *async;
+ int limit;
+
+ async = container_of(work, struct async_submit_bio, work);
+ fs_info = BTRFS_I(async->inode)->root->fs_info;
+
+ limit = btrfs_async_submit_limit(fs_info);
+ limit = limit * 2 / 3;
+
+ atomic_dec(&fs_info->nr_async_submits);
+
+ if (atomic_read(&fs_info->nr_async_submits) < limit &&
+ waitqueue_active(&fs_info->async_submit_wait))
+ wake_up(&fs_info->async_submit_wait);
+
+ async->submit_bio_done(async->inode, async->rw, async->bio,
+ async->mirror_num, async->bio_flags);
+}
+
+static void run_one_async_free(struct btrfs_work *work)
+{
+ struct async_submit_bio *async;
+
+ async = container_of(work, struct async_submit_bio, work);
+ kfree(async);
+}
+
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+ int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
+ extent_submit_bio_hook_t *submit_bio_start,
+ extent_submit_bio_hook_t *submit_bio_done)
+{
+ struct async_submit_bio *async;
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ return -ENOMEM;
+
+ async->inode = inode;
+ async->rw = rw;
+ async->bio = bio;
+ async->mirror_num = mirror_num;
+ async->submit_bio_start = submit_bio_start;
+ async->submit_bio_done = submit_bio_done;
+
+ async->work.func = run_one_async_start;
+ async->work.ordered_func = run_one_async_done;
+ async->work.ordered_free = run_one_async_free;
+
+ async->work.flags = 0;
+ async->bio_flags = bio_flags;
+
+ atomic_inc(&fs_info->nr_async_submits);
+ btrfs_queue_worker(&fs_info->workers, &async->work);
+#if 0
+ int limit = btrfs_async_submit_limit(fs_info);
+ if (atomic_read(&fs_info->nr_async_submits) > limit) {
+ wait_event_timeout(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) < limit),
+ HZ/10);
+
+ wait_event_timeout(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_bios) < limit),
+ HZ/10);
+ }
+#endif
+ while (atomic_read(&fs_info->async_submit_draining) &&
+ atomic_read(&fs_info->nr_async_submits)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) == 0));
+ }
+
+ return 0;
+}
+
+static int btree_csum_one_bio(struct bio *bio)
+{
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int bio_index = 0;
+ struct btrfs_root *root;
+
+ WARN_ON(bio->bi_vcnt <= 0);
+ while (bio_index < bio->bi_vcnt) {
+ root = BTRFS_I(bvec->bv_page->mapping->host)->root;
+ csum_dirty_buffer(root, bvec->bv_page);
+ bio_index++;
+ bvec++;
+ }
+ return 0;
+}
+
+static int __btree_submit_bio_start(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ /*
+ * when we're called for a write, we're already in the async
+ * submission context. Just jump into btrfs_map_bio
+ */
+ btree_csum_one_bio(bio);
+ return 0;
+}
+
+static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ /*
+ * when we're called for a write, we're already in the async
+ * submission context. Just jump into btrfs_map_bio
+ */
+ return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+}
+
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ int ret;
+
+ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+ bio, 1);
+ BUG_ON(ret);
+
+ if (!(rw & (1 << BIO_RW))) {
+ /*
+ * called for a read, do the setup so that checksum validation
+ * can happen in the async kernel threads
+ */
+ return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+ mirror_num, 0);
+ }
+ /*
+ * kthread helpers are used to submit writes so that checksumming
+ * can happen in parallel across all CPUs
+ */
+ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num, 0,
+ __btree_submit_bio_start,
+ __btree_submit_bio_done);
+}
+
+static int btree_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ if (current->flags & PF_MEMALLOC) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ return extent_write_full_page(tree, page, btree_get_extent, wbc);
+}
+
+static int btree_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(mapping->host)->io_tree;
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ u64 num_dirty;
+ u64 start = 0;
+ unsigned long thresh = 32 * 1024 * 1024;
+
+ if (wbc->for_kupdate)
+ return 0;
+
+ num_dirty = count_range_bits(tree, &start, (u64)-1,
+ thresh, EXTENT_DIRTY);
+ if (num_dirty < thresh)
+ return 0;
+ }
+ return extent_writepages(tree, mapping, btree_get_extent, wbc);
+}
+
+static int btree_readpage(struct file *file, struct page *page)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ return extent_read_full_page(tree, page, btree_get_extent);
+}
+
+static int btree_releasepage(struct page *page, gfp_t gfp_flags)
+{
+ struct extent_io_tree *tree;
+ struct extent_map_tree *map;
+ int ret;
+
+ if (PageWriteback(page) || PageDirty(page))
+ return 0;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ map = &BTRFS_I(page->mapping->host)->extent_tree;
+
+ ret = try_release_extent_state(map, tree, page, gfp_flags);
+ if (!ret)
+ return 0;
+
+ ret = try_release_extent_buffer(tree, page);
+ if (ret == 1) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+
+ return ret;
+}
+
+static void btree_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ extent_invalidatepage(tree, page, offset);
+ btree_releasepage(page, GFP_NOFS);
+ if (PagePrivate(page)) {
+ printk(KERN_WARNING "btrfs warning page private not zero "
+ "on page %llu\n", (unsigned long long)page_offset(page));
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+}
+
+#if 0
+static int btree_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct buffer_head *bh;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ struct buffer_head *head;
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, root->fs_info->sb->s_blocksize,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
+ }
+ head = page_buffers(page);
+ bh = head;
+ do {
+ if (buffer_dirty(bh))
+ csum_tree_block(root, bh, 0);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ return block_write_full_page(page, btree_get_block, wbc);
+}
+#endif
+
+static struct address_space_operations btree_aops = {
+ .readpage = btree_readpage,
+ .writepage = btree_writepage,
+ .writepages = btree_writepages,
+ .releasepage = btree_releasepage,
+ .invalidatepage = btree_invalidatepage,
+ .sync_page = block_sync_page,
+};
+
+int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ u64 parent_transid)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ int ret = 0;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return 0;
+ read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
+ buf, 0, 0, btree_get_extent, 0);
+ free_extent_buffer(buf);
+ return ret;
+}
+
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize)
+{
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_buffer *eb;
+ eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
+ bytenr, blocksize, GFP_NOFS);
+ return eb;
+}
+
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize)
+{
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_buffer *eb;
+
+ eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
+ bytenr, blocksize, NULL, GFP_NOFS);
+ return eb;
+}
+
+
+int btrfs_write_tree_block(struct extent_buffer *buf)
+{
+ return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
+ buf->start + buf->len - 1, WB_SYNC_ALL);
+}
+
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+ return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
+ buf->start, buf->start + buf->len - 1);
+}
+
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+ u32 blocksize, u64 parent_transid)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_io_tree *io_tree;
+ int ret;
+
+ io_tree = &BTRFS_I(btree_inode)->io_tree;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return NULL;
+
+ ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+
+ if (ret == 0)
+ buf->flags |= EXTENT_UPTODATE;
+ else
+ WARN_ON(1);
+ return buf;
+
+}
+
+int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf)
+{
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ if (btrfs_header_generation(buf) ==
+ root->fs_info->running_transaction->transid) {
+ WARN_ON(!btrfs_tree_locked(buf));
+ clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+ buf);
+ }
+ return 0;
+}
+
+static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+ u32 stripesize, struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ root->node = NULL;
+ root->commit_root = NULL;
+ root->ref_tree = NULL;
+ root->sectorsize = sectorsize;
+ root->nodesize = nodesize;
+ root->leafsize = leafsize;
+ root->stripesize = stripesize;
+ root->ref_cows = 0;
+ root->track_dirty = 0;
+
+ root->fs_info = fs_info;
+ root->objectid = objectid;
+ root->last_trans = 0;
+ root->highest_inode = 0;
+ root->last_inode_alloc = 0;
+ root->name = NULL;
+ root->in_sysfs = 0;
+
+ INIT_LIST_HEAD(&root->dirty_list);
+ INIT_LIST_HEAD(&root->orphan_list);
+ INIT_LIST_HEAD(&root->dead_list);
+ spin_lock_init(&root->node_lock);
+ spin_lock_init(&root->list_lock);
+ mutex_init(&root->objectid_mutex);
+ mutex_init(&root->log_mutex);
+ extent_io_tree_init(&root->dirty_log_pages,
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+
+ btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
+ root->ref_tree = &root->ref_tree_struct;
+
+ memset(&root->root_key, 0, sizeof(root->root_key));
+ memset(&root->root_item, 0, sizeof(root->root_item));
+ memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+ memset(&root->root_kobj, 0, sizeof(root->root_kobj));
+ root->defrag_trans_start = fs_info->generation;
+ init_completion(&root->kobj_unregister);
+ root->defrag_running = 0;
+ root->defrag_level = 0;
+ root->root_key.objectid = objectid;
+ root->anon_super.s_root = NULL;
+ root->anon_super.s_dev = 0;
+ INIT_LIST_HEAD(&root->anon_super.s_list);
+ INIT_LIST_HEAD(&root->anon_super.s_instances);
+ init_rwsem(&root->anon_super.s_umount);
+
+ return 0;
+}
+
+static int find_and_setup_root(struct btrfs_root *tree_root,
+ struct btrfs_fs_info *fs_info,
+ u64 objectid,
+ struct btrfs_root *root)
+{
+ int ret;
+ u32 blocksize;
+ u64 generation;
+
+ __setup_root(tree_root->nodesize, tree_root->leafsize,
+ tree_root->sectorsize, tree_root->stripesize,
+ root, fs_info, objectid);
+ ret = btrfs_find_last_root(tree_root, objectid,
+ &root->root_item, &root->root_key);
+ BUG_ON(ret);
+
+ generation = btrfs_root_generation(&root->root_item);
+ blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+ blocksize, generation);
+ BUG_ON(!root->node);
+ return 0;
+}
+
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct extent_buffer *eb;
+ struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ u64 start = 0;
+ u64 end = 0;
+ int ret;
+
+ if (!log_root_tree)
+ return 0;
+
+ while (1) {
+ ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
+ 0, &start, &end, EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ clear_extent_dirty(&log_root_tree->dirty_log_pages,
+ start, end, GFP_NOFS);
+ }
+ eb = fs_info->log_root_tree->node;
+
+ WARN_ON(btrfs_header_level(eb) != 0);
+ WARN_ON(btrfs_header_nritems(eb) != 0);
+
+ ret = btrfs_free_reserved_extent(fs_info->tree_root,
+ eb->start, eb->len);
+ BUG_ON(ret);
+
+ free_extent_buffer(eb);
+ kfree(fs_info->log_root_tree);
+ fs_info->log_root_tree = NULL;
+ return 0;
+}
+
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+
+ root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (!root)
+ return -ENOMEM;
+
+ __setup_root(tree_root->nodesize, tree_root->leafsize,
+ tree_root->sectorsize, tree_root->stripesize,
+ root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+
+ root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+ root->ref_cows = 0;
+
+ root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
+ 0, BTRFS_TREE_LOG_OBJECTID,
+ trans->transid, 0, 0, 0);
+
+ btrfs_set_header_nritems(root->node, 0);
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_bytenr(root->node, root->node->start);
+ btrfs_set_header_generation(root->node, trans->transid);
+ btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
+
+ write_extent_buffer(root->node, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(root->node),
+ BTRFS_FSID_SIZE);
+ btrfs_mark_buffer_dirty(root->node);
+ btrfs_tree_unlock(root->node);
+ fs_info->log_root_tree = root;
+ return 0;
+}
+
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+ struct btrfs_key *location)
+{
+ struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_path *path;
+ struct extent_buffer *l;
+ u64 highest_inode;
+ u64 generation;
+ u32 blocksize;
+ int ret = 0;
+
+ root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+ if (location->offset == (u64)-1) {
+ ret = find_and_setup_root(tree_root, fs_info,
+ location->objectid, root);
+ if (ret) {
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+ goto insert;
+ }
+
+ __setup_root(tree_root->nodesize, tree_root->leafsize,
+ tree_root->sectorsize, tree_root->stripesize,
+ root, fs_info, location->objectid);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
+ if (ret != 0) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
+ }
+ l = path->nodes[0];
+ read_extent_buffer(l, &root->root_item,
+ btrfs_item_ptr_offset(l, path->slots[0]),
+ sizeof(root->root_item));
+ memcpy(&root->root_key, location, sizeof(*location));
+ ret = 0;
+out:
+ btrfs_release_path(root, path);
+ btrfs_free_path(path);
+ if (ret) {
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+ generation = btrfs_root_generation(&root->root_item);
+ blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+ blocksize, generation);
+ BUG_ON(!root->node);
+insert:
+ if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+ root->ref_cows = 1;
+ ret = btrfs_find_highest_inode(root, &highest_inode);
+ if (ret == 0) {
+ root->highest_inode = highest_inode;
+ root->last_inode_alloc = highest_inode;
+ }
+ }
+ return root;
+}
+
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_objectid)
+{
+ struct btrfs_root *root;
+
+ if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return fs_info->tree_root;
+ if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return fs_info->extent_root;
+
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)root_objectid);
+ return root;
+}
+
+struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location)
+{
+ struct btrfs_root *root;
+ int ret;
+
+ if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return fs_info->tree_root;
+ if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return fs_info->extent_root;
+ if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ return fs_info->chunk_root;
+ if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
+ return fs_info->dev_root;
+ if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return fs_info->csum_root;
+
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)location->objectid);
+ if (root)
+ return root;
+
+ root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+ if (IS_ERR(root))
+ return root;
+
+ set_anon_super(&root->anon_super, NULL);
+
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
+ if (ret) {
+ free_extent_buffer(root->node);
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+ if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_find_dead_roots(fs_info->tree_root,
+ root->root_key.objectid, root);
+ BUG_ON(ret);
+ btrfs_orphan_cleanup(root);
+ }
+ return root;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location,
+ const char *name, int namelen)
+{
+ struct btrfs_root *root;
+ int ret;
+
+ root = btrfs_read_fs_root_no_name(fs_info, location);
+ if (!root)
+ return NULL;
+
+ if (root->in_sysfs)
+ return root;
+
+ ret = btrfs_set_root_name(root, name, namelen);
+ if (ret) {
+ free_extent_buffer(root->node);
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+#if 0
+ ret = btrfs_sysfs_add_root(root);
+ if (ret) {
+ free_extent_buffer(root->node);
+ kfree(root->name);
+ kfree(root);
+ return ERR_PTR(ret);
+ }
+#endif
+ root->in_sysfs = 1;
+ return root;
+}
+
+static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+{
+ struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
+ int ret = 0;
+ struct list_head *cur;
+ struct btrfs_device *device;
+ struct backing_dev_info *bdi;
+#if 0
+ if ((bdi_bits & (1 << BDI_write_congested)) &&
+ btrfs_congested_async(info, 0))
+ return 1;
+#endif
+ list_for_each(cur, &info->fs_devices->devices) {
+ device = list_entry(cur, struct btrfs_device, dev_list);
+ if (!device->bdev)
+ continue;
+ bdi = blk_get_backing_dev_info(device->bdev);
+ if (bdi && bdi_congested(bdi, bdi_bits)) {
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * this unplugs every device on the box, and it is only used when page
+ * is null
+ */
+static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+ struct list_head *cur;
+ struct btrfs_device *device;
+ struct btrfs_fs_info *info;
+
+ info = (struct btrfs_fs_info *)bdi->unplug_io_data;
+ list_for_each(cur, &info->fs_devices->devices) {
+ device = list_entry(cur, struct btrfs_device, dev_list);
+ if (!device->bdev)
+ continue;
+
+ bdi = blk_get_backing_dev_info(device->bdev);
+ if (bdi->unplug_io_fn)
+ bdi->unplug_io_fn(bdi, page);
+ }
+}
+
+static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+ struct inode *inode;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct address_space *mapping;
+ u64 offset;
+
+ /* the generic O_DIRECT read code does this */
+ if (1 || !page) {
+ __unplug_io_fn(bdi, page);
+ return;
+ }
+
+ /*
+ * page->mapping may change at any time. Get a consistent copy
+ * and use that for everything below
+ */
+ smp_mb();
+ mapping = page->mapping;
+ if (!mapping)
+ return;
+
+ inode = mapping->host;
+
+ /*
+ * don't do the expensive searching for a small number of
+ * devices
+ */
+ if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
+ __unplug_io_fn(bdi, page);
+ return;
+ }
+
+ offset = page_offset(page);
+
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+ spin_unlock(&em_tree->lock);
+ if (!em) {
+ __unplug_io_fn(bdi, page);
+ return;
+ }
+
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ __unplug_io_fn(bdi, page);
+ return;
+ }
+ offset = offset - em->start;
+ btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
+ em->block_start + offset, page);
+ free_extent_map(em);
+}
+
+static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
+{
+ bdi_init(bdi);
+ bdi->ra_pages = default_backing_dev_info.ra_pages;
+ bdi->state = 0;
+ bdi->capabilities = default_backing_dev_info.capabilities;
+ bdi->unplug_io_fn = btrfs_unplug_io_fn;
+ bdi->unplug_io_data = info;
+ bdi->congested_fn = btrfs_congested_fn;
+ bdi->congested_data = info;
+ return 0;
+}
+
+static int bio_ready_for_csum(struct bio *bio)
+{
+ u64 length = 0;
+ u64 buf_len = 0;
+ u64 start = 0;
+ struct page *page;
+ struct extent_io_tree *io_tree = NULL;
+ struct btrfs_fs_info *info = NULL;
+ struct bio_vec *bvec;
+ int i;
+ int ret;
+
+ bio_for_each_segment(bvec, bio, i) {
+ page = bvec->bv_page;
+ if (page->private == EXTENT_PAGE_PRIVATE) {
+ length += bvec->bv_len;
+ continue;
+ }
+ if (!page->private) {
+ length += bvec->bv_len;
+ continue;
+ }
+ length = bvec->bv_len;
+ buf_len = page->private >> 2;
+ start = page_offset(page) + bvec->bv_offset;
+ io_tree = &BTRFS_I(page->mapping->host)->io_tree;
+ info = BTRFS_I(page->mapping->host)->root->fs_info;
+ }
+ /* are we fully contained in this bio? */
+ if (buf_len <= length)
+ return 1;
+
+ ret = extent_range_uptodate(io_tree, start + length,
+ start + buf_len - 1);
+ if (ret == 1)
+ return ret;
+ return ret;
+}
+
+/*
+ * called by the kthread helper functions to finally call the bio end_io
+ * functions. This is where read checksum verification actually happens
+ */
+static void end_workqueue_fn(struct btrfs_work *work)
+{
+ struct bio *bio;
+ struct end_io_wq *end_io_wq;
+ struct btrfs_fs_info *fs_info;
+ int error;
+
+ end_io_wq = container_of(work, struct end_io_wq, work);
+ bio = end_io_wq->bio;
+ fs_info = end_io_wq->info;
+
+ /* metadata bio reads are special because the whole tree block must
+ * be checksummed at once. This makes sure the entire block is in
+ * ram and up to date before trying to verify things. For
+ * blocksize <= pagesize, it is basically a noop
+ */
+ if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
+ !bio_ready_for_csum(bio)) {
+ btrfs_queue_worker(&fs_info->endio_meta_workers,
+ &end_io_wq->work);
+ return;
+ }
+ error = end_io_wq->error;
+ bio->bi_private = end_io_wq->private;
+ bio->bi_end_io = end_io_wq->end_io;
+ kfree(end_io_wq);
+ bio_endio(bio, error);
+}
+
+static int cleaner_kthread(void *arg)
+{
+ struct btrfs_root *root = arg;
+
+ do {
+ smp_mb();
+ if (root->fs_info->closing)
+ break;
+
+ vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_clean_old_snapshots(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ if (freezing(current)) {
+ refrigerator();
+ } else {
+ smp_mb();
+ if (root->fs_info->closing)
+ break;
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+static int transaction_kthread(void *arg)
+{
+ struct btrfs_root *root = arg;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_transaction *cur;
+ unsigned long now;
+ unsigned long delay;
+ int ret;
+
+ do {
+ smp_mb();
+ if (root->fs_info->closing)
+ break;
+
+ delay = HZ * 30;
+ vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+ mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+ if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
+ printk(KERN_INFO "btrfs: total reference cache "
+ "size %llu\n",
+ root->fs_info->total_ref_cache_size);
+ }
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ cur = root->fs_info->running_transaction;
+ if (!cur) {
+ mutex_unlock(&root->fs_info->trans_mutex);
+ goto sleep;
+ }
+
+ now = get_seconds();
+ if (now < cur->start_time || now - cur->start_time < 30) {
+ mutex_unlock(&root->fs_info->trans_mutex);
+ delay = HZ * 5;
+ goto sleep;
+ }
+ mutex_unlock(&root->fs_info->trans_mutex);
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_commit_transaction(trans, root);
+sleep:
+ wake_up_process(root->fs_info->cleaner_kthread);
+ mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+
+ if (freezing(current)) {
+ refrigerator();
+ } else {
+ if (root->fs_info->closing)
+ break;
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(delay);
+ __set_current_state(TASK_RUNNING);
+ }
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+struct btrfs_root *open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options)
+{
+ u32 sectorsize;
+ u32 nodesize;
+ u32 leafsize;
+ u32 blocksize;
+ u32 stripesize;
+ u64 generation;
+ u64 features;
+ struct btrfs_key location;
+ struct buffer_head *bh;
+ struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+ struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+ struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+ struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
+ GFP_NOFS);
+ struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+ struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+ struct btrfs_root *log_tree_root;
+
+ int ret;
+ int err = -EINVAL;
+
+ struct btrfs_super_block *disk_super;
+
+ if (!extent_root || !tree_root || !fs_info ||
+ !chunk_root || !dev_root || !csum_root) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
+ INIT_LIST_HEAD(&fs_info->trans_list);
+ INIT_LIST_HEAD(&fs_info->dead_roots);
+ INIT_LIST_HEAD(&fs_info->hashers);
+ INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+ spin_lock_init(&fs_info->hash_lock);
+ spin_lock_init(&fs_info->delalloc_lock);
+ spin_lock_init(&fs_info->new_trans_lock);
+ spin_lock_init(&fs_info->ref_cache_lock);
+
+ init_completion(&fs_info->kobj_unregister);
+ fs_info->tree_root = tree_root;
+ fs_info->extent_root = extent_root;
+ fs_info->csum_root = csum_root;
+ fs_info->chunk_root = chunk_root;
+ fs_info->dev_root = dev_root;
+ fs_info->fs_devices = fs_devices;
+ INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
+ INIT_LIST_HEAD(&fs_info->space_info);
+ btrfs_mapping_init(&fs_info->mapping_tree);
+ atomic_set(&fs_info->nr_async_submits, 0);
+ atomic_set(&fs_info->async_delalloc_pages, 0);
+ atomic_set(&fs_info->async_submit_draining, 0);
+ atomic_set(&fs_info->nr_async_bios, 0);
+ atomic_set(&fs_info->throttles, 0);
+ atomic_set(&fs_info->throttle_gen, 0);
+ fs_info->sb = sb;
+ fs_info->max_extent = (u64)-1;
+ fs_info->max_inline = 8192 * 1024;
+ setup_bdi(fs_info, &fs_info->bdi);
+ fs_info->btree_inode = new_inode(sb);
+ fs_info->btree_inode->i_ino = 1;
+ fs_info->btree_inode->i_nlink = 1;
+
+ fs_info->thread_pool_size = min_t(unsigned long,
+ num_online_cpus() + 2, 8);
+
+ INIT_LIST_HEAD(&fs_info->ordered_extents);
+ spin_lock_init(&fs_info->ordered_extent_lock);
+
+ sb->s_blocksize = 4096;
+ sb->s_blocksize_bits = blksize_bits(4096);
+
+ /*
+ * we set the i_size on the btree inode to the max possible int.
+ * the real end of the address space is determined by all of
+ * the devices in the system
+ */
+ fs_info->btree_inode->i_size = OFFSET_MAX;
+ fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+ fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
+
+ extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
+ fs_info->btree_inode->i_mapping,
+ GFP_NOFS);
+ extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
+ GFP_NOFS);
+
+ BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+
+ spin_lock_init(&fs_info->block_group_cache_lock);
+ fs_info->block_group_cache_tree.rb_node = NULL;
+
+ extent_io_tree_init(&fs_info->pinned_extents,
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+ extent_io_tree_init(&fs_info->pending_del,
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+ extent_io_tree_init(&fs_info->extent_ins,
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->do_barriers = 1;
+
+ INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
+ btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
+ btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
+
+ BTRFS_I(fs_info->btree_inode)->root = tree_root;
+ memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+ sizeof(struct btrfs_key));
+ insert_inode_hash(fs_info->btree_inode);
+
+ mutex_init(&fs_info->trans_mutex);
+ mutex_init(&fs_info->tree_log_mutex);
+ mutex_init(&fs_info->drop_mutex);
+ mutex_init(&fs_info->extent_ins_mutex);
+ mutex_init(&fs_info->pinned_mutex);
+ mutex_init(&fs_info->chunk_mutex);
+ mutex_init(&fs_info->transaction_kthread_mutex);
+ mutex_init(&fs_info->cleaner_mutex);
+ mutex_init(&fs_info->volume_mutex);
+ mutex_init(&fs_info->tree_reloc_mutex);
+ init_waitqueue_head(&fs_info->transaction_throttle);
+ init_waitqueue_head(&fs_info->transaction_wait);
+ init_waitqueue_head(&fs_info->async_submit_wait);
+ init_waitqueue_head(&fs_info->tree_log_wait);
+ atomic_set(&fs_info->tree_log_commit, 0);
+ atomic_set(&fs_info->tree_log_writers, 0);
+ fs_info->tree_log_transid = 0;
+
+ __setup_root(4096, 4096, 4096, 4096, tree_root,
+ fs_info, BTRFS_ROOT_TREE_OBJECTID);
+
+
+ bh = btrfs_read_dev_super(fs_devices->latest_bdev);
+ if (!bh)
+ goto fail_iput;
+
+ memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
+ memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
+ sizeof(fs_info->super_for_commit));
+ brelse(bh);
+
+ memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+
+ disk_super = &fs_info->super_copy;
+ if (!btrfs_super_root(disk_super))
+ goto fail_iput;
+
+ ret = btrfs_parse_options(tree_root, options);
+ if (ret) {
+ err = ret;
+ goto fail_iput;
+ }
+
+ features = btrfs_super_incompat_flags(disk_super) &
+ ~BTRFS_FEATURE_INCOMPAT_SUPP;
+ if (features) {
+ printk(KERN_ERR "BTRFS: couldn't mount because of "
+ "unsupported optional features (%Lx).\n",
+ features);
+ err = -EINVAL;
+ goto fail_iput;
+ }
+
+ features = btrfs_super_compat_ro_flags(disk_super) &
+ ~BTRFS_FEATURE_COMPAT_RO_SUPP;
+ if (!(sb->s_flags & MS_RDONLY) && features) {
+ printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
+ "unsupported option features (%Lx).\n",
+ features);
+ err = -EINVAL;
+ goto fail_iput;
+ }
+
+ /*
+ * we need to start all the end_io workers up front because the
+ * queue work function gets called at interrupt time, and so it
+ * cannot dynamically grow.
+ */
+ btrfs_init_workers(&fs_info->workers, "worker",
+ fs_info->thread_pool_size);
+
+ btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
+ fs_info->thread_pool_size);
+
+ btrfs_init_workers(&fs_info->submit_workers, "submit",
+ min_t(u64, fs_devices->num_devices,
+ fs_info->thread_pool_size));
+
+ /* a higher idle thresh on the submit workers makes it much more
+ * likely that bios will be send down in a sane order to the
+ * devices
+ */
+ fs_info->submit_workers.idle_thresh = 64;
+
+ fs_info->workers.idle_thresh = 16;
+ fs_info->workers.ordered = 1;
+
+ fs_info->delalloc_workers.idle_thresh = 2;
+ fs_info->delalloc_workers.ordered = 1;
+
+ btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
+ btrfs_init_workers(&fs_info->endio_workers, "endio",
+ fs_info->thread_pool_size);
+ btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
+ fs_info->thread_pool_size);
+ btrfs_init_workers(&fs_info->endio_meta_write_workers,
+ "endio-meta-write", fs_info->thread_pool_size);
+ btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
+ fs_info->thread_pool_size);
+
+ /*
+ * endios are largely parallel and should have a very
+ * low idle thresh
+ */
+ fs_info->endio_workers.idle_thresh = 4;
+ fs_info->endio_write_workers.idle_thresh = 64;
+ fs_info->endio_meta_write_workers.idle_thresh = 64;
+
+ btrfs_start_workers(&fs_info->workers, 1);
+ btrfs_start_workers(&fs_info->submit_workers, 1);
+ btrfs_start_workers(&fs_info->delalloc_workers, 1);
+ btrfs_start_workers(&fs_info->fixup_workers, 1);
+ btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+ btrfs_start_workers(&fs_info->endio_meta_workers,
+ fs_info->thread_pool_size);
+ btrfs_start_workers(&fs_info->endio_meta_write_workers,
+ fs_info->thread_pool_size);
+ btrfs_start_workers(&fs_info->endio_write_workers,
+ fs_info->thread_pool_size);
+
+ fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+ fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+ 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+
+ nodesize = btrfs_super_nodesize(disk_super);
+ leafsize = btrfs_super_leafsize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = btrfs_super_stripesize(disk_super);
+ tree_root->nodesize = nodesize;
+ tree_root->leafsize = leafsize;
+ tree_root->sectorsize = sectorsize;
+ tree_root->stripesize = stripesize;
+
+ sb->s_blocksize = sectorsize;
+ sb->s_blocksize_bits = blksize_bits(sectorsize);
+
+ if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+ sizeof(disk_super->magic))) {
+ printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
+ goto fail_sb_buffer;
+ }
+
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = btrfs_read_sys_array(tree_root);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret) {
+ printk(KERN_WARNING "btrfs: failed to read the system "
+ "array on %s\n", sb->s_id);
+ goto fail_sys_array;
+ }
+
+ blocksize = btrfs_level_size(tree_root,
+ btrfs_super_chunk_root_level(disk_super));
+ generation = btrfs_super_chunk_root_generation(disk_super);
+
+ __setup_root(nodesize, leafsize, sectorsize, stripesize,
+ chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+
+ chunk_root->node = read_tree_block(chunk_root,
+ btrfs_super_chunk_root(disk_super),
+ blocksize, generation);
+ BUG_ON(!chunk_root->node);
+
+ read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
+ BTRFS_UUID_SIZE);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = btrfs_read_chunk_tree(chunk_root);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret) {
+ printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
+ sb->s_id);
+ goto fail_chunk_root;
+ }
+
+ btrfs_close_extra_devices(fs_devices);
+
+ blocksize = btrfs_level_size(tree_root,
+ btrfs_super_root_level(disk_super));
+ generation = btrfs_super_generation(disk_super);
+
+ tree_root->node = read_tree_block(tree_root,
+ btrfs_super_root(disk_super),
+ blocksize, generation);
+ if (!tree_root->node)
+ goto fail_chunk_root;
+
+
+ ret = find_and_setup_root(tree_root, fs_info,
+ BTRFS_EXTENT_TREE_OBJECTID, extent_root);
+ if (ret)
+ goto fail_tree_root;
+ extent_root->track_dirty = 1;
+
+ ret = find_and_setup_root(tree_root, fs_info,
+ BTRFS_DEV_TREE_OBJECTID, dev_root);
+ dev_root->track_dirty = 1;
+
+ if (ret)
+ goto fail_extent_root;
+
+ ret = find_and_setup_root(tree_root, fs_info,
+ BTRFS_CSUM_TREE_OBJECTID, csum_root);
+ if (ret)
+ goto fail_extent_root;
+
+ csum_root->track_dirty = 1;
+
+ btrfs_read_block_groups(extent_root);
+
+ fs_info->generation = generation;
+ fs_info->last_trans_committed = generation;
+ fs_info->data_alloc_profile = (u64)-1;
+ fs_info->metadata_alloc_profile = (u64)-1;
+ fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+ fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+ "btrfs-cleaner");
+ if (!fs_info->cleaner_kthread)
+ goto fail_csum_root;
+
+ fs_info->transaction_kthread = kthread_run(transaction_kthread,
+ tree_root,
+ "btrfs-transaction");
+ if (!fs_info->transaction_kthread)
+ goto fail_cleaner;
+
+ if (btrfs_super_log_root(disk_super) != 0) {
+ u64 bytenr = btrfs_super_log_root(disk_super);
+
+ if (fs_devices->rw_devices == 0) {
+ printk(KERN_WARNING "Btrfs log replay required "
+ "on RO media\n");
+ err = -EIO;
+ goto fail_trans_kthread;
+ }
+ blocksize =
+ btrfs_level_size(tree_root,
+ btrfs_super_log_root_level(disk_super));
+
+ log_tree_root = kzalloc(sizeof(struct btrfs_root),
+ GFP_NOFS);
+
+ __setup_root(nodesize, leafsize, sectorsize, stripesize,
+ log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+
+ log_tree_root->node = read_tree_block(tree_root, bytenr,
+ blocksize,
+ generation + 1);
+ ret = btrfs_recover_log_trees(log_tree_root);
+ BUG_ON(ret);
+
+ if (sb->s_flags & MS_RDONLY) {
+ ret = btrfs_commit_super(tree_root);
+ BUG_ON(ret);
+ }
+ }
+
+ if (!(sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_cleanup_reloc_trees(tree_root);
+ BUG_ON(ret);
+ }
+
+ location.objectid = BTRFS_FS_TREE_OBJECTID;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = (u64)-1;
+
+ fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
+ if (!fs_info->fs_root)
+ goto fail_trans_kthread;
+ return tree_root;
+
+fail_trans_kthread:
+ kthread_stop(fs_info->transaction_kthread);
+fail_cleaner:
+ kthread_stop(fs_info->cleaner_kthread);
+
+ /*
+ * make sure we're done with the btree inode before we stop our
+ * kthreads
+ */
+ filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+
+fail_csum_root:
+ free_extent_buffer(csum_root->node);
+fail_extent_root:
+ free_extent_buffer(extent_root->node);
+fail_tree_root:
+ free_extent_buffer(tree_root->node);
+fail_chunk_root:
+ free_extent_buffer(chunk_root->node);
+fail_sys_array:
+ free_extent_buffer(dev_root->node);
+fail_sb_buffer:
+ btrfs_stop_workers(&fs_info->fixup_workers);
+ btrfs_stop_workers(&fs_info->delalloc_workers);
+ btrfs_stop_workers(&fs_info->workers);
+ btrfs_stop_workers(&fs_info->endio_workers);
+ btrfs_stop_workers(&fs_info->endio_meta_workers);
+ btrfs_stop_workers(&fs_info->endio_meta_write_workers);
+ btrfs_stop_workers(&fs_info->endio_write_workers);
+ btrfs_stop_workers(&fs_info->submit_workers);
+fail_iput:
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+ iput(fs_info->btree_inode);
+fail:
+ btrfs_close_devices(fs_info->fs_devices);
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+ kfree(extent_root);
+ kfree(tree_root);
+ bdi_destroy(&fs_info->bdi);
+ kfree(fs_info);
+ kfree(chunk_root);
+ kfree(dev_root);
+ kfree(csum_root);
+ return ERR_PTR(err);
+}
+
+static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+{
+ char b[BDEVNAME_SIZE];
+
+ if (uptodate) {
+ set_buffer_uptodate(bh);
+ } else {
+ if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+ printk(KERN_WARNING "lost page write due to "
+ "I/O error on %s\n",
+ bdevname(bh->b_bdev, b));
+ }
+ /* note, we dont' set_buffer_write_io_error because we have
+ * our own ways of dealing with the IO errors
+ */
+ clear_buffer_uptodate(bh);
+ }
+ unlock_buffer(bh);
+ put_bh(bh);
+}
+
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+{
+ struct buffer_head *bh;
+ struct buffer_head *latest = NULL;
+ struct btrfs_super_block *super;
+ int i;
+ u64 transid = 0;
+ u64 bytenr;
+
+ /* we would like to check all the supers, but that would make
+ * a btrfs mount succeed after a mkfs from a different FS.
+ * So, we need to add a special mount option to scan for
+ * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+ */
+ for (i = 0; i < 1; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
+ break;
+ bh = __bread(bdev, bytenr / 4096, 4096);
+ if (!bh)
+ continue;
+
+ super = (struct btrfs_super_block *)bh->b_data;
+ if (btrfs_super_bytenr(super) != bytenr ||
+ strncmp((char *)(&super->magic), BTRFS_MAGIC,
+ sizeof(super->magic))) {
+ brelse(bh);
+ continue;
+ }
+
+ if (!latest || btrfs_super_generation(super) > transid) {
+ brelse(latest);
+ latest = bh;
+ transid = btrfs_super_generation(super);
+ } else {
+ brelse(bh);
+ }
+ }
+ return latest;
+}
+
+static int write_dev_supers(struct btrfs_device *device,
+ struct btrfs_super_block *sb,
+ int do_barriers, int wait, int max_mirrors)
+{
+ struct buffer_head *bh;
+ int i;
+ int ret;
+ int errors = 0;
+ u32 crc;
+ u64 bytenr;
+ int last_barrier = 0;
+
+ if (max_mirrors == 0)
+ max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+
+ /* make sure only the last submit_bh does a barrier */
+ if (do_barriers) {
+ for (i = 0; i < max_mirrors; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ device->total_bytes)
+ break;
+ last_barrier = i;
+ }
+ }
+
+ for (i = 0; i < max_mirrors; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+ break;
+
+ if (wait) {
+ bh = __find_get_block(device->bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ BUG_ON(!bh);
+ brelse(bh);
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ brelse(bh);
+ continue;
+ }
+ } else {
+ btrfs_set_super_bytenr(sb, bytenr);
+
+ crc = ~(u32)0;
+ crc = btrfs_csum_data(NULL, (char *)sb +
+ BTRFS_CSUM_SIZE, crc,
+ BTRFS_SUPER_INFO_SIZE -
+ BTRFS_CSUM_SIZE);
+ btrfs_csum_final(crc, sb->csum);
+
+ bh = __getblk(device->bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+
+ set_buffer_uptodate(bh);
+ get_bh(bh);
+ lock_buffer(bh);
+ bh->b_end_io = btrfs_end_buffer_write_sync;
+ }
+
+ if (i == last_barrier && do_barriers && device->barriers) {
+ ret = submit_bh(WRITE_BARRIER, bh);
+ if (ret == -EOPNOTSUPP) {
+ printk("btrfs: disabling barriers on dev %s\n",
+ device->name);
+ set_buffer_uptodate(bh);
+ device->barriers = 0;
+ get_bh(bh);
+ lock_buffer(bh);
+ ret = submit_bh(WRITE, bh);
+ }
+ } else {
+ ret = submit_bh(WRITE, bh);
+ }
+
+ if (!ret && wait) {
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ errors++;
+ } else if (ret) {
+ errors++;
+ }
+ if (wait)
+ brelse(bh);
+ }
+ return errors < i ? 0 : -1;
+}
+
+int write_all_supers(struct btrfs_root *root, int max_mirrors)
+{
+ struct list_head *cur;
+ struct list_head *head = &root->fs_info->fs_devices->devices;
+ struct btrfs_device *dev;
+ struct btrfs_super_block *sb;
+ struct btrfs_dev_item *dev_item;
+ int ret;
+ int do_barriers;
+ int max_errors;
+ int total_errors = 0;
+ u64 flags;
+
+ max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+ do_barriers = !btrfs_test_opt(root, NOBARRIER);
+
+ sb = &root->fs_info->super_for_commit;
+ dev_item = &sb->dev_item;
+ list_for_each(cur, head) {
+ dev = list_entry(cur, struct btrfs_device, dev_list);
+ if (!dev->bdev) {
+ total_errors++;
+ continue;
+ }
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ btrfs_set_stack_device_generation(dev_item, 0);
+ btrfs_set_stack_device_type(dev_item, dev->type);
+ btrfs_set_stack_device_id(dev_item, dev->devid);
+ btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
+ btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+ btrfs_set_stack_device_io_align(dev_item, dev->io_align);
+ btrfs_set_stack_device_io_width(dev_item, dev->io_width);
+ btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
+ memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
+ memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
+
+ flags = btrfs_super_flags(sb);
+ btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
+
+ ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
+ if (ret)
+ total_errors++;
+ }
+ if (total_errors > max_errors) {
+ printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+ total_errors);
+ BUG();
+ }
+
+ total_errors = 0;
+ list_for_each(cur, head) {
+ dev = list_entry(cur, struct btrfs_device, dev_list);
+ if (!dev->bdev)
+ continue;
+ if (!dev->in_fs_metadata || !dev->writeable)
+ continue;
+
+ ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
+ if (ret)
+ total_errors++;
+ }
+ if (total_errors > max_errors) {
+ printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+ total_errors);
+ BUG();
+ }
+ return 0;
+}
+
+int write_ctree_super(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int max_mirrors)
+{
+ int ret;
+
+ ret = write_all_supers(root, max_mirrors);
+ return ret;
+}
+
+int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+{
+ radix_tree_delete(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid);
+ if (root->anon_super.s_dev) {
+ down_write(&root->anon_super.s_umount);
+ kill_anon_super(&root->anon_super);
+ }
+ if (root->node)
+ free_extent_buffer(root->node);
+ if (root->commit_root)
+ free_extent_buffer(root->commit_root);
+ kfree(root->name);
+ kfree(root);
+ return 0;
+}
+
+static int del_fs_roots(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ struct btrfs_root *gang[8];
+ int i;
+
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang));
+ if (!ret)
+ break;
+ for (i = 0; i < ret; i++)
+ btrfs_free_fs_root(fs_info, gang[i]);
+ }
+ return 0;
+}
+
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
+{
+ u64 root_objectid = 0;
+ struct btrfs_root *gang[8];
+ int i;
+ int ret;
+
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang));
+ if (!ret)
+ break;
+ for (i = 0; i < ret; i++) {
+ root_objectid = gang[i]->root_key.objectid;
+ ret = btrfs_find_dead_roots(fs_info->tree_root,
+ root_objectid, gang[i]);
+ BUG_ON(ret);
+ btrfs_orphan_cleanup(gang[i]);
+ }
+ root_objectid++;
+ }
+ return 0;
+}
+
+int btrfs_commit_super(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_clean_old_snapshots(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ /* run commit again to drop the original snapshot */
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_commit_transaction(trans, root);
+ ret = btrfs_write_and_wait_transaction(NULL, root);
+ BUG_ON(ret);
+
+ ret = write_ctree_super(NULL, root, 0);
+ return ret;
+}
+
+int close_ctree(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ fs_info->closing = 1;
+ smp_mb();
+
+ kthread_stop(root->fs_info->transaction_kthread);
+ kthread_stop(root->fs_info->cleaner_kthread);
+
+ if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_commit_super(root);
+ if (ret)
+ printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+ }
+
+ if (fs_info->delalloc_bytes) {
+ printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
+ fs_info->delalloc_bytes);
+ }
+ if (fs_info->total_ref_cache_size) {
+ printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
+ (unsigned long long)fs_info->total_ref_cache_size);
+ }
+
+ if (fs_info->extent_root->node)
+ free_extent_buffer(fs_info->extent_root->node);
+
+ if (fs_info->tree_root->node)
+ free_extent_buffer(fs_info->tree_root->node);
+
+ if (root->fs_info->chunk_root->node)
+ free_extent_buffer(root->fs_info->chunk_root->node);
+
+ if (root->fs_info->dev_root->node)
+ free_extent_buffer(root->fs_info->dev_root->node);
+
+ if (root->fs_info->csum_root->node)
+ free_extent_buffer(root->fs_info->csum_root->node);
+
+ btrfs_free_block_groups(root->fs_info);
+
+ del_fs_roots(fs_info);
+
+ iput(fs_info->btree_inode);
+
+ btrfs_stop_workers(&fs_info->fixup_workers);
+ btrfs_stop_workers(&fs_info->delalloc_workers);
+ btrfs_stop_workers(&fs_info->workers);
+ btrfs_stop_workers(&fs_info->endio_workers);
+ btrfs_stop_workers(&fs_info->endio_meta_workers);
+ btrfs_stop_workers(&fs_info->endio_meta_write_workers);
+ btrfs_stop_workers(&fs_info->endio_write_workers);
+ btrfs_stop_workers(&fs_info->submit_workers);
+
+#if 0
+ while (!list_empty(&fs_info->hashers)) {
+ struct btrfs_hasher *hasher;
+ hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
+ hashers);
+ list_del(&hasher->hashers);
+ crypto_free_hash(&fs_info->hash_tfm);
+ kfree(hasher);
+ }
+#endif
+ btrfs_close_devices(fs_info->fs_devices);
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+ bdi_destroy(&fs_info->bdi);
+
+ kfree(fs_info->extent_root);
+ kfree(fs_info->tree_root);
+ kfree(fs_info->chunk_root);
+ kfree(fs_info->dev_root);
+ kfree(fs_info->csum_root);
+ return 0;
+}
+
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
+{
+ int ret;
+ struct inode *btree_inode = buf->first_page->mapping->host;
+
+ ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+ if (!ret)
+ return ret;
+
+ ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
+ parent_transid);
+ return !ret;
+}
+
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
+{
+ struct inode *btree_inode = buf->first_page->mapping->host;
+ return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
+ buf);
+}
+
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
+{
+ struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+ u64 transid = btrfs_header_generation(buf);
+ struct inode *btree_inode = root->fs_info->btree_inode;
+
+ WARN_ON(!btrfs_tree_locked(buf));
+ if (transid != root->fs_info->generation) {
+ printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+ "found %llu running %llu\n",
+ (unsigned long long)buf->start,
+ (unsigned long long)transid,
+ (unsigned long long)root->fs_info->generation);
+ WARN_ON(1);
+ }
+ set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+}
+
+void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+{
+ /*
+ * looks as though older kernels can get into trouble with
+ * this code, they end up stuck in balance_dirty_pages forever
+ */
+ struct extent_io_tree *tree;
+ u64 num_dirty;
+ u64 start = 0;
+ unsigned long thresh = 32 * 1024 * 1024;
+ tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+
+ if (current_is_pdflush() || current->flags & PF_MEMALLOC)
+ return;
+
+ num_dirty = count_range_bits(tree, &start, (u64)-1,
+ thresh, EXTENT_DIRTY);
+ if (num_dirty > thresh) {
+ balance_dirty_pages_ratelimited_nr(
+ root->fs_info->btree_inode->i_mapping, 1);
+ }
+ return;
+}
+
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+{
+ struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+ int ret;
+ ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+ if (ret == 0)
+ buf->flags |= EXTENT_UPTODATE;
+ return ret;
+}
+
+int btree_lock_page_hook(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_buffer *eb;
+ unsigned long len;
+ u64 bytenr = page_offset(page);
+
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+
+ len = page->private >> 2;
+ eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
+ if (!eb)
+ goto out;
+
+ btrfs_tree_lock(eb);
+ spin_lock(&root->fs_info->hash_lock);
+ btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+ spin_unlock(&root->fs_info->hash_lock);
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+out:
+ lock_page(page);
+ return 0;
+}
+
+static struct extent_io_ops btree_extent_io_ops = {
+ .write_cache_pages_lock_hook = btree_lock_page_hook,
+ .readpage_end_io_hook = btree_readpage_end_io_hook,
+ .submit_bio_hook = btree_submit_bio_hook,
+ /* note we're sharing with inode.c for the merge bio hook */
+ .merge_bio_hook = btrfs_merge_bio_hook,
+};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 00000000000..c0ff404c31b
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __DISKIO__
+#define __DISKIO__
+
+#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
+#define BTRFS_SUPER_INFO_SIZE 4096
+
+#define BTRFS_SUPER_MIRROR_MAX 3
+#define BTRFS_SUPER_MIRROR_SHIFT 12
+
+static inline u64 btrfs_sb_offset(int mirror)
+{
+ u64 start = 16 * 1024;
+ if (mirror)
+ return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
+ return BTRFS_SUPER_INFO_OFFSET;
+}
+
+struct btrfs_device;
+struct btrfs_fs_devices;
+
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+ u32 blocksize, u64 parent_transid);
+int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ u64 parent_transid);
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize);
+int clean_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf);
+struct btrfs_root *open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options);
+int close_ctree(struct btrfs_root *root);
+int write_ctree_super(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int max_mirrors);
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_commit_super(struct btrfs_root *root);
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_objectid);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location,
+ const char *name, int namelen);
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+ struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location);
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
+int btrfs_insert_dev_radix(struct btrfs_root *root,
+ struct block_device *bdev,
+ u64 device_id,
+ u64 block_start,
+ u64 num_blocks);
+void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
+int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
+int wait_on_tree_block_writeback(struct btrfs_root *root,
+ struct extent_buffer *buf);
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
+u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
+void btrfs_csum_final(u32 crc, char *result);
+int btrfs_open_device(struct btrfs_device *dev);
+int btrfs_verify_block_csum(struct btrfs_root *root,
+ struct extent_buffer *buf);
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+ int metadata);
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+ int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags,
+ extent_submit_bio_hook_t *submit_bio_start,
+ extent_submit_bio_hook_t *submit_bio_done);
+
+int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
+int btrfs_write_tree_block(struct extent_buffer *buf);
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btree_lock_page_hook(struct page *page);
+#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 00000000000..85315d2c90d
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,203 @@
+#include <linux/fs.h>
+#include <linux/types.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "export.h"
+#include "compat.h"
+
+#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
+ parent_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
+ parent_root_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
+
+static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
+ int connectable)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *)fh;
+ struct inode *inode = dentry->d_inode;
+ int len = *max_len;
+ int type;
+
+ if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
+ (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
+ return 255;
+
+ len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+ type = FILEID_BTRFS_WITHOUT_PARENT;
+
+ fid->objectid = BTRFS_I(inode)->location.objectid;
+ fid->root_objectid = BTRFS_I(inode)->root->objectid;
+ fid->gen = inode->i_generation;
+
+ if (connectable && !S_ISDIR(inode->i_mode)) {
+ struct inode *parent;
+ u64 parent_root_id;
+
+ spin_lock(&dentry->d_lock);
+
+ parent = dentry->d_parent->d_inode;
+ fid->parent_objectid = BTRFS_I(parent)->location.objectid;
+ fid->parent_gen = parent->i_generation;
+ parent_root_id = BTRFS_I(parent)->root->objectid;
+
+ spin_unlock(&dentry->d_lock);
+
+ if (parent_root_id != fid->root_objectid) {
+ fid->parent_root_objectid = parent_root_id;
+ len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
+ type = FILEID_BTRFS_WITH_PARENT_ROOT;
+ } else {
+ len = BTRFS_FID_SIZE_CONNECTABLE;
+ type = FILEID_BTRFS_WITH_PARENT;
+ }
+ }
+
+ *max_len = len;
+ return type;
+}
+
+static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u32 generation)
+{
+ struct btrfs_root *root;
+ struct inode *inode;
+ struct btrfs_key key;
+
+ key.objectid = root_objectid;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = (u64)-1;
+
+ root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
+ if (IS_ERR(root))
+ return ERR_CAST(root);
+
+ key.objectid = objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+
+ inode = btrfs_iget(sb, &key, root, NULL);
+ if (IS_ERR(inode))
+ return (void *)inode;
+
+ if (generation != inode->i_generation) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+
+ return d_obtain_alias(inode);
+}
+
+static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+ u64 objectid, root_objectid;
+ u32 generation;
+
+ if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+ if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
+ return NULL;
+ root_objectid = fid->root_objectid;
+ } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
+ if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+ return NULL;
+ root_objectid = fid->parent_root_objectid;
+ } else
+ return NULL;
+
+ objectid = fid->parent_objectid;
+ generation = fid->parent_gen;
+
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+}
+
+static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+ u64 objectid, root_objectid;
+ u32 generation;
+
+ if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
+ fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
+ (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
+ fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
+ (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
+ fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
+ return NULL;
+
+ objectid = fid->objectid;
+ root_objectid = fid->root_objectid;
+ generation = fid->gen;
+
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+}
+
+static struct dentry *btrfs_get_parent(struct dentry *child)
+{
+ struct inode *dir = child->d_inode;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int slot;
+ u64 objectid;
+ int ret;
+
+ path = btrfs_alloc_path();
+
+ key.objectid = dir->i_ino;
+ btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ /* Error */
+ btrfs_free_path(path);
+ return ERR_PTR(ret);
+ }
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (ret) {
+ /* btrfs_search_slot() returns the slot where we'd want to
+ insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
+ The _real_ backref, telling us what the parent inode
+ _actually_ is, will be in the slot _before_ the one
+ that btrfs_search_slot() returns. */
+ if (!slot) {
+ /* Unless there is _no_ key in the tree before... */
+ btrfs_free_path(path);
+ return ERR_PTR(-EIO);
+ }
+ slot--;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ btrfs_free_path(path);
+
+ if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
+ return ERR_PTR(-EINVAL);
+
+ objectid = key.offset;
+
+ /* If we are already at the root of a subvol, return the real root */
+ if (objectid == dir->i_ino)
+ return dget(dir->i_sb->s_root);
+
+ /* Build a new key for the inode item */
+ key.objectid = objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+
+ return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
+}
+
+const struct export_operations btrfs_export_ops = {
+ .encode_fh = btrfs_encode_fh,
+ .fh_to_dentry = btrfs_fh_to_dentry,
+ .fh_to_parent = btrfs_fh_to_parent,
+ .get_parent = btrfs_get_parent,
+};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 00000000000..074348a9584
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
+#ifndef BTRFS_EXPORT_H
+#define BTRFS_EXPORT_H
+
+#include <linux/exportfs.h>
+
+extern const struct export_operations btrfs_export_ops;
+
+struct btrfs_fid {
+ u64 objectid;
+ u64 root_objectid;
+ u32 gen;
+
+ u64 parent_objectid;
+ u32 parent_gen;
+
+ u64 parent_root_objectid;
+} __attribute__ ((packed));
+
+#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 00000000000..293da650873
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,5986 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/version.h>
+#include "compat.h"
+#include "hash.h"
+#include "crc32c.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "locking.h"
+#include "ref-cache.h"
+#include "compat.h"
+
+#define PENDING_EXTENT_INSERT 0
+#define PENDING_EXTENT_DELETE 1
+#define PENDING_BACKREF_UPDATE 2
+
+struct pending_extent_op {
+ int type;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 parent;
+ u64 orig_parent;
+ u64 generation;
+ u64 orig_generation;
+ int level;
+ struct list_head list;
+ int del;
+};
+
+static int finish_current_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, int all);
+static int del_pending_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, int all);
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int is_data);
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int alloc,
+ int mark_free);
+
+static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+{
+ return (cache->flags & bits) == bits;
+}
+
+/*
+ * this adds the block group to the fs_info rb tree for the block group
+ * cache
+ */
+static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct btrfs_block_group_cache *cache;
+
+ spin_lock(&info->block_group_cache_lock);
+ p = &info->block_group_cache_tree.rb_node;
+
+ while (*p) {
+ parent = *p;
+ cache = rb_entry(parent, struct btrfs_block_group_cache,
+ cache_node);
+ if (block_group->key.objectid < cache->key.objectid) {
+ p = &(*p)->rb_left;
+ } else if (block_group->key.objectid > cache->key.objectid) {
+ p = &(*p)->rb_right;
+ } else {
+ spin_unlock(&info->block_group_cache_lock);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&block_group->cache_node, parent, p);
+ rb_insert_color(&block_group->cache_node,
+ &info->block_group_cache_tree);
+ spin_unlock(&info->block_group_cache_lock);
+
+ return 0;
+}
+
+/*
+ * This will return the block group at or after bytenr if contains is 0, else
+ * it will return the block group that contains the bytenr
+ */
+static struct btrfs_block_group_cache *
+block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
+ int contains)
+{
+ struct btrfs_block_group_cache *cache, *ret = NULL;
+ struct rb_node *n;
+ u64 end, start;
+
+ spin_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_node;
+
+ while (n) {
+ cache = rb_entry(n, struct btrfs_block_group_cache,
+ cache_node);
+ end = cache->key.objectid + cache->key.offset - 1;
+ start = cache->key.objectid;
+
+ if (bytenr < start) {
+ if (!contains && (!ret || start < ret->key.objectid))
+ ret = cache;
+ n = n->rb_left;
+ } else if (bytenr > start) {
+ if (contains && bytenr <= end) {
+ ret = cache;
+ break;
+ }
+ n = n->rb_right;
+ } else {
+ ret = cache;
+ break;
+ }
+ }
+ if (ret)
+ atomic_inc(&ret->count);
+ spin_unlock(&info->block_group_cache_lock);
+
+ return ret;
+}
+
+/*
+ * this is only called by cache_block_group, since we could have freed extents
+ * we need to check the pinned_extents for any extents that can't be used yet
+ * since their free space will be released as soon as the transaction commits.
+ */
+static int add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end)
+{
+ u64 extent_start, extent_end, size;
+ int ret;
+
+ mutex_lock(&info->pinned_mutex);
+ while (start < end) {
+ ret = find_first_extent_bit(&info->pinned_extents, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ if (extent_start == start) {
+ start = extent_end + 1;
+ } else if (extent_start > start && extent_start < end) {
+ size = extent_start - start;
+ ret = btrfs_add_free_space(block_group, start,
+ size);
+ BUG_ON(ret);
+ start = extent_end + 1;
+ } else {
+ break;
+ }
+ }
+
+ if (start < end) {
+ size = end - start;
+ ret = btrfs_add_free_space(block_group, start, size);
+ BUG_ON(ret);
+ }
+ mutex_unlock(&info->pinned_mutex);
+
+ return 0;
+}
+
+static int remove_sb_from_cache(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ u64 bytenr;
+ u64 *logical;
+ int stripe_len;
+ int i, nr, ret;
+
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+ cache->key.objectid, bytenr, 0,
+ &logical, &nr, &stripe_len);
+ BUG_ON(ret);
+ while (nr--) {
+ btrfs_remove_free_space(cache, logical[nr],
+ stripe_len);
+ }
+ kfree(logical);
+ }
+ return 0;
+}
+
+static int cache_block_group(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_path *path;
+ int ret = 0;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ int slot;
+ u64 last;
+
+ if (!block_group)
+ return 0;
+
+ root = root->fs_info->extent_root;
+
+ if (block_group->cached)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 2;
+ /*
+ * we get into deadlocks with paths held by callers of this function.
+ * since the alloc_mutex is protecting things right now, just
+ * skip the locking here
+ */
+ path->skip_locking = 1;
+ last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+ key.objectid = last;
+ key.offset = 0;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto err;
+ if (ret == 0)
+ continue;
+ else
+ break;
+ }
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid < block_group->key.objectid)
+ goto next;
+
+ if (key.objectid >= block_group->key.objectid +
+ block_group->key.offset)
+ break;
+
+ if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
+ add_new_free_space(block_group, root->fs_info, last,
+ key.objectid);
+
+ last = key.objectid + key.offset;
+ }
+next:
+ path->slots[0]++;
+ }
+
+ add_new_free_space(block_group, root->fs_info, last,
+ block_group->key.objectid +
+ block_group->key.offset);
+
+ remove_sb_from_cache(root, block_group);
+ block_group->cached = 1;
+ ret = 0;
+err:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * return the block group that starts at or after bytenr
+ */
+static struct btrfs_block_group_cache *
+btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = block_group_cache_tree_search(info, bytenr, 0);
+
+ return cache;
+}
+
+/*
+ * return the block group that contains teh given bytenr
+ */
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+ struct btrfs_fs_info *info,
+ u64 bytenr)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = block_group_cache_tree_search(info, bytenr, 1);
+
+ return cache;
+}
+
+static inline void put_block_group(struct btrfs_block_group_cache *cache)
+{
+ if (atomic_dec_and_test(&cache->count))
+ kfree(cache);
+}
+
+static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
+ u64 flags)
+{
+ struct list_head *head = &info->space_info;
+ struct list_head *cur;
+ struct btrfs_space_info *found;
+ list_for_each(cur, head) {
+ found = list_entry(cur, struct btrfs_space_info, list);
+ if (found->flags == flags)
+ return found;
+ }
+ return NULL;
+}
+
+static u64 div_factor(u64 num, int factor)
+{
+ if (factor == 10)
+ return num;
+ num *= factor;
+ do_div(num, 10);
+ return num;
+}
+
+u64 btrfs_find_block_group(struct btrfs_root *root,
+ u64 search_start, u64 search_hint, int owner)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 used;
+ u64 last = max(search_hint, search_start);
+ u64 group_start = 0;
+ int full_search = 0;
+ int factor = 9;
+ int wrapped = 0;
+again:
+ while (1) {
+ cache = btrfs_lookup_first_block_group(root->fs_info, last);
+ if (!cache)
+ break;
+
+ spin_lock(&cache->lock);
+ last = cache->key.objectid + cache->key.offset;
+ used = btrfs_block_group_used(&cache->item);
+
+ if ((full_search || !cache->ro) &&
+ block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
+ if (used + cache->pinned + cache->reserved <
+ div_factor(cache->key.offset, factor)) {
+ group_start = cache->key.objectid;
+ spin_unlock(&cache->lock);
+ put_block_group(cache);
+ goto found;
+ }
+ }
+ spin_unlock(&cache->lock);
+ put_block_group(cache);
+ cond_resched();
+ }
+ if (!wrapped) {
+ last = search_start;
+ wrapped = 1;
+ goto again;
+ }
+ if (!full_search && factor < 10) {
+ last = search_start;
+ full_search = 1;
+ factor = 10;
+ goto again;
+ }
+found:
+ return group_start;
+}
+
+/* simple helper to search for an existing extent at a given offset */
+int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ key.objectid = start;
+ key.offset = len;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+ ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
+ 0, 0);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Back reference rules. Back refs have three main goals:
+ *
+ * 1) differentiate between all holders of references to an extent so that
+ * when a reference is dropped we can make sure it was a valid reference
+ * before freeing the extent.
+ *
+ * 2) Provide enough information to quickly find the holders of an extent
+ * if we notice a given block is corrupted or bad.
+ *
+ * 3) Make it easy to migrate blocks for FS shrinking or storage pool
+ * maintenance. This is actually the same as #2, but with a slightly
+ * different use case.
+ *
+ * File extents can be referenced by:
+ *
+ * - multiple snapshots, subvolumes, or different generations in one subvol
+ * - different files inside a single subvolume
+ * - different offsets inside a file (bookend extents in file.c)
+ *
+ * The extent ref structure has fields for:
+ *
+ * - Objectid of the subvolume root
+ * - Generation number of the tree holding the reference
+ * - objectid of the file holding the reference
+ * - number of references holding by parent node (alway 1 for tree blocks)
+ *
+ * Btree leaf may hold multiple references to a file extent. In most cases,
+ * these references are from same file and the corresponding offsets inside
+ * the file are close together.
+ *
+ * When a file extent is allocated the fields are filled in:
+ * (root_key.objectid, trans->transid, inode objectid, 1)
+ *
+ * When a leaf is cow'd new references are added for every file extent found
+ * in the leaf. It looks similar to the create case, but trans->transid will
+ * be different when the block is cow'd.
+ *
+ * (root_key.objectid, trans->transid, inode objectid,
+ * number of references in the leaf)
+ *
+ * When a file extent is removed either during snapshot deletion or
+ * file truncation, we find the corresponding back reference and check
+ * the following fields:
+ *
+ * (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
+ * inode objectid)
+ *
+ * Btree extents can be referenced by:
+ *
+ * - Different subvolumes
+ * - Different generations of the same subvolume
+ *
+ * When a tree block is created, back references are inserted:
+ *
+ * (root->root_key.objectid, trans->transid, level, 1)
+ *
+ * When a tree block is cow'd, new back references are added for all the
+ * blocks it points to. If the tree block isn't in reference counted root,
+ * the old back references are removed. These new back references are of
+ * the form (trans->transid will have increased since creation):
+ *
+ * (root->root_key.objectid, trans->transid, level, 1)
+ *
+ * When a backref is in deleting, the following fields are checked:
+ *
+ * if backref was for a tree root:
+ * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
+ * else
+ * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
+ *
+ * Back Reference Key composing:
+ *
+ * The key objectid corresponds to the first byte in the extent, the key
+ * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
+ * byte of parent extent. If a extent is tree root, the key offset is set
+ * to the key objectid.
+ */
+
+static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 ref_root, u64 ref_generation,
+ u64 owner_objectid, int del)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_ref *ref;
+ struct extent_buffer *leaf;
+ u64 ref_objectid;
+ int ret;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_REF_KEY;
+ key.offset = parent;
+
+ ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+ ref_objectid = btrfs_ref_objectid(leaf, ref);
+ if (btrfs_ref_root(leaf, ref) != ref_root ||
+ btrfs_ref_generation(leaf, ref) != ref_generation ||
+ (ref_objectid != owner_objectid &&
+ ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
+ ret = -EIO;
+ WARN_ON(1);
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * updates all the backrefs that are pending on update_list for the
+ * extent_root
+ */
+static noinline int update_backrefs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_path *path,
+ struct list_head *update_list)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_ref *ref;
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct pending_extent_op *op;
+ struct extent_buffer *leaf;
+ int ret = 0;
+ struct list_head *cur = update_list->next;
+ u64 ref_objectid;
+ u64 ref_root = extent_root->root_key.objectid;
+
+ op = list_entry(cur, struct pending_extent_op, list);
+
+search:
+ key.objectid = op->bytenr;
+ key.type = BTRFS_EXTENT_REF_KEY;
+ key.offset = op->orig_parent;
+
+ ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+
+loop:
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+
+ ref_objectid = btrfs_ref_objectid(leaf, ref);
+
+ if (btrfs_ref_root(leaf, ref) != ref_root ||
+ btrfs_ref_generation(leaf, ref) != op->orig_generation ||
+ (ref_objectid != op->level &&
+ ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
+ printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
+ "root %llu, owner %u\n",
+ (unsigned long long)op->bytenr,
+ (unsigned long long)op->orig_parent,
+ (unsigned long long)ref_root, op->level);
+ btrfs_print_leaf(extent_root, leaf);
+ BUG();
+ }
+
+ key.objectid = op->bytenr;
+ key.offset = op->parent;
+ key.type = BTRFS_EXTENT_REF_KEY;
+ ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
+ BUG_ON(ret);
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+ btrfs_set_ref_generation(leaf, ref, op->generation);
+
+ cur = cur->next;
+
+ list_del_init(&op->list);
+ unlock_extent(&info->extent_ins, op->bytenr,
+ op->bytenr + op->num_bytes - 1, GFP_NOFS);
+ kfree(op);
+
+ if (cur == update_list) {
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(extent_root, path);
+ goto out;
+ }
+
+ op = list_entry(cur, struct pending_extent_op, list);
+
+ path->slots[0]++;
+ while (path->slots[0] < btrfs_header_nritems(leaf)) {
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid == op->bytenr &&
+ key.type == BTRFS_EXTENT_REF_KEY)
+ goto loop;
+ path->slots[0]++;
+ }
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(extent_root, path);
+ goto search;
+
+out:
+ return 0;
+}
+
+static noinline int insert_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_path *path,
+ struct list_head *insert_list, int nr)
+{
+ struct btrfs_key *keys;
+ u32 *data_size;
+ struct pending_extent_op *op;
+ struct extent_buffer *leaf;
+ struct list_head *cur = insert_list->next;
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ u64 ref_root = extent_root->root_key.objectid;
+ int i = 0, last = 0, ret;
+ int total = nr * 2;
+
+ if (!nr)
+ return 0;
+
+ keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!keys)
+ return -ENOMEM;
+
+ data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
+ if (!data_size) {
+ kfree(keys);
+ return -ENOMEM;
+ }
+
+ list_for_each_entry(op, insert_list, list) {
+ keys[i].objectid = op->bytenr;
+ keys[i].offset = op->num_bytes;
+ keys[i].type = BTRFS_EXTENT_ITEM_KEY;
+ data_size[i] = sizeof(struct btrfs_extent_item);
+ i++;
+
+ keys[i].objectid = op->bytenr;
+ keys[i].offset = op->parent;
+ keys[i].type = BTRFS_EXTENT_REF_KEY;
+ data_size[i] = sizeof(struct btrfs_extent_ref);
+ i++;
+ }
+
+ op = list_entry(cur, struct pending_extent_op, list);
+ i = 0;
+ while (i < total) {
+ int c;
+ ret = btrfs_insert_some_items(trans, extent_root, path,
+ keys+i, data_size+i, total-i);
+ BUG_ON(ret < 0);
+
+ if (last && ret > 1)
+ BUG();
+
+ leaf = path->nodes[0];
+ for (c = 0; c < ret; c++) {
+ int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
+
+ /*
+ * if the first item we inserted was a backref, then
+ * the EXTENT_ITEM will be the odd c's, else it will
+ * be the even c's
+ */
+ if ((ref_first && (c % 2)) ||
+ (!ref_first && !(c % 2))) {
+ struct btrfs_extent_item *itm;
+
+ itm = btrfs_item_ptr(leaf, path->slots[0] + c,
+ struct btrfs_extent_item);
+ btrfs_set_extent_refs(path->nodes[0], itm, 1);
+ op->del++;
+ } else {
+ struct btrfs_extent_ref *ref;
+
+ ref = btrfs_item_ptr(leaf, path->slots[0] + c,
+ struct btrfs_extent_ref);
+ btrfs_set_ref_root(leaf, ref, ref_root);
+ btrfs_set_ref_generation(leaf, ref,
+ op->generation);
+ btrfs_set_ref_objectid(leaf, ref, op->level);
+ btrfs_set_ref_num_refs(leaf, ref, 1);
+ op->del++;
+ }
+
+ /*
+ * using del to see when its ok to free up the
+ * pending_extent_op. In the case where we insert the
+ * last item on the list in order to help do batching
+ * we need to not free the extent op until we actually
+ * insert the extent_item
+ */
+ if (op->del == 2) {
+ unlock_extent(&info->extent_ins, op->bytenr,
+ op->bytenr + op->num_bytes - 1,
+ GFP_NOFS);
+ cur = cur->next;
+ list_del_init(&op->list);
+ kfree(op);
+ if (cur != insert_list)
+ op = list_entry(cur,
+ struct pending_extent_op,
+ list);
+ }
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(extent_root, path);
+
+ /*
+ * Ok backref's and items usually go right next to eachother,
+ * but if we could only insert 1 item that means that we
+ * inserted on the end of a leaf, and we have no idea what may
+ * be on the next leaf so we just play it safe. In order to
+ * try and help this case we insert the last thing on our
+ * insert list so hopefully it will end up being the last
+ * thing on the leaf and everything else will be before it,
+ * which will let us insert a whole bunch of items at the same
+ * time.
+ */
+ if (ret == 1 && !last && (i + ret < total)) {
+ /*
+ * last: where we will pick up the next time around
+ * i: our current key to insert, will be total - 1
+ * cur: the current op we are screwing with
+ * op: duh
+ */
+ last = i + ret;
+ i = total - 1;
+ cur = insert_list->prev;
+ op = list_entry(cur, struct pending_extent_op, list);
+ } else if (last) {
+ /*
+ * ok we successfully inserted the last item on the
+ * list, lets reset everything
+ *
+ * i: our current key to insert, so where we left off
+ * last time
+ * last: done with this
+ * cur: the op we are messing with
+ * op: duh
+ * total: since we inserted the last key, we need to
+ * decrement total so we dont overflow
+ */
+ i = last;
+ last = 0;
+ total--;
+ if (i < total) {
+ cur = insert_list->next;
+ op = list_entry(cur, struct pending_extent_op,
+ list);
+ }
+ } else {
+ i += ret;
+ }
+
+ cond_resched();
+ }
+ ret = 0;
+ kfree(keys);
+ kfree(data_size);
+ return ret;
+}
+
+static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 parent,
+ u64 ref_root, u64 ref_generation,
+ u64 owner_objectid)
+{
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_ref *ref;
+ u32 num_refs;
+ int ret;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_REF_KEY;
+ key.offset = parent;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref);
+ btrfs_set_ref_root(leaf, ref, ref_root);
+ btrfs_set_ref_generation(leaf, ref, ref_generation);
+ btrfs_set_ref_objectid(leaf, ref, owner_objectid);
+ btrfs_set_ref_num_refs(leaf, ref, 1);
+ } else if (ret == -EEXIST) {
+ u64 existing_owner;
+ BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref);
+ if (btrfs_ref_root(leaf, ref) != ref_root ||
+ btrfs_ref_generation(leaf, ref) != ref_generation) {
+ ret = -EIO;
+ WARN_ON(1);
+ goto out;
+ }
+
+ num_refs = btrfs_ref_num_refs(leaf, ref);
+ BUG_ON(num_refs == 0);
+ btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+
+ existing_owner = btrfs_ref_objectid(leaf, ref);
+ if (existing_owner != owner_objectid &&
+ existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
+ btrfs_set_ref_objectid(leaf, ref,
+ BTRFS_MULTIPLE_OBJECTIDS);
+ }
+ ret = 0;
+ } else {
+ goto out;
+ }
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+out:
+ btrfs_release_path(root, path);
+ return ret;
+}
+
+static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_extent_ref *ref;
+ u32 num_refs;
+ int ret = 0;
+
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+ num_refs = btrfs_ref_num_refs(leaf, ref);
+ BUG_ON(num_refs == 0);
+ num_refs -= 1;
+ if (num_refs == 0) {
+ ret = btrfs_del_item(trans, root, path);
+ } else {
+ btrfs_set_ref_num_refs(leaf, ref, num_refs);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ btrfs_release_path(root, path);
+ return ret;
+}
+
+#ifdef BIO_RW_DISCARD
+static void btrfs_issue_discard(struct block_device *bdev,
+ u64 start, u64 len)
+{
+ blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+}
+#endif
+
+static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+#ifdef BIO_RW_DISCARD
+ int ret;
+ u64 map_length = num_bytes;
+ struct btrfs_multi_bio *multi = NULL;
+
+ /* Tell the block device(s) that the sectors can be discarded */
+ ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
+ bytenr, &map_length, &multi, 0);
+ if (!ret) {
+ struct btrfs_bio_stripe *stripe = multi->stripes;
+ int i;
+
+ if (map_length > num_bytes)
+ map_length = num_bytes;
+
+ for (i = 0; i < multi->num_stripes; i++, stripe++) {
+ btrfs_issue_discard(stripe->dev->bdev,
+ stripe->physical,
+ map_length);
+ }
+ kfree(multi);
+ }
+
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+static noinline int free_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct list_head *del_list)
+{
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_path *path;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ struct list_head *cur;
+ struct pending_extent_op *op;
+ struct btrfs_extent_item *ei;
+ int ret, num_to_del, extent_slot = 0, found_extent = 0;
+ u32 refs;
+ u64 bytes_freed = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 1;
+
+search:
+ /* search for the backref for the current ref we want to delete */
+ cur = del_list->next;
+ op = list_entry(cur, struct pending_extent_op, list);
+ ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
+ op->orig_parent,
+ extent_root->root_key.objectid,
+ op->orig_generation, op->level, 1);
+ if (ret) {
+ printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
+ "root %llu gen %llu owner %u\n",
+ (unsigned long long)op->bytenr,
+ (unsigned long long)extent_root->root_key.objectid,
+ (unsigned long long)op->orig_generation, op->level);
+ btrfs_print_leaf(extent_root, path->nodes[0]);
+ WARN_ON(1);
+ goto out;
+ }
+
+ extent_slot = path->slots[0];
+ num_to_del = 1;
+ found_extent = 0;
+
+ /*
+ * if we aren't the first item on the leaf we can move back one and see
+ * if our ref is right next to our extent item
+ */
+ if (likely(extent_slot)) {
+ extent_slot--;
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ extent_slot);
+ if (found_key.objectid == op->bytenr &&
+ found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+ found_key.offset == op->num_bytes) {
+ num_to_del++;
+ found_extent = 1;
+ }
+ }
+
+ /*
+ * if we didn't find the extent we need to delete the backref and then
+ * search for the extent item key so we can update its ref count
+ */
+ if (!found_extent) {
+ key.objectid = op->bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = op->num_bytes;
+
+ ret = remove_extent_backref(trans, extent_root, path);
+ BUG_ON(ret);
+ btrfs_release_path(extent_root, path);
+ ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
+ BUG_ON(ret);
+ extent_slot = path->slots[0];
+ }
+
+ /* this is where we update the ref count for the extent */
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
+ refs = btrfs_extent_refs(leaf, ei);
+ BUG_ON(refs == 0);
+ refs--;
+ btrfs_set_extent_refs(leaf, ei, refs);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ /*
+ * This extent needs deleting. The reason cur_slot is extent_slot +
+ * num_to_del is because extent_slot points to the slot where the extent
+ * is, and if the backref was not right next to the extent we will be
+ * deleting at least 1 item, and will want to start searching at the
+ * slot directly next to extent_slot. However if we did find the
+ * backref next to the extent item them we will be deleting at least 2
+ * items and will want to start searching directly after the ref slot
+ */
+ if (!refs) {
+ struct list_head *pos, *n, *end;
+ int cur_slot = extent_slot+num_to_del;
+ u64 super_used;
+ u64 root_used;
+
+ path->slots[0] = extent_slot;
+ bytes_freed = op->num_bytes;
+
+ mutex_lock(&info->pinned_mutex);
+ ret = pin_down_bytes(trans, extent_root, op->bytenr,
+ op->num_bytes, op->level >=
+ BTRFS_FIRST_FREE_OBJECTID);
+ mutex_unlock(&info->pinned_mutex);
+ BUG_ON(ret < 0);
+ op->del = ret;
+
+ /*
+ * we need to see if we can delete multiple things at once, so
+ * start looping through the list of extents we are wanting to
+ * delete and see if their extent/backref's are right next to
+ * eachother and the extents only have 1 ref
+ */
+ for (pos = cur->next; pos != del_list; pos = pos->next) {
+ struct pending_extent_op *tmp;
+
+ tmp = list_entry(pos, struct pending_extent_op, list);
+
+ /* we only want to delete extent+ref at this stage */
+ if (cur_slot >= btrfs_header_nritems(leaf) - 1)
+ break;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
+ if (found_key.objectid != tmp->bytenr ||
+ found_key.type != BTRFS_EXTENT_ITEM_KEY ||
+ found_key.offset != tmp->num_bytes)
+ break;
+
+ /* check to make sure this extent only has one ref */
+ ei = btrfs_item_ptr(leaf, cur_slot,
+ struct btrfs_extent_item);
+ if (btrfs_extent_refs(leaf, ei) != 1)
+ break;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
+ if (found_key.objectid != tmp->bytenr ||
+ found_key.type != BTRFS_EXTENT_REF_KEY ||
+ found_key.offset != tmp->orig_parent)
+ break;
+
+ /*
+ * the ref is right next to the extent, we can set the
+ * ref count to 0 since we will delete them both now
+ */
+ btrfs_set_extent_refs(leaf, ei, 0);
+
+ /* pin down the bytes for this extent */
+ mutex_lock(&info->pinned_mutex);
+ ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
+ tmp->num_bytes, tmp->level >=
+ BTRFS_FIRST_FREE_OBJECTID);
+ mutex_unlock(&info->pinned_mutex);
+ BUG_ON(ret < 0);
+
+ /*
+ * use the del field to tell if we need to go ahead and
+ * free up the extent when we delete the item or not.
+ */
+ tmp->del = ret;
+ bytes_freed += tmp->num_bytes;
+
+ num_to_del += 2;
+ cur_slot += 2;
+ }
+ end = pos;
+
+ /* update the free space counters */
+ spin_lock(&info->delalloc_lock);
+ super_used = btrfs_super_bytes_used(&info->super_copy);
+ btrfs_set_super_bytes_used(&info->super_copy,
+ super_used - bytes_freed);
+
+ root_used = btrfs_root_used(&extent_root->root_item);
+ btrfs_set_root_used(&extent_root->root_item,
+ root_used - bytes_freed);
+ spin_unlock(&info->delalloc_lock);
+
+ /* delete the items */
+ ret = btrfs_del_items(trans, extent_root, path,
+ path->slots[0], num_to_del);
+ BUG_ON(ret);
+
+ /*
+ * loop through the extents we deleted and do the cleanup work
+ * on them
+ */
+ for (pos = cur, n = pos->next; pos != end;
+ pos = n, n = pos->next) {
+ struct pending_extent_op *tmp;
+ tmp = list_entry(pos, struct pending_extent_op, list);
+
+ /*
+ * remember tmp->del tells us wether or not we pinned
+ * down the extent
+ */
+ ret = update_block_group(trans, extent_root,
+ tmp->bytenr, tmp->num_bytes, 0,
+ tmp->del);
+ BUG_ON(ret);
+
+ list_del_init(&tmp->list);
+ unlock_extent(&info->extent_ins, tmp->bytenr,
+ tmp->bytenr + tmp->num_bytes - 1,
+ GFP_NOFS);
+ kfree(tmp);
+ }
+ } else if (refs && found_extent) {
+ /*
+ * the ref and extent were right next to eachother, but the
+ * extent still has a ref, so just free the backref and keep
+ * going
+ */
+ ret = remove_extent_backref(trans, extent_root, path);
+ BUG_ON(ret);
+
+ list_del_init(&op->list);
+ unlock_extent(&info->extent_ins, op->bytenr,
+ op->bytenr + op->num_bytes - 1, GFP_NOFS);
+ kfree(op);
+ } else {
+ /*
+ * the extent has multiple refs and the backref we were looking
+ * for was not right next to it, so just unlock and go next,
+ * we're good to go
+ */
+ list_del_init(&op->list);
+ unlock_extent(&info->extent_ins, op->bytenr,
+ op->bytenr + op->num_bytes - 1, GFP_NOFS);
+ kfree(op);
+ }
+
+ btrfs_release_path(extent_root, path);
+ if (!list_empty(del_list))
+ goto search;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 orig_parent, u64 parent,
+ u64 orig_root, u64 ref_root,
+ u64 orig_generation, u64 ref_generation,
+ u64 owner_objectid)
+{
+ int ret;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct btrfs_path *path;
+
+ if (root == root->fs_info->extent_root) {
+ struct pending_extent_op *extent_op;
+ u64 num_bytes;
+
+ BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
+ num_bytes = btrfs_level_size(root, (int)owner_objectid);
+ mutex_lock(&root->fs_info->extent_ins_mutex);
+ if (test_range_bit(&root->fs_info->extent_ins, bytenr,
+ bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
+ u64 priv;
+ ret = get_state_private(&root->fs_info->extent_ins,
+ bytenr, &priv);
+ BUG_ON(ret);
+ extent_op = (struct pending_extent_op *)
+ (unsigned long)priv;
+ BUG_ON(extent_op->parent != orig_parent);
+ BUG_ON(extent_op->generation != orig_generation);
+
+ extent_op->parent = parent;
+ extent_op->generation = ref_generation;
+ } else {
+ extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ BUG_ON(!extent_op);
+
+ extent_op->type = PENDING_BACKREF_UPDATE;
+ extent_op->bytenr = bytenr;
+ extent_op->num_bytes = num_bytes;
+ extent_op->parent = parent;
+ extent_op->orig_parent = orig_parent;
+ extent_op->generation = ref_generation;
+ extent_op->orig_generation = orig_generation;
+ extent_op->level = (int)owner_objectid;
+ INIT_LIST_HEAD(&extent_op->list);
+ extent_op->del = 0;
+
+ set_extent_bits(&root->fs_info->extent_ins,
+ bytenr, bytenr + num_bytes - 1,
+ EXTENT_WRITEBACK, GFP_NOFS);
+ set_state_private(&root->fs_info->extent_ins,
+ bytenr, (unsigned long)extent_op);
+ }
+ mutex_unlock(&root->fs_info->extent_ins_mutex);
+ return 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ ret = lookup_extent_backref(trans, extent_root, path,
+ bytenr, orig_parent, orig_root,
+ orig_generation, owner_objectid, 1);
+ if (ret)
+ goto out;
+ ret = remove_extent_backref(trans, extent_root, path);
+ if (ret)
+ goto out;
+ ret = insert_extent_backref(trans, extent_root, path, bytenr,
+ parent, ref_root, ref_generation,
+ owner_objectid);
+ BUG_ON(ret);
+ finish_current_insert(trans, extent_root, 0);
+ del_pending_extents(trans, extent_root, 0);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 orig_parent, u64 parent,
+ u64 ref_root, u64 ref_generation,
+ u64 owner_objectid)
+{
+ int ret;
+ if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
+ owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+ return 0;
+ ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
+ parent, ref_root, ref_root,
+ ref_generation, ref_generation,
+ owner_objectid);
+ return ret;
+}
+
+static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 orig_parent, u64 parent,
+ u64 orig_root, u64 ref_root,
+ u64 orig_generation, u64 ref_generation,
+ u64 owner_objectid)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_extent_item *item;
+ u32 refs;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 1;
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
+ 0, 1);
+ if (ret < 0)
+ return ret;
+ BUG_ON(ret == 0 || path->slots[0] == 0);
+
+ path->slots[0]--;
+ l = path->nodes[0];
+
+ btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+ if (key.objectid != bytenr) {
+ btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
+ printk(KERN_ERR "btrfs wanted %llu found %llu\n",
+ (unsigned long long)bytenr,
+ (unsigned long long)key.objectid);
+ BUG();
+ }
+ BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
+
+ item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+ refs = btrfs_extent_refs(l, item);
+ btrfs_set_extent_refs(l, item, refs + 1);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ btrfs_release_path(root->fs_info->extent_root, path);
+
+ path->reada = 1;
+ ret = insert_extent_backref(trans, root->fs_info->extent_root,
+ path, bytenr, parent,
+ ref_root, ref_generation,
+ owner_objectid);
+ BUG_ON(ret);
+ finish_current_insert(trans, root->fs_info->extent_root, 0);
+ del_pending_extents(trans, root->fs_info->extent_root, 0);
+
+ btrfs_free_path(path);
+ return 0;
+}
+
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 ref_root, u64 ref_generation,
+ u64 owner_objectid)
+{
+ int ret;
+ if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
+ owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+ return 0;
+ ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
+ 0, ref_root, 0, ref_generation,
+ owner_objectid);
+ return ret;
+}
+
+int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ finish_current_insert(trans, root->fs_info->extent_root, 1);
+ del_pending_extents(trans, root->fs_info->extent_root, 1);
+ return 0;
+}
+
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u32 *refs)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct btrfs_key key;
+ struct extent_buffer *l;
+ struct btrfs_extent_item *item;
+
+ WARN_ON(num_bytes < root->sectorsize);
+ path = btrfs_alloc_path();
+ path->reada = 1;
+ key.objectid = bytenr;
+ key.offset = num_bytes;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
+ 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret != 0) {
+ btrfs_print_leaf(root, path->nodes[0]);
+ printk(KERN_INFO "btrfs failed to find block number %llu\n",
+ (unsigned long long)bytenr);
+ BUG();
+ }
+ l = path->nodes[0];
+ item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+ *refs = btrfs_extent_refs(l, item);
+out:
+ btrfs_free_path(path);
+ return 0;
+}
+
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid, u64 bytenr)
+{
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_ref *ref_item;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 ref_root;
+ u64 last_snapshot;
+ u32 nritems;
+ int ret;
+
+ key.objectid = bytenr;
+ key.offset = (u64)-1;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
+ path = btrfs_alloc_path();
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ ret = -ENOENT;
+ if (path->slots[0] == 0)
+ goto out;
+
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid != bytenr ||
+ found_key.type != BTRFS_EXTENT_ITEM_KEY)
+ goto out;
+
+ last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+ while (1) {
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret == 0)
+ continue;
+ break;
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != bytenr)
+ break;
+
+ if (found_key.type != BTRFS_EXTENT_REF_KEY) {
+ path->slots[0]++;
+ continue;
+ }
+
+ ref_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref);
+ ref_root = btrfs_ref_root(leaf, ref_item);
+ if ((ref_root != root->root_key.objectid &&
+ ref_root != BTRFS_TREE_LOG_OBJECTID) ||
+ objectid != btrfs_ref_objectid(leaf, ref_item)) {
+ ret = 1;
+ goto out;
+ }
+ if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
+ ret = 1;
+ goto out;
+ }
+
+ path->slots[0]++;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *buf, u32 nr_extents)
+{
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ u64 root_gen;
+ u32 nritems;
+ int i;
+ int level;
+ int ret = 0;
+ int shared = 0;
+
+ if (!root->ref_cows)
+ return 0;
+
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ shared = 0;
+ root_gen = root->root_key.offset;
+ } else {
+ shared = 1;
+ root_gen = trans->transid - 1;
+ }
+
+ level = btrfs_header_level(buf);
+ nritems = btrfs_header_nritems(buf);
+
+ if (level == 0) {
+ struct btrfs_leaf_ref *ref;
+ struct btrfs_extent_info *info;
+
+ ref = btrfs_alloc_leaf_ref(root, nr_extents);
+ if (!ref) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ref->root_gen = root_gen;
+ ref->bytenr = buf->start;
+ ref->owner = btrfs_header_owner(buf);
+ ref->generation = btrfs_header_generation(buf);
+ ref->nritems = nr_extents;
+ info = ref->extents;
+
+ for (i = 0; nr_extents > 0 && i < nritems; i++) {
+ u64 disk_bytenr;
+ btrfs_item_key_to_cpu(buf, &key, i);
+ if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(buf, i,
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(buf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (disk_bytenr == 0)
+ continue;
+
+ info->bytenr = disk_bytenr;
+ info->num_bytes =
+ btrfs_file_extent_disk_num_bytes(buf, fi);
+ info->objectid = key.objectid;
+ info->offset = key.offset;
+ info++;
+ }
+
+ ret = btrfs_add_leaf_ref(root, ref, shared);
+ if (ret == -EEXIST && shared) {
+ struct btrfs_leaf_ref *old;
+ old = btrfs_lookup_leaf_ref(root, ref->bytenr);
+ BUG_ON(!old);
+ btrfs_remove_leaf_ref(root, old);
+ btrfs_free_leaf_ref(root, old);
+ ret = btrfs_add_leaf_ref(root, ref, shared);
+ }
+ WARN_ON(ret);
+ btrfs_free_leaf_ref(root, ref);
+ }
+out:
+ return ret;
+}
+
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct extent_buffer *orig_buf, struct extent_buffer *buf,
+ u32 *nr_extents)
+{
+ u64 bytenr;
+ u64 ref_root;
+ u64 orig_root;
+ u64 ref_generation;
+ u64 orig_generation;
+ u32 nritems;
+ u32 nr_file_extents = 0;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ int i;
+ int level;
+ int ret = 0;
+ int faili = 0;
+ int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
+ u64, u64, u64, u64, u64, u64, u64, u64);
+
+ ref_root = btrfs_header_owner(buf);
+ ref_generation = btrfs_header_generation(buf);
+ orig_root = btrfs_header_owner(orig_buf);
+ orig_generation = btrfs_header_generation(orig_buf);
+
+ nritems = btrfs_header_nritems(buf);
+ level = btrfs_header_level(buf);
+
+ if (root->ref_cows) {
+ process_func = __btrfs_inc_extent_ref;
+ } else {
+ if (level == 0 &&
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ goto out;
+ if (level != 0 &&
+ root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
+ goto out;
+ process_func = __btrfs_update_extent_ref;
+ }
+
+ for (i = 0; i < nritems; i++) {
+ cond_resched();
+ if (level == 0) {
+ btrfs_item_key_to_cpu(buf, &key, i);
+ if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(buf, i,
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(buf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (bytenr == 0)
+ continue;
+
+ nr_file_extents++;
+
+ ret = process_func(trans, root, bytenr,
+ orig_buf->start, buf->start,
+ orig_root, ref_root,
+ orig_generation, ref_generation,
+ key.objectid);
+
+ if (ret) {
+ faili = i;
+ WARN_ON(1);
+ goto fail;
+ }
+ } else {
+ bytenr = btrfs_node_blockptr(buf, i);
+ ret = process_func(trans, root, bytenr,
+ orig_buf->start, buf->start,
+ orig_root, ref_root,
+ orig_generation, ref_generation,
+ level - 1);
+ if (ret) {
+ faili = i;
+ WARN_ON(1);
+ goto fail;
+ }
+ }
+ }
+out:
+ if (nr_extents) {
+ if (level == 0)
+ *nr_extents = nr_file_extents;
+ else
+ *nr_extents = nritems;
+ }
+ return 0;
+fail:
+ WARN_ON(1);
+ return ret;
+}
+
+int btrfs_update_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *orig_buf,
+ struct extent_buffer *buf, int start_slot, int nr)
+
+{
+ u64 bytenr;
+ u64 ref_root;
+ u64 orig_root;
+ u64 ref_generation;
+ u64 orig_generation;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ int i;
+ int ret;
+ int slot;
+ int level;
+
+ BUG_ON(start_slot < 0);
+ BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
+
+ ref_root = btrfs_header_owner(buf);
+ ref_generation = btrfs_header_generation(buf);
+ orig_root = btrfs_header_owner(orig_buf);
+ orig_generation = btrfs_header_generation(orig_buf);
+ level = btrfs_header_level(buf);
+
+ if (!root->ref_cows) {
+ if (level == 0 &&
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ return 0;
+ if (level != 0 &&
+ root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
+ return 0;
+ }
+
+ for (i = 0, slot = start_slot; i < nr; i++, slot++) {
+ cond_resched();
+ if (level == 0) {
+ btrfs_item_key_to_cpu(buf, &key, slot);
+ if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(buf, slot,
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(buf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (bytenr == 0)
+ continue;
+ ret = __btrfs_update_extent_ref(trans, root, bytenr,
+ orig_buf->start, buf->start,
+ orig_root, ref_root,
+ orig_generation, ref_generation,
+ key.objectid);
+ if (ret)
+ goto fail;
+ } else {
+ bytenr = btrfs_node_blockptr(buf, slot);
+ ret = __btrfs_update_extent_ref(trans, root, bytenr,
+ orig_buf->start, buf->start,
+ orig_root, ref_root,
+ orig_generation, ref_generation,
+ level - 1);
+ if (ret)
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ WARN_ON(1);
+ return -1;
+}
+
+static int write_one_cache_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_block_group_cache *cache)
+{
+ int ret;
+ int pending_ret;
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ unsigned long bi;
+ struct extent_buffer *leaf;
+
+ ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
+ if (ret < 0)
+ goto fail;
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(extent_root, path);
+fail:
+ finish_current_insert(trans, extent_root, 0);
+ pending_ret = del_pending_extents(trans, extent_root, 0);
+ if (ret)
+ return ret;
+ if (pending_ret)
+ return pending_ret;
+ return 0;
+
+}
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *cache, *entry;
+ struct rb_node *n;
+ int err = 0;
+ int werr = 0;
+ struct btrfs_path *path;
+ u64 last = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ cache = NULL;
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ for (n = rb_first(&root->fs_info->block_group_cache_tree);
+ n; n = rb_next(n)) {
+ entry = rb_entry(n, struct btrfs_block_group_cache,
+ cache_node);
+ if (entry->dirty) {
+ cache = entry;
+ break;
+ }
+ }
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+
+ if (!cache)
+ break;
+
+ cache->dirty = 0;
+ last += cache->key.offset;
+
+ err = write_one_cache_group(trans, root,
+ path, cache);
+ /*
+ * if we fail to write the cache group, we want
+ * to keep it marked dirty in hopes that a later
+ * write will work
+ */
+ if (err) {
+ werr = err;
+ continue;
+ }
+ }
+ btrfs_free_path(path);
+ return werr;
+}
+
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
+{
+ struct btrfs_block_group_cache *block_group;
+ int readonly = 0;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+ if (!block_group || block_group->ro)
+ readonly = 1;
+ if (block_group)
+ put_block_group(block_group);
+ return readonly;
+}
+
+static int update_space_info(struct btrfs_fs_info *info, u64 flags,
+ u64 total_bytes, u64 bytes_used,
+ struct btrfs_space_info **space_info)
+{
+ struct btrfs_space_info *found;
+
+ found = __find_space_info(info, flags);
+ if (found) {
+ spin_lock(&found->lock);
+ found->total_bytes += total_bytes;
+ found->bytes_used += bytes_used;
+ found->full = 0;
+ spin_unlock(&found->lock);
+ *space_info = found;
+ return 0;
+ }
+ found = kzalloc(sizeof(*found), GFP_NOFS);
+ if (!found)
+ return -ENOMEM;
+
+ list_add(&found->list, &info->space_info);
+ INIT_LIST_HEAD(&found->block_groups);
+ init_rwsem(&found->groups_sem);
+ spin_lock_init(&found->lock);
+ found->flags = flags;
+ found->total_bytes = total_bytes;
+ found->bytes_used = bytes_used;
+ found->bytes_pinned = 0;
+ found->bytes_reserved = 0;
+ found->bytes_readonly = 0;
+ found->full = 0;
+ found->force_alloc = 0;
+ *space_info = found;
+ return 0;
+}
+
+static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP);
+ if (extra_flags) {
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits |= extra_flags;
+ }
+}
+
+static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
+{
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ if (!cache->ro) {
+ cache->space_info->bytes_readonly += cache->key.offset -
+ btrfs_block_group_used(&cache->item);
+ cache->ro = 1;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+}
+
+u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+{
+ u64 num_devices = root->fs_info->fs_devices->rw_devices;
+
+ if (num_devices == 1)
+ flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+ if (num_devices < 4)
+ flags &= ~BTRFS_BLOCK_GROUP_RAID10;
+
+ if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
+ (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))) {
+ flags &= ~BTRFS_BLOCK_GROUP_DUP;
+ }
+
+ if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
+ (flags & BTRFS_BLOCK_GROUP_RAID10)) {
+ flags &= ~BTRFS_BLOCK_GROUP_RAID1;
+ }
+
+ if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
+ ((flags & BTRFS_BLOCK_GROUP_RAID1) |
+ (flags & BTRFS_BLOCK_GROUP_RAID10) |
+ (flags & BTRFS_BLOCK_GROUP_DUP)))
+ flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+ return flags;
+}
+
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 alloc_bytes,
+ u64 flags, int force)
+{
+ struct btrfs_space_info *space_info;
+ u64 thresh;
+ int ret = 0;
+
+ mutex_lock(&extent_root->fs_info->chunk_mutex);
+
+ flags = btrfs_reduce_alloc_profile(extent_root, flags);
+
+ space_info = __find_space_info(extent_root->fs_info, flags);
+ if (!space_info) {
+ ret = update_space_info(extent_root->fs_info, flags,
+ 0, 0, &space_info);
+ BUG_ON(ret);
+ }
+ BUG_ON(!space_info);
+
+ spin_lock(&space_info->lock);
+ if (space_info->force_alloc) {
+ force = 1;
+ space_info->force_alloc = 0;
+ }
+ if (space_info->full) {
+ spin_unlock(&space_info->lock);
+ goto out;
+ }
+
+ thresh = space_info->total_bytes - space_info->bytes_readonly;
+ thresh = div_factor(thresh, 6);
+ if (!force &&
+ (space_info->bytes_used + space_info->bytes_pinned +
+ space_info->bytes_reserved + alloc_bytes) < thresh) {
+ spin_unlock(&space_info->lock);
+ goto out;
+ }
+ spin_unlock(&space_info->lock);
+
+ ret = btrfs_alloc_chunk(trans, extent_root, flags);
+ if (ret)
+ space_info->full = 1;
+out:
+ mutex_unlock(&extent_root->fs_info->chunk_mutex);
+ return ret;
+}
+
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int alloc,
+ int mark_free)
+{
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_fs_info *info = root->fs_info;
+ u64 total = num_bytes;
+ u64 old_val;
+ u64 byte_in_group;
+
+ while (total) {
+ cache = btrfs_lookup_block_group(info, bytenr);
+ if (!cache)
+ return -1;
+ byte_in_group = bytenr - cache->key.objectid;
+ WARN_ON(byte_in_group > cache->key.offset);
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->dirty = 1;
+ old_val = btrfs_block_group_used(&cache->item);
+ num_bytes = min(total, cache->key.offset - byte_in_group);
+ if (alloc) {
+ old_val += num_bytes;
+ cache->space_info->bytes_used += num_bytes;
+ if (cache->ro)
+ cache->space_info->bytes_readonly -= num_bytes;
+ btrfs_set_block_group_used(&cache->item, old_val);
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ } else {
+ old_val -= num_bytes;
+ cache->space_info->bytes_used -= num_bytes;
+ if (cache->ro)
+ cache->space_info->bytes_readonly += num_bytes;
+ btrfs_set_block_group_used(&cache->item, old_val);
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ if (mark_free) {
+ int ret;
+
+ ret = btrfs_discard_extent(root, bytenr,
+ num_bytes);
+ WARN_ON(ret);
+
+ ret = btrfs_add_free_space(cache, bytenr,
+ num_bytes);
+ WARN_ON(ret);
+ }
+ }
+ put_block_group(cache);
+ total -= num_bytes;
+ bytenr += num_bytes;
+ }
+ return 0;
+}
+
+static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 bytenr;
+
+ cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
+ if (!cache)
+ return 0;
+
+ bytenr = cache->key.objectid;
+ put_block_group(cache);
+
+ return bytenr;
+}
+
+int btrfs_update_pinned_extents(struct btrfs_root *root,
+ u64 bytenr, u64 num, int pin)
+{
+ u64 len;
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
+ if (pin) {
+ set_extent_dirty(&fs_info->pinned_extents,
+ bytenr, bytenr + num - 1, GFP_NOFS);
+ } else {
+ clear_extent_dirty(&fs_info->pinned_extents,
+ bytenr, bytenr + num - 1, GFP_NOFS);
+ }
+ while (num > 0) {
+ cache = btrfs_lookup_block_group(fs_info, bytenr);
+ BUG_ON(!cache);
+ len = min(num, cache->key.offset -
+ (bytenr - cache->key.objectid));
+ if (pin) {
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned += len;
+ cache->space_info->bytes_pinned += len;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ fs_info->total_pinned += len;
+ } else {
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned -= len;
+ cache->space_info->bytes_pinned -= len;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ fs_info->total_pinned -= len;
+ if (cache->cached)
+ btrfs_add_free_space(cache, bytenr, len);
+ }
+ put_block_group(cache);
+ bytenr += len;
+ num -= len;
+ }
+ return 0;
+}
+
+static int update_reserved_extents(struct btrfs_root *root,
+ u64 bytenr, u64 num, int reserve)
+{
+ u64 len;
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ while (num > 0) {
+ cache = btrfs_lookup_block_group(fs_info, bytenr);
+ BUG_ON(!cache);
+ len = min(num, cache->key.offset -
+ (bytenr - cache->key.objectid));
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve) {
+ cache->reserved += len;
+ cache->space_info->bytes_reserved += len;
+ } else {
+ cache->reserved -= len;
+ cache->space_info->bytes_reserved -= len;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ put_block_group(cache);
+ bytenr += len;
+ num -= len;
+ }
+ return 0;
+}
+
+int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
+{
+ u64 last = 0;
+ u64 start;
+ u64 end;
+ struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
+ int ret;
+
+ mutex_lock(&root->fs_info->pinned_mutex);
+ while (1) {
+ ret = find_first_extent_bit(pinned_extents, last,
+ &start, &end, EXTENT_DIRTY);
+ if (ret)
+ break;
+ set_extent_dirty(copy, start, end, GFP_NOFS);
+ last = end + 1;
+ }
+ mutex_unlock(&root->fs_info->pinned_mutex);
+ return 0;
+}
+
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_io_tree *unpin)
+{
+ u64 start;
+ u64 end;
+ int ret;
+
+ mutex_lock(&root->fs_info->pinned_mutex);
+ while (1) {
+ ret = find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ ret = btrfs_discard_extent(root, start, end + 1 - start);
+
+ btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
+ clear_extent_dirty(unpin, start, end, GFP_NOFS);
+
+ if (need_resched()) {
+ mutex_unlock(&root->fs_info->pinned_mutex);
+ cond_resched();
+ mutex_lock(&root->fs_info->pinned_mutex);
+ }
+ }
+ mutex_unlock(&root->fs_info->pinned_mutex);
+ return ret;
+}
+
+static int finish_current_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, int all)
+{
+ u64 start;
+ u64 end;
+ u64 priv;
+ u64 search = 0;
+ u64 skipped = 0;
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_path *path;
+ struct pending_extent_op *extent_op, *tmp;
+ struct list_head insert_list, update_list;
+ int ret;
+ int num_inserts = 0, max_inserts;
+
+ path = btrfs_alloc_path();
+ INIT_LIST_HEAD(&insert_list);
+ INIT_LIST_HEAD(&update_list);
+
+ max_inserts = extent_root->leafsize /
+ (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
+ sizeof(struct btrfs_extent_ref) +
+ sizeof(struct btrfs_extent_item));
+again:
+ mutex_lock(&info->extent_ins_mutex);
+ while (1) {
+ ret = find_first_extent_bit(&info->extent_ins, search, &start,
+ &end, EXTENT_WRITEBACK);
+ if (ret) {
+ if (skipped && all && !num_inserts) {
+ skipped = 0;
+ search = 0;
+ continue;
+ }
+ mutex_unlock(&info->extent_ins_mutex);
+ break;
+ }
+
+ ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
+ if (!ret) {
+ skipped = 1;
+ search = end + 1;
+ if (need_resched()) {
+ mutex_unlock(&info->extent_ins_mutex);
+ cond_resched();
+ mutex_lock(&info->extent_ins_mutex);
+ }
+ continue;
+ }
+
+ ret = get_state_private(&info->extent_ins, start, &priv);
+ BUG_ON(ret);
+ extent_op = (struct pending_extent_op *)(unsigned long) priv;
+
+ if (extent_op->type == PENDING_EXTENT_INSERT) {
+ num_inserts++;
+ list_add_tail(&extent_op->list, &insert_list);
+ search = end + 1;
+ if (num_inserts == max_inserts) {
+ mutex_unlock(&info->extent_ins_mutex);
+ break;
+ }
+ } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
+ list_add_tail(&extent_op->list, &update_list);
+ search = end + 1;
+ } else {
+ BUG();
+ }
+ }
+
+ /*
+ * process the update list, clear the writeback bit for it, and if
+ * somebody marked this thing for deletion then just unlock it and be
+ * done, the free_extents will handle it
+ */
+ mutex_lock(&info->extent_ins_mutex);
+ list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
+ clear_extent_bits(&info->extent_ins, extent_op->bytenr,
+ extent_op->bytenr + extent_op->num_bytes - 1,
+ EXTENT_WRITEBACK, GFP_NOFS);
+ if (extent_op->del) {
+ list_del_init(&extent_op->list);
+ unlock_extent(&info->extent_ins, extent_op->bytenr,
+ extent_op->bytenr + extent_op->num_bytes
+ - 1, GFP_NOFS);
+ kfree(extent_op);
+ }
+ }
+ mutex_unlock(&info->extent_ins_mutex);
+
+ /*
+ * still have things left on the update list, go ahead an update
+ * everything
+ */
+ if (!list_empty(&update_list)) {
+ ret = update_backrefs(trans, extent_root, path, &update_list);
+ BUG_ON(ret);
+ }
+
+ /*
+ * if no inserts need to be done, but we skipped some extents and we
+ * need to make sure everything is cleaned then reset everything and
+ * go back to the beginning
+ */
+ if (!num_inserts && all && skipped) {
+ search = 0;
+ skipped = 0;
+ INIT_LIST_HEAD(&update_list);
+ INIT_LIST_HEAD(&insert_list);
+ goto again;
+ } else if (!num_inserts) {
+ goto out;
+ }
+
+ /*
+ * process the insert extents list. Again if we are deleting this
+ * extent, then just unlock it, pin down the bytes if need be, and be
+ * done with it. Saves us from having to actually insert the extent
+ * into the tree and then subsequently come along and delete it
+ */
+ mutex_lock(&info->extent_ins_mutex);
+ list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
+ clear_extent_bits(&info->extent_ins, extent_op->bytenr,
+ extent_op->bytenr + extent_op->num_bytes - 1,
+ EXTENT_WRITEBACK, GFP_NOFS);
+ if (extent_op->del) {
+ u64 used;
+ list_del_init(&extent_op->list);
+ unlock_extent(&info->extent_ins, extent_op->bytenr,
+ extent_op->bytenr + extent_op->num_bytes
+ - 1, GFP_NOFS);
+
+ mutex_lock(&extent_root->fs_info->pinned_mutex);
+ ret = pin_down_bytes(trans, extent_root,
+ extent_op->bytenr,
+ extent_op->num_bytes, 0);
+ mutex_unlock(&extent_root->fs_info->pinned_mutex);
+
+ spin_lock(&info->delalloc_lock);
+ used = btrfs_super_bytes_used(&info->super_copy);
+ btrfs_set_super_bytes_used(&info->super_copy,
+ used - extent_op->num_bytes);
+ used = btrfs_root_used(&extent_root->root_item);
+ btrfs_set_root_used(&extent_root->root_item,
+ used - extent_op->num_bytes);
+ spin_unlock(&info->delalloc_lock);
+
+ ret = update_block_group(trans, extent_root,
+ extent_op->bytenr,
+ extent_op->num_bytes,
+ 0, ret > 0);
+ BUG_ON(ret);
+ kfree(extent_op);
+ num_inserts--;
+ }
+ }
+ mutex_unlock(&info->extent_ins_mutex);
+
+ ret = insert_extents(trans, extent_root, path, &insert_list,
+ num_inserts);
+ BUG_ON(ret);
+
+ /*
+ * if we broke out of the loop in order to insert stuff because we hit
+ * the maximum number of inserts at a time we can handle, then loop
+ * back and pick up where we left off
+ */
+ if (num_inserts == max_inserts) {
+ INIT_LIST_HEAD(&insert_list);
+ INIT_LIST_HEAD(&update_list);
+ num_inserts = 0;
+ goto again;
+ }
+
+ /*
+ * again, if we need to make absolutely sure there are no more pending
+ * extent operations left and we know that we skipped some, go back to
+ * the beginning and do it all again
+ */
+ if (all && skipped) {
+ INIT_LIST_HEAD(&insert_list);
+ INIT_LIST_HEAD(&update_list);
+ search = 0;
+ skipped = 0;
+ num_inserts = 0;
+ goto again;
+ }
+out:
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int is_data)
+{
+ int err = 0;
+ struct extent_buffer *buf;
+
+ if (is_data)
+ goto pinit;
+
+ buf = btrfs_find_tree_block(root, bytenr, num_bytes);
+ if (!buf)
+ goto pinit;
+
+ /* we can reuse a block if it hasn't been written
+ * and it is from this transaction. We can't
+ * reuse anything from the tree log root because
+ * it has tiny sub-transactions.
+ */
+ if (btrfs_buffer_uptodate(buf, 0) &&
+ btrfs_try_tree_lock(buf)) {
+ u64 header_owner = btrfs_header_owner(buf);
+ u64 header_transid = btrfs_header_generation(buf);
+ if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
+ header_owner != BTRFS_TREE_RELOC_OBJECTID &&
+ header_transid == trans->transid &&
+ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ clean_tree_block(NULL, root, buf);
+ btrfs_tree_unlock(buf);
+ free_extent_buffer(buf);
+ return 1;
+ }
+ btrfs_tree_unlock(buf);
+ }
+ free_extent_buffer(buf);
+pinit:
+ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+
+ BUG_ON(err < 0);
+ return 0;
+}
+
+/*
+ * remove an extent from the root, returns 0 on success
+ */
+static int __free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid, int pin, int mark_free)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_root *extent_root = info->extent_root;
+ struct extent_buffer *leaf;
+ int ret;
+ int extent_slot = 0;
+ int found_extent = 0;
+ int num_to_del = 1;
+ struct btrfs_extent_item *ei;
+ u32 refs;
+
+ key.objectid = bytenr;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+ key.offset = num_bytes;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 1;
+ ret = lookup_extent_backref(trans, extent_root, path,
+ bytenr, parent, root_objectid,
+ ref_generation, owner_objectid, 1);
+ if (ret == 0) {
+ struct btrfs_key found_key;
+ extent_slot = path->slots[0];
+ while (extent_slot > 0) {
+ extent_slot--;
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ extent_slot);
+ if (found_key.objectid != bytenr)
+ break;
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+ found_key.offset == num_bytes) {
+ found_extent = 1;
+ break;
+ }
+ if (path->slots[0] - extent_slot > 5)
+ break;
+ }
+ if (!found_extent) {
+ ret = remove_extent_backref(trans, extent_root, path);
+ BUG_ON(ret);
+ btrfs_release_path(extent_root, path);
+ ret = btrfs_search_slot(trans, extent_root,
+ &key, path, -1, 1);
+ if (ret) {
+ printk(KERN_ERR "umm, got %d back from search"
+ ", was looking for %llu\n", ret,
+ (unsigned long long)bytenr);
+ btrfs_print_leaf(extent_root, path->nodes[0]);
+ }
+ BUG_ON(ret);
+ extent_slot = path->slots[0];
+ }
+ } else {
+ btrfs_print_leaf(extent_root, path->nodes[0]);
+ WARN_ON(1);
+ printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
+ "root %llu gen %llu owner %llu\n",
+ (unsigned long long)bytenr,
+ (unsigned long long)root_objectid,
+ (unsigned long long)ref_generation,
+ (unsigned long long)owner_objectid);
+ }
+
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, extent_slot,
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(leaf, ei);
+ BUG_ON(refs == 0);
+ refs -= 1;
+ btrfs_set_extent_refs(leaf, ei, refs);
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+ struct btrfs_extent_ref *ref;
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref);
+ BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
+ /* if the back ref and the extent are next to each other
+ * they get deleted below in one shot
+ */
+ path->slots[0] = extent_slot;
+ num_to_del = 2;
+ } else if (found_extent) {
+ /* otherwise delete the extent back ref */
+ ret = remove_extent_backref(trans, extent_root, path);
+ BUG_ON(ret);
+ /* if refs are 0, we need to setup the path for deletion */
+ if (refs == 0) {
+ btrfs_release_path(extent_root, path);
+ ret = btrfs_search_slot(trans, extent_root, &key, path,
+ -1, 1);
+ BUG_ON(ret);
+ }
+ }
+
+ if (refs == 0) {
+ u64 super_used;
+ u64 root_used;
+
+ if (pin) {
+ mutex_lock(&root->fs_info->pinned_mutex);
+ ret = pin_down_bytes(trans, root, bytenr, num_bytes,
+ owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
+ mutex_unlock(&root->fs_info->pinned_mutex);
+ if (ret > 0)
+ mark_free = 1;
+ BUG_ON(ret < 0);
+ }
+ /* block accounting for super block */
+ spin_lock(&info->delalloc_lock);
+ super_used = btrfs_super_bytes_used(&info->super_copy);
+ btrfs_set_super_bytes_used(&info->super_copy,
+ super_used - num_bytes);
+
+ /* block accounting for root item */
+ root_used = btrfs_root_used(&root->root_item);
+ btrfs_set_root_used(&root->root_item,
+ root_used - num_bytes);
+ spin_unlock(&info->delalloc_lock);
+ ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
+ num_to_del);
+ BUG_ON(ret);
+ btrfs_release_path(extent_root, path);
+
+ if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
+ BUG_ON(ret);
+ }
+
+ ret = update_block_group(trans, root, bytenr, num_bytes, 0,
+ mark_free);
+ BUG_ON(ret);
+ }
+ btrfs_free_path(path);
+ finish_current_insert(trans, extent_root, 0);
+ return ret;
+}
+
+/*
+ * find all the blocks marked as pending in the radix tree and remove
+ * them from the extent map
+ */
+static int del_pending_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, int all)
+{
+ int ret;
+ int err = 0;
+ u64 start;
+ u64 end;
+ u64 priv;
+ u64 search = 0;
+ int nr = 0, skipped = 0;
+ struct extent_io_tree *pending_del;
+ struct extent_io_tree *extent_ins;
+ struct pending_extent_op *extent_op;
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct list_head delete_list;
+
+ INIT_LIST_HEAD(&delete_list);
+ extent_ins = &extent_root->fs_info->extent_ins;
+ pending_del = &extent_root->fs_info->pending_del;
+
+again:
+ mutex_lock(&info->extent_ins_mutex);
+ while (1) {
+ ret = find_first_extent_bit(pending_del, search, &start, &end,
+ EXTENT_WRITEBACK);
+ if (ret) {
+ if (all && skipped && !nr) {
+ search = 0;
+ continue;
+ }
+ mutex_unlock(&info->extent_ins_mutex);
+ break;
+ }
+
+ ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
+ if (!ret) {
+ search = end+1;
+ skipped = 1;
+
+ if (need_resched()) {
+ mutex_unlock(&info->extent_ins_mutex);
+ cond_resched();
+ mutex_lock(&info->extent_ins_mutex);
+ }
+
+ continue;
+ }
+ BUG_ON(ret < 0);
+
+ ret = get_state_private(pending_del, start, &priv);
+ BUG_ON(ret);
+ extent_op = (struct pending_extent_op *)(unsigned long)priv;
+
+ clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
+ GFP_NOFS);
+ if (!test_range_bit(extent_ins, start, end,
+ EXTENT_WRITEBACK, 0)) {
+ list_add_tail(&extent_op->list, &delete_list);
+ nr++;
+ } else {
+ kfree(extent_op);
+
+ ret = get_state_private(&info->extent_ins, start,
+ &priv);
+ BUG_ON(ret);
+ extent_op = (struct pending_extent_op *)
+ (unsigned long)priv;
+
+ clear_extent_bits(&info->extent_ins, start, end,
+ EXTENT_WRITEBACK, GFP_NOFS);
+
+ if (extent_op->type == PENDING_BACKREF_UPDATE) {
+ list_add_tail(&extent_op->list, &delete_list);
+ search = end + 1;
+ nr++;
+ continue;
+ }
+
+ mutex_lock(&extent_root->fs_info->pinned_mutex);
+ ret = pin_down_bytes(trans, extent_root, start,
+ end + 1 - start, 0);
+ mutex_unlock(&extent_root->fs_info->pinned_mutex);
+
+ ret = update_block_group(trans, extent_root, start,
+ end + 1 - start, 0, ret > 0);
+
+ unlock_extent(extent_ins, start, end, GFP_NOFS);
+ BUG_ON(ret);
+ kfree(extent_op);
+ }
+ if (ret)
+ err = ret;
+
+ search = end + 1;
+
+ if (need_resched()) {
+ mutex_unlock(&info->extent_ins_mutex);
+ cond_resched();
+ mutex_lock(&info->extent_ins_mutex);
+ }
+ }
+
+ if (nr) {
+ ret = free_extents(trans, extent_root, &delete_list);
+ BUG_ON(ret);
+ }
+
+ if (all && skipped) {
+ INIT_LIST_HEAD(&delete_list);
+ search = 0;
+ nr = 0;
+ goto again;
+ }
+
+ return err;
+}
+
+/*
+ * remove an extent from the root, returns 0 on success
+ */
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid, int pin)
+{
+ struct btrfs_root *extent_root = root->fs_info->extent_root;
+ int pending_ret;
+ int ret;
+
+ WARN_ON(num_bytes < root->sectorsize);
+ if (root == extent_root) {
+ struct pending_extent_op *extent_op = NULL;
+
+ mutex_lock(&root->fs_info->extent_ins_mutex);
+ if (test_range_bit(&root->fs_info->extent_ins, bytenr,
+ bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
+ u64 priv;
+ ret = get_state_private(&root->fs_info->extent_ins,
+ bytenr, &priv);
+ BUG_ON(ret);
+ extent_op = (struct pending_extent_op *)
+ (unsigned long)priv;
+
+ extent_op->del = 1;
+ if (extent_op->type == PENDING_EXTENT_INSERT) {
+ mutex_unlock(&root->fs_info->extent_ins_mutex);
+ return 0;
+ }
+ }
+
+ if (extent_op) {
+ ref_generation = extent_op->orig_generation;
+ parent = extent_op->orig_parent;
+ }
+
+ extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ BUG_ON(!extent_op);
+
+ extent_op->type = PENDING_EXTENT_DELETE;
+ extent_op->bytenr = bytenr;
+ extent_op->num_bytes = num_bytes;
+ extent_op->parent = parent;
+ extent_op->orig_parent = parent;
+ extent_op->generation = ref_generation;
+ extent_op->orig_generation = ref_generation;
+ extent_op->level = (int)owner_objectid;
+ INIT_LIST_HEAD(&extent_op->list);
+ extent_op->del = 0;
+
+ set_extent_bits(&root->fs_info->pending_del,
+ bytenr, bytenr + num_bytes - 1,
+ EXTENT_WRITEBACK, GFP_NOFS);
+ set_state_private(&root->fs_info->pending_del,
+ bytenr, (unsigned long)extent_op);
+ mutex_unlock(&root->fs_info->extent_ins_mutex);
+ return 0;
+ }
+ /* if metadata always pin */
+ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_block_group_cache *cache;
+
+ /* btrfs_free_reserved_extent */
+ cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ BUG_ON(!cache);
+ btrfs_add_free_space(cache, bytenr, num_bytes);
+ put_block_group(cache);
+ update_reserved_extents(root, bytenr, num_bytes, 0);
+ return 0;
+ }
+ pin = 1;
+ }
+
+ /* if data pin when any transaction has committed this */
+ if (ref_generation != trans->transid)
+ pin = 1;
+
+ ret = __free_extent(trans, root, bytenr, num_bytes, parent,
+ root_objectid, ref_generation,
+ owner_objectid, pin, pin == 0);
+
+ finish_current_insert(trans, root->fs_info->extent_root, 0);
+ pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
+ return ret ? ret : pending_ret;
+}
+
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid, int pin)
+{
+ int ret;
+
+ ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
+ root_objectid, ref_generation,
+ owner_objectid, pin);
+ return ret;
+}
+
+static u64 stripe_align(struct btrfs_root *root, u64 val)
+{
+ u64 mask = ((u64)root->stripesize - 1);
+ u64 ret = (val + mask) & ~mask;
+ return ret;
+}
+
+/*
+ * walks the btree of allocated extents and find a hole of a given size.
+ * The key ins is changed to record the hole:
+ * ins->objectid == block start
+ * ins->flags = BTRFS_EXTENT_ITEM_KEY
+ * ins->offset == number of blocks
+ * Any available blocks before search_start are skipped.
+ */
+static noinline int find_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *orig_root,
+ u64 num_bytes, u64 empty_size,
+ u64 search_start, u64 search_end,
+ u64 hint_byte, struct btrfs_key *ins,
+ u64 exclude_start, u64 exclude_nr,
+ int data)
+{
+ int ret = 0;
+ struct btrfs_root *root = orig_root->fs_info->extent_root;
+ u64 total_needed = num_bytes;
+ u64 *last_ptr = NULL;
+ u64 last_wanted = 0;
+ struct btrfs_block_group_cache *block_group = NULL;
+ int chunk_alloc_done = 0;
+ int empty_cluster = 2 * 1024 * 1024;
+ int allowed_chunk_alloc = 0;
+ struct list_head *head = NULL, *cur = NULL;
+ int loop = 0;
+ int extra_loop = 0;
+ struct btrfs_space_info *space_info;
+
+ WARN_ON(num_bytes < root->sectorsize);
+ btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+ ins->objectid = 0;
+ ins->offset = 0;
+
+ if (orig_root->ref_cows || empty_size)
+ allowed_chunk_alloc = 1;
+
+ if (data & BTRFS_BLOCK_GROUP_METADATA) {
+ last_ptr = &root->fs_info->last_alloc;
+ empty_cluster = 64 * 1024;
+ }
+
+ if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
+ last_ptr = &root->fs_info->last_data_alloc;
+
+ if (last_ptr) {
+ if (*last_ptr) {
+ hint_byte = *last_ptr;
+ last_wanted = *last_ptr;
+ } else
+ empty_size += empty_cluster;
+ } else {
+ empty_cluster = 0;
+ }
+ search_start = max(search_start, first_logical_byte(root, 0));
+ search_start = max(search_start, hint_byte);
+
+ if (last_wanted && search_start != last_wanted) {
+ last_wanted = 0;
+ empty_size += empty_cluster;
+ }
+
+ total_needed += empty_size;
+ block_group = btrfs_lookup_block_group(root->fs_info, search_start);
+ if (!block_group)
+ block_group = btrfs_lookup_first_block_group(root->fs_info,
+ search_start);
+ space_info = __find_space_info(root->fs_info, data);
+
+ down_read(&space_info->groups_sem);
+ while (1) {
+ struct btrfs_free_space *free_space;
+ /*
+ * the only way this happens if our hint points to a block
+ * group thats not of the proper type, while looping this
+ * should never happen
+ */
+ if (empty_size)
+ extra_loop = 1;
+
+ if (!block_group)
+ goto new_group_no_lock;
+
+ if (unlikely(!block_group->cached)) {
+ mutex_lock(&block_group->cache_mutex);
+ ret = cache_block_group(root, block_group);
+ mutex_unlock(&block_group->cache_mutex);
+ if (ret)
+ break;
+ }
+
+ mutex_lock(&block_group->alloc_mutex);
+ if (unlikely(!block_group_bits(block_group, data)))
+ goto new_group;
+
+ if (unlikely(block_group->ro))
+ goto new_group;
+
+ free_space = btrfs_find_free_space(block_group, search_start,
+ total_needed);
+ if (free_space) {
+ u64 start = block_group->key.objectid;
+ u64 end = block_group->key.objectid +
+ block_group->key.offset;
+
+ search_start = stripe_align(root, free_space->offset);
+
+ /* move on to the next group */
+ if (search_start + num_bytes >= search_end)
+ goto new_group;
+
+ /* move on to the next group */
+ if (search_start + num_bytes > end)
+ goto new_group;
+
+ if (last_wanted && search_start != last_wanted) {
+ total_needed += empty_cluster;
+ empty_size += empty_cluster;
+ last_wanted = 0;
+ /*
+ * if search_start is still in this block group
+ * then we just re-search this block group
+ */
+ if (search_start >= start &&
+ search_start < end) {
+ mutex_unlock(&block_group->alloc_mutex);
+ continue;
+ }
+
+ /* else we go to the next block group */
+ goto new_group;
+ }
+
+ if (exclude_nr > 0 &&
+ (search_start + num_bytes > exclude_start &&
+ search_start < exclude_start + exclude_nr)) {
+ search_start = exclude_start + exclude_nr;
+ /*
+ * if search_start is still in this block group
+ * then we just re-search this block group
+ */
+ if (search_start >= start &&
+ search_start < end) {
+ mutex_unlock(&block_group->alloc_mutex);
+ last_wanted = 0;
+ continue;
+ }
+
+ /* else we go to the next block group */
+ goto new_group;
+ }
+
+ ins->objectid = search_start;
+ ins->offset = num_bytes;
+
+ btrfs_remove_free_space_lock(block_group, search_start,
+ num_bytes);
+ /* we are all good, lets return */
+ mutex_unlock(&block_group->alloc_mutex);
+ break;
+ }
+new_group:
+ mutex_unlock(&block_group->alloc_mutex);
+ put_block_group(block_group);
+ block_group = NULL;
+new_group_no_lock:
+ /* don't try to compare new allocations against the
+ * last allocation any more
+ */
+ last_wanted = 0;
+
+ /*
+ * Here's how this works.
+ * loop == 0: we were searching a block group via a hint
+ * and didn't find anything, so we start at
+ * the head of the block groups and keep searching
+ * loop == 1: we're searching through all of the block groups
+ * if we hit the head again we have searched
+ * all of the block groups for this space and we
+ * need to try and allocate, if we cant error out.
+ * loop == 2: we allocated more space and are looping through
+ * all of the block groups again.
+ */
+ if (loop == 0) {
+ head = &space_info->block_groups;
+ cur = head->next;
+ loop++;
+ } else if (loop == 1 && cur == head) {
+ int keep_going;
+
+ /* at this point we give up on the empty_size
+ * allocations and just try to allocate the min
+ * space.
+ *
+ * The extra_loop field was set if an empty_size
+ * allocation was attempted above, and if this
+ * is try we need to try the loop again without
+ * the additional empty_size.
+ */
+ total_needed -= empty_size;
+ empty_size = 0;
+ keep_going = extra_loop;
+ loop++;
+
+ if (allowed_chunk_alloc && !chunk_alloc_done) {
+ up_read(&space_info->groups_sem);
+ ret = do_chunk_alloc(trans, root, num_bytes +
+ 2 * 1024 * 1024, data, 1);
+ down_read(&space_info->groups_sem);
+ if (ret < 0)
+ goto loop_check;
+ head = &space_info->block_groups;
+ /*
+ * we've allocated a new chunk, keep
+ * trying
+ */
+ keep_going = 1;
+ chunk_alloc_done = 1;
+ } else if (!allowed_chunk_alloc) {
+ space_info->force_alloc = 1;
+ }
+loop_check:
+ if (keep_going) {
+ cur = head->next;
+ extra_loop = 0;
+ } else {
+ break;
+ }
+ } else if (cur == head) {
+ break;
+ }
+
+ block_group = list_entry(cur, struct btrfs_block_group_cache,
+ list);
+ atomic_inc(&block_group->count);
+
+ search_start = block_group->key.objectid;
+ cur = cur->next;
+ }
+
+ /* we found what we needed */
+ if (ins->objectid) {
+ if (!(data & BTRFS_BLOCK_GROUP_DATA))
+ trans->block_group = block_group->key.objectid;
+
+ if (last_ptr)
+ *last_ptr = ins->objectid + ins->offset;
+ ret = 0;
+ } else if (!ret) {
+ printk(KERN_ERR "btrfs searching for %llu bytes, "
+ "num_bytes %llu, loop %d, allowed_alloc %d\n",
+ (unsigned long long)total_needed,
+ (unsigned long long)num_bytes,
+ loop, allowed_chunk_alloc);
+ ret = -ENOSPC;
+ }
+ if (block_group)
+ put_block_group(block_group);
+
+ up_read(&space_info->groups_sem);
+ return ret;
+}
+
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+{
+ struct btrfs_block_group_cache *cache;
+ struct list_head *l;
+
+ printk(KERN_INFO "space_info has %llu free, is %sfull\n",
+ (unsigned long long)(info->total_bytes - info->bytes_used -
+ info->bytes_pinned - info->bytes_reserved),
+ (info->full) ? "" : "not ");
+
+ down_read(&info->groups_sem);
+ list_for_each(l, &info->block_groups) {
+ cache = list_entry(l, struct btrfs_block_group_cache, list);
+ spin_lock(&cache->lock);
+ printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
+ "%llu pinned %llu reserved\n",
+ (unsigned long long)cache->key.objectid,
+ (unsigned long long)cache->key.offset,
+ (unsigned long long)btrfs_block_group_used(&cache->item),
+ (unsigned long long)cache->pinned,
+ (unsigned long long)cache->reserved);
+ btrfs_dump_free_space(cache, bytes);
+ spin_unlock(&cache->lock);
+ }
+ up_read(&info->groups_sem);
+}
+
+static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 num_bytes, u64 min_alloc_size,
+ u64 empty_size, u64 hint_byte,
+ u64 search_end, struct btrfs_key *ins,
+ u64 data)
+{
+ int ret;
+ u64 search_start = 0;
+ u64 alloc_profile;
+ struct btrfs_fs_info *info = root->fs_info;
+
+ if (data) {
+ alloc_profile = info->avail_data_alloc_bits &
+ info->data_alloc_profile;
+ data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
+ } else if (root == root->fs_info->chunk_root) {
+ alloc_profile = info->avail_system_alloc_bits &
+ info->system_alloc_profile;
+ data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
+ } else {
+ alloc_profile = info->avail_metadata_alloc_bits &
+ info->metadata_alloc_profile;
+ data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
+ }
+again:
+ data = btrfs_reduce_alloc_profile(root, data);
+ /*
+ * the only place that sets empty_size is btrfs_realloc_node, which
+ * is not called recursively on allocations
+ */
+ if (empty_size || root->ref_cows) {
+ if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ 2 * 1024 * 1024,
+ BTRFS_BLOCK_GROUP_METADATA |
+ (info->metadata_alloc_profile &
+ info->avail_metadata_alloc_bits), 0);
+ }
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ num_bytes + 2 * 1024 * 1024, data, 0);
+ }
+
+ WARN_ON(num_bytes < root->sectorsize);
+ ret = find_free_extent(trans, root, num_bytes, empty_size,
+ search_start, search_end, hint_byte, ins,
+ trans->alloc_exclude_start,
+ trans->alloc_exclude_nr, data);
+
+ if (ret == -ENOSPC && num_bytes > min_alloc_size) {
+ num_bytes = num_bytes >> 1;
+ num_bytes = num_bytes & ~(root->sectorsize - 1);
+ num_bytes = max(num_bytes, min_alloc_size);
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ num_bytes, data, 1);
+ goto again;
+ }
+ if (ret) {
+ struct btrfs_space_info *sinfo;
+
+ sinfo = __find_space_info(root->fs_info, data);
+ printk(KERN_ERR "btrfs allocation failed flags %llu, "
+ "wanted %llu\n", (unsigned long long)data,
+ (unsigned long long)num_bytes);
+ dump_space_info(sinfo, num_bytes);
+ BUG();
+ }
+
+ return ret;
+}
+
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+{
+ struct btrfs_block_group_cache *cache;
+ int ret = 0;
+
+ cache = btrfs_lookup_block_group(root->fs_info, start);
+ if (!cache) {
+ printk(KERN_ERR "Unable to find block group for %llu\n",
+ (unsigned long long)start);
+ return -ENOSPC;
+ }
+
+ ret = btrfs_discard_extent(root, start, len);
+
+ btrfs_add_free_space(cache, start, len);
+ put_block_group(cache);
+ update_reserved_extents(root, start, len, 0);
+
+ return ret;
+}
+
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 num_bytes, u64 min_alloc_size,
+ u64 empty_size, u64 hint_byte,
+ u64 search_end, struct btrfs_key *ins,
+ u64 data)
+{
+ int ret;
+ ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
+ empty_size, hint_byte, search_end, ins,
+ data);
+ update_reserved_extents(root, ins->objectid, ins->offset, 1);
+ return ret;
+}
+
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner, struct btrfs_key *ins)
+{
+ int ret;
+ int pending_ret;
+ u64 super_used;
+ u64 root_used;
+ u64 num_bytes = ins->offset;
+ u32 sizes[2];
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_root *extent_root = info->extent_root;
+ struct btrfs_extent_item *extent_item;
+ struct btrfs_extent_ref *ref;
+ struct btrfs_path *path;
+ struct btrfs_key keys[2];
+
+ if (parent == 0)
+ parent = ins->objectid;
+
+ /* block accounting for super block */
+ spin_lock(&info->delalloc_lock);
+ super_used = btrfs_super_bytes_used(&info->super_copy);
+ btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
+
+ /* block accounting for root item */
+ root_used = btrfs_root_used(&root->root_item);
+ btrfs_set_root_used(&root->root_item, root_used + num_bytes);
+ spin_unlock(&info->delalloc_lock);
+
+ if (root == extent_root) {
+ struct pending_extent_op *extent_op;
+
+ extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ BUG_ON(!extent_op);
+
+ extent_op->type = PENDING_EXTENT_INSERT;
+ extent_op->bytenr = ins->objectid;
+ extent_op->num_bytes = ins->offset;
+ extent_op->parent = parent;
+ extent_op->orig_parent = 0;
+ extent_op->generation = ref_generation;
+ extent_op->orig_generation = 0;
+ extent_op->level = (int)owner;
+ INIT_LIST_HEAD(&extent_op->list);
+ extent_op->del = 0;
+
+ mutex_lock(&root->fs_info->extent_ins_mutex);
+ set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
+ ins->objectid + ins->offset - 1,
+ EXTENT_WRITEBACK, GFP_NOFS);
+ set_state_private(&root->fs_info->extent_ins,
+ ins->objectid, (unsigned long)extent_op);
+ mutex_unlock(&root->fs_info->extent_ins_mutex);
+ goto update_block;
+ }
+
+ memcpy(&keys[0], ins, sizeof(*ins));
+ keys[1].objectid = ins->objectid;
+ keys[1].type = BTRFS_EXTENT_REF_KEY;
+ keys[1].offset = parent;
+ sizes[0] = sizeof(*extent_item);
+ sizes[1] = sizeof(*ref);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
+ sizes, 2);
+ BUG_ON(ret);
+
+ extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+ struct btrfs_extent_ref);
+
+ btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
+ btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
+ btrfs_set_ref_objectid(path->nodes[0], ref, owner);
+ btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ trans->alloc_exclude_start = 0;
+ trans->alloc_exclude_nr = 0;
+ btrfs_free_path(path);
+ finish_current_insert(trans, extent_root, 0);
+ pending_ret = del_pending_extents(trans, extent_root, 0);
+
+ if (ret)
+ goto out;
+ if (pending_ret) {
+ ret = pending_ret;
+ goto out;
+ }
+
+update_block:
+ ret = update_block_group(trans, root, ins->objectid,
+ ins->offset, 1, 0);
+ if (ret) {
+ printk(KERN_ERR "btrfs update block group failed for %llu "
+ "%llu\n", (unsigned long long)ins->objectid,
+ (unsigned long long)ins->offset);
+ BUG();
+ }
+out:
+ return ret;
+}
+
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner, struct btrfs_key *ins)
+{
+ int ret;
+
+ if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
+ return 0;
+ ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
+ ref_generation, owner, ins);
+ update_reserved_extents(root, ins->objectid, ins->offset, 0);
+ return ret;
+}
+
+/*
+ * this is used by the tree logging recovery code. It records that
+ * an extent has been allocated and makes sure to clear the free
+ * space cache bits as well
+ */
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner, struct btrfs_key *ins)
+{
+ int ret;
+ struct btrfs_block_group_cache *block_group;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+ mutex_lock(&block_group->cache_mutex);
+ cache_block_group(root, block_group);
+ mutex_unlock(&block_group->cache_mutex);
+
+ ret = btrfs_remove_free_space(block_group, ins->objectid,
+ ins->offset);
+ BUG_ON(ret);
+ put_block_group(block_group);
+ ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
+ ref_generation, owner, ins);
+ return ret;
+}
+
+/*
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.
+ *
+ * returns 0 if everything worked, non-zero otherwise.
+ */
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 num_bytes, u64 parent, u64 min_alloc_size,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid, u64 empty_size, u64 hint_byte,
+ u64 search_end, struct btrfs_key *ins, u64 data)
+{
+ int ret;
+
+ ret = __btrfs_reserve_extent(trans, root, num_bytes,
+ min_alloc_size, empty_size, hint_byte,
+ search_end, ins, data);
+ BUG_ON(ret);
+ if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ ret = __btrfs_alloc_reserved_extent(trans, root, parent,
+ root_objectid, ref_generation,
+ owner_objectid, ins);
+ BUG_ON(ret);
+
+ } else {
+ update_reserved_extents(root, ins->objectid, ins->offset, 1);
+ }
+ return ret;
+}
+
+struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u32 blocksize)
+{
+ struct extent_buffer *buf;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+ btrfs_set_header_generation(buf, trans->transid);
+ btrfs_tree_lock(buf);
+ clean_tree_block(trans, root, buf);
+ btrfs_set_buffer_uptodate(buf);
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ set_extent_dirty(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
+ } else {
+ set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+ buf->start + buf->len - 1, GFP_NOFS);
+ }
+ trans->blocks_used++;
+ return buf;
+}
+
+/*
+ * helper function to allocate a block for a given tree
+ * returns the tree buffer or NULL.
+ */
+struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u32 blocksize, u64 parent,
+ u64 root_objectid,
+ u64 ref_generation,
+ int level,
+ u64 hint,
+ u64 empty_size)
+{
+ struct btrfs_key ins;
+ int ret;
+ struct extent_buffer *buf;
+
+ ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
+ root_objectid, ref_generation, level,
+ empty_size, hint, (u64)-1, &ins, 0);
+ if (ret) {
+ BUG_ON(ret > 0);
+ return ERR_PTR(ret);
+ }
+
+ buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
+ return buf;
+}
+
+int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *leaf)
+{
+ u64 leaf_owner;
+ u64 leaf_generation;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ int i;
+ int nritems;
+ int ret;
+
+ BUG_ON(!btrfs_is_leaf(leaf));
+ nritems = btrfs_header_nritems(leaf);
+ leaf_owner = btrfs_header_owner(leaf);
+ leaf_generation = btrfs_header_generation(leaf);
+
+ for (i = 0; i < nritems; i++) {
+ u64 disk_bytenr;
+ cond_resched();
+
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ /*
+ * FIXME make sure to insert a trans record that
+ * repeats the snapshot del on crash
+ */
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (disk_bytenr == 0)
+ continue;
+
+ ret = __btrfs_free_extent(trans, root, disk_bytenr,
+ btrfs_file_extent_disk_num_bytes(leaf, fi),
+ leaf->start, leaf_owner, leaf_generation,
+ key.objectid, 0);
+ BUG_ON(ret);
+
+ atomic_inc(&root->fs_info->throttle_gen);
+ wake_up(&root->fs_info->transaction_throttle);
+ cond_resched();
+ }
+ return 0;
+}
+
+static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_leaf_ref *ref)
+{
+ int i;
+ int ret;
+ struct btrfs_extent_info *info = ref->extents;
+
+ for (i = 0; i < ref->nritems; i++) {
+ ret = __btrfs_free_extent(trans, root, info->bytenr,
+ info->num_bytes, ref->bytenr,
+ ref->owner, ref->generation,
+ info->objectid, 0);
+
+ atomic_inc(&root->fs_info->throttle_gen);
+ wake_up(&root->fs_info->transaction_throttle);
+ cond_resched();
+
+ BUG_ON(ret);
+ info++;
+ }
+
+ return 0;
+}
+
+static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+ u64 len, u32 *refs)
+{
+ int ret;
+
+ ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
+ BUG_ON(ret);
+
+#if 0 /* some debugging code in case we see problems here */
+ /* if the refs count is one, it won't get increased again. But
+ * if the ref count is > 1, someone may be decreasing it at
+ * the same time we are.
+ */
+ if (*refs != 1) {
+ struct extent_buffer *eb = NULL;
+ eb = btrfs_find_create_tree_block(root, start, len);
+ if (eb)
+ btrfs_tree_lock(eb);
+
+ mutex_lock(&root->fs_info->alloc_mutex);
+ ret = lookup_extent_ref(NULL, root, start, len, refs);
+ BUG_ON(ret);
+ mutex_unlock(&root->fs_info->alloc_mutex);
+
+ if (eb) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ if (*refs == 1) {
+ printk(KERN_ERR "btrfs block %llu went down to one "
+ "during drop_snap\n", (unsigned long long)start);
+ }
+
+ }
+#endif
+
+ cond_resched();
+ return ret;
+}
+
+/*
+ * helper function for drop_snapshot, this walks down the tree dropping ref
+ * counts as it goes.
+ */
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level)
+{
+ u64 root_owner;
+ u64 root_gen;
+ u64 bytenr;
+ u64 ptr_gen;
+ struct extent_buffer *next;
+ struct extent_buffer *cur;
+ struct extent_buffer *parent;
+ struct btrfs_leaf_ref *ref;
+ u32 blocksize;
+ int ret;
+ u32 refs;
+
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+ ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
+ path->nodes[*level]->len, &refs);
+ BUG_ON(ret);
+ if (refs > 1)
+ goto out;
+
+ /*
+ * walk down to the last node level and free all the leaves
+ */
+ while (*level >= 0) {
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+ cur = path->nodes[*level];
+
+ if (btrfs_header_level(cur) != *level)
+ WARN_ON(1);
+
+ if (path->slots[*level] >=
+ btrfs_header_nritems(cur))
+ break;
+ if (*level == 0) {
+ ret = btrfs_drop_leaf_ref(trans, root, cur);
+ BUG_ON(ret);
+ break;
+ }
+ bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+ ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+ blocksize = btrfs_level_size(root, *level - 1);
+
+ ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+ BUG_ON(ret);
+ if (refs != 1) {
+ parent = path->nodes[*level];
+ root_owner = btrfs_header_owner(parent);
+ root_gen = btrfs_header_generation(parent);
+ path->slots[*level]++;
+
+ ret = __btrfs_free_extent(trans, root, bytenr,
+ blocksize, parent->start,
+ root_owner, root_gen,
+ *level - 1, 1);
+ BUG_ON(ret);
+
+ atomic_inc(&root->fs_info->throttle_gen);
+ wake_up(&root->fs_info->transaction_throttle);
+ cond_resched();
+
+ continue;
+ }
+ /*
+ * at this point, we have a single ref, and since the
+ * only place referencing this extent is a dead root
+ * the reference count should never go higher.
+ * So, we don't need to check it again
+ */
+ if (*level == 1) {
+ ref = btrfs_lookup_leaf_ref(root, bytenr);
+ if (ref && ref->generation != ptr_gen) {
+ btrfs_free_leaf_ref(root, ref);
+ ref = NULL;
+ }
+ if (ref) {
+ ret = cache_drop_leaf_ref(trans, root, ref);
+ BUG_ON(ret);
+ btrfs_remove_leaf_ref(root, ref);
+ btrfs_free_leaf_ref(root, ref);
+ *level = 0;
+ break;
+ }
+ }
+ next = btrfs_find_tree_block(root, bytenr, blocksize);
+ if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
+ free_extent_buffer(next);
+
+ next = read_tree_block(root, bytenr, blocksize,
+ ptr_gen);
+ cond_resched();
+#if 0
+ /*
+ * this is a debugging check and can go away
+ * the ref should never go all the way down to 1
+ * at this point
+ */
+ ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
+ &refs);
+ BUG_ON(ret);
+ WARN_ON(refs != 1);
+#endif
+ }
+ WARN_ON(*level <= 0);
+ if (path->nodes[*level-1])
+ free_extent_buffer(path->nodes[*level-1]);
+ path->nodes[*level-1] = next;
+ *level = btrfs_header_level(next);
+ path->slots[*level] = 0;
+ cond_resched();
+ }
+out:
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+ if (path->nodes[*level] == root->node) {
+ parent = path->nodes[*level];
+ bytenr = path->nodes[*level]->start;
+ } else {
+ parent = path->nodes[*level + 1];
+ bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
+ }
+
+ blocksize = btrfs_level_size(root, *level);
+ root_owner = btrfs_header_owner(parent);
+ root_gen = btrfs_header_generation(parent);
+
+ ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+ parent->start, root_owner, root_gen,
+ *level, 1);
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level += 1;
+ BUG_ON(ret);
+
+ cond_resched();
+ return 0;
+}
+
+/*
+ * helper function for drop_subtree, this function is similar to
+ * walk_down_tree. The main difference is that it checks reference
+ * counts while tree blocks are locked.
+ */
+static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level)
+{
+ struct extent_buffer *next;
+ struct extent_buffer *cur;
+ struct extent_buffer *parent;
+ u64 bytenr;
+ u64 ptr_gen;
+ u32 blocksize;
+ u32 refs;
+ int ret;
+
+ cur = path->nodes[*level];
+ ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
+ &refs);
+ BUG_ON(ret);
+ if (refs > 1)
+ goto out;
+
+ while (*level >= 0) {
+ cur = path->nodes[*level];
+ if (*level == 0) {
+ ret = btrfs_drop_leaf_ref(trans, root, cur);
+ BUG_ON(ret);
+ clean_tree_block(trans, root, cur);
+ break;
+ }
+ if (path->slots[*level] >= btrfs_header_nritems(cur)) {
+ clean_tree_block(trans, root, cur);
+ break;
+ }
+
+ bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+ blocksize = btrfs_level_size(root, *level - 1);
+ ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+
+ next = read_tree_block(root, bytenr, blocksize, ptr_gen);
+ btrfs_tree_lock(next);
+
+ ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
+ &refs);
+ BUG_ON(ret);
+ if (refs > 1) {
+ parent = path->nodes[*level];
+ ret = btrfs_free_extent(trans, root, bytenr,
+ blocksize, parent->start,
+ btrfs_header_owner(parent),
+ btrfs_header_generation(parent),
+ *level - 1, 1);
+ BUG_ON(ret);
+ path->slots[*level]++;
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ continue;
+ }
+
+ *level = btrfs_header_level(next);
+ path->nodes[*level] = next;
+ path->slots[*level] = 0;
+ path->locks[*level] = 1;
+ cond_resched();
+ }
+out:
+ parent = path->nodes[*level + 1];
+ bytenr = path->nodes[*level]->start;
+ blocksize = path->nodes[*level]->len;
+
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize,
+ parent->start, btrfs_header_owner(parent),
+ btrfs_header_generation(parent), *level, 1);
+ BUG_ON(ret);
+
+ if (path->locks[*level]) {
+ btrfs_tree_unlock(path->nodes[*level]);
+ path->locks[*level] = 0;
+ }
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level += 1;
+ cond_resched();
+ return 0;
+}
+
+/*
+ * helper for dropping snapshots. This walks back up the tree in the path
+ * to find the first node higher up where we haven't yet gone through
+ * all the slots
+ */
+static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int *level, int max_level)
+{
+ u64 root_owner;
+ u64 root_gen;
+ struct btrfs_root_item *root_item = &root->root_item;
+ int i;
+ int slot;
+ int ret;
+
+ for (i = *level; i < max_level && path->nodes[i]; i++) {
+ slot = path->slots[i];
+ if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+ struct extent_buffer *node;
+ struct btrfs_disk_key disk_key;
+ node = path->nodes[i];
+ path->slots[i]++;
+ *level = i;
+ WARN_ON(*level == 0);
+ btrfs_node_key(node, &disk_key, path->slots[i]);
+ memcpy(&root_item->drop_progress,
+ &disk_key, sizeof(disk_key));
+ root_item->drop_level = i;
+ return 0;
+ } else {
+ struct extent_buffer *parent;
+ if (path->nodes[*level] == root->node)
+ parent = path->nodes[*level];
+ else
+ parent = path->nodes[*level + 1];
+
+ root_owner = btrfs_header_owner(parent);
+ root_gen = btrfs_header_generation(parent);
+
+ clean_tree_block(trans, root, path->nodes[*level]);
+ ret = btrfs_free_extent(trans, root,
+ path->nodes[*level]->start,
+ path->nodes[*level]->len,
+ parent->start, root_owner,
+ root_gen, *level, 1);
+ BUG_ON(ret);
+ if (path->locks[*level]) {
+ btrfs_tree_unlock(path->nodes[*level]);
+ path->locks[*level] = 0;
+ }
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level = i + 1;
+ }
+ }
+ return 1;
+}
+
+/*
+ * drop the reference count on the tree rooted at 'snap'. This traverses
+ * the tree freeing any blocks that have a ref count of zero after being
+ * decremented.
+ */
+int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root)
+{
+ int ret = 0;
+ int wret;
+ int level;
+ struct btrfs_path *path;
+ int i;
+ int orig_level;
+ struct btrfs_root_item *root_item = &root->root_item;
+
+ WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ level = btrfs_header_level(root->node);
+ orig_level = level;
+ if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+ path->nodes[level] = root->node;
+ extent_buffer_get(root->node);
+ path->slots[level] = 0;
+ } else {
+ struct btrfs_key key;
+ struct btrfs_disk_key found_key;
+ struct extent_buffer *node;
+
+ btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+ level = root_item->drop_level;
+ path->lowest_level = level;
+ wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+ node = path->nodes[level];
+ btrfs_node_key(node, &found_key, path->slots[level]);
+ WARN_ON(memcmp(&found_key, &root_item->drop_progress,
+ sizeof(found_key)));
+ /*
+ * unlock our path, this is safe because only this
+ * function is allowed to delete this snapshot
+ */
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (path->nodes[i] && path->locks[i]) {
+ path->locks[i] = 0;
+ btrfs_tree_unlock(path->nodes[i]);
+ }
+ }
+ }
+ while (1) {
+ wret = walk_down_tree(trans, root, path, &level);
+ if (wret > 0)
+ break;
+ if (wret < 0)
+ ret = wret;
+
+ wret = walk_up_tree(trans, root, path, &level,
+ BTRFS_MAX_LEVEL);
+ if (wret > 0)
+ break;
+ if (wret < 0)
+ ret = wret;
+ if (trans->transaction->in_commit) {
+ ret = -EAGAIN;
+ break;
+ }
+ atomic_inc(&root->fs_info->throttle_gen);
+ wake_up(&root->fs_info->transaction_throttle);
+ }
+ for (i = 0; i <= orig_level; i++) {
+ if (path->nodes[i]) {
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *node,
+ struct extent_buffer *parent)
+{
+ struct btrfs_path *path;
+ int level;
+ int parent_level;
+ int ret = 0;
+ int wret;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ BUG_ON(!btrfs_tree_locked(parent));
+ parent_level = btrfs_header_level(parent);
+ extent_buffer_get(parent);
+ path->nodes[parent_level] = parent;
+ path->slots[parent_level] = btrfs_header_nritems(parent);
+
+ BUG_ON(!btrfs_tree_locked(node));
+ level = btrfs_header_level(node);
+ extent_buffer_get(node);
+ path->nodes[level] = node;
+ path->slots[level] = 0;
+
+ while (1) {
+ wret = walk_down_subtree(trans, root, path, &level);
+ if (wret < 0)
+ ret = wret;
+ if (wret != 0)
+ break;
+
+ wret = walk_up_tree(trans, root, path, &level, parent_level);
+ if (wret < 0)
+ ret = wret;
+ if (wret != 0)
+ break;
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static unsigned long calc_ra(unsigned long start, unsigned long last,
+ unsigned long nr)
+{
+ return min(last, start + nr - 1);
+}
+
+static noinline int relocate_inode_pages(struct inode *inode, u64 start,
+ u64 len)
+{
+ u64 page_start;
+ u64 page_end;
+ unsigned long first_index;
+ unsigned long last_index;
+ unsigned long i;
+ struct page *page;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct file_ra_state *ra;
+ struct btrfs_ordered_extent *ordered;
+ unsigned int total_read = 0;
+ unsigned int total_dirty = 0;
+ int ret = 0;
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+
+ mutex_lock(&inode->i_mutex);
+ first_index = start >> PAGE_CACHE_SHIFT;
+ last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
+
+ /* make sure the dirty trick played by the caller work */
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ first_index, last_index);
+ if (ret)
+ goto out_unlock;
+
+ file_ra_state_init(ra, inode->i_mapping);
+
+ for (i = first_index ; i <= last_index; i++) {
+ if (total_read % ra->ra_pages == 0) {
+ btrfs_force_ra(inode->i_mapping, ra, NULL, i,
+ calc_ra(i, last_index, ra->ra_pages));
+ }
+ total_read++;
+again:
+ if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
+ BUG_ON(1);
+ page = grab_cache_page(inode->i_mapping, i);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ ret = -EIO;
+ goto out_unlock;
+ }
+ }
+ wait_on_page_writeback(page);
+
+ page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+ lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+ set_page_extent_mapped(page);
+
+ if (i == first_index)
+ set_extent_bits(io_tree, page_start, page_end,
+ EXTENT_BOUNDARY, GFP_NOFS);
+ btrfs_set_extent_delalloc(inode, page_start, page_end);
+
+ set_page_dirty(page);
+ total_dirty++;
+
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+
+out_unlock:
+ kfree(ra);
+ mutex_unlock(&inode->i_mutex);
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
+ return ret;
+}
+
+static noinline int relocate_data_extent(struct inode *reloc_inode,
+ struct btrfs_key *extent_key,
+ u64 offset)
+{
+ struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+ struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
+ struct extent_map *em;
+ u64 start = extent_key->objectid - offset;
+ u64 end = start + extent_key->offset - 1;
+
+ em = alloc_extent_map(GFP_NOFS);
+ BUG_ON(!em || IS_ERR(em));
+
+ em->start = start;
+ em->len = extent_key->offset;
+ em->block_len = extent_key->offset;
+ em->block_start = extent_key->objectid;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ /* setup extent map to cheat btrfs_readpage */
+ lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
+ while (1) {
+ int ret;
+ spin_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ spin_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(reloc_inode, start, end, 0);
+ }
+ unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
+
+ return relocate_inode_pages(reloc_inode, start, extent_key->offset);
+}
+
+struct btrfs_ref_path {
+ u64 extent_start;
+ u64 nodes[BTRFS_MAX_LEVEL];
+ u64 root_objectid;
+ u64 root_generation;
+ u64 owner_objectid;
+ u32 num_refs;
+ int lowest_level;
+ int current_level;
+ int shared_level;
+
+ struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
+ u64 new_nodes[BTRFS_MAX_LEVEL];
+};
+
+struct disk_extent {
+ u64 ram_bytes;
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 offset;
+ u64 num_bytes;
+ u8 compression;
+ u8 encryption;
+ u16 other_encoding;
+};
+
+static int is_cowonly_root(u64 root_objectid)
+{
+ if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+ root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+ root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+ root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+ root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+ root_objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return 1;
+ return 0;
+}
+
+static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_ref_path *ref_path,
+ int first_time)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_extent_ref *ref;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 bytenr;
+ u32 nritems;
+ int level;
+ int ret = 1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (first_time) {
+ ref_path->lowest_level = -1;
+ ref_path->current_level = -1;
+ ref_path->shared_level = -1;
+ goto walk_up;
+ }
+walk_down:
+ level = ref_path->current_level - 1;
+ while (level >= -1) {
+ u64 parent;
+ if (level < ref_path->lowest_level)
+ break;
+
+ if (level >= 0)
+ bytenr = ref_path->nodes[level];
+ else
+ bytenr = ref_path->extent_start;
+ BUG_ON(bytenr == 0);
+
+ parent = ref_path->nodes[level + 1];
+ ref_path->nodes[level + 1] = 0;
+ ref_path->current_level = level;
+ BUG_ON(parent == 0);
+
+ key.objectid = bytenr;
+ key.offset = parent + 1;
+ key.type = BTRFS_EXTENT_REF_KEY;
+
+ ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ goto next;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid == bytenr &&
+ found_key.type == BTRFS_EXTENT_REF_KEY) {
+ if (level < ref_path->shared_level)
+ ref_path->shared_level = level;
+ goto found;
+ }
+next:
+ level--;
+ btrfs_release_path(extent_root, path);
+ cond_resched();
+ }
+ /* reached lowest level */
+ ret = 1;
+ goto out;
+walk_up:
+ level = ref_path->current_level;
+ while (level < BTRFS_MAX_LEVEL - 1) {
+ u64 ref_objectid;
+
+ if (level >= 0)
+ bytenr = ref_path->nodes[level];
+ else
+ bytenr = ref_path->extent_start;
+
+ BUG_ON(bytenr == 0);
+
+ key.objectid = bytenr;
+ key.offset = 0;
+ key.type = BTRFS_EXTENT_REF_KEY;
+
+ ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ /* the extent was freed by someone */
+ if (ref_path->lowest_level == level)
+ goto out;
+ btrfs_release_path(extent_root, path);
+ goto walk_down;
+ }
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != bytenr ||
+ found_key.type != BTRFS_EXTENT_REF_KEY) {
+ /* the extent was freed by someone */
+ if (ref_path->lowest_level == level) {
+ ret = 1;
+ goto out;
+ }
+ btrfs_release_path(extent_root, path);
+ goto walk_down;
+ }
+found:
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_ref);
+ ref_objectid = btrfs_ref_objectid(leaf, ref);
+ if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+ if (first_time) {
+ level = (int)ref_objectid;
+ BUG_ON(level >= BTRFS_MAX_LEVEL);
+ ref_path->lowest_level = level;
+ ref_path->current_level = level;
+ ref_path->nodes[level] = bytenr;
+ } else {
+ WARN_ON(ref_objectid != level);
+ }
+ } else {
+ WARN_ON(level != -1);
+ }
+ first_time = 0;
+
+ if (ref_path->lowest_level == level) {
+ ref_path->owner_objectid = ref_objectid;
+ ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
+ }
+
+ /*
+ * the block is tree root or the block isn't in reference
+ * counted tree.
+ */
+ if (found_key.objectid == found_key.offset ||
+ is_cowonly_root(btrfs_ref_root(leaf, ref))) {
+ ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+ ref_path->root_generation =
+ btrfs_ref_generation(leaf, ref);
+ if (level < 0) {
+ /* special reference from the tree log */
+ ref_path->nodes[0] = found_key.offset;
+ ref_path->current_level = 0;
+ }
+ ret = 0;
+ goto out;
+ }
+
+ level++;
+ BUG_ON(ref_path->nodes[level] != 0);
+ ref_path->nodes[level] = found_key.offset;
+ ref_path->current_level = level;
+
+ /*
+ * the reference was created in the running transaction,
+ * no need to continue walking up.
+ */
+ if (btrfs_ref_generation(leaf, ref) == trans->transid) {
+ ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+ ref_path->root_generation =
+ btrfs_ref_generation(leaf, ref);
+ ret = 0;
+ goto out;
+ }
+
+ btrfs_release_path(extent_root, path);
+ cond_resched();
+ }
+ /* reached max tree level, but no tree root found. */
+ BUG();
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_ref_path *ref_path,
+ u64 extent_start)
+{
+ memset(ref_path, 0, sizeof(*ref_path));
+ ref_path->extent_start = extent_start;
+
+ return __next_ref_path(trans, extent_root, ref_path, 1);
+}
+
+static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_ref_path *ref_path)
+{
+ return __next_ref_path(trans, extent_root, ref_path, 0);
+}
+
+static noinline int get_new_locations(struct inode *reloc_inode,
+ struct btrfs_key *extent_key,
+ u64 offset, int no_fragment,
+ struct disk_extent **extents,
+ int *nr_extents)
+{
+ struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf;
+ struct disk_extent *exts = *extents;
+ struct btrfs_key found_key;
+ u64 cur_pos;
+ u64 last_byte;
+ u32 nritems;
+ int nr = 0;
+ int max = *nr_extents;
+ int ret;
+
+ WARN_ON(!no_fragment && *extents);
+ if (!exts) {
+ max = 1;
+ exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
+ if (!exts)
+ return -ENOMEM;
+ }
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ cur_pos = extent_key->objectid - offset;
+ last_byte = extent_key->objectid + extent_key->offset;
+ ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+ cur_pos, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ while (1) {
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.offset != cur_pos ||
+ found_key.type != BTRFS_EXTENT_DATA_KEY ||
+ found_key.objectid != reloc_inode->i_ino)
+ break;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) !=
+ BTRFS_FILE_EXTENT_REG ||
+ btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
+ break;
+
+ if (nr == max) {
+ struct disk_extent *old = exts;
+ max *= 2;
+ exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
+ memcpy(exts, old, sizeof(*exts) * nr);
+ if (old != *extents)
+ kfree(old);
+ }
+
+ exts[nr].disk_bytenr =
+ btrfs_file_extent_disk_bytenr(leaf, fi);
+ exts[nr].disk_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf, fi);
+ exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
+ exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+ exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+ exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+ exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+ fi);
+ BUG_ON(exts[nr].offset > 0);
+ BUG_ON(exts[nr].compression || exts[nr].encryption);
+ BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
+
+ cur_pos += exts[nr].num_bytes;
+ nr++;
+
+ if (cur_pos + offset >= last_byte)
+ break;
+
+ if (no_fragment) {
+ ret = 1;
+ goto out;
+ }
+ path->slots[0]++;
+ }
+
+ BUG_ON(cur_pos + offset > last_byte);
+ if (cur_pos + offset < last_byte) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ if (ret) {
+ if (exts != *extents)
+ kfree(exts);
+ } else {
+ *extents = exts;
+ *nr_extents = nr;
+ }
+ return ret;
+}
+
+static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *extent_key,
+ struct btrfs_key *leaf_key,
+ struct btrfs_ref_path *ref_path,
+ struct disk_extent *new_extents,
+ int nr_extents)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct inode *inode = NULL;
+ struct btrfs_key key;
+ u64 lock_start = 0;
+ u64 lock_end = 0;
+ u64 num_bytes;
+ u64 ext_offset;
+ u64 first_pos;
+ u32 nritems;
+ int nr_scaned = 0;
+ int extent_locked = 0;
+ int extent_type;
+ int ret;
+
+ memcpy(&key, leaf_key, sizeof(key));
+ first_pos = INT_LIMIT(loff_t) - extent_key->offset;
+ if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
+ if (key.objectid < ref_path->owner_objectid ||
+ (key.objectid == ref_path->owner_objectid &&
+ key.type < BTRFS_EXTENT_DATA_KEY)) {
+ key.objectid = ref_path->owner_objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ }
+ }
+
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+next:
+ if (extent_locked && ret > 0) {
+ /*
+ * the file extent item was modified by someone
+ * before the extent got locked.
+ */
+ unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+ lock_end, GFP_NOFS);
+ extent_locked = 0;
+ }
+
+ if (path->slots[0] >= nritems) {
+ if (++nr_scaned > 2)
+ break;
+
+ BUG_ON(extent_locked);
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
+ if ((key.objectid > ref_path->owner_objectid) ||
+ (key.objectid == ref_path->owner_objectid &&
+ key.type > BTRFS_EXTENT_DATA_KEY) ||
+ (key.offset >= first_pos + extent_key->offset))
+ break;
+ }
+
+ if (inode && key.objectid != inode->i_ino) {
+ BUG_ON(extent_locked);
+ btrfs_release_path(root, path);
+ mutex_unlock(&inode->i_mutex);
+ iput(inode);
+ inode = NULL;
+ continue;
+ }
+
+ if (key.type != BTRFS_EXTENT_DATA_KEY) {
+ path->slots[0]++;
+ ret = 1;
+ goto next;
+ }
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+ if ((extent_type != BTRFS_FILE_EXTENT_REG &&
+ extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
+ (btrfs_file_extent_disk_bytenr(leaf, fi) !=
+ extent_key->objectid)) {
+ path->slots[0]++;
+ ret = 1;
+ goto next;
+ }
+
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+ ext_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (first_pos > key.offset - ext_offset)
+ first_pos = key.offset - ext_offset;
+
+ if (!extent_locked) {
+ lock_start = key.offset;
+ lock_end = lock_start + num_bytes - 1;
+ } else {
+ if (lock_start > key.offset ||
+ lock_end + 1 < key.offset + num_bytes) {
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ lock_start, lock_end, GFP_NOFS);
+ extent_locked = 0;
+ }
+ }
+
+ if (!inode) {
+ btrfs_release_path(root, path);
+
+ inode = btrfs_iget_locked(root->fs_info->sb,
+ key.objectid, root);
+ if (inode->i_state & I_NEW) {
+ BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->location.objectid =
+ key.objectid;
+ BTRFS_I(inode)->location.type =
+ BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.offset = 0;
+ btrfs_read_locked_inode(inode);
+ unlock_new_inode(inode);
+ }
+ /*
+ * some code call btrfs_commit_transaction while
+ * holding the i_mutex, so we can't use mutex_lock
+ * here.
+ */
+ if (is_bad_inode(inode) ||
+ !mutex_trylock(&inode->i_mutex)) {
+ iput(inode);
+ inode = NULL;
+ key.offset = (u64)-1;
+ goto skip;
+ }
+ }
+
+ if (!extent_locked) {
+ struct btrfs_ordered_extent *ordered;
+
+ btrfs_release_path(root, path);
+
+ lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+ lock_end, GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ lock_end);
+ if (ordered &&
+ ordered->file_offset <= lock_end &&
+ ordered->file_offset + ordered->len > lock_start) {
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ lock_start, lock_end, GFP_NOFS);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ key.offset += num_bytes;
+ goto skip;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+
+ extent_locked = 1;
+ continue;
+ }
+
+ if (nr_extents == 1) {
+ /* update extent pointer in place */
+ btrfs_set_file_extent_disk_bytenr(leaf, fi,
+ new_extents[0].disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+ new_extents[0].disk_num_bytes);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_drop_extent_cache(inode, key.offset,
+ key.offset + num_bytes - 1, 0);
+
+ ret = btrfs_inc_extent_ref(trans, root,
+ new_extents[0].disk_bytenr,
+ new_extents[0].disk_num_bytes,
+ leaf->start,
+ root->root_key.objectid,
+ trans->transid,
+ key.objectid);
+ BUG_ON(ret);
+
+ ret = btrfs_free_extent(trans, root,
+ extent_key->objectid,
+ extent_key->offset,
+ leaf->start,
+ btrfs_header_owner(leaf),
+ btrfs_header_generation(leaf),
+ key.objectid, 0);
+ BUG_ON(ret);
+
+ btrfs_release_path(root, path);
+ key.offset += num_bytes;
+ } else {
+ BUG_ON(1);
+#if 0
+ u64 alloc_hint;
+ u64 extent_len;
+ int i;
+ /*
+ * drop old extent pointer at first, then insert the
+ * new pointers one bye one
+ */
+ btrfs_release_path(root, path);
+ ret = btrfs_drop_extents(trans, root, inode, key.offset,
+ key.offset + num_bytes,
+ key.offset, &alloc_hint);
+ BUG_ON(ret);
+
+ for (i = 0; i < nr_extents; i++) {
+ if (ext_offset >= new_extents[i].num_bytes) {
+ ext_offset -= new_extents[i].num_bytes;
+ continue;
+ }
+ extent_len = min(new_extents[i].num_bytes -
+ ext_offset, num_bytes);
+
+ ret = btrfs_insert_empty_item(trans, root,
+ path, &key,
+ sizeof(*fi));
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_disk_bytenr(leaf, fi,
+ new_extents[i].disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+ new_extents[i].disk_num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi,
+ new_extents[i].ram_bytes);
+
+ btrfs_set_file_extent_compression(leaf, fi,
+ new_extents[i].compression);
+ btrfs_set_file_extent_encryption(leaf, fi,
+ new_extents[i].encryption);
+ btrfs_set_file_extent_other_encoding(leaf, fi,
+ new_extents[i].other_encoding);
+
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_len);
+ ext_offset += new_extents[i].offset;
+ btrfs_set_file_extent_offset(leaf, fi,
+ ext_offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_drop_extent_cache(inode, key.offset,
+ key.offset + extent_len - 1, 0);
+
+ ret = btrfs_inc_extent_ref(trans, root,
+ new_extents[i].disk_bytenr,
+ new_extents[i].disk_num_bytes,
+ leaf->start,
+ root->root_key.objectid,
+ trans->transid, key.objectid);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ inode_add_bytes(inode, extent_len);
+
+ ext_offset = 0;
+ num_bytes -= extent_len;
+ key.offset += extent_len;
+
+ if (num_bytes == 0)
+ break;
+ }
+ BUG_ON(i >= nr_extents);
+#endif
+ }
+
+ if (extent_locked) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+ lock_end, GFP_NOFS);
+ extent_locked = 0;
+ }
+skip:
+ if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
+ key.offset >= first_pos + extent_key->offset)
+ break;
+
+ cond_resched();
+ }
+ ret = 0;
+out:
+ btrfs_release_path(root, path);
+ if (inode) {
+ mutex_unlock(&inode->i_mutex);
+ if (extent_locked) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+ lock_end, GFP_NOFS);
+ }
+ iput(inode);
+ }
+ return ret;
+}
+
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf, u64 orig_start)
+{
+ int level;
+ int ret;
+
+ BUG_ON(btrfs_header_generation(buf) != trans->transid);
+ BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
+ level = btrfs_header_level(buf);
+ if (level == 0) {
+ struct btrfs_leaf_ref *ref;
+ struct btrfs_leaf_ref *orig_ref;
+
+ orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
+ if (!orig_ref)
+ return -ENOENT;
+
+ ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
+ if (!ref) {
+ btrfs_free_leaf_ref(root, orig_ref);
+ return -ENOMEM;
+ }
+
+ ref->nritems = orig_ref->nritems;
+ memcpy(ref->extents, orig_ref->extents,
+ sizeof(ref->extents[0]) * ref->nritems);
+
+ btrfs_free_leaf_ref(root, orig_ref);
+
+ ref->root_gen = trans->transid;
+ ref->bytenr = buf->start;
+ ref->owner = btrfs_header_owner(buf);
+ ref->generation = btrfs_header_generation(buf);
+ ret = btrfs_add_leaf_ref(root, ref, 0);
+ WARN_ON(ret);
+ btrfs_free_leaf_ref(root, ref);
+ }
+ return 0;
+}
+
+static noinline int invalidate_extent_cache(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_block_group_cache *group,
+ struct btrfs_root *target_root)
+{
+ struct btrfs_key key;
+ struct inode *inode = NULL;
+ struct btrfs_file_extent_item *fi;
+ u64 num_bytes;
+ u64 skip_objectid = 0;
+ u32 nritems;
+ u32 i;
+
+ nritems = btrfs_header_nritems(leaf);
+ for (i = 0; i < nritems; i++) {
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (key.objectid == skip_objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
+ continue;
+ if (!inode || inode->i_ino != key.objectid) {
+ iput(inode);
+ inode = btrfs_ilookup(target_root->fs_info->sb,
+ key.objectid, target_root, 1);
+ }
+ if (!inode) {
+ skip_objectid = key.objectid;
+ continue;
+ }
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+
+ lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
+ key.offset + num_bytes - 1, GFP_NOFS);
+ btrfs_drop_extent_cache(inode, key.offset,
+ key.offset + num_bytes - 1, 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
+ key.offset + num_bytes - 1, GFP_NOFS);
+ cond_resched();
+ }
+ iput(inode);
+ return 0;
+}
+
+static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_block_group_cache *group,
+ struct inode *reloc_inode)
+{
+ struct btrfs_key key;
+ struct btrfs_key extent_key;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_leaf_ref *ref;
+ struct disk_extent *new_extent;
+ u64 bytenr;
+ u64 num_bytes;
+ u32 nritems;
+ u32 i;
+ int ext_index;
+ int nr_extent;
+ int ret;
+
+ new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
+ BUG_ON(!new_extent);
+
+ ref = btrfs_lookup_leaf_ref(root, leaf->start);
+ BUG_ON(!ref);
+
+ ext_index = -1;
+ nritems = btrfs_header_nritems(leaf);
+ for (i = 0; i < nritems; i++) {
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ if (bytenr == 0)
+ continue;
+
+ ext_index++;
+ if (bytenr >= group->key.objectid + group->key.offset ||
+ bytenr + num_bytes <= group->key.objectid)
+ continue;
+
+ extent_key.objectid = bytenr;
+ extent_key.offset = num_bytes;
+ extent_key.type = BTRFS_EXTENT_ITEM_KEY;
+ nr_extent = 1;
+ ret = get_new_locations(reloc_inode, &extent_key,
+ group->key.objectid, 1,
+ &new_extent, &nr_extent);
+ if (ret > 0)
+ continue;
+ BUG_ON(ret < 0);
+
+ BUG_ON(ref->extents[ext_index].bytenr != bytenr);
+ BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
+ ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
+ ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
+
+ btrfs_set_file_extent_disk_bytenr(leaf, fi,
+ new_extent->disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+ new_extent->disk_num_bytes);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_inc_extent_ref(trans, root,
+ new_extent->disk_bytenr,
+ new_extent->disk_num_bytes,
+ leaf->start,
+ root->root_key.objectid,
+ trans->transid, key.objectid);
+ BUG_ON(ret);
+ ret = btrfs_free_extent(trans, root,
+ bytenr, num_bytes, leaf->start,
+ btrfs_header_owner(leaf),
+ btrfs_header_generation(leaf),
+ key.objectid, 0);
+ BUG_ON(ret);
+ cond_resched();
+ }
+ kfree(new_extent);
+ BUG_ON(ext_index + 1 != ref->nritems);
+ btrfs_free_leaf_ref(root, ref);
+ return 0;
+}
+
+int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ int ret;
+
+ if (root->reloc_root) {
+ reloc_root = root->reloc_root;
+ root->reloc_root = NULL;
+ list_add(&reloc_root->dead_list,
+ &root->fs_info->dead_reloc_roots);
+
+ btrfs_set_root_bytenr(&reloc_root->root_item,
+ reloc_root->node->start);
+ btrfs_set_root_level(&root->root_item,
+ btrfs_header_level(reloc_root->node));
+ memset(&reloc_root->root_item.drop_progress, 0,
+ sizeof(struct btrfs_disk_key));
+ reloc_root->root_item.drop_level = 0;
+
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &reloc_root->root_key,
+ &reloc_root->root_item);
+ BUG_ON(ret);
+ }
+ return 0;
+}
+
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *reloc_root;
+ struct btrfs_root *prev_root = NULL;
+ struct list_head dead_roots;
+ int ret;
+ unsigned long nr;
+
+ INIT_LIST_HEAD(&dead_roots);
+ list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
+
+ while (!list_empty(&dead_roots)) {
+ reloc_root = list_entry(dead_roots.prev,
+ struct btrfs_root, dead_list);
+ list_del_init(&reloc_root->dead_list);
+
+ BUG_ON(reloc_root->commit_root != NULL);
+ while (1) {
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(!trans);
+
+ mutex_lock(&root->fs_info->drop_mutex);
+ ret = btrfs_drop_snapshot(trans, reloc_root);
+ if (ret != -EAGAIN)
+ break;
+ mutex_unlock(&root->fs_info->drop_mutex);
+
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction(trans, root);
+ BUG_ON(ret);
+ btrfs_btree_balance_dirty(root, nr);
+ }
+
+ free_extent_buffer(reloc_root->node);
+
+ ret = btrfs_del_root(trans, root->fs_info->tree_root,
+ &reloc_root->root_key);
+ BUG_ON(ret);
+ mutex_unlock(&root->fs_info->drop_mutex);
+
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction(trans, root);
+ BUG_ON(ret);
+ btrfs_btree_balance_dirty(root, nr);
+
+ kfree(prev_root);
+ prev_root = reloc_root;
+ }
+ if (prev_root) {
+ btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
+ kfree(prev_root);
+ }
+ return 0;
+}
+
+int btrfs_add_dead_reloc_root(struct btrfs_root *root)
+{
+ list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
+ return 0;
+}
+
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key location;
+ int found;
+ int ret;
+
+ mutex_lock(&root->fs_info->tree_reloc_mutex);
+ ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
+ BUG_ON(ret);
+ found = !list_empty(&root->fs_info->dead_reloc_roots);
+ mutex_unlock(&root->fs_info->tree_reloc_mutex);
+
+ if (found) {
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ }
+
+ location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+ location.offset = (u64)-1;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+
+ reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+ BUG_ON(!reloc_root);
+ btrfs_orphan_cleanup(reloc_root);
+ return 0;
+}
+
+static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct extent_buffer *eb;
+ struct btrfs_root_item *root_item;
+ struct btrfs_key root_key;
+ int ret;
+
+ BUG_ON(!root->ref_cows);
+ if (root->reloc_root)
+ return 0;
+
+ root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+ BUG_ON(!root_item);
+
+ ret = btrfs_copy_root(trans, root, root->commit_root,
+ &eb, BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+
+ root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+ root_key.offset = root->root_key.objectid;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+
+ memcpy(root_item, &root->root_item, sizeof(root_item));
+ btrfs_set_root_refs(root_item, 0);
+ btrfs_set_root_bytenr(root_item, eb->start);
+ btrfs_set_root_level(root_item, btrfs_header_level(eb));
+ btrfs_set_root_generation(root_item, trans->transid);
+
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+
+ ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+ &root_key, root_item);
+ BUG_ON(ret);
+ kfree(root_item);
+
+ reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+ &root_key);
+ BUG_ON(!reloc_root);
+ reloc_root->last_trans = trans->transid;
+ reloc_root->commit_root = NULL;
+ reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
+
+ root->reloc_root = reloc_root;
+ return 0;
+}
+
+/*
+ * Core function of space balance.
+ *
+ * The idea is using reloc trees to relocate tree blocks in reference
+ * counted roots. There is one reloc tree for each subvol, and all
+ * reloc trees share same root key objectid. Reloc trees are snapshots
+ * of the latest committed roots of subvols (root->commit_root).
+ *
+ * To relocate a tree block referenced by a subvol, there are two steps.
+ * COW the block through subvol's reloc tree, then update block pointer
+ * in the subvol to point to the new block. Since all reloc trees share
+ * same root key objectid, doing special handing for tree blocks owned
+ * by them is easy. Once a tree block has been COWed in one reloc tree,
+ * we can use the resulting new block directly when the same block is
+ * required to COW again through other reloc trees. By this way, relocated
+ * tree blocks are shared between reloc trees, so they are also shared
+ * between subvols.
+ */
+static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *first_key,
+ struct btrfs_ref_path *ref_path,
+ struct btrfs_block_group_cache *group,
+ struct inode *reloc_inode)
+{
+ struct btrfs_root *reloc_root;
+ struct extent_buffer *eb = NULL;
+ struct btrfs_key *keys;
+ u64 *nodes;
+ int level;
+ int shared_level;
+ int lowest_level = 0;
+ int ret;
+
+ if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+ lowest_level = ref_path->owner_objectid;
+
+ if (!root->ref_cows) {
+ path->lowest_level = lowest_level;
+ ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
+ BUG_ON(ret < 0);
+ path->lowest_level = 0;
+ btrfs_release_path(root, path);
+ return 0;
+ }
+
+ mutex_lock(&root->fs_info->tree_reloc_mutex);
+ ret = init_reloc_tree(trans, root);
+ BUG_ON(ret);
+ reloc_root = root->reloc_root;
+
+ shared_level = ref_path->shared_level;
+ ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
+
+ keys = ref_path->node_keys;
+ nodes = ref_path->new_nodes;
+ memset(&keys[shared_level + 1], 0,
+ sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
+ memset(&nodes[shared_level + 1], 0,
+ sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
+
+ if (nodes[lowest_level] == 0) {
+ path->lowest_level = lowest_level;
+ ret = btrfs_search_slot(trans, reloc_root, first_key, path,
+ 0, 1);
+ BUG_ON(ret);
+ for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
+ eb = path->nodes[level];
+ if (!eb || eb == reloc_root->node)
+ break;
+ nodes[level] = eb->start;
+ if (level == 0)
+ btrfs_item_key_to_cpu(eb, &keys[level], 0);
+ else
+ btrfs_node_key_to_cpu(eb, &keys[level], 0);
+ }
+ if (nodes[0] &&
+ ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ eb = path->nodes[0];
+ ret = replace_extents_in_leaf(trans, reloc_root, eb,
+ group, reloc_inode);
+ BUG_ON(ret);
+ }
+ btrfs_release_path(reloc_root, path);
+ } else {
+ ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
+ lowest_level);
+ BUG_ON(ret);
+ }
+
+ /*
+ * replace tree blocks in the fs tree with tree blocks in
+ * the reloc tree.
+ */
+ ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
+ BUG_ON(ret < 0);
+
+ if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ ret = btrfs_search_slot(trans, reloc_root, first_key, path,
+ 0, 0);
+ BUG_ON(ret);
+ extent_buffer_get(path->nodes[0]);
+ eb = path->nodes[0];
+ btrfs_release_path(reloc_root, path);
+ ret = invalidate_extent_cache(reloc_root, eb, group, root);
+ BUG_ON(ret);
+ free_extent_buffer(eb);
+ }
+
+ mutex_unlock(&root->fs_info->tree_reloc_mutex);
+ path->lowest_level = 0;
+ return 0;
+}
+
+static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *first_key,
+ struct btrfs_ref_path *ref_path)
+{
+ int ret;
+
+ ret = relocate_one_path(trans, root, path, first_key,
+ ref_path, NULL, NULL);
+ BUG_ON(ret);
+
+ if (root == root->fs_info->extent_root)
+ btrfs_extent_post_op(trans, root);
+
+ return 0;
+}
+
+static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct btrfs_path *path,
+ struct btrfs_key *extent_key)
+{
+ int ret;
+
+ ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
+ if (ret)
+ goto out;
+ ret = btrfs_del_item(trans, extent_root, path);
+out:
+ btrfs_release_path(extent_root, path);
+ return ret;
+}
+
+static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_ref_path *ref_path)
+{
+ struct btrfs_key root_key;
+
+ root_key.objectid = ref_path->root_objectid;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ if (is_cowonly_root(ref_path->root_objectid))
+ root_key.offset = 0;
+ else
+ root_key.offset = (u64)-1;
+
+ return btrfs_read_fs_root_no_name(fs_info, &root_key);
+}
+
+static noinline int relocate_one_extent(struct btrfs_root *extent_root,
+ struct btrfs_path *path,
+ struct btrfs_key *extent_key,
+ struct btrfs_block_group_cache *group,
+ struct inode *reloc_inode, int pass)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *found_root;
+ struct btrfs_ref_path *ref_path = NULL;
+ struct disk_extent *new_extents = NULL;
+ int nr_extents = 0;
+ int loops;
+ int ret;
+ int level;
+ struct btrfs_key first_key;
+ u64 prev_block = 0;
+
+
+ trans = btrfs_start_transaction(extent_root, 1);
+ BUG_ON(!trans);
+
+ if (extent_key->objectid == 0) {
+ ret = del_extent_zero(trans, extent_root, path, extent_key);
+ goto out;
+ }
+
+ ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
+ if (!ref_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (loops = 0; ; loops++) {
+ if (loops == 0) {
+ ret = btrfs_first_ref_path(trans, extent_root, ref_path,
+ extent_key->objectid);
+ } else {
+ ret = btrfs_next_ref_path(trans, extent_root, ref_path);
+ }
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+
+ if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+ ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ continue;
+
+ found_root = read_ref_root(extent_root->fs_info, ref_path);
+ BUG_ON(!found_root);
+ /*
+ * for reference counted tree, only process reference paths
+ * rooted at the latest committed root.
+ */
+ if (found_root->ref_cows &&
+ ref_path->root_generation != found_root->root_key.offset)
+ continue;
+
+ if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ if (pass == 0) {
+ /*
+ * copy data extents to new locations
+ */
+ u64 group_start = group->key.objectid;
+ ret = relocate_data_extent(reloc_inode,
+ extent_key,
+ group_start);
+ if (ret < 0)
+ goto out;
+ break;
+ }
+ level = 0;
+ } else {
+ level = ref_path->owner_objectid;
+ }
+
+ if (prev_block != ref_path->nodes[level]) {
+ struct extent_buffer *eb;
+ u64 block_start = ref_path->nodes[level];
+ u64 block_size = btrfs_level_size(found_root, level);
+
+ eb = read_tree_block(found_root, block_start,
+ block_size, 0);
+ btrfs_tree_lock(eb);
+ BUG_ON(level != btrfs_header_level(eb));
+
+ if (level == 0)
+ btrfs_item_key_to_cpu(eb, &first_key, 0);
+ else
+ btrfs_node_key_to_cpu(eb, &first_key, 0);
+
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ prev_block = block_start;
+ }
+
+ btrfs_record_root_in_trans(found_root);
+ if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * try to update data extent references while
+ * keeping metadata shared between snapshots.
+ */
+ if (pass == 1) {
+ ret = relocate_one_path(trans, found_root,
+ path, &first_key, ref_path,
+ group, reloc_inode);
+ if (ret < 0)
+ goto out;
+ continue;
+ }
+ /*
+ * use fallback method to process the remaining
+ * references.
+ */
+ if (!new_extents) {
+ u64 group_start = group->key.objectid;
+ new_extents = kmalloc(sizeof(*new_extents),
+ GFP_NOFS);
+ nr_extents = 1;
+ ret = get_new_locations(reloc_inode,
+ extent_key,
+ group_start, 1,
+ &new_extents,
+ &nr_extents);
+ if (ret)
+ goto out;
+ }
+ ret = replace_one_extent(trans, found_root,
+ path, extent_key,
+ &first_key, ref_path,
+ new_extents, nr_extents);
+ } else {
+ ret = relocate_tree_block(trans, found_root, path,
+ &first_key, ref_path);
+ }
+ if (ret < 0)
+ goto out;
+ }
+ ret = 0;
+out:
+ btrfs_end_transaction(trans, extent_root);
+ kfree(new_extents);
+ kfree(ref_path);
+ return ret;
+}
+
+static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
+{
+ u64 num_devices;
+ u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
+ num_devices = root->fs_info->fs_devices->rw_devices;
+ if (num_devices == 1) {
+ stripped |= BTRFS_BLOCK_GROUP_DUP;
+ stripped = flags & ~stripped;
+
+ /* turn raid0 into single device chunks */
+ if (flags & BTRFS_BLOCK_GROUP_RAID0)
+ return stripped;
+
+ /* turn mirroring into duplication */
+ if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ return stripped | BTRFS_BLOCK_GROUP_DUP;
+ return flags;
+ } else {
+ /* they already had raid on here, just return */
+ if (flags & stripped)
+ return flags;
+
+ stripped |= BTRFS_BLOCK_GROUP_DUP;
+ stripped = flags & ~stripped;
+
+ /* switch duplicated blocks with raid1 */
+ if (flags & BTRFS_BLOCK_GROUP_DUP)
+ return stripped | BTRFS_BLOCK_GROUP_RAID1;
+
+ /* turn single device chunks into raid0 */
+ return stripped | BTRFS_BLOCK_GROUP_RAID0;
+ }
+ return flags;
+}
+
+static int __alloc_chunk_for_shrink(struct btrfs_root *root,
+ struct btrfs_block_group_cache *shrink_block_group,
+ int force)
+{
+ struct btrfs_trans_handle *trans;
+ u64 new_alloc_flags;
+ u64 calc;
+
+ spin_lock(&shrink_block_group->lock);
+ if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
+ spin_unlock(&shrink_block_group->lock);
+
+ trans = btrfs_start_transaction(root, 1);
+ spin_lock(&shrink_block_group->lock);
+
+ new_alloc_flags = update_block_group_flags(root,
+ shrink_block_group->flags);
+ if (new_alloc_flags != shrink_block_group->flags) {
+ calc =
+ btrfs_block_group_used(&shrink_block_group->item);
+ } else {
+ calc = shrink_block_group->key.offset;
+ }
+ spin_unlock(&shrink_block_group->lock);
+
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ calc + 2 * 1024 * 1024, new_alloc_flags, force);
+
+ btrfs_end_transaction(trans, root);
+ } else
+ spin_unlock(&shrink_block_group->lock);
+ return 0;
+}
+
+static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 size)
+{
+ struct btrfs_path *path;
+ struct btrfs_inode_item *item;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
+ memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+ btrfs_set_inode_generation(leaf, item, 1);
+ btrfs_set_inode_size(leaf, item, size);
+ btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *group)
+{
+ struct inode *inode = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root;
+ struct btrfs_key root_key;
+ u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+ int err = 0;
+
+ root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ if (IS_ERR(root))
+ return ERR_CAST(root);
+
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+
+ err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
+ if (err)
+ goto out;
+
+ err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
+ BUG_ON(err);
+
+ err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
+ group->key.offset, 0, group->key.offset,
+ 0, 0, 0);
+ BUG_ON(err);
+
+ inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
+ if (inode->i_state & I_NEW) {
+ BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->location.objectid = objectid;
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.offset = 0;
+ btrfs_read_locked_inode(inode);
+ unlock_new_inode(inode);
+ BUG_ON(is_bad_inode(inode));
+ } else {
+ BUG_ON(1);
+ }
+ BTRFS_I(inode)->index_cnt = group->key.objectid;
+
+ err = btrfs_orphan_add(trans, inode);
+out:
+ btrfs_end_transaction(trans, root);
+ if (err) {
+ if (inode)
+ iput(inode);
+ inode = ERR_PTR(err);
+ }
+ return inode;
+}
+
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+{
+
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_sector_sum *sector_sum;
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct list_head list;
+ size_t offset;
+ int ret;
+ u64 disk_bytenr;
+
+ INIT_LIST_HEAD(&list);
+
+ ordered = btrfs_lookup_ordered_extent(inode, file_pos);
+ BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
+
+ disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+ ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
+ disk_bytenr + len - 1, &list);
+
+ while (!list_empty(&list)) {
+ sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+ list_del_init(&sums->list);
+
+ sector_sum = sums->sums;
+ sums->bytenr = ordered->start;
+
+ offset = 0;
+ while (offset < sums->len) {
+ sector_sum->bytenr += ordered->start - disk_bytenr;
+ sector_sum++;
+ offset += root->sectorsize;
+ }
+
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ }
+ btrfs_put_ordered_extent(ordered);
+ return 0;
+}
+
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_fs_info *info = root->fs_info;
+ struct extent_buffer *leaf;
+ struct inode *reloc_inode;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_key key;
+ u64 skipped;
+ u64 cur_byte;
+ u64 total_found;
+ u32 nritems;
+ int ret;
+ int progress;
+ int pass = 0;
+
+ root = root->fs_info->extent_root;
+
+ block_group = btrfs_lookup_block_group(info, group_start);
+ BUG_ON(!block_group);
+
+ printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
+ (unsigned long long)block_group->key.objectid,
+ (unsigned long long)block_group->flags);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ reloc_inode = create_reloc_inode(info, block_group);
+ BUG_ON(IS_ERR(reloc_inode));
+
+ __alloc_chunk_for_shrink(root, block_group, 1);
+ set_block_group_readonly(block_group);
+
+ btrfs_start_delalloc_inodes(info->tree_root);
+ btrfs_wait_ordered_extents(info->tree_root, 0);
+again:
+ skipped = 0;
+ total_found = 0;
+ progress = 0;
+ key.objectid = block_group->key.objectid;
+ key.offset = 0;
+ key.type = 0;
+ cur_byte = key.objectid;
+
+ trans = btrfs_start_transaction(info->tree_root, 1);
+ btrfs_commit_transaction(trans, info->tree_root);
+
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_clean_old_snapshots(info->tree_root);
+ btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+next:
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ ret = 0;
+ break;
+ }
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid >= block_group->key.objectid +
+ block_group->key.offset)
+ break;
+
+ if (progress && need_resched()) {
+ btrfs_release_path(root, path);
+ cond_resched();
+ progress = 0;
+ continue;
+ }
+ progress = 1;
+
+ if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
+ key.objectid + key.offset <= cur_byte) {
+ path->slots[0]++;
+ goto next;
+ }
+
+ total_found++;
+ cur_byte = key.objectid + key.offset;
+ btrfs_release_path(root, path);
+
+ __alloc_chunk_for_shrink(root, block_group, 0);
+ ret = relocate_one_extent(root, path, &key, block_group,
+ reloc_inode, pass);
+ BUG_ON(ret < 0);
+ if (ret > 0)
+ skipped++;
+
+ key.objectid = cur_byte;
+ key.type = 0;
+ key.offset = 0;
+ }
+
+ btrfs_release_path(root, path);
+
+ if (pass == 0) {
+ btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
+ invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
+ }
+
+ if (total_found > 0) {
+ printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
+ (unsigned long long)total_found, pass);
+ pass++;
+ if (total_found == skipped && pass > 2) {
+ iput(reloc_inode);
+ reloc_inode = create_reloc_inode(info, block_group);
+ pass = 0;
+ }
+ goto again;
+ }
+
+ /* delete reloc_inode */
+ iput(reloc_inode);
+
+ /* unpin extents in this range */
+ trans = btrfs_start_transaction(info->tree_root, 1);
+ btrfs_commit_transaction(trans, info->tree_root);
+
+ spin_lock(&block_group->lock);
+ WARN_ON(block_group->pinned > 0);
+ WARN_ON(block_group->reserved > 0);
+ WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
+ spin_unlock(&block_group->lock);
+ put_block_group(block_group);
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int find_first_block_group(struct btrfs_root *root,
+ struct btrfs_path *path, struct btrfs_key *key)
+{
+ int ret = 0;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ int slot;
+
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+ break;
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.objectid >= key->objectid &&
+ found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+ path->slots[0]++;
+ }
+ ret = -ENOENT;
+out:
+ return ret;
+}
+
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct rb_node *n;
+
+ spin_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ block_group = rb_entry(n, struct btrfs_block_group_cache,
+ cache_node);
+ rb_erase(&block_group->cache_node,
+ &info->block_group_cache_tree);
+ spin_unlock(&info->block_group_cache_lock);
+
+ btrfs_remove_free_space_cache(block_group);
+ down_write(&block_group->space_info->groups_sem);
+ list_del(&block_group->list);
+ up_write(&block_group->space_info->groups_sem);
+
+ WARN_ON(atomic_read(&block_group->count) != 1);
+ kfree(block_group);
+
+ spin_lock(&info->block_group_cache_lock);
+ }
+ spin_unlock(&info->block_group_cache_lock);
+ return 0;
+}
+
+int btrfs_read_block_groups(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *space_info;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+
+ root = info->extent_root;
+ key.objectid = 0;
+ key.offset = 0;
+ btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ ret = find_first_block_group(root, path, &key);
+ if (ret > 0) {
+ ret = 0;
+ goto error;
+ }
+ if (ret != 0)
+ goto error;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ atomic_set(&cache->count, 1);
+ spin_lock_init(&cache->lock);
+ mutex_init(&cache->alloc_mutex);
+ mutex_init(&cache->cache_mutex);
+ INIT_LIST_HEAD(&cache->list);
+ read_extent_buffer(leaf, &cache->item,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(cache->item));
+ memcpy(&cache->key, &found_key, sizeof(found_key));
+
+ key.objectid = found_key.objectid + found_key.offset;
+ btrfs_release_path(root, path);
+ cache->flags = btrfs_block_group_flags(&cache->item);
+
+ ret = update_space_info(info, cache->flags, found_key.offset,
+ btrfs_block_group_used(&cache->item),
+ &space_info);
+ BUG_ON(ret);
+ cache->space_info = space_info;
+ down_write(&space_info->groups_sem);
+ list_add_tail(&cache->list, &space_info->block_groups);
+ up_write(&space_info->groups_sem);
+
+ ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ BUG_ON(ret);
+
+ set_avail_alloc_bits(root->fs_info, cache->flags);
+ if (btrfs_chunk_readonly(root, cache->key.objectid))
+ set_block_group_readonly(cache);
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytes_used,
+ u64 type, u64 chunk_objectid, u64 chunk_offset,
+ u64 size)
+{
+ int ret;
+ struct btrfs_root *extent_root;
+ struct btrfs_block_group_cache *cache;
+
+ extent_root = root->fs_info->extent_root;
+
+ root->fs_info->last_trans_new_blockgroup = trans->transid;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return -ENOMEM;
+
+ cache->key.objectid = chunk_offset;
+ cache->key.offset = size;
+ cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ atomic_set(&cache->count, 1);
+ spin_lock_init(&cache->lock);
+ mutex_init(&cache->alloc_mutex);
+ mutex_init(&cache->cache_mutex);
+ INIT_LIST_HEAD(&cache->list);
+
+ btrfs_set_block_group_used(&cache->item, bytes_used);
+ btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
+ cache->flags = type;
+ btrfs_set_block_group_flags(&cache->item, type);
+
+ ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
+ &cache->space_info);
+ BUG_ON(ret);
+ down_write(&cache->space_info->groups_sem);
+ list_add_tail(&cache->list, &cache->space_info->block_groups);
+ up_write(&cache->space_info->groups_sem);
+
+ ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ BUG_ON(ret);
+
+ ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
+ sizeof(cache->item));
+ BUG_ON(ret);
+
+ finish_current_insert(trans, extent_root, 0);
+ ret = del_pending_extents(trans, extent_root, 0);
+ BUG_ON(ret);
+ set_avail_alloc_bits(extent_root->fs_info, type);
+
+ return 0;
+}
+
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 group_start)
+{
+ struct btrfs_path *path;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_key key;
+ int ret;
+
+ root = root->fs_info->extent_root;
+
+ block_group = btrfs_lookup_block_group(root->fs_info, group_start);
+ BUG_ON(!block_group);
+ BUG_ON(!block_group->ro);
+
+ memcpy(&key, &block_group->key, sizeof(key));
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ btrfs_remove_free_space_cache(block_group);
+ rb_erase(&block_group->cache_node,
+ &root->fs_info->block_group_cache_tree);
+ down_write(&block_group->space_info->groups_sem);
+ list_del(&block_group->list);
+ up_write(&block_group->space_info->groups_sem);
+
+ spin_lock(&block_group->space_info->lock);
+ block_group->space_info->total_bytes -= block_group->key.offset;
+ block_group->space_info->bytes_readonly -= block_group->key.offset;
+ spin_unlock(&block_group->space_info->lock);
+ block_group->space_info->full = 0;
+
+ put_block_group(block_group);
+ put_block_group(block_group);
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -EIO;
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 00000000000..e086d407f1f
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3717 @@
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/version.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "extent_io.h"
+#include "extent_map.h"
+#include "compat.h"
+#include "ctree.h"
+#include "btrfs_inode.h"
+
+/* temporary define until extent_map moves out of btrfs */
+struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
+ unsigned long extra_flags,
+ void (*ctor)(void *, struct kmem_cache *,
+ unsigned long));
+
+static struct kmem_cache *extent_state_cache;
+static struct kmem_cache *extent_buffer_cache;
+
+static LIST_HEAD(buffers);
+static LIST_HEAD(states);
+
+#define LEAK_DEBUG 0
+#ifdef LEAK_DEBUG
+static DEFINE_SPINLOCK(leak_lock);
+#endif
+
+#define BUFFER_LRU_MAX 64
+
+struct tree_entry {
+ u64 start;
+ u64 end;
+ struct rb_node rb_node;
+};
+
+struct extent_page_data {
+ struct bio *bio;
+ struct extent_io_tree *tree;
+ get_extent_t *get_extent;
+
+ /* tells writepage not to lock the state bits for this range
+ * it still does the unlocking
+ */
+ int extent_locked;
+};
+
+int __init extent_io_init(void)
+{
+ extent_state_cache = btrfs_cache_create("extent_state",
+ sizeof(struct extent_state), 0,
+ NULL);
+ if (!extent_state_cache)
+ return -ENOMEM;
+
+ extent_buffer_cache = btrfs_cache_create("extent_buffers",
+ sizeof(struct extent_buffer), 0,
+ NULL);
+ if (!extent_buffer_cache)
+ goto free_state_cache;
+ return 0;
+
+free_state_cache:
+ kmem_cache_destroy(extent_state_cache);
+ return -ENOMEM;
+}
+
+void extent_io_exit(void)
+{
+ struct extent_state *state;
+ struct extent_buffer *eb;
+
+ while (!list_empty(&states)) {
+ state = list_entry(states.next, struct extent_state, leak_list);
+ printk(KERN_ERR "btrfs state leak: start %llu end %llu "
+ "state %lu in tree %p refs %d\n",
+ (unsigned long long)state->start,
+ (unsigned long long)state->end,
+ state->state, state->tree, atomic_read(&state->refs));
+ list_del(&state->leak_list);
+ kmem_cache_free(extent_state_cache, state);
+
+ }
+
+ while (!list_empty(&buffers)) {
+ eb = list_entry(buffers.next, struct extent_buffer, leak_list);
+ printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
+ "refs %d\n", (unsigned long long)eb->start,
+ eb->len, atomic_read(&eb->refs));
+ list_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
+ }
+ if (extent_state_cache)
+ kmem_cache_destroy(extent_state_cache);
+ if (extent_buffer_cache)
+ kmem_cache_destroy(extent_buffer_cache);
+}
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+ struct address_space *mapping, gfp_t mask)
+{
+ tree->state.rb_node = NULL;
+ tree->buffer.rb_node = NULL;
+ tree->ops = NULL;
+ tree->dirty_bytes = 0;
+ spin_lock_init(&tree->lock);
+ spin_lock_init(&tree->buffer_lock);
+ tree->mapping = mapping;
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+ struct extent_state *state;
+#ifdef LEAK_DEBUG
+ unsigned long flags;
+#endif
+
+ state = kmem_cache_alloc(extent_state_cache, mask);
+ if (!state)
+ return state;
+ state->state = 0;
+ state->private = 0;
+ state->tree = NULL;
+#ifdef LEAK_DEBUG
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(&state->leak_list, &states);
+ spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+ atomic_set(&state->refs, 1);
+ init_waitqueue_head(&state->wq);
+ return state;
+}
+
+static void free_extent_state(struct extent_state *state)
+{
+ if (!state)
+ return;
+ if (atomic_dec_and_test(&state->refs)) {
+#ifdef LEAK_DEBUG
+ unsigned long flags;
+#endif
+ WARN_ON(state->tree);
+#ifdef LEAK_DEBUG
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&state->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct tree_entry *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct tree_entry, rb_node);
+
+ if (offset < entry->start)
+ p = &(*p)->rb_left;
+ else if (offset > entry->end)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ entry = rb_entry(node, struct tree_entry, rb_node);
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
+ struct rb_node **prev_ret,
+ struct rb_node **next_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node *n = root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev = NULL;
+ struct tree_entry *entry;
+ struct tree_entry *prev_entry = NULL;
+
+ while (n) {
+ entry = rb_entry(n, struct tree_entry, rb_node);
+ prev = n;
+ prev_entry = entry;
+
+ if (offset < entry->start)
+ n = n->rb_left;
+ else if (offset > entry->end)
+ n = n->rb_right;
+ else
+ return n;
+ }
+
+ if (prev_ret) {
+ orig_prev = prev;
+ while (prev && offset > prev_entry->end) {
+ prev = rb_next(prev);
+ prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+ }
+ *prev_ret = prev;
+ prev = orig_prev;
+ }
+
+ if (next_ret) {
+ prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+ while (prev && offset < prev_entry->start) {
+ prev = rb_prev(prev);
+ prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+ }
+ *next_ret = prev;
+ }
+ return NULL;
+}
+
+static inline struct rb_node *tree_search(struct extent_io_tree *tree,
+ u64 offset)
+{
+ struct rb_node *prev = NULL;
+ struct rb_node *ret;
+
+ ret = __etree_search(tree, offset, &prev, NULL);
+ if (!ret)
+ return prev;
+ return ret;
+}
+
+static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
+ u64 offset, struct rb_node *node)
+{
+ struct rb_root *root = &tree->buffer;
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_buffer *eb;
+
+ while (*p) {
+ parent = *p;
+ eb = rb_entry(parent, struct extent_buffer, rb_node);
+
+ if (offset < eb->start)
+ p = &(*p)->rb_left;
+ else if (offset > eb->start)
+ p = &(*p)->rb_right;
+ else
+ return eb;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
+ u64 offset)
+{
+ struct rb_root *root = &tree->buffer;
+ struct rb_node *n = root->rb_node;
+ struct extent_buffer *eb;
+
+ while (n) {
+ eb = rb_entry(n, struct extent_buffer, rb_node);
+ if (offset < eb->start)
+ n = n->rb_left;
+ else if (offset > eb->start)
+ n = n->rb_right;
+ else
+ return eb;
+ }
+ return NULL;
+}
+
+/*
+ * utility function to look for merge candidates inside a given range.
+ * Any extents with matching state are merged together into a single
+ * extent in the tree. Extents with EXTENT_IO in their state field
+ * are not merged because the end_io handlers need to be able to do
+ * operations on them without sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static int merge_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+{
+ struct extent_state *other;
+ struct rb_node *other_node;
+
+ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+ return 0;
+
+ other_node = rb_prev(&state->rb_node);
+ if (other_node) {
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->end == state->start - 1 &&
+ other->state == state->state) {
+ state->start = other->start;
+ other->tree = NULL;
+ rb_erase(&other->rb_node, &tree->state);
+ free_extent_state(other);
+ }
+ }
+ other_node = rb_next(&state->rb_node);
+ if (other_node) {
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->start == state->end + 1 &&
+ other->state == state->state) {
+ other->start = state->start;
+ state->tree = NULL;
+ rb_erase(&state->rb_node, &tree->state);
+ free_extent_state(state);
+ }
+ }
+ return 0;
+}
+
+static void set_state_cb(struct extent_io_tree *tree,
+ struct extent_state *state,
+ unsigned long bits)
+{
+ if (tree->ops && tree->ops->set_bit_hook) {
+ tree->ops->set_bit_hook(tree->mapping->host, state->start,
+ state->end, state->state, bits);
+ }
+}
+
+static void clear_state_cb(struct extent_io_tree *tree,
+ struct extent_state *state,
+ unsigned long bits)
+{
+ if (tree->ops && tree->ops->clear_bit_hook) {
+ tree->ops->clear_bit_hook(tree->mapping->host, state->start,
+ state->end, state->state, bits);
+ }
+}
+
+/*
+ * insert an extent_state struct into the tree. 'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally. This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+ struct extent_state *state, u64 start, u64 end,
+ int bits)
+{
+ struct rb_node *node;
+
+ if (end < start) {
+ printk(KERN_ERR "btrfs end < start %llu %llu\n",
+ (unsigned long long)end,
+ (unsigned long long)start);
+ WARN_ON(1);
+ }
+ if (bits & EXTENT_DIRTY)
+ tree->dirty_bytes += end - start + 1;
+ set_state_cb(tree, state, bits);
+ state->state |= bits;
+ state->start = start;
+ state->end = end;
+ node = tree_insert(&tree->state, end, &state->rb_node);
+ if (node) {
+ struct extent_state *found;
+ found = rb_entry(node, struct extent_state, rb_node);
+ printk(KERN_ERR "btrfs found node %llu %llu on insert of "
+ "%llu %llu\n", (unsigned long long)found->start,
+ (unsigned long long)found->end,
+ (unsigned long long)start, (unsigned long long)end);
+ free_extent_state(state);
+ return -EEXIST;
+ }
+ state->tree = tree;
+ merge_state(tree, state);
+ return 0;
+}
+
+/*
+ * split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half. 'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end]. After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+ struct extent_state *prealloc, u64 split)
+{
+ struct rb_node *node;
+ prealloc->start = orig->start;
+ prealloc->end = split - 1;
+ prealloc->state = orig->state;
+ orig->start = split;
+
+ node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
+ if (node) {
+ struct extent_state *found;
+ found = rb_entry(node, struct extent_state, rb_node);
+ free_extent_state(prealloc);
+ return -EEXIST;
+ }
+ prealloc->tree = tree;
+ return 0;
+}
+
+/*
+ * utility function to clear some bits in an extent state struct.
+ * it will optionally wake up any one waiting on this state (wake == 1), or
+ * forcibly remove the state from the tree (delete == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static int clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state, int bits, int wake,
+ int delete)
+{
+ int ret = state->state & bits;
+
+ if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+ u64 range = state->end - state->start + 1;
+ WARN_ON(range > tree->dirty_bytes);
+ tree->dirty_bytes -= range;
+ }
+ clear_state_cb(tree, state, bits);
+ state->state &= ~bits;
+ if (wake)
+ wake_up(&state->wq);
+ if (delete || state->state == 0) {
+ if (state->tree) {
+ clear_state_cb(tree, state, state->state);
+ rb_erase(&state->rb_node, &tree->state);
+ state->tree = NULL;
+ free_extent_state(state);
+ } else {
+ WARN_ON(1);
+ }
+ } else {
+ merge_state(tree, state);
+ }
+ return ret;
+}
+
+/*
+ * clear some bits on a range in the tree. This may require splitting
+ * or inserting elements in the tree, so the gfp mask is used to
+ * indicate which allocations or sleeping are allowed.
+ *
+ * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
+ * the given range from the tree regardless of state (ie for truncate).
+ *
+ * the range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns < 0 on error, > 0 if any of the
+ * bits were already set, or zero if none of the bits were already set.
+ */
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int wake, int delete, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ int err;
+ int set = 0;
+
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ prealloc = alloc_extent_state(mask);
+ if (!prealloc)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find the extents that end after
+ * our range starts
+ */
+ node = tree_search(tree, start);
+ if (!node)
+ goto out;
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->start > end)
+ goto out;
+ WARN_ON(state->end < start);
+
+ /*
+ * | ---- desired range ---- |
+ * | state | or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip
+ * bits on second half.
+ *
+ * If the extent we found extends past our range, we
+ * just split and search again. It'll get split again
+ * the next time though.
+ *
+ * If the extent we found is inside our range, we clear
+ * the desired bit on it.
+ */
+
+ if (state->start < start) {
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+ err = split_state(tree, state, prealloc, start);
+ BUG_ON(err == -EEXIST);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ start = state->end + 1;
+ set |= clear_state_bit(tree, state, bits,
+ wake, delete);
+ } else {
+ start = state->start;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and clear the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+ err = split_state(tree, state, prealloc, end + 1);
+ BUG_ON(err == -EEXIST);
+
+ if (wake)
+ wake_up(&state->wq);
+ set |= clear_state_bit(tree, prealloc, bits,
+ wake, delete);
+ prealloc = NULL;
+ goto out;
+ }
+
+ start = state->end + 1;
+ set |= clear_state_bit(tree, state, bits, wake, delete);
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return set;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
+static int wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+ __releases(tree->lock)
+ __acquires(tree->lock)
+{
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
+ return 0;
+}
+
+/*
+ * waits for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function
+ */
+int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+{
+ struct extent_state *state;
+ struct rb_node *node;
+
+ spin_lock(&tree->lock);
+again:
+ while (1) {
+ /*
+ * this search will find all the extents that end after
+ * our range starts
+ */
+ node = tree_search(tree, start);
+ if (!node)
+ break;
+
+ state = rb_entry(node, struct extent_state, rb_node);
+
+ if (state->start > end)
+ goto out;
+
+ if (state->state & bits) {
+ start = state->start;
+ atomic_inc(&state->refs);
+ wait_on_state(tree, state);
+ free_extent_state(state);
+ goto again;
+ }
+ start = state->end + 1;
+
+ if (start > end)
+ break;
+
+ if (need_resched()) {
+ spin_unlock(&tree->lock);
+ cond_resched();
+ spin_lock(&tree->lock);
+ }
+ }
+out:
+ spin_unlock(&tree->lock);
+ return 0;
+}
+
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state,
+ int bits)
+{
+ if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+ u64 range = state->end - state->start + 1;
+ tree->dirty_bytes += range;
+ }
+ set_state_cb(tree, state, bits);
+ state->state |= bits;
+}
+
+/*
+ * set some bits on a range in the tree. This may require allocations
+ * or sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
+ * range already has the desired bits set. The start of the existing
+ * range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive
+ * This takes the tree lock.
+ */
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive, u64 *failed_start,
+ gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node *node;
+ int err = 0;
+ int set;
+ u64 last_start;
+ u64 last_end;
+again:
+ if (!prealloc && (mask & __GFP_WAIT)) {
+ prealloc = alloc_extent_state(mask);
+ if (!prealloc)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ err = insert_state(tree, prealloc, start, end, bits);
+ prealloc = NULL;
+ BUG_ON(err == -EEXIST);
+ goto out;
+ }
+
+ state = rb_entry(node, struct extent_state, rb_node);
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ set = state->state & bits;
+ if (set && exclusive) {
+ *failed_start = state->start;
+ err = -EEXIST;
+ goto out;
+ }
+ set_state_bits(tree, state, bits);
+ start = state->end + 1;
+ merge_state(tree, state);
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on
+ * second half.
+ *
+ * If the extent we found extends past our
+ * range, we just split and search again. It'll get split
+ * again the next time though.
+ *
+ * If the extent we found is inside our range, we set the
+ * desired bit on it.
+ */
+ if (state->start < start) {
+ set = state->state & bits;
+ if (exclusive && set) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ BUG_ON(err == -EEXIST);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits);
+ start = state->end + 1;
+ merge_state(tree, state);
+ } else {
+ start = state->start;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and
+ * ignore the extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+ err = insert_state(tree, prealloc, start, this_end,
+ bits);
+ prealloc = NULL;
+ BUG_ON(err == -EEXIST);
+ if (err)
+ goto out;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and set the bit
+ * on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ set = state->state & bits;
+ if (exclusive && set) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, end + 1);
+ BUG_ON(err == -EEXIST);
+
+ set_state_bits(tree, prealloc, bits);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+ goto search_again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
+/* wrappers around set/clear extent bit */
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+ mask);
+}
+
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+}
+
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, bits, 0, NULL,
+ mask);
+}
+
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+}
+
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_DIRTY,
+ 0, NULL, mask);
+}
+
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
+}
+
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+}
+
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+ mask);
+}
+
+static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
+}
+
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
+ mask);
+}
+
+static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
+}
+
+static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
+ 0, NULL, mask);
+}
+
+static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
+}
+
+int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
+}
+
+/*
+ * either insert or lock state struct between start and end use mask to tell
+ * us if waiting is desired.
+ */
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+ int err;
+ u64 failed_start;
+ while (1) {
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+ &failed_start, mask);
+ if (err == -EEXIST && (mask & __GFP_WAIT)) {
+ wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+ start = failed_start;
+ } else {
+ break;
+ }
+ WARN_ON(start > end);
+ }
+ return err;
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ int err;
+ u64 failed_start;
+
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+ &failed_start, mask);
+ if (err == -EEXIST) {
+ if (failed_start > start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, 1, 0, mask);
+ return 0;
+ }
+ return 1;
+}
+
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
+}
+
+/*
+ * helper function to set pages and extents in the tree dirty
+ */
+int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(tree->mapping, index);
+ BUG_ON(!page);
+ __set_page_dirty_nobuffers(page);
+ page_cache_release(page);
+ index++;
+ }
+ set_extent_dirty(tree, start, end, GFP_NOFS);
+ return 0;
+}
+
+/*
+ * helper function to set both pages and extents in the tree writeback
+ */
+static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(tree->mapping, index);
+ BUG_ON(!page);
+ set_page_writeback(page);
+ page_cache_release(page);
+ index++;
+ }
+ set_extent_writeback(tree, start, end, GFP_NOFS);
+ return 0;
+}
+
+/*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned, < 0 on error
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, int bits)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node)
+ goto out;
+
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->end >= start && (state->state & bits)) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
+ break;
+ }
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/* find the first state struct with 'bits' set after 'start', and
+ * return it. tree->lock must be held. NULL will returned if
+ * nothing was found after 'start'
+ */
+struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, int bits)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node)
+ goto out;
+
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->end >= start && (state->state & bits))
+ return state;
+
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ return NULL;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'. start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+ u64 *start, u64 *end, u64 max_bytes)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 found = 0;
+ u64 total_bytes = 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, cur_start);
+ if (!node) {
+ if (!found)
+ *end = (u64)-1;
+ goto out;
+ }
+
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (found && (state->start != cur_start ||
+ (state->state & EXTENT_BOUNDARY))) {
+ goto out;
+ }
+ if (!(state->state & EXTENT_DELALLOC)) {
+ if (!found)
+ *end = state->end;
+ goto out;
+ }
+ if (!found)
+ *start = state->start;
+ found++;
+ *end = state->end;
+ cur_start = state->end + 1;
+ node = rb_next(node);
+ if (!node)
+ break;
+ total_bytes += state->end - state->start + 1;
+ if (total_bytes >= max_bytes)
+ break;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return found;
+}
+
+static noinline int __unlock_for_delalloc(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end)
+{
+ int ret;
+ struct page *pages[16];
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+
+ if (index == locked_page->index && end_index == index)
+ return 0;
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long, nr_pages,
+ ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
+ if (pages[i] != locked_page)
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ cond_resched();
+ }
+ return 0;
+}
+
+static noinline int lock_delalloc_pages(struct inode *inode,
+ struct page *locked_page,
+ u64 delalloc_start,
+ u64 delalloc_end)
+{
+ unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+ unsigned long start_index = index;
+ unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+ unsigned long pages_locked = 0;
+ struct page *pages[16];
+ unsigned long nrpages;
+ int ret;
+ int i;
+
+ /* the caller is responsible for locking the start index */
+ if (index == locked_page->index && index == end_index)
+ return 0;
+
+ /* skip the page at the start index */
+ nrpages = end_index - index + 1;
+ while (nrpages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long,
+ nrpages, ARRAY_SIZE(pages)), pages);
+ if (ret == 0) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ /* now we have an array of pages, lock them all */
+ for (i = 0; i < ret; i++) {
+ /*
+ * the caller is taking responsibility for
+ * locked_page
+ */
+ if (pages[i] != locked_page) {
+ lock_page(pages[i]);
+ if (!PageDirty(pages[i]) ||
+ pages[i]->mapping != inode->i_mapping) {
+ ret = -EAGAIN;
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ goto done;
+ }
+ }
+ page_cache_release(pages[i]);
+ pages_locked++;
+ }
+ nrpages -= ret;
+ index += ret;
+ cond_resched();
+ }
+ ret = 0;
+done:
+ if (ret && pages_locked) {
+ __unlock_for_delalloc(inode, locked_page,
+ delalloc_start,
+ ((u64)(start_index + pages_locked - 1)) <<
+ PAGE_CACHE_SHIFT);
+ }
+ return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'. start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page,
+ u64 *start, u64 *end,
+ u64 max_bytes)
+{
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 found;
+ int ret;
+ int loops = 0;
+
+again:
+ /* step one, find a bunch of delalloc bytes starting at start */
+ delalloc_start = *start;
+ delalloc_end = 0;
+ found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+ max_bytes);
+ if (!found || delalloc_end <= *start) {
+ *start = delalloc_start;
+ *end = delalloc_end;
+ return found;
+ }
+
+ /*
+ * start comes from the offset of locked_page. We have to lock
+ * pages in order, so we can't process delalloc bytes before
+ * locked_page
+ */
+ if (delalloc_start < *start)
+ delalloc_start = *start;
+
+ /*
+ * make sure to limit the number of pages we try to lock down
+ * if we're looping.
+ */
+ if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
+ delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+
+ /* step two, lock all the pages after the page that has start */
+ ret = lock_delalloc_pages(inode, locked_page,
+ delalloc_start, delalloc_end);
+ if (ret == -EAGAIN) {
+ /* some of the pages are gone, lets avoid looping by
+ * shortening the size of the delalloc range we're searching
+ */
+ if (!loops) {
+ unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+ max_bytes = PAGE_CACHE_SIZE - offset;
+ loops = 1;
+ goto again;
+ } else {
+ found = 0;
+ goto out_failed;
+ }
+ }
+ BUG_ON(ret);
+
+ /* step three, lock the state bits for the whole range */
+ lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+
+ /* then test to make sure it is all still delalloc */
+ ret = test_range_bit(tree, delalloc_start, delalloc_end,
+ EXTENT_DELALLOC, 1);
+ if (!ret) {
+ unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+ __unlock_for_delalloc(inode, locked_page,
+ delalloc_start, delalloc_end);
+ cond_resched();
+ goto again;
+ }
+ *start = delalloc_start;
+ *end = delalloc_end;
+out_failed:
+ return found;
+}
+
+int extent_clear_unlock_delalloc(struct inode *inode,
+ struct extent_io_tree *tree,
+ u64 start, u64 end, struct page *locked_page,
+ int unlock_pages,
+ int clear_unlock,
+ int clear_delalloc, int clear_dirty,
+ int set_writeback,
+ int end_writeback)
+{
+ int ret;
+ struct page *pages[16];
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+ int clear_bits = 0;
+
+ if (clear_unlock)
+ clear_bits |= EXTENT_LOCKED;
+ if (clear_dirty)
+ clear_bits |= EXTENT_DIRTY;
+
+ if (clear_delalloc)
+ clear_bits |= EXTENT_DELALLOC;
+
+ clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+ if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
+ return 0;
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long,
+ nr_pages, ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
+ if (pages[i] == locked_page) {
+ page_cache_release(pages[i]);
+ continue;
+ }
+ if (clear_dirty)
+ clear_page_dirty_for_io(pages[i]);
+ if (set_writeback)
+ set_page_writeback(pages[i]);
+ if (end_writeback)
+ end_page_writeback(pages[i]);
+ if (unlock_pages)
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ cond_resched();
+ }
+ return 0;
+}
+
+/*
+ * count the number of bytes in the tree that have a given bit(s)
+ * set. This can be fairly slow, except for EXTENT_DIRTY which is
+ * cached. The total number found is returned.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ unsigned long bits)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 total_bytes = 0;
+ int found = 0;
+
+ if (search_end <= cur_start) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ spin_lock(&tree->lock);
+ if (cur_start == 0 && bits == EXTENT_DIRTY) {
+ total_bytes = tree->dirty_bytes;
+ goto out;
+ }
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, cur_start);
+ if (!node)
+ goto out;
+
+ while (1) {
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->start > search_end)
+ break;
+ if (state->end >= cur_start && (state->state & bits)) {
+ total_bytes += min(search_end, state->end) + 1 -
+ max(cur_start, state->start);
+ if (total_bytes >= max_bytes)
+ break;
+ if (!found) {
+ *start = state->start;
+ found = 1;
+ }
+ }
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return total_bytes;
+}
+
+#if 0
+/*
+ * helper function to lock both pages and extents in the tree.
+ * pages must be locked first.
+ */
+static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ int err;
+
+ while (index <= end_index) {
+ page = grab_cache_page(tree->mapping, index);
+ if (!page) {
+ err = -ENOMEM;
+ goto failed;
+ }
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto failed;
+ }
+ index++;
+ }
+ lock_extent(tree, start, end, GFP_NOFS);
+ return 0;
+
+failed:
+ /*
+ * we failed above in getting the page at 'index', so we undo here
+ * up to but not including the page at 'index'
+ */
+ end_index = index;
+ index = start >> PAGE_CACHE_SHIFT;
+ while (index < end_index) {
+ page = find_get_page(tree->mapping, index);
+ unlock_page(page);
+ page_cache_release(page);
+ index++;
+ }
+ return err;
+}
+
+/*
+ * helper function to unlock both pages and extents in the tree.
+ */
+static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ while (index <= end_index) {
+ page = find_get_page(tree->mapping, index);
+ unlock_page(page);
+ page_cache_release(page);
+ index++;
+ }
+ unlock_extent(tree, start, end, GFP_NOFS);
+ return 0;
+}
+#endif
+
+/*
+ * set the private field for a given byte offset in the tree. If there isn't
+ * an extent_state there already, this does nothing.
+ */
+int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ int ret = 0;
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ ret = -ENOENT;
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->start != start) {
+ ret = -ENOENT;
+ goto out;
+ }
+ state->private = private;
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
+{
+ struct rb_node *node;
+ struct extent_state *state;
+ int ret = 0;
+
+ spin_lock(&tree->lock);
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ if (!node) {
+ ret = -ENOENT;
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+ if (state->start != start) {
+ ret = -ENOENT;
+ goto out;
+ }
+ *private = state->private;
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * searches a range in the state tree for a given mask.
+ * If 'filled' == 1, this returns 1 only if every extent in the tree
+ * has the bits set. Otherwise, 1 is returned if any bit in the
+ * range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int filled)
+{
+ struct extent_state *state = NULL;
+ struct rb_node *node;
+ int bitset = 0;
+
+ spin_lock(&tree->lock);
+ node = tree_search(tree, start);
+ while (node && start <= end) {
+ state = rb_entry(node, struct extent_state, rb_node);
+
+ if (filled && state->start > start) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->start > end)
+ break;
+
+ if (state->state & bits) {
+ bitset = 1;
+ if (!filled)
+ break;
+ } else if (filled) {
+ bitset = 0;
+ break;
+ }
+ start = state->end + 1;
+ if (start > end)
+ break;
+ node = rb_next(node);
+ if (!node) {
+ if (filled)
+ bitset = 0;
+ break;
+ }
+ }
+ spin_unlock(&tree->lock);
+ return bitset;
+}
+
+/*
+ * helper function to set a given page up to date if all the
+ * extents in the tree for that page are up to date
+ */
+static int check_page_uptodate(struct extent_io_tree *tree,
+ struct page *page)
+{
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+ SetPageUptodate(page);
+ return 0;
+}
+
+/*
+ * helper function to unlock a page if all the extents in the tree
+ * for that page are unlocked
+ */
+static int check_page_locked(struct extent_io_tree *tree,
+ struct page *page)
+{
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+ unlock_page(page);
+ return 0;
+}
+
+/*
+ * helper function to end page writeback if all the extents
+ * in the tree for that page are done with writeback
+ */
+static int check_page_writeback(struct extent_io_tree *tree,
+ struct page *page)
+{
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
+ end_page_writeback(page);
+ return 0;
+}
+
+/* lots and lots of room for performance fixes in the end_bio funcs */
+
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static void end_bio_extent_writepage(struct bio *bio, int err)
+{
+ int uptodate = err == 0;
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct extent_io_tree *tree;
+ u64 start;
+ u64 end;
+ int whole_page;
+ int ret;
+
+ do {
+ struct page *page = bvec->bv_page;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+
+ if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+ whole_page = 1;
+ else
+ whole_page = 0;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+ if (tree->ops && tree->ops->writepage_end_io_hook) {
+ ret = tree->ops->writepage_end_io_hook(page, start,
+ end, NULL, uptodate);
+ if (ret)
+ uptodate = 0;
+ }
+
+ if (!uptodate && tree->ops &&
+ tree->ops->writepage_io_failed_hook) {
+ ret = tree->ops->writepage_io_failed_hook(bio, page,
+ start, end, NULL);
+ if (ret == 0) {
+ uptodate = (err == 0);
+ continue;
+ }
+ }
+
+ if (!uptodate) {
+ clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+
+ clear_extent_writeback(tree, start, end, GFP_ATOMIC);
+
+ if (whole_page)
+ end_page_writeback(page);
+ else
+ check_page_writeback(tree, page);
+ } while (bvec >= bio->bi_io_vec);
+
+ bio_put(bio);
+}
+
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static void end_bio_extent_readpage(struct bio *bio, int err)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct extent_io_tree *tree;
+ u64 start;
+ u64 end;
+ int whole_page;
+ int ret;
+
+ if (err)
+ uptodate = 0;
+
+ do {
+ struct page *page = bvec->bv_page;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+
+ if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+ whole_page = 1;
+ else
+ whole_page = 0;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+
+ if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
+ ret = tree->ops->readpage_end_io_hook(page, start, end,
+ NULL);
+ if (ret)
+ uptodate = 0;
+ }
+ if (!uptodate && tree->ops &&
+ tree->ops->readpage_io_failed_hook) {
+ ret = tree->ops->readpage_io_failed_hook(bio, page,
+ start, end, NULL);
+ if (ret == 0) {
+ uptodate =
+ test_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (err)
+ uptodate = 0;
+ continue;
+ }
+ }
+
+ if (uptodate) {
+ set_extent_uptodate(tree, start, end,
+ GFP_ATOMIC);
+ }
+ unlock_extent(tree, start, end, GFP_ATOMIC);
+
+ if (whole_page) {
+ if (uptodate) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ } else {
+ if (uptodate) {
+ check_page_uptodate(tree, page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ check_page_locked(tree, page);
+ }
+ } while (bvec >= bio->bi_io_vec);
+
+ bio_put(bio);
+}
+
+/*
+ * IO done from prepare_write is pretty simple, we just unlock
+ * the structs in the extent tree when done, and set the uptodate bits
+ * as appropriate.
+ */
+static void end_bio_extent_preparewrite(struct bio *bio, int err)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct extent_io_tree *tree;
+ u64 start;
+ u64 end;
+
+ do {
+ struct page *page = bvec->bv_page;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+
+ if (uptodate) {
+ set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+
+ unlock_extent(tree, start, end, GFP_ATOMIC);
+
+ } while (bvec >= bio->bi_io_vec);
+
+ bio_put(bio);
+}
+
+static struct bio *
+extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+ gfp_t gfp_flags)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(gfp_flags, nr_vecs);
+
+ if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (nr_vecs /= 2))
+ bio = bio_alloc(gfp_flags, nr_vecs);
+ }
+
+ if (bio) {
+ bio->bi_size = 0;
+ bio->bi_bdev = bdev;
+ bio->bi_sector = first_sector;
+ }
+ return bio;
+}
+
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ int ret = 0;
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct page *page = bvec->bv_page;
+ struct extent_io_tree *tree = bio->bi_private;
+ u64 start;
+ u64 end;
+
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+
+ bio->bi_private = NULL;
+
+ bio_get(bio);
+
+ if (tree->ops && tree->ops->submit_bio_hook)
+ tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+ mirror_num, bio_flags);
+ else
+ submit_bio(rw, bio);
+ if (bio_flagged(bio, BIO_EOPNOTSUPP))
+ ret = -EOPNOTSUPP;
+ bio_put(bio);
+ return ret;
+}
+
+static int submit_extent_page(int rw, struct extent_io_tree *tree,
+ struct page *page, sector_t sector,
+ size_t size, unsigned long offset,
+ struct block_device *bdev,
+ struct bio **bio_ret,
+ unsigned long max_pages,
+ bio_end_io_t end_io_func,
+ int mirror_num,
+ unsigned long prev_bio_flags,
+ unsigned long bio_flags)
+{
+ int ret = 0;
+ struct bio *bio;
+ int nr;
+ int contig = 0;
+ int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+ int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+ size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
+
+ if (bio_ret && *bio_ret) {
+ bio = *bio_ret;
+ if (old_compressed)
+ contig = bio->bi_sector == sector;
+ else
+ contig = bio->bi_sector + (bio->bi_size >> 9) ==
+ sector;
+
+ if (prev_bio_flags != bio_flags || !contig ||
+ (tree->ops && tree->ops->merge_bio_hook &&
+ tree->ops->merge_bio_hook(page, offset, page_size, bio,
+ bio_flags)) ||
+ bio_add_page(bio, page, page_size, offset) < page_size) {
+ ret = submit_one_bio(rw, bio, mirror_num,
+ prev_bio_flags);
+ bio = NULL;
+ } else {
+ return 0;
+ }
+ }
+ if (this_compressed)
+ nr = BIO_MAX_PAGES;
+ else
+ nr = bio_get_nr_vecs(bdev);
+
+ bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+
+ bio_add_page(bio, page, page_size, offset);
+ bio->bi_end_io = end_io_func;
+ bio->bi_private = tree;
+
+ if (bio_ret)
+ *bio_ret = bio;
+ else
+ ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
+
+ return ret;
+}
+
+void set_page_extent_mapped(struct page *page)
+{
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, EXTENT_PAGE_PRIVATE);
+ }
+}
+
+static void set_page_extent_head(struct page *page, unsigned long len)
+{
+ set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
+}
+
+/*
+ * basic readpage implementation. Locked extent state structs are inserted
+ * into the tree that are removed when the IO is done (by the end_io
+ * handlers)
+ */
+static int __extent_read_full_page(struct extent_io_tree *tree,
+ struct page *page,
+ get_extent_t *get_extent,
+ struct bio **bio, int mirror_num,
+ unsigned long *bio_flags)
+{
+ struct inode *inode = page->mapping->host;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 end;
+ u64 cur = start;
+ u64 extent_offset;
+ u64 last_byte = i_size_read(inode);
+ u64 block_start;
+ u64 cur_end;
+ sector_t sector;
+ struct extent_map *em;
+ struct block_device *bdev;
+ int ret;
+ int nr = 0;
+ size_t page_offset = 0;
+ size_t iosize;
+ size_t disk_io_size;
+ size_t blocksize = inode->i_sb->s_blocksize;
+ unsigned long this_bio_flag = 0;
+
+ set_page_extent_mapped(page);
+
+ end = page_end;
+ lock_extent(tree, start, end, GFP_NOFS);
+
+ if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+ char *userpage;
+ size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+ if (zero_offset) {
+ iosize = PAGE_CACHE_SIZE - zero_offset;
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + zero_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
+ }
+ }
+ while (cur <= end) {
+ if (cur >= last_byte) {
+ char *userpage;
+ iosize = PAGE_CACHE_SIZE - page_offset;
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + page_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
+ set_extent_uptodate(tree, cur, cur + iosize - 1,
+ GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ break;
+ }
+ em = get_extent(inode, page, page_offset, cur,
+ end - cur + 1, 0);
+ if (IS_ERR(em) || !em) {
+ SetPageError(page);
+ unlock_extent(tree, cur, end, GFP_NOFS);
+ break;
+ }
+ extent_offset = cur - em->start;
+ BUG_ON(extent_map_end(em) <= cur);
+ BUG_ON(end < cur);
+
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ this_bio_flag = EXTENT_BIO_COMPRESSED;
+
+ iosize = min(extent_map_end(em) - cur, end - cur + 1);
+ cur_end = min(extent_map_end(em) - 1, end);
+ iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+ if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+ disk_io_size = em->block_len;
+ sector = em->block_start >> 9;
+ } else {
+ sector = (em->block_start + extent_offset) >> 9;
+ disk_io_size = iosize;
+ }
+ bdev = em->bdev;
+ block_start = em->block_start;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ block_start = EXTENT_MAP_HOLE;
+ free_extent_map(em);
+ em = NULL;
+
+ /* we've found a hole, just zero and go on */
+ if (block_start == EXTENT_MAP_HOLE) {
+ char *userpage;
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + page_offset, 0, iosize);
+ flush_dcache_page(page);
+ kunmap_atomic(userpage, KM_USER0);
+
+ set_extent_uptodate(tree, cur, cur + iosize - 1,
+ GFP_NOFS);
+ unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ cur = cur + iosize;
+ page_offset += iosize;
+ continue;
+ }
+ /* the get_extent function already copied into the page */
+ if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+ check_page_uptodate(tree, page);
+ unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ cur = cur + iosize;
+ page_offset += iosize;
+ continue;
+ }
+ /* we have an inline extent but it didn't get marked up
+ * to date. Error out
+ */
+ if (block_start == EXTENT_MAP_INLINE) {
+ SetPageError(page);
+ unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ cur = cur + iosize;
+ page_offset += iosize;
+ continue;
+ }
+
+ ret = 0;
+ if (tree->ops && tree->ops->readpage_io_hook) {
+ ret = tree->ops->readpage_io_hook(page, cur,
+ cur + iosize - 1);
+ }
+ if (!ret) {
+ unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+ pnr -= page->index;
+ ret = submit_extent_page(READ, tree, page,
+ sector, disk_io_size, page_offset,
+ bdev, bio, pnr,
+ end_bio_extent_readpage, mirror_num,
+ *bio_flags,
+ this_bio_flag);
+ nr++;
+ *bio_flags = this_bio_flag;
+ }
+ if (ret)
+ SetPageError(page);
+ cur = cur + iosize;
+ page_offset += iosize;
+ }
+ if (!nr) {
+ if (!PageError(page))
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ return 0;
+}
+
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent)
+{
+ struct bio *bio = NULL;
+ unsigned long bio_flags = 0;
+ int ret;
+
+ ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+ &bio_flags);
+ if (bio)
+ submit_one_bio(READ, bio, 0, bio_flags);
+ return ret;
+}
+
+/*
+ * the writepage semantics are similar to regular writepage. extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback. Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct inode *inode = page->mapping->host;
+ struct extent_page_data *epd = data;
+ struct extent_io_tree *tree = epd->tree;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 delalloc_start;
+ u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 end;
+ u64 cur = start;
+ u64 extent_offset;
+ u64 last_byte = i_size_read(inode);
+ u64 block_start;
+ u64 iosize;
+ u64 unlock_start;
+ sector_t sector;
+ struct extent_map *em;
+ struct block_device *bdev;
+ int ret;
+ int nr = 0;
+ size_t pg_offset = 0;
+ size_t blocksize;
+ loff_t i_size = i_size_read(inode);
+ unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+ u64 nr_delalloc;
+ u64 delalloc_end;
+ int page_started;
+ int compressed;
+ unsigned long nr_written = 0;
+
+ WARN_ON(!PageLocked(page));
+ pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+ if (page->index > end_index ||
+ (page->index == end_index && !pg_offset)) {
+ page->mapping->a_ops->invalidatepage(page, 0);
+ unlock_page(page);
+ return 0;
+ }
+
+ if (page->index == end_index) {
+ char *userpage;
+
+ userpage = kmap_atomic(page, KM_USER0);
+ memset(userpage + pg_offset, 0,
+ PAGE_CACHE_SIZE - pg_offset);
+ kunmap_atomic(userpage, KM_USER0);
+ flush_dcache_page(page);
+ }
+ pg_offset = 0;
+
+ set_page_extent_mapped(page);
+
+ delalloc_start = start;
+ delalloc_end = 0;
+ page_started = 0;
+ if (!epd->extent_locked) {
+ while (delalloc_end < page_end) {
+ nr_delalloc = find_lock_delalloc_range(inode, tree,
+ page,
+ &delalloc_start,
+ &delalloc_end,
+ 128 * 1024 * 1024);
+ if (nr_delalloc == 0) {
+ delalloc_start = delalloc_end + 1;
+ continue;
+ }
+ tree->ops->fill_delalloc(inode, page, delalloc_start,
+ delalloc_end, &page_started,
+ &nr_written);
+ delalloc_start = delalloc_end + 1;
+ }
+
+ /* did the fill delalloc function already unlock and start
+ * the IO?
+ */
+ if (page_started) {
+ ret = 0;
+ goto update_nr_written;
+ }
+ }
+ lock_extent(tree, start, page_end, GFP_NOFS);
+
+ unlock_start = start;
+
+ if (tree->ops && tree->ops->writepage_start_hook) {
+ ret = tree->ops->writepage_start_hook(page, start,
+ page_end);
+ if (ret == -EAGAIN) {
+ unlock_extent(tree, start, page_end, GFP_NOFS);
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ ret = 0;
+ goto update_nr_written;
+ }
+ }
+
+ nr_written++;
+
+ end = page_end;
+ if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
+ printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
+
+ if (last_byte <= start) {
+ clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+ unlock_extent(tree, start, page_end, GFP_NOFS);
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, start,
+ page_end, NULL, 1);
+ unlock_start = page_end + 1;
+ goto done;
+ }
+
+ set_extent_uptodate(tree, start, page_end, GFP_NOFS);
+ blocksize = inode->i_sb->s_blocksize;
+
+ while (cur <= end) {
+ if (cur >= last_byte) {
+ clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+ unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, cur,
+ page_end, NULL, 1);
+ unlock_start = page_end + 1;
+ break;
+ }
+ em = epd->get_extent(inode, page, pg_offset, cur,
+ end - cur + 1, 1);
+ if (IS_ERR(em) || !em) {
+ SetPageError(page);
+ break;
+ }
+
+ extent_offset = cur - em->start;
+ BUG_ON(extent_map_end(em) <= cur);
+ BUG_ON(end < cur);
+ iosize = min(extent_map_end(em) - cur, end - cur + 1);
+ iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+ sector = (em->block_start + extent_offset) >> 9;
+ bdev = em->bdev;
+ block_start = em->block_start;
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ free_extent_map(em);
+ em = NULL;
+
+ /*
+ * compressed and inline extents are written through other
+ * paths in the FS
+ */
+ if (compressed || block_start == EXTENT_MAP_HOLE ||
+ block_start == EXTENT_MAP_INLINE) {
+ clear_extent_dirty(tree, cur,
+ cur + iosize - 1, GFP_NOFS);
+
+ unlock_extent(tree, unlock_start, cur + iosize - 1,
+ GFP_NOFS);
+
+ /*
+ * end_io notification does not happen here for
+ * compressed extents
+ */
+ if (!compressed && tree->ops &&
+ tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, cur,
+ cur + iosize - 1,
+ NULL, 1);
+ else if (compressed) {
+ /* we don't want to end_page_writeback on
+ * a compressed extent. this happens
+ * elsewhere
+ */
+ nr++;
+ }
+
+ cur += iosize;
+ pg_offset += iosize;
+ unlock_start = cur;
+ continue;
+ }
+ /* leave this out until we have a page_mkwrite call */
+ if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
+ EXTENT_DIRTY, 0)) {
+ cur = cur + iosize;
+ pg_offset += iosize;
+ continue;
+ }
+
+ clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
+ if (tree->ops && tree->ops->writepage_io_hook) {
+ ret = tree->ops->writepage_io_hook(page, cur,
+ cur + iosize - 1);
+ } else {
+ ret = 0;
+ }
+ if (ret) {
+ SetPageError(page);
+ } else {
+ unsigned long max_nr = end_index + 1;
+
+ set_range_writeback(tree, cur, cur + iosize - 1);
+ if (!PageWriteback(page)) {
+ printk(KERN_ERR "btrfs warning page %lu not "
+ "writeback, cur %llu end %llu\n",
+ page->index, (unsigned long long)cur,
+ (unsigned long long)end);
+ }
+
+ ret = submit_extent_page(WRITE, tree, page, sector,
+ iosize, pg_offset, bdev,
+ &epd->bio, max_nr,
+ end_bio_extent_writepage,
+ 0, 0, 0);
+ if (ret)
+ SetPageError(page);
+ }
+ cur = cur + iosize;
+ pg_offset += iosize;
+ nr++;
+ }
+done:
+ if (nr == 0) {
+ /* make sure the mapping tag for page dirty gets cleared */
+ set_page_writeback(page);
+ end_page_writeback(page);
+ }
+ if (unlock_start <= page_end)
+ unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
+ unlock_page(page);
+
+update_nr_written:
+ wbc->nr_to_write -= nr_written;
+ if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+ wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+ page->mapping->writeback_index = page->index + nr_written;
+ return 0;
+}
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them. If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ */
+static int extent_write_cache_pages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ struct writeback_control *wbc,
+ writepage_t writepage, void *data,
+ void (*flush_fn)(void *))
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ int ret = 0;
+ int done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ int scanned = 0;
+ int range_whole = 0;
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ return 0;
+ }
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ scanned = 1;
+ }
+retry:
+ while (!done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY, min(end - index,
+ (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ unsigned i;
+
+ scanned = 1;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point we hold neither mapping->tree_lock nor
+ * lock on the page itself: the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or even
+ * swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+ if (tree->ops && tree->ops->write_cache_pages_lock_hook)
+ tree->ops->write_cache_pages_lock_hook(page);
+ else
+ lock_page(page);
+
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!wbc->range_cyclic && page->index > end) {
+ done = 1;
+ unlock_page(page);
+ continue;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ if (PageWriteback(page))
+ flush_fn(data);
+ wait_on_page_writeback(page);
+ }
+
+ if (PageWriteback(page) ||
+ !clear_page_dirty_for_io(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ret = (*writepage)(page, wbc, data);
+
+ if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+ unlock_page(page);
+ ret = 0;
+ }
+ if (ret || wbc->nr_to_write <= 0)
+ done = 1;
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ done = 1;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+ return ret;
+}
+
+static noinline void flush_write_bio(void *data)
+{
+ struct extent_page_data *epd = data;
+ if (epd->bio) {
+ submit_one_bio(WRITE, epd->bio, 0, 0);
+ epd->bio = NULL;
+ }
+}
+
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc)
+{
+ int ret;
+ struct address_space *mapping = page->mapping;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .get_extent = get_extent,
+ .extent_locked = 0,
+ };
+ struct writeback_control wbc_writepages = {
+ .bdi = wbc->bdi,
+ .sync_mode = WB_SYNC_NONE,
+ .older_than_this = NULL,
+ .nr_to_write = 64,
+ .range_start = page_offset(page) + PAGE_CACHE_SIZE,
+ .range_end = (loff_t)-1,
+ };
+
+
+ ret = __extent_writepage(page, wbc, &epd);
+
+ extent_write_cache_pages(tree, mapping, &wbc_writepages,
+ __extent_writepage, &epd, flush_write_bio);
+ if (epd.bio)
+ submit_one_bio(WRITE, epd.bio, 0, 0);
+ return ret;
+}
+
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+ u64 start, u64 end, get_extent_t *get_extent,
+ int mode)
+{
+ int ret = 0;
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .get_extent = get_extent,
+ .extent_locked = 1,
+ };
+ struct writeback_control wbc_writepages = {
+ .bdi = inode->i_mapping->backing_dev_info,
+ .sync_mode = mode,
+ .older_than_this = NULL,
+ .nr_to_write = nr_pages * 2,
+ .range_start = start,
+ .range_end = end + 1,
+ };
+
+ while (start <= end) {
+ page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ if (clear_page_dirty_for_io(page))
+ ret = __extent_writepage(page, &wbc_writepages, &epd);
+ else {
+ if (tree->ops && tree->ops->writepage_end_io_hook)
+ tree->ops->writepage_end_io_hook(page, start,
+ start + PAGE_CACHE_SIZE - 1,
+ NULL, 1);
+ unlock_page(page);
+ }
+ page_cache_release(page);
+ start += PAGE_CACHE_SIZE;
+ }
+
+ if (epd.bio)
+ submit_one_bio(WRITE, epd.bio, 0, 0);
+ return ret;
+}
+
+int extent_writepages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ struct extent_page_data epd = {
+ .bio = NULL,
+ .tree = tree,
+ .get_extent = get_extent,
+ .extent_locked = 0,
+ };
+
+ ret = extent_write_cache_pages(tree, mapping, wbc,
+ __extent_writepage, &epd,
+ flush_write_bio);
+ if (epd.bio)
+ submit_one_bio(WRITE, epd.bio, 0, 0);
+ return ret;
+}
+
+int extent_readpages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages,
+ get_extent_t get_extent)
+{
+ struct bio *bio = NULL;
+ unsigned page_idx;
+ struct pagevec pvec;
+ unsigned long bio_flags = 0;
+
+ pagevec_init(&pvec, 0);
+ for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ struct page *page = list_entry(pages->prev, struct page, lru);
+
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+ /*
+ * what we want to do here is call add_to_page_cache_lru,
+ * but that isn't exported, so we reproduce it here
+ */
+ if (!add_to_page_cache(page, mapping,
+ page->index, GFP_KERNEL)) {
+
+ /* open coding of lru_cache_add, also not exported */
+ page_cache_get(page);
+ if (!pagevec_add(&pvec, page))
+ __pagevec_lru_add_file(&pvec);
+ __extent_read_full_page(tree, page, get_extent,
+ &bio, 0, &bio_flags);
+ }
+ page_cache_release(page);
+ }
+ if (pagevec_count(&pvec))
+ __pagevec_lru_add_file(&pvec);
+ BUG_ON(!list_empty(pages));
+ if (bio)
+ submit_one_bio(READ, bio, 0, bio_flags);
+ return 0;
+}
+
+/*
+ * basic invalidatepage code, this waits on any locked or writeback
+ * ranges corresponding to the page, and then deletes any extent state
+ * records from the tree
+ */
+int extent_invalidatepage(struct extent_io_tree *tree,
+ struct page *page, unsigned long offset)
+{
+ u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+
+ start += (offset + blocksize - 1) & ~(blocksize - 1);
+ if (start > end)
+ return 0;
+
+ lock_extent(tree, start, end, GFP_NOFS);
+ wait_on_extent_writeback(tree, start, end);
+ clear_extent_bit(tree, start, end,
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+ 1, 1, GFP_NOFS);
+ return 0;
+}
+
+/*
+ * simple commit_write call, set_range_dirty is used to mark both
+ * the pages and the extent records as dirty
+ */
+int extent_commit_write(struct extent_io_tree *tree,
+ struct inode *inode, struct page *page,
+ unsigned from, unsigned to)
+{
+ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+ set_page_extent_mapped(page);
+ set_page_dirty(page);
+
+ if (pos > inode->i_size) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ return 0;
+}
+
+int extent_prepare_write(struct extent_io_tree *tree,
+ struct inode *inode, struct page *page,
+ unsigned from, unsigned to, get_extent_t *get_extent)
+{
+ u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+ u64 block_start;
+ u64 orig_block_start;
+ u64 block_end;
+ u64 cur_end;
+ struct extent_map *em;
+ unsigned blocksize = 1 << inode->i_blkbits;
+ size_t page_offset = 0;
+ size_t block_off_start;
+ size_t block_off_end;
+ int err = 0;
+ int iocount = 0;
+ int ret = 0;
+ int isnew;
+
+ set_page_extent_mapped(page);
+
+ block_start = (page_start + from) & ~((u64)blocksize - 1);
+ block_end = (page_start + to - 1) | (blocksize - 1);
+ orig_block_start = block_start;
+
+ lock_extent(tree, page_start, page_end, GFP_NOFS);
+ while (block_start <= block_end) {
+ em = get_extent(inode, page, page_offset, block_start,
+ block_end - block_start + 1, 1);
+ if (IS_ERR(em) || !em)
+ goto err;
+
+ cur_end = min(block_end, extent_map_end(em) - 1);
+ block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
+ block_off_end = block_off_start + blocksize;
+ isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
+
+ if (!PageUptodate(page) && isnew &&
+ (block_off_end > to || block_off_start < from)) {
+ void *kaddr;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ if (block_off_end > to)
+ memset(kaddr + to, 0, block_off_end - to);
+ if (block_off_start < from)
+ memset(kaddr + block_off_start, 0,
+ from - block_off_start);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ if ((em->block_start != EXTENT_MAP_HOLE &&
+ em->block_start != EXTENT_MAP_INLINE) &&
+ !isnew && !PageUptodate(page) &&
+ (block_off_end > to || block_off_start < from) &&
+ !test_range_bit(tree, block_start, cur_end,
+ EXTENT_UPTODATE, 1)) {
+ u64 sector;
+ u64 extent_offset = block_start - em->start;
+ size_t iosize;
+ sector = (em->block_start + extent_offset) >> 9;
+ iosize = (cur_end - block_start + blocksize) &
+ ~((u64)blocksize - 1);
+ /*
+ * we've already got the extent locked, but we
+ * need to split the state such that our end_bio
+ * handler can clear the lock.
+ */
+ set_extent_bit(tree, block_start,
+ block_start + iosize - 1,
+ EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+ ret = submit_extent_page(READ, tree, page,
+ sector, iosize, page_offset, em->bdev,
+ NULL, 1,
+ end_bio_extent_preparewrite, 0,
+ 0, 0);
+ iocount++;
+ block_start = block_start + iosize;
+ } else {
+ set_extent_uptodate(tree, block_start, cur_end,
+ GFP_NOFS);
+ unlock_extent(tree, block_start, cur_end, GFP_NOFS);
+ block_start = cur_end + 1;
+ }
+ page_offset = block_start & (PAGE_CACHE_SIZE - 1);
+ free_extent_map(em);
+ }
+ if (iocount) {
+ wait_extent_bit(tree, orig_block_start,
+ block_end, EXTENT_LOCKED);
+ }
+ check_page_uptodate(tree, page);
+err:
+ /* FIXME, zero out newly allocated blocks on error */
+ return err;
+}
+
+/*
+ * a helper for releasepage, this tests for areas of the page that
+ * are locked or under IO and drops the related state bits if it is safe
+ * to drop the page.
+ */
+int try_release_extent_state(struct extent_map_tree *map,
+ struct extent_io_tree *tree, struct page *page,
+ gfp_t mask)
+{
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+ int ret = 1;
+
+ if (test_range_bit(tree, start, end,
+ EXTENT_IOBITS | EXTENT_ORDERED, 0))
+ ret = 0;
+ else {
+ if ((mask & GFP_NOFS) == GFP_NOFS)
+ mask = GFP_NOFS;
+ clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
+ 1, 1, mask);
+ }
+ return ret;
+}
+
+/*
+ * a helper for releasepage. As long as there are no locked extents
+ * in the range corresponding to the page, both state records and extent
+ * map records are removed
+ */
+int try_release_extent_mapping(struct extent_map_tree *map,
+ struct extent_io_tree *tree, struct page *page,
+ gfp_t mask)
+{
+ struct extent_map *em;
+ u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+ u64 end = start + PAGE_CACHE_SIZE - 1;
+
+ if ((mask & __GFP_WAIT) &&
+ page->mapping->host->i_size > 16 * 1024 * 1024) {
+ u64 len;
+ while (start <= end) {
+ len = end - start + 1;
+ spin_lock(&map->lock);
+ em = lookup_extent_mapping(map, start, len);
+ if (!em || IS_ERR(em)) {
+ spin_unlock(&map->lock);
+ break;
+ }
+ if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+ em->start != start) {
+ spin_unlock(&map->lock);
+ free_extent_map(em);
+ break;
+ }
+ if (!test_range_bit(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED | EXTENT_WRITEBACK |
+ EXTENT_ORDERED,
+ 0)) {
+ remove_extent_mapping(map, em);
+ /* once for the rb tree */
+ free_extent_map(em);
+ }
+ start = extent_map_end(em);
+ spin_unlock(&map->lock);
+
+ /* once for us */
+ free_extent_map(em);
+ }
+ }
+ return try_release_extent_state(map, tree, page, mask);
+}
+
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+ get_extent_t *get_extent)
+{
+ struct inode *inode = mapping->host;
+ u64 start = iblock << inode->i_blkbits;
+ sector_t sector = 0;
+ size_t blksize = (1 << inode->i_blkbits);
+ struct extent_map *em;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+ GFP_NOFS);
+ em = get_extent(inode, NULL, 0, start, blksize, 0);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+ GFP_NOFS);
+ if (!em || IS_ERR(em))
+ return 0;
+
+ if (em->block_start > EXTENT_MAP_LAST_BYTE)
+ goto out;
+
+ sector = (em->block_start + start - em->start) >> inode->i_blkbits;
+out:
+ free_extent_map(em);
+ return sector;
+}
+
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+ unsigned long i)
+{
+ struct page *p;
+ struct address_space *mapping;
+
+ if (i == 0)
+ return eb->first_page;
+ i += eb->start >> PAGE_CACHE_SHIFT;
+ mapping = eb->first_page->mapping;
+ if (!mapping)
+ return NULL;
+
+ /*
+ * extent_buffer_page is only called after pinning the page
+ * by increasing the reference count. So we know the page must
+ * be in the radix tree.
+ */
+ rcu_read_lock();
+ p = radix_tree_lookup(&mapping->page_tree, i);
+ rcu_read_unlock();
+
+ return p;
+}
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+ return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+ (start >> PAGE_CACHE_SHIFT);
+}
+
+static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
+ u64 start,
+ unsigned long len,
+ gfp_t mask)
+{
+ struct extent_buffer *eb = NULL;
+#ifdef LEAK_DEBUG
+ unsigned long flags;
+#endif
+
+ eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+ eb->start = start;
+ eb->len = len;
+ mutex_init(&eb->mutex);
+#ifdef LEAK_DEBUG
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(&eb->leak_list, &buffers);
+ spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+ atomic_set(&eb->refs, 1);
+
+ return eb;
+}
+
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+#ifdef LEAK_DEBUG
+ unsigned long flags;
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&eb->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+ kmem_cache_free(extent_buffer_cache, eb);
+}
+
+struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+ u64 start, unsigned long len,
+ struct page *page0,
+ gfp_t mask)
+{
+ unsigned long num_pages = num_extent_pages(start, len);
+ unsigned long i;
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ struct extent_buffer *eb;
+ struct extent_buffer *exists = NULL;
+ struct page *p;
+ struct address_space *mapping = tree->mapping;
+ int uptodate = 1;
+
+ spin_lock(&tree->buffer_lock);
+ eb = buffer_search(tree, start);
+ if (eb) {
+ atomic_inc(&eb->refs);
+ spin_unlock(&tree->buffer_lock);
+ mark_page_accessed(eb->first_page);
+ return eb;
+ }
+ spin_unlock(&tree->buffer_lock);
+
+ eb = __alloc_extent_buffer(tree, start, len, mask);
+ if (!eb)
+ return NULL;
+
+ if (page0) {
+ eb->first_page = page0;
+ i = 1;
+ index++;
+ page_cache_get(page0);
+ mark_page_accessed(page0);
+ set_page_extent_mapped(page0);
+ set_page_extent_head(page0, len);
+ uptodate = PageUptodate(page0);
+ } else {
+ i = 0;
+ }
+ for (; i < num_pages; i++, index++) {
+ p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
+ if (!p) {
+ WARN_ON(1);
+ goto free_eb;
+ }
+ set_page_extent_mapped(p);
+ mark_page_accessed(p);
+ if (i == 0) {
+ eb->first_page = p;
+ set_page_extent_head(p, len);
+ } else {
+ set_page_private(p, EXTENT_PAGE_PRIVATE);
+ }
+ if (!PageUptodate(p))
+ uptodate = 0;
+ unlock_page(p);
+ }
+ if (uptodate)
+ eb->flags |= EXTENT_UPTODATE;
+ eb->flags |= EXTENT_BUFFER_FILLED;
+
+ spin_lock(&tree->buffer_lock);
+ exists = buffer_tree_insert(tree, start, &eb->rb_node);
+ if (exists) {
+ /* add one reference for the caller */
+ atomic_inc(&exists->refs);
+ spin_unlock(&tree->buffer_lock);
+ goto free_eb;
+ }
+ spin_unlock(&tree->buffer_lock);
+
+ /* add one reference for the tree */
+ atomic_inc(&eb->refs);
+ return eb;
+
+free_eb:
+ if (!atomic_dec_and_test(&eb->refs))
+ return exists;
+ for (index = 1; index < i; index++)
+ page_cache_release(extent_buffer_page(eb, index));
+ page_cache_release(extent_buffer_page(eb, 0));
+ __free_extent_buffer(eb);
+ return exists;
+}
+
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+ u64 start, unsigned long len,
+ gfp_t mask)
+{
+ struct extent_buffer *eb;
+
+ spin_lock(&tree->buffer_lock);
+ eb = buffer_search(tree, start);
+ if (eb)
+ atomic_inc(&eb->refs);
+ spin_unlock(&tree->buffer_lock);
+
+ if (eb)
+ mark_page_accessed(eb->first_page);
+
+ return eb;
+}
+
+void free_extent_buffer(struct extent_buffer *eb)
+{
+ if (!eb)
+ return;
+
+ if (!atomic_dec_and_test(&eb->refs))
+ return;
+
+ WARN_ON(1);
+}
+
+int clear_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ int set;
+ unsigned long i;
+ unsigned long num_pages;
+ struct page *page;
+
+ u64 start = eb->start;
+ u64 end = start + eb->len - 1;
+
+ set = clear_extent_dirty(tree, start, end, GFP_NOFS);
+ num_pages = num_extent_pages(eb->start, eb->len);
+
+ for (i = 0; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if (!set && !PageDirty(page))
+ continue;
+
+ lock_page(page);
+ if (i == 0)
+ set_page_extent_head(page, eb->len);
+ else
+ set_page_private(page, EXTENT_PAGE_PRIVATE);
+
+ /*
+ * if we're on the last page or the first page and the
+ * block isn't aligned on a page boundary, do extra checks
+ * to make sure we don't clean page that is partially dirty
+ */
+ if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
+ ((i == num_pages - 1) &&
+ ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
+ start = (u64)page->index << PAGE_CACHE_SHIFT;
+ end = start + PAGE_CACHE_SIZE - 1;
+ if (test_range_bit(tree, start, end,
+ EXTENT_DIRTY, 0)) {
+ unlock_page(page);
+ continue;
+ }
+ }
+ clear_page_dirty_for_io(page);
+ spin_lock_irq(&page->mapping->tree_lock);
+ if (!PageDirty(page)) {
+ radix_tree_tag_clear(&page->mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ }
+ spin_unlock_irq(&page->mapping->tree_lock);
+ unlock_page(page);
+ }
+ return 0;
+}
+
+int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ return wait_on_extent_writeback(tree, eb->start,
+ eb->start + eb->len - 1);
+}
+
+int set_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ unsigned long i;
+ unsigned long num_pages;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ struct page *page = extent_buffer_page(eb, i);
+ /* writepage may need to do something special for the
+ * first page, we have to make sure page->private is
+ * properly set. releasepage may drop page->private
+ * on us if the page isn't already dirty.
+ */
+ lock_page(page);
+ if (i == 0) {
+ set_page_extent_head(page, eb->len);
+ } else if (PagePrivate(page) &&
+ page->private != EXTENT_PAGE_PRIVATE) {
+ set_page_extent_mapped(page);
+ }
+ __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+ set_extent_dirty(tree, page_offset(page),
+ page_offset(page) + PAGE_CACHE_SIZE - 1,
+ GFP_NOFS);
+ unlock_page(page);
+ }
+ return 0;
+}
+
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ unsigned long i;
+ struct page *page;
+ unsigned long num_pages;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ eb->flags &= ~EXTENT_UPTODATE;
+
+ clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+ GFP_NOFS);
+ for (i = 0; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if (page)
+ ClearPageUptodate(page);
+ }
+ return 0;
+}
+
+int set_extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ unsigned long i;
+ struct page *page;
+ unsigned long num_pages;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+
+ set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+ GFP_NOFS);
+ for (i = 0; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
+ ((i == num_pages - 1) &&
+ ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
+ check_page_uptodate(tree, page);
+ continue;
+ }
+ SetPageUptodate(page);
+ }
+ return 0;
+}
+
+int extent_range_uptodate(struct extent_io_tree *tree,
+ u64 start, u64 end)
+{
+ struct page *page;
+ int ret;
+ int pg_uptodate = 1;
+ int uptodate;
+ unsigned long index;
+
+ ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
+ if (ret)
+ return 1;
+ while (start <= end) {
+ index = start >> PAGE_CACHE_SHIFT;
+ page = find_get_page(tree->mapping, index);
+ uptodate = PageUptodate(page);
+ page_cache_release(page);
+ if (!uptodate) {
+ pg_uptodate = 0;
+ break;
+ }
+ start += PAGE_CACHE_SIZE;
+ }
+ return pg_uptodate;
+}
+
+int extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb)
+{
+ int ret = 0;
+ unsigned long num_pages;
+ unsigned long i;
+ struct page *page;
+ int pg_uptodate = 1;
+
+ if (eb->flags & EXTENT_UPTODATE)
+ return 1;
+
+ ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+ EXTENT_UPTODATE, 1);
+ if (ret)
+ return ret;
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if (!PageUptodate(page)) {
+ pg_uptodate = 0;
+ break;
+ }
+ }
+ return pg_uptodate;
+}
+
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+ struct extent_buffer *eb,
+ u64 start, int wait,
+ get_extent_t *get_extent, int mirror_num)
+{
+ unsigned long i;
+ unsigned long start_i;
+ struct page *page;
+ int err;
+ int ret = 0;
+ int locked_pages = 0;
+ int all_uptodate = 1;
+ int inc_all_pages = 0;
+ unsigned long num_pages;
+ struct bio *bio = NULL;
+ unsigned long bio_flags = 0;
+
+ if (eb->flags & EXTENT_UPTODATE)
+ return 0;
+
+ if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+ EXTENT_UPTODATE, 1)) {
+ return 0;
+ }
+
+ if (start) {
+ WARN_ON(start < eb->start);
+ start_i = (start >> PAGE_CACHE_SHIFT) -
+ (eb->start >> PAGE_CACHE_SHIFT);
+ } else {
+ start_i = 0;
+ }
+
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = start_i; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if (!wait) {
+ if (!trylock_page(page))
+ goto unlock_exit;
+ } else {
+ lock_page(page);
+ }
+ locked_pages++;
+ if (!PageUptodate(page))
+ all_uptodate = 0;
+ }
+ if (all_uptodate) {
+ if (start_i == 0)
+ eb->flags |= EXTENT_UPTODATE;
+ goto unlock_exit;
+ }
+
+ for (i = start_i; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if (inc_all_pages)
+ page_cache_get(page);
+ if (!PageUptodate(page)) {
+ if (start_i == 0)
+ inc_all_pages = 1;
+ ClearPageError(page);
+ err = __extent_read_full_page(tree, page,
+ get_extent, &bio,
+ mirror_num, &bio_flags);
+ if (err)
+ ret = err;
+ } else {
+ unlock_page(page);
+ }
+ }
+
+ if (bio)
+ submit_one_bio(READ, bio, mirror_num, bio_flags);
+
+ if (ret || !wait)
+ return ret;
+
+ for (i = start_i; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ wait_on_page_locked(page);
+ if (!PageUptodate(page))
+ ret = -EIO;
+ }
+
+ if (!ret)
+ eb->flags |= EXTENT_UPTODATE;
+ return ret;
+
+unlock_exit:
+ i = start_i;
+ while (locked_pages > 0) {
+ page = extent_buffer_page(eb, i);
+ i++;
+ unlock_page(page);
+ locked_pages--;
+ }
+ return ret;
+}
+
+void read_extent_buffer(struct extent_buffer *eb, void *dstv,
+ unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *dst = (char *)dstv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+
+ cur = min(len, (PAGE_CACHE_SIZE - offset));
+ kaddr = kmap_atomic(page, KM_USER1);
+ memcpy(dst, kaddr + offset, cur);
+ kunmap_atomic(kaddr, KM_USER1);
+
+ dst += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
+ unsigned long min_len, char **token, char **map,
+ unsigned long *map_start,
+ unsigned long *map_len, int km)
+{
+ size_t offset = start & (PAGE_CACHE_SIZE - 1);
+ char *kaddr;
+ struct page *p;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ unsigned long end_i = (start_offset + start + min_len - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ if (i != end_i)
+ return -EINVAL;
+
+ if (i == 0) {
+ offset = start_offset;
+ *map_start = 0;
+ } else {
+ offset = 0;
+ *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
+ }
+
+ if (start + min_len > eb->len) {
+ printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+ "wanted %lu %lu\n", (unsigned long long)eb->start,
+ eb->len, start, min_len);
+ WARN_ON(1);
+ }
+
+ p = extent_buffer_page(eb, i);
+ kaddr = kmap_atomic(p, km);
+ *token = kaddr;
+ *map = kaddr + offset;
+ *map_len = PAGE_CACHE_SIZE - offset;
+ return 0;
+}
+
+int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
+ unsigned long min_len,
+ char **token, char **map,
+ unsigned long *map_start,
+ unsigned long *map_len, int km)
+{
+ int err;
+ int save = 0;
+ if (eb->map_token) {
+ unmap_extent_buffer(eb, eb->map_token, km);
+ eb->map_token = NULL;
+ save = 1;
+ WARN_ON(!mutex_is_locked(&eb->mutex));
+ }
+ err = map_private_extent_buffer(eb, start, min_len, token, map,
+ map_start, map_len, km);
+ if (!err && save) {
+ eb->map_token = *token;
+ eb->kaddr = *map;
+ eb->map_start = *map_start;
+ eb->map_len = *map_len;
+ }
+ return err;
+}
+
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
+{
+ kunmap_atomic(token, km);
+}
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+ unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *ptr = (char *)ptrv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ int ret = 0;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+
+ cur = min(len, (PAGE_CACHE_SIZE - offset));
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ ret = memcmp(ptr, kaddr + offset, cur);
+ kunmap_atomic(kaddr, KM_USER0);
+ if (ret)
+ break;
+
+ ptr += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+ return ret;
+}
+
+void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+ unsigned long start, unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char *src = (char *)srcv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+ WARN_ON(!PageUptodate(page));
+
+ cur = min(len, PAGE_CACHE_SIZE - offset);
+ kaddr = kmap_atomic(page, KM_USER1);
+ memcpy(kaddr + offset, src, cur);
+ kunmap_atomic(kaddr, KM_USER1);
+
+ src += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+ unsigned long start, unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+ WARN_ON(!PageUptodate(page));
+
+ cur = min(len, PAGE_CACHE_SIZE - offset);
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, c, cur);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
+{
+ u64 dst_len = dst->len;
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(src->len != dst_len);
+
+ offset = (start_offset + dst_offset) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(dst, i);
+ WARN_ON(!PageUptodate(page));
+
+ cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ read_extent_buffer(src, kaddr + offset, src_offset, cur);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ src_offset += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+}
+
+static void move_pages(struct page *dst_page, struct page *src_page,
+ unsigned long dst_off, unsigned long src_off,
+ unsigned long len)
+{
+ char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+ if (dst_page == src_page) {
+ memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
+ } else {
+ char *src_kaddr = kmap_atomic(src_page, KM_USER1);
+ char *p = dst_kaddr + dst_off + len;
+ char *s = src_kaddr + src_off + len;
+
+ while (len--)
+ *--p = *--s;
+
+ kunmap_atomic(src_kaddr, KM_USER1);
+ }
+ kunmap_atomic(dst_kaddr, KM_USER0);
+}
+
+static void copy_pages(struct page *dst_page, struct page *src_page,
+ unsigned long dst_off, unsigned long src_off,
+ unsigned long len)
+{
+ char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+ char *src_kaddr;
+
+ if (dst_page != src_page)
+ src_kaddr = kmap_atomic(src_page, KM_USER1);
+ else
+ src_kaddr = dst_kaddr;
+
+ memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+ kunmap_atomic(dst_kaddr, KM_USER0);
+ if (dst_page != src_page)
+ kunmap_atomic(src_kaddr, KM_USER1);
+}
+
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len)
+{
+ size_t cur;
+ size_t dst_off_in_page;
+ size_t src_off_in_page;
+ size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long dst_i;
+ unsigned long src_i;
+
+ if (src_offset + len > dst->len) {
+ printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+ "len %lu dst len %lu\n", src_offset, len, dst->len);
+ BUG_ON(1);
+ }
+ if (dst_offset + len > dst->len) {
+ printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+ "len %lu dst len %lu\n", dst_offset, len, dst->len);
+ BUG_ON(1);
+ }
+
+ while (len > 0) {
+ dst_off_in_page = (start_offset + dst_offset) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+ src_off_in_page = (start_offset + src_offset) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+ src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
+
+ cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
+ src_off_in_page));
+ cur = min_t(unsigned long, cur,
+ (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
+
+ copy_pages(extent_buffer_page(dst, dst_i),
+ extent_buffer_page(dst, src_i),
+ dst_off_in_page, src_off_in_page, cur);
+
+ src_offset += cur;
+ dst_offset += cur;
+ len -= cur;
+ }
+}
+
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len)
+{
+ size_t cur;
+ size_t dst_off_in_page;
+ size_t src_off_in_page;
+ unsigned long dst_end = dst_offset + len - 1;
+ unsigned long src_end = src_offset + len - 1;
+ size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long dst_i;
+ unsigned long src_i;
+
+ if (src_offset + len > dst->len) {
+ printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+ "len %lu len %lu\n", src_offset, len, dst->len);
+ BUG_ON(1);
+ }
+ if (dst_offset + len > dst->len) {
+ printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+ "len %lu len %lu\n", dst_offset, len, dst->len);
+ BUG_ON(1);
+ }
+ if (dst_offset < src_offset) {
+ memcpy_extent_buffer(dst, dst_offset, src_offset, len);
+ return;
+ }
+ while (len > 0) {
+ dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
+ src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
+
+ dst_off_in_page = (start_offset + dst_end) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+ src_off_in_page = (start_offset + src_end) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ cur = min_t(unsigned long, len, src_off_in_page + 1);
+ cur = min(cur, dst_off_in_page + 1);
+ move_pages(extent_buffer_page(dst, dst_i),
+ extent_buffer_page(dst, src_i),
+ dst_off_in_page - cur + 1,
+ src_off_in_page - cur + 1, cur);
+
+ dst_end -= cur;
+ src_end -= cur;
+ len -= cur;
+ }
+}
+
+int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
+{
+ u64 start = page_offset(page);
+ struct extent_buffer *eb;
+ int ret = 1;
+ unsigned long i;
+ unsigned long num_pages;
+
+ spin_lock(&tree->buffer_lock);
+ eb = buffer_search(tree, start);
+ if (!eb)
+ goto out;
+
+ if (atomic_read(&eb->refs) > 1) {
+ ret = 0;
+ goto out;
+ }
+ /* at this point we can safely release the extent buffer */
+ num_pages = num_extent_pages(eb->start, eb->len);
+ for (i = 0; i < num_pages; i++)
+ page_cache_release(extent_buffer_page(eb, i));
+ rb_erase(&eb->rb_node, &tree->buffer);
+ __free_extent_buffer(eb);
+out:
+ spin_unlock(&tree->buffer_lock);
+ return ret;
+}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 00000000000..c5b483a7913
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,269 @@
+#ifndef __EXTENTIO__
+#define __EXTENTIO__
+
+#include <linux/rbtree.h>
+
+/* bits for the extent state */
+#define EXTENT_DIRTY 1
+#define EXTENT_WRITEBACK (1 << 1)
+#define EXTENT_UPTODATE (1 << 2)
+#define EXTENT_LOCKED (1 << 3)
+#define EXTENT_NEW (1 << 4)
+#define EXTENT_DELALLOC (1 << 5)
+#define EXTENT_DEFRAG (1 << 6)
+#define EXTENT_DEFRAG_DONE (1 << 7)
+#define EXTENT_BUFFER_FILLED (1 << 8)
+#define EXTENT_ORDERED (1 << 9)
+#define EXTENT_ORDERED_METADATA (1 << 10)
+#define EXTENT_BOUNDARY (1 << 11)
+#define EXTENT_NODATASUM (1 << 12)
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+
+/* flags for bio submission */
+#define EXTENT_BIO_COMPRESSED 1
+
+/*
+ * page->private values. Every page that is controlled by the extent
+ * map has page->private set to one.
+ */
+#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
+
+struct extent_state;
+
+typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags);
+struct extent_io_ops {
+ int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written);
+ int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
+ int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
+ extent_submit_bio_hook_t *submit_bio_hook;
+ int (*merge_bio_hook)(struct page *page, unsigned long offset,
+ size_t size, struct bio *bio,
+ unsigned long bio_flags);
+ int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
+ int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
+ u64 start, u64 end,
+ struct extent_state *state);
+ int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
+ u64 start, u64 end,
+ struct extent_state *state);
+ int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
+ struct extent_state *state);
+ int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
+ struct extent_state *state, int uptodate);
+ int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
+ unsigned long old, unsigned long bits);
+ int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
+ unsigned long old, unsigned long bits);
+ int (*write_cache_pages_lock_hook)(struct page *page);
+};
+
+struct extent_io_tree {
+ struct rb_root state;
+ struct rb_root buffer;
+ struct address_space *mapping;
+ u64 dirty_bytes;
+ spinlock_t lock;
+ spinlock_t buffer_lock;
+ struct extent_io_ops *ops;
+};
+
+struct extent_state {
+ u64 start;
+ u64 end; /* inclusive */
+ struct rb_node rb_node;
+ struct extent_io_tree *tree;
+ wait_queue_head_t wq;
+ atomic_t refs;
+ unsigned long state;
+
+ /* for use by the FS */
+ u64 private;
+
+ struct list_head leak_list;
+};
+
+struct extent_buffer {
+ u64 start;
+ unsigned long len;
+ char *map_token;
+ char *kaddr;
+ unsigned long map_start;
+ unsigned long map_len;
+ struct page *first_page;
+ atomic_t refs;
+ int flags;
+ struct list_head leak_list;
+ struct rb_node rb_node;
+ struct mutex mutex;
+};
+
+struct extent_map_tree;
+
+static inline struct extent_state *extent_state_next(struct extent_state *state)
+{
+ struct rb_node *node;
+ node = rb_next(&state->rb_node);
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct extent_state, rb_node);
+}
+
+typedef struct extent_map *(get_extent_t)(struct inode *inode,
+ struct page *page,
+ size_t page_offset,
+ u64 start, u64 len,
+ int create);
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+ struct address_space *mapping, gfp_t mask);
+int try_release_extent_mapping(struct extent_map_tree *map,
+ struct extent_io_tree *tree, struct page *page,
+ gfp_t mask);
+int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
+int try_release_extent_state(struct extent_map_tree *map,
+ struct extent_io_tree *tree, struct page *page,
+ gfp_t mask);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent);
+int __init extent_io_init(void);
+void extent_io_exit(void);
+
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end,
+ u64 max_bytes, unsigned long bits);
+
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int filled);
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask);
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int wake, int delete, gfp_t mask);
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, gfp_t mask);
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask);
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+ gfp_t mask);
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, int bits);
+struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, int bits);
+int extent_invalidatepage(struct extent_io_tree *tree,
+ struct page *page, unsigned long offset);
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc);
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+ u64 start, u64 end, get_extent_t *get_extent,
+ int mode);
+int extent_writepages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ get_extent_t *get_extent,
+ struct writeback_control *wbc);
+int extent_readpages(struct extent_io_tree *tree,
+ struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages,
+ get_extent_t get_extent);
+int extent_prepare_write(struct extent_io_tree *tree,
+ struct inode *inode, struct page *page,
+ unsigned from, unsigned to, get_extent_t *get_extent);
+int extent_commit_write(struct extent_io_tree *tree,
+ struct inode *inode, struct page *page,
+ unsigned from, unsigned to);
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+ get_extent_t *get_extent);
+int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
+int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
+void set_page_extent_mapped(struct page *page);
+
+struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+ u64 start, unsigned long len,
+ struct page *page0,
+ gfp_t mask);
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+ u64 start, unsigned long len,
+ gfp_t mask);
+void free_extent_buffer(struct extent_buffer *eb);
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+ struct extent_buffer *eb, u64 start, int wait,
+ get_extent_t *get_extent, int mirror_num);
+
+static inline void extent_buffer_get(struct extent_buffer *eb)
+{
+ atomic_inc(&eb->refs);
+}
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+ unsigned long start,
+ unsigned long len);
+void read_extent_buffer(struct extent_buffer *eb, void *dst,
+ unsigned long start,
+ unsigned long len);
+void write_extent_buffer(struct extent_buffer *eb, const void *src,
+ unsigned long start, unsigned long len);
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len);
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len);
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+ unsigned long src_offset, unsigned long len);
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+ unsigned long start, unsigned long len);
+int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
+int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
+int clear_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
+int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+ unsigned long min_len, char **token, char **map,
+ unsigned long *map_start,
+ unsigned long *map_len, int km);
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+ unsigned long min_len, char **token, char **map,
+ unsigned long *map_start,
+ unsigned long *map_len, int km);
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
+int release_extent_buffer_tail_pages(struct extent_buffer *eb);
+int extent_range_uptodate(struct extent_io_tree *tree,
+ u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode,
+ struct extent_io_tree *tree,
+ u64 start, u64 end, struct page *locked_page,
+ int unlock_page,
+ int clear_unlock,
+ int clear_delalloc, int clear_dirty,
+ int set_writeback,
+ int end_writeback);
+#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 00000000000..4a83e33ada3
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,351 @@
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/version.h>
+#include <linux/hardirq.h>
+#include "extent_map.h"
+
+/* temporary define until extent_map moves out of btrfs */
+struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
+ unsigned long extra_flags,
+ void (*ctor)(void *, struct kmem_cache *,
+ unsigned long));
+
+static struct kmem_cache *extent_map_cache;
+
+int __init extent_map_init(void)
+{
+ extent_map_cache = btrfs_cache_create("extent_map",
+ sizeof(struct extent_map), 0,
+ NULL);
+ if (!extent_map_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void extent_map_exit(void)
+{
+ if (extent_map_cache)
+ kmem_cache_destroy(extent_map_cache);
+}
+
+/**
+ * extent_map_tree_init - initialize extent map tree
+ * @tree: tree to initialize
+ * @mask: flags for memory allocations during tree operations
+ *
+ * Initialize the extent tree @tree. Should be called for each new inode
+ * or other user of the extent_map interface.
+ */
+void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
+{
+ tree->map.rb_node = NULL;
+ spin_lock_init(&tree->lock);
+}
+EXPORT_SYMBOL(extent_map_tree_init);
+
+/**
+ * alloc_extent_map - allocate new extent map structure
+ * @mask: memory allocation flags
+ *
+ * Allocate a new extent_map structure. The new structure is
+ * returned with a reference count of one and needs to be
+ * freed using free_extent_map()
+ */
+struct extent_map *alloc_extent_map(gfp_t mask)
+{
+ struct extent_map *em;
+ em = kmem_cache_alloc(extent_map_cache, mask);
+ if (!em || IS_ERR(em))
+ return em;
+ em->in_tree = 0;
+ em->flags = 0;
+ atomic_set(&em->refs, 1);
+ return em;
+}
+EXPORT_SYMBOL(alloc_extent_map);
+
+/**
+ * free_extent_map - drop reference count of an extent_map
+ * @em: extent map beeing releasead
+ *
+ * Drops the reference out on @em by one and free the structure
+ * if the reference count hits zero.
+ */
+void free_extent_map(struct extent_map *em)
+{
+ if (!em)
+ return;
+ WARN_ON(atomic_read(&em->refs) == 0);
+ if (atomic_dec_and_test(&em->refs)) {
+ WARN_ON(em->in_tree);
+ kmem_cache_free(extent_map_cache, em);
+ }
+}
+EXPORT_SYMBOL(free_extent_map);
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_map *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct extent_map, rb_node);
+
+ WARN_ON(!entry->in_tree);
+
+ if (offset < entry->start)
+ p = &(*p)->rb_left;
+ else if (offset >= extent_map_end(entry))
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ entry = rb_entry(node, struct extent_map, rb_node);
+ entry->in_tree = 1;
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/*
+ * search through the tree for an extent_map with a given offset. If
+ * it can't be found, try to find some neighboring extents
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+ struct rb_node **prev_ret,
+ struct rb_node **next_ret)
+{
+ struct rb_node *n = root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev = NULL;
+ struct extent_map *entry;
+ struct extent_map *prev_entry = NULL;
+
+ while (n) {
+ entry = rb_entry(n, struct extent_map, rb_node);
+ prev = n;
+ prev_entry = entry;
+
+ WARN_ON(!entry->in_tree);
+
+ if (offset < entry->start)
+ n = n->rb_left;
+ else if (offset >= extent_map_end(entry))
+ n = n->rb_right;
+ else
+ return n;
+ }
+
+ if (prev_ret) {
+ orig_prev = prev;
+ while (prev && offset >= extent_map_end(prev_entry)) {
+ prev = rb_next(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ }
+ *prev_ret = prev;
+ prev = orig_prev;
+ }
+
+ if (next_ret) {
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ while (prev && offset < prev_entry->start) {
+ prev = rb_prev(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ }
+ *next_ret = prev;
+ }
+ return NULL;
+}
+
+/*
+ * look for an offset in the tree, and if it can't be found, return
+ * the first offset we can find smaller than 'offset'.
+ */
+static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
+{
+ struct rb_node *prev;
+ struct rb_node *ret;
+ ret = __tree_search(root, offset, &prev, NULL);
+ if (!ret)
+ return prev;
+ return ret;
+}
+
+/* check to see if two extent_map structs are adjacent and safe to merge */
+static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+{
+ if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
+ return 0;
+
+ /*
+ * don't merge compressed extents, we need to know their
+ * actual size
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+ return 0;
+
+ if (extent_map_end(prev) == next->start &&
+ prev->flags == next->flags &&
+ prev->bdev == next->bdev &&
+ ((next->block_start == EXTENT_MAP_HOLE &&
+ prev->block_start == EXTENT_MAP_HOLE) ||
+ (next->block_start == EXTENT_MAP_INLINE &&
+ prev->block_start == EXTENT_MAP_INLINE) ||
+ (next->block_start == EXTENT_MAP_DELALLOC &&
+ prev->block_start == EXTENT_MAP_DELALLOC) ||
+ (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
+ next->block_start == extent_map_block_end(prev)))) {
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * add_extent_mapping - add new extent map to the extent tree
+ * @tree: tree to insert new map in
+ * @em: map to insert
+ *
+ * Insert @em into @tree or perform a simple forward/backward merge with
+ * existing mappings. The extent_map struct passed in will be inserted
+ * into the tree directly, with an additional reference taken, or a
+ * reference dropped if the merge attempt was sucessfull.
+ */
+int add_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em)
+{
+ int ret = 0;
+ struct extent_map *merge = NULL;
+ struct rb_node *rb;
+ struct extent_map *exist;
+
+ exist = lookup_extent_mapping(tree, em->start, em->len);
+ if (exist) {
+ free_extent_map(exist);
+ ret = -EEXIST;
+ goto out;
+ }
+ assert_spin_locked(&tree->lock);
+ rb = tree_insert(&tree->map, em->start, &em->rb_node);
+ if (rb) {
+ ret = -EEXIST;
+ free_extent_map(merge);
+ goto out;
+ }
+ atomic_inc(&em->refs);
+ if (em->start != 0) {
+ rb = rb_prev(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(merge, em)) {
+ em->start = merge->start;
+ em->len += merge->len;
+ em->block_len += merge->block_len;
+ em->block_start = merge->block_start;
+ merge->in_tree = 0;
+ rb_erase(&merge->rb_node, &tree->map);
+ free_extent_map(merge);
+ }
+ }
+ rb = rb_next(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(em, merge)) {
+ em->len += merge->len;
+ em->block_len += merge->len;
+ rb_erase(&merge->rb_node, &tree->map);
+ merge->in_tree = 0;
+ free_extent_map(merge);
+ }
+out:
+ return ret;
+}
+EXPORT_SYMBOL(add_extent_mapping);
+
+/* simple helper to do math around the end of an extent, handling wrap */
+static u64 range_end(u64 start, u64 len)
+{
+ if (start + len < start)
+ return (u64)-1;
+ return start + len;
+}
+
+/**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree: tree to lookup in
+ * @start: byte offset to start the search
+ * @len: length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range. There may be additional objects in the tree that
+ * intersect, so check the object returned carefully to make sure that no
+ * additional lookups are needed.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
+{
+ struct extent_map *em;
+ struct rb_node *rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *next = NULL;
+ u64 end = range_end(start, len);
+
+ assert_spin_locked(&tree->lock);
+ rb_node = __tree_search(&tree->map, start, &prev, &next);
+ if (!rb_node && prev) {
+ em = rb_entry(prev, struct extent_map, rb_node);
+ if (end > em->start && start < extent_map_end(em))
+ goto found;
+ }
+ if (!rb_node && next) {
+ em = rb_entry(next, struct extent_map, rb_node);
+ if (end > em->start && start < extent_map_end(em))
+ goto found;
+ }
+ if (!rb_node) {
+ em = NULL;
+ goto out;
+ }
+ if (IS_ERR(rb_node)) {
+ em = ERR_PTR(PTR_ERR(rb_node));
+ goto out;
+ }
+ em = rb_entry(rb_node, struct extent_map, rb_node);
+ if (end > em->start && start < extent_map_end(em))
+ goto found;
+
+ em = NULL;
+ goto out;
+
+found:
+ atomic_inc(&em->refs);
+out:
+ return em;
+}
+EXPORT_SYMBOL(lookup_extent_mapping);
+
+/**
+ * remove_extent_mapping - removes an extent_map from the extent tree
+ * @tree: extent tree to remove from
+ * @em: extent map beeing removed
+ *
+ * Removes @em from @tree. No reference counts are dropped, and no checks
+ * are done to see if the range is in use
+ */
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+ int ret = 0;
+
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ assert_spin_locked(&tree->lock);
+ rb_erase(&em->rb_node, &tree->map);
+ em->in_tree = 0;
+ return ret;
+}
+EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 00000000000..fb6eeef06bb
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,62 @@
+#ifndef __EXTENTMAP__
+#define __EXTENTMAP__
+
+#include <linux/rbtree.h>
+
+#define EXTENT_MAP_LAST_BYTE (u64)-4
+#define EXTENT_MAP_HOLE (u64)-3
+#define EXTENT_MAP_INLINE (u64)-2
+#define EXTENT_MAP_DELALLOC (u64)-1
+
+/* bits for the flags field */
+#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
+#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
+#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+
+struct extent_map {
+ struct rb_node rb_node;
+
+ /* all of these are in bytes */
+ u64 start;
+ u64 len;
+ u64 orig_start;
+ u64 block_start;
+ u64 block_len;
+ unsigned long flags;
+ struct block_device *bdev;
+ atomic_t refs;
+ int in_tree;
+};
+
+struct extent_map_tree {
+ struct rb_root map;
+ spinlock_t lock;
+};
+
+static inline u64 extent_map_end(struct extent_map *em)
+{
+ if (em->start + em->len < em->start)
+ return (u64)-1;
+ return em->start + em->len;
+}
+
+static inline u64 extent_map_block_end(struct extent_map *em)
+{
+ if (em->block_start + em->block_len < em->block_start)
+ return (u64)-1;
+ return em->block_start + em->block_len;
+}
+
+void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
+int add_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em);
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+
+struct extent_map *alloc_extent_map(gfp_t mask);
+void free_extent_map(struct extent_map *em);
+int __init extent_map_init(void);
+void extent_map_exit(void);
+#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 00000000000..964652435fd
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,831 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+
+#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+ sizeof(struct btrfs_item) * 2) / \
+ size) - 1))
+
+#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
+ sizeof(struct btrfs_ordered_sum)) / \
+ sizeof(struct btrfs_sector_sum) * \
+ (r)->sectorsize - (r)->sectorsize)
+
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 objectid, u64 pos,
+ u64 disk_offset, u64 disk_num_bytes,
+ u64 num_bytes, u64 offset, u64 ram_bytes,
+ u8 compression, u8 encryption, u16 other_encoding)
+{
+ int ret = 0;
+ struct btrfs_file_extent_item *item;
+ struct btrfs_key file_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ file_key.objectid = objectid;
+ file_key.offset = pos;
+ btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+
+ ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+ sizeof(*item));
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret);
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
+ btrfs_set_file_extent_offset(leaf, item, offset);
+ btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
+ btrfs_set_file_extent_generation(leaf, item, trans->transid);
+ btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_compression(leaf, item, compression);
+ btrfs_set_file_extent_encryption(leaf, item, encryption);
+ btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, int cow)
+{
+ int ret;
+ struct btrfs_key file_key;
+ struct btrfs_key found_key;
+ struct btrfs_csum_item *item;
+ struct extent_buffer *leaf;
+ u64 csum_offset = 0;
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+ int csums_in_item;
+
+ file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ file_key.offset = bytenr;
+ btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+ ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
+ if (ret < 0)
+ goto fail;
+ leaf = path->nodes[0];
+ if (ret > 0) {
+ ret = 1;
+ if (path->slots[0] == 0)
+ goto fail;
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
+ goto fail;
+
+ csum_offset = (bytenr - found_key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+ csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
+ csums_in_item /= csum_size;
+
+ if (csum_offset >= csums_in_item) {
+ ret = -EFBIG;
+ goto fail;
+ }
+ }
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+ item = (struct btrfs_csum_item *)((unsigned char *)item +
+ csum_offset * csum_size);
+ return item;
+fail:
+ if (ret > 0)
+ ret = -ENOENT;
+ return ERR_PTR(ret);
+}
+
+
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid,
+ u64 offset, int mod)
+{
+ int ret;
+ struct btrfs_key file_key;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+
+ file_key.objectid = objectid;
+ file_key.offset = offset;
+ btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+ ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
+ return ret;
+}
+
+
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u32 *dst)
+{
+ u32 sum;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int bio_index = 0;
+ u64 offset;
+ u64 item_start_offset = 0;
+ u64 item_last_offset = 0;
+ u64 disk_bytenr;
+ u32 diff;
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_csum_item *item = NULL;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+
+ path = btrfs_alloc_path();
+ if (bio->bi_size > PAGE_CACHE_SIZE * 8)
+ path->reada = 2;
+
+ WARN_ON(bio->bi_vcnt <= 0);
+
+ disk_bytenr = (u64)bio->bi_sector << 9;
+ while (bio_index < bio->bi_vcnt) {
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
+ if (ret == 0)
+ goto found;
+
+ if (!item || disk_bytenr < item_start_offset ||
+ disk_bytenr >= item_last_offset) {
+ struct btrfs_key found_key;
+ u32 item_size;
+
+ if (item)
+ btrfs_release_path(root, path);
+ item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
+ path, disk_bytenr, 0);
+ if (IS_ERR(item)) {
+ ret = PTR_ERR(item);
+ if (ret == -ENOENT || ret == -EFBIG)
+ ret = 0;
+ sum = 0;
+ if (BTRFS_I(inode)->root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ set_extent_bits(io_tree, offset,
+ offset + bvec->bv_len - 1,
+ EXTENT_NODATASUM, GFP_NOFS);
+ } else {
+ printk(KERN_INFO "btrfs no csum found "
+ "for inode %lu start %llu\n",
+ inode->i_ino,
+ (unsigned long long)offset);
+ }
+ item = NULL;
+ btrfs_release_path(root, path);
+ goto found;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+
+ item_start_offset = found_key.offset;
+ item_size = btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ item_last_offset = item_start_offset +
+ (item_size / csum_size) *
+ root->sectorsize;
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ }
+ /*
+ * this byte range must be able to fit inside
+ * a single leaf so it will also fit inside a u32
+ */
+ diff = disk_bytenr - item_start_offset;
+ diff = diff / root->sectorsize;
+ diff = diff * csum_size;
+
+ read_extent_buffer(path->nodes[0], &sum,
+ ((unsigned long)item) + diff,
+ csum_size);
+found:
+ if (dst)
+ *dst++ = sum;
+ else
+ set_state_private(io_tree, offset, sum);
+ disk_bytenr += bvec->bv_len;
+ bio_index++;
+ bvec++;
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_sector_sum *sector_sum;
+ struct btrfs_csum_item *item;
+ unsigned long offset;
+ int ret;
+ size_t size;
+ u64 csum_end;
+ u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.offset = start;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0 && path->slots[0] > 0) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+ key.type == BTRFS_EXTENT_CSUM_KEY) {
+ offset = (start - key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+ if (offset * csum_size <
+ btrfs_item_size_nr(leaf, path->slots[0] - 1))
+ path->slots[0]--;
+ }
+ }
+
+ while (start <= end) {
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto fail;
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ key.type != BTRFS_EXTENT_CSUM_KEY)
+ break;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.offset > end)
+ break;
+
+ if (key.offset > start)
+ start = key.offset;
+
+ size = btrfs_item_size_nr(leaf, path->slots[0]);
+ csum_end = key.offset + (size / csum_size) * root->sectorsize;
+ if (csum_end <= start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ csum_end = min(csum_end, end + 1);
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ while (start < csum_end) {
+ size = min_t(size_t, csum_end - start,
+ MAX_ORDERED_SUM_BYTES(root));
+ sums = kzalloc(btrfs_ordered_sum_size(root, size),
+ GFP_NOFS);
+ BUG_ON(!sums);
+
+ sector_sum = sums->sums;
+ sums->bytenr = start;
+ sums->len = size;
+
+ offset = (start - key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+ offset *= csum_size;
+
+ while (size > 0) {
+ read_extent_buffer(path->nodes[0],
+ &sector_sum->sum,
+ ((unsigned long)item) +
+ offset, csum_size);
+ sector_sum->bytenr = start;
+
+ size -= root->sectorsize;
+ start += root->sectorsize;
+ offset += csum_size;
+ sector_sum++;
+ }
+ list_add_tail(&sums->list, list);
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+fail:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 file_start, int contig)
+{
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_sector_sum *sector_sum;
+ struct btrfs_ordered_extent *ordered;
+ char *data;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int bio_index = 0;
+ unsigned long total_bytes = 0;
+ unsigned long this_sum_bytes = 0;
+ u64 offset;
+ u64 disk_bytenr;
+
+ WARN_ON(bio->bi_vcnt <= 0);
+ sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
+ if (!sums)
+ return -ENOMEM;
+
+ sector_sum = sums->sums;
+ disk_bytenr = (u64)bio->bi_sector << 9;
+ sums->len = bio->bi_size;
+ INIT_LIST_HEAD(&sums->list);
+
+ if (contig)
+ offset = file_start;
+ else
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
+ BUG_ON(!ordered);
+ sums->bytenr = ordered->start;
+
+ while (bio_index < bio->bi_vcnt) {
+ if (!contig)
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+ if (!contig && (offset >= ordered->file_offset + ordered->len ||
+ offset < ordered->file_offset)) {
+ unsigned long bytes_left;
+ sums->len = this_sum_bytes;
+ this_sum_bytes = 0;
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_put_ordered_extent(ordered);
+
+ bytes_left = bio->bi_size - total_bytes;
+
+ sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
+ GFP_NOFS);
+ BUG_ON(!sums);
+ sector_sum = sums->sums;
+ sums->len = bytes_left;
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
+ BUG_ON(!ordered);
+ sums->bytenr = ordered->start;
+ }
+
+ data = kmap_atomic(bvec->bv_page, KM_USER0);
+ sector_sum->sum = ~(u32)0;
+ sector_sum->sum = btrfs_csum_data(root,
+ data + bvec->bv_offset,
+ sector_sum->sum,
+ bvec->bv_len);
+ kunmap_atomic(data, KM_USER0);
+ btrfs_csum_final(sector_sum->sum,
+ (char *)&sector_sum->sum);
+ sector_sum->bytenr = disk_bytenr;
+
+ sector_sum++;
+ bio_index++;
+ total_bytes += bvec->bv_len;
+ this_sum_bytes += bvec->bv_len;
+ disk_bytenr += bvec->bv_len;
+ offset += bvec->bv_len;
+ bvec++;
+ }
+ this_sum_bytes = 0;
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_put_ordered_extent(ordered);
+ return 0;
+}
+
+/*
+ * helper function for csum removal, this expects the
+ * key to describe the csum pointed to by the path, and it expects
+ * the csum to overlap the range [bytenr, len]
+ *
+ * The csum should not be entirely contained in the range and the
+ * range should not be entirely contained in the csum.
+ *
+ * This calls btrfs_truncate_item with the correct args based on the
+ * overlap, and fixes up the key as required.
+ */
+static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *key,
+ u64 bytenr, u64 len)
+{
+ struct extent_buffer *leaf;
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+ u64 csum_end;
+ u64 end_byte = bytenr + len;
+ u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+ int ret;
+
+ leaf = path->nodes[0];
+ csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+ csum_end <<= root->fs_info->sb->s_blocksize_bits;
+ csum_end += key->offset;
+
+ if (key->offset < bytenr && csum_end <= end_byte) {
+ /*
+ * [ bytenr - len ]
+ * [ ]
+ * [csum ]
+ * A simple truncate off the end of the item
+ */
+ u32 new_size = (bytenr - key->offset) >> blocksize_bits;
+ new_size *= csum_size;
+ ret = btrfs_truncate_item(trans, root, path, new_size, 1);
+ BUG_ON(ret);
+ } else if (key->offset >= bytenr && csum_end > end_byte &&
+ end_byte > key->offset) {
+ /*
+ * [ bytenr - len ]
+ * [ ]
+ * [csum ]
+ * we need to truncate from the beginning of the csum
+ */
+ u32 new_size = (csum_end - end_byte) >> blocksize_bits;
+ new_size *= csum_size;
+
+ ret = btrfs_truncate_item(trans, root, path, new_size, 0);
+ BUG_ON(ret);
+
+ key->offset = end_byte;
+ ret = btrfs_set_item_key_safe(trans, root, path, key);
+ BUG_ON(ret);
+ } else {
+ BUG();
+ }
+ return 0;
+}
+
+/*
+ * deletes the csum items from the csum tree for a given
+ * range of bytes.
+ */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr, u64 len)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 end_byte = bytenr + len;
+ u64 csum_end;
+ struct extent_buffer *leaf;
+ int ret;
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+ int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+
+ root = root->fs_info->csum_root;
+
+ path = btrfs_alloc_path();
+
+ while (1) {
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.offset = end_byte - 1;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ key.type != BTRFS_EXTENT_CSUM_KEY) {
+ break;
+ }
+
+ if (key.offset >= end_byte)
+ break;
+
+ csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+ csum_end <<= blocksize_bits;
+ csum_end += key.offset;
+
+ /* this csum ends before we start, we're done */
+ if (csum_end <= bytenr)
+ break;
+
+ /* delete the entire item, it is inside our range */
+ if (key.offset >= bytenr && csum_end <= end_byte) {
+ ret = btrfs_del_item(trans, root, path);
+ BUG_ON(ret);
+ if (key.offset == bytenr)
+ break;
+ } else if (key.offset < bytenr && csum_end > end_byte) {
+ unsigned long offset;
+ unsigned long shift_len;
+ unsigned long item_offset;
+ /*
+ * [ bytenr - len ]
+ * [csum ]
+ *
+ * Our bytes are in the middle of the csum,
+ * we need to split this item and insert a new one.
+ *
+ * But we can't drop the path because the
+ * csum could change, get removed, extended etc.
+ *
+ * The trick here is the max size of a csum item leaves
+ * enough room in the tree block for a single
+ * item header. So, we split the item in place,
+ * adding a new header pointing to the existing
+ * bytes. Then we loop around again and we have
+ * a nicely formed csum item that we can neatly
+ * truncate.
+ */
+ offset = (bytenr - key.offset) >> blocksize_bits;
+ offset *= csum_size;
+
+ shift_len = (len >> blocksize_bits) * csum_size;
+
+ item_offset = btrfs_item_ptr_offset(leaf,
+ path->slots[0]);
+
+ memset_extent_buffer(leaf, 0, item_offset + offset,
+ shift_len);
+ key.offset = bytenr;
+
+ /*
+ * btrfs_split_item returns -EAGAIN when the
+ * item changed size or key
+ */
+ ret = btrfs_split_item(trans, root, path, &key, offset);
+ BUG_ON(ret && ret != -EAGAIN);
+
+ key.offset = end_byte - 1;
+ } else {
+ ret = truncate_one_csum(trans, root, path,
+ &key, bytenr, len);
+ BUG_ON(ret);
+ if (key.offset < bytenr)
+ break;
+ }
+ btrfs_release_path(root, path);
+ }
+out:
+ btrfs_free_path(path);
+ return 0;
+}
+
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_ordered_sum *sums)
+{
+ u64 bytenr;
+ int ret;
+ struct btrfs_key file_key;
+ struct btrfs_key found_key;
+ u64 next_offset;
+ u64 total_bytes = 0;
+ int found_next;
+ struct btrfs_path *path;
+ struct btrfs_csum_item *item;
+ struct btrfs_csum_item *item_end;
+ struct extent_buffer *leaf = NULL;
+ u64 csum_offset;
+ struct btrfs_sector_sum *sector_sum;
+ u32 nritems;
+ u32 ins_size;
+ char *eb_map;
+ char *eb_token;
+ unsigned long map_len;
+ unsigned long map_start;
+ u16 csum_size =
+ btrfs_super_csum_size(&root->fs_info->super_copy);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ sector_sum = sums->sums;
+again:
+ next_offset = (u64)-1;
+ found_next = 0;
+ file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ file_key.offset = sector_sum->bytenr;
+ bytenr = sector_sum->bytenr;
+ btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+
+ item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
+ if (!IS_ERR(item)) {
+ leaf = path->nodes[0];
+ ret = 0;
+ goto found;
+ }
+ ret = PTR_ERR(item);
+ if (ret == -EFBIG) {
+ u32 item_size;
+ /* we found one, but it isn't big enough yet */
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if ((item_size / csum_size) >=
+ MAX_CSUM_ITEMS(root, csum_size)) {
+ /* already at max size, make a new one */
+ goto insert;
+ }
+ } else {
+ int slot = path->slots[0] + 1;
+ /* we didn't find a csum item, insert one */
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems - 1) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 1)
+ found_next = 1;
+ if (ret != 0)
+ goto insert;
+ slot = 0;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ found_key.type != BTRFS_EXTENT_CSUM_KEY) {
+ found_next = 1;
+ goto insert;
+ }
+ next_offset = found_key.offset;
+ found_next = 1;
+ goto insert;
+ }
+
+ /*
+ * at this point, we know the tree has an item, but it isn't big
+ * enough yet to put our csum in. Grow it
+ */
+ btrfs_release_path(root, path);
+ ret = btrfs_search_slot(trans, root, &file_key, path,
+ csum_size, 1);
+ if (ret < 0)
+ goto fail_unlock;
+
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ goto insert;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ csum_offset = (bytenr - found_key.offset) >>
+ root->fs_info->sb->s_blocksize_bits;
+
+ if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
+ found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
+ goto insert;
+ }
+
+ if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
+ csum_size) {
+ u32 diff = (csum_offset + 1) * csum_size;
+
+ /*
+ * is the item big enough already? we dropped our lock
+ * before and need to recheck
+ */
+ if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
+ goto csum;
+
+ diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
+ if (diff != csum_size)
+ goto insert;
+
+ ret = btrfs_extend_item(trans, root, path, diff);
+ BUG_ON(ret);
+ goto csum;
+ }
+
+insert:
+ btrfs_release_path(root, path);
+ csum_offset = 0;
+ if (found_next) {
+ u64 tmp = total_bytes + root->sectorsize;
+ u64 next_sector = sector_sum->bytenr;
+ struct btrfs_sector_sum *next = sector_sum + 1;
+
+ while (tmp < sums->len) {
+ if (next_sector + root->sectorsize != next->bytenr)
+ break;
+ tmp += root->sectorsize;
+ next_sector = next->bytenr;
+ next++;
+ }
+ tmp = min(tmp, next_offset - file_key.offset);
+ tmp >>= root->fs_info->sb->s_blocksize_bits;
+ tmp = max((u64)1, tmp);
+ tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
+ ins_size = csum_size * tmp;
+ } else {
+ ins_size = csum_size;
+ }
+ ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+ ins_size);
+ if (ret < 0)
+ goto fail_unlock;
+ if (ret != 0) {
+ WARN_ON(1);
+ goto fail_unlock;
+ }
+csum:
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+ ret = 0;
+ item = (struct btrfs_csum_item *)((unsigned char *)item +
+ csum_offset * csum_size);
+found:
+ item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+ item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
+ btrfs_item_size_nr(leaf, path->slots[0]));
+ eb_token = NULL;
+ cond_resched();
+next_sector:
+
+ if (!eb_token ||
+ (unsigned long)item + csum_size >= map_start + map_len) {
+ int err;
+
+ if (eb_token)
+ unmap_extent_buffer(leaf, eb_token, KM_USER1);
+ eb_token = NULL;
+ err = map_private_extent_buffer(leaf, (unsigned long)item,
+ csum_size,
+ &eb_token, &eb_map,
+ &map_start, &map_len, KM_USER1);
+ if (err)
+ eb_token = NULL;
+ }
+ if (eb_token) {
+ memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
+ &sector_sum->sum, csum_size);
+ } else {
+ write_extent_buffer(leaf, &sector_sum->sum,
+ (unsigned long)item, csum_size);
+ }
+
+ total_bytes += root->sectorsize;
+ sector_sum++;
+ if (total_bytes < sums->len) {
+ item = (struct btrfs_csum_item *)((char *)item +
+ csum_size);
+ if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
+ sector_sum->bytenr) {
+ bytenr = sector_sum->bytenr;
+ goto next_sector;
+ }
+ }
+ if (eb_token) {
+ unmap_extent_buffer(leaf, eb_token, KM_USER1);
+ eb_token = NULL;
+ }
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ cond_resched();
+ if (total_bytes < sums->len) {
+ btrfs_release_path(root, path);
+ goto again;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+
+fail_unlock:
+ goto out;
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 00000000000..90268334145
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1288 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "tree-log.h"
+#include "locking.h"
+#include "compat.h"
+
+
+/* simple helper to fault in pages and copy. This should go away
+ * and be replaced with calls into generic code.
+ */
+static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
+ int write_bytes,
+ struct page **prepared_pages,
+ const char __user *buf)
+{
+ long page_fault = 0;
+ int i;
+ int offset = pos & (PAGE_CACHE_SIZE - 1);
+
+ for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+ size_t count = min_t(size_t,
+ PAGE_CACHE_SIZE - offset, write_bytes);
+ struct page *page = prepared_pages[i];
+ fault_in_pages_readable(buf, count);
+
+ /* Copy data from userspace to the current page */
+ kmap(page);
+ page_fault = __copy_from_user(page_address(page) + offset,
+ buf, count);
+ /* Flush processor's dcache for this page */
+ flush_dcache_page(page);
+ kunmap(page);
+ buf += count;
+ write_bytes -= count;
+
+ if (page_fault)
+ break;
+ }
+ return page_fault ? -EFAULT : 0;
+}
+
+/*
+ * unlocks pages after btrfs_file_write is done with them
+ */
+static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
+{
+ size_t i;
+ for (i = 0; i < num_pages; i++) {
+ if (!pages[i])
+ break;
+ /* page checked is some magic around finding pages that
+ * have been modified without going through btrfs_set_page_dirty
+ * clear it here
+ */
+ ClearPageChecked(pages[i]);
+ unlock_page(pages[i]);
+ mark_page_accessed(pages[i]);
+ page_cache_release(pages[i]);
+ }
+}
+
+/*
+ * after copy_from_user, pages need to be dirtied and we need to make
+ * sure holes are created between the current EOF and the start of
+ * any next extents (if required).
+ *
+ * this also makes the decision about creating an inline extent vs
+ * doing real data extents, marking pages dirty and delalloc as required.
+ */
+static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct file *file,
+ struct page **pages,
+ size_t num_pages,
+ loff_t pos,
+ size_t write_bytes)
+{
+ int err = 0;
+ int i;
+ struct inode *inode = fdentry(file)->d_inode;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ u64 hint_byte;
+ u64 num_bytes;
+ u64 start_pos;
+ u64 end_of_last_block;
+ u64 end_pos = pos + write_bytes;
+ loff_t isize = i_size_read(inode);
+
+ start_pos = pos & ~((u64)root->sectorsize - 1);
+ num_bytes = (write_bytes + pos - start_pos +
+ root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+ end_of_last_block = start_pos + num_bytes - 1;
+
+ lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
+ trans = btrfs_join_transaction(root, 1);
+ if (!trans) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ btrfs_set_trans_block_group(trans, inode);
+ hint_byte = 0;
+
+ set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
+
+ /* check for reserved extents on each page, we don't want
+ * to reset the delalloc bit on things that already have
+ * extents reserved.
+ */
+ btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+ for (i = 0; i < num_pages; i++) {
+ struct page *p = pages[i];
+ SetPageUptodate(p);
+ ClearPageChecked(p);
+ set_page_dirty(p);
+ }
+ if (end_pos > isize) {
+ i_size_write(inode, end_pos);
+ btrfs_update_inode(trans, root, inode);
+ }
+ err = btrfs_end_transaction(trans, root);
+out_unlock:
+ unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
+ return err;
+}
+
+/*
+ * this drops all the extents in the cache that intersect the range
+ * [start, end]. Existing extents are split as required.
+ */
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned)
+{
+ struct extent_map *em;
+ struct extent_map *split = NULL;
+ struct extent_map *split2 = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 len = end - start + 1;
+ int ret;
+ int testend = 1;
+ unsigned long flags;
+ int compressed = 0;
+
+ WARN_ON(end < start);
+ if (end == (u64)-1) {
+ len = (u64)-1;
+ testend = 0;
+ }
+ while (1) {
+ if (!split)
+ split = alloc_extent_map(GFP_NOFS);
+ if (!split2)
+ split2 = alloc_extent_map(GFP_NOFS);
+
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+ spin_unlock(&em_tree->lock);
+ break;
+ }
+ flags = em->flags;
+ if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ spin_unlock(&em_tree->lock);
+ if (em->start <= start &&
+ (!testend || em->start + em->len >= start + len)) {
+ free_extent_map(em);
+ break;
+ }
+ if (start < em->start) {
+ len = em->start - start;
+ } else {
+ len = start + len - (em->start + em->len);
+ start = em->start + em->len;
+ }
+ free_extent_map(em);
+ continue;
+ }
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ remove_extent_mapping(em_tree, em);
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ em->start < start) {
+ split->start = em->start;
+ split->len = start - em->start;
+ split->orig_start = em->orig_start;
+ split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+
+ split->bdev = em->bdev;
+ split->flags = flags;
+ ret = add_extent_mapping(em_tree, split);
+ BUG_ON(ret);
+ free_extent_map(split);
+ split = split2;
+ split2 = NULL;
+ }
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ testend && em->start + em->len > start + len) {
+ u64 diff = start + len - em->start;
+
+ split->start = start + len;
+ split->len = em->start + em->len - (start + len);
+ split->bdev = em->bdev;
+ split->flags = flags;
+
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->block_start = em->block_start;
+ split->orig_start = em->orig_start;
+ } else {
+ split->block_len = split->len;
+ split->block_start = em->block_start + diff;
+ split->orig_start = split->start;
+ }
+
+ ret = add_extent_mapping(em_tree, split);
+ BUG_ON(ret);
+ free_extent_map(split);
+ split = NULL;
+ }
+ spin_unlock(&em_tree->lock);
+
+ /* once for us */
+ free_extent_map(em);
+ /* once for the tree*/
+ free_extent_map(em);
+ }
+ if (split)
+ free_extent_map(split);
+ if (split2)
+ free_extent_map(split2);
+ return 0;
+}
+
+int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
+{
+ return 0;
+#if 0
+ struct btrfs_path *path;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *extent;
+ u64 last_offset = 0;
+ int nritems;
+ int slot;
+ int found_type;
+ int ret;
+ int err = 0;
+ u64 extent_end = 0;
+
+ path = btrfs_alloc_path();
+ ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
+ last_offset, 0);
+ while (1) {
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ goto out;
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ }
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid != inode->i_ino)
+ break;
+ if (found_key.type != BTRFS_EXTENT_DATA_KEY)
+ goto out;
+
+ if (found_key.offset < last_offset) {
+ WARN_ON(1);
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_ERR "inode %lu found offset %llu "
+ "expected %llu\n", inode->i_ino,
+ (unsigned long long)found_key.offset,
+ (unsigned long long)last_offset);
+ err = 1;
+ goto out;
+ }
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, extent);
+ if (found_type == BTRFS_FILE_EXTENT_REG) {
+ extent_end = found_key.offset +
+ btrfs_file_extent_num_bytes(leaf, extent);
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ struct btrfs_item *item;
+ item = btrfs_item_nr(leaf, slot);
+ extent_end = found_key.offset +
+ btrfs_file_extent_inline_len(leaf, extent);
+ extent_end = (extent_end + root->sectorsize - 1) &
+ ~((u64)root->sectorsize - 1);
+ }
+ last_offset = extent_end;
+ path->slots[0]++;
+ }
+ if (0 && last_offset < inode->i_size) {
+ WARN_ON(1);
+ btrfs_print_leaf(root, leaf);
+ printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
+ inode->i_ino, (unsigned long long)last_offset,
+ (unsigned long long)inode->i_size);
+ err = 1;
+
+ }
+out:
+ btrfs_free_path(path);
+ return err;
+#endif
+}
+
+/*
+ * this is very complex, but the basic idea is to drop all extents
+ * in the range start - end. hint_block is filled in with a block number
+ * that would be a good hint to the block allocator for this file.
+ *
+ * If an extent intersects the range but is not entirely inside the range
+ * it is either truncated or split. Anything entirely inside the range
+ * is deleted from the tree.
+ *
+ * inline_limit is used to tell this code which offsets in the file to keep
+ * if they contain inline extents.
+ */
+noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
+{
+ u64 extent_end = 0;
+ u64 locked_end = end;
+ u64 search_start = start;
+ u64 leaf_start;
+ u64 ram_bytes = 0;
+ u64 orig_parent = 0;
+ u64 disk_bytenr = 0;
+ u8 compression;
+ u8 encryption;
+ u16 other_encoding = 0;
+ u64 root_gen;
+ u64 root_owner;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item old;
+ int keep;
+ int slot;
+ int bookend;
+ int found_type = 0;
+ int found_extent;
+ int found_inline;
+ int recow;
+ int ret;
+
+ inline_limit = 0;
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ while (1) {
+ recow = 0;
+ btrfs_release_path(root, path);
+ ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ search_start, -1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ if (path->slots[0] == 0) {
+ ret = 0;
+ goto out;
+ }
+ path->slots[0]--;
+ }
+next_slot:
+ keep = 0;
+ bookend = 0;
+ found_extent = 0;
+ found_inline = 0;
+ leaf_start = 0;
+ root_gen = 0;
+ root_owner = 0;
+ compression = 0;
+ encryption = 0;
+ extent = NULL;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ ret = 0;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
+ key.offset >= end) {
+ goto out;
+ }
+ if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
+ key.objectid != inode->i_ino) {
+ goto out;
+ }
+ if (recow) {
+ search_start = max(key.offset, start);
+ continue;
+ }
+ if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, extent);
+ compression = btrfs_file_extent_compression(leaf,
+ extent);
+ encryption = btrfs_file_extent_encryption(leaf,
+ extent);
+ other_encoding = btrfs_file_extent_other_encoding(leaf,
+ extent);
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ extent_end =
+ btrfs_file_extent_disk_bytenr(leaf,
+ extent);
+ if (extent_end)
+ *hint_byte = extent_end;
+
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, extent);
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf,
+ extent);
+ found_extent = 1;
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ found_inline = 1;
+ extent_end = key.offset +
+ btrfs_file_extent_inline_len(leaf, extent);
+ }
+ } else {
+ extent_end = search_start;
+ }
+
+ /* we found nothing we can drop */
+ if ((!found_extent && !found_inline) ||
+ search_start >= extent_end) {
+ int nextret;
+ u32 nritems;
+ nritems = btrfs_header_nritems(leaf);
+ if (slot >= nritems - 1) {
+ nextret = btrfs_next_leaf(root, path);
+ if (nextret)
+ goto out;
+ recow = 1;
+ } else {
+ path->slots[0]++;
+ }
+ goto next_slot;
+ }
+
+ if (end <= extent_end && start >= key.offset && found_inline)
+ *hint_byte = EXTENT_MAP_INLINE;
+
+ if (found_extent) {
+ read_extent_buffer(leaf, &old, (unsigned long)extent,
+ sizeof(old));
+ root_gen = btrfs_header_generation(leaf);
+ root_owner = btrfs_header_owner(leaf);
+ leaf_start = leaf->start;
+ }
+
+ if (end < extent_end && end >= key.offset) {
+ bookend = 1;
+ if (found_inline && start <= key.offset)
+ keep = 1;
+ }
+
+ if (bookend && found_extent) {
+ if (locked_end < extent_end) {
+ ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+ locked_end, extent_end - 1,
+ GFP_NOFS);
+ if (!ret) {
+ btrfs_release_path(root, path);
+ lock_extent(&BTRFS_I(inode)->io_tree,
+ locked_end, extent_end - 1,
+ GFP_NOFS);
+ locked_end = extent_end;
+ continue;
+ }
+ locked_end = extent_end;
+ }
+ orig_parent = path->nodes[0]->start;
+ disk_bytenr = le64_to_cpu(old.disk_bytenr);
+ if (disk_bytenr != 0) {
+ ret = btrfs_inc_extent_ref(trans, root,
+ disk_bytenr,
+ le64_to_cpu(old.disk_num_bytes),
+ orig_parent, root->root_key.objectid,
+ trans->transid, inode->i_ino);
+ BUG_ON(ret);
+ }
+ }
+
+ if (found_inline) {
+ u64 mask = root->sectorsize - 1;
+ search_start = (extent_end + mask) & ~mask;
+ } else
+ search_start = extent_end;
+
+ /* truncate existing extent */
+ if (start > key.offset) {
+ u64 new_num;
+ u64 old_num;
+ keep = 1;
+ WARN_ON(start & (root->sectorsize - 1));
+ if (found_extent) {
+ new_num = start - key.offset;
+ old_num = btrfs_file_extent_num_bytes(leaf,
+ extent);
+ *hint_byte =
+ btrfs_file_extent_disk_bytenr(leaf,
+ extent);
+ if (btrfs_file_extent_disk_bytenr(leaf,
+ extent)) {
+ inode_sub_bytes(inode, old_num -
+ new_num);
+ }
+ btrfs_set_file_extent_num_bytes(leaf,
+ extent, new_num);
+ btrfs_mark_buffer_dirty(leaf);
+ } else if (key.offset < inline_limit &&
+ (end > extent_end) &&
+ (inline_limit < extent_end)) {
+ u32 new_size;
+ new_size = btrfs_file_extent_calc_inline_size(
+ inline_limit - key.offset);
+ inode_sub_bytes(inode, extent_end -
+ inline_limit);
+ btrfs_set_file_extent_ram_bytes(leaf, extent,
+ new_size);
+ if (!compression && !encryption) {
+ btrfs_truncate_item(trans, root, path,
+ new_size, 1);
+ }
+ }
+ }
+ /* delete the entire extent */
+ if (!keep) {
+ if (found_inline)
+ inode_sub_bytes(inode, extent_end -
+ key.offset);
+ ret = btrfs_del_item(trans, root, path);
+ /* TODO update progress marker and return */
+ BUG_ON(ret);
+ extent = NULL;
+ btrfs_release_path(root, path);
+ /* the extent will be freed later */
+ }
+ if (bookend && found_inline && start <= key.offset) {
+ u32 new_size;
+ new_size = btrfs_file_extent_calc_inline_size(
+ extent_end - end);
+ inode_sub_bytes(inode, end - key.offset);
+ btrfs_set_file_extent_ram_bytes(leaf, extent,
+ new_size);
+ if (!compression && !encryption)
+ ret = btrfs_truncate_item(trans, root, path,
+ new_size, 0);
+ BUG_ON(ret);
+ }
+ /* create bookend, splitting the extent in two */
+ if (bookend && found_extent) {
+ struct btrfs_key ins;
+ ins.objectid = inode->i_ino;
+ ins.offset = end;
+ btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
+
+ btrfs_release_path(root, path);
+ ret = btrfs_insert_empty_item(trans, root, path, &ins,
+ sizeof(*extent));
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ write_extent_buffer(leaf, &old,
+ (unsigned long)extent, sizeof(old));
+
+ btrfs_set_file_extent_compression(leaf, extent,
+ compression);
+ btrfs_set_file_extent_encryption(leaf, extent,
+ encryption);
+ btrfs_set_file_extent_other_encoding(leaf, extent,
+ other_encoding);
+ btrfs_set_file_extent_offset(leaf, extent,
+ le64_to_cpu(old.offset) + end - key.offset);
+ WARN_ON(le64_to_cpu(old.num_bytes) <
+ (extent_end - end));
+ btrfs_set_file_extent_num_bytes(leaf, extent,
+ extent_end - end);
+
+ /*
+ * set the ram bytes to the size of the full extent
+ * before splitting. This is a worst case flag,
+ * but its the best we can do because we don't know
+ * how splitting affects compression
+ */
+ btrfs_set_file_extent_ram_bytes(leaf, extent,
+ ram_bytes);
+ btrfs_set_file_extent_type(leaf, extent, found_type);
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ if (disk_bytenr != 0) {
+ ret = btrfs_update_extent_ref(trans, root,
+ disk_bytenr, orig_parent,
+ leaf->start,
+ root->root_key.objectid,
+ trans->transid, ins.objectid);
+
+ BUG_ON(ret);
+ }
+ btrfs_release_path(root, path);
+ if (disk_bytenr != 0)
+ inode_add_bytes(inode, extent_end - end);
+ }
+
+ if (found_extent && !keep) {
+ u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
+
+ if (old_disk_bytenr != 0) {
+ inode_sub_bytes(inode,
+ le64_to_cpu(old.num_bytes));
+ ret = btrfs_free_extent(trans, root,
+ old_disk_bytenr,
+ le64_to_cpu(old.disk_num_bytes),
+ leaf_start, root_owner,
+ root_gen, key.objectid, 0);
+ BUG_ON(ret);
+ *hint_byte = old_disk_bytenr;
+ }
+ }
+
+ if (search_start >= end) {
+ ret = 0;
+ goto out;
+ }
+ }
+out:
+ btrfs_free_path(path);
+ if (locked_end > end) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
+ GFP_NOFS);
+ }
+ btrfs_check_file(root, inode);
+ return ret;
+}
+
+static int extent_mergeable(struct extent_buffer *leaf, int slot,
+ u64 objectid, u64 bytenr, u64 *start, u64 *end)
+{
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 extent_end;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+ return 0;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
+ btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+ btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ return 0;
+
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if ((*start && *start != key.offset) || (*end && *end != extent_end))
+ return 0;
+
+ *start = key.offset;
+ *end = extent_end;
+ return 1;
+}
+
+/*
+ * Mark extent in the range start - end as written.
+ *
+ * This changes extent type from 'pre-allocated' to 'regular'. If only
+ * part of extent is marked as written, the extent will be split into
+ * two or three.
+ */
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode, u64 start, u64 end)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 extent_end;
+ u64 extent_offset;
+ u64 other_start;
+ u64 other_end;
+ u64 split = start;
+ u64 locked_end = end;
+ u64 orig_parent;
+ int extent_type;
+ int split_end = 1;
+ int ret;
+
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+again:
+ key.objectid = inode->i_ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ if (split == start)
+ key.offset = split;
+ else
+ key.offset = split - 1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0 && path->slots[0] > 0)
+ path->slots[0]--;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ BUG_ON(key.objectid != inode->i_ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+ BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ BUG_ON(key.offset > start || extent_end < end);
+
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ extent_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (key.offset == start)
+ split = end;
+
+ if (key.offset == start && extent_end == end) {
+ int del_nr = 0;
+ int del_slot = 0;
+ u64 leaf_owner = btrfs_header_owner(leaf);
+ u64 leaf_gen = btrfs_header_generation(leaf);
+ other_start = end;
+ other_end = 0;
+ if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+ bytenr, &other_start, &other_end)) {
+ extent_end = other_end;
+ del_slot = path->slots[0] + 1;
+ del_nr++;
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ leaf->start, leaf_owner,
+ leaf_gen, inode->i_ino, 0);
+ BUG_ON(ret);
+ }
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
+ bytenr, &other_start, &other_end)) {
+ key.offset = other_start;
+ del_slot = path->slots[0];
+ del_nr++;
+ ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ leaf->start, leaf_owner,
+ leaf_gen, inode->i_ino, 0);
+ BUG_ON(ret);
+ }
+ split_end = 0;
+ if (del_nr == 0) {
+ btrfs_set_file_extent_type(leaf, fi,
+ BTRFS_FILE_EXTENT_REG);
+ goto done;
+ }
+
+ fi = btrfs_item_ptr(leaf, del_slot - 1,
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_end - key.offset);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+ BUG_ON(ret);
+ goto done;
+ } else if (split == start) {
+ if (locked_end < extent_end) {
+ ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+ locked_end, extent_end - 1, GFP_NOFS);
+ if (!ret) {
+ btrfs_release_path(root, path);
+ lock_extent(&BTRFS_I(inode)->io_tree,
+ locked_end, extent_end - 1, GFP_NOFS);
+ locked_end = extent_end;
+ goto again;
+ }
+ locked_end = extent_end;
+ }
+ btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
+ extent_offset += split - key.offset;
+ } else {
+ BUG_ON(key.offset != start);
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset +
+ split - key.offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
+ key.offset = split;
+ btrfs_set_item_key_safe(trans, root, path, &key);
+ extent_end = split;
+ }
+
+ if (extent_end == end) {
+ split_end = 0;
+ extent_type = BTRFS_FILE_EXTENT_REG;
+ }
+ if (extent_end == end && split == start) {
+ other_start = end;
+ other_end = 0;
+ if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+ bytenr, &other_start, &other_end)) {
+ path->slots[0]++;
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ key.offset = split;
+ btrfs_set_item_key_safe(trans, root, path, &key);
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ other_end - split);
+ goto done;
+ }
+ }
+ if (extent_end == end && split == end) {
+ other_start = 0;
+ other_end = start;
+ if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
+ bytenr, &other_start, &other_end)) {
+ path->slots[0]--;
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
+ other_start);
+ goto done;
+ }
+ }
+
+ btrfs_mark_buffer_dirty(leaf);
+
+ orig_parent = leaf->start;
+ ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
+ orig_parent, root->root_key.objectid,
+ trans->transid, inode->i_ino);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ key.offset = start;
+ ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_set_file_extent_type(leaf, fi, extent_type);
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_compression(leaf, fi, 0);
+ btrfs_set_file_extent_encryption(leaf, fi, 0);
+ btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+
+ if (orig_parent != leaf->start) {
+ ret = btrfs_update_extent_ref(trans, root, bytenr,
+ orig_parent, leaf->start,
+ root->root_key.objectid,
+ trans->transid, inode->i_ino);
+ BUG_ON(ret);
+ }
+done:
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(root, path);
+ if (split_end && split == start) {
+ split = end;
+ goto again;
+ }
+ if (locked_end > end) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
+ GFP_NOFS);
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
+/*
+ * this gets pages into the page cache and locks them down, it also properly
+ * waits for data=ordered extents to finish before allowing the pages to be
+ * modified.
+ */
+static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
+ struct page **pages, size_t num_pages,
+ loff_t pos, unsigned long first_index,
+ unsigned long last_index, size_t write_bytes)
+{
+ int i;
+ unsigned long index = pos >> PAGE_CACHE_SHIFT;
+ struct inode *inode = fdentry(file)->d_inode;
+ int err = 0;
+ u64 start_pos;
+ u64 last_pos;
+
+ start_pos = pos & ~((u64)root->sectorsize - 1);
+ last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
+
+ if (start_pos > inode->i_size) {
+ err = btrfs_cont_expand(inode, start_pos);
+ if (err)
+ return err;
+ }
+
+ memset(pages, 0, num_pages * sizeof(struct page *));
+again:
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = grab_cache_page(inode->i_mapping, index + i);
+ if (!pages[i]) {
+ err = -ENOMEM;
+ BUG_ON(1);
+ }
+ wait_on_page_writeback(pages[i]);
+ }
+ if (start_pos < inode->i_size) {
+ struct btrfs_ordered_extent *ordered;
+ lock_extent(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1, GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ last_pos - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > start_pos &&
+ ordered->file_offset < last_pos) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1, GFP_NOFS);
+ for (i = 0; i < num_pages; i++) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ btrfs_wait_ordered_range(inode, start_pos,
+ last_pos - start_pos);
+ goto again;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
+ last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
+ GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ start_pos, last_pos - 1, GFP_NOFS);
+ }
+ for (i = 0; i < num_pages; i++) {
+ clear_page_dirty_for_io(pages[i]);
+ set_page_extent_mapped(pages[i]);
+ WARN_ON(!PageLocked(pages[i]));
+ }
+ return 0;
+}
+
+static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ loff_t pos;
+ loff_t start_pos;
+ ssize_t num_written = 0;
+ ssize_t err = 0;
+ int ret = 0;
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct page **pages = NULL;
+ int nrptrs;
+ struct page *pinned[2];
+ unsigned long first_index;
+ unsigned long last_index;
+ int will_write;
+
+ will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
+ (file->f_flags & O_DIRECT));
+
+ nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
+ PAGE_CACHE_SIZE / (sizeof(struct page *)));
+ pinned[0] = NULL;
+ pinned[1] = NULL;
+
+ pos = *ppos;
+ start_pos = pos;
+
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err)
+ goto out_nolock;
+ if (count == 0)
+ goto out_nolock;
+
+ err = file_remove_suid(file);
+ if (err)
+ goto out_nolock;
+ file_update_time(file);
+
+ pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+
+ mutex_lock(&inode->i_mutex);
+ BTRFS_I(inode)->sequence++;
+ first_index = pos >> PAGE_CACHE_SHIFT;
+ last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+
+ /*
+ * there are lots of better ways to do this, but this code
+ * makes sure the first and last page in the file range are
+ * up to date and ready for cow
+ */
+ if ((pos & (PAGE_CACHE_SIZE - 1))) {
+ pinned[0] = grab_cache_page(inode->i_mapping, first_index);
+ if (!PageUptodate(pinned[0])) {
+ ret = btrfs_readpage(NULL, pinned[0]);
+ BUG_ON(ret);
+ wait_on_page_locked(pinned[0]);
+ } else {
+ unlock_page(pinned[0]);
+ }
+ }
+ if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+ pinned[1] = grab_cache_page(inode->i_mapping, last_index);
+ if (!PageUptodate(pinned[1])) {
+ ret = btrfs_readpage(NULL, pinned[1]);
+ BUG_ON(ret);
+ wait_on_page_locked(pinned[1]);
+ } else {
+ unlock_page(pinned[1]);
+ }
+ }
+
+ while (count > 0) {
+ size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+ size_t write_bytes = min(count, nrptrs *
+ (size_t)PAGE_CACHE_SIZE -
+ offset);
+ size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ WARN_ON(num_pages > nrptrs);
+ memset(pages, 0, sizeof(struct page *) * nrptrs);
+
+ ret = btrfs_check_free_space(root, write_bytes, 0);
+ if (ret)
+ goto out;
+
+ ret = prepare_pages(root, file, pages, num_pages,
+ pos, first_index, last_index,
+ write_bytes);
+ if (ret)
+ goto out;
+
+ ret = btrfs_copy_from_user(pos, num_pages,
+ write_bytes, pages, buf);
+ if (ret) {
+ btrfs_drop_pages(pages, num_pages);
+ goto out;
+ }
+
+ ret = dirty_and_release_pages(NULL, root, file, pages,
+ num_pages, pos, write_bytes);
+ btrfs_drop_pages(pages, num_pages);
+ if (ret)
+ goto out;
+
+ if (will_write) {
+ btrfs_fdatawrite_range(inode->i_mapping, pos,
+ pos + write_bytes - 1,
+ WB_SYNC_NONE);
+ } else {
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ num_pages);
+ if (num_pages <
+ (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+ btrfs_btree_balance_dirty(root, 1);
+ btrfs_throttle(root);
+ }
+
+ buf += write_bytes;
+ count -= write_bytes;
+ pos += write_bytes;
+ num_written += write_bytes;
+
+ cond_resched();
+ }
+out:
+ mutex_unlock(&inode->i_mutex);
+
+out_nolock:
+ kfree(pages);
+ if (pinned[0])
+ page_cache_release(pinned[0]);
+ if (pinned[1])
+ page_cache_release(pinned[1]);
+ *ppos = pos;
+
+ if (num_written > 0 && will_write) {
+ struct btrfs_trans_handle *trans;
+
+ err = btrfs_wait_ordered_range(inode, start_pos, num_written);
+ if (err)
+ num_written = err;
+
+ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_log_dentry_safe(trans, root,
+ file->f_dentry);
+ if (ret == 0) {
+ btrfs_sync_log(trans, root);
+ btrfs_end_transaction(trans, root);
+ } else {
+ btrfs_commit_transaction(trans, root);
+ }
+ }
+ if (file->f_flags & O_DIRECT) {
+ invalidate_mapping_pages(inode->i_mapping,
+ start_pos >> PAGE_CACHE_SHIFT,
+ (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
+ }
+ }
+ current->backing_dev_info = NULL;
+ return num_written ? num_written : err;
+}
+
+int btrfs_release_file(struct inode *inode, struct file *filp)
+{
+ if (filp->private_data)
+ btrfs_ioctl_trans_end(filp);
+ return 0;
+}
+
+/*
+ * fsync call for both files and directories. This logs the inode into
+ * the tree log instead of forcing full commits whenever possible.
+ *
+ * It needs to call filemap_fdatawait so that all ordered extent updates are
+ * in the metadata btree are up to date for copying to the log.
+ *
+ * It drops the inode mutex before doing the tree log commit. This is an
+ * important optimization for directories because holding the mutex prevents
+ * new operations on the dir while we write to disk.
+ */
+int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+{
+ struct inode *inode = dentry->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+ struct btrfs_trans_handle *trans;
+
+ /*
+ * check the transaction that last modified this inode
+ * and see if its already been committed
+ */
+ if (!BTRFS_I(inode)->last_trans)
+ goto out;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed) {
+ BTRFS_I(inode)->last_trans = 0;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ goto out;
+ }
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ root->fs_info->tree_log_batch++;
+ filemap_fdatawrite(inode->i_mapping);
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ root->fs_info->tree_log_batch++;
+
+ /*
+ * ok we haven't committed the transaction yet, lets do a commit
+ */
+ if (file->private_data)
+ btrfs_ioctl_trans_end(file);
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+ if (ret < 0)
+ goto out;
+
+ /* we've logged all the items and now have a consistent
+ * version of the file in the log. It is possible that
+ * someone will come in and modify the file, but that's
+ * fine because the log is consistent on disk, and we
+ * have references to all of the file's extents
+ *
+ * It is possible that someone will come in and log the
+ * file again, but that will end up using the synchronization
+ * inside btrfs_sync_log to keep things safe.
+ */
+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+
+ if (ret > 0) {
+ ret = btrfs_commit_transaction(trans, root);
+ } else {
+ btrfs_sync_log(trans, root);
+ ret = btrfs_end_transaction(trans, root);
+ }
+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
+out:
+ return ret > 0 ? EIO : ret;
+}
+
+static struct vm_operations_struct btrfs_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = btrfs_page_mkwrite,
+};
+
+static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &btrfs_file_vm_ops;
+ file_accessed(filp);
+ return 0;
+}
+
+struct file_operations btrfs_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .aio_read = generic_file_aio_read,
+ .splice_read = generic_file_splice_read,
+ .write = btrfs_file_write,
+ .mmap = btrfs_file_mmap,
+ .open = generic_file_open,
+ .release = btrfs_release_file,
+ .fsync = btrfs_sync_file,
+ .unlocked_ioctl = btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = btrfs_ioctl,
+#endif
+};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 00000000000..d1e5f0e84c5
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,495 @@
+/*
+ * Copyright (C) 2008 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+
+static int tree_insert_offset(struct rb_root *root, u64 offset,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_free_space *info;
+
+ while (*p) {
+ parent = *p;
+ info = rb_entry(parent, struct btrfs_free_space, offset_index);
+
+ if (offset < info->offset)
+ p = &(*p)->rb_left;
+ else if (offset > info->offset)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+
+ return 0;
+}
+
+static int tree_insert_bytes(struct rb_root *root, u64 bytes,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_free_space *info;
+
+ while (*p) {
+ parent = *p;
+ info = rb_entry(parent, struct btrfs_free_space, bytes_index);
+
+ if (bytes < info->bytes)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+
+ return 0;
+}
+
+/*
+ * searches the tree for the given offset. If contains is set we will return
+ * the free space that contains the given offset. If contains is not set we
+ * will return the free space that starts at or after the given offset and is
+ * at least bytes long.
+ */
+static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
+ u64 offset, u64 bytes,
+ int contains)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_free_space *entry, *ret = NULL;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+
+ if (offset < entry->offset) {
+ if (!contains &&
+ (!ret || entry->offset < ret->offset) &&
+ (bytes <= entry->bytes))
+ ret = entry;
+ n = n->rb_left;
+ } else if (offset > entry->offset) {
+ if ((entry->offset + entry->bytes - 1) >= offset &&
+ bytes <= entry->bytes) {
+ ret = entry;
+ break;
+ }
+ n = n->rb_right;
+ } else {
+ if (bytes > entry->bytes) {
+ n = n->rb_right;
+ continue;
+ }
+ ret = entry;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * return a chunk at least bytes size, as close to offset that we can get.
+ */
+static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
+ u64 offset, u64 bytes)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_free_space *entry, *ret = NULL;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_free_space, bytes_index);
+
+ if (bytes < entry->bytes) {
+ /*
+ * We prefer to get a hole size as close to the size we
+ * are asking for so we don't take small slivers out of
+ * huge holes, but we also want to get as close to the
+ * offset as possible so we don't have a whole lot of
+ * fragmentation.
+ */
+ if (offset <= entry->offset) {
+ if (!ret)
+ ret = entry;
+ else if (entry->bytes < ret->bytes)
+ ret = entry;
+ else if (entry->offset < ret->offset)
+ ret = entry;
+ }
+ n = n->rb_left;
+ } else if (bytes > entry->bytes) {
+ n = n->rb_right;
+ } else {
+ /*
+ * Ok we may have multiple chunks of the wanted size,
+ * so we don't want to take the first one we find, we
+ * want to take the one closest to our given offset, so
+ * keep searching just in case theres a better match.
+ */
+ n = n->rb_right;
+ if (offset > entry->offset)
+ continue;
+ else if (!ret || entry->offset < ret->offset)
+ ret = entry;
+ }
+ }
+
+ return ret;
+}
+
+static void unlink_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info)
+{
+ rb_erase(&info->offset_index, &block_group->free_space_offset);
+ rb_erase(&info->bytes_index, &block_group->free_space_bytes);
+}
+
+static int link_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info)
+{
+ int ret = 0;
+
+
+ ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
+ &info->offset_index);
+ if (ret)
+ return ret;
+
+ ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
+ &info->bytes_index);
+ if (ret)
+ return ret;
+
+ return ret;
+}
+
+static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes)
+{
+ struct btrfs_free_space *right_info;
+ struct btrfs_free_space *left_info;
+ struct btrfs_free_space *info = NULL;
+ struct btrfs_free_space *alloc_info;
+ int ret = 0;
+
+ alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+ if (!alloc_info)
+ return -ENOMEM;
+
+ /*
+ * first we want to see if there is free space adjacent to the range we
+ * are adding, if there is remove that struct and add a new one to
+ * cover the entire range
+ */
+ right_info = tree_search_offset(&block_group->free_space_offset,
+ offset+bytes, 0, 1);
+ left_info = tree_search_offset(&block_group->free_space_offset,
+ offset-1, 0, 1);
+
+ if (right_info && right_info->offset == offset+bytes) {
+ unlink_free_space(block_group, right_info);
+ info = right_info;
+ info->offset = offset;
+ info->bytes += bytes;
+ } else if (right_info && right_info->offset != offset+bytes) {
+ printk(KERN_ERR "btrfs adding space in the middle of an "
+ "existing free space area. existing: "
+ "offset=%llu, bytes=%llu. new: offset=%llu, "
+ "bytes=%llu\n", (unsigned long long)right_info->offset,
+ (unsigned long long)right_info->bytes,
+ (unsigned long long)offset,
+ (unsigned long long)bytes);
+ BUG();
+ }
+
+ if (left_info) {
+ unlink_free_space(block_group, left_info);
+
+ if (unlikely((left_info->offset + left_info->bytes) !=
+ offset)) {
+ printk(KERN_ERR "btrfs free space to the left "
+ "of new free space isn't "
+ "quite right. existing: offset=%llu, "
+ "bytes=%llu. new: offset=%llu, bytes=%llu\n",
+ (unsigned long long)left_info->offset,
+ (unsigned long long)left_info->bytes,
+ (unsigned long long)offset,
+ (unsigned long long)bytes);
+ BUG();
+ }
+
+ if (info) {
+ info->offset = left_info->offset;
+ info->bytes += left_info->bytes;
+ kfree(left_info);
+ } else {
+ info = left_info;
+ info->bytes += bytes;
+ }
+ }
+
+ if (info) {
+ ret = link_free_space(block_group, info);
+ if (!ret)
+ info = NULL;
+ goto out;
+ }
+
+ info = alloc_info;
+ alloc_info = NULL;
+ info->offset = offset;
+ info->bytes = bytes;
+
+ ret = link_free_space(block_group, info);
+ if (ret)
+ kfree(info);
+out:
+ if (ret) {
+ printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
+ if (ret == -EEXIST)
+ BUG();
+ }
+
+ kfree(alloc_info);
+
+ return ret;
+}
+
+static int
+__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes)
+{
+ struct btrfs_free_space *info;
+ int ret = 0;
+
+ info = tree_search_offset(&block_group->free_space_offset, offset, 0,
+ 1);
+
+ if (info && info->offset == offset) {
+ if (info->bytes < bytes) {
+ printk(KERN_ERR "Found free space at %llu, size %llu,"
+ "trying to use %llu\n",
+ (unsigned long long)info->offset,
+ (unsigned long long)info->bytes,
+ (unsigned long long)bytes);
+ WARN_ON(1);
+ ret = -EINVAL;
+ goto out;
+ }
+ unlink_free_space(block_group, info);
+
+ if (info->bytes == bytes) {
+ kfree(info);
+ goto out;
+ }
+
+ info->offset += bytes;
+ info->bytes -= bytes;
+
+ ret = link_free_space(block_group, info);
+ BUG_ON(ret);
+ } else if (info && info->offset < offset &&
+ info->offset + info->bytes >= offset + bytes) {
+ u64 old_start = info->offset;
+ /*
+ * we're freeing space in the middle of the info,
+ * this can happen during tree log replay
+ *
+ * first unlink the old info and then
+ * insert it again after the hole we're creating
+ */
+ unlink_free_space(block_group, info);
+ if (offset + bytes < info->offset + info->bytes) {
+ u64 old_end = info->offset + info->bytes;
+
+ info->offset = offset + bytes;
+ info->bytes = old_end - info->offset;
+ ret = link_free_space(block_group, info);
+ BUG_ON(ret);
+ } else {
+ /* the hole we're creating ends at the end
+ * of the info struct, just free the info
+ */
+ kfree(info);
+ }
+
+ /* step two, insert a new info struct to cover anything
+ * before the hole
+ */
+ ret = __btrfs_add_free_space(block_group, old_start,
+ offset - old_start);
+ BUG_ON(ret);
+ } else {
+ WARN_ON(1);
+ }
+out:
+ return ret;
+}
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes)
+{
+ int ret;
+ struct btrfs_free_space *sp;
+
+ mutex_lock(&block_group->alloc_mutex);
+ ret = __btrfs_add_free_space(block_group, offset, bytes);
+ sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
+ BUG_ON(!sp);
+ mutex_unlock(&block_group->alloc_mutex);
+
+ return ret;
+}
+
+int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes)
+{
+ int ret;
+ struct btrfs_free_space *sp;
+
+ ret = __btrfs_add_free_space(block_group, offset, bytes);
+ sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
+ BUG_ON(!sp);
+
+ return ret;
+}
+
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes)
+{
+ int ret = 0;
+
+ mutex_lock(&block_group->alloc_mutex);
+ ret = __btrfs_remove_free_space(block_group, offset, bytes);
+ mutex_unlock(&block_group->alloc_mutex);
+
+ return ret;
+}
+
+int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
+ u64 offset, u64 bytes)
+{
+ int ret;
+
+ ret = __btrfs_remove_free_space(block_group, offset, bytes);
+
+ return ret;
+}
+
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytes)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ int count = 0;
+
+ for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (info->bytes >= bytes)
+ count++;
+ }
+ printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
+ "\n", count);
+}
+
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ u64 ret = 0;
+
+ for (n = rb_first(&block_group->free_space_offset); n;
+ n = rb_next(n)) {
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ ret += info->bytes;
+ }
+
+ return ret;
+}
+
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *node;
+
+ mutex_lock(&block_group->alloc_mutex);
+ while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, bytes_index);
+ unlink_free_space(block_group, info);
+ kfree(info);
+ if (need_resched()) {
+ mutex_unlock(&block_group->alloc_mutex);
+ cond_resched();
+ mutex_lock(&block_group->alloc_mutex);
+ }
+ }
+ mutex_unlock(&block_group->alloc_mutex);
+}
+
+#if 0
+static struct btrfs_free_space *btrfs_find_free_space_offset(struct
+ btrfs_block_group_cache
+ *block_group, u64 offset,
+ u64 bytes)
+{
+ struct btrfs_free_space *ret;
+
+ mutex_lock(&block_group->alloc_mutex);
+ ret = tree_search_offset(&block_group->free_space_offset, offset,
+ bytes, 0);
+ mutex_unlock(&block_group->alloc_mutex);
+
+ return ret;
+}
+
+static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
+ btrfs_block_group_cache
+ *block_group, u64 offset,
+ u64 bytes)
+{
+ struct btrfs_free_space *ret;
+
+ mutex_lock(&block_group->alloc_mutex);
+
+ ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
+ mutex_unlock(&block_group->alloc_mutex);
+
+ return ret;
+}
+#endif
+
+struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
+ *block_group, u64 offset,
+ u64 bytes)
+{
+ struct btrfs_free_space *ret = NULL;
+
+ ret = tree_search_offset(&block_group->free_space_offset, offset,
+ bytes, 0);
+ if (!ret)
+ ret = tree_search_bytes(&block_group->free_space_bytes,
+ offset, bytes);
+
+ return ret;
+}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 00000000000..2a020b27676
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __HASH__
+#define __HASH__
+
+#include "crc32c.h"
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+ return btrfs_crc32c((u32)~1, name, len);
+}
+#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 00000000000..3d46fa1f29a
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+static int find_name_in_backref(struct btrfs_path *path, const char *name,
+ int name_len, struct btrfs_inode_ref **ref_ret)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_inode_ref *ref;
+ unsigned long ptr;
+ unsigned long name_ptr;
+ u32 item_size;
+ u32 cur_offset = 0;
+ int len;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ while (cur_offset < item_size) {
+ ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+ len = btrfs_inode_ref_name_len(leaf, ref);
+ name_ptr = (unsigned long)(ref + 1);
+ cur_offset += len + sizeof(*ref);
+ if (len != name_len)
+ continue;
+ if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
+ *ref_ret = ref;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_inode_ref *ref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+ unsigned long item_start;
+ u32 item_size;
+ u32 sub_item_len;
+ int ret;
+ int del_len = name_len + sizeof(*ref);
+
+ key.objectid = inode_objectid;
+ key.offset = ref_objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ } else if (ret < 0) {
+ goto out;
+ }
+ if (!find_name_in_backref(path, name, name_len, &ref)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+
+ if (index)
+ *index = btrfs_inode_ref_index(leaf, ref);
+
+ if (del_len == item_size) {
+ ret = btrfs_del_item(trans, root, path);
+ goto out;
+ }
+ ptr = (unsigned long)ref;
+ sub_item_len = name_len + sizeof(*ref);
+ item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+ item_size - (ptr + sub_item_len - item_start));
+ ret = btrfs_truncate_item(trans, root, path,
+ item_size - sub_item_len, 1);
+ BUG_ON(ret);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_inode_ref *ref;
+ unsigned long ptr;
+ int ret;
+ int ins_len = name_len + sizeof(*ref);
+
+ key.objectid = inode_objectid;
+ key.offset = ref_objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ ins_len);
+ if (ret == -EEXIST) {
+ u32 old_size;
+
+ if (find_name_in_backref(path, name, name_len, &ref))
+ goto out;
+
+ old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ ret = btrfs_extend_item(trans, root, path, ins_len);
+ BUG_ON(ret);
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+ ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+ ptr = (unsigned long)(ref + 1);
+ ret = 0;
+ } else if (ret < 0) {
+ goto out;
+ } else {
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_ref);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+ ptr = (unsigned long)(ref + 1);
+ }
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid)
+{
+ struct btrfs_key key;
+ int ret;
+ key.objectid = objectid;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(struct btrfs_inode_item));
+ if (ret == 0 && objectid > root->highest_inode)
+ root->highest_inode = objectid;
+ return ret;
+}
+
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ struct btrfs_key *location, int mod)
+{
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+ int ret;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key found_key;
+
+ ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
+ if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
+ location->offset == (u64)-1 && path->slots[0] != 0) {
+ slot = path->slots[0] - 1;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (found_key.objectid == location->objectid &&
+ btrfs_key_type(&found_key) == btrfs_key_type(location)) {
+ path->slots[0]--;
+ return 0;
+ }
+ }
+ return ret;
+}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 00000000000..2aa79873eb4
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *l;
+ struct btrfs_key search_key;
+ struct btrfs_key found_key;
+ int slot;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
+ search_key.type = -1;
+ search_key.offset = (u64)-1;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ BUG_ON(ret == 0);
+ if (path->slots[0] > 0) {
+ slot = path->slots[0] - 1;
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ *objectid = found_key.objectid;
+ } else {
+ *objectid = BTRFS_FIRST_FREE_OBJECTID;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * walks the btree of allocated inodes and find a hole.
+ */
+int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 dirid, u64 *objectid)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+ int slot = 0;
+ u64 last_ino = 0;
+ int start_found;
+ struct extent_buffer *l;
+ struct btrfs_key search_key;
+ u64 search_start = dirid;
+
+ mutex_lock(&root->objectid_mutex);
+ if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
+ root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
+ *objectid = ++root->last_inode_alloc;
+ mutex_unlock(&root->objectid_mutex);
+ return 0;
+ }
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
+ search_key.objectid = search_start;
+ search_key.type = 0;
+ search_key.offset = 0;
+
+ btrfs_init_path(path);
+ start_found = 0;
+ ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+
+ while (1) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto error;
+ if (!start_found) {
+ *objectid = search_start;
+ start_found = 1;
+ goto found;
+ }
+ *objectid = last_ino > search_start ?
+ last_ino : search_start;
+ goto found;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+ if (key.objectid >= search_start) {
+ if (start_found) {
+ if (last_ino < search_start)
+ last_ino = search_start;
+ if (key.objectid > last_ino) {
+ *objectid = last_ino;
+ goto found;
+ }
+ } else if (key.objectid > search_start) {
+ *objectid = search_start;
+ goto found;
+ }
+ }
+ if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
+ break;
+
+ start_found = 1;
+ last_ino = key.objectid + 1;
+ path->slots[0]++;
+ }
+ BUG_ON(1);
+found:
+ btrfs_release_path(root, path);
+ btrfs_free_path(path);
+ BUG_ON(*objectid < search_start);
+ mutex_unlock(&root->objectid_mutex);
+ return 0;
+error:
+ btrfs_release_path(root, path);
+ btrfs_free_path(path);
+ mutex_unlock(&root->objectid_mutex);
+ return ret;
+}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 00000000000..8adfe059ab4
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,5035 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/falloc.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "xattr.h"
+#include "tree-log.h"
+#include "ref-cache.h"
+#include "compression.h"
+
+struct btrfs_iget_args {
+ u64 ino;
+ struct btrfs_root *root;
+};
+
+static struct inode_operations btrfs_dir_inode_operations;
+static struct inode_operations btrfs_symlink_inode_operations;
+static struct inode_operations btrfs_dir_ro_inode_operations;
+static struct inode_operations btrfs_special_inode_operations;
+static struct inode_operations btrfs_file_inode_operations;
+static struct address_space_operations btrfs_aops;
+static struct address_space_operations btrfs_symlink_aops;
+static struct file_operations btrfs_dir_file_operations;
+static struct extent_io_ops btrfs_extent_io_ops;
+
+static struct kmem_cache *btrfs_inode_cachep;
+struct kmem_cache *btrfs_trans_handle_cachep;
+struct kmem_cache *btrfs_transaction_cachep;
+struct kmem_cache *btrfs_bit_radix_cachep;
+struct kmem_cache *btrfs_path_cachep;
+
+#define S_SHIFT 12
+static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
+};
+
+static void btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static noinline int cow_file_range(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written, int unlock);
+
+/*
+ * a very lame attempt at stopping writes when the FS is 85% full. There
+ * are countless ways this is incorrect, but it is better than nothing.
+ */
+int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
+ int for_del)
+{
+ u64 total;
+ u64 used;
+ u64 thresh;
+ int ret = 0;
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ total = btrfs_super_total_bytes(&root->fs_info->super_copy);
+ used = btrfs_super_bytes_used(&root->fs_info->super_copy);
+ if (for_del)
+ thresh = total * 90;
+ else
+ thresh = total * 85;
+
+ do_div(thresh, 100);
+
+ if (used + root->fs_info->delalloc_bytes + num_required > thresh)
+ ret = -ENOSPC;
+ spin_unlock(&root->fs_info->delalloc_lock);
+ return ret;
+}
+
+/*
+ * this does all the hard work for inserting an inline extent into
+ * the btree. The caller should have done a btrfs_drop_extents so that
+ * no overlapping inline items exist in the btree
+ */
+static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ u64 start, size_t size, size_t compressed_size,
+ struct page **compressed_pages)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct page *page = NULL;
+ char *kaddr;
+ unsigned long ptr;
+ struct btrfs_file_extent_item *ei;
+ int err = 0;
+ int ret;
+ size_t cur_size = size;
+ size_t datasize;
+ unsigned long offset;
+ int use_compress = 0;
+
+ if (compressed_size && compressed_pages) {
+ use_compress = 1;
+ cur_size = compressed_size;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ btrfs_set_trans_block_group(trans, inode);
+
+ key.objectid = inode->i_ino;
+ key.offset = start;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+ datasize = btrfs_file_extent_calc_inline_size(cur_size);
+
+ inode_add_bytes(inode, size);
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ datasize);
+ BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+ btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_encryption(leaf, ei, 0);
+ btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+ btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+ ptr = btrfs_file_extent_inline_start(ei);
+
+ if (use_compress) {
+ struct page *cpage;
+ int i = 0;
+ while (compressed_size > 0) {
+ cpage = compressed_pages[i];
+ cur_size = min_t(unsigned long, compressed_size,
+ PAGE_CACHE_SIZE);
+
+ kaddr = kmap(cpage);
+ write_extent_buffer(leaf, kaddr, ptr, cur_size);
+ kunmap(cpage);
+
+ i++;
+ ptr += cur_size;
+ compressed_size -= cur_size;
+ }
+ btrfs_set_file_extent_compression(leaf, ei,
+ BTRFS_COMPRESS_ZLIB);
+ } else {
+ page = find_get_page(inode->i_mapping,
+ start >> PAGE_CACHE_SHIFT);
+ btrfs_set_file_extent_compression(leaf, ei, 0);
+ kaddr = kmap_atomic(page, KM_USER0);
+ offset = start & (PAGE_CACHE_SIZE - 1);
+ write_extent_buffer(leaf, kaddr + offset, ptr, size);
+ kunmap_atomic(kaddr, KM_USER0);
+ page_cache_release(page);
+ }
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+
+ BTRFS_I(inode)->disk_i_size = inode->i_size;
+ btrfs_update_inode(trans, root, inode);
+ return 0;
+fail:
+ btrfs_free_path(path);
+ return err;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file. This
+ * does the checks required to make sure the data is small enough
+ * to fit as an inline extent.
+ */
+static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode, u64 start, u64 end,
+ size_t compressed_size,
+ struct page **compressed_pages)
+{
+ u64 isize = i_size_read(inode);
+ u64 actual_end = min(end + 1, isize);
+ u64 inline_len = actual_end - start;
+ u64 aligned_end = (end + root->sectorsize - 1) &
+ ~((u64)root->sectorsize - 1);
+ u64 hint_byte;
+ u64 data_len = inline_len;
+ int ret;
+
+ if (compressed_size)
+ data_len = compressed_size;
+
+ if (start > 0 ||
+ actual_end >= PAGE_CACHE_SIZE ||
+ data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+ (!compressed_size &&
+ (actual_end & (root->sectorsize - 1)) == 0) ||
+ end + 1 < isize ||
+ data_len > root->fs_info->max_inline) {
+ return 1;
+ }
+
+ ret = btrfs_drop_extents(trans, root, inode, start,
+ aligned_end, start, &hint_byte);
+ BUG_ON(ret);
+
+ if (isize > actual_end)
+ inline_len = min_t(u64, isize, actual_end);
+ ret = insert_inline_extent(trans, root, inode, start,
+ inline_len, compressed_size,
+ compressed_pages);
+ BUG_ON(ret);
+ btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+ return 0;
+}
+
+struct async_extent {
+ u64 start;
+ u64 ram_size;
+ u64 compressed_size;
+ struct page **pages;
+ unsigned long nr_pages;
+ struct list_head list;
+};
+
+struct async_cow {
+ struct inode *inode;
+ struct btrfs_root *root;
+ struct page *locked_page;
+ u64 start;
+ u64 end;
+ struct list_head extents;
+ struct btrfs_work work;
+};
+
+static noinline int add_async_extent(struct async_cow *cow,
+ u64 start, u64 ram_size,
+ u64 compressed_size,
+ struct page **pages,
+ unsigned long nr_pages)
+{
+ struct async_extent *async_extent;
+
+ async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
+ async_extent->start = start;
+ async_extent->ram_size = ram_size;
+ async_extent->compressed_size = compressed_size;
+ async_extent->pages = pages;
+ async_extent->nr_pages = nr_pages;
+ list_add_tail(&async_extent->list, &cow->extents);
+ return 0;
+}
+
+/*
+ * we create compressed extents in two phases. The first
+ * phase compresses a range of pages that have already been
+ * locked (both pages and state bits are locked).
+ *
+ * This is done inside an ordered work queue, and the compression
+ * is spread across many cpus. The actual IO submission is step
+ * two, and the ordered work queue takes care of making sure that
+ * happens in the same order things were put onto the queue by
+ * writepages and friends.
+ *
+ * If this code finds it can't get good compression, it puts an
+ * entry onto the work queue to write the uncompressed bytes. This
+ * makes sure that both compressed inodes and uncompressed inodes
+ * are written in the same order that pdflush sent them down.
+ */
+static noinline int compress_file_range(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end,
+ struct async_cow *async_cow,
+ int *num_added)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ u64 num_bytes;
+ u64 orig_start;
+ u64 disk_num_bytes;
+ u64 blocksize = root->sectorsize;
+ u64 actual_end;
+ u64 isize = i_size_read(inode);
+ int ret = 0;
+ struct page **pages = NULL;
+ unsigned long nr_pages;
+ unsigned long nr_pages_ret = 0;
+ unsigned long total_compressed = 0;
+ unsigned long total_in = 0;
+ unsigned long max_compressed = 128 * 1024;
+ unsigned long max_uncompressed = 128 * 1024;
+ int i;
+ int will_compress;
+
+ orig_start = start;
+
+ actual_end = min_t(u64, isize, end + 1);
+again:
+ will_compress = 0;
+ nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
+ nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+
+ total_compressed = actual_end - start;
+
+ /* we want to make sure that amount of ram required to uncompress
+ * an extent is reasonable, so we limit the total size in ram
+ * of a compressed extent to 128k. This is a crucial number
+ * because it also controls how easily we can spread reads across
+ * cpus for decompression.
+ *
+ * We also want to make sure the amount of IO required to do
+ * a random read is reasonably small, so we limit the size of
+ * a compressed extent to 128k.
+ */
+ total_compressed = min(total_compressed, max_uncompressed);
+ num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+ num_bytes = max(blocksize, num_bytes);
+ disk_num_bytes = num_bytes;
+ total_in = 0;
+ ret = 0;
+
+ /*
+ * we do compression for mount -o compress and when the
+ * inode has not been flagged as nocompress. This flag can
+ * change at any time if we discover bad compression ratios.
+ */
+ if (!btrfs_test_flag(inode, NOCOMPRESS) &&
+ btrfs_test_opt(root, COMPRESS)) {
+ WARN_ON(pages);
+ pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+
+ ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
+ total_compressed, pages,
+ nr_pages, &nr_pages_ret,
+ &total_in,
+ &total_compressed,
+ max_compressed);
+
+ if (!ret) {
+ unsigned long offset = total_compressed &
+ (PAGE_CACHE_SIZE - 1);
+ struct page *page = pages[nr_pages_ret - 1];
+ char *kaddr;
+
+ /* zero the tail end of the last page, we might be
+ * sending it down to disk
+ */
+ if (offset) {
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0,
+ PAGE_CACHE_SIZE - offset);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ will_compress = 1;
+ }
+ }
+ if (start == 0) {
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(!trans);
+ btrfs_set_trans_block_group(trans, inode);
+
+ /* lets try to make an inline extent */
+ if (ret || total_in < (actual_end - start)) {
+ /* we didn't compress the entire range, try
+ * to make an uncompressed inline extent.
+ */
+ ret = cow_file_range_inline(trans, root, inode,
+ start, end, 0, NULL);
+ } else {
+ /* try making a compressed inline extent */
+ ret = cow_file_range_inline(trans, root, inode,
+ start, end,
+ total_compressed, pages);
+ }
+ btrfs_end_transaction(trans, root);
+ if (ret == 0) {
+ /*
+ * inline extent creation worked, we don't need
+ * to create any more async work items. Unlock
+ * and free up our temp pages.
+ */
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL, 1, 0,
+ 0, 1, 1, 1);
+ ret = 0;
+ goto free_pages_out;
+ }
+ }
+
+ if (will_compress) {
+ /*
+ * we aren't doing an inline extent round the compressed size
+ * up to a block size boundary so the allocator does sane
+ * things
+ */
+ total_compressed = (total_compressed + blocksize - 1) &
+ ~(blocksize - 1);
+
+ /*
+ * one last check to make sure the compression is really a
+ * win, compare the page count read with the blocks on disk
+ */
+ total_in = (total_in + PAGE_CACHE_SIZE - 1) &
+ ~(PAGE_CACHE_SIZE - 1);
+ if (total_compressed >= total_in) {
+ will_compress = 0;
+ } else {
+ disk_num_bytes = total_compressed;
+ num_bytes = total_in;
+ }
+ }
+ if (!will_compress && pages) {
+ /*
+ * the compression code ran but failed to make things smaller,
+ * free any pages it allocated and our page pointer array
+ */
+ for (i = 0; i < nr_pages_ret; i++) {
+ WARN_ON(pages[i]->mapping);
+ page_cache_release(pages[i]);
+ }
+ kfree(pages);
+ pages = NULL;
+ total_compressed = 0;
+ nr_pages_ret = 0;
+
+ /* flag the file so we don't compress in the future */
+ btrfs_set_flag(inode, NOCOMPRESS);
+ }
+ if (will_compress) {
+ *num_added += 1;
+
+ /* the async work queues will take care of doing actual
+ * allocation on disk for these compressed pages,
+ * and will submit them to the elevator.
+ */
+ add_async_extent(async_cow, start, num_bytes,
+ total_compressed, pages, nr_pages_ret);
+
+ if (start + num_bytes < end && start + num_bytes < actual_end) {
+ start += num_bytes;
+ pages = NULL;
+ cond_resched();
+ goto again;
+ }
+ } else {
+ /*
+ * No compression, but we still need to write the pages in
+ * the file we've been given so far. redirty the locked
+ * page if it corresponds to our extent and set things up
+ * for the async work queue to run cow_file_range to do
+ * the normal delalloc dance
+ */
+ if (page_offset(locked_page) >= start &&
+ page_offset(locked_page) <= end) {
+ __set_page_dirty_nobuffers(locked_page);
+ /* unlocked later on in the async handlers */
+ }
+ add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+ *num_added += 1;
+ }
+
+out:
+ return 0;
+
+free_pages_out:
+ for (i = 0; i < nr_pages_ret; i++) {
+ WARN_ON(pages[i]->mapping);
+ page_cache_release(pages[i]);
+ }
+ kfree(pages);
+
+ goto out;
+}
+
+/*
+ * phase two of compressed writeback. This is the ordered portion
+ * of the code, which only gets called in the order the work was
+ * queued. We walk all the async extents created by compress_file_range
+ * and send them down to the disk.
+ */
+static noinline int submit_compressed_extents(struct inode *inode,
+ struct async_cow *async_cow)
+{
+ struct async_extent *async_extent;
+ u64 alloc_hint = 0;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key ins;
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree;
+ int ret;
+
+ if (list_empty(&async_cow->extents))
+ return 0;
+
+ trans = btrfs_join_transaction(root, 1);
+
+ while (!list_empty(&async_cow->extents)) {
+ async_extent = list_entry(async_cow->extents.next,
+ struct async_extent, list);
+ list_del(&async_extent->list);
+
+ io_tree = &BTRFS_I(inode)->io_tree;
+
+ /* did the compression code fall back to uncompressed IO? */
+ if (!async_extent->pages) {
+ int page_started = 0;
+ unsigned long nr_written = 0;
+
+ lock_extent(io_tree, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, GFP_NOFS);
+
+ /* allocate blocks */
+ cow_file_range(inode, async_cow->locked_page,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ &page_started, &nr_written, 0);
+
+ /*
+ * if page_started, cow_file_range inserted an
+ * inline extent and took care of all the unlocking
+ * and IO for us. Otherwise, we need to submit
+ * all those pages down to the drive.
+ */
+ if (!page_started)
+ extent_write_locked_range(io_tree,
+ inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ btrfs_get_extent,
+ WB_SYNC_ALL);
+ kfree(async_extent);
+ cond_resched();
+ continue;
+ }
+
+ lock_extent(io_tree, async_extent->start,
+ async_extent->start + async_extent->ram_size - 1,
+ GFP_NOFS);
+ /*
+ * here we're doing allocation and writeback of the
+ * compressed pages
+ */
+ btrfs_drop_extent_cache(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, 0);
+
+ ret = btrfs_reserve_extent(trans, root,
+ async_extent->compressed_size,
+ async_extent->compressed_size,
+ 0, alloc_hint,
+ (u64)-1, &ins, 1);
+ BUG_ON(ret);
+ em = alloc_extent_map(GFP_NOFS);
+ em->start = async_extent->start;
+ em->len = async_extent->ram_size;
+ em->orig_start = em->start;
+
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
+ while (1) {
+ spin_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ spin_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1, 0);
+ }
+
+ ret = btrfs_add_ordered_extent(inode, async_extent->start,
+ ins.objectid,
+ async_extent->ram_size,
+ ins.offset,
+ BTRFS_ORDERED_COMPRESSED);
+ BUG_ON(ret);
+
+ btrfs_end_transaction(trans, root);
+
+ /*
+ * clear dirty, set writeback and unlock the pages.
+ */
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ async_extent->start,
+ async_extent->start +
+ async_extent->ram_size - 1,
+ NULL, 1, 1, 0, 1, 1, 0);
+
+ ret = btrfs_submit_compressed_write(inode,
+ async_extent->start,
+ async_extent->ram_size,
+ ins.objectid,
+ ins.offset, async_extent->pages,
+ async_extent->nr_pages);
+
+ BUG_ON(ret);
+ trans = btrfs_join_transaction(root, 1);
+ alloc_hint = ins.objectid + ins.offset;
+ kfree(async_extent);
+ cond_resched();
+ }
+
+ btrfs_end_transaction(trans, root);
+ return 0;
+}
+
+/*
+ * when extent_io.c finds a delayed allocation range in the file,
+ * the call backs end up in this code. The basic idea is to
+ * allocate extents on disk for the range, and create ordered data structs
+ * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already. We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it. It may be clean and already done with
+ * IO when we return.
+ */
+static noinline int cow_file_range(struct inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written,
+ int unlock)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ u64 alloc_hint = 0;
+ u64 num_bytes;
+ unsigned long ram_size;
+ u64 disk_num_bytes;
+ u64 cur_alloc_size;
+ u64 blocksize = root->sectorsize;
+ u64 actual_end;
+ u64 isize = i_size_read(inode);
+ struct btrfs_key ins;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ int ret = 0;
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(!trans);
+ btrfs_set_trans_block_group(trans, inode);
+
+ actual_end = min_t(u64, isize, end + 1);
+
+ num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+ num_bytes = max(blocksize, num_bytes);
+ disk_num_bytes = num_bytes;
+ ret = 0;
+
+ if (start == 0) {
+ /* lets try to make an inline extent */
+ ret = cow_file_range_inline(trans, root, inode,
+ start, end, 0, NULL);
+ if (ret == 0) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, NULL, 1, 1,
+ 1, 1, 1, 1);
+ *nr_written = *nr_written +
+ (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+ *page_started = 1;
+ ret = 0;
+ goto out;
+ }
+ }
+
+ BUG_ON(disk_num_bytes >
+ btrfs_super_total_bytes(&root->fs_info->super_copy));
+
+ btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+
+ while (disk_num_bytes > 0) {
+ cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
+ ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+ root->sectorsize, 0, alloc_hint,
+ (u64)-1, &ins, 1);
+ BUG_ON(ret);
+
+ em = alloc_extent_map(GFP_NOFS);
+ em->start = start;
+ em->orig_start = em->start;
+
+ ram_size = ins.offset;
+ em->len = ins.offset;
+
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ while (1) {
+ spin_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ spin_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, start,
+ start + ram_size - 1, 0);
+ }
+
+ cur_alloc_size = ins.offset;
+ ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
+ ram_size, cur_alloc_size, 0);
+ BUG_ON(ret);
+
+ if (root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ ret = btrfs_reloc_clone_csums(inode, start,
+ cur_alloc_size);
+ BUG_ON(ret);
+ }
+
+ if (disk_num_bytes < cur_alloc_size)
+ break;
+
+ /* we're not doing compressed IO, don't unlock the first
+ * page (which the caller expects to stay locked), don't
+ * clear any dirty bits and don't set any writeback bits
+ */
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ start, start + ram_size - 1,
+ locked_page, unlock, 1,
+ 1, 0, 0, 0);
+ disk_num_bytes -= cur_alloc_size;
+ num_bytes -= cur_alloc_size;
+ alloc_hint = ins.objectid + ins.offset;
+ start += cur_alloc_size;
+ }
+out:
+ ret = 0;
+ btrfs_end_transaction(trans, root);
+
+ return ret;
+}
+
+/*
+ * work queue call back to started compression on a file and pages
+ */
+static noinline void async_cow_start(struct btrfs_work *work)
+{
+ struct async_cow *async_cow;
+ int num_added = 0;
+ async_cow = container_of(work, struct async_cow, work);
+
+ compress_file_range(async_cow->inode, async_cow->locked_page,
+ async_cow->start, async_cow->end, async_cow,
+ &num_added);
+ if (num_added == 0)
+ async_cow->inode = NULL;
+}
+
+/*
+ * work queue call back to submit previously compressed pages
+ */
+static noinline void async_cow_submit(struct btrfs_work *work)
+{
+ struct async_cow *async_cow;
+ struct btrfs_root *root;
+ unsigned long nr_pages;
+
+ async_cow = container_of(work, struct async_cow, work);
+
+ root = async_cow->root;
+ nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+
+ atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
+
+ if (atomic_read(&root->fs_info->async_delalloc_pages) <
+ 5 * 1042 * 1024 &&
+ waitqueue_active(&root->fs_info->async_submit_wait))
+ wake_up(&root->fs_info->async_submit_wait);
+
+ if (async_cow->inode)
+ submit_compressed_extents(async_cow->inode, async_cow);
+}
+
+static noinline void async_cow_free(struct btrfs_work *work)
+{
+ struct async_cow *async_cow;
+ async_cow = container_of(work, struct async_cow, work);
+ kfree(async_cow);
+}
+
+static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written)
+{
+ struct async_cow *async_cow;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ unsigned long nr_pages;
+ u64 cur_end;
+ int limit = 10 * 1024 * 1042;
+
+ if (!btrfs_test_opt(root, COMPRESS)) {
+ return cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1);
+ }
+
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
+ EXTENT_DELALLOC, 1, 0, GFP_NOFS);
+ while (start < end) {
+ async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
+ async_cow->inode = inode;
+ async_cow->root = root;
+ async_cow->locked_page = locked_page;
+ async_cow->start = start;
+
+ if (btrfs_test_flag(inode, NOCOMPRESS))
+ cur_end = end;
+ else
+ cur_end = min(end, start + 512 * 1024 - 1);
+
+ async_cow->end = cur_end;
+ INIT_LIST_HEAD(&async_cow->extents);
+
+ async_cow->work.func = async_cow_start;
+ async_cow->work.ordered_func = async_cow_submit;
+ async_cow->work.ordered_free = async_cow_free;
+ async_cow->work.flags = 0;
+
+ nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+ atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
+
+ btrfs_queue_worker(&root->fs_info->delalloc_workers,
+ &async_cow->work);
+
+ if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
+ wait_event(root->fs_info->async_submit_wait,
+ (atomic_read(&root->fs_info->async_delalloc_pages) <
+ limit));
+ }
+
+ while (atomic_read(&root->fs_info->async_submit_draining) &&
+ atomic_read(&root->fs_info->async_delalloc_pages)) {
+ wait_event(root->fs_info->async_submit_wait,
+ (atomic_read(&root->fs_info->async_delalloc_pages) ==
+ 0));
+ }
+
+ *nr_written += nr_pages;
+ start = cur_end + 1;
+ }
+ *page_started = 1;
+ return 0;
+}
+
+static noinline int csum_exist_in_range(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_ordered_sum *sums;
+ LIST_HEAD(list);
+
+ ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
+ bytenr + num_bytes - 1, &list);
+ if (ret == 0 && list_empty(&list))
+ return 0;
+
+ while (!list_empty(&list)) {
+ sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ return 1;
+}
+
+/*
+ * when nowcow writeback call back. This checks for snapshots or COW copies
+ * of the extents that exist in the file, and COWs the file as required.
+ *
+ * If no cow copies or snapshots exist, we write directly to the existing
+ * blocks on disk
+ */
+static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started, int force,
+ unsigned long *nr_written)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key found_key;
+ u64 cow_start;
+ u64 cur_offset;
+ u64 extent_end;
+ u64 disk_bytenr;
+ u64 num_bytes;
+ int extent_type;
+ int ret;
+ int type;
+ int nocow;
+ int check_prev = 1;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(!trans);
+
+ cow_start = (u64)-1;
+ cur_offset = start;
+ while (1) {
+ ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ cur_offset, 0);
+ BUG_ON(ret < 0);
+ if (ret > 0 && path->slots[0] > 0 && check_prev) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key,
+ path->slots[0] - 1);
+ if (found_key.objectid == inode->i_ino &&
+ found_key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+ check_prev = 0;
+next_slot:
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ BUG_ON(1);
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ nocow = 0;
+ disk_bytenr = 0;
+ num_bytes = 0;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid > inode->i_ino ||
+ found_key.type > BTRFS_EXTENT_DATA_KEY ||
+ found_key.offset > end)
+ break;
+
+ if (found_key.offset > cur_offset) {
+ extent_end = found_key.offset;
+ goto out_check;
+ }
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ extent_end = found_key.offset +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_end <= start) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (disk_bytenr == 0)
+ goto out_check;
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ goto out_check;
+ if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
+ goto out_check;
+ if (btrfs_extent_readonly(root, disk_bytenr))
+ goto out_check;
+ if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+ disk_bytenr))
+ goto out_check;
+ disk_bytenr += btrfs_file_extent_offset(leaf, fi);
+ disk_bytenr += cur_offset - found_key.offset;
+ num_bytes = min(end + 1, extent_end) - cur_offset;
+ /*
+ * force cow if csum exists in the range.
+ * this ensure that csum for a given extent are
+ * either valid or do not exist.
+ */
+ if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+ goto out_check;
+ nocow = 1;
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ extent_end = found_key.offset +
+ btrfs_file_extent_inline_len(leaf, fi);
+ extent_end = ALIGN(extent_end, root->sectorsize);
+ } else {
+ BUG_ON(1);
+ }
+out_check:
+ if (extent_end <= start) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (!nocow) {
+ if (cow_start == (u64)-1)
+ cow_start = cur_offset;
+ cur_offset = extent_end;
+ if (cur_offset > end)
+ break;
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ btrfs_release_path(root, path);
+ if (cow_start != (u64)-1) {
+ ret = cow_file_range(inode, locked_page, cow_start,
+ found_key.offset - 1, page_started,
+ nr_written, 1);
+ BUG_ON(ret);
+ cow_start = (u64)-1;
+ }
+
+ if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ struct extent_map *em;
+ struct extent_map_tree *em_tree;
+ em_tree = &BTRFS_I(inode)->extent_tree;
+ em = alloc_extent_map(GFP_NOFS);
+ em->start = cur_offset;
+ em->orig_start = em->start;
+ em->len = num_bytes;
+ em->block_len = num_bytes;
+ em->block_start = disk_bytenr;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ while (1) {
+ spin_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ spin_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
+ }
+ type = BTRFS_ORDERED_PREALLOC;
+ } else {
+ type = BTRFS_ORDERED_NOCOW;
+ }
+
+ ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
+ num_bytes, num_bytes, type);
+ BUG_ON(ret);
+
+ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+ cur_offset, cur_offset + num_bytes - 1,
+ locked_page, 1, 1, 1, 0, 0, 0);
+ cur_offset = extent_end;
+ if (cur_offset > end)
+ break;
+ }
+ btrfs_release_path(root, path);
+
+ if (cur_offset <= end && cow_start == (u64)-1)
+ cow_start = cur_offset;
+ if (cow_start != (u64)-1) {
+ ret = cow_file_range(inode, locked_page, cow_start, end,
+ page_started, nr_written, 1);
+ BUG_ON(ret);
+ }
+
+ ret = btrfs_end_transaction(trans, root);
+ BUG_ON(ret);
+ btrfs_free_path(path);
+ return 0;
+}
+
+/*
+ * extent_io.c call back to do delayed allocation processing
+ */
+static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+ unsigned long *nr_written)
+{
+ int ret;
+
+ if (btrfs_test_flag(inode, NODATACOW))
+ ret = run_delalloc_nocow(inode, locked_page, start, end,
+ page_started, 1, nr_written);
+ else if (btrfs_test_flag(inode, PREALLOC))
+ ret = run_delalloc_nocow(inode, locked_page, start, end,
+ page_started, 0, nr_written);
+ else
+ ret = cow_file_range_async(inode, locked_page, start, end,
+ page_started, nr_written);
+
+ return ret;
+}
+
+/*
+ * extent_io.c set_bit_hook, used to track delayed allocation
+ * bytes in this file, and to maintain the list of inodes that
+ * have pending delalloc work to be done.
+ */
+static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
+ unsigned long old, unsigned long bits)
+{
+ /*
+ * set_bit and clear bit hooks normally require _irqsave/restore
+ * but in this case, we are only testeing for the DELALLOC
+ * bit, which is only set or cleared with irqs on
+ */
+ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ spin_lock(&root->fs_info->delalloc_lock);
+ BTRFS_I(inode)->delalloc_bytes += end - start + 1;
+ root->fs_info->delalloc_bytes += end - start + 1;
+ if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+ return 0;
+}
+
+/*
+ * extent_io.c clear_bit_hook, see set_bit_hook for why
+ */
+static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
+ unsigned long old, unsigned long bits)
+{
+ /*
+ * set_bit and clear bit hooks normally require _irqsave/restore
+ * but in this case, we are only testeing for the DELALLOC
+ * bit, which is only set or cleared with irqs on
+ */
+ if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (end - start + 1 > root->fs_info->delalloc_bytes) {
+ printk(KERN_INFO "btrfs warning: delalloc account "
+ "%llu %llu\n",
+ (unsigned long long)end - start + 1,
+ (unsigned long long)
+ root->fs_info->delalloc_bytes);
+ root->fs_info->delalloc_bytes = 0;
+ BTRFS_I(inode)->delalloc_bytes = 0;
+ } else {
+ root->fs_info->delalloc_bytes -= end - start + 1;
+ BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
+ }
+ if (BTRFS_I(inode)->delalloc_bytes == 0 &&
+ !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+ return 0;
+}
+
+/*
+ * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
+ * we don't create bios that span stripes or chunks
+ */
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+ size_t size, struct bio *bio,
+ unsigned long bio_flags)
+{
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ struct btrfs_mapping_tree *map_tree;
+ u64 logical = (u64)bio->bi_sector << 9;
+ u64 length = 0;
+ u64 map_length;
+ int ret;
+
+ if (bio_flags & EXTENT_BIO_COMPRESSED)
+ return 0;
+
+ length = bio->bi_size;
+ map_tree = &root->fs_info->mapping_tree;
+ map_length = length;
+ ret = btrfs_map_block(map_tree, READ, logical,
+ &map_length, NULL, 0);
+
+ if (map_length < length + size)
+ return 1;
+ return 0;
+}
+
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time. All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the cums attached on the ordered extent record
+ * are inserted into the btree
+ */
+static int __btrfs_submit_bio_start(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+ BUG_ON(ret);
+ return 0;
+}
+
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time. All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the cums attached on the ordered extent record
+ * are inserted into the btree
+ */
+static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ return btrfs_map_bio(root, rw, bio, mirror_num, 1);
+}
+
+/*
+ * extent_io.c submission hook. This does the right thing for csum calculation
+ * on write, or reading the csums from the tree before a read
+ */
+static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+ int skip_sum;
+
+ skip_sum = btrfs_test_flag(inode, NODATASUM);
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ BUG_ON(ret);
+
+ if (!(rw & (1 << BIO_RW))) {
+ if (bio_flags & EXTENT_BIO_COMPRESSED) {
+ return btrfs_submit_compressed_read(inode, bio,
+ mirror_num, bio_flags);
+ } else if (!skip_sum)
+ btrfs_lookup_bio_sums(root, inode, bio, NULL);
+ goto mapit;
+ } else if (!skip_sum) {
+ /* csum items have already been cloned */
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ goto mapit;
+ /* we're doing a write, do the async checksumming */
+ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num,
+ bio_flags, __btrfs_submit_bio_start,
+ __btrfs_submit_bio_done);
+ }
+
+mapit:
+ return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+}
+
+/*
+ * given a list of ordered sums record them in the inode. This happens
+ * at IO completion time based on sums calculated at bio submission time.
+ */
+static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 file_offset,
+ struct list_head *list)
+{
+ struct list_head *cur;
+ struct btrfs_ordered_sum *sum;
+
+ btrfs_set_trans_block_group(trans, inode);
+ list_for_each(cur, list) {
+ sum = list_entry(cur, struct btrfs_ordered_sum, list);
+ btrfs_csum_file_blocks(trans,
+ BTRFS_I(inode)->root->fs_info->csum_root, sum);
+ }
+ return 0;
+}
+
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+{
+ if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
+ WARN_ON(1);
+ return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
+ GFP_NOFS);
+}
+
+/* see btrfs_writepage_start_hook for details on why this is required */
+struct btrfs_writepage_fixup {
+ struct page *page;
+ struct btrfs_work work;
+};
+
+static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+{
+ struct btrfs_writepage_fixup *fixup;
+ struct btrfs_ordered_extent *ordered;
+ struct page *page;
+ struct inode *inode;
+ u64 page_start;
+ u64 page_end;
+
+ fixup = container_of(work, struct btrfs_writepage_fixup, work);
+ page = fixup->page;
+again:
+ lock_page(page);
+ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+ ClearPageChecked(page);
+ goto out_page;
+ }
+
+ inode = page->mapping->host;
+ page_start = page_offset(page);
+ page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+
+ /* already ordered? We're done */
+ if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ EXTENT_ORDERED, 0)) {
+ goto out;
+ }
+
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
+ page_end, GFP_NOFS);
+ unlock_page(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ goto again;
+ }
+
+ btrfs_set_extent_delalloc(inode, page_start, page_end);
+ ClearPageChecked(page);
+out:
+ unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+out_page:
+ unlock_page(page);
+ page_cache_release(page);
+}
+
+/*
+ * There are a few paths in the higher layers of the kernel that directly
+ * set the page dirty bit without asking the filesystem if it is a
+ * good idea. This causes problems because we want to make sure COW
+ * properly happens and the data=ordered rules are followed.
+ *
+ * In our case any range that doesn't have the ORDERED bit set
+ * hasn't been properly setup for IO. We kick off an async process
+ * to fix it up. The async helper will wait for ordered extents, set
+ * the delalloc bit and make it safe to write the page.
+ */
+static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_writepage_fixup *fixup;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+ EXTENT_ORDERED, 0);
+ if (ret)
+ return 0;
+
+ if (PageChecked(page))
+ return -EAGAIN;
+
+ fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+ if (!fixup)
+ return -EAGAIN;
+
+ SetPageChecked(page);
+ page_cache_get(page);
+ fixup->work.func = btrfs_writepage_fixup_worker;
+ fixup->page = page;
+ btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
+ return -EAGAIN;
+}
+
+static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 file_pos,
+ u64 disk_bytenr, u64 disk_num_bytes,
+ u64 num_bytes, u64 ram_bytes,
+ u8 compression, u8 encryption,
+ u16 other_encoding, int extent_type)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key ins;
+ u64 hint;
+ int ret;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ ret = btrfs_drop_extents(trans, root, inode, file_pos,
+ file_pos + num_bytes, file_pos, &hint);
+ BUG_ON(ret);
+
+ ins.objectid = inode->i_ino;
+ ins.offset = file_pos;
+ ins.type = BTRFS_EXTENT_DATA_KEY;
+ ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
+ BUG_ON(ret);
+ leaf = path->nodes[0];
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+ btrfs_set_file_extent_type(leaf, fi, extent_type);
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+ btrfs_set_file_extent_compression(leaf, fi, compression);
+ btrfs_set_file_extent_encryption(leaf, fi, encryption);
+ btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+ btrfs_mark_buffer_dirty(leaf);
+
+ inode_add_bytes(inode, num_bytes);
+ btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
+
+ ins.objectid = disk_bytenr;
+ ins.offset = disk_num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
+ root->root_key.objectid,
+ trans->transid, inode->i_ino, &ins);
+ BUG_ON(ret);
+
+ btrfs_free_path(path);
+ return 0;
+}
+
+/* as ordered data IO finishes, this gets called so we can finish
+ * an ordered extent if the range of bytes in the file it covers are
+ * fully written.
+ */
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_ordered_extent *ordered_extent;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int compressed = 0;
+ int ret;
+
+ ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+ if (!ret)
+ return 0;
+
+ trans = btrfs_join_transaction(root, 1);
+
+ ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+ BUG_ON(!ordered_extent);
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
+ goto nocow;
+
+ lock_extent(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ GFP_NOFS);
+
+ if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+ compressed = 1;
+ if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+ BUG_ON(compressed);
+ ret = btrfs_mark_extent_written(trans, root, inode,
+ ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len);
+ BUG_ON(ret);
+ } else {
+ ret = insert_reserved_file_extent(trans, inode,
+ ordered_extent->file_offset,
+ ordered_extent->start,
+ ordered_extent->disk_len,
+ ordered_extent->len,
+ ordered_extent->len,
+ compressed, 0, 0,
+ BTRFS_FILE_EXTENT_REG);
+ BUG_ON(ret);
+ }
+ unlock_extent(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset + ordered_extent->len - 1,
+ GFP_NOFS);
+nocow:
+ add_pending_csums(trans, inode, ordered_extent->file_offset,
+ &ordered_extent->list);
+
+ mutex_lock(&BTRFS_I(inode)->extent_mutex);
+ btrfs_ordered_update_i_size(inode, ordered_extent);
+ btrfs_update_inode(trans, root, inode);
+ btrfs_remove_ordered_extent(inode, ordered_extent);
+ mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+
+ /* once for us */
+ btrfs_put_ordered_extent(ordered_extent);
+ /* once for the tree */
+ btrfs_put_ordered_extent(ordered_extent);
+
+ btrfs_end_transaction(trans, root);
+ return 0;
+}
+
+static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+ struct extent_state *state, int uptodate)
+{
+ return btrfs_finish_ordered_io(page->mapping->host, start, end);
+}
+
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+ struct page *page;
+ u64 start;
+ u64 len;
+ u64 logical;
+ unsigned long bio_flags;
+ int last_mirror;
+};
+
+static int btrfs_io_failed_hook(struct bio *failed_bio,
+ struct page *page, u64 start, u64 end,
+ struct extent_state *state)
+{
+ struct io_failure_record *failrec = NULL;
+ u64 private;
+ struct extent_map *em;
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct bio *bio;
+ int num_copies;
+ int ret;
+ int rw;
+ u64 logical;
+
+ ret = get_state_private(failure_tree, start, &private);
+ if (ret) {
+ failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return -ENOMEM;
+ failrec->start = start;
+ failrec->len = end - start + 1;
+ failrec->last_mirror = 0;
+ failrec->bio_flags = 0;
+
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, failrec->len);
+ if (em->start > start || em->start + em->len < start) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ spin_unlock(&em_tree->lock);
+
+ if (!em || IS_ERR(em)) {
+ kfree(failrec);
+ return -EIO;
+ }
+ logical = start - em->start;
+ logical = em->block_start + logical;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ logical = em->block_start;
+ failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ }
+ failrec->logical = logical;
+ free_extent_map(em);
+ set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
+ EXTENT_DIRTY, GFP_NOFS);
+ set_state_private(failure_tree, start,
+ (u64)(unsigned long)failrec);
+ } else {
+ failrec = (struct io_failure_record *)(unsigned long)private;
+ }
+ num_copies = btrfs_num_copies(
+ &BTRFS_I(inode)->root->fs_info->mapping_tree,
+ failrec->logical, failrec->len);
+ failrec->last_mirror++;
+ if (!state) {
+ spin_lock(&BTRFS_I(inode)->io_tree.lock);
+ state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+ failrec->start,
+ EXTENT_LOCKED);
+ if (state && state->start != failrec->start)
+ state = NULL;
+ spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+ }
+ if (!state || failrec->last_mirror > num_copies) {
+ set_state_private(failure_tree, failrec->start, 0);
+ clear_extent_bits(failure_tree, failrec->start,
+ failrec->start + failrec->len - 1,
+ EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+ kfree(failrec);
+ return -EIO;
+ }
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio->bi_private = state;
+ bio->bi_end_io = failed_bio->bi_end_io;
+ bio->bi_sector = failrec->logical >> 9;
+ bio->bi_bdev = failed_bio->bi_bdev;
+ bio->bi_size = 0;
+
+ bio_add_page(bio, page, failrec->len, start - page_offset(page));
+ if (failed_bio->bi_rw & (1 << BIO_RW))
+ rw = WRITE;
+ else
+ rw = READ;
+
+ BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
+ failrec->last_mirror,
+ failrec->bio_flags);
+ return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int btrfs_clean_io_failures(struct inode *inode, u64 start)
+{
+ u64 private;
+ u64 private_failure;
+ struct io_failure_record *failure;
+ int ret;
+
+ private = 0;
+ if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+ (u64)-1, 1, EXTENT_DIRTY)) {
+ ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
+ start, &private_failure);
+ if (ret == 0) {
+ failure = (struct io_failure_record *)(unsigned long)
+ private_failure;
+ set_state_private(&BTRFS_I(inode)->io_failure_tree,
+ failure->start, 0);
+ clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+ failure->start,
+ failure->start + failure->len - 1,
+ EXTENT_DIRTY | EXTENT_LOCKED,
+ GFP_NOFS);
+ kfree(failure);
+ }
+ }
+ return 0;
+}
+
+/*
+ * when reads are done, we need to check csums to verify the data is correct
+ * if there's a match, we allow the bio to finish. If not, we go through
+ * the io_failure_record routines to find good copies
+ */
+static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+ struct extent_state *state)
+{
+ size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
+ struct inode *inode = page->mapping->host;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ char *kaddr;
+ u64 private = ~(u32)0;
+ int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u32 csum = ~(u32)0;
+
+ if (PageChecked(page)) {
+ ClearPageChecked(page);
+ goto good;
+ }
+ if (btrfs_test_flag(inode, NODATASUM))
+ return 0;
+
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
+ clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
+ GFP_NOFS);
+ return 0;
+ }
+
+ if (state && state->start == start) {
+ private = state->private;
+ ret = 0;
+ } else {
+ ret = get_state_private(io_tree, start, &private);
+ }
+ kaddr = kmap_atomic(page, KM_USER0);
+ if (ret)
+ goto zeroit;
+
+ csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
+ btrfs_csum_final(csum, (char *)&csum);
+ if (csum != private)
+ goto zeroit;
+
+ kunmap_atomic(kaddr, KM_USER0);
+good:
+ /* if the io failure tree for this inode is non-empty,
+ * check to see if we've recovered from a failed IO
+ */
+ btrfs_clean_io_failures(inode, start);
+ return 0;
+
+zeroit:
+ printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
+ "private %llu\n", page->mapping->host->i_ino,
+ (unsigned long long)start, csum,
+ (unsigned long long)private);
+ memset(kaddr + offset, 1, end - start + 1);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ if (private == 0)
+ return 0;
+ return -EIO;
+}
+
+/*
+ * This creates an orphan entry for the given inode in case something goes
+ * wrong in the middle of an unlink/truncate.
+ */
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ spin_lock(&root->list_lock);
+
+ /* already on the orphan list, we're good */
+ if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+ spin_unlock(&root->list_lock);
+ return 0;
+ }
+
+ list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+
+ spin_unlock(&root->list_lock);
+
+ /*
+ * insert an orphan item to track this unlinked/truncated file
+ */
+ ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+
+ return ret;
+}
+
+/*
+ * We have done the truncate/delete so we can go ahead and remove the orphan
+ * item for this particular inode.
+ */
+int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ spin_lock(&root->list_lock);
+
+ if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+ spin_unlock(&root->list_lock);
+ return 0;
+ }
+
+ list_del_init(&BTRFS_I(inode)->i_orphan);
+ if (!trans) {
+ spin_unlock(&root->list_lock);
+ return 0;
+ }
+
+ spin_unlock(&root->list_lock);
+
+ ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+
+ return ret;
+}
+
+/*
+ * this cleans up any orphans that may be left on the list from the last use
+ * of this root.
+ */
+void btrfs_orphan_cleanup(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ struct btrfs_key key, found_key;
+ struct btrfs_trans_handle *trans;
+ struct inode *inode;
+ int ret = 0, nr_unlink = 0, nr_truncate = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return;
+ path->reada = -1;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.offset = (u64)-1;
+
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ printk(KERN_ERR "Error searching slot for orphan: %d"
+ "\n", ret);
+ break;
+ }
+
+ /*
+ * if ret == 0 means we found what we were searching for, which
+ * is weird, but possible, so only screw with path if we didnt
+ * find the key and see if we have stuff that matches
+ */
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ /* pull out the item */
+ leaf = path->nodes[0];
+ item = btrfs_item_nr(leaf, path->slots[0]);
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ /* make sure the item matches what we want */
+ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
+ break;
+ if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ /* release the path since we're done with it */
+ btrfs_release_path(root, path);
+
+ /*
+ * this is where we are basically btrfs_lookup, without the
+ * crossing root thing. we store the inode number in the
+ * offset of the orphan item.
+ */
+ inode = btrfs_iget_locked(root->fs_info->sb,
+ found_key.offset, root);
+ if (!inode)
+ break;
+
+ if (inode->i_state & I_NEW) {
+ BTRFS_I(inode)->root = root;
+
+ /* have to set the location manually */
+ BTRFS_I(inode)->location.objectid = inode->i_ino;
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.offset = 0;
+
+ btrfs_read_locked_inode(inode);
+ unlock_new_inode(inode);
+ }
+
+ /*
+ * add this inode to the orphan list so btrfs_orphan_del does
+ * the proper thing when we hit it
+ */
+ spin_lock(&root->list_lock);
+ list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ spin_unlock(&root->list_lock);
+
+ /*
+ * if this is a bad inode, means we actually succeeded in
+ * removing the inode, but not the orphan record, which means
+ * we need to manually delete the orphan since iput will just
+ * do a destroy_inode
+ */
+ if (is_bad_inode(inode)) {
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_orphan_del(trans, inode);
+ btrfs_end_transaction(trans, root);
+ iput(inode);
+ continue;
+ }
+
+ /* if we have links, this was a truncate, lets do that */
+ if (inode->i_nlink) {
+ nr_truncate++;
+ btrfs_truncate(inode);
+ } else {
+ nr_unlink++;
+ }
+
+ /* this will do delete_inode and everything for us */
+ iput(inode);
+ }
+
+ if (nr_unlink)
+ printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
+ if (nr_truncate)
+ printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+
+ btrfs_free_path(path);
+}
+
+/*
+ * read an inode from the btree into the in-memory inode
+ */
+void btrfs_read_locked_inode(struct inode *inode)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_timespec *tspec;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_key location;
+ u64 alloc_group_block;
+ u32 rdev;
+ int ret;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+
+ ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
+ if (ret)
+ goto make_bad;
+
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+
+ inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+ inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
+ inode->i_uid = btrfs_inode_uid(leaf, inode_item);
+ inode->i_gid = btrfs_inode_gid(leaf, inode_item);
+ btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
+
+ tspec = btrfs_inode_atime(inode_item);
+ inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+ inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+
+ tspec = btrfs_inode_mtime(inode_item);
+ inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+ inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+
+ tspec = btrfs_inode_ctime(inode_item);
+ inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+ inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+
+ inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
+ BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+ BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
+ inode->i_generation = BTRFS_I(inode)->generation;
+ inode->i_rdev = 0;
+ rdev = btrfs_inode_rdev(leaf, inode_item);
+
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+ BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+
+ alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+ BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
+ alloc_group_block, 0);
+ btrfs_free_path(path);
+ inode_item = NULL;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ break;
+ case S_IFDIR:
+ inode->i_fop = &btrfs_dir_file_operations;
+ if (root == root->fs_info->tree_root)
+ inode->i_op = &btrfs_dir_ro_inode_operations;
+ else
+ inode->i_op = &btrfs_dir_inode_operations;
+ break;
+ case S_IFLNK:
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ break;
+ default:
+ init_special_inode(inode, inode->i_mode, rdev);
+ break;
+ }
+ return;
+
+make_bad:
+ btrfs_free_path(path);
+ make_bad_inode(inode);
+}
+
+/*
+ * given a leaf and an inode, copy the inode fields into the leaf
+ */
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+ struct extent_buffer *leaf,
+ struct btrfs_inode_item *item,
+ struct inode *inode)
+{
+ btrfs_set_inode_uid(leaf, item, inode->i_uid);
+ btrfs_set_inode_gid(leaf, item, inode->i_gid);
+ btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+ btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_sec);
+ btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
+ inode->i_atime.tv_nsec);
+
+ btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_sec);
+ btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ inode->i_mtime.tv_nsec);
+
+ btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_sec);
+ btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ inode->i_ctime.tv_nsec);
+
+ btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+ btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
+ btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
+}
+
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_lookup_inode(trans, root, path,
+ &BTRFS_I(inode)->location, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto failed;
+ }
+
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+
+ fill_inode_item(trans, leaf, inode_item, inode);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_set_inode_last_trans(trans, inode);
+ ret = 0;
+failed:
+ btrfs_free_path(path);
+ return ret;
+}
+
+
+/*
+ * unlink helper that gets used here in inode.c and in the tree logging
+ * recovery code. It remove a link in a directory with a given name, and
+ * also drops the back refs in the inode to the directory
+ */
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, struct inode *inode,
+ const char *name, int name_len)
+{
+ struct btrfs_path *path;
+ int ret = 0;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 index;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ name, name_len, -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto err;
+ }
+ if (!di) {
+ ret = -ENOENT;
+ goto err;
+ }
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
+ goto err;
+ btrfs_release_path(root, path);
+
+ ret = btrfs_del_inode_ref(trans, root, name, name_len,
+ inode->i_ino,
+ dir->i_ino, &index);
+ if (ret) {
+ printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
+ "inode %lu parent %lu\n", name_len, name,
+ inode->i_ino, dir->i_ino);
+ goto err;
+ }
+
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+ index, name, name_len, -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto err;
+ }
+ if (!di) {
+ ret = -ENOENT;
+ goto err;
+ }
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ btrfs_release_path(root, path);
+
+ ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
+ inode, dir->i_ino);
+ BUG_ON(ret != 0 && ret != -ENOENT);
+ if (ret != -ENOENT)
+ BTRFS_I(dir)->log_dirty_trans = trans->transid;
+
+ ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
+ dir, index);
+ BUG_ON(ret);
+err:
+ btrfs_free_path(path);
+ if (ret)
+ goto out;
+
+ btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ btrfs_update_inode(trans, root, dir);
+ btrfs_drop_nlink(inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ dir->i_sb->s_dirt = 1;
+out:
+ return ret;
+}
+
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_root *root;
+ struct btrfs_trans_handle *trans;
+ struct inode *inode = dentry->d_inode;
+ int ret;
+ unsigned long nr = 0;
+
+ root = BTRFS_I(dir)->root;
+
+ ret = btrfs_check_free_space(root, 1, 1);
+ if (ret)
+ goto fail;
+
+ trans = btrfs_start_transaction(root, 1);
+
+ btrfs_set_trans_block_group(trans, dir);
+ ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+ dentry->d_name.name, dentry->d_name.len);
+
+ if (inode->i_nlink == 0)
+ ret = btrfs_orphan_add(trans, inode);
+
+ nr = trans->blocks_used;
+
+ btrfs_end_transaction_throttle(trans, root);
+fail:
+ btrfs_btree_balance_dirty(root, nr);
+ return ret;
+}
+
+static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ int err = 0;
+ int ret;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_trans_handle *trans;
+ unsigned long nr = 0;
+
+ /*
+ * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
+ * the root of a subvolume or snapshot
+ */
+ if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
+ inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ return -ENOTEMPTY;
+ }
+
+ ret = btrfs_check_free_space(root, 1, 1);
+ if (ret)
+ goto fail;
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_orphan_add(trans, inode);
+ if (err)
+ goto fail_trans;
+
+ /* now the directory is empty */
+ err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+ dentry->d_name.name, dentry->d_name.len);
+ if (!err)
+ btrfs_i_size_write(inode, 0);
+
+fail_trans:
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction_throttle(trans, root);
+fail:
+ btrfs_btree_balance_dirty(root, nr);
+
+ if (ret && !err)
+ err = ret;
+ return err;
+}
+
+#if 0
+/*
+ * when truncating bytes in a file, it is possible to avoid reading
+ * the leaves that contain only checksum items. This can be the
+ * majority of the IO required to delete a large file, but it must
+ * be done carefully.
+ *
+ * The keys in the level just above the leaves are checked to make sure
+ * the lowest key in a given leaf is a csum key, and starts at an offset
+ * after the new size.
+ *
+ * Then the key for the next leaf is checked to make sure it also has
+ * a checksum item for the same file. If it does, we know our target leaf
+ * contains only checksum items, and it can be safely freed without reading
+ * it.
+ *
+ * This is just an optimization targeted at large files. It may do
+ * nothing. It will return 0 unless things went badly.
+ */
+static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct inode *inode, u64 new_size)
+{
+ struct btrfs_key key;
+ int ret;
+ int nritems;
+ struct btrfs_key found_key;
+ struct btrfs_key other_key;
+ struct btrfs_leaf_ref *ref;
+ u64 leaf_gen;
+ u64 leaf_start;
+
+ path->lowest_level = 1;
+ key.objectid = inode->i_ino;
+ key.type = BTRFS_CSUM_ITEM_KEY;
+ key.offset = new_size;
+again:
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (path->nodes[1] == NULL) {
+ ret = 0;
+ goto out;
+ }
+ ret = 0;
+ btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
+ nritems = btrfs_header_nritems(path->nodes[1]);
+
+ if (!nritems)
+ goto out;
+
+ if (path->slots[1] >= nritems)
+ goto next_node;
+
+ /* did we find a key greater than anything we want to delete? */
+ if (found_key.objectid > inode->i_ino ||
+ (found_key.objectid == inode->i_ino && found_key.type > key.type))
+ goto out;
+
+ /* we check the next key in the node to make sure the leave contains
+ * only checksum items. This comparison doesn't work if our
+ * leaf is the last one in the node
+ */
+ if (path->slots[1] + 1 >= nritems) {
+next_node:
+ /* search forward from the last key in the node, this
+ * will bring us into the next node in the tree
+ */
+ btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
+
+ /* unlikely, but we inc below, so check to be safe */
+ if (found_key.offset == (u64)-1)
+ goto out;
+
+ /* search_forward needs a path with locks held, do the
+ * search again for the original key. It is possible
+ * this will race with a balance and return a path that
+ * we could modify, but this drop is just an optimization
+ * and is allowed to miss some leaves.
+ */
+ btrfs_release_path(root, path);
+ found_key.offset++;
+
+ /* setup a max key for search_forward */
+ other_key.offset = (u64)-1;
+ other_key.type = key.type;
+ other_key.objectid = key.objectid;
+
+ path->keep_locks = 1;
+ ret = btrfs_search_forward(root, &found_key, &other_key,
+ path, 0, 0);
+ path->keep_locks = 0;
+ if (ret || found_key.objectid != key.objectid ||
+ found_key.type != key.type) {
+ ret = 0;
+ goto out;
+ }
+
+ key.offset = found_key.offset;
+ btrfs_release_path(root, path);
+ cond_resched();
+ goto again;
+ }
+
+ /* we know there's one more slot after us in the tree,
+ * read that key so we can verify it is also a checksum item
+ */
+ btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
+
+ if (found_key.objectid < inode->i_ino)
+ goto next_key;
+
+ if (found_key.type != key.type || found_key.offset < new_size)
+ goto next_key;
+
+ /*
+ * if the key for the next leaf isn't a csum key from this objectid,
+ * we can't be sure there aren't good items inside this leaf.
+ * Bail out
+ */
+ if (other_key.objectid != inode->i_ino || other_key.type != key.type)
+ goto out;
+
+ leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
+ leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
+ /*
+ * it is safe to delete this leaf, it contains only
+ * csum items from this inode at an offset >= new_size
+ */
+ ret = btrfs_del_leaf(trans, root, path, leaf_start);
+ BUG_ON(ret);
+
+ if (root->ref_cows && leaf_gen < trans->transid) {
+ ref = btrfs_alloc_leaf_ref(root, 0);
+ if (ref) {
+ ref->root_gen = root->root_key.offset;
+ ref->bytenr = leaf_start;
+ ref->owner = 0;
+ ref->generation = leaf_gen;
+ ref->nritems = 0;
+
+ ret = btrfs_add_leaf_ref(root, ref, 0);
+ WARN_ON(ret);
+ btrfs_free_leaf_ref(root, ref);
+ } else {
+ WARN_ON(1);
+ }
+ }
+next_key:
+ btrfs_release_path(root, path);
+
+ if (other_key.objectid == inode->i_ino &&
+ other_key.type == key.type && other_key.offset > key.offset) {
+ key.offset = other_key.offset;
+ cond_resched();
+ goto again;
+ }
+ ret = 0;
+out:
+ /* fixup any changes we've made to the path */
+ path->lowest_level = 0;
+ path->keep_locks = 0;
+ btrfs_release_path(root, path);
+ return ret;
+}
+
+#endif
+
+/*
+ * this can truncate away extent items, csum items and directory items.
+ * It starts at a high offset and removes keys until it can't find
+ * any higher than new_size
+ *
+ * csum items that cross the new i_size are truncated to the new size
+ * as well.
+ *
+ * min_type is the minimum key type to truncate down to. If set to 0, this
+ * will kill all the items on this inode, including the INODE_ITEM_KEY.
+ */
+noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ u64 new_size, u32 min_type)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u32 found_type;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ u64 extent_start = 0;
+ u64 extent_num_bytes = 0;
+ u64 item_end = 0;
+ u64 root_gen = 0;
+ u64 root_owner = 0;
+ int found_extent;
+ int del_item;
+ int pending_del_nr = 0;
+ int pending_del_slot = 0;
+ int extent_type = -1;
+ int encoding;
+ u64 mask = root->sectorsize - 1;
+
+ if (root->ref_cows)
+ btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+ path = btrfs_alloc_path();
+ path->reada = -1;
+ BUG_ON(!path);
+
+ /* FIXME, add redo link to tree so we don't leak on crash */
+ key.objectid = inode->i_ino;
+ key.offset = (u64)-1;
+ key.type = (u8)-1;
+
+ btrfs_init_path(path);
+
+search_again:
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto error;
+
+ if (ret > 0) {
+ /* there are no items in the tree for us to truncate, we're
+ * done
+ */
+ if (path->slots[0] == 0) {
+ ret = 0;
+ goto error;
+ }
+ path->slots[0]--;
+ }
+
+ while (1) {
+ fi = NULL;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ found_type = btrfs_key_type(&found_key);
+ encoding = 0;
+
+ if (found_key.objectid != inode->i_ino)
+ break;
+
+ if (found_type < min_type)
+ break;
+
+ item_end = found_key.offset;
+ if (found_type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+ encoding = btrfs_file_extent_compression(leaf, fi);
+ encoding |= btrfs_file_extent_encryption(leaf, fi);
+ encoding |= btrfs_file_extent_other_encoding(leaf, fi);
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ item_end +=
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ item_end += btrfs_file_extent_inline_len(leaf,
+ fi);
+ }
+ item_end--;
+ }
+ if (item_end < new_size) {
+ if (found_type == BTRFS_DIR_ITEM_KEY)
+ found_type = BTRFS_INODE_ITEM_KEY;
+ else if (found_type == BTRFS_EXTENT_ITEM_KEY)
+ found_type = BTRFS_EXTENT_DATA_KEY;
+ else if (found_type == BTRFS_EXTENT_DATA_KEY)
+ found_type = BTRFS_XATTR_ITEM_KEY;
+ else if (found_type == BTRFS_XATTR_ITEM_KEY)
+ found_type = BTRFS_INODE_REF_KEY;
+ else if (found_type)
+ found_type--;
+ else
+ break;
+ btrfs_set_key_type(&key, found_type);
+ goto next;
+ }
+ if (found_key.offset >= new_size)
+ del_item = 1;
+ else
+ del_item = 0;
+ found_extent = 0;
+
+ /* FIXME, shrink the extent if the ref count is only 1 */
+ if (found_type != BTRFS_EXTENT_DATA_KEY)
+ goto delete;
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ u64 num_dec;
+ extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (!del_item && !encoding) {
+ u64 orig_num_bytes =
+ btrfs_file_extent_num_bytes(leaf, fi);
+ extent_num_bytes = new_size -
+ found_key.offset + root->sectorsize - 1;
+ extent_num_bytes = extent_num_bytes &
+ ~((u64)root->sectorsize - 1);
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_num_bytes);
+ num_dec = (orig_num_bytes -
+ extent_num_bytes);
+ if (root->ref_cows && extent_start != 0)
+ inode_sub_bytes(inode, num_dec);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ extent_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf,
+ fi);
+ /* FIXME blocksize != 4096 */
+ num_dec = btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_start != 0) {
+ found_extent = 1;
+ if (root->ref_cows)
+ inode_sub_bytes(inode, num_dec);
+ }
+ root_gen = btrfs_header_generation(leaf);
+ root_owner = btrfs_header_owner(leaf);
+ }
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ /*
+ * we can't truncate inline items that have had
+ * special encodings
+ */
+ if (!del_item &&
+ btrfs_file_extent_compression(leaf, fi) == 0 &&
+ btrfs_file_extent_encryption(leaf, fi) == 0 &&
+ btrfs_file_extent_other_encoding(leaf, fi) == 0) {
+ u32 size = new_size - found_key.offset;
+
+ if (root->ref_cows) {
+ inode_sub_bytes(inode, item_end + 1 -
+ new_size);
+ }
+ size =
+ btrfs_file_extent_calc_inline_size(size);
+ ret = btrfs_truncate_item(trans, root, path,
+ size, 1);
+ BUG_ON(ret);
+ } else if (root->ref_cows) {
+ inode_sub_bytes(inode, item_end + 1 -
+ found_key.offset);
+ }
+ }
+delete:
+ if (del_item) {
+ if (!pending_del_nr) {
+ /* no pending yet, add ourselves */
+ pending_del_slot = path->slots[0];
+ pending_del_nr = 1;
+ } else if (pending_del_nr &&
+ path->slots[0] + 1 == pending_del_slot) {
+ /* hop on the pending chunk */
+ pending_del_nr++;
+ pending_del_slot = path->slots[0];
+ } else {
+ BUG();
+ }
+ } else {
+ break;
+ }
+ if (found_extent) {
+ ret = btrfs_free_extent(trans, root, extent_start,
+ extent_num_bytes,
+ leaf->start, root_owner,
+ root_gen, inode->i_ino, 0);
+ BUG_ON(ret);
+ }
+next:
+ if (path->slots[0] == 0) {
+ if (pending_del_nr)
+ goto del_pending;
+ btrfs_release_path(root, path);
+ goto search_again;
+ }
+
+ path->slots[0]--;
+ if (pending_del_nr &&
+ path->slots[0] + 1 != pending_del_slot) {
+ struct btrfs_key debug;
+del_pending:
+ btrfs_item_key_to_cpu(path->nodes[0], &debug,
+ pending_del_slot);
+ ret = btrfs_del_items(trans, root, path,
+ pending_del_slot,
+ pending_del_nr);
+ BUG_ON(ret);
+ pending_del_nr = 0;
+ btrfs_release_path(root, path);
+ goto search_again;
+ }
+ }
+ ret = 0;
+error:
+ if (pending_del_nr) {
+ ret = btrfs_del_items(trans, root, path, pending_del_slot,
+ pending_del_nr);
+ }
+ btrfs_free_path(path);
+ inode->i_sb->s_dirt = 1;
+ return ret;
+}
+
+/*
+ * taken from block_truncate_page, but does cow as it zeros out
+ * any bytes left in the last page in the file.
+ */
+static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+{
+ struct inode *inode = mapping->host;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ char *kaddr;
+ u32 blocksize = root->sectorsize;
+ pgoff_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ struct page *page;
+ int ret = 0;
+ u64 page_start;
+ u64 page_end;
+
+ if ((offset & (blocksize - 1)) == 0)
+ goto out;
+
+ ret = -ENOMEM;
+again:
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ goto out;
+
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ if (!PageUptodate(page)) {
+ ret = btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+ }
+ wait_on_page_writeback(page);
+
+ lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ set_page_extent_mapped(page);
+
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ btrfs_set_extent_delalloc(inode, page_start, page_end);
+ ret = 0;
+ if (offset != PAGE_CACHE_SIZE) {
+ kaddr = kmap(page);
+ memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+ flush_dcache_page(page);
+ kunmap(page);
+ }
+ ClearPageChecked(page);
+ set_page_dirty(page);
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+out_unlock:
+ unlock_page(page);
+ page_cache_release(page);
+out:
+ return ret;
+}
+
+int btrfs_cont_expand(struct inode *inode, loff_t size)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em;
+ u64 mask = root->sectorsize - 1;
+ u64 hole_start = (inode->i_size + mask) & ~mask;
+ u64 block_end = (size + mask) & ~mask;
+ u64 last_byte;
+ u64 cur_offset;
+ u64 hole_size;
+ int err;
+
+ if (size <= hole_start)
+ return 0;
+
+ err = btrfs_check_free_space(root, 1, 0);
+ if (err)
+ return err;
+
+ btrfs_truncate_page(inode->i_mapping, inode->i_size);
+
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+ btrfs_wait_ordered_range(inode, hole_start,
+ block_end - hole_start);
+ lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+ ordered = btrfs_lookup_ordered_extent(inode, hole_start);
+ if (!ordered)
+ break;
+ unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+
+ cur_offset = hole_start;
+ while (1) {
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+ block_end - cur_offset, 0);
+ BUG_ON(IS_ERR(em) || !em);
+ last_byte = min(extent_map_end(em), block_end);
+ last_byte = (last_byte + mask) & ~mask;
+ if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+ u64 hint_byte = 0;
+ hole_size = last_byte - cur_offset;
+ err = btrfs_drop_extents(trans, root, inode,
+ cur_offset,
+ cur_offset + hole_size,
+ cur_offset, &hint_byte);
+ if (err)
+ break;
+ err = btrfs_insert_file_extent(trans, root,
+ inode->i_ino, cur_offset, 0,
+ 0, hole_size, 0, hole_size,
+ 0, 0, 0);
+ btrfs_drop_extent_cache(inode, hole_start,
+ last_byte - 1, 0);
+ }
+ free_extent_map(em);
+ cur_offset = last_byte;
+ if (err || cur_offset >= block_end)
+ break;
+ }
+
+ btrfs_end_transaction(trans, root);
+ unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+ return err;
+}
+
+static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = dentry->d_inode;
+ int err;
+
+ err = inode_change_ok(inode, attr);
+ if (err)
+ return err;
+
+ if (S_ISREG(inode->i_mode) &&
+ attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
+ err = btrfs_cont_expand(inode, attr->ia_size);
+ if (err)
+ return err;
+ }
+
+ err = inode_setattr(inode, attr);
+
+ if (!err && ((attr->ia_valid & ATTR_MODE)))
+ err = btrfs_acl_chmod(inode);
+ return err;
+}
+
+void btrfs_delete_inode(struct inode *inode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ unsigned long nr;
+ int ret;
+
+ truncate_inode_pages(&inode->i_data, 0);
+ if (is_bad_inode(inode)) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+
+ btrfs_i_size_write(inode, 0);
+ trans = btrfs_join_transaction(root, 1);
+
+ btrfs_set_trans_block_group(trans, inode);
+ ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
+ if (ret) {
+ btrfs_orphan_del(NULL, inode);
+ goto no_delete_lock;
+ }
+
+ btrfs_orphan_del(trans, inode);
+
+ nr = trans->blocks_used;
+ clear_inode(inode);
+
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+ return;
+
+no_delete_lock:
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+no_delete:
+ clear_inode(inode);
+}
+
+/*
+ * this returns the key found in the dir entry in the location pointer.
+ * If no dir entries were found, location->objectid is 0.
+ */
+static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+ struct btrfs_key *location)
+{
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
+ namelen, 0);
+ if (IS_ERR(di))
+ ret = PTR_ERR(di);
+
+ if (!di || IS_ERR(di))
+ goto out_err;
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
+out:
+ btrfs_free_path(path);
+ return ret;
+out_err:
+ location->objectid = 0;
+ goto out;
+}
+
+/*
+ * when we hit a tree root in a directory, the btrfs part of the inode
+ * needs to be changed to reflect the root directory of the tree root. This
+ * is kind of like crossing a mount point.
+ */
+static int fixup_tree_root_location(struct btrfs_root *root,
+ struct btrfs_key *location,
+ struct btrfs_root **sub_root,
+ struct dentry *dentry)
+{
+ struct btrfs_root_item *ri;
+
+ if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
+ return 0;
+ if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return 0;
+
+ *sub_root = btrfs_read_fs_root(root->fs_info, location,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ if (IS_ERR(*sub_root))
+ return PTR_ERR(*sub_root);
+
+ ri = &(*sub_root)->root_item;
+ location->objectid = btrfs_root_dirid(ri);
+ btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+ location->offset = 0;
+
+ return 0;
+}
+
+static noinline void init_btrfs_i(struct inode *inode)
+{
+ struct btrfs_inode *bi = BTRFS_I(inode);
+
+ bi->i_acl = NULL;
+ bi->i_default_acl = NULL;
+
+ bi->generation = 0;
+ bi->sequence = 0;
+ bi->last_trans = 0;
+ bi->logged_trans = 0;
+ bi->delalloc_bytes = 0;
+ bi->disk_i_size = 0;
+ bi->flags = 0;
+ bi->index_cnt = (u64)-1;
+ bi->log_dirty_trans = 0;
+ extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+ extent_io_tree_init(&BTRFS_I(inode)->io_tree,
+ inode->i_mapping, GFP_NOFS);
+ extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
+ inode->i_mapping, GFP_NOFS);
+ INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+ btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
+ mutex_init(&BTRFS_I(inode)->extent_mutex);
+ mutex_init(&BTRFS_I(inode)->log_mutex);
+}
+
+static int btrfs_init_locked_inode(struct inode *inode, void *p)
+{
+ struct btrfs_iget_args *args = p;
+ inode->i_ino = args->ino;
+ init_btrfs_i(inode);
+ BTRFS_I(inode)->root = args->root;
+ return 0;
+}
+
+static int btrfs_find_actor(struct inode *inode, void *opaque)
+{
+ struct btrfs_iget_args *args = opaque;
+ return args->ino == inode->i_ino &&
+ args->root == BTRFS_I(inode)->root;
+}
+
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+ struct btrfs_root *root, int wait)
+{
+ struct inode *inode;
+ struct btrfs_iget_args args;
+ args.ino = objectid;
+ args.root = root;
+
+ if (wait) {
+ inode = ilookup5(s, objectid, btrfs_find_actor,
+ (void *)&args);
+ } else {
+ inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
+ (void *)&args);
+ }
+ return inode;
+}
+
+struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
+ struct btrfs_root *root)
+{
+ struct inode *inode;
+ struct btrfs_iget_args args;
+ args.ino = objectid;
+ args.root = root;
+
+ inode = iget5_locked(s, objectid, btrfs_find_actor,
+ btrfs_init_locked_inode,
+ (void *)&args);
+ return inode;
+}
+
+/* Get an inode object given its location and corresponding root.
+ * Returns in *is_new if the inode was read from disk
+ */
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+ struct btrfs_root *root, int *is_new)
+{
+ struct inode *inode;
+
+ inode = btrfs_iget_locked(s, location->objectid, root);
+ if (!inode)
+ return ERR_PTR(-EACCES);
+
+ if (inode->i_state & I_NEW) {
+ BTRFS_I(inode)->root = root;
+ memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
+ btrfs_read_locked_inode(inode);
+ unlock_new_inode(inode);
+ if (is_new)
+ *is_new = 1;
+ } else {
+ if (is_new)
+ *is_new = 0;
+ }
+
+ return inode;
+}
+
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode;
+ struct btrfs_inode *bi = BTRFS_I(dir);
+ struct btrfs_root *root = bi->root;
+ struct btrfs_root *sub_root = root;
+ struct btrfs_key location;
+ int ret, new;
+
+ if (dentry->d_name.len > BTRFS_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ ret = btrfs_inode_by_name(dir, dentry, &location);
+
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ inode = NULL;
+ if (location.objectid) {
+ ret = fixup_tree_root_location(root, &location, &sub_root,
+ dentry);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return ERR_PTR(-ENOENT);
+ inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ }
+ return inode;
+}
+
+static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct inode *inode;
+
+ if (dentry->d_name.len > BTRFS_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ inode = btrfs_lookup_dentry(dir, dentry);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return d_splice_alias(inode, dentry);
+}
+
+static unsigned char btrfs_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int btrfs_real_readdir(struct file *filp, void *dirent,
+ filldir_t filldir)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_item *item;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ int ret;
+ u32 nritems;
+ struct extent_buffer *leaf;
+ int slot;
+ int advance;
+ unsigned char d_type;
+ int over = 0;
+ u32 di_cur;
+ u32 di_total;
+ u32 di_len;
+ int key_type = BTRFS_DIR_INDEX_KEY;
+ char tmp_name[32];
+ char *name_ptr;
+ int name_len;
+
+ /* FIXME, use a real flag for deciding about the key type */
+ if (root->fs_info->tree_root == root)
+ key_type = BTRFS_DIR_ITEM_KEY;
+
+ /* special case for "." */
+ if (filp->f_pos == 0) {
+ over = filldir(dirent, ".", 1,
+ 1, inode->i_ino,
+ DT_DIR);
+ if (over)
+ return 0;
+ filp->f_pos = 1;
+ }
+ /* special case for .., just use the back ref */
+ if (filp->f_pos == 1) {
+ u64 pino = parent_ino(filp->f_path.dentry);
+ over = filldir(dirent, "..", 2,
+ 2, pino, DT_DIR);
+ if (over)
+ return 0;
+ filp->f_pos = 2;
+ }
+ path = btrfs_alloc_path();
+ path->reada = 2;
+
+ btrfs_set_key_type(&key, key_type);
+ key.offset = filp->f_pos;
+ key.objectid = inode->i_ino;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+ advance = 0;
+
+ while (1) {
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+ if (advance || slot >= nritems) {
+ if (slot >= nritems - 1) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+ } else {
+ slot++;
+ path->slots[0]++;
+ }
+ }
+
+ advance = 1;
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.objectid != key.objectid)
+ break;
+ if (btrfs_key_type(&found_key) != key_type)
+ break;
+ if (found_key.offset < filp->f_pos)
+ continue;
+
+ filp->f_pos = found_key.offset;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ di_cur = 0;
+ di_total = btrfs_item_size(leaf, item);
+
+ while (di_cur < di_total) {
+ struct btrfs_key location;
+
+ name_len = btrfs_dir_name_len(leaf, di);
+ if (name_len <= sizeof(tmp_name)) {
+ name_ptr = tmp_name;
+ } else {
+ name_ptr = kmalloc(name_len, GFP_NOFS);
+ if (!name_ptr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+ read_extent_buffer(leaf, name_ptr,
+ (unsigned long)(di + 1), name_len);
+
+ d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+ btrfs_dir_item_key_to_cpu(leaf, di, &location);
+
+ /* is this a reference to our own snapshot? If so
+ * skip it
+ */
+ if (location.type == BTRFS_ROOT_ITEM_KEY &&
+ location.objectid == root->root_key.objectid) {
+ over = 0;
+ goto skip;
+ }
+ over = filldir(dirent, name_ptr, name_len,
+ found_key.offset, location.objectid,
+ d_type);
+
+skip:
+ if (name_ptr != tmp_name)
+ kfree(name_ptr);
+
+ if (over)
+ goto nopos;
+ di_len = btrfs_dir_name_len(leaf, di) +
+ btrfs_dir_data_len(leaf, di) + sizeof(*di);
+ di_cur += di_len;
+ di = (struct btrfs_dir_item *)((char *)di + di_len);
+ }
+ }
+
+ /* Reached end of directory/root. Bump pos past the last item. */
+ if (key_type == BTRFS_DIR_INDEX_KEY)
+ filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
+ else
+ filp->f_pos++;
+nopos:
+ ret = 0;
+err:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_write_inode(struct inode *inode, int wait)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (root->fs_info->btree_inode == inode)
+ return 0;
+
+ if (wait) {
+ trans = btrfs_join_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ ret = btrfs_commit_transaction(trans, root);
+ }
+ return ret;
+}
+
+/*
+ * This is somewhat expensive, updating the tree every time the
+ * inode changes. But, it is most likely to find the inode in cache.
+ * FIXME, needs more benchmarking...there are no reasons other than performance
+ * to keep or drop this code.
+ */
+void btrfs_dirty_inode(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_join_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction(trans, root);
+}
+
+/*
+ * find the highest existing sequence number in a directory
+ * and then set the in-memory index_cnt variable to reflect
+ * free sequence numbers
+ */
+static int btrfs_set_inode_index_count(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_key key, found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = inode->i_ino;
+ btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+ key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ /* FIXME: we should be able to handle this */
+ if (ret == 0)
+ goto out;
+ ret = 0;
+
+ /*
+ * MAGIC NUMBER EXPLANATION:
+ * since we search a directory based on f_pos we have to start at 2
+ * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
+ * else has to start at 2
+ */
+ if (path->slots[0] == 0) {
+ BTRFS_I(inode)->index_cnt = 2;
+ goto out;
+ }
+
+ path->slots[0]--;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid != inode->i_ino ||
+ btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+ BTRFS_I(inode)->index_cnt = 2;
+ goto out;
+ }
+
+ BTRFS_I(inode)->index_cnt = found_key.offset + 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper to find a free sequence number in a given directory. This current
+ * code is very simple, later versions will do smarter things in the btree
+ */
+int btrfs_set_inode_index(struct inode *dir, u64 *index)
+{
+ int ret = 0;
+
+ if (BTRFS_I(dir)->index_cnt == (u64)-1) {
+ ret = btrfs_set_inode_index_count(dir);
+ if (ret)
+ return ret;
+ }
+
+ *index = BTRFS_I(dir)->index_cnt;
+ BTRFS_I(dir)->index_cnt++;
+
+ return ret;
+}
+
+static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir,
+ const char *name, int name_len,
+ u64 ref_objectid, u64 objectid,
+ u64 alloc_hint, int mode, u64 *index)
+{
+ struct inode *inode;
+ struct btrfs_inode_item *inode_item;
+ struct btrfs_key *location;
+ struct btrfs_path *path;
+ struct btrfs_inode_ref *ref;
+ struct btrfs_key key[2];
+ u32 sizes[2];
+ unsigned long ptr;
+ int ret;
+ int owner;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ inode = new_inode(root->fs_info->sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (dir) {
+ ret = btrfs_set_inode_index(dir, index);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ /*
+ * index_cnt is ignored for everything but a dir,
+ * btrfs_get_inode_index_count has an explanation for the magic
+ * number
+ */
+ init_btrfs_i(inode);
+ BTRFS_I(inode)->index_cnt = 2;
+ BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->generation = trans->transid;
+
+ if (mode & S_IFDIR)
+ owner = 0;
+ else
+ owner = 1;
+ BTRFS_I(inode)->block_group =
+ btrfs_find_block_group(root, 0, alloc_hint, owner);
+ if ((mode & S_IFREG)) {
+ if (btrfs_test_opt(root, NODATASUM))
+ btrfs_set_flag(inode, NODATASUM);
+ if (btrfs_test_opt(root, NODATACOW))
+ btrfs_set_flag(inode, NODATACOW);
+ }
+
+ key[0].objectid = objectid;
+ btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+ key[0].offset = 0;
+
+ key[1].objectid = objectid;
+ btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+ key[1].offset = ref_objectid;
+
+ sizes[0] = sizeof(struct btrfs_inode_item);
+ sizes[1] = name_len + sizeof(*ref);
+
+ ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
+ if (ret != 0)
+ goto fail;
+
+ if (objectid > root->highest_inode)
+ root->highest_inode = objectid;
+
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+ inode->i_mode = mode;
+ inode->i_ino = objectid;
+ inode_set_bytes(inode, 0);
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ fill_inode_item(trans, path->nodes[0], inode_item, inode);
+
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+ struct btrfs_inode_ref);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
+ ptr = (unsigned long)(ref + 1);
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_free_path(path);
+
+ location = &BTRFS_I(inode)->location;
+ location->objectid = objectid;
+ location->offset = 0;
+ btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+
+ insert_inode_hash(inode);
+ return inode;
+fail:
+ if (dir)
+ BTRFS_I(dir)->index_cnt--;
+ btrfs_free_path(path);
+ return ERR_PTR(ret);
+}
+
+static inline u8 btrfs_inode_type(struct inode *inode)
+{
+ return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
+}
+
+/*
+ * utility function to add 'inode' into 'parent_inode' with
+ * a give name and a given sequence number.
+ * if 'add_backref' is true, also insert a backref from the
+ * inode to the parent directory.
+ */
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+ struct inode *parent_inode, struct inode *inode,
+ const char *name, int name_len, int add_backref, u64 index)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_root *root = BTRFS_I(parent_inode)->root;
+
+ key.objectid = inode->i_ino;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+
+ ret = btrfs_insert_dir_item(trans, root, name, name_len,
+ parent_inode->i_ino,
+ &key, btrfs_inode_type(inode),
+ index);
+ if (ret == 0) {
+ if (add_backref) {
+ ret = btrfs_insert_inode_ref(trans, root,
+ name, name_len,
+ inode->i_ino,
+ parent_inode->i_ino,
+ index);
+ }
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ name_len * 2);
+ parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, parent_inode);
+ }
+ return ret;
+}
+
+static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
+ struct dentry *dentry, struct inode *inode,
+ int backref, u64 index)
+{
+ int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+ inode, dentry->d_name.name,
+ dentry->d_name.len, backref, index);
+ if (!err) {
+ d_instantiate(dentry, inode);
+ return 0;
+ }
+ if (err > 0)
+ err = -EEXIST;
+ return err;
+}
+
+static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
+ int mode, dev_t rdev)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = NULL;
+ int err;
+ int drop_inode = 0;
+ u64 objectid;
+ unsigned long nr = 0;
+ u64 index = 0;
+
+ if (!new_valid_dev(rdev))
+ return -EINVAL;
+
+ err = btrfs_check_free_space(root, 1, 0);
+ if (err)
+ goto fail;
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+ if (err) {
+ err = -ENOSPC;
+ goto out_unlock;
+ }
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len,
+ dentry->d_parent->d_inode->i_ino, objectid,
+ BTRFS_I(dir)->block_group, mode, &index);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_unlock;
+
+ err = btrfs_init_acl(inode, dir);
+ if (err) {
+ drop_inode = 1;
+ goto out_unlock;
+ }
+
+ btrfs_set_trans_block_group(trans, inode);
+ err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+ if (err)
+ drop_inode = 1;
+ else {
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ btrfs_update_inode(trans, root, inode);
+ }
+ dir->i_sb->s_dirt = 1;
+ btrfs_update_inode_block_group(trans, inode);
+ btrfs_update_inode_block_group(trans, dir);
+out_unlock:
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+fail:
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_btree_balance_dirty(root, nr);
+ return err;
+}
+
+static int btrfs_create(struct inode *dir, struct dentry *dentry,
+ int mode, struct nameidata *nd)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = NULL;
+ int err;
+ int drop_inode = 0;
+ unsigned long nr = 0;
+ u64 objectid;
+ u64 index = 0;
+
+ err = btrfs_check_free_space(root, 1, 0);
+ if (err)
+ goto fail;
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+ if (err) {
+ err = -ENOSPC;
+ goto out_unlock;
+ }
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len,
+ dentry->d_parent->d_inode->i_ino,
+ objectid, BTRFS_I(dir)->block_group, mode,
+ &index);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_unlock;
+
+ err = btrfs_init_acl(inode, dir);
+ if (err) {
+ drop_inode = 1;
+ goto out_unlock;
+ }
+
+ btrfs_set_trans_block_group(trans, inode);
+ err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+ if (err)
+ drop_inode = 1;
+ else {
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ }
+ dir->i_sb->s_dirt = 1;
+ btrfs_update_inode_block_group(trans, inode);
+ btrfs_update_inode_block_group(trans, dir);
+out_unlock:
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+fail:
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_btree_balance_dirty(root, nr);
+ return err;
+}
+
+static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = old_dentry->d_inode;
+ u64 index;
+ unsigned long nr = 0;
+ int err;
+ int drop_inode = 0;
+
+ if (inode->i_nlink == 0)
+ return -ENOENT;
+
+ btrfs_inc_nlink(inode);
+ err = btrfs_check_free_space(root, 1, 0);
+ if (err)
+ goto fail;
+ err = btrfs_set_inode_index(dir, &index);
+ if (err)
+ goto fail;
+
+ trans = btrfs_start_transaction(root, 1);
+
+ btrfs_set_trans_block_group(trans, dir);
+ atomic_inc(&inode->i_count);
+
+ err = btrfs_add_nondir(trans, dentry, inode, 1, index);
+
+ if (err)
+ drop_inode = 1;
+
+ dir->i_sb->s_dirt = 1;
+ btrfs_update_inode_block_group(trans, dir);
+ err = btrfs_update_inode(trans, root, inode);
+
+ if (err)
+ drop_inode = 1;
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+fail:
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_btree_balance_dirty(root, nr);
+ return err;
+}
+
+static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct inode *inode = NULL;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ int err = 0;
+ int drop_on_err = 0;
+ u64 objectid = 0;
+ u64 index = 0;
+ unsigned long nr = 1;
+
+ err = btrfs_check_free_space(root, 1, 0);
+ if (err)
+ goto out_unlock;
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, dir);
+
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_unlock;
+ }
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+ if (err) {
+ err = -ENOSPC;
+ goto out_unlock;
+ }
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len,
+ dentry->d_parent->d_inode->i_ino, objectid,
+ BTRFS_I(dir)->block_group, S_IFDIR | mode,
+ &index);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_fail;
+ }
+
+ drop_on_err = 1;
+
+ err = btrfs_init_acl(inode, dir);
+ if (err)
+ goto out_fail;
+
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+ btrfs_set_trans_block_group(trans, inode);
+
+ btrfs_i_size_write(inode, 0);
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
+ goto out_fail;
+
+ err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+ inode, dentry->d_name.name,
+ dentry->d_name.len, 0, index);
+ if (err)
+ goto out_fail;
+
+ d_instantiate(dentry, inode);
+ drop_on_err = 0;
+ dir->i_sb->s_dirt = 1;
+ btrfs_update_inode_block_group(trans, inode);
+ btrfs_update_inode_block_group(trans, dir);
+
+out_fail:
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+
+out_unlock:
+ if (drop_on_err)
+ iput(inode);
+ btrfs_btree_balance_dirty(root, nr);
+ return err;
+}
+
+/* helper for btfs_get_extent. Given an existing extent in the tree,
+ * and an extent that you want to insert, deal with overlap and insert
+ * the new extent into the tree.
+ */
+static int merge_extent_mapping(struct extent_map_tree *em_tree,
+ struct extent_map *existing,
+ struct extent_map *em,
+ u64 map_start, u64 map_len)
+{
+ u64 start_diff;
+
+ BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+ start_diff = map_start - em->start;
+ em->start = map_start;
+ em->len = map_len;
+ if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+ !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ em->block_start += start_diff;
+ em->block_len -= start_diff;
+ }
+ return add_extent_mapping(em_tree, em);
+}
+
+static noinline int uncompress_inline(struct btrfs_path *path,
+ struct inode *inode, struct page *page,
+ size_t pg_offset, u64 extent_offset,
+ struct btrfs_file_extent_item *item)
+{
+ int ret;
+ struct extent_buffer *leaf = path->nodes[0];
+ char *tmp;
+ size_t max_size;
+ unsigned long inline_size;
+ unsigned long ptr;
+
+ WARN_ON(pg_offset != 0);
+ max_size = btrfs_file_extent_ram_bytes(leaf, item);
+ inline_size = btrfs_file_extent_inline_item_len(leaf,
+ btrfs_item_nr(leaf, path->slots[0]));
+ tmp = kmalloc(inline_size, GFP_NOFS);
+ ptr = btrfs_file_extent_inline_start(item);
+
+ read_extent_buffer(leaf, tmp, ptr, inline_size);
+
+ max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
+ ret = btrfs_zlib_decompress(tmp, page, extent_offset,
+ inline_size, max_size);
+ if (ret) {
+ char *kaddr = kmap_atomic(page, KM_USER0);
+ unsigned long copy_size = min_t(u64,
+ PAGE_CACHE_SIZE - pg_offset,
+ max_size - extent_offset);
+ memset(kaddr + pg_offset, 0, copy_size);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+ kfree(tmp);
+ return 0;
+}
+
+/*
+ * a bit scary, this does extent mapping from logical file offset to the disk.
+ * the ugly parts come from merging extents from the disk with the in-ram
+ * representation. This gets more complex because of the data=ordered code,
+ * where the in-ram extents might be locked pending data=ordered completion.
+ *
+ * This also copies inline extents directly into the page.
+ */
+
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create)
+{
+ int ret;
+ int err = 0;
+ u64 bytenr;
+ u64 extent_start = 0;
+ u64 extent_end = 0;
+ u64 objectid = inode->i_ino;
+ u32 found_type;
+ struct btrfs_path *path = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *item;
+ struct extent_buffer *leaf;
+ struct btrfs_key found_key;
+ struct extent_map *em = NULL;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_trans_handle *trans = NULL;
+ int compressed;
+
+again:
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (em)
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ spin_unlock(&em_tree->lock);
+
+ if (em) {
+ if (em->start > start || em->start + em->len <= start)
+ free_extent_map(em);
+ else if (em->block_start == EXTENT_MAP_INLINE && page)
+ free_extent_map(em);
+ else
+ goto out;
+ }
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em) {
+ err = -ENOMEM;
+ goto out;
+ }
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->start = EXTENT_MAP_HOLE;
+ em->orig_start = EXTENT_MAP_HOLE;
+ em->len = (u64)-1;
+ em->block_len = (u64)-1;
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ }
+
+ ret = btrfs_lookup_file_extent(trans, root, path,
+ objectid, start, trans != NULL);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+
+ if (ret != 0) {
+ if (path->slots[0] == 0)
+ goto not_found;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ /* are we inside the extent that was found? */
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ found_type = btrfs_key_type(&found_key);
+ if (found_key.objectid != objectid ||
+ found_type != BTRFS_EXTENT_DATA_KEY) {
+ goto not_found;
+ }
+
+ found_type = btrfs_file_extent_type(leaf, item);
+ extent_start = found_key.offset;
+ compressed = btrfs_file_extent_compression(leaf, item);
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ extent_end = extent_start +
+ btrfs_file_extent_num_bytes(leaf, item);
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ size_t size;
+ size = btrfs_file_extent_inline_len(leaf, item);
+ extent_end = (extent_start + size + root->sectorsize - 1) &
+ ~((u64)root->sectorsize - 1);
+ }
+
+ if (start >= extent_end) {
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret > 0)
+ goto not_found;
+ leaf = path->nodes[0];
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != objectid ||
+ found_key.type != BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+ if (start + len <= found_key.offset)
+ goto not_found;
+ em->start = start;
+ em->len = found_key.offset - start;
+ goto not_found_em;
+ }
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ em->orig_start = extent_start -
+ btrfs_file_extent_offset(leaf, item);
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
+ if (bytenr == 0) {
+ em->block_start = EXTENT_MAP_HOLE;
+ goto insert;
+ }
+ if (compressed) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->block_start = bytenr;
+ em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
+ item);
+ } else {
+ bytenr += btrfs_file_extent_offset(leaf, item);
+ em->block_start = bytenr;
+ em->block_len = em->len;
+ if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ }
+ goto insert;
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ unsigned long ptr;
+ char *map;
+ size_t size;
+ size_t extent_offset;
+ size_t copy_size;
+
+ em->block_start = EXTENT_MAP_INLINE;
+ if (!page || create) {
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ goto out;
+ }
+
+ size = btrfs_file_extent_inline_len(leaf, item);
+ extent_offset = page_offset(page) + pg_offset - extent_start;
+ copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
+ size - extent_offset);
+ em->start = extent_start + extent_offset;
+ em->len = (copy_size + root->sectorsize - 1) &
+ ~((u64)root->sectorsize - 1);
+ em->orig_start = EXTENT_MAP_INLINE;
+ if (compressed)
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+ if (create == 0 && !PageUptodate(page)) {
+ if (btrfs_file_extent_compression(leaf, item) ==
+ BTRFS_COMPRESS_ZLIB) {
+ ret = uncompress_inline(path, inode, page,
+ pg_offset,
+ extent_offset, item);
+ BUG_ON(ret);
+ } else {
+ map = kmap(page);
+ read_extent_buffer(leaf, map + pg_offset, ptr,
+ copy_size);
+ kunmap(page);
+ }
+ flush_dcache_page(page);
+ } else if (create && PageUptodate(page)) {
+ if (!trans) {
+ kunmap(page);
+ free_extent_map(em);
+ em = NULL;
+ btrfs_release_path(root, path);
+ trans = btrfs_join_transaction(root, 1);
+ goto again;
+ }
+ map = kmap(page);
+ write_extent_buffer(leaf, map + pg_offset, ptr,
+ copy_size);
+ kunmap(page);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+ set_extent_uptodate(io_tree, em->start,
+ extent_map_end(em) - 1, GFP_NOFS);
+ goto insert;
+ } else {
+ printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
+ WARN_ON(1);
+ }
+not_found:
+ em->start = start;
+ em->len = len;
+not_found_em:
+ em->block_start = EXTENT_MAP_HOLE;
+ set_bit(EXTENT_FLAG_VACANCY, &em->flags);
+insert:
+ btrfs_release_path(root, path);
+ if (em->start > start || extent_map_end(em) <= start) {
+ printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
+ "[%llu %llu]\n", (unsigned long long)em->start,
+ (unsigned long long)em->len,
+ (unsigned long long)start,
+ (unsigned long long)len);
+ err = -EIO;
+ goto out;
+ }
+
+ err = 0;
+ spin_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ /* it is possible that someone inserted the extent into the tree
+ * while we had the lock dropped. It is also possible that
+ * an overlapping map exists in the tree
+ */
+ if (ret == -EEXIST) {
+ struct extent_map *existing;
+
+ ret = 0;
+
+ existing = lookup_extent_mapping(em_tree, start, len);
+ if (existing && (existing->start > start ||
+ existing->start + existing->len <= start)) {
+ free_extent_map(existing);
+ existing = NULL;
+ }
+ if (!existing) {
+ existing = lookup_extent_mapping(em_tree, em->start,
+ em->len);
+ if (existing) {
+ err = merge_extent_mapping(em_tree, existing,
+ em, start,
+ root->sectorsize);
+ free_extent_map(existing);
+ if (err) {
+ free_extent_map(em);
+ em = NULL;
+ }
+ } else {
+ err = -EIO;
+ free_extent_map(em);
+ em = NULL;
+ }
+ } else {
+ free_extent_map(em);
+ em = existing;
+ err = 0;
+ }
+ }
+ spin_unlock(&em_tree->lock);
+out:
+ if (path)
+ btrfs_free_path(path);
+ if (trans) {
+ ret = btrfs_end_transaction(trans, root);
+ if (!err)
+ err = ret;
+ }
+ if (err) {
+ free_extent_map(em);
+ WARN_ON(1);
+ return ERR_PTR(err);
+ }
+ return em;
+}
+
+static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ return -EINVAL;
+}
+
+static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
+{
+ return extent_bmap(mapping, iblock, btrfs_get_extent);
+}
+
+int btrfs_readpage(struct file *file, struct page *page)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ return extent_read_full_page(tree, page, btrfs_get_extent);
+}
+
+static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+
+
+ if (current->flags & PF_MEMALLOC) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+}
+
+int btrfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct extent_io_tree *tree;
+
+ tree = &BTRFS_I(mapping->host)->io_tree;
+ return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
+}
+
+static int
+btrfs_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(mapping->host)->io_tree;
+ return extent_readpages(tree, mapping, pages, nr_pages,
+ btrfs_get_extent);
+}
+static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+ struct extent_io_tree *tree;
+ struct extent_map_tree *map;
+ int ret;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ map = &BTRFS_I(page->mapping->host)->extent_tree;
+ ret = try_release_extent_mapping(map, tree, page, gfp_flags);
+ if (ret == 1) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+ return ret;
+}
+
+static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+ if (PageWriteback(page) || PageDirty(page))
+ return 0;
+ return __btrfs_releasepage(page, gfp_flags);
+}
+
+static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct extent_io_tree *tree;
+ struct btrfs_ordered_extent *ordered;
+ u64 page_start = page_offset(page);
+ u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ wait_on_page_writeback(page);
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (offset) {
+ btrfs_releasepage(page, GFP_NOFS);
+ return;
+ }
+
+ lock_extent(tree, page_start, page_end, GFP_NOFS);
+ ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+ page_offset(page));
+ if (ordered) {
+ /*
+ * IO on this page will never be started, so we need
+ * to account for any ordered extents now
+ */
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_LOCKED, 1, 0, GFP_NOFS);
+ btrfs_finish_ordered_io(page->mapping->host,
+ page_start, page_end);
+ btrfs_put_ordered_extent(ordered);
+ lock_extent(tree, page_start, page_end, GFP_NOFS);
+ }
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_ORDERED,
+ 1, 1, GFP_NOFS);
+ __btrfs_releasepage(page, GFP_NOFS);
+
+ ClearPageChecked(page);
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+ }
+}
+
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF. Because
+ * vmtruncate() writes the inode size before removing pages, once we have the
+ * page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ struct inode *inode = fdentry(vma->vm_file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ char *kaddr;
+ unsigned long zero_start;
+ loff_t size;
+ int ret;
+ u64 page_start;
+ u64 page_end;
+
+ ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+again:
+ lock_page(page);
+ size = i_size_read(inode);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ if ((page->mapping != inode->i_mapping) ||
+ (page_start >= size)) {
+ /* page got truncated out from underneath us */
+ goto out_unlock;
+ }
+ wait_on_page_writeback(page);
+
+ lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ set_page_extent_mapped(page);
+
+ /*
+ * we can't set the delalloc bits if there are pending ordered
+ * extents. Drop our locks and wait for them to finish
+ */
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+
+ btrfs_set_extent_delalloc(inode, page_start, page_end);
+ ret = 0;
+
+ /* page is wholly or partially inside EOF */
+ if (page_start + PAGE_CACHE_SIZE > size)
+ zero_start = size & ~PAGE_CACHE_MASK;
+ else
+ zero_start = PAGE_CACHE_SIZE;
+
+ if (zero_start != PAGE_CACHE_SIZE) {
+ kaddr = kmap(page);
+ memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+ flush_dcache_page(page);
+ kunmap(page);
+ }
+ ClearPageChecked(page);
+ set_page_dirty(page);
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+out_unlock:
+ unlock_page(page);
+out:
+ return ret;
+}
+
+static void btrfs_truncate(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+ struct btrfs_trans_handle *trans;
+ unsigned long nr;
+ u64 mask = root->sectorsize - 1;
+
+ if (!S_ISREG(inode->i_mode))
+ return;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return;
+
+ btrfs_truncate_page(inode->i_mapping, inode->i_size);
+ btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ btrfs_i_size_write(inode, inode->i_size);
+
+ ret = btrfs_orphan_add(trans, inode);
+ if (ret)
+ goto out;
+ /* FIXME, add redo link to tree so we don't leak on crash */
+ ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
+ BTRFS_EXTENT_DATA_KEY);
+ btrfs_update_inode(trans, root, inode);
+
+ ret = btrfs_orphan_del(trans, inode);
+ BUG_ON(ret);
+
+out:
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction_throttle(trans, root);
+ BUG_ON(ret);
+ btrfs_btree_balance_dirty(root, nr);
+}
+
+/*
+ * create a new subvolume directory/inode (helper for the ioctl).
+ */
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *new_root, struct dentry *dentry,
+ u64 new_dirid, u64 alloc_hint)
+{
+ struct inode *inode;
+ int error;
+ u64 index = 0;
+
+ inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
+ new_dirid, alloc_hint, S_IFDIR | 0700, &index);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+
+ inode->i_nlink = 1;
+ btrfs_i_size_write(inode, 0);
+
+ error = btrfs_update_inode(trans, new_root, inode);
+ if (error)
+ return error;
+
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+/* helper function for file defrag and space balancing. This
+ * forces readahead on a given range of bytes in an inode
+ */
+unsigned long btrfs_force_ra(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *file,
+ pgoff_t offset, pgoff_t last_index)
+{
+ pgoff_t req_size = last_index - offset + 1;
+
+ page_cache_sync_readahead(mapping, ra, file, offset, req_size);
+ return offset + req_size;
+}
+
+struct inode *btrfs_alloc_inode(struct super_block *sb)
+{
+ struct btrfs_inode *ei;
+
+ ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
+ if (!ei)
+ return NULL;
+ ei->last_trans = 0;
+ ei->logged_trans = 0;
+ btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+ ei->i_acl = BTRFS_ACL_NOT_CACHED;
+ ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
+ INIT_LIST_HEAD(&ei->i_orphan);
+ return &ei->vfs_inode;
+}
+
+void btrfs_destroy_inode(struct inode *inode)
+{
+ struct btrfs_ordered_extent *ordered;
+ WARN_ON(!list_empty(&inode->i_dentry));
+ WARN_ON(inode->i_data.nrpages);
+
+ if (BTRFS_I(inode)->i_acl &&
+ BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
+ posix_acl_release(BTRFS_I(inode)->i_acl);
+ if (BTRFS_I(inode)->i_default_acl &&
+ BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
+ posix_acl_release(BTRFS_I(inode)->i_default_acl);
+
+ spin_lock(&BTRFS_I(inode)->root->list_lock);
+ if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+ printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
+ " list\n", inode->i_ino);
+ dump_stack();
+ }
+ spin_unlock(&BTRFS_I(inode)->root->list_lock);
+
+ while (1) {
+ ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+ if (!ordered)
+ break;
+ else {
+ printk(KERN_ERR "btrfs found ordered "
+ "extent %llu %llu on inode cleanup\n",
+ (unsigned long long)ordered->file_offset,
+ (unsigned long long)ordered->len);
+ btrfs_remove_ordered_extent(inode, ordered);
+ btrfs_put_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ }
+ }
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+
+static void init_once(void *foo)
+{
+ struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+
+ inode_init_once(&ei->vfs_inode);
+}
+
+void btrfs_destroy_cachep(void)
+{
+ if (btrfs_inode_cachep)
+ kmem_cache_destroy(btrfs_inode_cachep);
+ if (btrfs_trans_handle_cachep)
+ kmem_cache_destroy(btrfs_trans_handle_cachep);
+ if (btrfs_transaction_cachep)
+ kmem_cache_destroy(btrfs_transaction_cachep);
+ if (btrfs_bit_radix_cachep)
+ kmem_cache_destroy(btrfs_bit_radix_cachep);
+ if (btrfs_path_cachep)
+ kmem_cache_destroy(btrfs_path_cachep);
+}
+
+struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
+ unsigned long extra_flags,
+ void (*ctor)(void *))
+{
+ return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD | extra_flags), ctor);
+}
+
+int btrfs_init_cachep(void)
+{
+ btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
+ sizeof(struct btrfs_inode),
+ 0, init_once);
+ if (!btrfs_inode_cachep)
+ goto fail;
+ btrfs_trans_handle_cachep =
+ btrfs_cache_create("btrfs_trans_handle_cache",
+ sizeof(struct btrfs_trans_handle),
+ 0, NULL);
+ if (!btrfs_trans_handle_cachep)
+ goto fail;
+ btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
+ sizeof(struct btrfs_transaction),
+ 0, NULL);
+ if (!btrfs_transaction_cachep)
+ goto fail;
+ btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
+ sizeof(struct btrfs_path),
+ 0, NULL);
+ if (!btrfs_path_cachep)
+ goto fail;
+ btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
+ SLAB_DESTROY_BY_RCU, NULL);
+ if (!btrfs_bit_radix_cachep)
+ goto fail;
+ return 0;
+fail:
+ btrfs_destroy_cachep();
+ return -ENOMEM;
+}
+
+static int btrfs_getattr(struct vfsmount *mnt,
+ struct dentry *dentry, struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+ stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
+ stat->blksize = PAGE_CACHE_SIZE;
+ stat->blocks = (inode_get_bytes(inode) +
+ BTRFS_I(inode)->delalloc_bytes) >> 9;
+ return 0;
+}
+
+static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(old_dir)->root;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct timespec ctime = CURRENT_TIME;
+ u64 index = 0;
+ int ret;
+
+ /* we're not allowed to rename between subvolumes */
+ if (BTRFS_I(old_inode)->root->root_key.objectid !=
+ BTRFS_I(new_dir)->root->root_key.objectid)
+ return -EXDEV;
+
+ if (S_ISDIR(old_inode->i_mode) && new_inode &&
+ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
+ return -ENOTEMPTY;
+ }
+
+ /* to rename a snapshot or subvolume, we need to juggle the
+ * backrefs. This isn't coded yet
+ */
+ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ return -EXDEV;
+
+ ret = btrfs_check_free_space(root, 1, 0);
+ if (ret)
+ goto out_unlock;
+
+ trans = btrfs_start_transaction(root, 1);
+
+ btrfs_set_trans_block_group(trans, new_dir);
+
+ btrfs_inc_nlink(old_dentry->d_inode);
+ old_dir->i_ctime = old_dir->i_mtime = ctime;
+ new_dir->i_ctime = new_dir->i_mtime = ctime;
+ old_inode->i_ctime = ctime;
+
+ ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ if (ret)
+ goto out_fail;
+
+ if (new_inode) {
+ new_inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_unlink_inode(trans, root, new_dir,
+ new_dentry->d_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ if (ret)
+ goto out_fail;
+ if (new_inode->i_nlink == 0) {
+ ret = btrfs_orphan_add(trans, new_dentry->d_inode);
+ if (ret)
+ goto out_fail;
+ }
+
+ }
+ ret = btrfs_set_inode_index(new_dir, &index);
+ if (ret)
+ goto out_fail;
+
+ ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
+ old_inode, new_dentry->d_name.name,
+ new_dentry->d_name.len, 1, index);
+ if (ret)
+ goto out_fail;
+
+out_fail:
+ btrfs_end_transaction_throttle(trans, root);
+out_unlock:
+ return ret;
+}
+
+/*
+ * some fairly slow code that needs optimization. This walks the list
+ * of all the inodes with pending delalloc and forces them to disk.
+ */
+int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+{
+ struct list_head *head = &root->fs_info->delalloc_inodes;
+ struct btrfs_inode *binode;
+ struct inode *inode;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ while (!list_empty(head)) {
+ binode = list_entry(head->next, struct btrfs_inode,
+ delalloc_inodes);
+ inode = igrab(&binode->vfs_inode);
+ if (!inode)
+ list_del_init(&binode->delalloc_inodes);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ if (inode) {
+ filemap_flush(inode->i_mapping);
+ iput(inode);
+ }
+ cond_resched();
+ spin_lock(&root->fs_info->delalloc_lock);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+
+ /* the filemap_flush will queue IO into the worker threads, but
+ * we have to make sure the IO is actually started and that
+ * ordered extents get created before we return
+ */
+ atomic_inc(&root->fs_info->async_submit_draining);
+ while (atomic_read(&root->fs_info->nr_async_submits) ||
+ atomic_read(&root->fs_info->async_delalloc_pages)) {
+ wait_event(root->fs_info->async_submit_wait,
+ (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+ atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+ }
+ atomic_dec(&root->fs_info->async_submit_draining);
+ return 0;
+}
+
+static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct inode *inode = NULL;
+ int err;
+ int drop_inode = 0;
+ u64 objectid;
+ u64 index = 0 ;
+ int name_len;
+ int datasize;
+ unsigned long ptr;
+ struct btrfs_file_extent_item *ei;
+ struct extent_buffer *leaf;
+ unsigned long nr = 0;
+
+ name_len = strlen(symname) + 1;
+ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
+ return -ENAMETOOLONG;
+
+ err = btrfs_check_free_space(root, 1, 0);
+ if (err)
+ goto out_fail;
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, dir);
+
+ err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+ if (err) {
+ err = -ENOSPC;
+ goto out_unlock;
+ }
+
+ inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+ dentry->d_name.len,
+ dentry->d_parent->d_inode->i_ino, objectid,
+ BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
+ &index);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_unlock;
+
+ err = btrfs_init_acl(inode, dir);
+ if (err) {
+ drop_inode = 1;
+ goto out_unlock;
+ }
+
+ btrfs_set_trans_block_group(trans, inode);
+ err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+ if (err)
+ drop_inode = 1;
+ else {
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ }
+ dir->i_sb->s_dirt = 1;
+ btrfs_update_inode_block_group(trans, inode);
+ btrfs_update_inode_block_group(trans, dir);
+ if (drop_inode)
+ goto out_unlock;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ key.objectid = inode->i_ino;
+ key.offset = 0;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+ datasize = btrfs_file_extent_calc_inline_size(name_len);
+ err = btrfs_insert_empty_item(trans, root, path, &key,
+ datasize);
+ if (err) {
+ drop_inode = 1;
+ goto out_unlock;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+ btrfs_set_file_extent_type(leaf, ei,
+ BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_encryption(leaf, ei, 0);
+ btrfs_set_file_extent_compression(leaf, ei, 0);
+ btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+ btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+
+ ptr = btrfs_file_extent_inline_start(ei);
+ write_extent_buffer(leaf, symname, ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ inode_set_bytes(inode, name_len);
+ btrfs_i_size_write(inode, name_len - 1);
+ err = btrfs_update_inode(trans, root, inode);
+ if (err)
+ drop_inode = 1;
+
+out_unlock:
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, root);
+out_fail:
+ if (drop_inode) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ }
+ btrfs_btree_balance_dirty(root, nr);
+ return err;
+}
+
+static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+ u64 alloc_hint, int mode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_key ins;
+ u64 alloc_size;
+ u64 cur_offset = start;
+ u64 num_bytes = end - start;
+ int ret = 0;
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(!trans);
+ btrfs_set_trans_block_group(trans, inode);
+
+ while (num_bytes > 0) {
+ alloc_size = min(num_bytes, root->fs_info->max_extent);
+ ret = btrfs_reserve_extent(trans, root, alloc_size,
+ root->sectorsize, 0, alloc_hint,
+ (u64)-1, &ins, 1);
+ if (ret) {
+ WARN_ON(1);
+ goto out;
+ }
+ ret = insert_reserved_file_extent(trans, inode,
+ cur_offset, ins.objectid,
+ ins.offset, ins.offset,
+ ins.offset, 0, 0, 0,
+ BTRFS_FILE_EXTENT_PREALLOC);
+ BUG_ON(ret);
+ num_bytes -= ins.offset;
+ cur_offset += ins.offset;
+ alloc_hint = ins.objectid + ins.offset;
+ }
+out:
+ if (cur_offset > start) {
+ inode->i_ctime = CURRENT_TIME;
+ btrfs_set_flag(inode, PREALLOC);
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ cur_offset > i_size_read(inode))
+ btrfs_i_size_write(inode, cur_offset);
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ }
+
+ btrfs_end_transaction(trans, root);
+ return ret;
+}
+
+static long btrfs_fallocate(struct inode *inode, int mode,
+ loff_t offset, loff_t len)
+{
+ u64 cur_offset;
+ u64 last_byte;
+ u64 alloc_start;
+ u64 alloc_end;
+ u64 alloc_hint = 0;
+ u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+ struct extent_map *em;
+ int ret;
+
+ alloc_start = offset & ~mask;
+ alloc_end = (offset + len + mask) & ~mask;
+
+ mutex_lock(&inode->i_mutex);
+ if (alloc_start > inode->i_size) {
+ ret = btrfs_cont_expand(inode, alloc_start);
+ if (ret)
+ goto out;
+ }
+
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+ lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
+ alloc_end - 1, GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ alloc_end - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > alloc_start &&
+ ordered->file_offset < alloc_end) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ alloc_start, alloc_end - 1, GFP_NOFS);
+ btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ } else {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ }
+
+ cur_offset = alloc_start;
+ while (1) {
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+ alloc_end - cur_offset, 0);
+ BUG_ON(IS_ERR(em) || !em);
+ last_byte = min(extent_map_end(em), alloc_end);
+ last_byte = (last_byte + mask) & ~mask;
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ ret = prealloc_file_range(inode, cur_offset,
+ last_byte, alloc_hint, mode);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
+ }
+ }
+ if (em->block_start <= EXTENT_MAP_LAST_BYTE)
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+
+ cur_offset = last_byte;
+ if (cur_offset >= alloc_end) {
+ ret = 0;
+ break;
+ }
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
+ GFP_NOFS);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
+static int btrfs_set_page_dirty(struct page *page)
+{
+ return __set_page_dirty_nobuffers(page);
+}
+
+static int btrfs_permission(struct inode *inode, int mask)
+{
+ if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
+ return -EACCES;
+ return generic_permission(inode, mask, btrfs_check_acl);
+}
+
+static struct inode_operations btrfs_dir_inode_operations = {
+ .getattr = btrfs_getattr,
+ .lookup = btrfs_lookup,
+ .create = btrfs_create,
+ .unlink = btrfs_unlink,
+ .link = btrfs_link,
+ .mkdir = btrfs_mkdir,
+ .rmdir = btrfs_rmdir,
+ .rename = btrfs_rename,
+ .symlink = btrfs_symlink,
+ .setattr = btrfs_setattr,
+ .mknod = btrfs_mknod,
+ .setxattr = btrfs_setxattr,
+ .getxattr = btrfs_getxattr,
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+ .permission = btrfs_permission,
+};
+static struct inode_operations btrfs_dir_ro_inode_operations = {
+ .lookup = btrfs_lookup,
+ .permission = btrfs_permission,
+};
+static struct file_operations btrfs_dir_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .readdir = btrfs_real_readdir,
+ .unlocked_ioctl = btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = btrfs_ioctl,
+#endif
+ .release = btrfs_release_file,
+ .fsync = btrfs_sync_file,
+};
+
+static struct extent_io_ops btrfs_extent_io_ops = {
+ .fill_delalloc = run_delalloc_range,
+ .submit_bio_hook = btrfs_submit_bio_hook,
+ .merge_bio_hook = btrfs_merge_bio_hook,
+ .readpage_end_io_hook = btrfs_readpage_end_io_hook,
+ .writepage_end_io_hook = btrfs_writepage_end_io_hook,
+ .writepage_start_hook = btrfs_writepage_start_hook,
+ .readpage_io_failed_hook = btrfs_io_failed_hook,
+ .set_bit_hook = btrfs_set_bit_hook,
+ .clear_bit_hook = btrfs_clear_bit_hook,
+};
+
+static struct address_space_operations btrfs_aops = {
+ .readpage = btrfs_readpage,
+ .writepage = btrfs_writepage,
+ .writepages = btrfs_writepages,
+ .readpages = btrfs_readpages,
+ .sync_page = block_sync_page,
+ .bmap = btrfs_bmap,
+ .direct_IO = btrfs_direct_IO,
+ .invalidatepage = btrfs_invalidatepage,
+ .releasepage = btrfs_releasepage,
+ .set_page_dirty = btrfs_set_page_dirty,
+};
+
+static struct address_space_operations btrfs_symlink_aops = {
+ .readpage = btrfs_readpage,
+ .writepage = btrfs_writepage,
+ .invalidatepage = btrfs_invalidatepage,
+ .releasepage = btrfs_releasepage,
+};
+
+static struct inode_operations btrfs_file_inode_operations = {
+ .truncate = btrfs_truncate,
+ .getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
+ .setxattr = btrfs_setxattr,
+ .getxattr = btrfs_getxattr,
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+ .permission = btrfs_permission,
+ .fallocate = btrfs_fallocate,
+};
+static struct inode_operations btrfs_special_inode_operations = {
+ .getattr = btrfs_getattr,
+ .setattr = btrfs_setattr,
+ .permission = btrfs_permission,
+ .setxattr = btrfs_setxattr,
+ .getxattr = btrfs_getxattr,
+ .listxattr = btrfs_listxattr,
+ .removexattr = btrfs_removexattr,
+};
+static struct inode_operations btrfs_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
+ .permission = btrfs_permission,
+};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 00000000000..c2aa33e3feb
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,1132 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/security.h>
+#include <linux/version.h>
+#include <linux/xattr.h>
+#include <linux/vmalloc.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "locking.h"
+
+
+
+static noinline int create_subvol(struct btrfs_root *root,
+ struct dentry *dentry,
+ char *name, int namelen)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key key;
+ struct btrfs_root_item root_item;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ struct btrfs_root *new_root = root;
+ struct inode *dir;
+ int ret;
+ int err;
+ u64 objectid;
+ u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+ u64 index = 0;
+ unsigned long nr = 1;
+
+ ret = btrfs_check_free_space(root, 1, 0);
+ if (ret)
+ goto fail_commit;
+
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+
+ ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+ 0, &objectid);
+ if (ret)
+ goto fail;
+
+ leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+ objectid, trans->transid, 0, 0, 0);
+ if (IS_ERR(leaf)) {
+ ret = PTR_ERR(leaf);
+ goto fail;
+ }
+
+ btrfs_set_header_nritems(leaf, 0);
+ btrfs_set_header_level(leaf, 0);
+ btrfs_set_header_bytenr(leaf, leaf->start);
+ btrfs_set_header_generation(leaf, trans->transid);
+ btrfs_set_header_owner(leaf, objectid);
+
+ write_extent_buffer(leaf, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(leaf),
+ BTRFS_FSID_SIZE);
+ btrfs_mark_buffer_dirty(leaf);
+
+ inode_item = &root_item.inode;
+ memset(inode_item, 0, sizeof(*inode_item));
+ inode_item->generation = cpu_to_le64(1);
+ inode_item->size = cpu_to_le64(3);
+ inode_item->nlink = cpu_to_le32(1);
+ inode_item->nbytes = cpu_to_le64(root->leafsize);
+ inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+
+ btrfs_set_root_bytenr(&root_item, leaf->start);
+ btrfs_set_root_generation(&root_item, trans->transid);
+ btrfs_set_root_level(&root_item, 0);
+ btrfs_set_root_refs(&root_item, 1);
+ btrfs_set_root_used(&root_item, 0);
+ btrfs_set_root_last_snapshot(&root_item, 0);
+
+ memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
+ root_item.drop_level = 0;
+
+ btrfs_tree_unlock(leaf);
+ free_extent_buffer(leaf);
+ leaf = NULL;
+
+ btrfs_set_root_dirid(&root_item, new_dirid);
+
+ key.objectid = objectid;
+ key.offset = 1;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+ &root_item);
+ if (ret)
+ goto fail;
+
+ /*
+ * insert the directory item
+ */
+ key.offset = (u64)-1;
+ dir = dentry->d_parent->d_inode;
+ ret = btrfs_set_inode_index(dir, &index);
+ BUG_ON(ret);
+
+ ret = btrfs_insert_dir_item(trans, root,
+ name, namelen, dir->i_ino, &key,
+ BTRFS_FT_DIR, index);
+ if (ret)
+ goto fail;
+
+ btrfs_i_size_write(dir, dir->i_size + namelen * 2);
+ ret = btrfs_update_inode(trans, root, dir);
+ BUG_ON(ret);
+
+ /* add the backref first */
+ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+ objectid, BTRFS_ROOT_BACKREF_KEY,
+ root->root_key.objectid,
+ dir->i_ino, index, name, namelen);
+
+ BUG_ON(ret);
+
+ /* now add the forward ref */
+ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+ root->root_key.objectid, BTRFS_ROOT_REF_KEY,
+ objectid,
+ dir->i_ino, index, name, namelen);
+
+ BUG_ON(ret);
+
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ goto fail_commit;
+
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ BUG_ON(!new_root);
+
+ trans = btrfs_start_transaction(new_root, 1);
+ BUG_ON(!trans);
+
+ ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
+ BTRFS_I(dir)->block_group);
+ if (ret)
+ goto fail;
+
+fail:
+ nr = trans->blocks_used;
+ err = btrfs_commit_transaction(trans, new_root);
+ if (err && !ret)
+ ret = err;
+fail_commit:
+ btrfs_btree_balance_dirty(root, nr);
+ return ret;
+}
+
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
+ char *name, int namelen)
+{
+ struct btrfs_pending_snapshot *pending_snapshot;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+ int err;
+ unsigned long nr = 0;
+
+ if (!root->ref_cows)
+ return -EINVAL;
+
+ ret = btrfs_check_free_space(root, 1, 0);
+ if (ret)
+ goto fail_unlock;
+
+ pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+ if (!pending_snapshot) {
+ ret = -ENOMEM;
+ goto fail_unlock;
+ }
+ pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
+ if (!pending_snapshot->name) {
+ ret = -ENOMEM;
+ kfree(pending_snapshot);
+ goto fail_unlock;
+ }
+ memcpy(pending_snapshot->name, name, namelen);
+ pending_snapshot->name[namelen] = '\0';
+ pending_snapshot->dentry = dentry;
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+ pending_snapshot->root = root;
+ list_add(&pending_snapshot->list,
+ &trans->transaction->pending_snapshots);
+ err = btrfs_commit_transaction(trans, root);
+
+fail_unlock:
+ btrfs_btree_balance_dirty(root, nr);
+ return ret;
+}
+
+/* copy of may_create in fs/namei.c() */
+static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+{
+ if (child->d_inode)
+ return -EEXIST;
+ if (IS_DEADDIR(dir))
+ return -ENOENT;
+ return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/*
+ * Create a new subvolume below @parent. This is largely modeled after
+ * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
+ * inside this filesystem so it's quite a bit simpler.
+ */
+static noinline int btrfs_mksubvol(struct path *parent, char *name,
+ int mode, int namelen,
+ struct btrfs_root *snap_src)
+{
+ struct dentry *dentry;
+ int error;
+
+ mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+
+ dentry = lookup_one_len(name, parent->dentry, namelen);
+ error = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ goto out_unlock;
+
+ error = -EEXIST;
+ if (dentry->d_inode)
+ goto out_dput;
+
+ if (!IS_POSIXACL(parent->dentry->d_inode))
+ mode &= ~current->fs->umask;
+
+ error = mnt_want_write(parent->mnt);
+ if (error)
+ goto out_dput;
+
+ error = btrfs_may_create(parent->dentry->d_inode, dentry);
+ if (error)
+ goto out_drop_write;
+
+ /*
+ * Actually perform the low-level subvolume creation after all
+ * this VFS fuzz.
+ *
+ * Eventually we want to pass in an inode under which we create this
+ * subvolume, but for now all are under the filesystem root.
+ *
+ * Also we should pass on the mode eventually to allow creating new
+ * subvolume with specific mode bits.
+ */
+ if (snap_src) {
+ struct dentry *dir = dentry->d_parent;
+ struct dentry *test = dir->d_parent;
+ struct btrfs_path *path = btrfs_alloc_path();
+ int ret;
+ u64 test_oid;
+ u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
+
+ test_oid = snap_src->root_key.objectid;
+
+ ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
+ path, parent_oid, test_oid);
+ if (ret == 0)
+ goto create;
+ btrfs_release_path(snap_src->fs_info->tree_root, path);
+
+ /* we need to make sure we aren't creating a directory loop
+ * by taking a snapshot of something that has our current
+ * subvol in its directory tree. So, this loops through
+ * the dentries and checks the forward refs for each subvolume
+ * to see if is references the subvolume where we are
+ * placing this new snapshot.
+ */
+ while (1) {
+ if (!test ||
+ dir == snap_src->fs_info->sb->s_root ||
+ test == snap_src->fs_info->sb->s_root ||
+ test->d_inode->i_sb != snap_src->fs_info->sb) {
+ break;
+ }
+ if (S_ISLNK(test->d_inode->i_mode)) {
+ printk(KERN_INFO "Btrfs symlink in snapshot "
+ "path, failed\n");
+ error = -EMLINK;
+ btrfs_free_path(path);
+ goto out_drop_write;
+ }
+ test_oid =
+ BTRFS_I(test->d_inode)->root->root_key.objectid;
+ ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
+ path, test_oid, parent_oid);
+ if (ret == 0) {
+ printk(KERN_INFO "Btrfs snapshot creation "
+ "failed, looping\n");
+ error = -EMLINK;
+ btrfs_free_path(path);
+ goto out_drop_write;
+ }
+ btrfs_release_path(snap_src->fs_info->tree_root, path);
+ test = test->d_parent;
+ }
+create:
+ btrfs_free_path(path);
+ error = create_snapshot(snap_src, dentry, name, namelen);
+ } else {
+ error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
+ dentry, name, namelen);
+ }
+ if (error)
+ goto out_drop_write;
+
+ fsnotify_mkdir(parent->dentry->d_inode, dentry);
+out_drop_write:
+ mnt_drop_write(parent->mnt);
+out_dput:
+ dput(dentry);
+out_unlock:
+ mutex_unlock(&parent->dentry->d_inode->i_mutex);
+ return error;
+}
+
+
+static int btrfs_defrag_file(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ struct page *page;
+ unsigned long last_index;
+ unsigned long ra_pages = root->fs_info->bdi.ra_pages;
+ unsigned long total_read = 0;
+ u64 page_start;
+ u64 page_end;
+ unsigned long i;
+ int ret;
+
+ ret = btrfs_check_free_space(root, inode->i_size, 0);
+ if (ret)
+ return -ENOSPC;
+
+ mutex_lock(&inode->i_mutex);
+ last_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ for (i = 0; i <= last_index; i++) {
+ if (total_read % ra_pages == 0) {
+ btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
+ min(last_index, i + ra_pages - 1));
+ }
+ total_read++;
+again:
+ page = grab_cache_page(inode->i_mapping, i);
+ if (!page)
+ goto out_unlock;
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto out_unlock;
+ }
+ }
+
+ wait_on_page_writeback(page);
+
+ page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+ lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ if (ordered) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_page(page);
+ page_cache_release(page);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ goto again;
+ }
+ set_page_extent_mapped(page);
+
+ /*
+ * this makes sure page_mkwrite is called on the
+ * page if it is dirtied again later
+ */
+ clear_page_dirty_for_io(page);
+
+ btrfs_set_extent_delalloc(inode, page_start, page_end);
+
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
+ }
+
+out_unlock:
+ mutex_unlock(&inode->i_mutex);
+ return 0;
+}
+
+/*
+ * Called inside transaction, so use GFP_NOFS
+ */
+
+static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
+{
+ u64 new_size;
+ u64 old_size;
+ u64 devid = 1;
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_device *device = NULL;
+ char *sizestr;
+ char *devstr = NULL;
+ int ret = 0;
+ int namelen;
+ int mod = 0;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+
+ if (!vol_args)
+ return -ENOMEM;
+
+ if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ namelen = strlen(vol_args->name);
+
+ mutex_lock(&root->fs_info->volume_mutex);
+ sizestr = vol_args->name;
+ devstr = strchr(sizestr, ':');
+ if (devstr) {
+ char *end;
+ sizestr = devstr + 1;
+ *devstr = '\0';
+ devstr = vol_args->name;
+ devid = simple_strtoull(devstr, &end, 10);
+ printk(KERN_INFO "resizing devid %llu\n", devid);
+ }
+ device = btrfs_find_device(root, devid, NULL, NULL);
+ if (!device) {
+ printk(KERN_INFO "resizer unable to find device %llu\n", devid);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ if (!strcmp(sizestr, "max"))
+ new_size = device->bdev->bd_inode->i_size;
+ else {
+ if (sizestr[0] == '-') {
+ mod = -1;
+ sizestr++;
+ } else if (sizestr[0] == '+') {
+ mod = 1;
+ sizestr++;
+ }
+ new_size = btrfs_parse_size(sizestr);
+ if (new_size == 0) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ }
+
+ old_size = device->total_bytes;
+
+ if (mod < 0) {
+ if (new_size > old_size) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ new_size = old_size - new_size;
+ } else if (mod > 0) {
+ new_size = old_size + new_size;
+ }
+
+ if (new_size < 256 * 1024 * 1024) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ if (new_size > device->bdev->bd_inode->i_size) {
+ ret = -EFBIG;
+ goto out_unlock;
+ }
+
+ do_div(new_size, root->sectorsize);
+ new_size *= root->sectorsize;
+
+ printk(KERN_INFO "new size for %s is %llu\n",
+ device->name, (unsigned long long)new_size);
+
+ if (new_size > old_size) {
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_grow_device(trans, device, new_size);
+ btrfs_commit_transaction(trans, root);
+ } else {
+ ret = btrfs_shrink_device(device, new_size);
+ }
+
+out_unlock:
+ mutex_unlock(&root->fs_info->volume_mutex);
+out:
+ kfree(vol_args);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create(struct file *file,
+ void __user *arg, int subvol)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct file *src_file;
+ u64 root_dirid;
+ int namelen;
+ int ret = 0;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+
+ if (!vol_args)
+ return -ENOMEM;
+
+ if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ namelen = strlen(vol_args->name);
+ if (strchr(vol_args->name, '/')) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
+ di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
+ path, root_dirid,
+ vol_args->name, namelen, 0);
+ btrfs_free_path(path);
+
+ if (di && !IS_ERR(di)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ if (subvol) {
+ ret = btrfs_mksubvol(&file->f_path, vol_args->name,
+ file->f_path.dentry->d_inode->i_mode,
+ namelen, NULL);
+ } else {
+ struct inode *src_inode;
+ src_file = fget(vol_args->fd);
+ if (!src_file) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ src_inode = src_file->f_path.dentry->d_inode;
+ if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
+ printk(KERN_INFO "btrfs: Snapshot src from "
+ "another FS\n");
+ ret = -EINVAL;
+ fput(src_file);
+ goto out;
+ }
+ ret = btrfs_mksubvol(&file->f_path, vol_args->name,
+ file->f_path.dentry->d_inode->i_mode,
+ namelen, BTRFS_I(src_inode)->root);
+ fput(src_file);
+ }
+
+out:
+ kfree(vol_args);
+ return ret;
+}
+
+static int btrfs_ioctl_defrag(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ ret = mnt_want_write(file->f_path.mnt);
+ if (ret)
+ return ret;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFDIR:
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
+ }
+ btrfs_defrag_root(root, 0);
+ btrfs_defrag_root(root->fs_info->extent_root, 0);
+ break;
+ case S_IFREG:
+ if (!(file->f_mode & FMODE_WRITE)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ btrfs_defrag_file(file);
+ break;
+ }
+out:
+ mnt_drop_write(file->f_path.mnt);
+ return ret;
+}
+
+static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_vol_args *vol_args;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+
+ if (!vol_args)
+ return -ENOMEM;
+
+ if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_init_new_device(root, vol_args->name);
+
+out:
+ kfree(vol_args);
+ return ret;
+}
+
+static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_ioctl_vol_args *vol_args;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+
+ if (!vol_args)
+ return -ENOMEM;
+
+ if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ ret = btrfs_rm_device(root, vol_args->name);
+
+out:
+ kfree(vol_args);
+ return ret;
+}
+
+static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct file *src_file;
+ struct inode *src;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ char *buf;
+ struct btrfs_key key;
+ u32 nritems;
+ int slot;
+ int ret;
+ u64 len = olen;
+ u64 bs = root->fs_info->sb->s_blocksize;
+ u64 hint_byte;
+
+ /*
+ * TODO:
+ * - split compressed inline extents. annoying: we need to
+ * decompress into destination's address_space (the file offset
+ * may change, so source mapping won't do), then recompress (or
+ * otherwise reinsert) a subrange.
+ * - allow ranges within the same file to be cloned (provided
+ * they don't overlap)?
+ */
+
+ /* the destination must be opened for writing */
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EINVAL;
+
+ ret = mnt_want_write(file->f_path.mnt);
+ if (ret)
+ return ret;
+
+ src_file = fget(srcfd);
+ if (!src_file) {
+ ret = -EBADF;
+ goto out_drop_write;
+ }
+ src = src_file->f_dentry->d_inode;
+
+ ret = -EINVAL;
+ if (src == inode)
+ goto out_fput;
+
+ ret = -EISDIR;
+ if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
+ goto out_fput;
+
+ ret = -EXDEV;
+ if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
+ goto out_fput;
+
+ ret = -ENOMEM;
+ buf = vmalloc(btrfs_level_size(root, 0));
+ if (!buf)
+ goto out_fput;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ vfree(buf);
+ goto out_fput;
+ }
+ path->reada = 2;
+
+ if (inode < src) {
+ mutex_lock(&inode->i_mutex);
+ mutex_lock(&src->i_mutex);
+ } else {
+ mutex_lock(&src->i_mutex);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ /* determine range to clone */
+ ret = -EINVAL;
+ if (off >= src->i_size || off + len > src->i_size)
+ goto out_unlock;
+ if (len == 0)
+ olen = len = src->i_size - off;
+ /* if we extend to eof, continue to block boundary */
+ if (off + len == src->i_size)
+ len = ((src->i_size + bs-1) & ~(bs-1))
+ - off;
+
+ /* verify the end result is block aligned */
+ if ((off & (bs-1)) ||
+ ((off + len) & (bs-1)))
+ goto out_unlock;
+
+ /* do any pending delalloc/csum calc on src, one way or
+ another, and lock file content */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+ lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
+ if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
+ break;
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ btrfs_wait_ordered_range(src, off, off+len);
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+
+ /* punch hole in destination first */
+ btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
+
+ /* clone data */
+ key.objectid = src->i_ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+
+ while (1) {
+ /*
+ * note the key will change type as we walk through the
+ * tree.
+ */
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ }
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
+ key.objectid != src->i_ino)
+ break;
+
+ if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+ struct btrfs_file_extent_item *extent;
+ int type;
+ u32 size;
+ struct btrfs_key new_key;
+ u64 disko = 0, diskl = 0;
+ u64 datao = 0, datal = 0;
+ u8 comp;
+
+ size = btrfs_item_size_nr(leaf, slot);
+ read_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ comp = btrfs_file_extent_compression(leaf, extent);
+ type = btrfs_file_extent_type(leaf, extent);
+ if (type == BTRFS_FILE_EXTENT_REG) {
+ disko = btrfs_file_extent_disk_bytenr(leaf,
+ extent);
+ diskl = btrfs_file_extent_disk_num_bytes(leaf,
+ extent);
+ datao = btrfs_file_extent_offset(leaf, extent);
+ datal = btrfs_file_extent_num_bytes(leaf,
+ extent);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ /* take upper bound, may be compressed */
+ datal = btrfs_file_extent_ram_bytes(leaf,
+ extent);
+ }
+ btrfs_release_path(root, path);
+
+ if (key.offset + datal < off ||
+ key.offset >= off+len)
+ goto next;
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.objectid = inode->i_ino;
+ new_key.offset = key.offset + destoff - off;
+
+ if (type == BTRFS_FILE_EXTENT_REG) {
+ ret = btrfs_insert_empty_item(trans, root, path,
+ &new_key, size);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ write_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+
+ if (off > key.offset) {
+ datao += off - key.offset;
+ datal -= off - key.offset;
+ }
+ if (key.offset + datao + datal + key.offset >
+ off + len)
+ datal = off + len - key.offset - datao;
+ /* disko == 0 means it's a hole */
+ if (!disko)
+ datao = 0;
+
+ btrfs_set_file_extent_offset(leaf, extent,
+ datao);
+ btrfs_set_file_extent_num_bytes(leaf, extent,
+ datal);
+ if (disko) {
+ inode_add_bytes(inode, datal);
+ ret = btrfs_inc_extent_ref(trans, root,
+ disko, diskl, leaf->start,
+ root->root_key.objectid,
+ trans->transid,
+ inode->i_ino);
+ BUG_ON(ret);
+ }
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ u64 skip = 0;
+ u64 trim = 0;
+ if (off > key.offset) {
+ skip = off - key.offset;
+ new_key.offset += skip;
+ }
+
+ if (key.offset + datal > off+len)
+ trim = key.offset + datal - (off+len);
+
+ if (comp && (skip || trim)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ size -= skip + trim;
+ datal -= skip + trim;
+ ret = btrfs_insert_empty_item(trans, root, path,
+ &new_key, size);
+ if (ret)
+ goto out;
+
+ if (skip) {
+ u32 start =
+ btrfs_file_extent_calc_inline_size(0);
+ memmove(buf+start, buf+start+skip,
+ datal);
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ write_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ size);
+ inode_add_bytes(inode, datal);
+ }
+
+ btrfs_mark_buffer_dirty(leaf);
+ }
+
+next:
+ btrfs_release_path(root, path);
+ key.offset++;
+ }
+ ret = 0;
+out:
+ btrfs_release_path(root, path);
+ if (ret == 0) {
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ if (destoff + olen > inode->i_size)
+ btrfs_i_size_write(inode, destoff + olen);
+ BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
+ ret = btrfs_update_inode(trans, root, inode);
+ }
+ btrfs_end_transaction(trans, root);
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+ if (ret)
+ vmtruncate(inode, 0);
+out_unlock:
+ mutex_unlock(&src->i_mutex);
+ mutex_unlock(&inode->i_mutex);
+ vfree(buf);
+ btrfs_free_path(path);
+out_fput:
+ fput(src_file);
+out_drop_write:
+ mnt_drop_write(file->f_path.mnt);
+ return ret;
+}
+
+static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
+{
+ struct btrfs_ioctl_clone_range_args args;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
+ args.src_length, args.dest_offset);
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks. They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+static long btrfs_ioctl_trans_start(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (file->private_data) {
+ ret = -EINPROGRESS;
+ goto out;
+ }
+
+ ret = mnt_want_write(file->f_path.mnt);
+ if (ret)
+ goto out;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ root->fs_info->open_ioctl_trans++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ trans = btrfs_start_ioctl_transaction(root, 0);
+ if (trans)
+ file->private_data = trans;
+ else
+ ret = -ENOMEM;
+ /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
+out:
+ return ret;
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks. They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+long btrfs_ioctl_trans_end(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ trans = file->private_data;
+ if (!trans) {
+ ret = -EINVAL;
+ goto out;
+ }
+ btrfs_end_transaction(trans, root);
+ file->private_data = NULL;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ root->fs_info->open_ioctl_trans--;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ mnt_drop_write(file->f_path.mnt);
+
+out:
+ return ret;
+}
+
+long btrfs_ioctl(struct file *file, unsigned int
+ cmd, unsigned long arg)
+{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case BTRFS_IOC_SNAP_CREATE:
+ return btrfs_ioctl_snap_create(file, argp, 0);
+ case BTRFS_IOC_SUBVOL_CREATE:
+ return btrfs_ioctl_snap_create(file, argp, 1);
+ case BTRFS_IOC_DEFRAG:
+ return btrfs_ioctl_defrag(file);
+ case BTRFS_IOC_RESIZE:
+ return btrfs_ioctl_resize(root, argp);
+ case BTRFS_IOC_ADD_DEV:
+ return btrfs_ioctl_add_dev(root, argp);
+ case BTRFS_IOC_RM_DEV:
+ return btrfs_ioctl_rm_dev(root, argp);
+ case BTRFS_IOC_BALANCE:
+ return btrfs_balance(root->fs_info->dev_root);
+ case BTRFS_IOC_CLONE:
+ return btrfs_ioctl_clone(file, arg, 0, 0, 0);
+ case BTRFS_IOC_CLONE_RANGE:
+ return btrfs_ioctl_clone_range(file, argp);
+ case BTRFS_IOC_TRANS_START:
+ return btrfs_ioctl_trans_start(file);
+ case BTRFS_IOC_TRANS_END:
+ return btrfs_ioctl_trans_end(file);
+ case BTRFS_IOC_SYNC:
+ btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ return 0;
+ }
+
+ return -ENOTTY;
+}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 00000000000..78049ea208d
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __IOCTL_
+#define __IOCTL_
+#include <linux/ioctl.h>
+
+#define BTRFS_IOCTL_MAGIC 0x94
+#define BTRFS_VOL_NAME_MAX 255
+#define BTRFS_PATH_NAME_MAX 3072
+
+struct btrfs_ioctl_vol_args {
+ __s64 fd;
+ char name[BTRFS_PATH_NAME_MAX + 1];
+};
+
+#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
+ struct btrfs_ioctl_vol_args)
+/* trans start and trans end are dangerous, and only for
+ * use by applications that know how to avoid the
+ * resulting deadlocks
+ */
+#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
+#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
+#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
+
+#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
+#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
+ struct btrfs_ioctl_vol_args)
+struct btrfs_ioctl_clone_range_args {
+ __s64 src_fd;
+ __u64 src_offset, src_length;
+ __u64 dest_offset;
+};
+
+#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
+ struct btrfs_ioctl_clone_range_args)
+
+#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
+ struct btrfs_ioctl_vol_args)
+
+#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 00000000000..39bae7761db
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+#include <asm/bug.h>
+#include "ctree.h"
+#include "extent_io.h"
+#include "locking.h"
+
+/*
+ * locks the per buffer mutex in an extent buffer. This uses adaptive locks
+ * and the spin is not tuned very extensively. The spinning does make a big
+ * difference in almost every workload, but spinning for the right amount of
+ * time needs some help.
+ *
+ * In general, we want to spin as long as the lock holder is doing btree
+ * searches, and we should give up if they are in more expensive code.
+ */
+
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+ int i;
+
+ if (mutex_trylock(&eb->mutex))
+ return 0;
+ for (i = 0; i < 512; i++) {
+ cpu_relax();
+ if (mutex_trylock(&eb->mutex))
+ return 0;
+ }
+ cpu_relax();
+ mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
+ return 0;
+}
+
+int btrfs_try_tree_lock(struct extent_buffer *eb)
+{
+ return mutex_trylock(&eb->mutex);
+}
+
+int btrfs_tree_unlock(struct extent_buffer *eb)
+{
+ mutex_unlock(&eb->mutex);
+ return 0;
+}
+
+int btrfs_tree_locked(struct extent_buffer *eb)
+{
+ return mutex_is_locked(&eb->mutex);
+}
+
+/*
+ * btrfs_search_slot uses this to decide if it should drop its locks
+ * before doing something expensive like allocating free blocks for cow.
+ */
+int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
+{
+ int i;
+ struct extent_buffer *eb;
+ for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
+ eb = path->nodes[i];
+ if (!eb)
+ break;
+ smp_mb();
+ if (!list_empty(&eb->mutex.wait_list))
+ return 1;
+ }
+ return 0;
+}
+
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 00000000000..bc1faef1251
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_LOCKING_
+#define __BTRFS_LOCKING_
+
+int btrfs_tree_lock(struct extent_buffer *eb);
+int btrfs_tree_unlock(struct extent_buffer *eb);
+int btrfs_tree_locked(struct extent_buffer *eb);
+int btrfs_try_tree_lock(struct extent_buffer *eb);
+int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
+#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 00000000000..a2094017027
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,730 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "extent_io.h"
+
+static u64 entry_end(struct btrfs_ordered_extent *entry)
+{
+ if (entry->file_offset + entry->len < entry->file_offset)
+ return (u64)-1;
+ return entry->file_offset + entry->len;
+}
+
+/* returns NULL if the insertion worked, or it returns the node it did find
+ * in the tree
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_ordered_extent *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
+
+ if (file_offset < entry->file_offset)
+ p = &(*p)->rb_left;
+ else if (file_offset >= entry_end(entry))
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/*
+ * look for a given offset in the tree, and if it can't be found return the
+ * first lesser offset
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
+ struct rb_node **prev_ret)
+{
+ struct rb_node *n = root->rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *test;
+ struct btrfs_ordered_extent *entry;
+ struct btrfs_ordered_extent *prev_entry = NULL;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+ prev = n;
+ prev_entry = entry;
+
+ if (file_offset < entry->file_offset)
+ n = n->rb_left;
+ else if (file_offset >= entry_end(entry))
+ n = n->rb_right;
+ else
+ return n;
+ }
+ if (!prev_ret)
+ return NULL;
+
+ while (prev && file_offset >= entry_end(prev_entry)) {
+ test = rb_next(prev);
+ if (!test)
+ break;
+ prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+ rb_node);
+ if (file_offset < entry_end(prev_entry))
+ break;
+
+ prev = test;
+ }
+ if (prev)
+ prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
+ rb_node);
+ while (prev && file_offset < entry_end(prev_entry)) {
+ test = rb_prev(prev);
+ if (!test)
+ break;
+ prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+ rb_node);
+ prev = test;
+ }
+ *prev_ret = prev;
+ return NULL;
+}
+
+/*
+ * helper to check if a given offset is inside a given entry
+ */
+static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
+{
+ if (file_offset < entry->file_offset ||
+ entry->file_offset + entry->len <= file_offset)
+ return 0;
+ return 1;
+}
+
+/*
+ * look find the first ordered struct that has this offset, otherwise
+ * the first one less than this offset
+ */
+static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
+ u64 file_offset)
+{
+ struct rb_root *root = &tree->tree;
+ struct rb_node *prev;
+ struct rb_node *ret;
+ struct btrfs_ordered_extent *entry;
+
+ if (tree->last) {
+ entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+ rb_node);
+ if (offset_in_entry(entry, file_offset))
+ return tree->last;
+ }
+ ret = __tree_search(root, file_offset, &prev);
+ if (!ret)
+ ret = prev;
+ if (ret)
+ tree->last = ret;
+ return ret;
+}
+
+/* allocate and add a new ordered_extent into the per-inode tree.
+ * file_offset is the logical offset in the file
+ *
+ * start is the disk block number of an extent already reserved in the
+ * extent allocation tree
+ *
+ * len is the length of the extent
+ *
+ * This also sets the EXTENT_ORDERED bit on the range in the inode.
+ *
+ * The tree is given a single reference on the ordered extent that was
+ * inserted.
+ */
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ entry = kzalloc(sizeof(*entry), GFP_NOFS);
+ if (!entry)
+ return -ENOMEM;
+
+ mutex_lock(&tree->mutex);
+ entry->file_offset = file_offset;
+ entry->start = start;
+ entry->len = len;
+ entry->disk_len = disk_len;
+ entry->inode = inode;
+ if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
+ set_bit(type, &entry->flags);
+
+ /* one ref for the tree */
+ atomic_set(&entry->refs, 1);
+ init_waitqueue_head(&entry->wait);
+ INIT_LIST_HEAD(&entry->list);
+ INIT_LIST_HEAD(&entry->root_extent_list);
+
+ node = tree_insert(&tree->tree, file_offset,
+ &entry->rb_node);
+ BUG_ON(node);
+
+ set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
+ entry_end(entry) - 1, GFP_NOFS);
+
+ spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ list_add_tail(&entry->root_extent_list,
+ &BTRFS_I(inode)->root->fs_info->ordered_extents);
+ spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+
+ mutex_unlock(&tree->mutex);
+ BUG_ON(node);
+ return 0;
+}
+
+/*
+ * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
+ * when an ordered extent is finished. If the list covers more than one
+ * ordered extent, it is split across multiples.
+ */
+int btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum)
+{
+ struct btrfs_ordered_inode_tree *tree;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+ list_add_tail(&sum->list, &entry->list);
+ mutex_unlock(&tree->mutex);
+ return 0;
+}
+
+/*
+ * this is used to account for finished IO across a given range
+ * of the file. The IO should not span ordered extents. If
+ * a given ordered_extent is completely done, 1 is returned, otherwise
+ * 0.
+ *
+ * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
+ * to make sure this function only returns 1 once for a given ordered extent.
+ */
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+ u64 file_offset, u64 io_size)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int ret;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+ clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
+ GFP_NOFS);
+ node = tree_search(tree, file_offset);
+ if (!node) {
+ ret = 1;
+ goto out;
+ }
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (!offset_in_entry(entry, file_offset)) {
+ ret = 1;
+ goto out;
+ }
+
+ ret = test_range_bit(io_tree, entry->file_offset,
+ entry->file_offset + entry->len - 1,
+ EXTENT_ORDERED, 0);
+ if (ret == 0)
+ ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+out:
+ mutex_unlock(&tree->mutex);
+ return ret == 0;
+}
+
+/*
+ * used to drop a reference on an ordered extent. This will free
+ * the extent if the last reference is dropped
+ */
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+ struct list_head *cur;
+ struct btrfs_ordered_sum *sum;
+
+ if (atomic_dec_and_test(&entry->refs)) {
+ while (!list_empty(&entry->list)) {
+ cur = entry->list.next;
+ sum = list_entry(cur, struct btrfs_ordered_sum, list);
+ list_del(&sum->list);
+ kfree(sum);
+ }
+ kfree(entry);
+ }
+ return 0;
+}
+
+/*
+ * remove an ordered extent from the tree. No references are dropped
+ * but, anyone waiting on this extent is woken up.
+ */
+int btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+ node = &entry->rb_node;
+ rb_erase(node, &tree->tree);
+ tree->last = NULL;
+ set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+
+ spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+ list_del_init(&entry->root_extent_list);
+ spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+
+ mutex_unlock(&tree->mutex);
+ wake_up(&entry->wait);
+ return 0;
+}
+
+/*
+ * wait for all the ordered extents in a root. This is done when balancing
+ * space between drives.
+ */
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
+{
+ struct list_head splice;
+ struct list_head *cur;
+ struct btrfs_ordered_extent *ordered;
+ struct inode *inode;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ list_splice_init(&root->fs_info->ordered_extents, &splice);
+ while (!list_empty(&splice)) {
+ cur = splice.next;
+ ordered = list_entry(cur, struct btrfs_ordered_extent,
+ root_extent_list);
+ if (nocow_only &&
+ !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+ list_move(&ordered->root_extent_list,
+ &root->fs_info->ordered_extents);
+ cond_resched_lock(&root->fs_info->ordered_extent_lock);
+ continue;
+ }
+
+ list_del_init(&ordered->root_extent_list);
+ atomic_inc(&ordered->refs);
+
+ /*
+ * the inode may be getting freed (in sys_unlink path).
+ */
+ inode = igrab(ordered->inode);
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ if (inode) {
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ iput(inode);
+ } else {
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ }
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ return 0;
+}
+
+/*
+ * Used to start IO or wait for a given ordered extent to finish.
+ *
+ * If wait is one, this effectively waits on page writeback for all the pages
+ * in the extent, and it waits on the io completion code to insert
+ * metadata into the btree corresponding to the extent
+ */
+void btrfs_start_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ int wait)
+{
+ u64 start = entry->file_offset;
+ u64 end = start + entry->len - 1;
+
+ /*
+ * pages in the range can be dirty, clean or writeback. We
+ * start IO on any dirty ones so the wait doesn't stall waiting
+ * for pdflush to find them
+ */
+ btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
+ if (wait) {
+ wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+ &entry->flags));
+ }
+}
+
+/*
+ * Used to wait on ordered extents across a large range of bytes.
+ */
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+{
+ u64 end;
+ u64 orig_end;
+ u64 wait_end;
+ struct btrfs_ordered_extent *ordered;
+
+ if (start + len < start) {
+ orig_end = INT_LIMIT(loff_t);
+ } else {
+ orig_end = start + len - 1;
+ if (orig_end > INT_LIMIT(loff_t))
+ orig_end = INT_LIMIT(loff_t);
+ }
+ wait_end = orig_end;
+again:
+ /* start IO across the range first to instantiate any delalloc
+ * extents
+ */
+ btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+
+ /* The compression code will leave pages locked but return from
+ * writepage without setting the page writeback. Starting again
+ * with WB_SYNC_ALL will end up waiting for the IO to actually start.
+ */
+ btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+
+ btrfs_wait_on_page_writeback_range(inode->i_mapping,
+ start >> PAGE_CACHE_SHIFT,
+ orig_end >> PAGE_CACHE_SHIFT);
+
+ end = orig_end;
+ while (1) {
+ ordered = btrfs_lookup_first_ordered_extent(inode, end);
+ if (!ordered)
+ break;
+ if (ordered->file_offset > orig_end) {
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ if (ordered->file_offset + ordered->len < start) {
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ end = ordered->file_offset;
+ btrfs_put_ordered_extent(ordered);
+ if (end == 0 || end == start)
+ break;
+ end--;
+ }
+ if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+ EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
+ schedule_timeout(1);
+ goto again;
+ }
+ return 0;
+}
+
+/*
+ * find an ordered extent corresponding to file_offset. return NULL if
+ * nothing is found, otherwise take a reference on the extent and return it
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+ u64 file_offset)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+ node = tree_search(tree, file_offset);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (!offset_in_entry(entry, file_offset))
+ entry = NULL;
+ if (entry)
+ atomic_inc(&entry->refs);
+out:
+ mutex_unlock(&tree->mutex);
+ return entry;
+}
+
+/*
+ * lookup and return any extent before 'file_offset'. NULL is returned
+ * if none is found
+ */
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ mutex_lock(&tree->mutex);
+ node = tree_search(tree, file_offset);
+ if (!node)
+ goto out;
+
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ atomic_inc(&entry->refs);
+out:
+ mutex_unlock(&tree->mutex);
+ return entry;
+}
+
+/*
+ * After an extent is done, call this to conditionally update the on disk
+ * i_size. i_size is updated to cover any fully written part of the file.
+ */
+int btrfs_ordered_update_i_size(struct inode *inode,
+ struct btrfs_ordered_extent *ordered)
+{
+ struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ u64 disk_i_size;
+ u64 new_i_size;
+ u64 i_size_test;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *test;
+
+ mutex_lock(&tree->mutex);
+ disk_i_size = BTRFS_I(inode)->disk_i_size;
+
+ /*
+ * if the disk i_size is already at the inode->i_size, or
+ * this ordered extent is inside the disk i_size, we're done
+ */
+ if (disk_i_size >= inode->i_size ||
+ ordered->file_offset + ordered->len <= disk_i_size) {
+ goto out;
+ }
+
+ /*
+ * we can't update the disk_isize if there are delalloc bytes
+ * between disk_i_size and this ordered extent
+ */
+ if (test_range_bit(io_tree, disk_i_size,
+ ordered->file_offset + ordered->len - 1,
+ EXTENT_DELALLOC, 0)) {
+ goto out;
+ }
+ /*
+ * walk backward from this ordered extent to disk_i_size.
+ * if we find an ordered extent then we can't update disk i_size
+ * yet
+ */
+ node = &ordered->rb_node;
+ while (1) {
+ node = rb_prev(node);
+ if (!node)
+ break;
+ test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (test->file_offset + test->len <= disk_i_size)
+ break;
+ if (test->file_offset >= inode->i_size)
+ break;
+ if (test->file_offset >= disk_i_size)
+ goto out;
+ }
+ new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
+
+ /*
+ * at this point, we know we can safely update i_size to at least
+ * the offset from this ordered extent. But, we need to
+ * walk forward and see if ios from higher up in the file have
+ * finished.
+ */
+ node = rb_next(&ordered->rb_node);
+ i_size_test = 0;
+ if (node) {
+ /*
+ * do we have an area where IO might have finished
+ * between our ordered extent and the next one.
+ */
+ test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (test->file_offset > entry_end(ordered))
+ i_size_test = test->file_offset;
+ } else {
+ i_size_test = i_size_read(inode);
+ }
+
+ /*
+ * i_size_test is the end of a region after this ordered
+ * extent where there are no ordered extents. As long as there
+ * are no delalloc bytes in this area, it is safe to update
+ * disk_i_size to the end of the region.
+ */
+ if (i_size_test > entry_end(ordered) &&
+ !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
+ EXTENT_DELALLOC, 0)) {
+ new_i_size = min_t(u64, i_size_test, i_size_read(inode));
+ }
+ BTRFS_I(inode)->disk_i_size = new_i_size;
+out:
+ mutex_unlock(&tree->mutex);
+ return 0;
+}
+
+/*
+ * search the ordered extents for one corresponding to 'offset' and
+ * try to find a checksum. This is used because we allow pages to
+ * be reclaimed before their checksum is actually put into the btree
+ */
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+ u32 *sum)
+{
+ struct btrfs_ordered_sum *ordered_sum;
+ struct btrfs_sector_sum *sector_sums;
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct list_head *cur;
+ unsigned long num_sectors;
+ unsigned long i;
+ u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
+ int ret = 1;
+
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
+ if (!ordered)
+ return 1;
+
+ mutex_lock(&tree->mutex);
+ list_for_each_prev(cur, &ordered->list) {
+ ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
+ if (disk_bytenr >= ordered_sum->bytenr) {
+ num_sectors = ordered_sum->len / sectorsize;
+ sector_sums = ordered_sum->sums;
+ for (i = 0; i < num_sectors; i++) {
+ if (sector_sums[i].bytenr == disk_bytenr) {
+ *sum = sector_sums[i].sum;
+ ret = 0;
+ goto out;
+ }
+ }
+ }
+ }
+out:
+ mutex_unlock(&tree->mutex);
+ btrfs_put_ordered_extent(ordered);
+ return ret;
+}
+
+
+/**
+ * taken from mm/filemap.c because it isn't exported
+ *
+ * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * @mapping: address space structure to write
+ * @start: offset in bytes where the range starts
+ * @end: offset in bytes where the range ends (inclusive)
+ * @sync_mode: enable synchronous operation
+ *
+ * Start writeback against all of a mapping's dirty pages that lie
+ * within the byte offsets <start, end> inclusive.
+ *
+ * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
+ * opposed to a regular memory cleansing writeback. The difference between
+ * these two operations is that if a dirty page/buffer is encountered, it must
+ * be waited upon, and not just skipped over.
+ */
+int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end, int sync_mode)
+{
+ struct writeback_control wbc = {
+ .sync_mode = sync_mode,
+ .nr_to_write = mapping->nrpages * 2,
+ .range_start = start,
+ .range_end = end,
+ .for_writepages = 1,
+ };
+ return btrfs_writepages(mapping, &wbc);
+}
+
+/**
+ * taken from mm/filemap.c because it isn't exported
+ *
+ * wait_on_page_writeback_range - wait for writeback to complete
+ * @mapping: target address_space
+ * @start: beginning page index
+ * @end: ending page index
+ *
+ * Wait for writeback to complete against pages indexed by start->end
+ * inclusive
+ */
+int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ struct pagevec pvec;
+ int nr_pages;
+ int ret = 0;
+ pgoff_t index;
+
+ if (end < start)
+ return 0;
+
+ pagevec_init(&pvec, 0);
+ index = start;
+ while ((index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_WRITEBACK,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /* until radix tree lookup accepts end_index */
+ if (page->index > end)
+ continue;
+
+ wait_on_page_writeback(page);
+ if (PageError(page))
+ ret = -EIO;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+
+ /* Check for outstanding write errors */
+ if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ ret = -ENOSPC;
+ if (test_and_clear_bit(AS_EIO, &mapping->flags))
+ ret = -EIO;
+
+ return ret;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 00000000000..ab66d5e8d6d
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ORDERED_DATA__
+#define __BTRFS_ORDERED_DATA__
+
+/* one of these per inode */
+struct btrfs_ordered_inode_tree {
+ struct mutex mutex;
+ struct rb_root tree;
+ struct rb_node *last;
+};
+
+/*
+ * these are used to collect checksums done just before bios submission.
+ * They are attached via a list into the ordered extent, and
+ * checksum items are inserted into the tree after all the blocks in
+ * the ordered extent are on disk
+ */
+struct btrfs_sector_sum {
+ /* bytenr on disk */
+ u64 bytenr;
+ u32 sum;
+};
+
+struct btrfs_ordered_sum {
+ /* bytenr is the start of this extent on disk */
+ u64 bytenr;
+
+ /*
+ * this is the length in bytes covered by the sums array below.
+ */
+ unsigned long len;
+ struct list_head list;
+ /* last field is a variable length array of btrfs_sector_sums */
+ struct btrfs_sector_sum sums[];
+};
+
+/*
+ * bits for the flags field:
+ *
+ * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
+ * It is used to make sure metadata is inserted into the tree only once
+ * per extent.
+ *
+ * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
+ * rbtree, just before waking any waiters. It is used to indicate the
+ * IO is done and any metadata is inserted into the tree.
+ */
+#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
+
+#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
+
+#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
+
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+
+#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+
+struct btrfs_ordered_extent {
+ /* logical offset in the file */
+ u64 file_offset;
+
+ /* disk byte number */
+ u64 start;
+
+ /* ram length of the extent in bytes */
+ u64 len;
+
+ /* extent length on disk */
+ u64 disk_len;
+
+ /* flags (described above) */
+ unsigned long flags;
+
+ /* reference count */
+ atomic_t refs;
+
+ /* the inode we belong to */
+ struct inode *inode;
+
+ /* list of checksums for insertion when the extent io is done */
+ struct list_head list;
+
+ /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
+ wait_queue_head_t wait;
+
+ /* our friendly rbtree entry */
+ struct rb_node rb_node;
+
+ /* a per root list of all the pending ordered extents */
+ struct list_head root_extent_list;
+};
+
+
+/*
+ * calculates the total size you need to allocate for an ordered sum
+ * structure spanning 'bytes' in the file
+ */
+static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
+ unsigned long bytes)
+{
+ unsigned long num_sectors = (bytes + root->sectorsize - 1) /
+ root->sectorsize;
+ num_sectors++;
+ return sizeof(struct btrfs_ordered_sum) +
+ num_sectors * sizeof(struct btrfs_sector_sum);
+}
+
+static inline void
+btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
+{
+ mutex_init(&t->mutex);
+ t->tree.rb_node = NULL;
+ t->last = NULL;
+}
+
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+int btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry);
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+ u64 file_offset, u64 io_size);
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int tyep);
+int btrfs_add_ordered_sum(struct inode *inode,
+ struct btrfs_ordered_extent *entry,
+ struct btrfs_ordered_sum *sum);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+ u64 file_offset);
+void btrfs_start_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry, int wait);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+int btrfs_ordered_update_i_size(struct inode *inode,
+ struct btrfs_ordered_extent *ordered);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
+int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
+ pgoff_t start, pgoff_t end);
+int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end, int sync_mode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 00000000000..3c0d52af4f8
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2008 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ ret = btrfs_del_item(trans, root, path);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 00000000000..5f8f218c100
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
+{
+ int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
+ int i;
+ printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
+ "num_stripes %d\n",
+ (unsigned long long)btrfs_chunk_length(eb, chunk),
+ (unsigned long long)btrfs_chunk_owner(eb, chunk),
+ (unsigned long long)btrfs_chunk_type(eb, chunk),
+ num_stripes);
+ for (i = 0 ; i < num_stripes ; i++) {
+ printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
+ (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
+ (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
+ }
+}
+static void print_dev_item(struct extent_buffer *eb,
+ struct btrfs_dev_item *dev_item)
+{
+ printk(KERN_INFO "\t\tdev item devid %llu "
+ "total_bytes %llu bytes used %llu\n",
+ (unsigned long long)btrfs_device_id(eb, dev_item),
+ (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
+ (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
+}
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
+{
+ int i;
+ u32 nr = btrfs_header_nritems(l);
+ struct btrfs_item *item;
+ struct btrfs_extent_item *ei;
+ struct btrfs_root_item *ri;
+ struct btrfs_dir_item *di;
+ struct btrfs_inode_item *ii;
+ struct btrfs_block_group_item *bi;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_extent_ref *ref;
+ struct btrfs_dev_extent *dev_extent;
+ u32 type;
+
+ printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
+ (unsigned long long)btrfs_header_bytenr(l), nr,
+ btrfs_leaf_free_space(root, l));
+ for (i = 0 ; i < nr ; i++) {
+ item = btrfs_item_nr(l, i);
+ btrfs_item_key_to_cpu(l, &key, i);
+ type = btrfs_key_type(&key);
+ printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
+ "itemsize %d\n",
+ i,
+ (unsigned long long)key.objectid, type,
+ (unsigned long long)key.offset,
+ btrfs_item_offset(l, item), btrfs_item_size(l, item));
+ switch (type) {
+ case BTRFS_INODE_ITEM_KEY:
+ ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
+ printk(KERN_INFO "\t\tinode generation %llu size %llu "
+ "mode %o\n",
+ (unsigned long long)
+ btrfs_inode_generation(l, ii),
+ (unsigned long long)btrfs_inode_size(l, ii),
+ btrfs_inode_mode(l, ii));
+ break;
+ case BTRFS_DIR_ITEM_KEY:
+ di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(l, di, &found_key);
+ printk(KERN_INFO "\t\tdir oid %llu type %u\n",
+ (unsigned long long)found_key.objectid,
+ btrfs_dir_type(l, di));
+ break;
+ case BTRFS_ROOT_ITEM_KEY:
+ ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
+ printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
+ (unsigned long long)
+ btrfs_disk_root_bytenr(l, ri),
+ btrfs_disk_root_refs(l, ri));
+ break;
+ case BTRFS_EXTENT_ITEM_KEY:
+ ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
+ printk(KERN_INFO "\t\textent data refs %u\n",
+ btrfs_extent_refs(l, ei));
+ break;
+ case BTRFS_EXTENT_REF_KEY:
+ ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
+ printk(KERN_INFO "\t\textent back ref root %llu "
+ "gen %llu owner %llu num_refs %lu\n",
+ (unsigned long long)btrfs_ref_root(l, ref),
+ (unsigned long long)btrfs_ref_generation(l, ref),
+ (unsigned long long)btrfs_ref_objectid(l, ref),
+ (unsigned long)btrfs_ref_num_refs(l, ref));
+ break;
+
+ case BTRFS_EXTENT_DATA_KEY:
+ fi = btrfs_item_ptr(l, i,
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(l, fi) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ printk(KERN_INFO "\t\tinline extent data "
+ "size %u\n",
+ btrfs_file_extent_inline_len(l, fi));
+ break;
+ }
+ printk(KERN_INFO "\t\textent data disk bytenr %llu "
+ "nr %llu\n",
+ (unsigned long long)
+ btrfs_file_extent_disk_bytenr(l, fi),
+ (unsigned long long)
+ btrfs_file_extent_disk_num_bytes(l, fi));
+ printk(KERN_INFO "\t\textent data offset %llu "
+ "nr %llu ram %llu\n",
+ (unsigned long long)
+ btrfs_file_extent_offset(l, fi),
+ (unsigned long long)
+ btrfs_file_extent_num_bytes(l, fi),
+ (unsigned long long)
+ btrfs_file_extent_ram_bytes(l, fi));
+ break;
+ case BTRFS_BLOCK_GROUP_ITEM_KEY:
+ bi = btrfs_item_ptr(l, i,
+ struct btrfs_block_group_item);
+ printk(KERN_INFO "\t\tblock group used %llu\n",
+ (unsigned long long)
+ btrfs_disk_block_group_used(l, bi));
+ break;
+ case BTRFS_CHUNK_ITEM_KEY:
+ print_chunk(l, btrfs_item_ptr(l, i,
+ struct btrfs_chunk));
+ break;
+ case BTRFS_DEV_ITEM_KEY:
+ print_dev_item(l, btrfs_item_ptr(l, i,
+ struct btrfs_dev_item));
+ break;
+ case BTRFS_DEV_EXTENT_KEY:
+ dev_extent = btrfs_item_ptr(l, i,
+ struct btrfs_dev_extent);
+ printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
+ "\t\tchunk objectid %llu chunk offset %llu "
+ "length %llu\n",
+ (unsigned long long)
+ btrfs_dev_extent_chunk_tree(l, dev_extent),
+ (unsigned long long)
+ btrfs_dev_extent_chunk_objectid(l, dev_extent),
+ (unsigned long long)
+ btrfs_dev_extent_chunk_offset(l, dev_extent),
+ (unsigned long long)
+ btrfs_dev_extent_length(l, dev_extent));
+ };
+ }
+}
+
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
+{
+ int i; u32 nr;
+ struct btrfs_key key;
+ int level;
+
+ if (!c)
+ return;
+ nr = btrfs_header_nritems(c);
+ level = btrfs_header_level(c);
+ if (level == 0) {
+ btrfs_print_leaf(root, c);
+ return;
+ }
+ printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
+ (unsigned long long)btrfs_header_bytenr(c),
+ btrfs_header_level(c), nr,
+ (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
+ for (i = 0; i < nr; i++) {
+ btrfs_node_key_to_cpu(c, &key, i);
+ printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
+ i,
+ (unsigned long long)key.objectid,
+ key.type,
+ (unsigned long long)key.offset,
+ (unsigned long long)btrfs_node_blockptr(c, i));
+ }
+ for (i = 0; i < nr; i++) {
+ struct extent_buffer *next = read_tree_block(root,
+ btrfs_node_blockptr(c, i),
+ btrfs_level_size(root, level - 1),
+ btrfs_node_ptr_generation(c, i));
+ if (btrfs_is_leaf(next) &&
+ btrfs_header_level(c) != 1)
+ BUG();
+ if (btrfs_header_level(next) !=
+ btrfs_header_level(c) - 1)
+ BUG();
+ btrfs_print_tree(root, next);
+ free_extent_buffer(next);
+ }
+}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 00000000000..da75efe534d
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __PRINT_TREE_
+#define __PRINT_TREE_
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
+#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 00000000000..6f0acc4c9ea
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "ref-cache.h"
+#include "transaction.h"
+
+/*
+ * leaf refs are used to cache the information about which extents
+ * a given leaf has references on. This allows us to process that leaf
+ * in btrfs_drop_snapshot without needing to read it back from disk.
+ */
+
+/*
+ * kmalloc a leaf reference struct and update the counters for the
+ * total ref cache size
+ */
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+ int nr_extents)
+{
+ struct btrfs_leaf_ref *ref;
+ size_t size = btrfs_leaf_ref_size(nr_extents);
+
+ ref = kmalloc(size, GFP_NOFS);
+ if (ref) {
+ spin_lock(&root->fs_info->ref_cache_lock);
+ root->fs_info->total_ref_cache_size += size;
+ spin_unlock(&root->fs_info->ref_cache_lock);
+
+ memset(ref, 0, sizeof(*ref));
+ atomic_set(&ref->usage, 1);
+ INIT_LIST_HEAD(&ref->list);
+ }
+ return ref;
+}
+
+/*
+ * free a leaf reference struct and update the counters for the
+ * total ref cache size
+ */
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{
+ if (!ref)
+ return;
+ WARN_ON(atomic_read(&ref->usage) == 0);
+ if (atomic_dec_and_test(&ref->usage)) {
+ size_t size = btrfs_leaf_ref_size(ref->nritems);
+
+ BUG_ON(ref->in_tree);
+ kfree(ref);
+
+ spin_lock(&root->fs_info->ref_cache_lock);
+ root->fs_info->total_ref_cache_size -= size;
+ spin_unlock(&root->fs_info->ref_cache_lock);
+ }
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct btrfs_leaf_ref *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
+
+ if (bytenr < entry->bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_leaf_ref *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
+ WARN_ON(!entry->in_tree);
+
+ if (bytenr < entry->bytenr)
+ n = n->rb_left;
+ else if (bytenr > entry->bytenr)
+ n = n->rb_right;
+ else
+ return n;
+ }
+ return NULL;
+}
+
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
+ int shared)
+{
+ struct btrfs_leaf_ref *ref = NULL;
+ struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+
+ if (shared)
+ tree = &root->fs_info->shared_ref_tree;
+ if (!tree)
+ return 0;
+
+ spin_lock(&tree->lock);
+ while (!list_empty(&tree->list)) {
+ ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
+ BUG_ON(ref->tree != tree);
+ if (ref->root_gen > max_root_gen)
+ break;
+ if (!xchg(&ref->in_tree, 0)) {
+ cond_resched_lock(&tree->lock);
+ continue;
+ }
+
+ rb_erase(&ref->rb_node, &tree->root);
+ list_del_init(&ref->list);
+
+ spin_unlock(&tree->lock);
+ btrfs_free_leaf_ref(root, ref);
+ cond_resched();
+ spin_lock(&tree->lock);
+ }
+ spin_unlock(&tree->lock);
+ return 0;
+}
+
+/*
+ * find the leaf ref for a given extent. This returns the ref struct with
+ * a usage reference incremented
+ */
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+ u64 bytenr)
+{
+ struct rb_node *rb;
+ struct btrfs_leaf_ref *ref = NULL;
+ struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+again:
+ if (tree) {
+ spin_lock(&tree->lock);
+ rb = tree_search(&tree->root, bytenr);
+ if (rb)
+ ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+ if (ref)
+ atomic_inc(&ref->usage);
+ spin_unlock(&tree->lock);
+ if (ref)
+ return ref;
+ }
+ if (tree != &root->fs_info->shared_ref_tree) {
+ tree = &root->fs_info->shared_ref_tree;
+ goto again;
+ }
+ return NULL;
+}
+
+/*
+ * add a fully filled in leaf ref struct
+ * remove all the refs older than a given root generation
+ */
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
+ int shared)
+{
+ int ret = 0;
+ struct rb_node *rb;
+ struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+
+ if (shared)
+ tree = &root->fs_info->shared_ref_tree;
+
+ spin_lock(&tree->lock);
+ rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
+ if (rb) {
+ ret = -EEXIST;
+ } else {
+ atomic_inc(&ref->usage);
+ ref->tree = tree;
+ ref->in_tree = 1;
+ list_add_tail(&ref->list, &tree->list);
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * remove a single leaf ref from the tree. This drops the ref held by the tree
+ * only
+ */
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{
+ struct btrfs_leaf_ref_tree *tree;
+
+ if (!xchg(&ref->in_tree, 0))
+ return 0;
+
+ tree = ref->tree;
+ spin_lock(&tree->lock);
+
+ rb_erase(&ref->rb_node, &tree->root);
+ list_del_init(&ref->list);
+
+ spin_unlock(&tree->lock);
+
+ btrfs_free_leaf_ref(root, ref);
+ return 0;
+}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 00000000000..16f3183d7c5
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __REFCACHE__
+#define __REFCACHE__
+
+struct btrfs_extent_info {
+ /* bytenr and num_bytes find the extent in the extent allocation tree */
+ u64 bytenr;
+ u64 num_bytes;
+
+ /* objectid and offset find the back reference for the file */
+ u64 objectid;
+ u64 offset;
+};
+
+struct btrfs_leaf_ref {
+ struct rb_node rb_node;
+ struct btrfs_leaf_ref_tree *tree;
+ int in_tree;
+ atomic_t usage;
+
+ u64 root_gen;
+ u64 bytenr;
+ u64 owner;
+ u64 generation;
+ int nritems;
+
+ struct list_head list;
+ struct btrfs_extent_info extents[];
+};
+
+static inline size_t btrfs_leaf_ref_size(int nr_extents)
+{
+ return sizeof(struct btrfs_leaf_ref) +
+ sizeof(struct btrfs_extent_info) * nr_extents;
+}
+
+static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
+{
+ tree->root.rb_node = NULL;
+ INIT_LIST_HEAD(&tree->list);
+ spin_lock_init(&tree->lock);
+}
+
+static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
+{
+ return RB_EMPTY_ROOT(&tree->root);
+}
+
+void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+ int nr_extents);
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+ u64 bytenr);
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
+ int shared);
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
+ int shared);
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+
+#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 00000000000..b48650de447
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+/*
+ * search forward for a root, starting with objectid 'search_start'
+ * if a root key is found, the objectid we find is filled into 'found_objectid'
+ * and 0 is returned. < 0 is returned on error, 1 if there is nothing
+ * left in the tree.
+ */
+int btrfs_search_root(struct btrfs_root *root, u64 search_start,
+ u64 *found_objectid)
+{
+ struct btrfs_path *path;
+ struct btrfs_key search_key;
+ int ret;
+
+ root = root->fs_info->tree_root;
+ search_key.objectid = search_start;
+ search_key.type = (u8)-1;
+ search_key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret == 0) {
+ ret = 1;
+ goto out;
+ }
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ goto out;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
+ if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
+ search_key.offset++;
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ ret = 0;
+ *found_objectid = search_key.objectid;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * lookup the root with the highest offset for a given objectid. The key we do
+ * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
+ * on error.
+ */
+int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
+ struct btrfs_root_item *item, struct btrfs_key *key)
+{
+ struct btrfs_path *path;
+ struct btrfs_key search_key;
+ struct btrfs_key found_key;
+ struct extent_buffer *l;
+ int ret;
+ int slot;
+
+ search_key.objectid = objectid;
+ search_key.type = BTRFS_ROOT_ITEM_KEY;
+ search_key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ BUG_ON(ret == 0);
+ l = path->nodes[0];
+ BUG_ON(path->slots[0] == 0);
+ slot = path->slots[0] - 1;
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ if (found_key.objectid != objectid) {
+ ret = 1;
+ goto out;
+ }
+ read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
+ sizeof(*item));
+ memcpy(key, &found_key, sizeof(found_key));
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * copy the data in 'item' into the btree
+ */
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_root_item
+ *item)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *l;
+ int ret;
+ int slot;
+ unsigned long ptr;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret != 0) {
+ btrfs_print_leaf(root, path->nodes[0]);
+ printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
+ (unsigned long long)key->objectid, key->type,
+ (unsigned long long)key->offset);
+ BUG_ON(1);
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ ptr = btrfs_item_ptr_offset(l, slot);
+ write_extent_buffer(l, item, ptr, sizeof(*item));
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+out:
+ btrfs_release_path(root, path);
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_key *key, struct btrfs_root_item
+ *item)
+{
+ int ret;
+ ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
+ return ret;
+}
+
+/*
+ * at mount time we want to find all the old transaction snapshots that were in
+ * the process of being deleted if we crashed. This is any root item with an
+ * offset lower than the latest root. They need to be queued for deletion to
+ * finish what was happening when we crashed.
+ */
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
+ struct btrfs_root *latest)
+{
+ struct btrfs_root *dead_root;
+ struct btrfs_item *item;
+ struct btrfs_root_item *ri;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ int ret;
+ u32 nritems;
+ struct extent_buffer *leaf;
+ int slot;
+
+ key.objectid = objectid;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = 0;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+again:
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+ while (1) {
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+ if (slot >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+ }
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
+ goto next;
+
+ if (key.objectid < objectid)
+ goto next;
+
+ if (key.objectid > objectid)
+ break;
+
+ ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
+ if (btrfs_disk_root_refs(leaf, ri) != 0)
+ goto next;
+
+ memcpy(&found_key, &key, sizeof(key));
+ key.offset++;
+ btrfs_release_path(root, path);
+ dead_root =
+ btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+ &found_key);
+ if (IS_ERR(dead_root)) {
+ ret = PTR_ERR(dead_root);
+ goto err;
+ }
+
+ if (objectid == BTRFS_TREE_RELOC_OBJECTID)
+ ret = btrfs_add_dead_reloc_root(dead_root);
+ else
+ ret = btrfs_add_dead_root(dead_root, latest);
+ if (ret)
+ goto err;
+ goto again;
+next:
+ slot++;
+ path->slots[0]++;
+ }
+ ret = 0;
+err:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/* drop the root item for 'key' from 'root' */
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key)
+{
+ struct btrfs_path *path;
+ int ret;
+ u32 refs;
+ struct btrfs_root_item *ri;
+ struct extent_buffer *leaf;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(trans, root, key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ BUG_ON(ret != 0);
+ leaf = path->nodes[0];
+ ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
+
+ refs = btrfs_disk_root_refs(leaf, ri);
+ BUG_ON(refs != 0);
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_release_path(root, path);
+ btrfs_free_path(path);
+ return ret;
+}
+
+#if 0 /* this will get used when snapshot deletion is implemented */
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u8 type, u64 ref_id)
+{
+ struct btrfs_key key;
+ int ret;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+
+ key.objectid = root_id;
+ key.type = type;
+ key.offset = ref_id;
+
+ ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+ BUG_ON(ret);
+
+ ret = btrfs_del_item(trans, tree_root, path);
+ BUG_ON(ret);
+
+ btrfs_free_path(path);
+ return ret;
+}
+#endif
+
+int btrfs_find_root_ref(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+ u64 root_id, u64 ref_id)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = root_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = ref_id;
+
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ return ret;
+}
+
+
+/*
+ * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
+ * or BTRFS_ROOT_BACKREF_KEY.
+ *
+ * The dirid, sequence, name and name_len refer to the directory entry
+ * that is referencing the root.
+ *
+ * For a forward ref, the root_id is the id of the tree referencing
+ * the root and ref_id is the id of the subvol or snapshot.
+ *
+ * For a back ref the root_id is the id of the subvol or snapshot and
+ * ref_id is the id of the tree referencing it.
+ */
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u8 type, u64 ref_id,
+ u64 dirid, u64 sequence,
+ const char *name, int name_len)
+{
+ struct btrfs_key key;
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ unsigned long ptr;
+
+
+ path = btrfs_alloc_path();
+
+ key.objectid = root_id;
+ key.type = type;
+ key.offset = ref_id;
+
+ ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
+ sizeof(*ref) + name_len);
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ btrfs_set_root_ref_dirid(leaf, ref, dirid);
+ btrfs_set_root_ref_sequence(leaf, ref, sequence);
+ btrfs_set_root_ref_name_len(leaf, ref, name_len);
+ ptr = (unsigned long)(ref + 1);
+ write_extent_buffer(leaf, name, ptr, name_len);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 00000000000..c0f7ecaf1e7
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/highmem.h>
+
+/* this is some deeply nasty code. ctree.h has a different
+ * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
+ *
+ * The end result is that anyone who #includes ctree.h gets a
+ * declaration for the btrfs_set_foo functions and btrfs_foo functions
+ *
+ * This file declares the macros and then #includes ctree.h, which results
+ * in cpp creating the function here based on the template below.
+ *
+ * These setget functions do all the extent_buffer related mapping
+ * required to efficiently read and write specific fields in the extent
+ * buffers. Every pointer to metadata items in btrfs is really just
+ * an unsigned long offset into the extent buffer which has been
+ * cast to a specific type. This gives us all the gcc type checking.
+ *
+ * The extent buffer api is used to do all the kmapping and page
+ * spanning work required to get extent buffers in highmem and have
+ * a metadata blocksize different from the page size.
+ *
+ * The macro starts with a simple function prototype declaration so that
+ * sparse won't complain about it being static.
+ */
+
+#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
+u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
+void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \
+u##bits btrfs_##name(struct extent_buffer *eb, \
+ type *s) \
+{ \
+ unsigned long part_offset = (unsigned long)s; \
+ unsigned long offset = part_offset + offsetof(type, member); \
+ type *p; \
+ /* ugly, but we want the fast path here */ \
+ if (eb->map_token && offset >= eb->map_start && \
+ offset + sizeof(((type *)0)->member) <= eb->map_start + \
+ eb->map_len) { \
+ p = (type *)(eb->kaddr + part_offset - eb->map_start); \
+ return le##bits##_to_cpu(p->member); \
+ } \
+ { \
+ int err; \
+ char *map_token; \
+ char *kaddr; \
+ int unmap_on_exit = (eb->map_token == NULL); \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ u##bits res; \
+ err = map_extent_buffer(eb, offset, \
+ sizeof(((type *)0)->member), \
+ &map_token, &kaddr, \
+ &map_start, &map_len, KM_USER1); \
+ if (err) { \
+ __le##bits leres; \
+ read_eb_member(eb, s, type, member, &leres); \
+ return le##bits##_to_cpu(leres); \
+ } \
+ p = (type *)(kaddr + part_offset - map_start); \
+ res = le##bits##_to_cpu(p->member); \
+ if (unmap_on_exit) \
+ unmap_extent_buffer(eb, map_token, KM_USER1); \
+ return res; \
+ } \
+} \
+void btrfs_set_##name(struct extent_buffer *eb, \
+ type *s, u##bits val) \
+{ \
+ unsigned long part_offset = (unsigned long)s; \
+ unsigned long offset = part_offset + offsetof(type, member); \
+ type *p; \
+ /* ugly, but we want the fast path here */ \
+ if (eb->map_token && offset >= eb->map_start && \
+ offset + sizeof(((type *)0)->member) <= eb->map_start + \
+ eb->map_len) { \
+ p = (type *)(eb->kaddr + part_offset - eb->map_start); \
+ p->member = cpu_to_le##bits(val); \
+ return; \
+ } \
+ { \
+ int err; \
+ char *map_token; \
+ char *kaddr; \
+ int unmap_on_exit = (eb->map_token == NULL); \
+ unsigned long map_start; \
+ unsigned long map_len; \
+ err = map_extent_buffer(eb, offset, \
+ sizeof(((type *)0)->member), \
+ &map_token, &kaddr, \
+ &map_start, &map_len, KM_USER1); \
+ if (err) { \
+ __le##bits val2; \
+ val2 = cpu_to_le##bits(val); \
+ write_eb_member(eb, s, type, member, &val2); \
+ return; \
+ } \
+ p = (type *)(kaddr + part_offset - map_start); \
+ p->member = cpu_to_le##bits(val); \
+ if (unmap_on_exit) \
+ unmap_extent_buffer(eb, map_token, KM_USER1); \
+ } \
+}
+
+#include "ctree.h"
+
+void btrfs_node_key(struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int nr)
+{
+ unsigned long ptr = btrfs_node_key_ptr_offset(nr);
+ if (eb->map_token && ptr >= eb->map_start &&
+ ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
+ memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
+ sizeof(*disk_key));
+ return;
+ } else if (eb->map_token) {
+ unmap_extent_buffer(eb, eb->map_token, KM_USER1);
+ eb->map_token = NULL;
+ }
+ read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+ struct btrfs_key_ptr, key, disk_key);
+}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 00000000000..b4c101d9322
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/parser.h>
+#include <linux/ctype.h>
+#include <linux/namei.h>
+#include <linux/miscdevice.h>
+#include <linux/version.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "xattr.h"
+#include "volumes.h"
+#include "version.h"
+#include "export.h"
+#include "compression.h"
+
+#define BTRFS_SUPER_MAGIC 0x9123683E
+
+static struct super_operations btrfs_super_ops;
+
+static void btrfs_put_super(struct super_block *sb)
+{
+ struct btrfs_root *root = btrfs_sb(sb);
+ int ret;
+
+ ret = close_ctree(root);
+ sb->s_fs_info = NULL;
+}
+
+enum {
+ Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
+ Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
+ Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
+};
+
+static match_table_t tokens = {
+ {Opt_degraded, "degraded"},
+ {Opt_subvol, "subvol=%s"},
+ {Opt_device, "device=%s"},
+ {Opt_nodatasum, "nodatasum"},
+ {Opt_nodatacow, "nodatacow"},
+ {Opt_nobarrier, "nobarrier"},
+ {Opt_max_extent, "max_extent=%s"},
+ {Opt_max_inline, "max_inline=%s"},
+ {Opt_alloc_start, "alloc_start=%s"},
+ {Opt_thread_pool, "thread_pool=%d"},
+ {Opt_compress, "compress"},
+ {Opt_ssd, "ssd"},
+ {Opt_noacl, "noacl"},
+ {Opt_err, NULL},
+};
+
+u64 btrfs_parse_size(char *str)
+{
+ u64 res;
+ int mult = 1;
+ char *end;
+ char last;
+
+ res = simple_strtoul(str, &end, 10);
+
+ last = end[0];
+ if (isalpha(last)) {
+ last = tolower(last);
+ switch (last) {
+ case 'g':
+ mult *= 1024;
+ case 'm':
+ mult *= 1024;
+ case 'k':
+ mult *= 1024;
+ }
+ res = res * mult;
+ }
+ return res;
+}
+
+/*
+ * Regular mount options parser. Everything that is needed only when
+ * reading in a new superblock is parsed here.
+ */
+int btrfs_parse_options(struct btrfs_root *root, char *options)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ substring_t args[MAX_OPT_ARGS];
+ char *p, *num;
+ int intarg;
+
+ if (!options)
+ return 0;
+
+ /*
+ * strsep changes the string, duplicate it because parse_options
+ * gets called twice
+ */
+ options = kstrdup(options, GFP_NOFS);
+ if (!options)
+ return -ENOMEM;
+
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_degraded:
+ printk(KERN_INFO "btrfs: allowing degraded mounts\n");
+ btrfs_set_opt(info->mount_opt, DEGRADED);
+ break;
+ case Opt_subvol:
+ case Opt_device:
+ /*
+ * These are parsed by btrfs_parse_early_options
+ * and can be happily ignored here.
+ */
+ break;
+ case Opt_nodatasum:
+ printk(KERN_INFO "btrfs: setting nodatacsum\n");
+ btrfs_set_opt(info->mount_opt, NODATASUM);
+ break;
+ case Opt_nodatacow:
+ printk(KERN_INFO "btrfs: setting nodatacow\n");
+ btrfs_set_opt(info->mount_opt, NODATACOW);
+ btrfs_set_opt(info->mount_opt, NODATASUM);
+ break;
+ case Opt_compress:
+ printk(KERN_INFO "btrfs: use compression\n");
+ btrfs_set_opt(info->mount_opt, COMPRESS);
+ break;
+ case Opt_ssd:
+ printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
+ btrfs_set_opt(info->mount_opt, SSD);
+ break;
+ case Opt_nobarrier:
+ printk(KERN_INFO "btrfs: turning off barriers\n");
+ btrfs_set_opt(info->mount_opt, NOBARRIER);
+ break;
+ case Opt_thread_pool:
+ intarg = 0;
+ match_int(&args[0], &intarg);
+ if (intarg) {
+ info->thread_pool_size = intarg;
+ printk(KERN_INFO "btrfs: thread pool %d\n",
+ info->thread_pool_size);
+ }
+ break;
+ case Opt_max_extent:
+ num = match_strdup(&args[0]);
+ if (num) {
+ info->max_extent = btrfs_parse_size(num);
+ kfree(num);
+
+ info->max_extent = max_t(u64,
+ info->max_extent, root->sectorsize);
+ printk(KERN_INFO "btrfs: max_extent at %llu\n",
+ info->max_extent);
+ }
+ break;
+ case Opt_max_inline:
+ num = match_strdup(&args[0]);
+ if (num) {
+ info->max_inline = btrfs_parse_size(num);
+ kfree(num);
+
+ if (info->max_inline) {
+ info->max_inline = max_t(u64,
+ info->max_inline,
+ root->sectorsize);
+ }
+ printk(KERN_INFO "btrfs: max_inline at %llu\n",
+ info->max_inline);
+ }
+ break;
+ case Opt_alloc_start:
+ num = match_strdup(&args[0]);
+ if (num) {
+ info->alloc_start = btrfs_parse_size(num);
+ kfree(num);
+ printk(KERN_INFO
+ "btrfs: allocations start at %llu\n",
+ info->alloc_start);
+ }
+ break;
+ case Opt_noacl:
+ root->fs_info->sb->s_flags &= ~MS_POSIXACL;
+ break;
+ default:
+ break;
+ }
+ }
+ kfree(options);
+ return 0;
+}
+
+/*
+ * Parse mount options that are required early in the mount process.
+ *
+ * All other options will be parsed on much later in the mount process and
+ * only when we need to allocate a new super block.
+ */
+static int btrfs_parse_early_options(const char *options, fmode_t flags,
+ void *holder, char **subvol_name,
+ struct btrfs_fs_devices **fs_devices)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *opts, *p;
+ int error = 0;
+
+ if (!options)
+ goto out;
+
+ /*
+ * strsep changes the string, duplicate it because parse_options
+ * gets called twice
+ */
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ while ((p = strsep(&opts, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_subvol:
+ *subvol_name = match_strdup(&args[0]);
+ break;
+ case Opt_device:
+ error = btrfs_scan_one_device(match_strdup(&args[0]),
+ flags, holder, fs_devices);
+ if (error)
+ goto out_free_opts;
+ break;
+ default:
+ break;
+ }
+ }
+
+ out_free_opts:
+ kfree(opts);
+ out:
+ /*
+ * If no subvolume name is specified we use the default one. Allocate
+ * a copy of the string "." here so that code later in the
+ * mount path doesn't care if it's the default volume or another one.
+ */
+ if (!*subvol_name) {
+ *subvol_name = kstrdup(".", GFP_KERNEL);
+ if (!*subvol_name)
+ return -ENOMEM;
+ }
+ return error;
+}
+
+static int btrfs_fill_super(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ void *data, int silent)
+{
+ struct inode *inode;
+ struct dentry *root_dentry;
+ struct btrfs_super_block *disk_super;
+ struct btrfs_root *tree_root;
+ struct btrfs_inode *bi;
+ int err;
+
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_magic = BTRFS_SUPER_MAGIC;
+ sb->s_op = &btrfs_super_ops;
+ sb->s_export_op = &btrfs_export_ops;
+ sb->s_xattr = btrfs_xattr_handlers;
+ sb->s_time_gran = 1;
+ sb->s_flags |= MS_POSIXACL;
+
+ tree_root = open_ctree(sb, fs_devices, (char *)data);
+
+ if (IS_ERR(tree_root)) {
+ printk("btrfs: open_ctree failed\n");
+ return PTR_ERR(tree_root);
+ }
+ sb->s_fs_info = tree_root;
+ disk_super = &tree_root->fs_info->super_copy;
+ inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
+ tree_root->fs_info->fs_root);
+ bi = BTRFS_I(inode);
+ bi->location.objectid = inode->i_ino;
+ bi->location.offset = 0;
+ bi->root = tree_root->fs_info->fs_root;
+
+ btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
+
+ if (!inode) {
+ err = -ENOMEM;
+ goto fail_close;
+ }
+ if (inode->i_state & I_NEW) {
+ btrfs_read_locked_inode(inode);
+ unlock_new_inode(inode);
+ }
+
+ root_dentry = d_alloc_root(inode);
+ if (!root_dentry) {
+ iput(inode);
+ err = -ENOMEM;
+ goto fail_close;
+ }
+#if 0
+ /* this does the super kobj at the same time */
+ err = btrfs_sysfs_add_super(tree_root->fs_info);
+ if (err)
+ goto fail_close;
+#endif
+
+ sb->s_root = root_dentry;
+
+ save_mount_options(sb, data);
+ return 0;
+
+fail_close:
+ close_ctree(tree_root);
+ return err;
+}
+
+int btrfs_sync_fs(struct super_block *sb, int wait)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root;
+ int ret;
+ root = btrfs_sb(sb);
+
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
+
+ sb->s_dirt = 0;
+ if (!wait) {
+ filemap_flush(root->fs_info->btree_inode->i_mapping);
+ return 0;
+ }
+
+ btrfs_start_delalloc_inodes(root);
+ btrfs_wait_ordered_extents(root, 0);
+
+ btrfs_clean_old_snapshots(root);
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_commit_transaction(trans, root);
+ sb->s_dirt = 0;
+ return ret;
+}
+
+static void btrfs_write_super(struct super_block *sb)
+{
+ sb->s_dirt = 0;
+}
+
+static int btrfs_test_super(struct super_block *s, void *data)
+{
+ struct btrfs_fs_devices *test_fs_devices = data;
+ struct btrfs_root *root = btrfs_sb(s);
+
+ return root->fs_info->fs_devices == test_fs_devices;
+}
+
+/*
+ * Find a superblock for the given device / mount point.
+ *
+ * Note: This is based on get_sb_bdev from fs/super.c with a few additions
+ * for multiple device setup. Make sure to keep it in sync.
+ */
+static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ char *subvol_name = NULL;
+ struct block_device *bdev = NULL;
+ struct super_block *s;
+ struct dentry *root;
+ struct btrfs_fs_devices *fs_devices = NULL;
+ fmode_t mode = FMODE_READ;
+ int error = 0;
+
+ if (!(flags & MS_RDONLY))
+ mode |= FMODE_WRITE;
+
+ error = btrfs_parse_early_options(data, mode, fs_type,
+ &subvol_name, &fs_devices);
+ if (error)
+ return error;
+
+ error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
+ if (error)
+ goto error_free_subvol_name;
+
+ error = btrfs_open_devices(fs_devices, mode, fs_type);
+ if (error)
+ goto error_free_subvol_name;
+
+ if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+ error = -EACCES;
+ goto error_close_devices;
+ }
+
+ bdev = fs_devices->latest_bdev;
+ s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
+ if (IS_ERR(s))
+ goto error_s;
+
+ if (s->s_root) {
+ if ((flags ^ s->s_flags) & MS_RDONLY) {
+ up_write(&s->s_umount);
+ deactivate_super(s);
+ error = -EBUSY;
+ goto error_close_devices;
+ }
+
+ btrfs_close_devices(fs_devices);
+ } else {
+ char b[BDEVNAME_SIZE];
+
+ s->s_flags = flags;
+ strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ error = btrfs_fill_super(s, fs_devices, data,
+ flags & MS_SILENT ? 1 : 0);
+ if (error) {
+ up_write(&s->s_umount);
+ deactivate_super(s);
+ goto error_free_subvol_name;
+ }
+
+ btrfs_sb(s)->fs_info->bdev_holder = fs_type;
+ s->s_flags |= MS_ACTIVE;
+ }
+
+ if (!strcmp(subvol_name, "."))
+ root = dget(s->s_root);
+ else {
+ mutex_lock(&s->s_root->d_inode->i_mutex);
+ root = lookup_one_len(subvol_name, s->s_root,
+ strlen(subvol_name));
+ mutex_unlock(&s->s_root->d_inode->i_mutex);
+
+ if (IS_ERR(root)) {
+ up_write(&s->s_umount);
+ deactivate_super(s);
+ error = PTR_ERR(root);
+ goto error_free_subvol_name;
+ }
+ if (!root->d_inode) {
+ dput(root);
+ up_write(&s->s_umount);
+ deactivate_super(s);
+ error = -ENXIO;
+ goto error_free_subvol_name;
+ }
+ }
+
+ mnt->mnt_sb = s;
+ mnt->mnt_root = root;
+
+ kfree(subvol_name);
+ return 0;
+
+error_s:
+ error = PTR_ERR(s);
+error_close_devices:
+ btrfs_close_devices(fs_devices);
+error_free_subvol_name:
+ kfree(subvol_name);
+ return error;
+}
+
+static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct btrfs_root *root = btrfs_sb(sb);
+ int ret;
+
+ if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+ return 0;
+
+ if (*flags & MS_RDONLY) {
+ sb->s_flags |= MS_RDONLY;
+
+ ret = btrfs_commit_super(root);
+ WARN_ON(ret);
+ } else {
+ if (root->fs_info->fs_devices->rw_devices == 0)
+ return -EACCES;
+
+ if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
+ return -EINVAL;
+
+ ret = btrfs_cleanup_reloc_trees(root);
+ WARN_ON(ret);
+
+ ret = btrfs_cleanup_fs_roots(root->fs_info);
+ WARN_ON(ret);
+
+ sb->s_flags &= ~MS_RDONLY;
+ }
+
+ return 0;
+}
+
+static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct btrfs_root *root = btrfs_sb(dentry->d_sb);
+ struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ int bits = dentry->d_sb->s_blocksize_bits;
+ __be32 *fsid = (__be32 *)root->fs_info->fsid;
+
+ buf->f_namelen = BTRFS_NAME_LEN;
+ buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
+ buf->f_bfree = buf->f_blocks -
+ (btrfs_super_bytes_used(disk_super) >> bits);
+ buf->f_bavail = buf->f_bfree;
+ buf->f_bsize = dentry->d_sb->s_blocksize;
+ buf->f_type = BTRFS_SUPER_MAGIC;
+
+ /* We treat it as constant endianness (it doesn't matter _which_)
+ because we want the fsid to come out the same whether mounted
+ on a big-endian or little-endian host */
+ buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
+ buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
+ /* Mask in the root object ID too, to disambiguate subvols */
+ buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
+ buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
+
+ return 0;
+}
+
+static struct file_system_type btrfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .get_sb = btrfs_get_sb,
+ .kill_sb = kill_anon_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+/*
+ * used by btrfsctl to scan devices when no FS is mounted
+ */
+static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct btrfs_ioctl_vol_args *vol;
+ struct btrfs_fs_devices *fs_devices;
+ int ret = 0;
+ int len;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol = kmalloc(sizeof(*vol), GFP_KERNEL);
+ if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
+ switch (cmd) {
+ case BTRFS_IOC_SCAN_DEV:
+ ret = btrfs_scan_one_device(vol->name, FMODE_READ,
+ &btrfs_fs_type, &fs_devices);
+ break;
+ }
+out:
+ kfree(vol);
+ return ret;
+}
+
+static void btrfs_write_super_lockfs(struct super_block *sb)
+{
+ struct btrfs_root *root = btrfs_sb(sb);
+ mutex_lock(&root->fs_info->transaction_kthread_mutex);
+ mutex_lock(&root->fs_info->cleaner_mutex);
+}
+
+static void btrfs_unlockfs(struct super_block *sb)
+{
+ struct btrfs_root *root = btrfs_sb(sb);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+ mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+}
+
+static struct super_operations btrfs_super_ops = {
+ .delete_inode = btrfs_delete_inode,
+ .put_super = btrfs_put_super,
+ .write_super = btrfs_write_super,
+ .sync_fs = btrfs_sync_fs,
+ .show_options = generic_show_options,
+ .write_inode = btrfs_write_inode,
+ .dirty_inode = btrfs_dirty_inode,
+ .alloc_inode = btrfs_alloc_inode,
+ .destroy_inode = btrfs_destroy_inode,
+ .statfs = btrfs_statfs,
+ .remount_fs = btrfs_remount,
+ .write_super_lockfs = btrfs_write_super_lockfs,
+ .unlockfs = btrfs_unlockfs,
+};
+
+static const struct file_operations btrfs_ctl_fops = {
+ .unlocked_ioctl = btrfs_control_ioctl,
+ .compat_ioctl = btrfs_control_ioctl,
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice btrfs_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "btrfs-control",
+ .fops = &btrfs_ctl_fops
+};
+
+static int btrfs_interface_init(void)
+{
+ return misc_register(&btrfs_misc);
+}
+
+static void btrfs_interface_exit(void)
+{
+ if (misc_deregister(&btrfs_misc) < 0)
+ printk(KERN_INFO "misc_deregister failed for control device");
+}
+
+static int __init init_btrfs_fs(void)
+{
+ int err;
+
+ err = btrfs_init_sysfs();
+ if (err)
+ return err;
+
+ err = btrfs_init_cachep();
+ if (err)
+ goto free_sysfs;
+
+ err = extent_io_init();
+ if (err)
+ goto free_cachep;
+
+ err = extent_map_init();
+ if (err)
+ goto free_extent_io;
+
+ err = btrfs_interface_init();
+ if (err)
+ goto free_extent_map;
+
+ err = register_filesystem(&btrfs_fs_type);
+ if (err)
+ goto unregister_ioctl;
+
+ printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
+ return 0;
+
+unregister_ioctl:
+ btrfs_interface_exit();
+free_extent_map:
+ extent_map_exit();
+free_extent_io:
+ extent_io_exit();
+free_cachep:
+ btrfs_destroy_cachep();
+free_sysfs:
+ btrfs_exit_sysfs();
+ return err;
+}
+
+static void __exit exit_btrfs_fs(void)
+{
+ btrfs_destroy_cachep();
+ extent_map_exit();
+ extent_io_exit();
+ btrfs_interface_exit();
+ unregister_filesystem(&btrfs_fs_type);
+ btrfs_exit_sysfs();
+ btrfs_cleanup_fs_uuids();
+ btrfs_zlib_exit();
+}
+
+module_init(init_btrfs_fs)
+module_exit(exit_btrfs_fs)
+
+MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 00000000000..a240b6fa81d
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)btrfs_root_used(&root->root_item));
+}
+
+static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)btrfs_root_limit(&root->root_item));
+}
+
+static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
+{
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
+}
+
+static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
+}
+
+static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
+}
+
+/* this is for root attrs (subvols/snapshots) */
+struct btrfs_root_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct btrfs_root *, char *);
+ ssize_t (*store)(struct btrfs_root *, const char *, size_t);
+};
+
+#define ROOT_ATTR(name, mode, show, store) \
+static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
+ show, store)
+
+ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
+ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
+
+static struct attribute *btrfs_root_attrs[] = {
+ &btrfs_root_attr_blocks_used.attr,
+ &btrfs_root_attr_block_limit.attr,
+ NULL,
+};
+
+/* this is for super attrs (actual full fs) */
+struct btrfs_super_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct btrfs_fs_info *, char *);
+ ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
+};
+
+#define SUPER_ATTR(name, mode, show, store) \
+static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
+ show, store)
+
+SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
+SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
+SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
+
+static struct attribute *btrfs_super_attrs[] = {
+ &btrfs_super_attr_blocks_used.attr,
+ &btrfs_super_attr_total_blocks.attr,
+ &btrfs_super_attr_blocksize.attr,
+ NULL,
+};
+
+static ssize_t btrfs_super_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
+ super_kobj);
+ struct btrfs_super_attr *a = container_of(attr,
+ struct btrfs_super_attr,
+ attr);
+
+ return a->show ? a->show(fs, buf) : 0;
+}
+
+static ssize_t btrfs_super_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
+ super_kobj);
+ struct btrfs_super_attr *a = container_of(attr,
+ struct btrfs_super_attr,
+ attr);
+
+ return a->store ? a->store(fs, buf, len) : 0;
+}
+
+static ssize_t btrfs_root_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct btrfs_root *root = container_of(kobj, struct btrfs_root,
+ root_kobj);
+ struct btrfs_root_attr *a = container_of(attr,
+ struct btrfs_root_attr,
+ attr);
+
+ return a->show ? a->show(root, buf) : 0;
+}
+
+static ssize_t btrfs_root_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct btrfs_root *root = container_of(kobj, struct btrfs_root,
+ root_kobj);
+ struct btrfs_root_attr *a = container_of(attr,
+ struct btrfs_root_attr,
+ attr);
+ return a->store ? a->store(root, buf, len) : 0;
+}
+
+static void btrfs_super_release(struct kobject *kobj)
+{
+ struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
+ super_kobj);
+ complete(&fs->kobj_unregister);
+}
+
+static void btrfs_root_release(struct kobject *kobj)
+{
+ struct btrfs_root *root = container_of(kobj, struct btrfs_root,
+ root_kobj);
+ complete(&root->kobj_unregister);
+}
+
+static struct sysfs_ops btrfs_super_attr_ops = {
+ .show = btrfs_super_attr_show,
+ .store = btrfs_super_attr_store,
+};
+
+static struct sysfs_ops btrfs_root_attr_ops = {
+ .show = btrfs_root_attr_show,
+ .store = btrfs_root_attr_store,
+};
+
+static struct kobj_type btrfs_root_ktype = {
+ .default_attrs = btrfs_root_attrs,
+ .sysfs_ops = &btrfs_root_attr_ops,
+ .release = btrfs_root_release,
+};
+
+static struct kobj_type btrfs_super_ktype = {
+ .default_attrs = btrfs_super_attrs,
+ .sysfs_ops = &btrfs_super_attr_ops,
+ .release = btrfs_super_release,
+};
+
+/* /sys/fs/btrfs/ entry */
+static struct kset *btrfs_kset;
+
+int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
+{
+ int error;
+ char *name;
+ char c;
+ int len = strlen(fs->sb->s_id) + 1;
+ int i;
+
+ name = kmalloc(len, GFP_NOFS);
+ if (!name) {
+ error = -ENOMEM;
+ goto fail;
+ }
+
+ for (i = 0; i < len; i++) {
+ c = fs->sb->s_id[i];
+ if (c == '/' || c == '\\')
+ c = '!';
+ name[i] = c;
+ }
+ name[len] = '\0';
+
+ fs->super_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
+ NULL, "%s", name);
+ kfree(name);
+ if (error)
+ goto fail;
+
+ return 0;
+
+fail:
+ printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
+ return error;
+}
+
+int btrfs_sysfs_add_root(struct btrfs_root *root)
+{
+ int error;
+
+ error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
+ &root->fs_info->super_kobj,
+ "%s", root->name);
+ if (error)
+ goto fail;
+
+ return 0;
+
+fail:
+ printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
+ return error;
+}
+
+void btrfs_sysfs_del_root(struct btrfs_root *root)
+{
+ kobject_put(&root->root_kobj);
+ wait_for_completion(&root->kobj_unregister);
+}
+
+void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
+{
+ kobject_put(&fs->super_kobj);
+ wait_for_completion(&fs->kobj_unregister);
+}
+
+int btrfs_init_sysfs(void)
+{
+ btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
+ if (!btrfs_kset)
+ return -ENOMEM;
+ return 0;
+}
+
+void btrfs_exit_sysfs(void)
+{
+ kset_unregister(btrfs_kset);
+}
+
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 00000000000..8a08f944334
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,1097 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "locking.h"
+#include "ref-cache.h"
+#include "tree-log.h"
+
+#define BTRFS_ROOT_TRANS_TAG 0
+
+static noinline void put_transaction(struct btrfs_transaction *transaction)
+{
+ WARN_ON(transaction->use_count == 0);
+ transaction->use_count--;
+ if (transaction->use_count == 0) {
+ list_del_init(&transaction->list);
+ memset(transaction, 0, sizeof(*transaction));
+ kmem_cache_free(btrfs_transaction_cachep, transaction);
+ }
+}
+
+/*
+ * either allocate a new transaction or hop into the existing one
+ */
+static noinline int join_transaction(struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans;
+ cur_trans = root->fs_info->running_transaction;
+ if (!cur_trans) {
+ cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
+ GFP_NOFS);
+ BUG_ON(!cur_trans);
+ root->fs_info->generation++;
+ root->fs_info->last_alloc = 0;
+ root->fs_info->last_data_alloc = 0;
+ cur_trans->num_writers = 1;
+ cur_trans->num_joined = 0;
+ cur_trans->transid = root->fs_info->generation;
+ init_waitqueue_head(&cur_trans->writer_wait);
+ init_waitqueue_head(&cur_trans->commit_wait);
+ cur_trans->in_commit = 0;
+ cur_trans->blocked = 0;
+ cur_trans->use_count = 1;
+ cur_trans->commit_done = 0;
+ cur_trans->start_time = get_seconds();
+ INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+ list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+ extent_io_tree_init(&cur_trans->dirty_pages,
+ root->fs_info->btree_inode->i_mapping,
+ GFP_NOFS);
+ spin_lock(&root->fs_info->new_trans_lock);
+ root->fs_info->running_transaction = cur_trans;
+ spin_unlock(&root->fs_info->new_trans_lock);
+ } else {
+ cur_trans->num_writers++;
+ cur_trans->num_joined++;
+ }
+
+ return 0;
+}
+
+/*
+ * this does all the record keeping required to make sure that a reference
+ * counted root is properly recorded in a given transaction. This is required
+ * to make sure the old root from before we joined the transaction is deleted
+ * when the transaction commits
+ */
+noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
+{
+ struct btrfs_dirty_root *dirty;
+ u64 running_trans_id = root->fs_info->running_transaction->transid;
+ if (root->ref_cows && root->last_trans < running_trans_id) {
+ WARN_ON(root == root->fs_info->extent_root);
+ if (root->root_item.refs != 0) {
+ radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+
+ dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
+ BUG_ON(!dirty);
+ dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
+ BUG_ON(!dirty->root);
+ dirty->latest_root = root;
+ INIT_LIST_HEAD(&dirty->list);
+
+ root->commit_root = btrfs_root_node(root);
+
+ memcpy(dirty->root, root, sizeof(*root));
+ spin_lock_init(&dirty->root->node_lock);
+ spin_lock_init(&dirty->root->list_lock);
+ mutex_init(&dirty->root->objectid_mutex);
+ mutex_init(&dirty->root->log_mutex);
+ INIT_LIST_HEAD(&dirty->root->dead_list);
+ dirty->root->node = root->commit_root;
+ dirty->root->commit_root = NULL;
+
+ spin_lock(&root->list_lock);
+ list_add(&dirty->root->dead_list, &root->dead_list);
+ spin_unlock(&root->list_lock);
+
+ root->dirty_root = dirty;
+ } else {
+ WARN_ON(1);
+ }
+ root->last_trans = running_trans_id;
+ }
+ return 0;
+}
+
+/* wait for commit against the current transaction to become unblocked
+ * when this is done, it is safe to start a new transaction, but the current
+ * transaction might not be fully on disk.
+ */
+static void wait_current_trans(struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans;
+
+ cur_trans = root->fs_info->running_transaction;
+ if (cur_trans && cur_trans->blocked) {
+ DEFINE_WAIT(wait);
+ cur_trans->use_count++;
+ while (1) {
+ prepare_to_wait(&root->fs_info->transaction_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (cur_trans->blocked) {
+ mutex_unlock(&root->fs_info->trans_mutex);
+ schedule();
+ mutex_lock(&root->fs_info->trans_mutex);
+ finish_wait(&root->fs_info->transaction_wait,
+ &wait);
+ } else {
+ finish_wait(&root->fs_info->transaction_wait,
+ &wait);
+ break;
+ }
+ }
+ put_transaction(cur_trans);
+ }
+}
+
+static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
+ int num_blocks, int wait)
+{
+ struct btrfs_trans_handle *h =
+ kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ int ret;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (!root->fs_info->log_root_recovering &&
+ ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
+ wait_current_trans(root);
+ ret = join_transaction(root);
+ BUG_ON(ret);
+
+ btrfs_record_root_in_trans(root);
+ h->transid = root->fs_info->running_transaction->transid;
+ h->transaction = root->fs_info->running_transaction;
+ h->blocks_reserved = num_blocks;
+ h->blocks_used = 0;
+ h->block_group = 0;
+ h->alloc_exclude_nr = 0;
+ h->alloc_exclude_start = 0;
+ root->fs_info->running_transaction->use_count++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ return h;
+}
+
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+ int num_blocks)
+{
+ return start_transaction(root, num_blocks, 1);
+}
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+ int num_blocks)
+{
+ return start_transaction(root, num_blocks, 0);
+}
+
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
+ int num_blocks)
+{
+ return start_transaction(r, num_blocks, 2);
+}
+
+/* wait for a transaction commit to be fully complete */
+static noinline int wait_for_commit(struct btrfs_root *root,
+ struct btrfs_transaction *commit)
+{
+ DEFINE_WAIT(wait);
+ mutex_lock(&root->fs_info->trans_mutex);
+ while (!commit->commit_done) {
+ prepare_to_wait(&commit->commit_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (commit->commit_done)
+ break;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ schedule();
+ mutex_lock(&root->fs_info->trans_mutex);
+ }
+ mutex_unlock(&root->fs_info->trans_mutex);
+ finish_wait(&commit->commit_wait, &wait);
+ return 0;
+}
+
+/*
+ * rate limit against the drop_snapshot code. This helps to slow down new
+ * operations if the drop_snapshot code isn't able to keep up.
+ */
+static void throttle_on_drops(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ int harder_count = 0;
+
+harder:
+ if (atomic_read(&info->throttles)) {
+ DEFINE_WAIT(wait);
+ int thr;
+ thr = atomic_read(&info->throttle_gen);
+
+ do {
+ prepare_to_wait(&info->transaction_throttle,
+ &wait, TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&info->throttles)) {
+ finish_wait(&info->transaction_throttle, &wait);
+ break;
+ }
+ schedule();
+ finish_wait(&info->transaction_throttle, &wait);
+ } while (thr == atomic_read(&info->throttle_gen));
+ harder_count++;
+
+ if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
+ harder_count < 2)
+ goto harder;
+
+ if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
+ harder_count < 10)
+ goto harder;
+
+ if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
+ harder_count < 20)
+ goto harder;
+ }
+}
+
+void btrfs_throttle(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (!root->fs_info->open_ioctl_trans)
+ wait_current_trans(root);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ throttle_on_drops(root);
+}
+
+static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int throttle)
+{
+ struct btrfs_transaction *cur_trans;
+ struct btrfs_fs_info *info = root->fs_info;
+
+ mutex_lock(&info->trans_mutex);
+ cur_trans = info->running_transaction;
+ WARN_ON(cur_trans != trans->transaction);
+ WARN_ON(cur_trans->num_writers < 1);
+ cur_trans->num_writers--;
+
+ if (waitqueue_active(&cur_trans->writer_wait))
+ wake_up(&cur_trans->writer_wait);
+ put_transaction(cur_trans);
+ mutex_unlock(&info->trans_mutex);
+ memset(trans, 0, sizeof(*trans));
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+ if (throttle)
+ throttle_on_drops(root);
+
+ return 0;
+}
+
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_end_transaction(trans, root, 0);
+}
+
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_end_transaction(trans, root, 1);
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees. This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ */
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages)
+{
+ int ret;
+ int err = 0;
+ int werr = 0;
+ struct page *page;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ u64 start = 0;
+ u64 end;
+ unsigned long index;
+
+ while (1) {
+ ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+ EXTENT_DIRTY);
+ if (ret)
+ break;
+ while (start <= end) {
+ cond_resched();
+
+ index = start >> PAGE_CACHE_SHIFT;
+ start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+ page = find_get_page(btree_inode->i_mapping, index);
+ if (!page)
+ continue;
+
+ btree_lock_page_hook(page);
+ if (!page->mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ continue;
+ }
+
+ if (PageWriteback(page)) {
+ if (PageDirty(page))
+ wait_on_page_writeback(page);
+ else {
+ unlock_page(page);
+ page_cache_release(page);
+ continue;
+ }
+ }
+ err = write_one_page(page, 0);
+ if (err)
+ werr = err;
+ page_cache_release(page);
+ }
+ }
+ while (1) {
+ ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
+ EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
+ while (start <= end) {
+ index = start >> PAGE_CACHE_SHIFT;
+ start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+ page = find_get_page(btree_inode->i_mapping, index);
+ if (!page)
+ continue;
+ if (PageDirty(page)) {
+ btree_lock_page_hook(page);
+ wait_on_page_writeback(page);
+ err = write_one_page(page, 0);
+ if (err)
+ werr = err;
+ }
+ wait_on_page_writeback(page);
+ page_cache_release(page);
+ cond_resched();
+ }
+ }
+ if (err)
+ werr = err;
+ return werr;
+}
+
+int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (!trans || !trans->transaction) {
+ struct inode *btree_inode;
+ btree_inode = root->fs_info->btree_inode;
+ return filemap_write_and_wait(btree_inode->i_mapping);
+ }
+ return btrfs_write_and_wait_marked_extents(root,
+ &trans->transaction->dirty_pages);
+}
+
+/*
+ * this is used to update the root pointer in the tree of tree roots.
+ *
+ * But, in the case of the extent allocation tree, updating the root
+ * pointer may allocate blocks which may change the root of the extent
+ * allocation tree.
+ *
+ * So, this loops and repeats and makes sure the cowonly root didn't
+ * change while the root pointer was being updated in the metadata.
+ */
+static int update_cowonly_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+ u64 old_root_bytenr;
+ struct btrfs_root *tree_root = root->fs_info->tree_root;
+
+ btrfs_extent_post_op(trans, root);
+ btrfs_write_dirty_block_groups(trans, root);
+ btrfs_extent_post_op(trans, root);
+
+ while (1) {
+ old_root_bytenr = btrfs_root_bytenr(&root->root_item);
+ if (old_root_bytenr == root->node->start)
+ break;
+ btrfs_set_root_bytenr(&root->root_item,
+ root->node->start);
+ btrfs_set_root_level(&root->root_item,
+ btrfs_header_level(root->node));
+ btrfs_set_root_generation(&root->root_item, trans->transid);
+
+ btrfs_extent_post_op(trans, root);
+
+ ret = btrfs_update_root(trans, tree_root,
+ &root->root_key,
+ &root->root_item);
+ BUG_ON(ret);
+ btrfs_write_dirty_block_groups(trans, root);
+ btrfs_extent_post_op(trans, root);
+ }
+ return 0;
+}
+
+/*
+ * update all the cowonly tree roots on disk
+ */
+int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct list_head *next;
+ struct extent_buffer *eb;
+
+ btrfs_extent_post_op(trans, fs_info->tree_root);
+
+ eb = btrfs_lock_root_node(fs_info->tree_root);
+ btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+
+ btrfs_extent_post_op(trans, fs_info->tree_root);
+
+ while (!list_empty(&fs_info->dirty_cowonly_roots)) {
+ next = fs_info->dirty_cowonly_roots.next;
+ list_del_init(next);
+ root = list_entry(next, struct btrfs_root, dirty_list);
+
+ update_cowonly_root(trans, root);
+ }
+ return 0;
+}
+
+/*
+ * dead roots are old snapshots that need to be deleted. This allocates
+ * a dirty root struct and adds it into the list of dead roots that need to
+ * be deleted
+ */
+int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
+{
+ struct btrfs_dirty_root *dirty;
+
+ dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
+ if (!dirty)
+ return -ENOMEM;
+ dirty->root = root;
+ dirty->latest_root = latest;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ list_add(&dirty->list, &latest->fs_info->dead_roots);
+ mutex_unlock(&root->fs_info->trans_mutex);
+ return 0;
+}
+
+/*
+ * at transaction commit time we need to schedule the old roots for
+ * deletion via btrfs_drop_snapshot. This runs through all the
+ * reference counted roots that were modified in the current
+ * transaction and puts them into the drop list
+ */
+static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
+ struct radix_tree_root *radix,
+ struct list_head *list)
+{
+ struct btrfs_dirty_root *dirty;
+ struct btrfs_root *gang[8];
+ struct btrfs_root *root;
+ int i;
+ int ret;
+ int err = 0;
+ u32 refs;
+
+ while (1) {
+ ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_TRANS_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ root = gang[i];
+ radix_tree_tag_clear(radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+
+ BUG_ON(!root->ref_tree);
+ dirty = root->dirty_root;
+
+ btrfs_free_log(trans, root);
+ btrfs_free_reloc_root(trans, root);
+
+ if (root->commit_root == root->node) {
+ WARN_ON(root->node->start !=
+ btrfs_root_bytenr(&root->root_item));
+
+ free_extent_buffer(root->commit_root);
+ root->commit_root = NULL;
+ root->dirty_root = NULL;
+
+ spin_lock(&root->list_lock);
+ list_del_init(&dirty->root->dead_list);
+ spin_unlock(&root->list_lock);
+
+ kfree(dirty->root);
+ kfree(dirty);
+
+ /* make sure to update the root on disk
+ * so we get any updates to the block used
+ * counts
+ */
+ err = btrfs_update_root(trans,
+ root->fs_info->tree_root,
+ &root->root_key,
+ &root->root_item);
+ continue;
+ }
+
+ memset(&root->root_item.drop_progress, 0,
+ sizeof(struct btrfs_disk_key));
+ root->root_item.drop_level = 0;
+ root->commit_root = NULL;
+ root->dirty_root = NULL;
+ root->root_key.offset = root->fs_info->generation;
+ btrfs_set_root_bytenr(&root->root_item,
+ root->node->start);
+ btrfs_set_root_level(&root->root_item,
+ btrfs_header_level(root->node));
+ btrfs_set_root_generation(&root->root_item,
+ root->root_key.offset);
+
+ err = btrfs_insert_root(trans, root->fs_info->tree_root,
+ &root->root_key,
+ &root->root_item);
+ if (err)
+ break;
+
+ refs = btrfs_root_refs(&dirty->root->root_item);
+ btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
+ err = btrfs_update_root(trans, root->fs_info->tree_root,
+ &dirty->root->root_key,
+ &dirty->root->root_item);
+
+ BUG_ON(err);
+ if (refs == 1) {
+ list_add(&dirty->list, list);
+ } else {
+ WARN_ON(1);
+ free_extent_buffer(dirty->root->node);
+ kfree(dirty->root);
+ kfree(dirty);
+ }
+ }
+ }
+ return err;
+}
+
+/*
+ * defrag a given btree. If cacheonly == 1, this won't read from the disk,
+ * otherwise every leaf in the btree is read and defragged.
+ */
+int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ int ret;
+ struct btrfs_trans_handle *trans;
+ unsigned long nr;
+
+ smp_mb();
+ if (root->defrag_running)
+ return 0;
+ trans = btrfs_start_transaction(root, 1);
+ while (1) {
+ root->defrag_running = 1;
+ ret = btrfs_defrag_leaves(trans, root, cacheonly);
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(info->tree_root, nr);
+ cond_resched();
+
+ trans = btrfs_start_transaction(root, 1);
+ if (root->fs_info->closing || ret != -EAGAIN)
+ break;
+ }
+ root->defrag_running = 0;
+ smp_mb();
+ btrfs_end_transaction(trans, root);
+ return 0;
+}
+
+/*
+ * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
+ * all of them
+ */
+static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
+ struct list_head *list)
+{
+ struct btrfs_dirty_root *dirty;
+ struct btrfs_trans_handle *trans;
+ unsigned long nr;
+ u64 num_bytes;
+ u64 bytes_used;
+ u64 max_useless;
+ int ret = 0;
+ int err;
+
+ while (!list_empty(list)) {
+ struct btrfs_root *root;
+
+ dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
+ list_del_init(&dirty->list);
+
+ num_bytes = btrfs_root_used(&dirty->root->root_item);
+ root = dirty->latest_root;
+ atomic_inc(&root->fs_info->throttles);
+
+ while (1) {
+ trans = btrfs_start_transaction(tree_root, 1);
+ mutex_lock(&root->fs_info->drop_mutex);
+ ret = btrfs_drop_snapshot(trans, dirty->root);
+ if (ret != -EAGAIN)
+ break;
+ mutex_unlock(&root->fs_info->drop_mutex);
+
+ err = btrfs_update_root(trans,
+ tree_root,
+ &dirty->root->root_key,
+ &dirty->root->root_item);
+ if (err)
+ ret = err;
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction(trans, tree_root);
+ BUG_ON(ret);
+
+ btrfs_btree_balance_dirty(tree_root, nr);
+ cond_resched();
+ }
+ BUG_ON(ret);
+ atomic_dec(&root->fs_info->throttles);
+ wake_up(&root->fs_info->transaction_throttle);
+
+ num_bytes -= btrfs_root_used(&dirty->root->root_item);
+ bytes_used = btrfs_root_used(&root->root_item);
+ if (num_bytes) {
+ btrfs_record_root_in_trans(root);
+ btrfs_set_root_used(&root->root_item,
+ bytes_used - num_bytes);
+ }
+
+ ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
+ if (ret) {
+ BUG();
+ break;
+ }
+ mutex_unlock(&root->fs_info->drop_mutex);
+
+ spin_lock(&root->list_lock);
+ list_del_init(&dirty->root->dead_list);
+ if (!list_empty(&root->dead_list)) {
+ struct btrfs_root *oldest;
+ oldest = list_entry(root->dead_list.prev,
+ struct btrfs_root, dead_list);
+ max_useless = oldest->root_key.offset - 1;
+ } else {
+ max_useless = root->root_key.offset - 1;
+ }
+ spin_unlock(&root->list_lock);
+
+ nr = trans->blocks_used;
+ ret = btrfs_end_transaction(trans, tree_root);
+ BUG_ON(ret);
+
+ ret = btrfs_remove_leaf_refs(root, max_useless, 0);
+ BUG_ON(ret);
+
+ free_extent_buffer(dirty->root->node);
+ kfree(dirty->root);
+ kfree(dirty);
+
+ btrfs_btree_balance_dirty(tree_root, nr);
+ cond_resched();
+ }
+ return ret;
+}
+
+/*
+ * new snapshots need to be created at a very specific time in the
+ * transaction commit. This does the actual creation
+ */
+static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_key key;
+ struct btrfs_root_item *new_root_item;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *root = pending->root;
+ struct extent_buffer *tmp;
+ struct extent_buffer *old;
+ int ret;
+ u64 objectid;
+
+ new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
+ if (!new_root_item) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
+ if (ret)
+ goto fail;
+
+ btrfs_record_root_in_trans(root);
+ btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
+ memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+
+ key.objectid = objectid;
+ key.offset = trans->transid;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
+ old = btrfs_lock_root_node(root);
+ btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
+
+ btrfs_copy_root(trans, root, old, &tmp, objectid);
+ btrfs_tree_unlock(old);
+ free_extent_buffer(old);
+
+ btrfs_set_root_bytenr(new_root_item, tmp->start);
+ btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
+ btrfs_set_root_generation(new_root_item, trans->transid);
+ ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+ new_root_item);
+ btrfs_tree_unlock(tmp);
+ free_extent_buffer(tmp);
+ if (ret)
+ goto fail;
+
+ key.offset = (u64)-1;
+ memcpy(&pending->root_key, &key, sizeof(key));
+fail:
+ kfree(new_root_item);
+ return ret;
+}
+
+static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
+ struct btrfs_pending_snapshot *pending)
+{
+ int ret;
+ int namelen;
+ u64 index = 0;
+ struct btrfs_trans_handle *trans;
+ struct inode *parent_inode;
+ struct inode *inode;
+ struct btrfs_root *parent_root;
+
+ parent_inode = pending->dentry->d_parent->d_inode;
+ parent_root = BTRFS_I(parent_inode)->root;
+ trans = btrfs_join_transaction(parent_root, 1);
+
+ /*
+ * insert the directory item
+ */
+ namelen = strlen(pending->name);
+ ret = btrfs_set_inode_index(parent_inode, &index);
+ ret = btrfs_insert_dir_item(trans, parent_root,
+ pending->name, namelen,
+ parent_inode->i_ino,
+ &pending->root_key, BTRFS_FT_DIR, index);
+
+ if (ret)
+ goto fail;
+
+ btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
+ ret = btrfs_update_inode(trans, parent_root, parent_inode);
+ BUG_ON(ret);
+
+ /* add the backref first */
+ ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
+ pending->root_key.objectid,
+ BTRFS_ROOT_BACKREF_KEY,
+ parent_root->root_key.objectid,
+ parent_inode->i_ino, index, pending->name,
+ namelen);
+
+ BUG_ON(ret);
+
+ /* now add the forward ref */
+ ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
+ parent_root->root_key.objectid,
+ BTRFS_ROOT_REF_KEY,
+ pending->root_key.objectid,
+ parent_inode->i_ino, index, pending->name,
+ namelen);
+
+ inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
+ d_instantiate(pending->dentry, inode);
+fail:
+ btrfs_end_transaction(trans, fs_info->fs_root);
+ return ret;
+}
+
+/*
+ * create all the snapshots we've scheduled for creation
+ */
+static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_pending_snapshot *pending;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+ struct list_head *cur;
+ int ret;
+
+ list_for_each(cur, head) {
+ pending = list_entry(cur, struct btrfs_pending_snapshot, list);
+ ret = create_pending_snapshot(trans, fs_info, pending);
+ BUG_ON(ret);
+ }
+ return 0;
+}
+
+static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_pending_snapshot *pending;
+ struct list_head *head = &trans->transaction->pending_snapshots;
+ int ret;
+
+ while (!list_empty(head)) {
+ pending = list_entry(head->next,
+ struct btrfs_pending_snapshot, list);
+ ret = finish_pending_snapshot(fs_info, pending);
+ BUG_ON(ret);
+ list_del(&pending->list);
+ kfree(pending->name);
+ kfree(pending);
+ }
+ return 0;
+}
+
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ unsigned long joined = 0;
+ unsigned long timeout = 1;
+ struct btrfs_transaction *cur_trans;
+ struct btrfs_transaction *prev_trans = NULL;
+ struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+ struct list_head dirty_fs_roots;
+ struct extent_io_tree *pinned_copy;
+ DEFINE_WAIT(wait);
+ int ret;
+
+ INIT_LIST_HEAD(&dirty_fs_roots);
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (trans->transaction->in_commit) {
+ cur_trans = trans->transaction;
+ trans->transaction->use_count++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ btrfs_end_transaction(trans, root);
+
+ ret = wait_for_commit(root, cur_trans);
+ BUG_ON(ret);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ put_transaction(cur_trans);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ return 0;
+ }
+
+ pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
+ if (!pinned_copy)
+ return -ENOMEM;
+
+ extent_io_tree_init(pinned_copy,
+ root->fs_info->btree_inode->i_mapping, GFP_NOFS);
+
+ trans->transaction->in_commit = 1;
+ trans->transaction->blocked = 1;
+ cur_trans = trans->transaction;
+ if (cur_trans->list.prev != &root->fs_info->trans_list) {
+ prev_trans = list_entry(cur_trans->list.prev,
+ struct btrfs_transaction, list);
+ if (!prev_trans->commit_done) {
+ prev_trans->use_count++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ wait_for_commit(root, prev_trans);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ put_transaction(prev_trans);
+ }
+ }
+
+ do {
+ int snap_pending = 0;
+ joined = cur_trans->num_joined;
+ if (!list_empty(&trans->transaction->pending_snapshots))
+ snap_pending = 1;
+
+ WARN_ON(cur_trans != trans->transaction);
+ prepare_to_wait(&cur_trans->writer_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (cur_trans->num_writers > 1)
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ else
+ timeout = 1;
+
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ if (snap_pending) {
+ ret = btrfs_wait_ordered_extents(root, 1);
+ BUG_ON(ret);
+ }
+
+ schedule_timeout(timeout);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ finish_wait(&cur_trans->writer_wait, &wait);
+ } while (cur_trans->num_writers > 1 ||
+ (cur_trans->num_joined != joined));
+
+ ret = create_pending_snapshots(trans, root->fs_info);
+ BUG_ON(ret);
+
+ WARN_ON(cur_trans != trans->transaction);
+
+ /* btrfs_commit_tree_roots is responsible for getting the
+ * various roots consistent with each other. Every pointer
+ * in the tree of tree roots has to point to the most up to date
+ * root for every subvolume and other tree. So, we have to keep
+ * the tree logging code from jumping in and changing any
+ * of the trees.
+ *
+ * At this point in the commit, there can't be any tree-log
+ * writers, but a little lower down we drop the trans mutex
+ * and let new people in. By holding the tree_log_mutex
+ * from now until after the super is written, we avoid races
+ * with the tree-log code.
+ */
+ mutex_lock(&root->fs_info->tree_log_mutex);
+ /*
+ * keep tree reloc code from adding new reloc trees
+ */
+ mutex_lock(&root->fs_info->tree_reloc_mutex);
+
+
+ ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
+ &dirty_fs_roots);
+ BUG_ON(ret);
+
+ /* add_dirty_roots gets rid of all the tree log roots, it is now
+ * safe to free the root of tree log roots
+ */
+ btrfs_free_log_root_tree(trans, root->fs_info);
+
+ ret = btrfs_commit_tree_roots(trans, root);
+ BUG_ON(ret);
+
+ cur_trans = root->fs_info->running_transaction;
+ spin_lock(&root->fs_info->new_trans_lock);
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->new_trans_lock);
+ btrfs_set_super_generation(&root->fs_info->super_copy,
+ cur_trans->transid);
+ btrfs_set_super_root(&root->fs_info->super_copy,
+ root->fs_info->tree_root->node->start);
+ btrfs_set_super_root_level(&root->fs_info->super_copy,
+ btrfs_header_level(root->fs_info->tree_root->node));
+
+ btrfs_set_super_chunk_root(&root->fs_info->super_copy,
+ chunk_root->node->start);
+ btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
+ btrfs_header_level(chunk_root->node));
+ btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
+ btrfs_header_generation(chunk_root->node));
+
+ if (!root->fs_info->log_root_recovering) {
+ btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
+ btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
+ }
+
+ memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
+ sizeof(root->fs_info->super_copy));
+
+ btrfs_copy_pinned(root, pinned_copy);
+
+ trans->transaction->blocked = 0;
+ wake_up(&root->fs_info->transaction_throttle);
+ wake_up(&root->fs_info->transaction_wait);
+
+ mutex_unlock(&root->fs_info->trans_mutex);
+ ret = btrfs_write_and_wait_transaction(trans, root);
+ BUG_ON(ret);
+ write_ctree_super(trans, root, 0);
+
+ /*
+ * the super is written, we can safely allow the tree-loggers
+ * to go about their business
+ */
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+
+ btrfs_finish_extent_commit(trans, root, pinned_copy);
+ kfree(pinned_copy);
+
+ btrfs_drop_dead_reloc_roots(root);
+ mutex_unlock(&root->fs_info->tree_reloc_mutex);
+
+ /* do the directory inserts of any pending snapshot creations */
+ finish_pending_snapshots(trans, root->fs_info);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+
+ cur_trans->commit_done = 1;
+ root->fs_info->last_trans_committed = cur_trans->transid;
+ wake_up(&cur_trans->commit_wait);
+
+ put_transaction(cur_trans);
+ put_transaction(cur_trans);
+
+ list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
+ if (root->fs_info->closing)
+ list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
+
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+ if (root->fs_info->closing)
+ drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
+ return ret;
+}
+
+/*
+ * interface function to delete all the snapshots we have scheduled for deletion
+ */
+int btrfs_clean_old_snapshots(struct btrfs_root *root)
+{
+ struct list_head dirty_roots;
+ INIT_LIST_HEAD(&dirty_roots);
+again:
+ mutex_lock(&root->fs_info->trans_mutex);
+ list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ if (!list_empty(&dirty_roots)) {
+ drop_dirty_roots(root, &dirty_roots);
+ goto again;
+ }
+ return 0;
+}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 00000000000..ea292117f88
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_TRANSACTION__
+#define __BTRFS_TRANSACTION__
+#include "btrfs_inode.h"
+
+struct btrfs_transaction {
+ u64 transid;
+ unsigned long num_writers;
+ unsigned long num_joined;
+ int in_commit;
+ int use_count;
+ int commit_done;
+ int blocked;
+ struct list_head list;
+ struct extent_io_tree dirty_pages;
+ unsigned long start_time;
+ wait_queue_head_t writer_wait;
+ wait_queue_head_t commit_wait;
+ struct list_head pending_snapshots;
+};
+
+struct btrfs_trans_handle {
+ u64 transid;
+ unsigned long blocks_reserved;
+ unsigned long blocks_used;
+ struct btrfs_transaction *transaction;
+ u64 block_group;
+ u64 alloc_exclude_start;
+ u64 alloc_exclude_nr;
+};
+
+struct btrfs_pending_snapshot {
+ struct dentry *dentry;
+ struct btrfs_root *root;
+ char *name;
+ struct btrfs_key root_key;
+ struct list_head list;
+};
+
+struct btrfs_dirty_root {
+ struct list_head list;
+ struct btrfs_root *root;
+ struct btrfs_root *latest_root;
+};
+
+static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ trans->block_group = BTRFS_I(inode)->block_group;
+}
+
+static inline void btrfs_update_inode_block_group(
+ struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ BTRFS_I(inode)->block_group = trans->block_group;
+}
+
+static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ BTRFS_I(inode)->last_trans = trans->transaction->transid;
+}
+
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+ int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+ int num_blocks);
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
+ int num_blocks);
+int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+
+int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
+int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
+int btrfs_clean_old_snapshots(struct btrfs_root *root);
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+void btrfs_throttle(struct btrfs_root *root);
+int btrfs_record_root_in_trans(struct btrfs_root *root);
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages);
+#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 00000000000..3e8358c3616
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "locking.h"
+
+/* defrag all the leaves in a given btree. If cache_only == 1, don't read
+ * things from disk, otherwise read all the leaves and try to get key order to
+ * better reflect disk order
+ */
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int cache_only)
+{
+ struct btrfs_path *path = NULL;
+ struct btrfs_key key;
+ int ret = 0;
+ int wret;
+ int level;
+ int orig_level;
+ int is_extent = 0;
+ int next_key_ret = 0;
+ u64 last_ret = 0;
+ u64 min_trans = 0;
+
+ if (cache_only)
+ goto out;
+
+ if (root->fs_info->extent_root == root) {
+ /*
+ * there's recursion here right now in the tree locking,
+ * we can't defrag the extent root without deadlock
+ */
+ goto out;
+ }
+
+ if (root->ref_cows == 0 && !is_extent)
+ goto out;
+
+ if (btrfs_test_opt(root, SSD))
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ level = btrfs_header_level(root->node);
+ orig_level = level;
+
+ if (level == 0)
+ goto out;
+
+ if (root->defrag_progress.objectid == 0) {
+ struct extent_buffer *root_node;
+ u32 nritems;
+
+ root_node = btrfs_lock_root_node(root);
+ nritems = btrfs_header_nritems(root_node);
+ root->defrag_max.objectid = 0;
+ /* from above we know this is not a leaf */
+ btrfs_node_key_to_cpu(root_node, &root->defrag_max,
+ nritems - 1);
+ btrfs_tree_unlock(root_node);
+ free_extent_buffer(root_node);
+ memset(&key, 0, sizeof(key));
+ } else {
+ memcpy(&key, &root->defrag_progress, sizeof(key));
+ }
+
+ path->keep_locks = 1;
+ if (cache_only)
+ min_trans = root->defrag_trans_start;
+
+ ret = btrfs_search_forward(root, &key, NULL, path,
+ cache_only, min_trans);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+ btrfs_release_path(root, path);
+ wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+ if (!path->nodes[1]) {
+ ret = 0;
+ goto out;
+ }
+ path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
+ min_trans);
+ ret = btrfs_realloc_node(trans, root,
+ path->nodes[1], 0,
+ cache_only, &last_ret,
+ &root->defrag_progress);
+ WARN_ON(ret && ret != -EAGAIN);
+ if (next_key_ret == 0) {
+ memcpy(&root->defrag_progress, &key, sizeof(key));
+ ret = -EAGAIN;
+ }
+
+ btrfs_release_path(root, path);
+ if (is_extent)
+ btrfs_extent_post_op(trans, root);
+out:
+ if (path)
+ btrfs_free_path(path);
+ if (ret == -EAGAIN) {
+ if (root->defrag_max.objectid > root->defrag_progress.objectid)
+ goto done;
+ if (root->defrag_max.type > root->defrag_progress.type)
+ goto done;
+ if (root->defrag_max.offset > root->defrag_progress.offset)
+ goto done;
+ ret = 0;
+ }
+done:
+ if (ret != -EAGAIN) {
+ memset(&root->defrag_progress, 0,
+ sizeof(root->defrag_progress));
+ root->defrag_trans_start = trans->transid;
+ }
+ return ret;
+}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 00000000000..d81cda2e077
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2898 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "print-tree.h"
+#include "compat.h"
+#include "tree-log.h"
+
+/* magic values for the inode_only field in btrfs_log_inode:
+ *
+ * LOG_INODE_ALL means to log everything
+ * LOG_INODE_EXISTS means to log just enough to recreate the inode
+ * during log replay
+ */
+#define LOG_INODE_ALL 0
+#define LOG_INODE_EXISTS 1
+
+/*
+ * stages for the tree walking. The first
+ * stage (0) is to only pin down the blocks we find
+ * the second stage (1) is to make sure that all the inodes
+ * we find in the log are created in the subvolume.
+ *
+ * The last stage is to deal with directories and links and extents
+ * and all the other fun semantics
+ */
+#define LOG_WALK_PIN_ONLY 0
+#define LOG_WALK_REPLAY_INODES 1
+#define LOG_WALK_REPLAY_ALL 2
+
+static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only);
+static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
+
+/*
+ * tree logging is a special write ahead log used to make sure that
+ * fsyncs and O_SYNCs can happen without doing full tree commits.
+ *
+ * Full tree commits are expensive because they require commonly
+ * modified blocks to be recowed, creating many dirty pages in the
+ * extent tree an 4x-6x higher write load than ext3.
+ *
+ * Instead of doing a tree commit on every fsync, we use the
+ * key ranges and transaction ids to find items for a given file or directory
+ * that have changed in this transaction. Those items are copied into
+ * a special tree (one per subvolume root), that tree is written to disk
+ * and then the fsync is considered complete.
+ *
+ * After a crash, items are copied out of the log-tree back into the
+ * subvolume tree. Any file data extents found are recorded in the extent
+ * allocation tree, and the log-tree freed.
+ *
+ * The log tree is read three times, once to pin down all the extents it is
+ * using in ram and once, once to create all the inodes logged in the tree
+ * and once to do all the other items.
+ */
+
+/*
+ * btrfs_add_log_tree adds a new per-subvolume log tree into the
+ * tree of log tree roots. This must be called with a tree log transaction
+ * running (see start_log_trans).
+ */
+static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_key key;
+ struct btrfs_root_item root_item;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ struct btrfs_root *new_root = root;
+ int ret;
+ u64 objectid = root->root_key.objectid;
+
+ leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+ BTRFS_TREE_LOG_OBJECTID,
+ trans->transid, 0, 0, 0);
+ if (IS_ERR(leaf)) {
+ ret = PTR_ERR(leaf);
+ return ret;
+ }
+
+ btrfs_set_header_nritems(leaf, 0);
+ btrfs_set_header_level(leaf, 0);
+ btrfs_set_header_bytenr(leaf, leaf->start);
+ btrfs_set_header_generation(leaf, trans->transid);
+ btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
+
+ write_extent_buffer(leaf, root->fs_info->fsid,
+ (unsigned long)btrfs_header_fsid(leaf),
+ BTRFS_FSID_SIZE);
+ btrfs_mark_buffer_dirty(leaf);
+
+ inode_item = &root_item.inode;
+ memset(inode_item, 0, sizeof(*inode_item));
+ inode_item->generation = cpu_to_le64(1);
+ inode_item->size = cpu_to_le64(3);
+ inode_item->nlink = cpu_to_le32(1);
+ inode_item->nbytes = cpu_to_le64(root->leafsize);
+ inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+
+ btrfs_set_root_bytenr(&root_item, leaf->start);
+ btrfs_set_root_generation(&root_item, trans->transid);
+ btrfs_set_root_level(&root_item, 0);
+ btrfs_set_root_refs(&root_item, 0);
+ btrfs_set_root_used(&root_item, 0);
+
+ memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
+ root_item.drop_level = 0;
+
+ btrfs_tree_unlock(leaf);
+ free_extent_buffer(leaf);
+ leaf = NULL;
+
+ btrfs_set_root_dirid(&root_item, 0);
+
+ key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ key.offset = objectid;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
+ &root_item);
+ if (ret)
+ goto fail;
+
+ new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
+ &key);
+ BUG_ON(!new_root);
+
+ WARN_ON(root->log_root);
+ root->log_root = new_root;
+
+ /*
+ * log trees do not get reference counted because they go away
+ * before a real commit is actually done. They do store pointers
+ * to file data extents, and those reference counts still get
+ * updated (along with back refs to the log tree).
+ */
+ new_root->ref_cows = 0;
+ new_root->last_trans = trans->transid;
+
+ /*
+ * we need to make sure the root block for this new tree
+ * is marked as dirty in the dirty_log_pages tree. This
+ * is how it gets flushed down to disk at tree log commit time.
+ *
+ * the tree logging mutex keeps others from coming in and changing
+ * the new_root->node, so we can safely access it here
+ */
+ set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
+ new_root->node->start + new_root->node->len - 1,
+ GFP_NOFS);
+
+fail:
+ return ret;
+}
+
+/*
+ * start a sub transaction and setup the log tree
+ * this increments the log tree writer count to make the people
+ * syncing the tree wait for us to finish
+ */
+static int start_log_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+ mutex_lock(&root->fs_info->tree_log_mutex);
+ if (!root->fs_info->log_root_tree) {
+ ret = btrfs_init_log_root_tree(trans, root->fs_info);
+ BUG_ON(ret);
+ }
+ if (!root->log_root) {
+ ret = btrfs_add_log_tree(trans, root);
+ BUG_ON(ret);
+ }
+ atomic_inc(&root->fs_info->tree_log_writers);
+ root->fs_info->tree_log_batch++;
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ return 0;
+}
+
+/*
+ * returns 0 if there was a log transaction running and we were able
+ * to join, or returns -ENOENT if there were not transactions
+ * in progress
+ */
+static int join_running_log_trans(struct btrfs_root *root)
+{
+ int ret = -ENOENT;
+
+ smp_mb();
+ if (!root->log_root)
+ return -ENOENT;
+
+ mutex_lock(&root->fs_info->tree_log_mutex);
+ if (root->log_root) {
+ ret = 0;
+ atomic_inc(&root->fs_info->tree_log_writers);
+ root->fs_info->tree_log_batch++;
+ }
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ return ret;
+}
+
+/*
+ * indicate we're done making changes to the log tree
+ * and wake up anyone waiting to do a sync
+ */
+static int end_log_trans(struct btrfs_root *root)
+{
+ atomic_dec(&root->fs_info->tree_log_writers);
+ smp_mb();
+ if (waitqueue_active(&root->fs_info->tree_log_wait))
+ wake_up(&root->fs_info->tree_log_wait);
+ return 0;
+}
+
+
+/*
+ * the walk control struct is used to pass state down the chain when
+ * processing the log tree. The stage field tells us which part
+ * of the log tree processing we are currently doing. The others
+ * are state fields used for that specific part
+ */
+struct walk_control {
+ /* should we free the extent on disk when done? This is used
+ * at transaction commit time while freeing a log tree
+ */
+ int free;
+
+ /* should we write out the extent buffer? This is used
+ * while flushing the log tree to disk during a sync
+ */
+ int write;
+
+ /* should we wait for the extent buffer io to finish? Also used
+ * while flushing the log tree to disk for a sync
+ */
+ int wait;
+
+ /* pin only walk, we record which extents on disk belong to the
+ * log trees
+ */
+ int pin;
+
+ /* what stage of the replay code we're currently in */
+ int stage;
+
+ /* the root we are currently replaying */
+ struct btrfs_root *replay_dest;
+
+ /* the trans handle for the current replay */
+ struct btrfs_trans_handle *trans;
+
+ /* the function that gets used to process blocks we find in the
+ * tree. Note the extent_buffer might not be up to date when it is
+ * passed in, and it must be checked or read if you need the data
+ * inside it
+ */
+ int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen);
+};
+
+/*
+ * process_func used to pin down extents, write them or wait on them
+ */
+static int process_one_buffer(struct btrfs_root *log,
+ struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen)
+{
+ if (wc->pin) {
+ mutex_lock(&log->fs_info->pinned_mutex);
+ btrfs_update_pinned_extents(log->fs_info->extent_root,
+ eb->start, eb->len, 1);
+ mutex_unlock(&log->fs_info->pinned_mutex);
+ }
+
+ if (btrfs_buffer_uptodate(eb, gen)) {
+ if (wc->write)
+ btrfs_write_tree_block(eb);
+ if (wc->wait)
+ btrfs_wait_tree_block_writeback(eb);
+ }
+ return 0;
+}
+
+/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static noinline int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+ u32 item_size;
+ u64 saved_i_size = 0;
+ int save_old_i_size = 0;
+ unsigned long src_ptr;
+ unsigned long dst_ptr;
+ int overwrite_root = 0;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ overwrite_root = 1;
+
+ item_size = btrfs_item_size_nr(eb, slot);
+ src_ptr = btrfs_item_ptr_offset(eb, slot);
+
+ /* look for the key in the destination tree */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret == 0) {
+ char *src_copy;
+ char *dst_copy;
+ u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ if (dst_size != item_size)
+ goto insert;
+
+ if (item_size == 0) {
+ btrfs_release_path(root, path);
+ return 0;
+ }
+ dst_copy = kmalloc(item_size, GFP_NOFS);
+ src_copy = kmalloc(item_size, GFP_NOFS);
+
+ read_extent_buffer(eb, src_copy, src_ptr, item_size);
+
+ dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
+ item_size);
+ ret = memcmp(dst_copy, src_copy, item_size);
+
+ kfree(dst_copy);
+ kfree(src_copy);
+ /*
+ * they have the same contents, just return, this saves
+ * us from cowing blocks in the destination tree and doing
+ * extra writes that may not have been done by a previous
+ * sync
+ */
+ if (ret == 0) {
+ btrfs_release_path(root, path);
+ return 0;
+ }
+
+ }
+insert:
+ btrfs_release_path(root, path);
+ /* try to insert the key into the destination tree */
+ ret = btrfs_insert_empty_item(trans, root, path,
+ key, item_size);
+
+ /* make sure any existing item is the correct size */
+ if (ret == -EEXIST) {
+ u32 found_size;
+ found_size = btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ if (found_size > item_size) {
+ btrfs_truncate_item(trans, root, path, item_size, 1);
+ } else if (found_size < item_size) {
+ ret = btrfs_extend_item(trans, root, path,
+ item_size - found_size);
+ BUG_ON(ret);
+ }
+ } else if (ret) {
+ BUG();
+ }
+ dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+
+ /* don't overwrite an existing inode if the generation number
+ * was logged as zero. This is done when the tree logging code
+ * is just logging an inode to make sure it exists after recovery.
+ *
+ * Also, don't overwrite i_size on directories during replay.
+ * log replay inserts and removes directory items based on the
+ * state of the tree found in the subvolume, and i_size is modified
+ * as it goes
+ */
+ if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+ struct btrfs_inode_item *src_item;
+ struct btrfs_inode_item *dst_item;
+
+ src_item = (struct btrfs_inode_item *)src_ptr;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+
+ if (btrfs_inode_generation(eb, src_item) == 0)
+ goto no_copy;
+
+ if (overwrite_root &&
+ S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+ S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+ save_old_i_size = 1;
+ saved_i_size = btrfs_inode_size(path->nodes[0],
+ dst_item);
+ }
+ }
+
+ copy_extent_buffer(path->nodes[0], eb, dst_ptr,
+ src_ptr, item_size);
+
+ if (save_old_i_size) {
+ struct btrfs_inode_item *dst_item;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+ btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+ }
+
+ /* make sure the generation is filled in */
+ if (key->type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *dst_item;
+ dst_item = (struct btrfs_inode_item *)dst_ptr;
+ if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
+ btrfs_set_inode_generation(path->nodes[0], dst_item,
+ trans->transid);
+ }
+ }
+no_copy:
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(root, path);
+ return 0;
+}
+
+/*
+ * simple helper to read an inode off the disk from a given root
+ * This can only be called for subvolume roots and not for the log
+ */
+static noinline struct inode *read_one_inode(struct btrfs_root *root,
+ u64 objectid)
+{
+ struct inode *inode;
+ inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
+ if (inode->i_state & I_NEW) {
+ BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->location.objectid = objectid;
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.offset = 0;
+ btrfs_read_locked_inode(inode);
+ unlock_new_inode(inode);
+
+ }
+ if (is_bad_inode(inode)) {
+ iput(inode);
+ inode = NULL;
+ }
+ return inode;
+}
+
+/* replays a single extent in 'eb' at 'slot' with 'key' into the
+ * subvolume 'root'. path is released on entry and should be released
+ * on exit.
+ *
+ * extents in the log tree have not been allocated out of the extent
+ * tree yet. So, this completes the allocation, taking a reference
+ * as required if the extent already exists or creating a new extent
+ * if it isn't in the extent allocation tree yet.
+ *
+ * The extent is inserted into the file, dropping any existing extents
+ * from the file that overlap the new one.
+ */
+static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int found_type;
+ u64 mask = root->sectorsize - 1;
+ u64 extent_end;
+ u64 alloc_hint;
+ u64 start = key->offset;
+ u64 saved_nbytes;
+ struct btrfs_file_extent_item *item;
+ struct inode *inode = NULL;
+ unsigned long size;
+ int ret = 0;
+
+ item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(eb, item);
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC)
+ extent_end = start + btrfs_file_extent_num_bytes(eb, item);
+ else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ size = btrfs_file_extent_inline_len(eb, item);
+ extent_end = (start + size + mask) & ~mask;
+ } else {
+ ret = 0;
+ goto out;
+ }
+
+ inode = read_one_inode(root, key->objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * first check to see if we already have this extent in the
+ * file. This must be done before the btrfs_drop_extents run
+ * so we don't try to drop this extent.
+ */
+ ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ start, 0);
+
+ if (ret == 0 &&
+ (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
+ struct btrfs_file_extent_item cmp1;
+ struct btrfs_file_extent_item cmp2;
+ struct btrfs_file_extent_item *existing;
+ struct extent_buffer *leaf;
+
+ leaf = path->nodes[0];
+ existing = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ read_extent_buffer(eb, &cmp1, (unsigned long)item,
+ sizeof(cmp1));
+ read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
+ sizeof(cmp2));
+
+ /*
+ * we already have a pointer to this exact extent,
+ * we don't have to do anything
+ */
+ if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
+ btrfs_release_path(root, path);
+ goto out;
+ }
+ }
+ btrfs_release_path(root, path);
+
+ saved_nbytes = inode_get_bytes(inode);
+ /* drop any overlapping extents */
+ ret = btrfs_drop_extents(trans, root, inode,
+ start, extent_end, start, &alloc_hint);
+ BUG_ON(ret);
+
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ unsigned long dest_offset;
+ struct btrfs_key ins;
+
+ ret = btrfs_insert_empty_item(trans, root, path, key,
+ sizeof(*item));
+ BUG_ON(ret);
+ dest_offset = btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ copy_extent_buffer(path->nodes[0], eb, dest_offset,
+ (unsigned long)item, sizeof(*item));
+
+ ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+ ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+ if (ins.objectid > 0) {
+ u64 csum_start;
+ u64 csum_end;
+ LIST_HEAD(ordered_sums);
+ /*
+ * is this extent already allocated in the extent
+ * allocation tree? If so, just add a reference
+ */
+ ret = btrfs_lookup_extent(root, ins.objectid,
+ ins.offset);
+ if (ret == 0) {
+ ret = btrfs_inc_extent_ref(trans, root,
+ ins.objectid, ins.offset,
+ path->nodes[0]->start,
+ root->root_key.objectid,
+ trans->transid, key->objectid);
+ } else {
+ /*
+ * insert the extent pointer in the extent
+ * allocation tree
+ */
+ ret = btrfs_alloc_logged_extent(trans, root,
+ path->nodes[0]->start,
+ root->root_key.objectid,
+ trans->transid, key->objectid,
+ &ins);
+ BUG_ON(ret);
+ }
+ btrfs_release_path(root, path);
+
+ if (btrfs_file_extent_compression(eb, item)) {
+ csum_start = ins.objectid;
+ csum_end = csum_start + ins.offset;
+ } else {
+ csum_start = ins.objectid +
+ btrfs_file_extent_offset(eb, item);
+ csum_end = csum_start +
+ btrfs_file_extent_num_bytes(eb, item);
+ }
+
+ ret = btrfs_lookup_csums_range(root->log_root,
+ csum_start, csum_end - 1,
+ &ordered_sums);
+ BUG_ON(ret);
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums;
+ sums = list_entry(ordered_sums.next,
+ struct btrfs_ordered_sum,
+ list);
+ ret = btrfs_csum_file_blocks(trans,
+ root->fs_info->csum_root,
+ sums);
+ BUG_ON(ret);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ } else {
+ btrfs_release_path(root, path);
+ }
+ } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ /* inline extents are easy, we just overwrite them */
+ ret = overwrite_item(trans, root, path, eb, slot, key);
+ BUG_ON(ret);
+ }
+
+ inode_set_bytes(inode, saved_nbytes);
+ btrfs_update_inode(trans, root, inode);
+out:
+ if (inode)
+ iput(inode);
+ return ret;
+}
+
+/*
+ * when cleaning up conflicts between the directory names in the
+ * subvolume, directory names in the log and directory names in the
+ * inode back references, we may have to unlink inodes from directories.
+ *
+ * This is a helper function to do the unlink of a specific directory
+ * item
+ */
+static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct inode *dir,
+ struct btrfs_dir_item *di)
+{
+ struct inode *inode;
+ char *name;
+ int name_len;
+ struct extent_buffer *leaf;
+ struct btrfs_key location;
+ int ret;
+
+ leaf = path->nodes[0];
+
+ btrfs_dir_item_key_to_cpu(leaf, di, &location);
+ name_len = btrfs_dir_name_len(leaf, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
+ btrfs_release_path(root, path);
+
+ inode = read_one_inode(root, location.objectid);
+ BUG_ON(!inode);
+
+ ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ BUG_ON(ret);
+ ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ BUG_ON(ret);
+ kfree(name);
+
+ iput(inode);
+ return ret;
+}
+
+/*
+ * helper function to see if a given name and sequence number found
+ * in an inode back reference are already in a directory and correctly
+ * point to this inode
+ */
+static noinline int inode_in_dir(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 dirid, u64 objectid, u64 index,
+ const char *name, int name_len)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_key location;
+ int match = 0;
+
+ di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
+ index, name, name_len, 0);
+ if (di && !IS_ERR(di)) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid != objectid)
+ goto out;
+ } else
+ goto out;
+ btrfs_release_path(root, path);
+
+ di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
+ if (di && !IS_ERR(di)) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid != objectid)
+ goto out;
+ } else
+ goto out;
+ match = 1;
+out:
+ btrfs_release_path(root, path);
+ return match;
+}
+
+/*
+ * helper function to check a log tree for a named back reference in
+ * an inode. This is used to decide if a back reference that is
+ * found in the subvolume conflicts with what we find in the log.
+ *
+ * inode backreferences may have multiple refs in a single item,
+ * during replay we process one reference at a time, and we don't
+ * want to delete valid links to a file from the subvolume if that
+ * link is also in the log.
+ */
+static noinline int backref_in_log(struct btrfs_root *log,
+ struct btrfs_key *key,
+ char *name, int namelen)
+{
+ struct btrfs_path *path;
+ struct btrfs_inode_ref *ref;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ unsigned long name_ptr;
+ int found_name_len;
+ int item_size;
+ int ret;
+ int match = 0;
+
+ path = btrfs_alloc_path();
+ ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
+ if (ret != 0)
+ goto out;
+
+ item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ ptr_end = ptr + item_size;
+ while (ptr < ptr_end) {
+ ref = (struct btrfs_inode_ref *)ptr;
+ found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
+ if (found_name_len == namelen) {
+ name_ptr = (unsigned long)(ref + 1);
+ ret = memcmp_extent_buffer(path->nodes[0], name,
+ name_ptr, namelen);
+ if (ret == 0) {
+ match = 1;
+ goto out;
+ }
+ }
+ ptr = (unsigned long)(ref + 1) + found_name_len;
+ }
+out:
+ btrfs_free_path(path);
+ return match;
+}
+
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function. (it should be released on return).
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ struct inode *dir;
+ int ret;
+ struct btrfs_key location;
+ struct btrfs_inode_ref *ref;
+ struct btrfs_dir_item *di;
+ struct inode *inode;
+ char *name;
+ int namelen;
+ unsigned long ref_ptr;
+ unsigned long ref_end;
+
+ location.objectid = key->objectid;
+ location.type = BTRFS_INODE_ITEM_KEY;
+ location.offset = 0;
+
+ /*
+ * it is possible that we didn't log all the parent directories
+ * for a given inode. If we don't find the dir, just don't
+ * copy the back ref in. The link count fixup code will take
+ * care of the rest
+ */
+ dir = read_one_inode(root, key->offset);
+ if (!dir)
+ return -ENOENT;
+
+ inode = read_one_inode(root, key->objectid);
+ BUG_ON(!dir);
+
+ ref_ptr = btrfs_item_ptr_offset(eb, slot);
+ ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+again:
+ ref = (struct btrfs_inode_ref *)ref_ptr;
+
+ namelen = btrfs_inode_ref_name_len(eb, ref);
+ name = kmalloc(namelen, GFP_NOFS);
+ BUG_ON(!name);
+
+ read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
+
+ /* if we already have a perfect match, we're done */
+ if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
+ btrfs_inode_ref_index(eb, ref),
+ name, namelen)) {
+ goto out;
+ }
+
+ /*
+ * look for a conflicting back reference in the metadata.
+ * if we find one we have to unlink that name of the file
+ * before we add our new link. Later on, we overwrite any
+ * existing back reference, and we don't want to create
+ * dangling pointers in the directory.
+ */
+conflict_again:
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret == 0) {
+ char *victim_name;
+ int victim_name_len;
+ struct btrfs_inode_ref *victim_ref;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ struct extent_buffer *leaf = path->nodes[0];
+
+ /* are we trying to overwrite a back ref for the root directory
+ * if so, just jump out, we're done
+ */
+ if (key->objectid == key->offset)
+ goto out_nowrite;
+
+ /* check all the names in this back reference to see
+ * if they are in the log. if so, we allow them to stay
+ * otherwise they must be unlinked as a conflict
+ */
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+ while (ptr < ptr_end) {
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ victim_name_len = btrfs_inode_ref_name_len(leaf,
+ victim_ref);
+ victim_name = kmalloc(victim_name_len, GFP_NOFS);
+ BUG_ON(!victim_name);
+
+ read_extent_buffer(leaf, victim_name,
+ (unsigned long)(victim_ref + 1),
+ victim_name_len);
+
+ if (!backref_in_log(log, key, victim_name,
+ victim_name_len)) {
+ btrfs_inc_nlink(inode);
+ btrfs_release_path(root, path);
+ ret = btrfs_unlink_inode(trans, root, dir,
+ inode, victim_name,
+ victim_name_len);
+ kfree(victim_name);
+ btrfs_release_path(root, path);
+ goto conflict_again;
+ }
+ kfree(victim_name);
+ ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+ }
+ BUG_ON(ret);
+ }
+ btrfs_release_path(root, path);
+
+ /* look for a conflicting sequence number */
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+ btrfs_inode_ref_index(eb, ref),
+ name, namelen, 0);
+ if (di && !IS_ERR(di)) {
+ ret = drop_one_dir_item(trans, root, path, dir, di);
+ BUG_ON(ret);
+ }
+ btrfs_release_path(root, path);
+
+
+ /* look for a conflicting name */
+ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ name, namelen, 0);
+ if (di && !IS_ERR(di)) {
+ ret = drop_one_dir_item(trans, root, path, dir, di);
+ BUG_ON(ret);
+ }
+ btrfs_release_path(root, path);
+
+ /* insert our name */
+ ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
+ btrfs_inode_ref_index(eb, ref));
+ BUG_ON(ret);
+
+ btrfs_update_inode(trans, root, inode);
+
+out:
+ ref_ptr = (unsigned long)(ref + 1) + namelen;
+ kfree(name);
+ if (ref_ptr < ref_end)
+ goto again;
+
+ /* finally write the back reference in the inode */
+ ret = overwrite_item(trans, root, path, eb, slot, key);
+ BUG_ON(ret);
+
+out_nowrite:
+ btrfs_release_path(root, path);
+ iput(dir);
+ iput(inode);
+ return 0;
+}
+
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay. So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found. If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct btrfs_key key;
+ u64 nlink = 0;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ int name_len;
+
+ key.objectid = inode->i_ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid != inode->i_ino ||
+ key.type != BTRFS_INODE_REF_KEY)
+ break;
+ ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+ path->slots[0]);
+ while (ptr < ptr_end) {
+ struct btrfs_inode_ref *ref;
+
+ ref = (struct btrfs_inode_ref *)ptr;
+ name_len = btrfs_inode_ref_name_len(path->nodes[0],
+ ref);
+ ptr = (unsigned long)(ref + 1) + name_len;
+ nlink++;
+ }
+
+ if (key.offset == 0)
+ break;
+ key.offset--;
+ btrfs_release_path(root, path);
+ }
+ btrfs_free_path(path);
+ if (nlink != inode->i_nlink) {
+ inode->i_nlink = nlink;
+ btrfs_update_inode(trans, root, inode);
+ }
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
+ return 0;
+}
+
+static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ struct inode *inode;
+
+ key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = (u64)-1;
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ break;
+
+ if (ret == 1) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
+ key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ ret = btrfs_del_item(trans, root, path);
+ BUG_ON(ret);
+
+ btrfs_release_path(root, path);
+ inode = read_one_inode(root, key.offset);
+ BUG_ON(!inode);
+
+ ret = fixup_inode_link_count(trans, root, inode);
+ BUG_ON(ret);
+
+ iput(inode);
+
+ if (key.offset == 0)
+ break;
+ key.offset--;
+ }
+ btrfs_release_path(root, path);
+ return 0;
+}
+
+
+/*
+ * record a given inode in the fixup dir so we can check its link
+ * count when replay is done. The link count is incremented here
+ * so the inode won't go away until we check it
+ */
+static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid)
+{
+ struct btrfs_key key;
+ int ret = 0;
+ struct inode *inode;
+
+ inode = read_one_inode(root, objectid);
+ BUG_ON(!inode);
+
+ key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+ btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+ key.offset = objectid;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+ btrfs_release_path(root, path);
+ if (ret == 0) {
+ btrfs_inc_nlink(inode);
+ btrfs_update_inode(trans, root, inode);
+ } else if (ret == -EEXIST) {
+ ret = 0;
+ } else {
+ BUG();
+ }
+ iput(inode);
+
+ return ret;
+}
+
+/*
+ * when replaying the log for a directory, we only insert names
+ * for inodes that actually exist. This means an fsync on a directory
+ * does not implicitly fsync all the new files in it
+ */
+static noinline int insert_one_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 dirid, u64 index,
+ char *name, int name_len, u8 type,
+ struct btrfs_key *location)
+{
+ struct inode *inode;
+ struct inode *dir;
+ int ret;
+
+ inode = read_one_inode(root, location->objectid);
+ if (!inode)
+ return -ENOENT;
+
+ dir = read_one_inode(root, dirid);
+ if (!dir) {
+ iput(inode);
+ return -EIO;
+ }
+ ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
+
+ /* FIXME, put inode into FIXUP list */
+
+ iput(inode);
+ iput(dir);
+ return ret;
+}
+
+/*
+ * take a single entry in a log directory item and replay it into
+ * the subvolume.
+ *
+ * if a conflicting item exists in the subdirectory already,
+ * the inode it points to is unlinked and put into the link count
+ * fix up tree.
+ *
+ * If a name from the log points to a file or directory that does
+ * not exist in the FS, it is skipped. fsyncs on directories
+ * do not force down inodes inside that directory, just changes to the
+ * names or unlinks in a directory.
+ */
+static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ struct btrfs_dir_item *di,
+ struct btrfs_key *key)
+{
+ char *name;
+ int name_len;
+ struct btrfs_dir_item *dst_di;
+ struct btrfs_key found_key;
+ struct btrfs_key log_key;
+ struct inode *dir;
+ u8 log_type;
+ int exists;
+ int ret;
+
+ dir = read_one_inode(root, key->objectid);
+ BUG_ON(!dir);
+
+ name_len = btrfs_dir_name_len(eb, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ log_type = btrfs_dir_type(eb, di);
+ read_extent_buffer(eb, name, (unsigned long)(di + 1),
+ name_len);
+
+ btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+ exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
+ if (exists == 0)
+ exists = 1;
+ else
+ exists = 0;
+ btrfs_release_path(root, path);
+
+ if (key->type == BTRFS_DIR_ITEM_KEY) {
+ dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+ name, name_len, 1);
+ } else if (key->type == BTRFS_DIR_INDEX_KEY) {
+ dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+ key->objectid,
+ key->offset, name,
+ name_len, 1);
+ } else {
+ BUG();
+ }
+ if (!dst_di || IS_ERR(dst_di)) {
+ /* we need a sequence number to insert, so we only
+ * do inserts for the BTRFS_DIR_INDEX_KEY types
+ */
+ if (key->type != BTRFS_DIR_INDEX_KEY)
+ goto out;
+ goto insert;
+ }
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+ /* the existing item matches the logged item */
+ if (found_key.objectid == log_key.objectid &&
+ found_key.type == log_key.type &&
+ found_key.offset == log_key.offset &&
+ btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+ goto out;
+ }
+
+ /*
+ * don't drop the conflicting directory entry if the inode
+ * for the new entry doesn't exist
+ */
+ if (!exists)
+ goto out;
+
+ ret = drop_one_dir_item(trans, root, path, dir, dst_di);
+ BUG_ON(ret);
+
+ if (key->type == BTRFS_DIR_INDEX_KEY)
+ goto insert;
+out:
+ btrfs_release_path(root, path);
+ kfree(name);
+ iput(dir);
+ return 0;
+
+insert:
+ btrfs_release_path(root, path);
+ ret = insert_one_name(trans, root, path, key->objectid, key->offset,
+ name, name_len, log_type, &log_key);
+
+ if (ret && ret != -ENOENT)
+ BUG();
+ goto out;
+}
+
+/*
+ * find all the names in a directory item and reconcile them into
+ * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
+ * one name in a directory item, but the same code gets used for
+ * both directory index types
+ */
+static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+ u32 item_size = btrfs_item_size_nr(eb, slot);
+ struct btrfs_dir_item *di;
+ int name_len;
+ unsigned long ptr;
+ unsigned long ptr_end;
+
+ ptr = btrfs_item_ptr_offset(eb, slot);
+ ptr_end = ptr + item_size;
+ while (ptr < ptr_end) {
+ di = (struct btrfs_dir_item *)ptr;
+ name_len = btrfs_dir_name_len(eb, di);
+ ret = replay_one_name(trans, root, path, eb, di, key);
+ BUG_ON(ret);
+ ptr = (unsigned long)(di + 1);
+ ptr += name_len;
+ }
+ return 0;
+}
+
+/*
+ * directory replay has two parts. There are the standard directory
+ * items in the log copied from the subvolume, and range items
+ * created in the log while the subvolume was logged.
+ *
+ * The range items tell us which parts of the key space the log
+ * is authoritative for. During replay, if a key in the subvolume
+ * directory is in a logged range item, but not actually in the log
+ * that means it was deleted from the directory before the fsync
+ * and should be removed.
+ */
+static noinline int find_dir_range(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 dirid, int key_type,
+ u64 *start_ret, u64 *end_ret)
+{
+ struct btrfs_key key;
+ u64 found_end;
+ struct btrfs_dir_log_item *item;
+ int ret;
+ int nritems;
+
+ if (*start_ret == (u64)-1)
+ return 1;
+
+ key.objectid = dirid;
+ key.type = key_type;
+ key.offset = *start_ret;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+ if (ret != 0)
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != key_type || key.objectid != dirid) {
+ ret = 1;
+ goto next;
+ }
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ found_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ if (*start_ret >= key.offset && *start_ret <= found_end) {
+ ret = 0;
+ *start_ret = key.offset;
+ *end_ret = found_end;
+ goto out;
+ }
+ ret = 1;
+next:
+ /* check the next slot in the tree to see if it is a valid item */
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ goto out;
+ } else {
+ path->slots[0]++;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != key_type || key.objectid != dirid) {
+ ret = 1;
+ goto out;
+ }
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ found_end = btrfs_dir_log_end(path->nodes[0], item);
+ *start_ret = key.offset;
+ *end_ret = found_end;
+ ret = 0;
+out:
+ btrfs_release_path(root, path);
+ return ret;
+}
+
+/*
+ * this looks for a given directory item in the log. If the directory
+ * item is not in the log, the item is removed and the inode it points
+ * to is unlinked
+ */
+static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ struct btrfs_path *log_path,
+ struct inode *dir,
+ struct btrfs_key *dir_key)
+{
+ int ret;
+ struct extent_buffer *eb;
+ int slot;
+ u32 item_size;
+ struct btrfs_dir_item *di;
+ struct btrfs_dir_item *log_di;
+ int name_len;
+ unsigned long ptr;
+ unsigned long ptr_end;
+ char *name;
+ struct inode *inode;
+ struct btrfs_key location;
+
+again:
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+ ptr = btrfs_item_ptr_offset(eb, slot);
+ ptr_end = ptr + item_size;
+ while (ptr < ptr_end) {
+ di = (struct btrfs_dir_item *)ptr;
+ name_len = btrfs_dir_name_len(eb, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(eb, name, (unsigned long)(di + 1),
+ name_len);
+ log_di = NULL;
+ if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+ log_di = btrfs_lookup_dir_item(trans, log, log_path,
+ dir_key->objectid,
+ name, name_len, 0);
+ } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+ log_di = btrfs_lookup_dir_index_item(trans, log,
+ log_path,
+ dir_key->objectid,
+ dir_key->offset,
+ name, name_len, 0);
+ }
+ if (!log_di || IS_ERR(log_di)) {
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ btrfs_release_path(root, path);
+ btrfs_release_path(log, log_path);
+ inode = read_one_inode(root, location.objectid);
+ BUG_ON(!inode);
+
+ ret = link_to_fixup_dir(trans, root,
+ path, location.objectid);
+ BUG_ON(ret);
+ btrfs_inc_nlink(inode);
+ ret = btrfs_unlink_inode(trans, root, dir, inode,
+ name, name_len);
+ BUG_ON(ret);
+ kfree(name);
+ iput(inode);
+
+ /* there might still be more names under this key
+ * check and repeat if required
+ */
+ ret = btrfs_search_slot(NULL, root, dir_key, path,
+ 0, 0);
+ if (ret == 0)
+ goto again;
+ ret = 0;
+ goto out;
+ }
+ btrfs_release_path(log, log_path);
+ kfree(name);
+
+ ptr = (unsigned long)(di + 1);
+ ptr += name_len;
+ }
+ ret = 0;
+out:
+ btrfs_release_path(root, path);
+ btrfs_release_path(log, log_path);
+ return ret;
+}
+
+/*
+ * deletion replay happens before we copy any new directory items
+ * out of the log or out of backreferences from inodes. It
+ * scans the log to find ranges of keys that log is authoritative for,
+ * and then scans the directory to find items in those ranges that are
+ * not present in the log.
+ *
+ * Anything we don't find in the log is unlinked and removed from the
+ * directory.
+ */
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dirid)
+{
+ u64 range_start;
+ u64 range_end;
+ int key_type = BTRFS_DIR_LOG_ITEM_KEY;
+ int ret = 0;
+ struct btrfs_key dir_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *log_path;
+ struct inode *dir;
+
+ dir_key.objectid = dirid;
+ dir_key.type = BTRFS_DIR_ITEM_KEY;
+ log_path = btrfs_alloc_path();
+ if (!log_path)
+ return -ENOMEM;
+
+ dir = read_one_inode(root, dirid);
+ /* it isn't an error if the inode isn't there, that can happen
+ * because we replay the deletes before we copy in the inode item
+ * from the log
+ */
+ if (!dir) {
+ btrfs_free_path(log_path);
+ return 0;
+ }
+again:
+ range_start = 0;
+ range_end = 0;
+ while (1) {
+ ret = find_dir_range(log, path, dirid, key_type,
+ &range_start, &range_end);
+ if (ret != 0)
+ break;
+
+ dir_key.offset = range_start;
+ while (1) {
+ int nritems;
+ ret = btrfs_search_slot(NULL, root, &dir_key, path,
+ 0, 0);
+ if (ret < 0)
+ goto out;
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ break;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != dirid ||
+ found_key.type != dir_key.type)
+ goto next_type;
+
+ if (found_key.offset > range_end)
+ break;
+
+ ret = check_item_in_log(trans, root, log, path,
+ log_path, dir, &found_key);
+ BUG_ON(ret);
+ if (found_key.offset == (u64)-1)
+ break;
+ dir_key.offset = found_key.offset + 1;
+ }
+ btrfs_release_path(root, path);
+ if (range_end == (u64)-1)
+ break;
+ range_start = range_end + 1;
+ }
+
+next_type:
+ ret = 0;
+ if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
+ key_type = BTRFS_DIR_LOG_INDEX_KEY;
+ dir_key.type = BTRFS_DIR_INDEX_KEY;
+ btrfs_release_path(root, path);
+ goto again;
+ }
+out:
+ btrfs_release_path(root, path);
+ btrfs_free_path(log_path);
+ iput(dir);
+ return ret;
+}
+
+/*
+ * the process_func used to replay items from the log tree. This
+ * gets called in two different stages. The first stage just looks
+ * for inodes and makes sure they are all copied into the subvolume.
+ *
+ * The second stage copies all the other item types from the log into
+ * the subvolume. The two stage approach is slower, but gets rid of
+ * lots of complexity around inodes referencing other inodes that exist
+ * only in the log (references come from either directory items or inode
+ * back refs).
+ */
+static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen)
+{
+ int nritems;
+ struct btrfs_path *path;
+ struct btrfs_root *root = wc->replay_dest;
+ struct btrfs_key key;
+ u32 item_size;
+ int level;
+ int i;
+ int ret;
+
+ btrfs_read_buffer(eb, gen);
+
+ level = btrfs_header_level(eb);
+
+ if (level != 0)
+ return 0;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ nritems = btrfs_header_nritems(eb);
+ for (i = 0; i < nritems; i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+ item_size = btrfs_item_size_nr(eb, i);
+
+ /* inode keys are done during the first stage */
+ if (key.type == BTRFS_INODE_ITEM_KEY &&
+ wc->stage == LOG_WALK_REPLAY_INODES) {
+ struct inode *inode;
+ struct btrfs_inode_item *inode_item;
+ u32 mode;
+
+ inode_item = btrfs_item_ptr(eb, i,
+ struct btrfs_inode_item);
+ mode = btrfs_inode_mode(eb, inode_item);
+ if (S_ISDIR(mode)) {
+ ret = replay_dir_deletes(wc->trans,
+ root, log, path, key.objectid);
+ BUG_ON(ret);
+ }
+ ret = overwrite_item(wc->trans, root, path,
+ eb, i, &key);
+ BUG_ON(ret);
+
+ /* for regular files, truncate away
+ * extents past the new EOF
+ */
+ if (S_ISREG(mode)) {
+ inode = read_one_inode(root,
+ key.objectid);
+ BUG_ON(!inode);
+
+ ret = btrfs_truncate_inode_items(wc->trans,
+ root, inode, inode->i_size,
+ BTRFS_EXTENT_DATA_KEY);
+ BUG_ON(ret);
+ iput(inode);
+ }
+ ret = link_to_fixup_dir(wc->trans, root,
+ path, key.objectid);
+ BUG_ON(ret);
+ }
+ if (wc->stage < LOG_WALK_REPLAY_ALL)
+ continue;
+
+ /* these keys are simply copied */
+ if (key.type == BTRFS_XATTR_ITEM_KEY) {
+ ret = overwrite_item(wc->trans, root, path,
+ eb, i, &key);
+ BUG_ON(ret);
+ } else if (key.type == BTRFS_INODE_REF_KEY) {
+ ret = add_inode_ref(wc->trans, root, log, path,
+ eb, i, &key);
+ BUG_ON(ret && ret != -ENOENT);
+ } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
+ ret = replay_one_extent(wc->trans, root, path,
+ eb, i, &key);
+ BUG_ON(ret);
+ } else if (key.type == BTRFS_DIR_ITEM_KEY ||
+ key.type == BTRFS_DIR_INDEX_KEY) {
+ ret = replay_one_dir_item(wc->trans, root, path,
+ eb, i, &key);
+ BUG_ON(ret);
+ }
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
+static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level,
+ struct walk_control *wc)
+{
+ u64 root_owner;
+ u64 root_gen;
+ u64 bytenr;
+ u64 ptr_gen;
+ struct extent_buffer *next;
+ struct extent_buffer *cur;
+ struct extent_buffer *parent;
+ u32 blocksize;
+ int ret = 0;
+
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+ while (*level > 0) {
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+ cur = path->nodes[*level];
+
+ if (btrfs_header_level(cur) != *level)
+ WARN_ON(1);
+
+ if (path->slots[*level] >=
+ btrfs_header_nritems(cur))
+ break;
+
+ bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+ ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+ blocksize = btrfs_level_size(root, *level - 1);
+
+ parent = path->nodes[*level];
+ root_owner = btrfs_header_owner(parent);
+ root_gen = btrfs_header_generation(parent);
+
+ next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+
+ wc->process_func(root, next, wc, ptr_gen);
+
+ if (*level == 1) {
+ path->slots[*level]++;
+ if (wc->free) {
+ btrfs_read_buffer(next, ptr_gen);
+
+ btrfs_tree_lock(next);
+ clean_tree_block(trans, root, next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+
+ ret = btrfs_drop_leaf_ref(trans, root, next);
+ BUG_ON(ret);
+
+ WARN_ON(root_owner !=
+ BTRFS_TREE_LOG_OBJECTID);
+ ret = btrfs_free_reserved_extent(root,
+ bytenr, blocksize);
+ BUG_ON(ret);
+ }
+ free_extent_buffer(next);
+ continue;
+ }
+ btrfs_read_buffer(next, ptr_gen);
+
+ WARN_ON(*level <= 0);
+ if (path->nodes[*level-1])
+ free_extent_buffer(path->nodes[*level-1]);
+ path->nodes[*level-1] = next;
+ *level = btrfs_header_level(next);
+ path->slots[*level] = 0;
+ cond_resched();
+ }
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+ if (path->nodes[*level] == root->node)
+ parent = path->nodes[*level];
+ else
+ parent = path->nodes[*level + 1];
+
+ bytenr = path->nodes[*level]->start;
+
+ blocksize = btrfs_level_size(root, *level);
+ root_owner = btrfs_header_owner(parent);
+ root_gen = btrfs_header_generation(parent);
+
+ wc->process_func(root, path->nodes[*level], wc,
+ btrfs_header_generation(path->nodes[*level]));
+
+ if (wc->free) {
+ next = path->nodes[*level];
+ btrfs_tree_lock(next);
+ clean_tree_block(trans, root, next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+
+ if (*level == 0) {
+ ret = btrfs_drop_leaf_ref(trans, root, next);
+ BUG_ON(ret);
+ }
+ WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+ ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
+ BUG_ON(ret);
+ }
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level += 1;
+
+ cond_resched();
+ return 0;
+}
+
+static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level,
+ struct walk_control *wc)
+{
+ u64 root_owner;
+ u64 root_gen;
+ int i;
+ int slot;
+ int ret;
+
+ for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+ slot = path->slots[i];
+ if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+ struct extent_buffer *node;
+ node = path->nodes[i];
+ path->slots[i]++;
+ *level = i;
+ WARN_ON(*level == 0);
+ return 0;
+ } else {
+ struct extent_buffer *parent;
+ if (path->nodes[*level] == root->node)
+ parent = path->nodes[*level];
+ else
+ parent = path->nodes[*level + 1];
+
+ root_owner = btrfs_header_owner(parent);
+ root_gen = btrfs_header_generation(parent);
+ wc->process_func(root, path->nodes[*level], wc,
+ btrfs_header_generation(path->nodes[*level]));
+ if (wc->free) {
+ struct extent_buffer *next;
+
+ next = path->nodes[*level];
+
+ btrfs_tree_lock(next);
+ clean_tree_block(trans, root, next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+
+ if (*level == 0) {
+ ret = btrfs_drop_leaf_ref(trans, root,
+ next);
+ BUG_ON(ret);
+ }
+
+ WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+ ret = btrfs_free_reserved_extent(root,
+ path->nodes[*level]->start,
+ path->nodes[*level]->len);
+ BUG_ON(ret);
+ }
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level = i + 1;
+ }
+ }
+ return 1;
+}
+
+/*
+ * drop the reference count on the tree rooted at 'snap'. This traverses
+ * the tree freeing any blocks that have a ref count of zero after being
+ * decremented.
+ */
+static int walk_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, struct walk_control *wc)
+{
+ int ret = 0;
+ int wret;
+ int level;
+ struct btrfs_path *path;
+ int i;
+ int orig_level;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ level = btrfs_header_level(log->node);
+ orig_level = level;
+ path->nodes[level] = log->node;
+ extent_buffer_get(log->node);
+ path->slots[level] = 0;
+
+ while (1) {
+ wret = walk_down_log_tree(trans, log, path, &level, wc);
+ if (wret > 0)
+ break;
+ if (wret < 0)
+ ret = wret;
+
+ wret = walk_up_log_tree(trans, log, path, &level, wc);
+ if (wret > 0)
+ break;
+ if (wret < 0)
+ ret = wret;
+ }
+
+ /* was the root node processed? if not, catch it here */
+ if (path->nodes[orig_level]) {
+ wc->process_func(log, path->nodes[orig_level], wc,
+ btrfs_header_generation(path->nodes[orig_level]));
+ if (wc->free) {
+ struct extent_buffer *next;
+
+ next = path->nodes[orig_level];
+
+ btrfs_tree_lock(next);
+ clean_tree_block(trans, log, next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+
+ if (orig_level == 0) {
+ ret = btrfs_drop_leaf_ref(trans, log,
+ next);
+ BUG_ON(ret);
+ }
+ WARN_ON(log->root_key.objectid !=
+ BTRFS_TREE_LOG_OBJECTID);
+ ret = btrfs_free_reserved_extent(log, next->start,
+ next->len);
+ BUG_ON(ret);
+ }
+ }
+
+ for (i = 0; i <= orig_level; i++) {
+ if (path->nodes[i]) {
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
+ }
+ }
+ btrfs_free_path(path);
+ if (wc->free)
+ free_extent_buffer(log->node);
+ return ret;
+}
+
+static int wait_log_commit(struct btrfs_root *log)
+{
+ DEFINE_WAIT(wait);
+ u64 transid = log->fs_info->tree_log_transid;
+
+ do {
+ prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&log->fs_info->tree_log_mutex);
+ if (atomic_read(&log->fs_info->tree_log_commit))
+ schedule();
+ finish_wait(&log->fs_info->tree_log_wait, &wait);
+ mutex_lock(&log->fs_info->tree_log_mutex);
+ } while (transid == log->fs_info->tree_log_transid &&
+ atomic_read(&log->fs_info->tree_log_commit));
+ return 0;
+}
+
+/*
+ * btrfs_sync_log does sends a given tree log down to the disk and
+ * updates the super blocks to record it. When this call is done,
+ * you know that any inodes previously logged are safely on disk
+ */
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+ unsigned long batch;
+ struct btrfs_root *log = root->log_root;
+
+ mutex_lock(&log->fs_info->tree_log_mutex);
+ if (atomic_read(&log->fs_info->tree_log_commit)) {
+ wait_log_commit(log);
+ goto out;
+ }
+ atomic_set(&log->fs_info->tree_log_commit, 1);
+
+ while (1) {
+ batch = log->fs_info->tree_log_batch;
+ mutex_unlock(&log->fs_info->tree_log_mutex);
+ schedule_timeout_uninterruptible(1);
+ mutex_lock(&log->fs_info->tree_log_mutex);
+
+ while (atomic_read(&log->fs_info->tree_log_writers)) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&log->fs_info->tree_log_mutex);
+ if (atomic_read(&log->fs_info->tree_log_writers))
+ schedule();
+ mutex_lock(&log->fs_info->tree_log_mutex);
+ finish_wait(&log->fs_info->tree_log_wait, &wait);
+ }
+ if (batch == log->fs_info->tree_log_batch)
+ break;
+ }
+
+ ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
+ BUG_ON(ret);
+ ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
+ &root->fs_info->log_root_tree->dirty_log_pages);
+ BUG_ON(ret);
+
+ btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+ log->fs_info->log_root_tree->node->start);
+ btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
+ btrfs_header_level(log->fs_info->log_root_tree->node));
+
+ write_ctree_super(trans, log->fs_info->tree_root, 2);
+ log->fs_info->tree_log_transid++;
+ log->fs_info->tree_log_batch = 0;
+ atomic_set(&log->fs_info->tree_log_commit, 0);
+ smp_mb();
+ if (waitqueue_active(&log->fs_info->tree_log_wait))
+ wake_up(&log->fs_info->tree_log_wait);
+out:
+ mutex_unlock(&log->fs_info->tree_log_mutex);
+ return 0;
+}
+
+/* * free all the extents used by the tree log. This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+ int ret;
+ struct btrfs_root *log;
+ struct key;
+ u64 start;
+ u64 end;
+ struct walk_control wc = {
+ .free = 1,
+ .process_func = process_one_buffer
+ };
+
+ if (!root->log_root || root->fs_info->log_root_recovering)
+ return 0;
+
+ log = root->log_root;
+ ret = walk_log_tree(trans, log, &wc);
+ BUG_ON(ret);
+
+ while (1) {
+ ret = find_first_extent_bit(&log->dirty_log_pages,
+ 0, &start, &end, EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ clear_extent_dirty(&log->dirty_log_pages,
+ start, end, GFP_NOFS);
+ }
+
+ log = root->log_root;
+ ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
+ &log->root_key);
+ BUG_ON(ret);
+ root->log_root = NULL;
+ kfree(root->log_root);
+ return 0;
+}
+
+/*
+ * helper function to update the item for a given subvolumes log root
+ * in the tree of log roots
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log)
+{
+ u64 bytenr = btrfs_root_bytenr(&log->root_item);
+ int ret;
+
+ if (log->node->start == bytenr)
+ return 0;
+
+ btrfs_set_root_bytenr(&log->root_item, log->node->start);
+ btrfs_set_root_generation(&log->root_item, trans->transid);
+ btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
+ ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+ &log->root_key, &log->root_item);
+ BUG_ON(ret);
+ return ret;
+}
+
+/*
+ * If both a file and directory are logged, and unlinks or renames are
+ * mixed in, we have a few interesting corners:
+ *
+ * create file X in dir Y
+ * link file X to X.link in dir Y
+ * fsync file X
+ * unlink file X but leave X.link
+ * fsync dir Y
+ *
+ * After a crash we would expect only X.link to exist. But file X
+ * didn't get fsync'd again so the log has back refs for X and X.link.
+ *
+ * We solve this by removing directory entries and inode backrefs from the
+ * log when a file that was logged in the current transaction is
+ * unlinked. Any later fsync will include the updated log entries, and
+ * we'll be able to reconstruct the proper directory items from backrefs.
+ *
+ * This optimizations allows us to avoid relogging the entire inode
+ * or the entire directory.
+ */
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *dir, u64 index)
+{
+ struct btrfs_root *log;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ int ret;
+ int bytes_del = 0;
+
+ if (BTRFS_I(dir)->logged_trans < trans->transid)
+ return 0;
+
+ ret = join_running_log_trans(root);
+ if (ret)
+ return 0;
+
+ mutex_lock(&BTRFS_I(dir)->log_mutex);
+
+ log = root->log_root;
+ path = btrfs_alloc_path();
+ di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
+ name, name_len, -1);
+ if (di && !IS_ERR(di)) {
+ ret = btrfs_delete_one_dir_name(trans, log, path, di);
+ bytes_del += name_len;
+ BUG_ON(ret);
+ }
+ btrfs_release_path(log, path);
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
+ index, name, name_len, -1);
+ if (di && !IS_ERR(di)) {
+ ret = btrfs_delete_one_dir_name(trans, log, path, di);
+ bytes_del += name_len;
+ BUG_ON(ret);
+ }
+
+ /* update the directory size in the log to reflect the names
+ * we have removed
+ */
+ if (bytes_del) {
+ struct btrfs_key key;
+
+ key.objectid = dir->i_ino;
+ key.offset = 0;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ btrfs_release_path(log, path);
+
+ ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+ if (ret == 0) {
+ struct btrfs_inode_item *item;
+ u64 i_size;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ i_size = btrfs_inode_size(path->nodes[0], item);
+ if (i_size > bytes_del)
+ i_size -= bytes_del;
+ else
+ i_size = 0;
+ btrfs_set_inode_size(path->nodes[0], item, i_size);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ } else
+ ret = 0;
+ btrfs_release_path(log, path);
+ }
+
+ btrfs_free_path(path);
+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
+ end_log_trans(root);
+
+ return 0;
+}
+
+/* see comments for btrfs_del_dir_entries_in_log */
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *inode, u64 dirid)
+{
+ struct btrfs_root *log;
+ u64 index;
+ int ret;
+
+ if (BTRFS_I(inode)->logged_trans < trans->transid)
+ return 0;
+
+ ret = join_running_log_trans(root);
+ if (ret)
+ return 0;
+ log = root->log_root;
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+ ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
+ dirid, &index);
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+ end_log_trans(root);
+
+ return ret;
+}
+
+/*
+ * creates a range item in the log for 'dirid'. first_offset and
+ * last_offset tell us which parts of the key space the log should
+ * be considered authoritative for.
+ */
+static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ int key_type, u64 dirid,
+ u64 first_offset, u64 last_offset)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_dir_log_item *item;
+
+ key.objectid = dirid;
+ key.offset = first_offset;
+ if (key_type == BTRFS_DIR_ITEM_KEY)
+ key.type = BTRFS_DIR_LOG_ITEM_KEY;
+ else
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
+ BUG_ON(ret);
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_dir_log_item);
+ btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(log, path);
+ return 0;
+}
+
+/*
+ * log all the items included in the current transaction for a given
+ * directory. This also creates the range items in the log tree required
+ * to replay anything deleted before the fsync
+ */
+static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path, int key_type,
+ u64 min_offset, u64 *last_offset_ret)
+{
+ struct btrfs_key min_key;
+ struct btrfs_key max_key;
+ struct btrfs_root *log = root->log_root;
+ struct extent_buffer *src;
+ int ret;
+ int i;
+ int nritems;
+ u64 first_offset = min_offset;
+ u64 last_offset = (u64)-1;
+
+ log = root->log_root;
+ max_key.objectid = inode->i_ino;
+ max_key.offset = (u64)-1;
+ max_key.type = key_type;
+
+ min_key.objectid = inode->i_ino;
+ min_key.type = key_type;
+ min_key.offset = min_offset;
+
+ path->keep_locks = 1;
+
+ ret = btrfs_search_forward(root, &min_key, &max_key,
+ path, 0, trans->transid);
+
+ /*
+ * we didn't find anything from this transaction, see if there
+ * is anything at all
+ */
+ if (ret != 0 || min_key.objectid != inode->i_ino ||
+ min_key.type != key_type) {
+ min_key.objectid = inode->i_ino;
+ min_key.type = key_type;
+ min_key.offset = (u64)-1;
+ btrfs_release_path(root, path);
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+ if (ret < 0) {
+ btrfs_release_path(root, path);
+ return ret;
+ }
+ ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+
+ /* if ret == 0 there are items for this type,
+ * create a range to tell us the last key of this type.
+ * otherwise, there are no items in this directory after
+ * *min_offset, and we create a range to indicate that.
+ */
+ if (ret == 0) {
+ struct btrfs_key tmp;
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp,
+ path->slots[0]);
+ if (key_type == tmp.type)
+ first_offset = max(min_offset, tmp.offset) + 1;
+ }
+ goto done;
+ }
+
+ /* go backward to find any previous key */
+ ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+ if (ret == 0) {
+ struct btrfs_key tmp;
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+ if (key_type == tmp.type) {
+ first_offset = tmp.offset;
+ ret = overwrite_item(trans, log, dst_path,
+ path->nodes[0], path->slots[0],
+ &tmp);
+ }
+ }
+ btrfs_release_path(root, path);
+
+ /* find the first key from this transaction again */
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+ if (ret != 0) {
+ WARN_ON(1);
+ goto done;
+ }
+
+ /*
+ * we have a block from this transaction, log every item in it
+ * from our directory
+ */
+ while (1) {
+ struct btrfs_key tmp;
+ src = path->nodes[0];
+ nritems = btrfs_header_nritems(src);
+ for (i = path->slots[0]; i < nritems; i++) {
+ btrfs_item_key_to_cpu(src, &min_key, i);
+
+ if (min_key.objectid != inode->i_ino ||
+ min_key.type != key_type)
+ goto done;
+ ret = overwrite_item(trans, log, dst_path, src, i,
+ &min_key);
+ BUG_ON(ret);
+ }
+ path->slots[0] = nritems;
+
+ /*
+ * look ahead to the next item and see if it is also
+ * from this directory and from this transaction
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 1) {
+ last_offset = (u64)-1;
+ goto done;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+ if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
+ last_offset = (u64)-1;
+ goto done;
+ }
+ if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
+ ret = overwrite_item(trans, log, dst_path,
+ path->nodes[0], path->slots[0],
+ &tmp);
+
+ BUG_ON(ret);
+ last_offset = tmp.offset;
+ goto done;
+ }
+ }
+done:
+ *last_offset_ret = last_offset;
+ btrfs_release_path(root, path);
+ btrfs_release_path(log, dst_path);
+
+ /* insert the log range keys to indicate where the log is valid */
+ ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
+ first_offset, last_offset);
+ BUG_ON(ret);
+ return 0;
+}
+
+/*
+ * logging directories is very similar to logging inodes, We find all the items
+ * from the current transaction and write them to the log.
+ *
+ * The recovery code scans the directory in the subvolume, and if it finds a
+ * key in the range logged that is not present in the log tree, then it means
+ * that dir entry was unlinked during the transaction.
+ *
+ * In order for that scan to work, we must include one key smaller than
+ * the smallest logged by this transaction and one key larger than the largest
+ * key logged by this transaction.
+ */
+static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path)
+{
+ u64 min_key;
+ u64 max_key;
+ int ret;
+ int key_type = BTRFS_DIR_ITEM_KEY;
+
+again:
+ min_key = 0;
+ max_key = 0;
+ while (1) {
+ ret = log_dir_items(trans, root, inode, path,
+ dst_path, key_type, min_key,
+ &max_key);
+ BUG_ON(ret);
+ if (max_key == (u64)-1)
+ break;
+ min_key = max_key + 1;
+ }
+
+ if (key_type == BTRFS_DIR_ITEM_KEY) {
+ key_type = BTRFS_DIR_INDEX_KEY;
+ goto again;
+ }
+ return 0;
+}
+
+/*
+ * a helper function to drop items from the log before we relog an
+ * inode. max_key_type indicates the highest item type to remove.
+ * This cannot be run for file data extents because it does not
+ * free the extents they point to.
+ */
+static int drop_objectid_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 objectid, int max_key_type)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+
+ key.objectid = objectid;
+ key.type = max_key_type;
+ key.offset = (u64)-1;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+
+ if (ret != 1)
+ break;
+
+ if (path->slots[0] == 0)
+ break;
+
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+
+ if (found_key.objectid != objectid)
+ break;
+
+ ret = btrfs_del_item(trans, log, path);
+ BUG_ON(ret);
+ btrfs_release_path(log, path);
+ }
+ btrfs_release_path(log, path);
+ return 0;
+}
+
+static noinline int copy_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *dst_path,
+ struct extent_buffer *src,
+ int start_slot, int nr, int inode_only)
+{
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_inode_item *inode_item;
+ int ret;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ char *ins_data;
+ int i;
+ struct list_head ordered_sums;
+
+ INIT_LIST_HEAD(&ordered_sums);
+
+ ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
+ nr * sizeof(u32), GFP_NOFS);
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+
+ for (i = 0; i < nr; i++) {
+ ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+ btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
+ }
+ ret = btrfs_insert_empty_items(trans, log, dst_path,
+ ins_keys, ins_sizes, nr);
+ BUG_ON(ret);
+
+ for (i = 0; i < nr; i++) {
+ dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
+ dst_path->slots[0]);
+
+ src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+
+ copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+ src_offset, ins_sizes[i]);
+
+ if (inode_only == LOG_INODE_EXISTS &&
+ ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(dst_path->nodes[0],
+ dst_path->slots[0],
+ struct btrfs_inode_item);
+ btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
+
+ /* set the generation to zero so the recover code
+ * can tell the difference between an logging
+ * just to say 'this inode exists' and a logging
+ * to say 'update this inode with these values'
+ */
+ btrfs_set_inode_generation(dst_path->nodes[0],
+ inode_item, 0);
+ }
+ /* take a reference on file data extents so that truncates
+ * or deletes of this inode don't have to relog the inode
+ * again
+ */
+ if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+ int found_type;
+ extent = btrfs_item_ptr(src, start_slot + i,
+ struct btrfs_file_extent_item);
+
+ found_type = btrfs_file_extent_type(src, extent);
+ if (found_type == BTRFS_FILE_EXTENT_REG ||
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 ds = btrfs_file_extent_disk_bytenr(src,
+ extent);
+ u64 dl = btrfs_file_extent_disk_num_bytes(src,
+ extent);
+ u64 cs = btrfs_file_extent_offset(src, extent);
+ u64 cl = btrfs_file_extent_num_bytes(src,
+ extent);;
+ if (btrfs_file_extent_compression(src,
+ extent)) {
+ cs = 0;
+ cl = dl;
+ }
+ /* ds == 0 is a hole */
+ if (ds != 0) {
+ ret = btrfs_inc_extent_ref(trans, log,
+ ds, dl,
+ dst_path->nodes[0]->start,
+ BTRFS_TREE_LOG_OBJECTID,
+ trans->transid,
+ ins_keys[i].objectid);
+ BUG_ON(ret);
+ ret = btrfs_lookup_csums_range(
+ log->fs_info->csum_root,
+ ds + cs, ds + cs + cl - 1,
+ &ordered_sums);
+ BUG_ON(ret);
+ }
+ }
+ }
+ dst_path->slots[0]++;
+ }
+
+ btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+ btrfs_release_path(log, dst_path);
+ kfree(ins_data);
+
+ /*
+ * we have to do this after the loop above to avoid changing the
+ * log tree while trying to change the log tree.
+ */
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+ struct btrfs_ordered_sum,
+ list);
+ ret = btrfs_csum_file_blocks(trans, log, sums);
+ BUG_ON(ret);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ return 0;
+}
+
+/* log a single inode in the tree log.
+ * At least one parent directory for this inode must exist in the tree
+ * or be logged already.
+ *
+ * Any items from this inode changed by the current transaction are copied
+ * to the log tree. An extra reference is taken on any extents in this
+ * file, allowing us to avoid a whole pile of corner cases around logging
+ * blocks that have been removed from the tree.
+ *
+ * See LOG_INODE_ALL and related defines for a description of what inode_only
+ * does.
+ *
+ * This handles both files and directories.
+ */
+static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only)
+{
+ struct btrfs_path *path;
+ struct btrfs_path *dst_path;
+ struct btrfs_key min_key;
+ struct btrfs_key max_key;
+ struct btrfs_root *log = root->log_root;
+ struct extent_buffer *src = NULL;
+ u32 size;
+ int ret;
+ int nritems;
+ int ins_start_slot = 0;
+ int ins_nr;
+
+ log = root->log_root;
+
+ path = btrfs_alloc_path();
+ dst_path = btrfs_alloc_path();
+
+ min_key.objectid = inode->i_ino;
+ min_key.type = BTRFS_INODE_ITEM_KEY;
+ min_key.offset = 0;
+
+ max_key.objectid = inode->i_ino;
+ if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ else
+ max_key.type = (u8)-1;
+ max_key.offset = (u64)-1;
+
+ /*
+ * if this inode has already been logged and we're in inode_only
+ * mode, we don't want to delete the things that have already
+ * been written to the log.
+ *
+ * But, if the inode has been through an inode_only log,
+ * the logged_trans field is not set. This allows us to catch
+ * any new names for this inode in the backrefs by logging it
+ * again
+ */
+ if (inode_only == LOG_INODE_EXISTS &&
+ BTRFS_I(inode)->logged_trans == trans->transid) {
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+ goto out;
+ }
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+ /*
+ * a brute force approach to making sure we get the most uptodate
+ * copies of everything.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
+
+ if (inode_only == LOG_INODE_EXISTS)
+ max_key_type = BTRFS_XATTR_ITEM_KEY;
+ ret = drop_objectid_items(trans, log, path,
+ inode->i_ino, max_key_type);
+ } else {
+ ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+ }
+ BUG_ON(ret);
+ path->keep_locks = 1;
+
+ while (1) {
+ ins_nr = 0;
+ ret = btrfs_search_forward(root, &min_key, &max_key,
+ path, 0, trans->transid);
+ if (ret != 0)
+ break;
+again:
+ /* note, ins_nr might be > 0 here, cleanup outside the loop */
+ if (min_key.objectid != inode->i_ino)
+ break;
+ if (min_key.type > max_key.type)
+ break;
+
+ src = path->nodes[0];
+ size = btrfs_item_size_nr(src, path->slots[0]);
+ if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+ ins_nr++;
+ goto next_slot;
+ } else if (!ins_nr) {
+ ins_start_slot = path->slots[0];
+ ins_nr = 1;
+ goto next_slot;
+ }
+
+ ret = copy_items(trans, log, dst_path, src, ins_start_slot,
+ ins_nr, inode_only);
+ BUG_ON(ret);
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+next_slot:
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ path->slots[0]++;
+ if (path->slots[0] < nritems) {
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key,
+ path->slots[0]);
+ goto again;
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, log, dst_path, src,
+ ins_start_slot,
+ ins_nr, inode_only);
+ BUG_ON(ret);
+ ins_nr = 0;
+ }
+ btrfs_release_path(root, path);
+
+ if (min_key.offset < (u64)-1)
+ min_key.offset++;
+ else if (min_key.type < (u8)-1)
+ min_key.type++;
+ else if (min_key.objectid < (u64)-1)
+ min_key.objectid++;
+ else
+ break;
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, log, dst_path, src,
+ ins_start_slot,
+ ins_nr, inode_only);
+ BUG_ON(ret);
+ ins_nr = 0;
+ }
+ WARN_ON(ins_nr);
+ if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+ btrfs_release_path(root, path);
+ btrfs_release_path(log, dst_path);
+ BTRFS_I(inode)->log_dirty_trans = 0;
+ ret = log_directory_changes(trans, root, inode, path, dst_path);
+ BUG_ON(ret);
+ }
+ BTRFS_I(inode)->logged_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+
+ mutex_lock(&root->fs_info->tree_log_mutex);
+ ret = update_log_root(trans, log);
+ BUG_ON(ret);
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+out:
+ return 0;
+}
+
+int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only)
+{
+ int ret;
+
+ start_log_trans(trans, root);
+ ret = __btrfs_log_inode(trans, root, inode, inode_only);
+ end_log_trans(root);
+ return ret;
+}
+
+/*
+ * helper function around btrfs_log_inode to make sure newly created
+ * parent directories also end up in the log. A minimal inode and backref
+ * only logging is done of any parent directories that are older than
+ * the last committed transaction
+ */
+int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct dentry *dentry)
+{
+ int inode_only = LOG_INODE_ALL;
+ struct super_block *sb;
+ int ret;
+
+ start_log_trans(trans, root);
+ sb = dentry->d_inode->i_sb;
+ while (1) {
+ ret = __btrfs_log_inode(trans, root, dentry->d_inode,
+ inode_only);
+ BUG_ON(ret);
+ inode_only = LOG_INODE_EXISTS;
+
+ dentry = dentry->d_parent;
+ if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
+ break;
+
+ if (BTRFS_I(dentry->d_inode)->generation <=
+ root->fs_info->last_trans_committed)
+ break;
+ }
+ end_log_trans(root);
+ return 0;
+}
+
+/*
+ * it is not safe to log dentry if the chunk root has added new
+ * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
+ * If this returns 1, you must commit the transaction to safely get your
+ * data on disk.
+ */
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct dentry *dentry)
+{
+ u64 gen;
+ gen = root->fs_info->last_trans_new_blockgroup;
+ if (gen > root->fs_info->last_trans_committed)
+ return 1;
+ else
+ return btrfs_log_dentry(trans, root, dentry);
+}
+
+/*
+ * should be called during mount to recover any replay any log trees
+ * from the FS
+ */
+int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_key tmp_key;
+ struct btrfs_root *log;
+ struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+ u64 highest_inode;
+ struct walk_control wc = {
+ .process_func = process_one_buffer,
+ .stage = 0,
+ };
+
+ fs_info->log_root_recovering = 1;
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+ wc.trans = trans;
+ wc.pin = 1;
+
+ walk_log_tree(trans, log_root_tree, &wc);
+
+again:
+ key.objectid = BTRFS_TREE_LOG_OBJECTID;
+ key.offset = (u64)-1;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ btrfs_release_path(log_root_tree, path);
+ if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ break;
+
+ log = btrfs_read_fs_root_no_radix(log_root_tree,
+ &found_key);
+ BUG_ON(!log);
+
+
+ tmp_key.objectid = found_key.offset;
+ tmp_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_key.offset = (u64)-1;
+
+ wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
+ BUG_ON(!wc.replay_dest);
+
+ wc.replay_dest->log_root = log;
+ btrfs_record_root_in_trans(wc.replay_dest);
+ ret = walk_log_tree(trans, log, &wc);
+ BUG_ON(ret);
+
+ if (wc.stage == LOG_WALK_REPLAY_ALL) {
+ ret = fixup_inode_link_counts(trans, wc.replay_dest,
+ path);
+ BUG_ON(ret);
+ }
+ ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
+ if (ret == 0) {
+ wc.replay_dest->highest_inode = highest_inode;
+ wc.replay_dest->last_inode_alloc = highest_inode;
+ }
+
+ key.offset = found_key.offset - 1;
+ wc.replay_dest->log_root = NULL;
+ free_extent_buffer(log->node);
+ kfree(log);
+
+ if (found_key.offset == 0)
+ break;
+ }
+ btrfs_release_path(log_root_tree, path);
+
+ /* step one is to pin it all, step two is to replay just inodes */
+ if (wc.pin) {
+ wc.pin = 0;
+ wc.process_func = replay_one_buffer;
+ wc.stage = LOG_WALK_REPLAY_INODES;
+ goto again;
+ }
+ /* step three is to replay everything */
+ if (wc.stage < LOG_WALK_REPLAY_ALL) {
+ wc.stage++;
+ goto again;
+ }
+
+ btrfs_free_path(path);
+
+ free_extent_buffer(log_root_tree->node);
+ log_root_tree->log_root = NULL;
+ fs_info->log_root_recovering = 0;
+
+ /* step 4: commit the transaction, which also unpins the blocks */
+ btrfs_commit_transaction(trans, fs_info->tree_root);
+
+ kfree(log_root_tree);
+ return 0;
+}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 00000000000..b9409b32ed0
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __TREE_LOG_
+#define __TREE_LOG_
+
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct dentry *dentry);
+int btrfs_recover_log_trees(struct btrfs_root *tree_root);
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct dentry *dentry);
+int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ int inode_only);
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *dir, u64 index);
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct inode *inode, u64 dirid);
+#endif
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 00000000000..9bf3946d5ef
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
+#ifndef __BTRFS_VERSION_H
+#define __BTRFS_VERSION_H
+#define BTRFS_BUILD_VERSION "Btrfs"
+#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 00000000000..1ca1952fd91
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# determine-version -- report a useful version for releases
+#
+# Copyright 2008, Aron Griffis <agriffis@n01se.net>
+# Copyright 2008, Oracle
+# Released under the GNU GPLv2
+
+v="v0.16"
+
+which git &> /dev/null
+if [ $? == 0 ]; then
+ git branch >& /dev/null
+ if [ $? == 0 ]; then
+ if head=`git rev-parse --verify HEAD 2>/dev/null`; then
+ if tag=`git describe --tags 2>/dev/null`; then
+ v="$tag"
+ fi
+
+ # Are there uncommitted changes?
+ git update-index --refresh --unmerged > /dev/null
+ if git diff-index --name-only HEAD | \
+ grep -v "^scripts/package" \
+ | read dummy; then
+ v="$v"-dirty
+ fi
+ fi
+ fi
+fi
+
+echo "#ifndef __BUILD_VERSION" > .build-version.h
+echo "#define __BUILD_VERSION" >> .build-version.h
+echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
+echo "#endif" >> .build-version.h
+
+diff -q version.h .build-version.h >& /dev/null
+
+if [ $? == 0 ]; then
+ rm .build-version.h
+ exit 0
+fi
+
+mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 00000000000..b187b537888
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,3218 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/version.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+
+struct map_lookup {
+ u64 type;
+ int io_align;
+ int io_width;
+ int stripe_len;
+ int sector_size;
+ int num_stripes;
+ int sub_stripes;
+ struct btrfs_bio_stripe stripes[];
+};
+
+static int init_first_rw_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device);
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+ (sizeof(struct btrfs_bio_stripe) * (n)))
+
+static DEFINE_MUTEX(uuid_mutex);
+static LIST_HEAD(fs_uuids);
+
+void btrfs_lock_volumes(void)
+{
+ mutex_lock(&uuid_mutex);
+}
+
+void btrfs_unlock_volumes(void)
+{
+ mutex_unlock(&uuid_mutex);
+}
+
+static void lock_chunks(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static void unlock_chunks(struct btrfs_root *root)
+{
+ mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
+static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+ WARN_ON(fs_devices->opened);
+ while (!list_empty(&fs_devices->devices)) {
+ device = list_entry(fs_devices->devices.next,
+ struct btrfs_device, dev_list);
+ list_del(&device->dev_list);
+ kfree(device->name);
+ kfree(device);
+ }
+ kfree(fs_devices);
+}
+
+int btrfs_cleanup_fs_uuids(void)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ while (!list_empty(&fs_uuids)) {
+ fs_devices = list_entry(fs_uuids.next,
+ struct btrfs_fs_devices, list);
+ list_del(&fs_devices->list);
+ free_fs_devices(fs_devices);
+ }
+ return 0;
+}
+
+static noinline struct btrfs_device *__find_device(struct list_head *head,
+ u64 devid, u8 *uuid)
+{
+ struct btrfs_device *dev;
+ struct list_head *cur;
+
+ list_for_each(cur, head) {
+ dev = list_entry(cur, struct btrfs_device, dev_list);
+ if (dev->devid == devid &&
+ (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
+ return dev;
+ }
+ }
+ return NULL;
+}
+
+static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
+{
+ struct list_head *cur;
+ struct btrfs_fs_devices *fs_devices;
+
+ list_for_each(cur, &fs_uuids) {
+ fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ }
+ return NULL;
+}
+
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device. This greatly
+ * improves the schedulers ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block. The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested. This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+static noinline int run_scheduled_bios(struct btrfs_device *device)
+{
+ struct bio *pending;
+ struct backing_dev_info *bdi;
+ struct btrfs_fs_info *fs_info;
+ struct bio *tail;
+ struct bio *cur;
+ int again = 0;
+ unsigned long num_run = 0;
+ unsigned long limit;
+
+ bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+ fs_info = device->dev_root->fs_info;
+ limit = btrfs_async_submit_limit(fs_info);
+ limit = limit * 2 / 3;
+
+loop:
+ spin_lock(&device->io_lock);
+
+ /* take all the bios off the list at once and process them
+ * later on (without the lock held). But, remember the
+ * tail and other pointers so the bios can be properly reinserted
+ * into the list if we hit congestion
+ */
+ pending = device->pending_bios;
+ tail = device->pending_bio_tail;
+ WARN_ON(pending && !tail);
+ device->pending_bios = NULL;
+ device->pending_bio_tail = NULL;
+
+ /*
+ * if pending was null this time around, no bios need processing
+ * at all and we can stop. Otherwise it'll loop back up again
+ * and do an additional check so no bios are missed.
+ *
+ * device->running_pending is used to synchronize with the
+ * schedule_bio code.
+ */
+ if (pending) {
+ again = 1;
+ device->running_pending = 1;
+ } else {
+ again = 0;
+ device->running_pending = 0;
+ }
+ spin_unlock(&device->io_lock);
+
+ while (pending) {
+ cur = pending;
+ pending = pending->bi_next;
+ cur->bi_next = NULL;
+ atomic_dec(&fs_info->nr_async_bios);
+
+ if (atomic_read(&fs_info->nr_async_bios) < limit &&
+ waitqueue_active(&fs_info->async_submit_wait))
+ wake_up(&fs_info->async_submit_wait);
+
+ BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+ bio_get(cur);
+ submit_bio(cur->bi_rw, cur);
+ bio_put(cur);
+ num_run++;
+
+ /*
+ * we made progress, there is more work to do and the bdi
+ * is now congested. Back off and let other work structs
+ * run instead
+ */
+ if (pending && bdi_write_congested(bdi) &&
+ fs_info->fs_devices->open_devices > 1) {
+ struct bio *old_head;
+
+ spin_lock(&device->io_lock);
+
+ old_head = device->pending_bios;
+ device->pending_bios = pending;
+ if (device->pending_bio_tail)
+ tail->bi_next = old_head;
+ else
+ device->pending_bio_tail = tail;
+
+ spin_unlock(&device->io_lock);
+ btrfs_requeue_work(&device->work);
+ goto done;
+ }
+ }
+ if (again)
+ goto loop;
+done:
+ return 0;
+}
+
+static void pending_bios_fn(struct btrfs_work *work)
+{
+ struct btrfs_device *device;
+
+ device = container_of(work, struct btrfs_device, work);
+ run_scheduled_bios(device);
+}
+
+static noinline int device_list_add(const char *path,
+ struct btrfs_super_block *disk_super,
+ u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices;
+ u64 found_transid = btrfs_super_generation(disk_super);
+
+ fs_devices = find_fsid(disk_super->fsid);
+ if (!fs_devices) {
+ fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+ if (!fs_devices)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&fs_devices->devices);
+ INIT_LIST_HEAD(&fs_devices->alloc_list);
+ list_add(&fs_devices->list, &fs_uuids);
+ memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
+ fs_devices->latest_devid = devid;
+ fs_devices->latest_trans = found_transid;
+ device = NULL;
+ } else {
+ device = __find_device(&fs_devices->devices, devid,
+ disk_super->dev_item.uuid);
+ }
+ if (!device) {
+ if (fs_devices->opened)
+ return -EBUSY;
+
+ device = kzalloc(sizeof(*device), GFP_NOFS);
+ if (!device) {
+ /* we can safely leave the fs_devices entry around */
+ return -ENOMEM;
+ }
+ device->devid = devid;
+ device->work.func = pending_bios_fn;
+ memcpy(device->uuid, disk_super->dev_item.uuid,
+ BTRFS_UUID_SIZE);
+ device->barriers = 1;
+ spin_lock_init(&device->io_lock);
+ device->name = kstrdup(path, GFP_NOFS);
+ if (!device->name) {
+ kfree(device);
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(&device->dev_alloc_list);
+ list_add(&device->dev_list, &fs_devices->devices);
+ device->fs_devices = fs_devices;
+ fs_devices->num_devices++;
+ }
+
+ if (found_transid > fs_devices->latest_trans) {
+ fs_devices->latest_devid = devid;
+ fs_devices->latest_trans = found_transid;
+ }
+ *fs_devices_ret = fs_devices;
+ return 0;
+}
+
+static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
+{
+ struct btrfs_fs_devices *fs_devices;
+ struct btrfs_device *device;
+ struct btrfs_device *orig_dev;
+
+ fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+ if (!fs_devices)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&fs_devices->devices);
+ INIT_LIST_HEAD(&fs_devices->alloc_list);
+ INIT_LIST_HEAD(&fs_devices->list);
+ fs_devices->latest_devid = orig->latest_devid;
+ fs_devices->latest_trans = orig->latest_trans;
+ memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
+
+ list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+ device = kzalloc(sizeof(*device), GFP_NOFS);
+ if (!device)
+ goto error;
+
+ device->name = kstrdup(orig_dev->name, GFP_NOFS);
+ if (!device->name)
+ goto error;
+
+ device->devid = orig_dev->devid;
+ device->work.func = pending_bios_fn;
+ memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
+ device->barriers = 1;
+ spin_lock_init(&device->io_lock);
+ INIT_LIST_HEAD(&device->dev_list);
+ INIT_LIST_HEAD(&device->dev_alloc_list);
+
+ list_add(&device->dev_list, &fs_devices->devices);
+ device->fs_devices = fs_devices;
+ fs_devices->num_devices++;
+ }
+ return fs_devices;
+error:
+ free_fs_devices(fs_devices);
+ return ERR_PTR(-ENOMEM);
+}
+
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct list_head *tmp;
+ struct list_head *cur;
+ struct btrfs_device *device;
+
+ mutex_lock(&uuid_mutex);
+again:
+ list_for_each_safe(cur, tmp, &fs_devices->devices) {
+ device = list_entry(cur, struct btrfs_device, dev_list);
+ if (device->in_fs_metadata)
+ continue;
+
+ if (device->bdev) {
+ close_bdev_exclusive(device->bdev, device->mode);
+ device->bdev = NULL;
+ fs_devices->open_devices--;
+ }
+ if (device->writeable) {
+ list_del_init(&device->dev_alloc_list);
+ device->writeable = 0;
+ fs_devices->rw_devices--;
+ }
+ list_del_init(&device->dev_list);
+ fs_devices->num_devices--;
+ kfree(device->name);
+ kfree(device);
+ }
+
+ if (fs_devices->seed) {
+ fs_devices = fs_devices->seed;
+ goto again;
+ }
+
+ mutex_unlock(&uuid_mutex);
+ return 0;
+}
+
+static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct list_head *cur;
+ struct btrfs_device *device;
+
+ if (--fs_devices->opened > 0)
+ return 0;
+
+ list_for_each(cur, &fs_devices->devices) {
+ device = list_entry(cur, struct btrfs_device, dev_list);
+ if (device->bdev) {
+ close_bdev_exclusive(device->bdev, device->mode);
+ fs_devices->open_devices--;
+ }
+ if (device->writeable) {
+ list_del_init(&device->dev_alloc_list);
+ fs_devices->rw_devices--;
+ }
+
+ device->bdev = NULL;
+ device->writeable = 0;
+ device->in_fs_metadata = 0;
+ }
+ WARN_ON(fs_devices->open_devices);
+ WARN_ON(fs_devices->rw_devices);
+ fs_devices->opened = 0;
+ fs_devices->seeding = 0;
+
+ return 0;
+}
+
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_fs_devices *seed_devices = NULL;
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+ ret = __btrfs_close_devices(fs_devices);
+ if (!fs_devices->opened) {
+ seed_devices = fs_devices->seed;
+ fs_devices->seed = NULL;
+ }
+ mutex_unlock(&uuid_mutex);
+
+ while (seed_devices) {
+ fs_devices = seed_devices;
+ seed_devices = fs_devices->seed;
+ __btrfs_close_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ }
+ return ret;
+}
+
+static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder)
+{
+ struct block_device *bdev;
+ struct list_head *head = &fs_devices->devices;
+ struct list_head *cur;
+ struct btrfs_device *device;
+ struct block_device *latest_bdev = NULL;
+ struct buffer_head *bh;
+ struct btrfs_super_block *disk_super;
+ u64 latest_devid = 0;
+ u64 latest_transid = 0;
+ u64 devid;
+ int seeding = 1;
+ int ret = 0;
+
+ list_for_each(cur, head) {
+ device = list_entry(cur, struct btrfs_device, dev_list);
+ if (device->bdev)
+ continue;
+ if (!device->name)
+ continue;
+
+ bdev = open_bdev_exclusive(device->name, flags, holder);
+ if (IS_ERR(bdev)) {
+ printk(KERN_INFO "open %s failed\n", device->name);
+ goto error;
+ }
+ set_blocksize(bdev, 4096);
+
+ bh = btrfs_read_dev_super(bdev);
+ if (!bh)
+ goto error_close;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = le64_to_cpu(disk_super->dev_item.devid);
+ if (devid != device->devid)
+ goto error_brelse;
+
+ if (memcmp(device->uuid, disk_super->dev_item.uuid,
+ BTRFS_UUID_SIZE))
+ goto error_brelse;
+
+ device->generation = btrfs_super_generation(disk_super);
+ if (!latest_transid || device->generation > latest_transid) {
+ latest_devid = devid;
+ latest_transid = device->generation;
+ latest_bdev = bdev;
+ }
+
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+ device->writeable = 0;
+ } else {
+ device->writeable = !bdev_read_only(bdev);
+ seeding = 0;
+ }
+
+ device->bdev = bdev;
+ device->in_fs_metadata = 0;
+ device->mode = flags;
+
+ fs_devices->open_devices++;
+ if (device->writeable) {
+ fs_devices->rw_devices++;
+ list_add(&device->dev_alloc_list,
+ &fs_devices->alloc_list);
+ }
+ continue;
+
+error_brelse:
+ brelse(bh);
+error_close:
+ close_bdev_exclusive(bdev, FMODE_READ);
+error:
+ continue;
+ }
+ if (fs_devices->open_devices == 0) {
+ ret = -EIO;
+ goto out;
+ }
+ fs_devices->seeding = seeding;
+ fs_devices->opened = 1;
+ fs_devices->latest_bdev = latest_bdev;
+ fs_devices->latest_devid = latest_devid;
+ fs_devices->latest_trans = latest_transid;
+ fs_devices->total_rw_bytes = 0;
+out:
+ return ret;
+}
+
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder)
+{
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+ if (fs_devices->opened) {
+ fs_devices->opened++;
+ ret = 0;
+ } else {
+ ret = __btrfs_open_devices(fs_devices, flags, holder);
+ }
+ mutex_unlock(&uuid_mutex);
+ return ret;
+}
+
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
+ struct btrfs_fs_devices **fs_devices_ret)
+{
+ struct btrfs_super_block *disk_super;
+ struct block_device *bdev;
+ struct buffer_head *bh;
+ int ret;
+ u64 devid;
+ u64 transid;
+
+ mutex_lock(&uuid_mutex);
+
+ bdev = open_bdev_exclusive(path, flags, holder);
+
+ if (IS_ERR(bdev)) {
+ ret = PTR_ERR(bdev);
+ goto error;
+ }
+
+ ret = set_blocksize(bdev, 4096);
+ if (ret)
+ goto error_close;
+ bh = btrfs_read_dev_super(bdev);
+ if (!bh) {
+ ret = -EIO;
+ goto error_close;
+ }
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = le64_to_cpu(disk_super->dev_item.devid);
+ transid = btrfs_super_generation(disk_super);
+ if (disk_super->label[0])
+ printk(KERN_INFO "device label %s ", disk_super->label);
+ else {
+ /* FIXME, make a readl uuid parser */
+ printk(KERN_INFO "device fsid %llx-%llx ",
+ *(unsigned long long *)disk_super->fsid,
+ *(unsigned long long *)(disk_super->fsid + 8));
+ }
+ printk(KERN_INFO "devid %llu transid %llu %s\n",
+ (unsigned long long)devid, (unsigned long long)transid, path);
+ ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+
+ brelse(bh);
+error_close:
+ close_bdev_exclusive(bdev, flags);
+error:
+ mutex_unlock(&uuid_mutex);
+ return ret;
+}
+
+/*
+ * this uses a pretty simple search, the expectation is that it is
+ * called very infrequently and that a given device has a small number
+ * of extents
+ */
+static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 num_bytes, u64 *start)
+{
+ struct btrfs_key key;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_path *path;
+ u64 hole_size = 0;
+ u64 last_byte = 0;
+ u64 search_start = 0;
+ u64 search_end = device->total_bytes;
+ int ret;
+ int slot = 0;
+ int start_found;
+ struct extent_buffer *l;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 2;
+ start_found = 0;
+
+ /* FIXME use last free of some kind */
+
+ /* we don't want to overwrite the superblock on the drive,
+ * so we make sure to start at an offset of at least 1MB
+ */
+ search_start = max((u64)1024 * 1024, search_start);
+
+ if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+ search_start = max(root->fs_info->alloc_start, search_start);
+
+ key.objectid = device->devid;
+ key.offset = search_start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ ret = btrfs_previous_item(root, path, 0, key.type);
+ if (ret < 0)
+ goto error;
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+ while (1) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto error;
+no_more_items:
+ if (!start_found) {
+ if (search_start >= search_end) {
+ ret = -ENOSPC;
+ goto error;
+ }
+ *start = search_start;
+ start_found = 1;
+ goto check_pending;
+ }
+ *start = last_byte > search_start ?
+ last_byte : search_start;
+ if (search_end <= *start) {
+ ret = -ENOSPC;
+ goto error;
+ }
+ goto check_pending;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid < device->devid)
+ goto next;
+
+ if (key.objectid > device->devid)
+ goto no_more_items;
+
+ if (key.offset >= search_start && key.offset > last_byte &&
+ start_found) {
+ if (last_byte < search_start)
+ last_byte = search_start;
+ hole_size = key.offset - last_byte;
+ if (key.offset > last_byte &&
+ hole_size >= num_bytes) {
+ *start = last_byte;
+ goto check_pending;
+ }
+ }
+ if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ goto next;
+
+ start_found = 1;
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+next:
+ path->slots[0]++;
+ cond_resched();
+ }
+check_pending:
+ /* we have to make sure we didn't find an extent that has already
+ * been allocated by the map tree or the original allocation
+ */
+ BUG_ON(*start < search_start);
+
+ if (*start + num_bytes > search_end) {
+ ret = -ENOSPC;
+ goto error;
+ }
+ /* check for pending inserts here */
+ ret = 0;
+
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 start)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf = NULL;
+ struct btrfs_dev_extent *extent = NULL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid,
+ BTRFS_DEV_EXTENT_KEY);
+ BUG_ON(ret);
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ BUG_ON(found_key.offset > start || found_key.offset +
+ btrfs_dev_extent_length(leaf, extent) < start);
+ ret = 0;
+ } else if (ret == 0) {
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ }
+ BUG_ON(ret);
+
+ if (device->bytes_used > 0)
+ device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+ ret = btrfs_del_item(trans, root, path);
+ BUG_ON(ret);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset, u64 start, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *extent;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ WARN_ON(!device->in_fs_metadata);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*extent));
+ BUG_ON(ret);
+
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_extent);
+ btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
+ btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
+ btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+ write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+ (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
+ BTRFS_UUID_SIZE);
+
+ btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int find_next_chunk(struct btrfs_root *root,
+ u64 objectid, u64 *offset)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_chunk *chunk;
+ struct btrfs_key found_key;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ key.objectid = objectid;
+ key.offset = (u64)-1;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+
+ BUG_ON(ret == 0);
+
+ ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
+ if (ret) {
+ *offset = 0;
+ } else {
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != objectid)
+ *offset = 0;
+ else {
+ chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_chunk);
+ *offset = found_key.offset +
+ btrfs_chunk_length(path->nodes[0], chunk);
+ }
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
+{
+ int ret;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+
+ root = root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+
+ BUG_ON(ret == 0);
+
+ ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
+ BTRFS_DEV_ITEM_KEY);
+ if (ret) {
+ *objectid = 1;
+ } else {
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ *objectid = found_key.offset + 1;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * the device information is stored in the chunk root
+ * the btrfs_device struct should be fully filled in
+ */
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_dev_item *dev_item;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long ptr;
+
+ root = root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*dev_item));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+ btrfs_set_device_id(leaf, dev_item, device->devid);
+ btrfs_set_device_generation(leaf, dev_item, 0);
+ btrfs_set_device_type(leaf, dev_item, device->type);
+ btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+ btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+ btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+ btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+ btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_set_device_group(leaf, dev_item, 0);
+ btrfs_set_device_seek_speed(leaf, dev_item, 0);
+ btrfs_set_device_bandwidth(leaf, dev_item, 0);
+ btrfs_set_device_start_offset(leaf, dev_item, 0);
+
+ ptr = (unsigned long)btrfs_device_uuid(dev_item);
+ write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+ ptr = (unsigned long)btrfs_device_fsid(dev_item);
+ write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_rm_dev_item(struct btrfs_root *root,
+ struct btrfs_device *device)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_trans_handle *trans;
+
+ root = root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 1);
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+ lock_chunks(root);
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+out:
+ btrfs_free_path(path);
+ unlock_chunks(root);
+ btrfs_commit_transaction(trans, root);
+ return ret;
+}
+
+int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+{
+ struct btrfs_device *device;
+ struct btrfs_device *next_device;
+ struct block_device *bdev;
+ struct buffer_head *bh = NULL;
+ struct btrfs_super_block *disk_super;
+ u64 all_avail;
+ u64 devid;
+ u64 num_devices;
+ u8 *dev_uuid;
+ int ret = 0;
+
+ mutex_lock(&uuid_mutex);
+ mutex_lock(&root->fs_info->volume_mutex);
+
+ all_avail = root->fs_info->avail_data_alloc_bits |
+ root->fs_info->avail_system_alloc_bits |
+ root->fs_info->avail_metadata_alloc_bits;
+
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
+ root->fs_info->fs_devices->rw_devices <= 4) {
+ printk(KERN_ERR "btrfs: unable to go below four devices "
+ "on raid10\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
+ root->fs_info->fs_devices->rw_devices <= 2) {
+ printk(KERN_ERR "btrfs: unable to go below two "
+ "devices on raid1\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (strcmp(device_path, "missing") == 0) {
+ struct list_head *cur;
+ struct list_head *devices;
+ struct btrfs_device *tmp;
+
+ device = NULL;
+ devices = &root->fs_info->fs_devices->devices;
+ list_for_each(cur, devices) {
+ tmp = list_entry(cur, struct btrfs_device, dev_list);
+ if (tmp->in_fs_metadata && !tmp->bdev) {
+ device = tmp;
+ break;
+ }
+ }
+ bdev = NULL;
+ bh = NULL;
+ disk_super = NULL;
+ if (!device) {
+ printk(KERN_ERR "btrfs: no missing devices found to "
+ "remove\n");
+ goto out;
+ }
+ } else {
+ bdev = open_bdev_exclusive(device_path, FMODE_READ,
+ root->fs_info->bdev_holder);
+ if (IS_ERR(bdev)) {
+ ret = PTR_ERR(bdev);
+ goto out;
+ }
+
+ set_blocksize(bdev, 4096);
+ bh = btrfs_read_dev_super(bdev);
+ if (!bh) {
+ ret = -EIO;
+ goto error_close;
+ }
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+ devid = le64_to_cpu(disk_super->dev_item.devid);
+ dev_uuid = disk_super->dev_item.uuid;
+ device = btrfs_find_device(root, devid, dev_uuid,
+ disk_super->fsid);
+ if (!device) {
+ ret = -ENOENT;
+ goto error_brelse;
+ }
+ }
+
+ if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
+ printk(KERN_ERR "btrfs: unable to remove the only writeable "
+ "device\n");
+ ret = -EINVAL;
+ goto error_brelse;
+ }
+
+ if (device->writeable) {
+ list_del_init(&device->dev_alloc_list);
+ root->fs_info->fs_devices->rw_devices--;
+ }
+
+ ret = btrfs_shrink_device(device, 0);
+ if (ret)
+ goto error_brelse;
+
+ ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
+ if (ret)
+ goto error_brelse;
+
+ device->in_fs_metadata = 0;
+ list_del_init(&device->dev_list);
+ device->fs_devices->num_devices--;
+
+ next_device = list_entry(root->fs_info->fs_devices->devices.next,
+ struct btrfs_device, dev_list);
+ if (device->bdev == root->fs_info->sb->s_bdev)
+ root->fs_info->sb->s_bdev = next_device->bdev;
+ if (device->bdev == root->fs_info->fs_devices->latest_bdev)
+ root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+
+ if (device->bdev) {
+ close_bdev_exclusive(device->bdev, device->mode);
+ device->bdev = NULL;
+ device->fs_devices->open_devices--;
+ }
+
+ num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+ btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+
+ if (device->fs_devices->open_devices == 0) {
+ struct btrfs_fs_devices *fs_devices;
+ fs_devices = root->fs_info->fs_devices;
+ while (fs_devices) {
+ if (fs_devices->seed == device->fs_devices)
+ break;
+ fs_devices = fs_devices->seed;
+ }
+ fs_devices->seed = device->fs_devices->seed;
+ device->fs_devices->seed = NULL;
+ __btrfs_close_devices(device->fs_devices);
+ free_fs_devices(device->fs_devices);
+ }
+
+ /*
+ * at this point, the device is zero sized. We want to
+ * remove it from the devices list and zero out the old super
+ */
+ if (device->writeable) {
+ /* make sure this device isn't detected as part of
+ * the FS anymore
+ */
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ }
+
+ kfree(device->name);
+ kfree(device);
+ ret = 0;
+
+error_brelse:
+ brelse(bh);
+error_close:
+ if (bdev)
+ close_bdev_exclusive(bdev, FMODE_READ);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
+ mutex_unlock(&uuid_mutex);
+ return ret;
+}
+
+/*
+ * does all the dirty work required for changing file system's UUID.
+ */
+static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *old_devices;
+ struct btrfs_fs_devices *seed_devices;
+ struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_device *device;
+ u64 super_flags;
+
+ BUG_ON(!mutex_is_locked(&uuid_mutex));
+ if (!fs_devices->seeding)
+ return -EINVAL;
+
+ seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+ if (!seed_devices)
+ return -ENOMEM;
+
+ old_devices = clone_fs_devices(fs_devices);
+ if (IS_ERR(old_devices)) {
+ kfree(seed_devices);
+ return PTR_ERR(old_devices);
+ }
+
+ list_add(&old_devices->list, &fs_uuids);
+
+ memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
+ seed_devices->opened = 1;
+ INIT_LIST_HEAD(&seed_devices->devices);
+ INIT_LIST_HEAD(&seed_devices->alloc_list);
+ list_splice_init(&fs_devices->devices, &seed_devices->devices);
+ list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
+ list_for_each_entry(device, &seed_devices->devices, dev_list) {
+ device->fs_devices = seed_devices;
+ }
+
+ fs_devices->seeding = 0;
+ fs_devices->num_devices = 0;
+ fs_devices->open_devices = 0;
+ fs_devices->seed = seed_devices;
+
+ generate_random_uuid(fs_devices->fsid);
+ memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ super_flags = btrfs_super_flags(disk_super) &
+ ~BTRFS_SUPER_FLAG_SEEDING;
+ btrfs_set_super_flags(disk_super, super_flags);
+
+ return 0;
+}
+
+/*
+ * strore the expected generation for seed devices in device items.
+ */
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_dev_item *dev_item;
+ struct btrfs_device *device;
+ struct btrfs_key key;
+ u8 fs_uuid[BTRFS_UUID_SIZE];
+ u8 dev_uuid[BTRFS_UUID_SIZE];
+ u64 devid;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ root = root->fs_info->chunk_root;
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.offset = 0;
+ key.type = BTRFS_DEV_ITEM_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ goto error;
+
+ leaf = path->nodes[0];
+next_slot:
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret > 0)
+ break;
+ if (ret < 0)
+ goto error;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(root, path);
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
+ key.type != BTRFS_DEV_ITEM_KEY)
+ break;
+
+ dev_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_dev_item);
+ devid = btrfs_device_id(leaf, dev_item);
+ read_extent_buffer(leaf, dev_uuid,
+ (unsigned long)btrfs_device_uuid(dev_item),
+ BTRFS_UUID_SIZE);
+ read_extent_buffer(leaf, fs_uuid,
+ (unsigned long)btrfs_device_fsid(dev_item),
+ BTRFS_UUID_SIZE);
+ device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+ BUG_ON(!device);
+
+ if (device->fs_devices->seeding) {
+ btrfs_set_device_generation(leaf, dev_item,
+ device->generation);
+ btrfs_mark_buffer_dirty(leaf);
+ }
+
+ path->slots[0]++;
+ goto next_slot;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_device *device;
+ struct block_device *bdev;
+ struct list_head *cur;
+ struct list_head *devices;
+ struct super_block *sb = root->fs_info->sb;
+ u64 total_bytes;
+ int seeding_dev = 0;
+ int ret = 0;
+
+ if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
+ return -EINVAL;
+
+ bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
+ if (!bdev)
+ return -EIO;
+
+ if (root->fs_info->fs_devices->seeding) {
+ seeding_dev = 1;
+ down_write(&sb->s_umount);
+ mutex_lock(&uuid_mutex);
+ }
+
+ filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ mutex_lock(&root->fs_info->volume_mutex);
+
+ devices = &root->fs_info->fs_devices->devices;
+ list_for_each(cur, devices) {
+ device = list_entry(cur, struct btrfs_device, dev_list);
+ if (device->bdev == bdev) {
+ ret = -EEXIST;
+ goto error;
+ }
+ }
+
+ device = kzalloc(sizeof(*device), GFP_NOFS);
+ if (!device) {
+ /* we can safely leave the fs_devices entry around */
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ device->name = kstrdup(device_path, GFP_NOFS);
+ if (!device->name) {
+ kfree(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = find_next_devid(root, &device->devid);
+ if (ret) {
+ kfree(device);
+ goto error;
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ lock_chunks(root);
+
+ device->barriers = 1;
+ device->writeable = 1;
+ device->work.func = pending_bios_fn;
+ generate_random_uuid(device->uuid);
+ spin_lock_init(&device->io_lock);
+ device->generation = trans->transid;
+ device->io_width = root->sectorsize;
+ device->io_align = root->sectorsize;
+ device->sector_size = root->sectorsize;
+ device->total_bytes = i_size_read(bdev->bd_inode);
+ device->dev_root = root->fs_info->dev_root;
+ device->bdev = bdev;
+ device->in_fs_metadata = 1;
+ device->mode = 0;
+ set_blocksize(device->bdev, 4096);
+
+ if (seeding_dev) {
+ sb->s_flags &= ~MS_RDONLY;
+ ret = btrfs_prepare_sprout(trans, root);
+ BUG_ON(ret);
+ }
+
+ device->fs_devices = root->fs_info->fs_devices;
+ list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+ list_add(&device->dev_alloc_list,
+ &root->fs_info->fs_devices->alloc_list);
+ root->fs_info->fs_devices->num_devices++;
+ root->fs_info->fs_devices->open_devices++;
+ root->fs_info->fs_devices->rw_devices++;
+ root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
+
+ total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
+ btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+ total_bytes + device->total_bytes);
+
+ total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
+ btrfs_set_super_num_devices(&root->fs_info->super_copy,
+ total_bytes + 1);
+
+ if (seeding_dev) {
+ ret = init_first_rw_device(trans, root, device);
+ BUG_ON(ret);
+ ret = btrfs_finish_sprout(trans, root);
+ BUG_ON(ret);
+ } else {
+ ret = btrfs_add_device(trans, root, device);
+ }
+
+ unlock_chunks(root);
+ btrfs_commit_transaction(trans, root);
+
+ if (seeding_dev) {
+ mutex_unlock(&uuid_mutex);
+ up_write(&sb->s_umount);
+
+ ret = btrfs_relocate_sys_chunks(root);
+ BUG_ON(ret);
+ }
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
+ return ret;
+error:
+ close_bdev_exclusive(bdev, 0);
+ if (seeding_dev) {
+ mutex_unlock(&uuid_mutex);
+ up_write(&sb->s_umount);
+ }
+ goto out;
+}
+
+static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ struct btrfs_dev_item *dev_item;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ root = device->dev_root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = device->devid;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+ btrfs_set_device_id(leaf, dev_item, device->devid);
+ btrfs_set_device_type(leaf, dev_item, device->type);
+ btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+ btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+ btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+ btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+ btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+ btrfs_mark_buffer_dirty(leaf);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size)
+{
+ struct btrfs_super_block *super_copy =
+ &device->dev_root->fs_info->super_copy;
+ u64 old_total = btrfs_super_total_bytes(super_copy);
+ u64 diff = new_size - device->total_bytes;
+
+ if (!device->writeable)
+ return -EACCES;
+ if (new_size <= device->total_bytes)
+ return -EINVAL;
+
+ btrfs_set_super_total_bytes(super_copy, old_total + diff);
+ device->fs_devices->total_rw_bytes += diff;
+
+ device->total_bytes = new_size;
+ return btrfs_update_device(trans, device);
+}
+
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size)
+{
+ int ret;
+ lock_chunks(device->dev_root);
+ ret = __btrfs_grow_device(trans, device, new_size);
+ unlock_chunks(device->dev_root);
+ return ret;
+}
+
+static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+
+ root = root->fs_info->chunk_root;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = chunk_objectid;
+ key.offset = chunk_offset;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ BUG_ON(ret);
+
+ ret = btrfs_del_item(trans, root, path);
+ BUG_ON(ret);
+
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+ chunk_offset)
+{
+ struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_disk_key *disk_key;
+ struct btrfs_chunk *chunk;
+ u8 *ptr;
+ int ret = 0;
+ u32 num_stripes;
+ u32 array_size;
+ u32 len = 0;
+ u32 cur;
+ struct btrfs_key key;
+
+ array_size = btrfs_super_sys_array_size(super_copy);
+
+ ptr = super_copy->sys_chunk_array;
+ cur = 0;
+
+ while (cur < array_size) {
+ disk_key = (struct btrfs_disk_key *)ptr;
+ btrfs_disk_key_to_cpu(&key, disk_key);
+
+ len = sizeof(*disk_key);
+
+ if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+ chunk = (struct btrfs_chunk *)(ptr + len);
+ num_stripes = btrfs_stack_chunk_num_stripes(chunk);
+ len += btrfs_chunk_item_size(num_stripes);
+ } else {
+ ret = -EIO;
+ break;
+ }
+ if (key.objectid == chunk_objectid &&
+ key.offset == chunk_offset) {
+ memmove(ptr, ptr + len, array_size - (cur + len));
+ array_size -= len;
+ btrfs_set_super_sys_array_size(super_copy, array_size);
+ } else {
+ ptr += len;
+ cur += len;
+ }
+ }
+ return ret;
+}
+
+static int btrfs_relocate_chunk(struct btrfs_root *root,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree;
+ struct btrfs_root *extent_root;
+ struct btrfs_trans_handle *trans;
+ struct extent_map *em;
+ struct map_lookup *map;
+ int ret;
+ int i;
+
+ printk(KERN_INFO "btrfs relocating chunk %llu\n",
+ (unsigned long long)chunk_offset);
+ root = root->fs_info->chunk_root;
+ extent_root = root->fs_info->extent_root;
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+
+ /* step one, relocate all the extents inside this chunk */
+ ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+ BUG_ON(ret);
+
+ trans = btrfs_start_transaction(root, 1);
+ BUG_ON(!trans);
+
+ lock_chunks(root);
+
+ /*
+ * step two, delete the device extents and the
+ * chunk tree entries
+ */
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ spin_unlock(&em_tree->lock);
+
+ BUG_ON(em->start > chunk_offset ||
+ em->start + em->len < chunk_offset);
+ map = (struct map_lookup *)em->bdev;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
+ map->stripes[i].physical);
+ BUG_ON(ret);
+
+ if (map->stripes[i].dev) {
+ ret = btrfs_update_device(trans, map->stripes[i].dev);
+ BUG_ON(ret);
+ }
+ }
+ ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
+ chunk_offset);
+
+ BUG_ON(ret);
+
+ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
+ BUG_ON(ret);
+ }
+
+ ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+ BUG_ON(ret);
+
+ spin_lock(&em_tree->lock);
+ remove_extent_mapping(em_tree, em);
+ spin_unlock(&em_tree->lock);
+
+ kfree(map);
+ em->bdev = NULL;
+
+ /* once for the tree */
+ free_extent_map(em);
+ /* once for us */
+ free_extent_map(em);
+
+ unlock_chunks(root);
+ btrfs_end_transaction(trans, root);
+ return 0;
+}
+
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+{
+ struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_chunk *chunk;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 chunk_tree = chunk_root->root_key.objectid;
+ u64 chunk_type;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.offset = (u64)-1;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ BUG_ON(ret == 0);
+
+ ret = btrfs_previous_item(chunk_root, path, key.objectid,
+ key.type);
+ if (ret < 0)
+ goto error;
+ if (ret > 0)
+ break;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ chunk = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_chunk);
+ chunk_type = btrfs_chunk_type(leaf, chunk);
+ btrfs_release_path(chunk_root, path);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
+ found_key.objectid,
+ found_key.offset);
+ BUG_ON(ret);
+ }
+
+ if (found_key.offset == 0)
+ break;
+ key.offset = found_key.offset - 1;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static u64 div_factor(u64 num, int factor)
+{
+ if (factor == 10)
+ return num;
+ num *= factor;
+ do_div(num, 10);
+ return num;
+}
+
+int btrfs_balance(struct btrfs_root *dev_root)
+{
+ int ret;
+ struct list_head *cur;
+ struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+ struct btrfs_device *device;
+ u64 old_size;
+ u64 size_to_free;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_chunk *chunk;
+ struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_key found_key;
+
+ if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&dev_root->fs_info->volume_mutex);
+ dev_root = dev_root->fs_info->dev_root;
+
+ /* step one make some room on all the devices */
+ list_for_each(cur, devices) {
+ device = list_entry(cur, struct btrfs_device, dev_list);
+ old_size = device->total_bytes;
+ size_to_free = div_factor(old_size, 1);
+ size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+ if (!device->writeable ||
+ device->total_bytes - device->bytes_used > size_to_free)
+ continue;
+
+ ret = btrfs_shrink_device(device, old_size - size_to_free);
+ BUG_ON(ret);
+
+ trans = btrfs_start_transaction(dev_root, 1);
+ BUG_ON(!trans);
+
+ ret = btrfs_grow_device(trans, device, old_size);
+ BUG_ON(ret);
+
+ btrfs_end_transaction(trans, dev_root);
+ }
+
+ /* step two, relocate all the chunks */
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.offset = (u64)-1;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+
+ /*
+ * this shouldn't happen, it means the last relocate
+ * failed
+ */
+ if (ret == 0)
+ break;
+
+ ret = btrfs_previous_item(chunk_root, path, 0,
+ BTRFS_CHUNK_ITEM_KEY);
+ if (ret)
+ break;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != key.objectid)
+ break;
+
+ chunk = btrfs_item_ptr(path->nodes[0],
+ path->slots[0],
+ struct btrfs_chunk);
+ key.offset = found_key.offset;
+ /* chunk zero is special */
+ if (key.offset == 0)
+ break;
+
+ btrfs_release_path(chunk_root, path);
+ ret = btrfs_relocate_chunk(chunk_root,
+ chunk_root->root_key.objectid,
+ found_key.objectid,
+ found_key.offset);
+ BUG_ON(ret);
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ mutex_unlock(&dev_root->fs_info->volume_mutex);
+ return ret;
+}
+
+/*
+ * shrinking a device means finding all of the device extents past
+ * the new size, and then following the back refs to the chunks.
+ * The chunk relocation code actually frees the device extent
+ */
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_path *path;
+ u64 length;
+ u64 chunk_tree;
+ u64 chunk_objectid;
+ u64 chunk_offset;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ u64 old_total = btrfs_super_total_bytes(super_copy);
+ u64 diff = device->total_bytes - new_size;
+
+ if (new_size >= device->total_bytes)
+ return -EINVAL;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ path->reada = 2;
+
+ lock_chunks(root);
+
+ device->total_bytes = new_size;
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes -= diff;
+ ret = btrfs_update_device(trans, device);
+ if (ret) {
+ unlock_chunks(root);
+ btrfs_end_transaction(trans, root);
+ goto done;
+ }
+ WARN_ON(diff > old_total);
+ btrfs_set_super_total_bytes(super_copy, old_total - diff);
+ unlock_chunks(root);
+ btrfs_end_transaction(trans, root);
+
+ key.objectid = device->devid;
+ key.offset = (u64)-1;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto done;
+
+ ret = btrfs_previous_item(root, path, 0, key.type);
+ if (ret < 0)
+ goto done;
+ if (ret) {
+ ret = 0;
+ goto done;
+ }
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
+ if (key.objectid != device->devid)
+ goto done;
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ length = btrfs_dev_extent_length(l, dev_extent);
+
+ if (key.offset + length <= new_size)
+ goto done;
+
+ chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+ chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+ chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+ btrfs_release_path(root, path);
+
+ ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
+ chunk_offset);
+ if (ret)
+ goto done;
+ }
+
+done:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key,
+ struct btrfs_chunk *chunk, int item_size)
+{
+ struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_disk_key disk_key;
+ u32 array_size;
+ u8 *ptr;
+
+ array_size = btrfs_super_sys_array_size(super_copy);
+ if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ return -EFBIG;
+
+ ptr = super_copy->sys_chunk_array + array_size;
+ btrfs_cpu_key_to_disk(&disk_key, key);
+ memcpy(ptr, &disk_key, sizeof(disk_key));
+ ptr += sizeof(disk_key);
+ memcpy(ptr, chunk, item_size);
+ item_size += sizeof(disk_key);
+ btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+ return 0;
+}
+
+static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
+ int num_stripes, int sub_stripes)
+{
+ if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
+ return calc_size;
+ else if (type & BTRFS_BLOCK_GROUP_RAID10)
+ return calc_size * (num_stripes / sub_stripes);
+ else
+ return calc_size * num_stripes;
+}
+
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct map_lookup **map_ret,
+ u64 *num_bytes, u64 *stripe_size,
+ u64 start, u64 type)
+{
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_device *device = NULL;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct list_head *cur;
+ struct map_lookup *map = NULL;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct list_head private_devs;
+ int min_stripe_size = 1 * 1024 * 1024;
+ u64 calc_size = 1024 * 1024 * 1024;
+ u64 max_chunk_size = calc_size;
+ u64 min_free;
+ u64 avail;
+ u64 max_avail = 0;
+ u64 dev_offset;
+ int num_stripes = 1;
+ int min_stripes = 1;
+ int sub_stripes = 0;
+ int looped = 0;
+ int ret;
+ int index;
+ int stripe_len = 64 * 1024;
+
+ if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+ (type & BTRFS_BLOCK_GROUP_DUP)) {
+ WARN_ON(1);
+ type &= ~BTRFS_BLOCK_GROUP_DUP;
+ }
+ if (list_empty(&fs_devices->alloc_list))
+ return -ENOSPC;
+
+ if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
+ num_stripes = fs_devices->rw_devices;
+ min_stripes = 2;
+ }
+ if (type & (BTRFS_BLOCK_GROUP_DUP)) {
+ num_stripes = 2;
+ min_stripes = 2;
+ }
+ if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
+ num_stripes = min_t(u64, 2, fs_devices->rw_devices);
+ if (num_stripes < 2)
+ return -ENOSPC;
+ min_stripes = 2;
+ }
+ if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+ num_stripes = fs_devices->rw_devices;
+ if (num_stripes < 4)
+ return -ENOSPC;
+ num_stripes &= ~(u32)1;
+ sub_stripes = 2;
+ min_stripes = 4;
+ }
+
+ if (type & BTRFS_BLOCK_GROUP_DATA) {
+ max_chunk_size = 10 * calc_size;
+ min_stripe_size = 64 * 1024 * 1024;
+ } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+ max_chunk_size = 4 * calc_size;
+ min_stripe_size = 32 * 1024 * 1024;
+ } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ calc_size = 8 * 1024 * 1024;
+ max_chunk_size = calc_size * 2;
+ min_stripe_size = 1 * 1024 * 1024;
+ }
+
+ /* we don't want a chunk larger than 10% of writeable space */
+ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+ max_chunk_size);
+
+again:
+ if (!map || map->num_stripes != num_stripes) {
+ kfree(map);
+ map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+ if (!map)
+ return -ENOMEM;
+ map->num_stripes = num_stripes;
+ }
+
+ if (calc_size * num_stripes > max_chunk_size) {
+ calc_size = max_chunk_size;
+ do_div(calc_size, num_stripes);
+ do_div(calc_size, stripe_len);
+ calc_size *= stripe_len;
+ }
+ /* we don't want tiny stripes */
+ calc_size = max_t(u64, min_stripe_size, calc_size);
+
+ do_div(calc_size, stripe_len);
+ calc_size *= stripe_len;
+
+ cur = fs_devices->alloc_list.next;
+ index = 0;
+
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_free = calc_size * 2;
+ else
+ min_free = calc_size;
+
+ /*
+ * we add 1MB because we never use the first 1MB of the device, unless
+ * we've looped, then we are likely allocating the maximum amount of
+ * space left already
+ */
+ if (!looped)
+ min_free += 1024 * 1024;
+
+ INIT_LIST_HEAD(&private_devs);
+ while (index < num_stripes) {
+ device = list_entry(cur, struct btrfs_device, dev_alloc_list);
+ BUG_ON(!device->writeable);
+ if (device->total_bytes > device->bytes_used)
+ avail = device->total_bytes - device->bytes_used;
+ else
+ avail = 0;
+ cur = cur->next;
+
+ if (device->in_fs_metadata && avail >= min_free) {
+ ret = find_free_dev_extent(trans, device,
+ min_free, &dev_offset);
+ if (ret == 0) {
+ list_move_tail(&device->dev_alloc_list,
+ &private_devs);
+ map->stripes[index].dev = device;
+ map->stripes[index].physical = dev_offset;
+ index++;
+ if (type & BTRFS_BLOCK_GROUP_DUP) {
+ map->stripes[index].dev = device;
+ map->stripes[index].physical =
+ dev_offset + calc_size;
+ index++;
+ }
+ }
+ } else if (device->in_fs_metadata && avail > max_avail)
+ max_avail = avail;
+ if (cur == &fs_devices->alloc_list)
+ break;
+ }
+ list_splice(&private_devs, &fs_devices->alloc_list);
+ if (index < num_stripes) {
+ if (index >= min_stripes) {
+ num_stripes = index;
+ if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+ num_stripes /= sub_stripes;
+ num_stripes *= sub_stripes;
+ }
+ looped = 1;
+ goto again;
+ }
+ if (!looped && max_avail > 0) {
+ looped = 1;
+ calc_size = max_avail;
+ goto again;
+ }
+ kfree(map);
+ return -ENOSPC;
+ }
+ map->sector_size = extent_root->sectorsize;
+ map->stripe_len = stripe_len;
+ map->io_align = stripe_len;
+ map->io_width = stripe_len;
+ map->type = type;
+ map->num_stripes = num_stripes;
+ map->sub_stripes = sub_stripes;
+
+ *map_ret = map;
+ *stripe_size = calc_size;
+ *num_bytes = chunk_bytes_by_type(type, calc_size,
+ num_stripes, sub_stripes);
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ em->bdev = (struct block_device *)map;
+ em->start = start;
+ em->len = *num_bytes;
+ em->block_start = 0;
+ em->block_len = em->len;
+
+ em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+ spin_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ spin_unlock(&em_tree->lock);
+ BUG_ON(ret);
+ free_extent_map(em);
+
+ ret = btrfs_make_block_group(trans, extent_root, 0, type,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ start, *num_bytes);
+ BUG_ON(ret);
+
+ index = 0;
+ while (index < map->num_stripes) {
+ device = map->stripes[index].dev;
+ dev_offset = map->stripes[index].physical;
+
+ ret = btrfs_alloc_dev_extent(trans, device,
+ info->chunk_root->root_key.objectid,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ start, dev_offset, calc_size);
+ BUG_ON(ret);
+ index++;
+ }
+
+ return 0;
+}
+
+static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct map_lookup *map, u64 chunk_offset,
+ u64 chunk_size, u64 stripe_size)
+{
+ u64 dev_offset;
+ struct btrfs_key key;
+ struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+ struct btrfs_device *device;
+ struct btrfs_chunk *chunk;
+ struct btrfs_stripe *stripe;
+ size_t item_size = btrfs_chunk_item_size(map->num_stripes);
+ int index = 0;
+ int ret;
+
+ chunk = kzalloc(item_size, GFP_NOFS);
+ if (!chunk)
+ return -ENOMEM;
+
+ index = 0;
+ while (index < map->num_stripes) {
+ device = map->stripes[index].dev;
+ device->bytes_used += stripe_size;
+ ret = btrfs_update_device(trans, device);
+ BUG_ON(ret);
+ index++;
+ }
+
+ index = 0;
+ stripe = &chunk->stripe;
+ while (index < map->num_stripes) {
+ device = map->stripes[index].dev;
+ dev_offset = map->stripes[index].physical;
+
+ btrfs_set_stack_stripe_devid(stripe, device->devid);
+ btrfs_set_stack_stripe_offset(stripe, dev_offset);
+ memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
+ stripe++;
+ index++;
+ }
+
+ btrfs_set_stack_chunk_length(chunk, chunk_size);
+ btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+ btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_type(chunk, map->type);
+ btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
+ btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+ btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = chunk_offset;
+
+ ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+ BUG_ON(ret);
+
+ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+ item_size);
+ BUG_ON(ret);
+ }
+ kfree(chunk);
+ return 0;
+}
+
+/*
+ * Chunk allocation falls into two parts. The first part does works
+ * that make the new allocated chunk useable, but not do any operation
+ * that modifies the chunk tree. The second part does the works that
+ * require modifying the chunk tree. This division is important for the
+ * bootstrap process of adding storage to a seed btrfs.
+ */
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 type)
+{
+ u64 chunk_offset;
+ u64 chunk_size;
+ u64 stripe_size;
+ struct map_lookup *map;
+ struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+ int ret;
+
+ ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ &chunk_offset);
+ if (ret)
+ return ret;
+
+ ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
+ &stripe_size, chunk_offset, type);
+ if (ret)
+ return ret;
+
+ ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
+ chunk_size, stripe_size);
+ BUG_ON(ret);
+ return 0;
+}
+
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device)
+{
+ u64 chunk_offset;
+ u64 sys_chunk_offset;
+ u64 chunk_size;
+ u64 sys_chunk_size;
+ u64 stripe_size;
+ u64 sys_stripe_size;
+ u64 alloc_profile;
+ struct map_lookup *map;
+ struct map_lookup *sys_map;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ int ret;
+
+ ret = find_next_chunk(fs_info->chunk_root,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
+ BUG_ON(ret);
+
+ alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
+ (fs_info->metadata_alloc_profile &
+ fs_info->avail_metadata_alloc_bits);
+ alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+
+ ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
+ &stripe_size, chunk_offset, alloc_profile);
+ BUG_ON(ret);
+
+ sys_chunk_offset = chunk_offset + chunk_size;
+
+ alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
+ (fs_info->system_alloc_profile &
+ fs_info->avail_system_alloc_bits);
+ alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+
+ ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
+ &sys_chunk_size, &sys_stripe_size,
+ sys_chunk_offset, alloc_profile);
+ BUG_ON(ret);
+
+ ret = btrfs_add_device(trans, fs_info->chunk_root, device);
+ BUG_ON(ret);
+
+ /*
+ * Modifying chunk tree needs allocating new blocks from both
+ * system block group and metadata block group. So we only can
+ * do operations require modifying the chunk tree after both
+ * block groups were created.
+ */
+ ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
+ chunk_size, stripe_size);
+ BUG_ON(ret);
+
+ ret = __finish_chunk_alloc(trans, extent_root, sys_map,
+ sys_chunk_offset, sys_chunk_size,
+ sys_stripe_size);
+ BUG_ON(ret);
+ return 0;
+}
+
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ int readonly = 0;
+ int i;
+
+ spin_lock(&map_tree->map_tree.lock);
+ em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+ spin_unlock(&map_tree->map_tree.lock);
+ if (!em)
+ return 1;
+
+ map = (struct map_lookup *)em->bdev;
+ for (i = 0; i < map->num_stripes; i++) {
+ if (!map->stripes[i].dev->writeable) {
+ readonly = 1;
+ break;
+ }
+ }
+ free_extent_map(em);
+ return readonly;
+}
+
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
+{
+ extent_map_tree_init(&tree->map_tree, GFP_NOFS);
+}
+
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+{
+ struct extent_map *em;
+
+ while (1) {
+ spin_lock(&tree->map_tree.lock);
+ em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
+ if (em)
+ remove_extent_mapping(&tree->map_tree, em);
+ spin_unlock(&tree->map_tree.lock);
+ if (!em)
+ break;
+ kfree(em->bdev);
+ /* once for us */
+ free_extent_map(em);
+ /* once for the tree */
+ free_extent_map(em);
+ }
+}
+
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ int ret;
+
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, len);
+ spin_unlock(&em_tree->lock);
+ BUG_ON(!em);
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
+ ret = map->num_stripes;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ ret = map->sub_stripes;
+ else
+ ret = 1;
+ free_extent_map(em);
+ return ret;
+}
+
+static int find_live_mirror(struct map_lookup *map, int first, int num,
+ int optimal)
+{
+ int i;
+ if (map->stripes[optimal].dev->bdev)
+ return optimal;
+ for (i = first; i < first + num; i++) {
+ if (map->stripes[i].dev->bdev)
+ return i;
+ }
+ /* we couldn't find one that doesn't fail. Just return something
+ * and the io error handling code will clean up eventually
+ */
+ return optimal;
+}
+
+static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_multi_bio **multi_ret,
+ int mirror_num, struct page *unplug_page)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ u64 offset;
+ u64 stripe_offset;
+ u64 stripe_nr;
+ int stripes_allocated = 8;
+ int stripes_required = 1;
+ int stripe_index;
+ int i;
+ int num_stripes;
+ int max_errors = 0;
+ struct btrfs_multi_bio *multi = NULL;
+
+ if (multi_ret && !(rw & (1 << BIO_RW)))
+ stripes_allocated = 1;
+again:
+ if (multi_ret) {
+ multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
+ GFP_NOFS);
+ if (!multi)
+ return -ENOMEM;
+
+ atomic_set(&multi->error, 0);
+ }
+
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, *length);
+ spin_unlock(&em_tree->lock);
+
+ if (!em && unplug_page)
+ return 0;
+
+ if (!em) {
+ printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+ (unsigned long long)logical,
+ (unsigned long long)*length);
+ BUG();
+ }
+
+ BUG_ON(em->start > logical || em->start + em->len < logical);
+ map = (struct map_lookup *)em->bdev;
+ offset = logical - em->start;
+
+ if (mirror_num > map->num_stripes)
+ mirror_num = 0;
+
+ /* if our multi bio struct is too small, back off and try again */
+ if (rw & (1 << BIO_RW)) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ stripes_required = map->num_stripes;
+ max_errors = 1;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ stripes_required = map->sub_stripes;
+ max_errors = 1;
+ }
+ }
+ if (multi_ret && rw == WRITE &&
+ stripes_allocated < stripes_required) {
+ stripes_allocated = map->num_stripes;
+ free_extent_map(em);
+ kfree(multi);
+ goto again;
+ }
+ stripe_nr = offset;
+ /*
+ * stripe_nr counts the total number of stripes we have to stride
+ * to get to this block
+ */
+ do_div(stripe_nr, map->stripe_len);
+
+ stripe_offset = stripe_nr * map->stripe_len;
+ BUG_ON(offset < stripe_offset);
+
+ /* stripe_offset is the offset of this block in its stripe*/
+ stripe_offset = offset - stripe_offset;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ /* we limit the length of each bio to what fits in a stripe */
+ *length = min_t(u64, em->len - offset,
+ map->stripe_len - stripe_offset);
+ } else {
+ *length = em->len - offset;
+ }
+
+ if (!multi_ret && !unplug_page)
+ goto out;
+
+ num_stripes = 1;
+ stripe_index = 0;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+ if (unplug_page || (rw & (1 << BIO_RW)))
+ num_stripes = map->num_stripes;
+ else if (mirror_num)
+ stripe_index = mirror_num - 1;
+ else {
+ stripe_index = find_live_mirror(map, 0,
+ map->num_stripes,
+ current->pid % map->num_stripes);
+ }
+
+ } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+ if (rw & (1 << BIO_RW))
+ num_stripes = map->num_stripes;
+ else if (mirror_num)
+ stripe_index = mirror_num - 1;
+
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ int factor = map->num_stripes / map->sub_stripes;
+
+ stripe_index = do_div(stripe_nr, factor);
+ stripe_index *= map->sub_stripes;
+
+ if (unplug_page || (rw & (1 << BIO_RW)))
+ num_stripes = map->sub_stripes;
+ else if (mirror_num)
+ stripe_index += mirror_num - 1;
+ else {
+ stripe_index = find_live_mirror(map, stripe_index,
+ map->sub_stripes, stripe_index +
+ current->pid % map->sub_stripes);
+ }
+ } else {
+ /*
+ * after this do_div call, stripe_nr is the number of stripes
+ * on this device we have to walk to find the data, and
+ * stripe_index is the number of our device in the stripe array
+ */
+ stripe_index = do_div(stripe_nr, map->num_stripes);
+ }
+ BUG_ON(stripe_index >= map->num_stripes);
+
+ for (i = 0; i < num_stripes; i++) {
+ if (unplug_page) {
+ struct btrfs_device *device;
+ struct backing_dev_info *bdi;
+
+ device = map->stripes[stripe_index].dev;
+ if (device->bdev) {
+ bdi = blk_get_backing_dev_info(device->bdev);
+ if (bdi->unplug_io_fn)
+ bdi->unplug_io_fn(bdi, unplug_page);
+ }
+ } else {
+ multi->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+ multi->stripes[i].dev = map->stripes[stripe_index].dev;
+ }
+ stripe_index++;
+ }
+ if (multi_ret) {
+ *multi_ret = multi;
+ multi->num_stripes = num_stripes;
+ multi->max_errors = max_errors;
+ }
+out:
+ free_extent_map(em);
+ return 0;
+}
+
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_multi_bio **multi_ret, int mirror_num)
+{
+ return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
+ mirror_num, NULL);
+}
+
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+ u64 chunk_start, u64 physical, u64 devid,
+ u64 **logical, int *naddrs, int *stripe_len)
+{
+ struct extent_map_tree *em_tree = &map_tree->map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 *buf;
+ u64 bytenr;
+ u64 length;
+ u64 stripe_nr;
+ int i, j, nr = 0;
+
+ spin_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_start, 1);
+ spin_unlock(&em_tree->lock);
+
+ BUG_ON(!em || em->start != chunk_start);
+ map = (struct map_lookup *)em->bdev;
+
+ length = em->len;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ do_div(length, map->num_stripes / map->sub_stripes);
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ do_div(length, map->num_stripes);
+
+ buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+ BUG_ON(!buf);
+
+ for (i = 0; i < map->num_stripes; i++) {
+ if (devid && map->stripes[i].dev->devid != devid)
+ continue;
+ if (map->stripes[i].physical > physical ||
+ map->stripes[i].physical + length <= physical)
+ continue;
+
+ stripe_nr = physical - map->stripes[i].physical;
+ do_div(stripe_nr, map->stripe_len);
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ do_div(stripe_nr, map->sub_stripes);
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ }
+ bytenr = chunk_start + stripe_nr * map->stripe_len;
+ WARN_ON(nr >= map->num_stripes);
+ for (j = 0; j < nr; j++) {
+ if (buf[j] == bytenr)
+ break;
+ }
+ if (j == nr) {
+ WARN_ON(nr >= map->num_stripes);
+ buf[nr++] = bytenr;
+ }
+ }
+
+ for (i = 0; i > nr; i++) {
+ struct btrfs_multi_bio *multi;
+ struct btrfs_bio_stripe *stripe;
+ int ret;
+
+ length = 1;
+ ret = btrfs_map_block(map_tree, WRITE, buf[i],
+ &length, &multi, 0);
+ BUG_ON(ret);
+
+ stripe = multi->stripes;
+ for (j = 0; j < multi->num_stripes; j++) {
+ if (stripe->physical >= physical &&
+ physical < stripe->physical + length)
+ break;
+ }
+ BUG_ON(j >= multi->num_stripes);
+ kfree(multi);
+ }
+
+ *logical = buf;
+ *naddrs = nr;
+ *stripe_len = map->stripe_len;
+
+ free_extent_map(em);
+ return 0;
+}
+
+int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
+ u64 logical, struct page *page)
+{
+ u64 length = PAGE_CACHE_SIZE;
+ return __btrfs_map_block(map_tree, READ, logical, &length,
+ NULL, 0, page);
+}
+
+static void end_bio_multi_stripe(struct bio *bio, int err)
+{
+ struct btrfs_multi_bio *multi = bio->bi_private;
+ int is_orig_bio = 0;
+
+ if (err)
+ atomic_inc(&multi->error);
+
+ if (bio == multi->orig_bio)
+ is_orig_bio = 1;
+
+ if (atomic_dec_and_test(&multi->stripes_pending)) {
+ if (!is_orig_bio) {
+ bio_put(bio);
+ bio = multi->orig_bio;
+ }
+ bio->bi_private = multi->private;
+ bio->bi_end_io = multi->end_io;
+ /* only send an error to the higher layers if it is
+ * beyond the tolerance of the multi-bio
+ */
+ if (atomic_read(&multi->error) > multi->max_errors) {
+ err = -EIO;
+ } else if (err) {
+ /*
+ * this bio is actually up to date, we didn't
+ * go over the max number of errors
+ */
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ err = 0;
+ }
+ kfree(multi);
+
+ bio_endio(bio, err);
+ } else if (!is_orig_bio) {
+ bio_put(bio);
+ }
+}
+
+struct async_sched {
+ struct bio *bio;
+ int rw;
+ struct btrfs_fs_info *info;
+ struct btrfs_work work;
+};
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+static noinline int schedule_bio(struct btrfs_root *root,
+ struct btrfs_device *device,
+ int rw, struct bio *bio)
+{
+ int should_queue = 1;
+
+ /* don't bother with additional async steps for reads, right now */
+ if (!(rw & (1 << BIO_RW))) {
+ bio_get(bio);
+ submit_bio(rw, bio);
+ bio_put(bio);
+ return 0;
+ }
+
+ /*
+ * nr_async_bios allows us to reliably return congestion to the
+ * higher layers. Otherwise, the async bio makes it appear we have
+ * made progress against dirty pages when we've really just put it
+ * on a queue for later
+ */
+ atomic_inc(&root->fs_info->nr_async_bios);
+ WARN_ON(bio->bi_next);
+ bio->bi_next = NULL;
+ bio->bi_rw |= rw;
+
+ spin_lock(&device->io_lock);
+
+ if (device->pending_bio_tail)
+ device->pending_bio_tail->bi_next = bio;
+
+ device->pending_bio_tail = bio;
+ if (!device->pending_bios)
+ device->pending_bios = bio;
+ if (device->running_pending)
+ should_queue = 0;
+
+ spin_unlock(&device->io_lock);
+
+ if (should_queue)
+ btrfs_queue_worker(&root->fs_info->submit_workers,
+ &device->work);
+ return 0;
+}
+
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+ int mirror_num, int async_submit)
+{
+ struct btrfs_mapping_tree *map_tree;
+ struct btrfs_device *dev;
+ struct bio *first_bio = bio;
+ u64 logical = (u64)bio->bi_sector << 9;
+ u64 length = 0;
+ u64 map_length;
+ struct btrfs_multi_bio *multi = NULL;
+ int ret;
+ int dev_nr = 0;
+ int total_devs = 1;
+
+ length = bio->bi_size;
+ map_tree = &root->fs_info->mapping_tree;
+ map_length = length;
+
+ ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+ mirror_num);
+ BUG_ON(ret);
+
+ total_devs = multi->num_stripes;
+ if (map_length < length) {
+ printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+ "len %llu\n", (unsigned long long)logical,
+ (unsigned long long)length,
+ (unsigned long long)map_length);
+ BUG();
+ }
+ multi->end_io = first_bio->bi_end_io;
+ multi->private = first_bio->bi_private;
+ multi->orig_bio = first_bio;
+ atomic_set(&multi->stripes_pending, multi->num_stripes);
+
+ while (dev_nr < total_devs) {
+ if (total_devs > 1) {
+ if (dev_nr < total_devs - 1) {
+ bio = bio_clone(first_bio, GFP_NOFS);
+ BUG_ON(!bio);
+ } else {
+ bio = first_bio;
+ }
+ bio->bi_private = multi;
+ bio->bi_end_io = end_bio_multi_stripe;
+ }
+ bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
+ dev = multi->stripes[dev_nr].dev;
+ BUG_ON(rw == WRITE && !dev->writeable);
+ if (dev && dev->bdev) {
+ bio->bi_bdev = dev->bdev;
+ if (async_submit)
+ schedule_bio(root, dev, rw, bio);
+ else
+ submit_bio(rw, bio);
+ } else {
+ bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
+ bio->bi_sector = logical >> 9;
+ bio_endio(bio, -EIO);
+ }
+ dev_nr++;
+ }
+ if (total_devs == 1)
+ kfree(multi);
+ return 0;
+}
+
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+ u8 *uuid, u8 *fsid)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *cur_devices;
+
+ cur_devices = root->fs_info->fs_devices;
+ while (cur_devices) {
+ if (!fsid ||
+ !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+ device = __find_device(&cur_devices->devices,
+ devid, uuid);
+ if (device)
+ return device;
+ }
+ cur_devices = cur_devices->seed;
+ }
+ return NULL;
+}
+
+static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
+ u64 devid, u8 *dev_uuid)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+
+ device = kzalloc(sizeof(*device), GFP_NOFS);
+ if (!device)
+ return NULL;
+ list_add(&device->dev_list,
+ &fs_devices->devices);
+ device->barriers = 1;
+ device->dev_root = root->fs_info->dev_root;
+ device->devid = devid;
+ device->work.func = pending_bios_fn;
+ device->fs_devices = fs_devices;
+ fs_devices->num_devices++;
+ spin_lock_init(&device->io_lock);
+ INIT_LIST_HEAD(&device->dev_alloc_list);
+ memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
+ return device;
+}
+
+static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk)
+{
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ struct map_lookup *map;
+ struct extent_map *em;
+ u64 logical;
+ u64 length;
+ u64 devid;
+ u8 uuid[BTRFS_UUID_SIZE];
+ int num_stripes;
+ int ret;
+ int i;
+
+ logical = key->offset;
+ length = btrfs_chunk_length(leaf, chunk);
+
+ spin_lock(&map_tree->map_tree.lock);
+ em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+ spin_unlock(&map_tree->map_tree.lock);
+
+ /* already mapped? */
+ if (em && em->start <= logical && em->start + em->len > logical) {
+ free_extent_map(em);
+ return 0;
+ } else if (em) {
+ free_extent_map(em);
+ }
+
+ map = kzalloc(sizeof(*map), GFP_NOFS);
+ if (!map)
+ return -ENOMEM;
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em)
+ return -ENOMEM;
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+ if (!map) {
+ free_extent_map(em);
+ return -ENOMEM;
+ }
+
+ em->bdev = (struct block_device *)map;
+ em->start = logical;
+ em->len = length;
+ em->block_start = 0;
+ em->block_len = em->len;
+
+ map->num_stripes = num_stripes;
+ map->io_width = btrfs_chunk_io_width(leaf, chunk);
+ map->io_align = btrfs_chunk_io_align(leaf, chunk);
+ map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
+ map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ map->type = btrfs_chunk_type(leaf, chunk);
+ map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ for (i = 0; i < num_stripes; i++) {
+ map->stripes[i].physical =
+ btrfs_stripe_offset_nr(leaf, chunk, i);
+ devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+ read_extent_buffer(leaf, uuid, (unsigned long)
+ btrfs_stripe_dev_uuid_nr(chunk, i),
+ BTRFS_UUID_SIZE);
+ map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
+ NULL);
+ if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
+ kfree(map);
+ free_extent_map(em);
+ return -EIO;
+ }
+ if (!map->stripes[i].dev) {
+ map->stripes[i].dev =
+ add_missing_dev(root, devid, uuid);
+ if (!map->stripes[i].dev) {
+ kfree(map);
+ free_extent_map(em);
+ return -EIO;
+ }
+ }
+ map->stripes[i].dev->in_fs_metadata = 1;
+ }
+
+ spin_lock(&map_tree->map_tree.lock);
+ ret = add_extent_mapping(&map_tree->map_tree, em);
+ spin_unlock(&map_tree->map_tree.lock);
+ BUG_ON(ret);
+ free_extent_map(em);
+
+ return 0;
+}
+
+static int fill_device_from_item(struct extent_buffer *leaf,
+ struct btrfs_dev_item *dev_item,
+ struct btrfs_device *device)
+{
+ unsigned long ptr;
+
+ device->devid = btrfs_device_id(leaf, dev_item);
+ device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+ device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+ device->type = btrfs_device_type(leaf, dev_item);
+ device->io_align = btrfs_device_io_align(leaf, dev_item);
+ device->io_width = btrfs_device_io_width(leaf, dev_item);
+ device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+
+ ptr = (unsigned long)btrfs_device_uuid(dev_item);
+ read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+
+ return 0;
+}
+
+static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
+{
+ struct btrfs_fs_devices *fs_devices;
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+
+ fs_devices = root->fs_info->fs_devices->seed;
+ while (fs_devices) {
+ if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+ ret = 0;
+ goto out;
+ }
+ fs_devices = fs_devices->seed;
+ }
+
+ fs_devices = find_fsid(fsid);
+ if (!fs_devices) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ fs_devices = clone_fs_devices(fs_devices);
+ if (IS_ERR(fs_devices)) {
+ ret = PTR_ERR(fs_devices);
+ goto out;
+ }
+
+ ret = __btrfs_open_devices(fs_devices, FMODE_READ,
+ root->fs_info->bdev_holder);
+ if (ret)
+ goto out;
+
+ if (!fs_devices->seeding) {
+ __btrfs_close_devices(fs_devices);
+ free_fs_devices(fs_devices);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ fs_devices->seed = root->fs_info->fs_devices->seed;
+ root->fs_info->fs_devices->seed = fs_devices;
+out:
+ mutex_unlock(&uuid_mutex);
+ return ret;
+}
+
+static int read_one_dev(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_dev_item *dev_item)
+{
+ struct btrfs_device *device;
+ u64 devid;
+ int ret;
+ u8 fs_uuid[BTRFS_UUID_SIZE];
+ u8 dev_uuid[BTRFS_UUID_SIZE];
+
+ devid = btrfs_device_id(leaf, dev_item);
+ read_extent_buffer(leaf, dev_uuid,
+ (unsigned long)btrfs_device_uuid(dev_item),
+ BTRFS_UUID_SIZE);
+ read_extent_buffer(leaf, fs_uuid,
+ (unsigned long)btrfs_device_fsid(dev_item),
+ BTRFS_UUID_SIZE);
+
+ if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
+ ret = open_seed_devices(root, fs_uuid);
+ if (ret && !btrfs_test_opt(root, DEGRADED))
+ return ret;
+ }
+
+ device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+ if (!device || !device->bdev) {
+ if (!btrfs_test_opt(root, DEGRADED))
+ return -EIO;
+
+ if (!device) {
+ printk(KERN_WARNING "warning devid %llu missing\n",
+ (unsigned long long)devid);
+ device = add_missing_dev(root, devid, dev_uuid);
+ if (!device)
+ return -ENOMEM;
+ }
+ }
+
+ if (device->fs_devices != root->fs_info->fs_devices) {
+ BUG_ON(device->writeable);
+ if (device->generation !=
+ btrfs_device_generation(leaf, dev_item))
+ return -EINVAL;
+ }
+
+ fill_device_from_item(leaf, dev_item, device);
+ device->dev_root = root->fs_info->dev_root;
+ device->in_fs_metadata = 1;
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += device->total_bytes;
+ ret = 0;
+ return ret;
+}
+
+int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
+{
+ struct btrfs_dev_item *dev_item;
+
+ dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
+ dev_item);
+ return read_one_dev(root, buf, dev_item);
+}
+
+int btrfs_read_sys_array(struct btrfs_root *root)
+{
+ struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct extent_buffer *sb;
+ struct btrfs_disk_key *disk_key;
+ struct btrfs_chunk *chunk;
+ u8 *ptr;
+ unsigned long sb_ptr;
+ int ret = 0;
+ u32 num_stripes;
+ u32 array_size;
+ u32 len = 0;
+ u32 cur;
+ struct btrfs_key key;
+
+ sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
+ BTRFS_SUPER_INFO_SIZE);
+ if (!sb)
+ return -ENOMEM;
+ btrfs_set_buffer_uptodate(sb);
+ write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
+ array_size = btrfs_super_sys_array_size(super_copy);
+
+ ptr = super_copy->sys_chunk_array;
+ sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
+ cur = 0;
+
+ while (cur < array_size) {
+ disk_key = (struct btrfs_disk_key *)ptr;
+ btrfs_disk_key_to_cpu(&key, disk_key);
+
+ len = sizeof(*disk_key); ptr += len;
+ sb_ptr += len;
+ cur += len;
+
+ if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+ chunk = (struct btrfs_chunk *)sb_ptr;
+ ret = read_one_chunk(root, &key, sb, chunk);
+ if (ret)
+ break;
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ len = btrfs_chunk_item_size(num_stripes);
+ } else {
+ ret = -EIO;
+ break;
+ }
+ ptr += len;
+ sb_ptr += len;
+ cur += len;
+ }
+ free_extent_buffer(sb);
+ return ret;
+}
+
+int btrfs_read_chunk_tree(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int ret;
+ int slot;
+
+ root = root->fs_info->chunk_root;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* first we search for all of the device items, and then we
+ * read in all of the chunk items. This way we can create chunk
+ * mappings that reference all of the devices that are afound
+ */
+ key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+ key.offset = 0;
+ key.type = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto error;
+ break;
+ }
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+ if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
+ break;
+ if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+ struct btrfs_dev_item *dev_item;
+ dev_item = btrfs_item_ptr(leaf, slot,
+ struct btrfs_dev_item);
+ ret = read_one_dev(root, leaf, dev_item);
+ if (ret)
+ goto error;
+ }
+ } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+ struct btrfs_chunk *chunk;
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ ret = read_one_chunk(root, &found_key, leaf, chunk);
+ if (ret)
+ goto error;
+ }
+ path->slots[0]++;
+ }
+ if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+ key.objectid = 0;
+ btrfs_release_path(root, path);
+ goto again;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 00000000000..86c44e9ae11
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_VOLUMES_
+#define __BTRFS_VOLUMES_
+
+#include <linux/bio.h>
+#include "async-thread.h"
+
+struct buffer_head;
+struct btrfs_device {
+ struct list_head dev_list;
+ struct list_head dev_alloc_list;
+ struct btrfs_fs_devices *fs_devices;
+ struct btrfs_root *dev_root;
+ struct bio *pending_bios;
+ struct bio *pending_bio_tail;
+ int running_pending;
+ u64 generation;
+
+ int barriers;
+ int writeable;
+ int in_fs_metadata;
+
+ spinlock_t io_lock;
+
+ struct block_device *bdev;
+
+ /* the mode sent to open_bdev_exclusive */
+ fmode_t mode;
+
+ char *name;
+
+ /* the internal btrfs device id */
+ u64 devid;
+
+ /* size of the device */
+ u64 total_bytes;
+
+ /* bytes used */
+ u64 bytes_used;
+
+ /* optimal io alignment for this device */
+ u32 io_align;
+
+ /* optimal io width for this device */
+ u32 io_width;
+
+ /* minimal io size for this device */
+ u32 sector_size;
+
+ /* type and info about this device */
+ u64 type;
+
+ /* physical drive uuid (or lvm uuid) */
+ u8 uuid[BTRFS_UUID_SIZE];
+
+ struct btrfs_work work;
+};
+
+struct btrfs_fs_devices {
+ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+
+ /* the device with this id has the most recent coyp of the super */
+ u64 latest_devid;
+ u64 latest_trans;
+ u64 num_devices;
+ u64 open_devices;
+ u64 rw_devices;
+ u64 total_rw_bytes;
+ struct block_device *latest_bdev;
+ /* all of the devices in the FS */
+ struct list_head devices;
+
+ /* devices not currently being allocated */
+ struct list_head alloc_list;
+ struct list_head list;
+
+ struct btrfs_fs_devices *seed;
+ int seeding;
+
+ int opened;
+};
+
+struct btrfs_bio_stripe {
+ struct btrfs_device *dev;
+ u64 physical;
+};
+
+struct btrfs_multi_bio {
+ atomic_t stripes_pending;
+ bio_end_io_t *end_io;
+ struct bio *orig_bio;
+ void *private;
+ atomic_t error;
+ int max_errors;
+ int num_stripes;
+ struct btrfs_bio_stripe stripes[];
+};
+
+#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
+ (sizeof(struct btrfs_bio_stripe) * (n)))
+
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device,
+ u64 chunk_tree, u64 chunk_objectid,
+ u64 chunk_offset, u64 start, u64 num_bytes);
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+ u64 logical, u64 *length,
+ struct btrfs_multi_bio **multi_ret, int mirror_num);
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+ u64 chunk_start, u64 physical, u64 devid,
+ u64 **logical, int *naddrs, int *stripe_len);
+int btrfs_read_sys_array(struct btrfs_root *root);
+int btrfs_read_chunk_tree(struct btrfs_root *root);
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root, u64 type);
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+ int mirror_num, int async_submit);
+int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+ fmode_t flags, void *holder);
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
+ struct btrfs_fs_devices **fs_devices_ret);
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_device *device);
+int btrfs_rm_device(struct btrfs_root *root, char *device_path);
+int btrfs_cleanup_fs_uuids(void);
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
+int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
+ u64 logical, struct page *page);
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size);
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+ u8 *uuid, u8 *fsid);
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
+int btrfs_init_new_device(struct btrfs_root *root, char *path);
+int btrfs_balance(struct btrfs_root *dev_root);
+void btrfs_unlock_volumes(void);
+void btrfs_lock_volumes(void);
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 00000000000..7f332e27089
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "xattr.h"
+#include "disk-io.h"
+
+
+ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret = 0;
+ unsigned long data_ptr;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* lookup the xattr by name */
+ di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
+ strlen(name), 0);
+ if (!di || IS_ERR(di)) {
+ ret = -ENODATA;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ /* if size is 0, that means we want the size of the attr */
+ if (!size) {
+ ret = btrfs_dir_data_len(leaf, di);
+ goto out;
+ }
+
+ /* now get the data out of our dir_item */
+ if (btrfs_dir_data_len(leaf, di) > size) {
+ ret = -ERANGE;
+ goto out;
+ }
+ data_ptr = (unsigned long)((char *)(di + 1) +
+ btrfs_dir_name_len(leaf, di));
+ read_extent_buffer(leaf, buffer, data_ptr,
+ btrfs_dir_data_len(leaf, di));
+ ret = btrfs_dir_data_len(leaf, di);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int __btrfs_setxattr(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ int ret = 0, mod = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+
+ /* first lets see if we already have this xattr */
+ di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
+ strlen(name), -1);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ /* ok we already have this xattr, lets remove it */
+ if (di) {
+ /* if we want create only exit */
+ if (flags & XATTR_CREATE) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
+ goto out;
+ btrfs_release_path(root, path);
+
+ /* if we don't have a value then we are removing the xattr */
+ if (!value) {
+ mod = 1;
+ goto out;
+ }
+ } else {
+ btrfs_release_path(root, path);
+
+ if (flags & XATTR_REPLACE) {
+ /* we couldn't find the attr to replace */
+ ret = -ENODATA;
+ goto out;
+ }
+ }
+
+ /* ok we have to create a completely new xattr */
+ ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
+ value, size, inode->i_ino);
+ if (ret)
+ goto out;
+ mod = 1;
+
+out:
+ if (mod) {
+ inode->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, inode);
+ }
+
+ btrfs_end_transaction(trans, root);
+ btrfs_free_path(path);
+ return ret;
+}
+
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ struct btrfs_key key, found_key;
+ struct inode *inode = dentry->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_item *item;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ int ret = 0, slot, advance;
+ size_t total_size = 0, size_left = size;
+ unsigned long name_ptr;
+ size_t name_len;
+ u32 nritems;
+
+ /*
+ * ok we want all objects associated with this id.
+ * NOTE: we set key.offset = 0; because we want to start with the
+ * first xattr that we find and walk forward
+ */
+ key.objectid = inode->i_ino;
+ btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 2;
+
+ /* search for our xattrs */
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto err;
+ ret = 0;
+ advance = 0;
+ while (1) {
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+
+ /* this is where we start walking through the path */
+ if (advance || slot >= nritems) {
+ /*
+ * if we've reached the last slot in this leaf we need
+ * to go to the next leaf and reset everything
+ */
+ if (slot >= nritems-1) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ slot = path->slots[0];
+ } else {
+ /*
+ * just walking through the slots on this leaf
+ */
+ slot++;
+ path->slots[0]++;
+ }
+ }
+ advance = 1;
+
+ item = btrfs_item_nr(leaf, slot);
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ /* check to make sure this item is what we want */
+ if (found_key.objectid != key.objectid)
+ break;
+ if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+
+ name_len = btrfs_dir_name_len(leaf, di);
+ total_size += name_len + 1;
+
+ /* we are just looking for how big our buffer needs to be */
+ if (!size)
+ continue;
+
+ if (!buffer || (name_len + 1) > size_left) {
+ ret = -ERANGE;
+ goto err;
+ }
+
+ name_ptr = (unsigned long)(di + 1);
+ read_extent_buffer(leaf, buffer, name_ptr, name_len);
+ buffer[name_len] = '\0';
+
+ size_left -= name_len + 1;
+ buffer += name_len + 1;
+ }
+ ret = total_size;
+
+err:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+/*
+ * List of handlers for synthetic system.* attributes. All real ondisk
+ * attributes are handled directly.
+ */
+struct xattr_handler *btrfs_xattr_handlers[] = {
+#ifdef CONFIG_FS_POSIX_ACL
+ &btrfs_xattr_acl_access_handler,
+ &btrfs_xattr_acl_default_handler,
+#endif
+ NULL,
+};
+
+/*
+ * Check if the attribute is in a supported namespace.
+ *
+ * This applied after the check for the synthetic attributes in the system
+ * namespace.
+ */
+static bool btrfs_is_valid_xattr(const char *name)
+{
+ return !strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) ||
+ !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
+ !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+ !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+}
+
+ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
+{
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_getxattr(dentry, name, buffer, size);
+
+ if (!btrfs_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+ return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
+}
+
+int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
+{
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_setxattr(dentry, name, value, size, flags);
+
+ if (!btrfs_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+
+ if (size == 0)
+ value = ""; /* empty EA, do not remove */
+ return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
+}
+
+int btrfs_removexattr(struct dentry *dentry, const char *name)
+{
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_removexattr(dentry, name);
+
+ if (!btrfs_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+ return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
+}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 00000000000..5b1d08f8e68
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2007 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __XATTR__
+#define __XATTR__
+
+#include <linux/xattr.h>
+
+extern struct xattr_handler btrfs_xattr_acl_access_handler;
+extern struct xattr_handler btrfs_xattr_acl_default_handler;
+extern struct xattr_handler *btrfs_xattr_handlers[];
+
+extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+ void *buffer, size_t size);
+extern int __btrfs_setxattr(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags);
+
+extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size);
+extern int btrfs_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags);
+extern int btrfs_removexattr(struct dentry *dentry, const char *name);
+
+#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 00000000000..ecfbce836d3
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,632 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include "compression.h"
+
+/* Plan: call deflate() with avail_in == *sourcelen,
+ avail_out = *dstlen - 12 and flush == Z_FINISH.
+ If it doesn't manage to finish, call it again with
+ avail_in == 0 and avail_out set to the remaining 12
+ bytes for it to clean up.
+ Q: Is 12 bytes sufficient?
+*/
+#define STREAM_END_SPACE 12
+
+struct workspace {
+ z_stream inf_strm;
+ z_stream def_strm;
+ char *buf;
+ struct list_head list;
+};
+
+static LIST_HEAD(idle_workspace);
+static DEFINE_SPINLOCK(workspace_lock);
+static unsigned long num_workspace;
+static atomic_t alloc_workspace = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
+
+/*
+ * this finds an available zlib workspace or allocates a new one
+ * NULL or an ERR_PTR is returned if things go bad.
+ */
+static struct workspace *find_zlib_workspace(void)
+{
+ struct workspace *workspace;
+ int ret;
+ int cpus = num_online_cpus();
+
+again:
+ spin_lock(&workspace_lock);
+ if (!list_empty(&idle_workspace)) {
+ workspace = list_entry(idle_workspace.next, struct workspace,
+ list);
+ list_del(&workspace->list);
+ num_workspace--;
+ spin_unlock(&workspace_lock);
+ return workspace;
+
+ }
+ spin_unlock(&workspace_lock);
+ if (atomic_read(&alloc_workspace) > cpus) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&alloc_workspace) > cpus)
+ schedule();
+ finish_wait(&workspace_wait, &wait);
+ goto again;
+ }
+ atomic_inc(&alloc_workspace);
+ workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+ if (!workspace) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+ if (!workspace->def_strm.workspace) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+ if (!workspace->inf_strm.workspace) {
+ ret = -ENOMEM;
+ goto fail_inflate;
+ }
+ workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ if (!workspace->buf) {
+ ret = -ENOMEM;
+ goto fail_kmalloc;
+ }
+ return workspace;
+
+fail_kmalloc:
+ vfree(workspace->inf_strm.workspace);
+fail_inflate:
+ vfree(workspace->def_strm.workspace);
+fail:
+ kfree(workspace);
+ atomic_dec(&alloc_workspace);
+ wake_up(&workspace_wait);
+ return ERR_PTR(ret);
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static int free_workspace(struct workspace *workspace)
+{
+ spin_lock(&workspace_lock);
+ if (num_workspace < num_online_cpus()) {
+ list_add_tail(&workspace->list, &idle_workspace);
+ num_workspace++;
+ spin_unlock(&workspace_lock);
+ if (waitqueue_active(&workspace_wait))
+ wake_up(&workspace_wait);
+ return 0;
+ }
+ spin_unlock(&workspace_lock);
+ vfree(workspace->def_strm.workspace);
+ vfree(workspace->inf_strm.workspace);
+ kfree(workspace->buf);
+ kfree(workspace);
+
+ atomic_dec(&alloc_workspace);
+ if (waitqueue_active(&workspace_wait))
+ wake_up(&workspace_wait);
+ return 0;
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+ struct workspace *workspace;
+ while (!list_empty(&idle_workspace)) {
+ workspace = list_entry(idle_workspace.next, struct workspace,
+ list);
+ list_del(&workspace->list);
+ vfree(workspace->def_strm.workspace);
+ vfree(workspace->inf_strm.workspace);
+ kfree(workspace->buf);
+ kfree(workspace);
+ atomic_dec(&alloc_workspace);
+ }
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated. There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read. It
+ * may be smaller then len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out)
+{
+ int ret;
+ struct workspace *workspace;
+ char *data_in;
+ char *cpage_out;
+ int nr_pages = 0;
+ struct page *in_page = NULL;
+ struct page *out_page = NULL;
+ int out_written = 0;
+ int in_read = 0;
+ unsigned long bytes_left;
+
+ *out_pages = 0;
+ *total_out = 0;
+ *total_in = 0;
+
+ workspace = find_zlib_workspace();
+ if (!workspace)
+ return -1;
+
+ if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+ printk(KERN_WARNING "deflateInit failed\n");
+ ret = -1;
+ goto out;
+ }
+
+ workspace->def_strm.total_in = 0;
+ workspace->def_strm.total_out = 0;
+
+ in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ cpage_out = kmap(out_page);
+ pages[0] = out_page;
+ nr_pages = 1;
+
+ workspace->def_strm.next_in = data_in;
+ workspace->def_strm.next_out = cpage_out;
+ workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+
+ out_written = 0;
+ in_read = 0;
+
+ while (workspace->def_strm.total_in < len) {
+ ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+ if (ret != Z_OK) {
+ printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+ ret);
+ zlib_deflateEnd(&workspace->def_strm);
+ ret = -1;
+ goto out;
+ }
+
+ /* we're making it bigger, give up */
+ if (workspace->def_strm.total_in > 8192 &&
+ workspace->def_strm.total_in <
+ workspace->def_strm.total_out) {
+ ret = -1;
+ goto out;
+ }
+ /* we need another page for writing out. Test this
+ * before the total_in so we will pull in a new page for
+ * the stream end if required
+ */
+ if (workspace->def_strm.avail_out == 0) {
+ kunmap(out_page);
+ if (nr_pages == nr_dest_pages) {
+ out_page = NULL;
+ ret = -1;
+ goto out;
+ }
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ cpage_out = kmap(out_page);
+ pages[nr_pages] = out_page;
+ nr_pages++;
+ workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->def_strm.next_out = cpage_out;
+ }
+ /* we're all done */
+ if (workspace->def_strm.total_in >= len)
+ break;
+
+ /* we've read in a full page, get a new one */
+ if (workspace->def_strm.avail_in == 0) {
+ if (workspace->def_strm.total_out > max_out)
+ break;
+
+ bytes_left = len - workspace->def_strm.total_in;
+ kunmap(in_page);
+ page_cache_release(in_page);
+
+ start += PAGE_CACHE_SIZE;
+ in_page = find_get_page(mapping,
+ start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+ workspace->def_strm.avail_in = min(bytes_left,
+ PAGE_CACHE_SIZE);
+ workspace->def_strm.next_in = data_in;
+ }
+ }
+ workspace->def_strm.avail_in = 0;
+ ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
+ zlib_deflateEnd(&workspace->def_strm);
+
+ if (ret != Z_STREAM_END) {
+ ret = -1;
+ goto out;
+ }
+
+ if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+ *total_out = workspace->def_strm.total_out;
+ *total_in = workspace->def_strm.total_in;
+out:
+ *out_pages = nr_pages;
+ if (out_page)
+ kunmap(out_page);
+
+ if (in_page) {
+ kunmap(in_page);
+ page_cache_release(in_page);
+ }
+ free_workspace(workspace);
+ return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous. They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen)
+{
+ int ret = 0;
+ int wbits = MAX_WBITS;
+ struct workspace *workspace;
+ char *data_in;
+ size_t total_out = 0;
+ unsigned long page_bytes_left;
+ unsigned long page_in_index = 0;
+ unsigned long page_out_index = 0;
+ struct page *page_out;
+ unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE;
+ unsigned long buf_start;
+ unsigned long buf_offset;
+ unsigned long bytes;
+ unsigned long working_bytes;
+ unsigned long pg_offset;
+ unsigned long start_byte;
+ unsigned long current_buf_start;
+ char *kaddr;
+
+ workspace = find_zlib_workspace();
+ if (!workspace)
+ return -ENOMEM;
+
+ data_in = kmap(pages_in[page_in_index]);
+ workspace->inf_strm.next_in = data_in;
+ workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
+ workspace->inf_strm.total_in = 0;
+
+ workspace->inf_strm.total_out = 0;
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ page_out = bvec[page_out_index].bv_page;
+ page_bytes_left = PAGE_CACHE_SIZE;
+ pg_offset = 0;
+
+ /* If it's deflate, and it's got no preset dictionary, then
+ we can tell zlib to skip the adler32 check. */
+ if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+ ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+ !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+ wbits = -((data_in[0] >> 4) + 8);
+ workspace->inf_strm.next_in += 2;
+ workspace->inf_strm.avail_in -= 2;
+ }
+
+ if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ printk(KERN_WARNING "inflateInit failed\n");
+ ret = -1;
+ goto out;
+ }
+ while (workspace->inf_strm.total_in < srclen) {
+ ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ if (ret != Z_OK && ret != Z_STREAM_END)
+ break;
+ /*
+ * buf start is the byte offset we're of the start of
+ * our workspace buffer
+ */
+ buf_start = total_out;
+
+ /* total_out is the last byte of the workspace buffer */
+ total_out = workspace->inf_strm.total_out;
+
+ working_bytes = total_out - buf_start;
+
+ /*
+ * start byte is the first byte of the page we're currently
+ * copying into relative to the start of the compressed data.
+ */
+ start_byte = page_offset(page_out) - disk_start;
+
+ if (working_bytes == 0) {
+ /* we didn't make progress in this inflate
+ * call, we're done
+ */
+ if (ret != Z_STREAM_END)
+ ret = -1;
+ break;
+ }
+
+ /* we haven't yet hit data corresponding to this page */
+ if (total_out <= start_byte)
+ goto next;
+
+ /*
+ * the start of the data we care about is offset into
+ * the middle of our working buffer
+ */
+ if (total_out > start_byte && buf_start < start_byte) {
+ buf_offset = start_byte - buf_start;
+ working_bytes -= buf_offset;
+ } else {
+ buf_offset = 0;
+ }
+ current_buf_start = buf_start;
+
+ /* copy bytes from the working buffer into the pages */
+ while (working_bytes > 0) {
+ bytes = min(PAGE_CACHE_SIZE - pg_offset,
+ PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(bytes, working_bytes);
+ kaddr = kmap_atomic(page_out, KM_USER0);
+ memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
+ bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page_out);
+
+ pg_offset += bytes;
+ page_bytes_left -= bytes;
+ buf_offset += bytes;
+ working_bytes -= bytes;
+ current_buf_start += bytes;
+
+ /* check if we need to pick another page */
+ if (page_bytes_left == 0) {
+ page_out_index++;
+ if (page_out_index >= vcnt) {
+ ret = 0;
+ goto done;
+ }
+
+ page_out = bvec[page_out_index].bv_page;
+ pg_offset = 0;
+ page_bytes_left = PAGE_CACHE_SIZE;
+ start_byte = page_offset(page_out) - disk_start;
+
+ /*
+ * make sure our new page is covered by this
+ * working buffer
+ */
+ if (total_out <= start_byte)
+ goto next;
+
+ /* the next page in the biovec might not
+ * be adjacent to the last page, but it
+ * might still be found inside this working
+ * buffer. bump our offset pointer
+ */
+ if (total_out > start_byte &&
+ current_buf_start < start_byte) {
+ buf_offset = start_byte - buf_start;
+ working_bytes = total_out - start_byte;
+ current_buf_start = buf_start +
+ buf_offset;
+ }
+ }
+ }
+next:
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+
+ if (workspace->inf_strm.avail_in == 0) {
+ unsigned long tmp;
+ kunmap(pages_in[page_in_index]);
+ page_in_index++;
+ if (page_in_index >= total_pages_in) {
+ data_in = NULL;
+ break;
+ }
+ data_in = kmap(pages_in[page_in_index]);
+ workspace->inf_strm.next_in = data_in;
+ tmp = srclen - workspace->inf_strm.total_in;
+ workspace->inf_strm.avail_in = min(tmp,
+ PAGE_CACHE_SIZE);
+ }
+ }
+ if (ret != Z_STREAM_END)
+ ret = -1;
+ else
+ ret = 0;
+done:
+ zlib_inflateEnd(&workspace->inf_strm);
+ if (data_in)
+ kunmap(pages_in[page_in_index]);
+out:
+ free_workspace(workspace);
+ return ret;
+}
+
+/*
+ * a less complex decompression routine. Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_zlib_decompress(unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen)
+{
+ int ret = 0;
+ int wbits = MAX_WBITS;
+ struct workspace *workspace;
+ unsigned long bytes_left = destlen;
+ unsigned long total_out = 0;
+ char *kaddr;
+
+ if (destlen > PAGE_CACHE_SIZE)
+ return -ENOMEM;
+
+ workspace = find_zlib_workspace();
+ if (!workspace)
+ return -ENOMEM;
+
+ workspace->inf_strm.next_in = data_in;
+ workspace->inf_strm.avail_in = srclen;
+ workspace->inf_strm.total_in = 0;
+
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->inf_strm.total_out = 0;
+ /* If it's deflate, and it's got no preset dictionary, then
+ we can tell zlib to skip the adler32 check. */
+ if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+ ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+ !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+ wbits = -((data_in[0] >> 4) + 8);
+ workspace->inf_strm.next_in += 2;
+ workspace->inf_strm.avail_in -= 2;
+ }
+
+ if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+ printk(KERN_WARNING "inflateInit failed\n");
+ ret = -1;
+ goto out;
+ }
+
+ while (bytes_left > 0) {
+ unsigned long buf_start;
+ unsigned long buf_offset;
+ unsigned long bytes;
+ unsigned long pg_offset = 0;
+
+ ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+ if (ret != Z_OK && ret != Z_STREAM_END)
+ break;
+
+ buf_start = total_out;
+ total_out = workspace->inf_strm.total_out;
+
+ if (total_out == buf_start) {
+ ret = -1;
+ break;
+ }
+
+ if (total_out <= start_byte)
+ goto next;
+
+ if (total_out > start_byte && buf_start < start_byte)
+ buf_offset = start_byte - buf_start;
+ else
+ buf_offset = 0;
+
+ bytes = min(PAGE_CACHE_SIZE - pg_offset,
+ PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(bytes, bytes_left);
+
+ kaddr = kmap_atomic(dest_page, KM_USER0);
+ memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ pg_offset += bytes;
+ bytes_left -= bytes;
+next:
+ workspace->inf_strm.next_out = workspace->buf;
+ workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+ }
+
+ if (ret != Z_STREAM_END && bytes_left != 0)
+ ret = -1;
+ else
+ ret = 0;
+
+ zlib_inflateEnd(&workspace->inf_strm);
+out:
+ free_workspace(workspace);
+ return ret;
+}
+
+void btrfs_zlib_exit(void)
+{
+ free_workspaces();
+}
diff --git a/fs/buffer.c b/fs/buffer.c
index 776ae091d3b..c26da785938 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1996,7 +1996,7 @@ int block_write_begin(struct file *file, struct address_space *mapping,
page = *pagep;
if (page == NULL) {
ownpage = 1;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
status = -ENOMEM;
goto out;
@@ -2022,7 +2022,6 @@ int block_write_begin(struct file *file, struct address_space *mapping,
if (pos + len > inode->i_size)
vmtruncate(inode, inode->i_size);
}
- goto out;
}
out:
@@ -2502,7 +2501,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 700697a7261..38f71222a55 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
cd->major = major;
cd->baseminor = baseminor;
cd->minorct = minorct;
- strncpy(cd->name,name, 64);
+ strlcpy(cd->name, name, sizeof(cd->name));
i = major_to_index(major);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b1e1fc6a6e6..12bb656fbe7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2074,7 +2074,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
rc = -ENOMEM;
goto out;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f247da9f4ed..5ab9896fdcb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1641,7 +1641,7 @@ do_expand:
i_size_write(inode, offset);
spin_unlock(&inode->i_lock);
out_truncate:
- if (inode->i_op && inode->i_op->truncate)
+ if (inode->i_op->truncate)
inode->i_op->truncate(inode);
return 0;
out_sig:
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 466303db2df..6a347fbc998 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
{
struct file *host_file;
- struct dentry *host_dentry;
- struct inode *host_inode, *coda_inode = coda_dentry->d_inode;
+ struct inode *coda_inode = coda_dentry->d_inode;
struct coda_file_info *cfi;
int err = 0;
@@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- if (host_file->f_op && host_file->f_op->fsync) {
- host_dentry = host_file->f_path.dentry;
- host_inode = host_dentry->d_inode;
- mutex_lock(&host_inode->i_mutex);
- err = host_file->f_op->fsync(host_file, host_dentry, datasync);
- mutex_unlock(&host_inode->i_mutex);
- }
-
+ err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
if ( !err && !datasync ) {
lock_kernel();
err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 81b7771c646..43c96ce2961 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,9 @@
#include "coda_int.h"
+#ifdef CONFIG_SYSCTL
static struct ctl_table_header *fs_table_header;
+#endif
static ctl_table coda_table[] = {
{
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = {
{}
};
+#ifdef CONFIG_SYSCTL
static ctl_table fs_table[] = {
{
.ctl_name = CTL_UNNUMBERED,
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = {
},
{}
};
-
+#endif
void coda_sysctl_init(void)
{
diff --git a/fs/compat.c b/fs/compat.c
index d1ece79b641..30f2faa22f5 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
out:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
fput(file);
return ret;
}
@@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
out:
+ if (ret > 0)
+ add_wchar(current, ret);
+ inc_syscw(current);
fput(file);
return ret;
}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4803ccc9448..5d349d38e05 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
{
inode->i_mode = mode;
- inode->i_uid = 0;
- inode->i_gid = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
}
@@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
{
struct inode * inode = new_inode(configfs_sb);
if (inode) {
- inode->i_blocks = 0;
inode->i_mapping->a_ops = &configfs_aops;
inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
inode->i_op = &configfs_inode_operations;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f40423eb1a1..a07338d2d14 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
inode->i_op = &page_symlink_inode_operations;
inode->i_data.a_ops = &cramfs_aops;
} else {
- inode->i_size = 0;
- inode->i_blocks = 0;
init_special_inode(inode, inode->i_mode,
old_decode_dev(cramfs_inode->size));
}
diff --git a/fs/dcache.c b/fs/dcache.c
index e88c23b85a3..4547f66884a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1567,10 +1567,6 @@ void d_rehash(struct dentry * entry)
spin_unlock(&dcache_lock);
}
-#define do_switch(x,y) do { \
- __typeof__ (x) __tmp = x; \
- x = y; y = __tmp; } while (0)
-
/*
* When switching names, the actual string doesn't strictly have to
* be preserved in the target - because we're dropping the target
@@ -1589,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
/*
* Both external: swap the pointers
*/
- do_switch(target->d_name.name, dentry->d_name.name);
+ swap(target->d_name.name, dentry->d_name.name);
} else {
/*
* dentry:internal, target:external. Steal target's
@@ -1620,7 +1616,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
return;
}
}
- do_switch(dentry->d_name.len, target->d_name.len);
+ swap(dentry->d_name.len, target->d_name.len);
}
/*
@@ -1680,7 +1676,7 @@ already_unhashed:
/* Switch the names.. */
switch_names(dentry, target);
- do_switch(dentry->d_name.hash, target->d_name.hash);
+ swap(dentry->d_name.hash, target->d_name.hash);
/* ... and switch the parents */
if (IS_ROOT(dentry)) {
@@ -1688,7 +1684,7 @@ already_unhashed:
target->d_parent = target;
INIT_LIST_HEAD(&target->d_u.d_child);
} else {
- do_switch(dentry->d_parent, target->d_parent);
+ swap(dentry->d_parent, target->d_parent);
/* And add them back to the (new) parent lists */
list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
@@ -1789,7 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
struct dentry *dparent, *aparent;
switch_names(dentry, anon);
- do_switch(dentry->d_name.hash, anon->d_name.hash);
+ swap(dentry->d_name.hash, anon->d_name.hash);
dparent = dentry->d_parent;
aparent = anon->d_parent;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 159a5efd6a8..33a90120f6a 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);
+
+static int debugfs_size_t_set(void *data, u64 val)
+{
+ *(size_t *)data = val;
+ return 0;
+}
+static int debugfs_size_t_get(void *data, u64 *val)
+{
+ *val = *(size_t *)data;
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
+ "%llu\n"); /* %llu and %zu are more or less the same */
+
+/**
+ * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ * from.
+ */
+struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+ struct dentry *parent, size_t *value)
+{
+ return debugfs_create_file(name, mode, parent, value, &fops_size_t);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_size_t);
+
+
static ssize_t read_file_bool(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dbe2169cf3..81ae9ea3c6e 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
if (inode) {
inode->i_mode = mode;
- inode->i_uid = 0;
- inode->i_gid = 0;
- inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
switch (mode & S_IFMT) {
default:
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index fff96e152c0..5f3231b9633 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -189,8 +189,6 @@ static int mknod_ptmx(struct super_block *sb)
}
inode->i_ino = 2;
- inode->i_uid = inode->i_gid = 0;
- inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
mode = S_IFCHR|opts->ptmxmode;
@@ -300,8 +298,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
goto free_fsi;
inode->i_ino = 1;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- inode->i_blocks = 0;
- inode->i_uid = inode->i_gid = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index af0558dbe8b..b6d43908ff7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
retval = direct_io_worker(rw, iocb, inode, iov, offset,
nr_segs, blkbits, get_block, end_io, dio);
+ /*
+ * In case of error extending write may have instantiated a few
+ * blocks outside i_size. Trim these off again for DIO_LOCKING.
+ * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
+ * it's own meaner.
+ */
+ if (unlikely(retval < 0 && (rw & WRITE))) {
+ loff_t isize = i_size_read(inode);
+
+ if (end > isize && dio_lock_type == DIO_LOCKING)
+ vmtruncate(inode, isize);
+ }
+
if (rw == READ && dio_lock_type == DIO_LOCKING)
release_i_mutex = 0;
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 8bf31e3fbf0..dc2ad6008b2 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
spin_unlock(&ast_queue_lock);
}
-void dlm_add_ast(struct dlm_lkb *lkb, int type)
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
{
if (lkb->lkb_flags & DLM_IFL_USER) {
- dlm_user_add_ast(lkb, type);
+ dlm_user_add_ast(lkb, type, bastmode);
return;
}
@@ -46,6 +46,8 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type)
list_add_tail(&lkb->lkb_astqueue, &ast_queue);
}
lkb->lkb_ast_type |= type;
+ if (bastmode)
+ lkb->lkb_bastmode = bastmode;
spin_unlock(&ast_queue_lock);
set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,50 +61,40 @@ static void process_asts(void)
struct dlm_lkb *lkb;
void (*cast) (void *astparam);
void (*bast) (void *astparam, int mode);
- int type = 0, found, bmode;
-
- for (;;) {
- found = 0;
- spin_lock(&ast_queue_lock);
- list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
- r = lkb->lkb_resource;
- ls = r->res_ls;
-
- if (dlm_locking_stopped(ls))
- continue;
-
- list_del(&lkb->lkb_astqueue);
- type = lkb->lkb_ast_type;
- lkb->lkb_ast_type = 0;
- found = 1;
- break;
- }
- spin_unlock(&ast_queue_lock);
+ int type = 0, bastmode;
+
+repeat:
+ spin_lock(&ast_queue_lock);
+ list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
+ r = lkb->lkb_resource;
+ ls = r->res_ls;
+
+ if (dlm_locking_stopped(ls))
+ continue;
- if (!found)
- break;
+ list_del(&lkb->lkb_astqueue);
+ type = lkb->lkb_ast_type;
+ lkb->lkb_ast_type = 0;
+ bastmode = lkb->lkb_bastmode;
+ spin_unlock(&ast_queue_lock);
cast = lkb->lkb_astfn;
bast = lkb->lkb_bastfn;
- bmode = lkb->lkb_bastmode;
if ((type & AST_COMP) && cast)
cast(lkb->lkb_astparam);
- /* FIXME: Is it safe to look at lkb_grmode here
- without doing a lock_rsb() ?
- Look at other checks in v1 to avoid basts. */
-
if ((type & AST_BAST) && bast)
- if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
- bast(lkb->lkb_astparam, bmode);
+ bast(lkb->lkb_astparam, bastmode);
/* this removes the reference added by dlm_add_ast
and may result in the lkb being freed */
dlm_put_lkb(lkb);
- schedule();
+ cond_resched();
+ goto repeat;
}
+ spin_unlock(&ast_queue_lock);
}
static inline int no_asts(void)
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 6ee276c74c5..1b5fc5f428f 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
#ifndef __ASTD_DOT_H__
#define __ASTD_DOT_H__
-void dlm_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
void dlm_del_ast(struct dlm_lkb *lkb);
void dlm_astd_wake(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8fc24f4507a..2f107d1a6a4 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,7 +1,7 @@
/******************************************************************************
*******************************************************************************
**
-** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -27,7 +27,7 @@ static struct dentry *dlm_root;
struct rsb_iter {
int entry;
- int locks;
+ int format;
int header;
struct dlm_ls *ls;
struct list_head *next;
@@ -60,8 +60,8 @@ static char *print_lockmode(int mode)
}
}
-static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
- struct dlm_rsb *res)
+static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
+ struct dlm_rsb *res)
{
seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
@@ -83,7 +83,7 @@ static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
seq_printf(s, "\n");
}
-static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+static int print_format1(struct dlm_rsb *res, struct seq_file *s)
{
struct dlm_lkb *lkb;
int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
@@ -134,15 +134,15 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
/* Print the locks attached to this resource */
seq_printf(s, "Granted Queue\n");
list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
- print_resource_lock(s, lkb, res);
+ print_format1_lock(s, lkb, res);
seq_printf(s, "Conversion Queue\n");
list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
- print_resource_lock(s, lkb, res);
+ print_format1_lock(s, lkb, res);
seq_printf(s, "Waiting Queue\n");
list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
- print_resource_lock(s, lkb, res);
+ print_format1_lock(s, lkb, res);
if (list_empty(&res->res_lookup))
goto out;
@@ -160,23 +160,24 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
return 0;
}
-static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r)
+static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
+ struct dlm_rsb *r)
{
- unsigned int waiting = 0;
- uint64_t xid = 0;
+ u64 xid = 0;
+ u64 us;
if (lkb->lkb_flags & DLM_IFL_USER) {
if (lkb->lkb_ua)
xid = lkb->lkb_ua->xid;
}
- if (lkb->lkb_timestamp)
- waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp);
+ /* microseconds since lkb was added to current queue */
+ us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp));
- /* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms
+ /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
r_nodeid r_len r_name */
- seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n",
+ seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
lkb->lkb_id,
lkb->lkb_nodeid,
lkb->lkb_remid,
@@ -187,26 +188,114 @@ static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *
lkb->lkb_status,
lkb->lkb_grmode,
lkb->lkb_rqmode,
- waiting,
+ (unsigned long long)us,
r->res_nodeid,
r->res_length,
r->res_name);
}
-static int print_locks(struct dlm_rsb *r, struct seq_file *s)
+static int print_format2(struct dlm_rsb *r, struct seq_file *s)
{
struct dlm_lkb *lkb;
lock_rsb(r);
list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
- print_lock(s, lkb, r);
+ print_format2_lock(s, lkb, r);
list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
- print_lock(s, lkb, r);
+ print_format2_lock(s, lkb, r);
list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
- print_lock(s, lkb, r);
+ print_format2_lock(s, lkb, r);
+
+ unlock_rsb(r);
+ return 0;
+}
+
+static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
+ int rsb_lookup)
+{
+ u64 xid = 0;
+
+ if (lkb->lkb_flags & DLM_IFL_USER) {
+ if (lkb->lkb_ua)
+ xid = lkb->lkb_ua->xid;
+ }
+
+ seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
+ lkb->lkb_id,
+ lkb->lkb_nodeid,
+ lkb->lkb_remid,
+ lkb->lkb_ownpid,
+ (unsigned long long)xid,
+ lkb->lkb_exflags,
+ lkb->lkb_flags,
+ lkb->lkb_status,
+ lkb->lkb_grmode,
+ lkb->lkb_rqmode,
+ lkb->lkb_highbast,
+ rsb_lookup,
+ lkb->lkb_wait_type,
+ lkb->lkb_lvbseq,
+ (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
+ (unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
+}
+
+static int print_format3(struct dlm_rsb *r, struct seq_file *s)
+{
+ struct dlm_lkb *lkb;
+ int i, lvblen = r->res_ls->ls_lvblen;
+ int print_name = 1;
+
+ lock_rsb(r);
+
+ seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
+ r,
+ r->res_nodeid,
+ r->res_first_lkid,
+ r->res_flags,
+ !list_empty(&r->res_root_list),
+ !list_empty(&r->res_recover_list),
+ r->res_recover_locks_count,
+ r->res_length);
+
+ for (i = 0; i < r->res_length; i++) {
+ if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
+ print_name = 0;
+ }
+
+ seq_printf(s, "%s", print_name ? "str " : "hex");
+
+ for (i = 0; i < r->res_length; i++) {
+ if (print_name)
+ seq_printf(s, "%c", r->res_name[i]);
+ else
+ seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
+ }
+ seq_printf(s, "\n");
+
+ if (!r->res_lvbptr)
+ goto do_locks;
+
+ seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen);
+
+ for (i = 0; i < lvblen; i++)
+ seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
+ seq_printf(s, "\n");
+
+ do_locks:
+ list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
+ print_format3_lock(s, lkb, 0);
+
+ list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
+ print_format3_lock(s, lkb, 0);
+
+ list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
+ print_format3_lock(s, lkb, 0);
+
+ list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
+ print_format3_lock(s, lkb, 1);
unlock_rsb(r);
return 0;
@@ -231,7 +320,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
break;
}
read_unlock(&ls->ls_rsbtbl[i].lock);
- }
+ }
ri->entry = i;
if (ri->entry >= ls->ls_rsbtbl_size)
@@ -248,7 +337,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
read_unlock(&ls->ls_rsbtbl[i].lock);
dlm_put_rsb(old);
goto top;
- }
+ }
ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
dlm_hold_rsb(ri->rsb);
read_unlock(&ls->ls_rsbtbl[i].lock);
@@ -274,6 +363,7 @@ static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
ri->ls = ls;
ri->entry = 0;
ri->next = NULL;
+ ri->format = 1;
if (rsb_iter_next(ri)) {
rsb_iter_free(ri);
@@ -325,16 +415,26 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
{
struct rsb_iter *ri = iter_ptr;
- if (ri->locks) {
+ switch (ri->format) {
+ case 1:
+ print_format1(ri->rsb, file);
+ break;
+ case 2:
if (ri->header) {
- seq_printf(file, "id nodeid remid pid xid exflags flags "
- "sts grmode rqmode time_ms r_nodeid "
- "r_len r_name\n");
+ seq_printf(file, "id nodeid remid pid xid exflags "
+ "flags sts grmode rqmode time_ms "
+ "r_nodeid r_len r_name\n");
ri->header = 0;
}
- print_locks(ri->rsb, file);
- } else {
- print_resource(ri->rsb, file);
+ print_format2(ri->rsb, file);
+ break;
+ case 3:
+ if (ri->header) {
+ seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
+ ri->header = 0;
+ }
+ print_format3(ri->rsb, file);
+ break;
}
return 0;
@@ -385,7 +485,7 @@ static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos)
ri->ls = ls;
ri->entry = 0;
ri->next = NULL;
- ri->locks = 1;
+ ri->format = 2;
if (*pos == 0)
ri->header = 1;
@@ -448,6 +548,84 @@ static const struct file_operations locks_fops = {
};
/*
+ * Dump all rsb/lvb/lkb state in compact listing, more complete than _locks
+ * This can replace both formats 1 and 2 eventually.
+ */
+
+static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos)
+{
+ struct rsb_iter *ri;
+
+ ri = kzalloc(sizeof *ri, GFP_KERNEL);
+ if (!ri)
+ return NULL;
+
+ ri->ls = ls;
+ ri->entry = 0;
+ ri->next = NULL;
+ ri->format = 3;
+
+ if (*pos == 0)
+ ri->header = 1;
+
+ if (rsb_iter_next(ri)) {
+ rsb_iter_free(ri);
+ return NULL;
+ }
+
+ return ri;
+}
+
+static void *all_seq_start(struct seq_file *file, loff_t *pos)
+{
+ struct rsb_iter *ri;
+ loff_t n = *pos;
+
+ ri = all_iter_init(file->private, pos);
+ if (!ri)
+ return NULL;
+
+ while (n--) {
+ if (rsb_iter_next(ri)) {
+ rsb_iter_free(ri);
+ return NULL;
+ }
+ }
+
+ return ri;
+}
+
+static struct seq_operations all_seq_ops = {
+ .start = all_seq_start,
+ .next = rsb_seq_next,
+ .stop = rsb_seq_stop,
+ .show = rsb_seq_show,
+};
+
+static int all_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &all_seq_ops);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->private = inode->i_private;
+
+ return 0;
+}
+
+static const struct file_operations all_fops = {
+ .owner = THIS_MODULE,
+ .open = all_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+/*
* dump lkb's on the ls_waiters list
*/
@@ -489,30 +667,33 @@ static const struct file_operations waiters_fops = {
.read = waiters_read
};
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+ if (ls->ls_debug_rsb_dentry)
+ debugfs_remove(ls->ls_debug_rsb_dentry);
+ if (ls->ls_debug_waiters_dentry)
+ debugfs_remove(ls->ls_debug_waiters_dentry);
+ if (ls->ls_debug_locks_dentry)
+ debugfs_remove(ls->ls_debug_locks_dentry);
+ if (ls->ls_debug_all_dentry)
+ debugfs_remove(ls->ls_debug_all_dentry);
+}
+
int dlm_create_debug_file(struct dlm_ls *ls)
{
char name[DLM_LOCKSPACE_LEN+8];
+ /* format 1 */
+
ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
S_IFREG | S_IRUGO,
dlm_root,
ls,
&rsb_fops);
if (!ls->ls_debug_rsb_dentry)
- return -ENOMEM;
+ goto fail;
- memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
-
- ls->ls_debug_waiters_dentry = debugfs_create_file(name,
- S_IFREG | S_IRUGO,
- dlm_root,
- ls,
- &waiters_fops);
- if (!ls->ls_debug_waiters_dentry) {
- debugfs_remove(ls->ls_debug_rsb_dentry);
- return -ENOMEM;
- }
+ /* format 2 */
memset(name, 0, sizeof(name));
snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name);
@@ -522,23 +703,38 @@ int dlm_create_debug_file(struct dlm_ls *ls)
dlm_root,
ls,
&locks_fops);
- if (!ls->ls_debug_locks_dentry) {
- debugfs_remove(ls->ls_debug_waiters_dentry);
- debugfs_remove(ls->ls_debug_rsb_dentry);
- return -ENOMEM;
- }
+ if (!ls->ls_debug_locks_dentry)
+ goto fail;
+
+ /* format 3 */
+
+ memset(name, 0, sizeof(name));
+ snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name);
+
+ ls->ls_debug_all_dentry = debugfs_create_file(name,
+ S_IFREG | S_IRUGO,
+ dlm_root,
+ ls,
+ &all_fops);
+ if (!ls->ls_debug_all_dentry)
+ goto fail;
+
+ memset(name, 0, sizeof(name));
+ snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
+
+ ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+ S_IFREG | S_IRUGO,
+ dlm_root,
+ ls,
+ &waiters_fops);
+ if (!ls->ls_debug_waiters_dentry)
+ goto fail;
return 0;
-}
-void dlm_delete_debug_file(struct dlm_ls *ls)
-{
- if (ls->ls_debug_rsb_dentry)
- debugfs_remove(ls->ls_debug_rsb_dentry);
- if (ls->ls_debug_waiters_dentry)
- debugfs_remove(ls->ls_debug_waiters_dentry);
- if (ls->ls_debug_locks_dentry)
- debugfs_remove(ls->ls_debug_locks_dentry);
+ fail:
+ dlm_delete_debug_file(ls);
+ return -ENOMEM;
}
int __init dlm_register_debugfs(void)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 85defeb64df..92969f879a1 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -374,7 +374,7 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
struct list_head *list;
struct dlm_rsb *r;
int offset = 0, dir_nodeid;
- uint16_t be_namelen;
+ __be16 be_namelen;
down_read(&ls->ls_root_sem);
@@ -410,15 +410,15 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
/* Write end-of-block record */
- be_namelen = 0;
- memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
- offset += sizeof(uint16_t);
+ be_namelen = cpu_to_be16(0);
+ memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+ offset += sizeof(__be16);
goto out;
}
be_namelen = cpu_to_be16(r->res_length);
- memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
- offset += sizeof(uint16_t);
+ memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+ offset += sizeof(__be16);
memcpy(outbuf + offset, r->res_name, r->res_length);
offset += r->res_length;
}
@@ -430,9 +430,9 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
if ((list == &ls->ls_root_list) &&
(offset + sizeof(uint16_t) <= outlen)) {
- be_namelen = 0xFFFF;
- memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
- offset += sizeof(uint16_t);
+ be_namelen = cpu_to_be16(0xFFFF);
+ memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+ offset += sizeof(__be16);
}
out:
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 868e4c9ef12..ef2f1e35396 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -245,7 +245,8 @@ struct dlm_lkb {
struct list_head lkb_astqueue; /* need ast to be sent */
struct list_head lkb_ownqueue; /* list of locks for a process */
struct list_head lkb_time_list;
- unsigned long lkb_timestamp;
+ ktime_t lkb_time_bast; /* for debugging */
+ ktime_t lkb_timestamp;
unsigned long lkb_timeout_cs;
char *lkb_lvbptr;
@@ -481,6 +482,7 @@ struct dlm_ls {
struct dentry *ls_debug_rsb_dentry; /* debugfs */
struct dentry *ls_debug_waiters_dentry; /* debugfs */
struct dentry *ls_debug_locks_dentry; /* debugfs */
+ struct dentry *ls_debug_all_dentry; /* debugfs */
wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
int ls_uevent_result;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 724ddac9153..6cfe65bbf4a 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
lkb->lkb_lksb->sb_status = rv;
lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
- dlm_add_ast(lkb, AST_COMP);
+ dlm_add_ast(lkb, AST_COMP, 0);
}
static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -318,12 +318,12 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
{
+ lkb->lkb_time_bast = ktime_get();
+
if (is_master_copy(lkb))
send_bast(r, lkb, rqmode);
- else {
- lkb->lkb_bastmode = rqmode;
- dlm_add_ast(lkb, AST_BAST);
- }
+ else
+ dlm_add_ast(lkb, AST_BAST, rqmode);
}
/*
@@ -744,6 +744,8 @@ static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+ lkb->lkb_timestamp = ktime_get();
+
lkb->lkb_status = status;
switch (status) {
@@ -1013,10 +1015,8 @@ static void add_timeout(struct dlm_lkb *lkb)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
- if (is_master_copy(lkb)) {
- lkb->lkb_timestamp = jiffies;
+ if (is_master_copy(lkb))
return;
- }
if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
@@ -1031,7 +1031,6 @@ static void add_timeout(struct dlm_lkb *lkb)
DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
mutex_lock(&ls->ls_timeout_mutex);
hold_lkb(lkb);
- lkb->lkb_timestamp = jiffies;
list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
mutex_unlock(&ls->ls_timeout_mutex);
}
@@ -1059,6 +1058,7 @@ void dlm_scan_timeout(struct dlm_ls *ls)
struct dlm_rsb *r;
struct dlm_lkb *lkb;
int do_cancel, do_warn;
+ s64 wait_us;
for (;;) {
if (dlm_locking_stopped(ls))
@@ -1069,14 +1069,15 @@ void dlm_scan_timeout(struct dlm_ls *ls)
mutex_lock(&ls->ls_timeout_mutex);
list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
+ wait_us = ktime_to_us(ktime_sub(ktime_get(),
+ lkb->lkb_timestamp));
+
if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
- time_after_eq(jiffies, lkb->lkb_timestamp +
- lkb->lkb_timeout_cs * HZ/100))
+ wait_us >= (lkb->lkb_timeout_cs * 10000))
do_cancel = 1;
if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
- time_after_eq(jiffies, lkb->lkb_timestamp +
- dlm_config.ci_timewarn_cs * HZ/100))
+ wait_us >= dlm_config.ci_timewarn_cs * 10000)
do_warn = 1;
if (!do_cancel && !do_warn)
@@ -1122,12 +1123,12 @@ void dlm_scan_timeout(struct dlm_ls *ls)
void dlm_adjust_timeouts(struct dlm_ls *ls)
{
struct dlm_lkb *lkb;
- long adj = jiffies - ls->ls_recover_begin;
+ u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
ls->ls_recover_begin = 0;
mutex_lock(&ls->ls_timeout_mutex);
list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
- lkb->lkb_timestamp += adj;
+ lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
mutex_unlock(&ls->ls_timeout_mutex);
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3962262f991..103a5ebd137 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -295,6 +295,7 @@ static int add_sock(struct socket *sock, struct connection *con)
con->sock->sk->sk_write_space = lowcomms_write_space;
con->sock->sk->sk_state_change = lowcomms_state_change;
con->sock->sk->sk_user_data = con;
+ con->sock->sk->sk_allocation = GFP_NOFS;
return 0;
}
@@ -823,7 +824,6 @@ static void sctp_init_assoc(struct connection *con)
len = e->len;
offset = e->offset;
spin_unlock(&con->writequeue_lock);
- kmap(e->page);
/* Send the first block off the write queue */
iov[0].iov_base = page_address(e->page)+offset;
@@ -854,7 +854,6 @@ static void sctp_init_assoc(struct connection *con)
if (e->len == 0 && e->users == 0) {
list_del(&e->list);
- kunmap(e->page);
free_entry(e);
}
spin_unlock(&con->writequeue_lock);
@@ -1203,8 +1202,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
if (e) {
got_one:
- if (users == 0)
- kmap(e->page);
*ppc = page_address(e->page) + offset;
return e;
}
@@ -1233,7 +1230,6 @@ void dlm_lowcomms_commit_buffer(void *mh)
if (users)
goto out;
e->len = e->end - e->offset;
- kunmap(e->page);
spin_unlock(&con->writequeue_lock);
if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
@@ -1272,7 +1268,6 @@ static void send_to_sock(struct connection *con)
offset = e->offset;
BUG_ON(len == 0 && e->users == 0);
spin_unlock(&con->writequeue_lock);
- kmap(e->page);
ret = 0;
if (len) {
@@ -1294,7 +1289,6 @@ static void send_to_sock(struct connection *con)
if (e->len == 0 && e->users == 0) {
list_del(&e->list);
- kunmap(e->page);
free_entry(e);
continue;
}
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 54c14c6d06c..c1775b84eba 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
{
char *p;
- p = kzalloc(ls->ls_lvblen, GFP_KERNEL);
+ p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
return p;
}
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
- r = kzalloc(sizeof(*r) + namelen, GFP_KERNEL);
+ r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
return r;
}
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
{
struct dlm_lkb *lkb;
- lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL);
+ lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
return lkb;
}
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 07ac709f3ed..f3396c622ae 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -112,7 +112,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
ordinary messages). */
if (msglen > sizeof(__tmp) && p == &__tmp.p) {
- p = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
+ p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
if (p == NULL)
return ret;
}
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index aa2a5775a02..ccc9d62c462 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -115,7 +115,6 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
data->status = lkb->lkb_status;
data->grmode = lkb->lkb_grmode;
data->rqmode = lkb->lkb_rqmode;
- data->timestamp = lkb->lkb_timestamp;
if (lkb->lkb_ua)
data->xid = lkb->lkb_ua->xid;
if (r) {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b3832c67194..065149e84f4 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -175,7 +175,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
/* we could possibly check if the cancel of an orphan has resulted in the lkb
being removed and then remove that lkb from the orphans list and free it */
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
{
struct dlm_ls *ls;
struct dlm_user_args *ua;
@@ -208,6 +208,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
ast_type = lkb->lkb_ast_type;
lkb->lkb_ast_type |= type;
+ if (bastmode)
+ lkb->lkb_bastmode = bastmode;
if (!ast_type) {
kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 35eb6a13d61..1c968649228 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,7 @@
#ifndef __USER_DOT_H__
#define __USER_DOT_H__
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
int dlm_user_init(void);
void dlm_user_exit(void);
int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/dquot.c b/fs/dquot.c
index c237ccc8581..48c0571f831 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash;
struct dqstats dqstats;
-static void dqput(struct dquot *dquot);
-
static inline unsigned int
hashfn(const struct super_block *sb, unsigned int id, int type)
{
@@ -415,6 +413,17 @@ out_dqlock:
return ret;
}
+void dquot_destroy(struct dquot *dquot)
+{
+ kmem_cache_free(dquot_cachep, dquot);
+}
+EXPORT_SYMBOL(dquot_destroy);
+
+static inline void do_destroy_dquot(struct dquot *dquot)
+{
+ dquot->dq_sb->dq_op->destroy_dquot(dquot);
+}
+
/* Invalidate all dquots on the list. Note that this function is called after
* quota is disabled and pointers from inodes removed so there cannot be new
* quota users. There can still be some users of quotas due to inodes being
@@ -463,9 +472,44 @@ restart:
remove_dquot_hash(dquot);
remove_free_dquot(dquot);
remove_inuse(dquot);
- kmem_cache_free(dquot_cachep, dquot);
+ do_destroy_dquot(dquot);
+ }
+ spin_unlock(&dq_list_lock);
+}
+
+/* Call callback for every active dquot on given filesystem */
+int dquot_scan_active(struct super_block *sb,
+ int (*fn)(struct dquot *dquot, unsigned long priv),
+ unsigned long priv)
+{
+ struct dquot *dquot, *old_dquot = NULL;
+ int ret = 0;
+
+ mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+ spin_lock(&dq_list_lock);
+ list_for_each_entry(dquot, &inuse_list, dq_inuse) {
+ if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+ continue;
+ if (dquot->dq_sb != sb)
+ continue;
+ /* Now we have active dquot so we can just increase use count */
+ atomic_inc(&dquot->dq_count);
+ dqstats.lookups++;
+ spin_unlock(&dq_list_lock);
+ dqput(old_dquot);
+ old_dquot = dquot;
+ ret = fn(dquot, priv);
+ if (ret < 0)
+ goto out;
+ spin_lock(&dq_list_lock);
+ /* We are safe to continue now because our dquot could not
+ * be moved out of the inuse list while we hold the reference */
}
spin_unlock(&dq_list_lock);
+out:
+ dqput(old_dquot);
+ mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+ return ret;
}
int vfs_quota_sync(struct super_block *sb, int type)
@@ -479,7 +523,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
continue;
- if (!sb_has_quota_enabled(sb, cnt))
+ if (!sb_has_quota_active(sb, cnt))
continue;
spin_lock(&dq_list_lock);
dirty = &dqopt->info[cnt].dqi_dirty_list;
@@ -504,8 +548,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
}
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt)
- && info_dirty(&dqopt->info[cnt]))
+ if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
+ && info_dirty(&dqopt->info[cnt]))
sb->dq_op->write_info(sb, cnt);
spin_lock(&dq_list_lock);
dqstats.syncs++;
@@ -527,7 +571,7 @@ static void prune_dqcache(int count)
remove_dquot_hash(dquot);
remove_free_dquot(dquot);
remove_inuse(dquot);
- kmem_cache_free(dquot_cachep, dquot);
+ do_destroy_dquot(dquot);
count--;
head = free_dquots.prev;
}
@@ -558,7 +602,7 @@ static struct shrinker dqcache_shrinker = {
* NOTE: If you change this function please check whether dqput_blocks() works right...
* MUST be called with either dqptr_sem or dqonoff_mutex held
*/
-static void dqput(struct dquot *dquot)
+void dqput(struct dquot *dquot)
{
int ret;
@@ -584,7 +628,7 @@ we_slept:
/* We have more than one user... nothing to do */
atomic_dec(&dquot->dq_count);
/* Releasing dquot during quotaoff phase? */
- if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) &&
+ if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) &&
atomic_read(&dquot->dq_count) == 1)
wake_up(&dquot->dq_wait_unused);
spin_unlock(&dq_list_lock);
@@ -625,11 +669,17 @@ we_slept:
spin_unlock(&dq_list_lock);
}
+struct dquot *dquot_alloc(struct super_block *sb, int type)
+{
+ return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+}
+EXPORT_SYMBOL(dquot_alloc);
+
static struct dquot *get_empty_dquot(struct super_block *sb, int type)
{
struct dquot *dquot;
- dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+ dquot = sb->dq_op->alloc_dquot(sb, type);
if(!dquot)
return NODQUOT;
@@ -647,15 +697,33 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
}
/*
+ * Check whether dquot is in memory.
+ * MUST be called with either dqptr_sem or dqonoff_mutex held
+ */
+int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
+{
+ unsigned int hashent = hashfn(sb, id, type);
+ int ret = 0;
+
+ if (!sb_has_quota_active(sb, type))
+ return 0;
+ spin_lock(&dq_list_lock);
+ if (find_dquot(hashent, sb, id, type) != NODQUOT)
+ ret = 1;
+ spin_unlock(&dq_list_lock);
+ return ret;
+}
+
+/*
* Get reference to dquot
* MUST be called with either dqptr_sem or dqonoff_mutex held
*/
-static struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
+struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
{
unsigned int hashent = hashfn(sb, id, type);
struct dquot *dquot, *empty = NODQUOT;
- if (!sb_has_quota_enabled(sb, type))
+ if (!sb_has_quota_active(sb, type))
return NODQUOT;
we_slept:
spin_lock(&dq_list_lock);
@@ -682,7 +750,7 @@ we_slept:
dqstats.lookups++;
spin_unlock(&dq_list_lock);
if (empty)
- kmem_cache_free(dquot_cachep, empty);
+ do_destroy_dquot(empty);
}
/* Wait for dq_lock - after this we know that either dquot_release() is already
* finished or it will be canceled due to dq_count > 1 test */
@@ -820,7 +888,7 @@ static void drop_dquot_ref(struct super_block *sb, int type)
}
}
-static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
{
dquot->dq_dqb.dqb_curinodes += number;
}
@@ -830,9 +898,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
dquot->dq_dqb.dqb_curspace += number;
}
-static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
{
- if (dquot->dq_dqb.dqb_curinodes > number)
+ if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+ dquot->dq_dqb.dqb_curinodes >= number)
dquot->dq_dqb.dqb_curinodes -= number;
else
dquot->dq_dqb.dqb_curinodes = 0;
@@ -843,11 +912,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
{
- if (dquot->dq_dqb.dqb_curspace > number)
+ if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+ dquot->dq_dqb.dqb_curspace >= number)
dquot->dq_dqb.dqb_curspace -= number;
else
dquot->dq_dqb.dqb_curspace = 0;
- if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+ if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
dquot->dq_dqb.dqb_btime = (time_t) 0;
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}
@@ -1023,10 +1093,11 @@ static inline char ignore_hardlimit(struct dquot *dquot)
}
/* needs dq_data_lock */
-static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
+static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
{
*warntype = QUOTA_NL_NOWARN;
- if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+ if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+ test_bit(DQ_FAKE_B, &dquot->dq_flags))
return QUOTA_OK;
if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1058,11 +1129,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
{
*warntype = QUOTA_NL_NOWARN;
- if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+ if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+ test_bit(DQ_FAKE_B, &dquot->dq_flags))
return QUOTA_OK;
if (dquot->dq_dqb.dqb_bhardlimit &&
- toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit &&
+ dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
!ignore_hardlimit(dquot)) {
if (!prealloc)
*warntype = QUOTA_NL_BHARDWARN;
@@ -1070,7 +1142,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
- toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+ dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
!ignore_hardlimit(dquot)) {
if (!prealloc)
@@ -1079,7 +1151,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
- toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+ dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
dquot->dq_dqb.dqb_btime == 0) {
if (!prealloc) {
*warntype = QUOTA_NL_BSOFTWARN;
@@ -1096,10 +1168,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
return QUOTA_OK;
}
-static int info_idq_free(struct dquot *dquot, ulong inodes)
+static int info_idq_free(struct dquot *dquot, qsize_t inodes)
{
if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
- dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
+ dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
+ !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
return QUOTA_NL_NOWARN;
if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1113,15 +1186,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes)
static int info_bdq_free(struct dquot *dquot, qsize_t space)
{
if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
- toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+ dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
return QUOTA_NL_NOWARN;
- if (toqb(dquot->dq_dqb.dqb_curspace - space) <=
- dquot->dq_dqb.dqb_bsoftlimit)
+ if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
return QUOTA_NL_BSOFTBELOW;
- if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit &&
- toqb(dquot->dq_dqb.dqb_curspace - space) <
- dquot->dq_dqb.dqb_bhardlimit)
+ if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
+ dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
return QUOTA_NL_BHARDBELOW;
return QUOTA_NL_NOWARN;
}
@@ -1166,17 +1237,23 @@ out_err:
* Release all quotas referenced by inode
* Transaction must be started at an entry
*/
-int dquot_drop(struct inode *inode)
+int dquot_drop_locked(struct inode *inode)
{
int cnt;
- down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (inode->i_dquot[cnt] != NODQUOT) {
dqput(inode->i_dquot[cnt]);
inode->i_dquot[cnt] = NODQUOT;
}
}
+ return 0;
+}
+
+int dquot_drop(struct inode *inode)
+{
+ down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ dquot_drop_locked(inode);
up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
return 0;
}
@@ -1264,7 +1341,7 @@ warn_put_all:
/*
* This operation can block, but only after everything is updated
*/
-int dquot_alloc_inode(const struct inode *inode, unsigned long number)
+int dquot_alloc_inode(const struct inode *inode, qsize_t number)
{
int cnt, ret = NO_QUOTA;
char warntype[MAXQUOTAS];
@@ -1349,7 +1426,7 @@ out_sub:
/*
* This operation can block, but only after everything is updated
*/
-int dquot_free_inode(const struct inode *inode, unsigned long number)
+int dquot_free_inode(const struct inode *inode, qsize_t number)
{
unsigned int cnt;
char warntype[MAXQUOTAS];
@@ -1495,7 +1572,7 @@ warn_put_all:
/* Wrapper for transferring ownership of an inode */
int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
{
- if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) {
+ if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
vfs_dq_init(inode);
if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
return 1;
@@ -1533,54 +1610,27 @@ struct dquot_operations dquot_operations = {
.acquire_dquot = dquot_acquire,
.release_dquot = dquot_release,
.mark_dirty = dquot_mark_dquot_dirty,
- .write_info = dquot_commit_info
+ .write_info = dquot_commit_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
};
-static inline void set_enable_flags(struct quota_info *dqopt, int type)
-{
- switch (type) {
- case USRQUOTA:
- dqopt->flags |= DQUOT_USR_ENABLED;
- dqopt->flags &= ~DQUOT_USR_SUSPENDED;
- break;
- case GRPQUOTA:
- dqopt->flags |= DQUOT_GRP_ENABLED;
- dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
- break;
- }
-}
-
-static inline void reset_enable_flags(struct quota_info *dqopt, int type,
- int remount)
-{
- switch (type) {
- case USRQUOTA:
- dqopt->flags &= ~DQUOT_USR_ENABLED;
- if (remount)
- dqopt->flags |= DQUOT_USR_SUSPENDED;
- else
- dqopt->flags &= ~DQUOT_USR_SUSPENDED;
- break;
- case GRPQUOTA:
- dqopt->flags &= ~DQUOT_GRP_ENABLED;
- if (remount)
- dqopt->flags |= DQUOT_GRP_SUSPENDED;
- else
- dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
- break;
- }
-}
-
-
/*
* Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
*/
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
{
int cnt, ret = 0;
struct quota_info *dqopt = sb_dqopt(sb);
struct inode *toputinode[MAXQUOTAS];
+ /* Cannot turn off usage accounting without turning off limits, or
+ * suspend quotas and simultaneously turn quotas off. */
+ if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
+ || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
+ DQUOT_USAGE_ENABLED)))
+ return -EINVAL;
+
/* We need to serialize quota_off() for device */
mutex_lock(&dqopt->dqonoff_mutex);
@@ -1589,7 +1639,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
* sometimes we are called when fill_super() failed and calling
* sync_fs() in such cases does no good.
*/
- if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) {
+ if (!sb_any_quota_loaded(sb)) {
mutex_unlock(&dqopt->dqonoff_mutex);
return 0;
}
@@ -1597,17 +1647,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
toputinode[cnt] = NULL;
if (type != -1 && cnt != type)
continue;
- /* If we keep inodes of quota files after remount and quotaoff
- * is called, drop kept inodes. */
- if (!remount && sb_has_quota_suspended(sb, cnt)) {
- iput(dqopt->files[cnt]);
- dqopt->files[cnt] = NULL;
- reset_enable_flags(dqopt, cnt, 0);
+ if (!sb_has_quota_loaded(sb, cnt))
continue;
+
+ if (flags & DQUOT_SUSPENDED) {
+ dqopt->flags |=
+ dquot_state_flag(DQUOT_SUSPENDED, cnt);
+ } else {
+ dqopt->flags &= ~dquot_state_flag(flags, cnt);
+ /* Turning off suspended quotas? */
+ if (!sb_has_quota_loaded(sb, cnt) &&
+ sb_has_quota_suspended(sb, cnt)) {
+ dqopt->flags &= ~dquot_state_flag(
+ DQUOT_SUSPENDED, cnt);
+ iput(dqopt->files[cnt]);
+ dqopt->files[cnt] = NULL;
+ continue;
+ }
}
- if (!sb_has_quota_enabled(sb, cnt))
+
+ /* We still have to keep quota loaded? */
+ if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
continue;
- reset_enable_flags(dqopt, cnt, remount);
/* Note: these are blocking operations */
drop_dquot_ref(sb, cnt);
@@ -1623,7 +1684,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
put_quota_format(dqopt->info[cnt].dqi_format);
toputinode[cnt] = dqopt->files[cnt];
- if (!remount)
+ if (!sb_has_quota_loaded(sb, cnt))
dqopt->files[cnt] = NULL;
dqopt->info[cnt].dqi_flags = 0;
dqopt->info[cnt].dqi_igrace = 0;
@@ -1631,6 +1692,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
dqopt->ops[cnt] = NULL;
}
mutex_unlock(&dqopt->dqonoff_mutex);
+
+ /* Skip syncing and setting flags if quota files are hidden */
+ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+ goto put_inodes;
+
/* Sync the superblock so that buffers with quota data are written to
* disk (and so userspace sees correct data afterwards). */
if (sb->s_op->sync_fs)
@@ -1646,7 +1712,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
mutex_lock(&dqopt->dqonoff_mutex);
/* If quota was reenabled in the meantime, we have
* nothing to do */
- if (!sb_has_quota_enabled(sb, cnt)) {
+ if (!sb_has_quota_loaded(sb, cnt)) {
mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
S_NOATIME | S_NOQUOTA);
@@ -1655,26 +1721,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
mark_inode_dirty(toputinode[cnt]);
}
mutex_unlock(&dqopt->dqonoff_mutex);
+ }
+ if (sb->s_bdev)
+ invalidate_bdev(sb->s_bdev);
+put_inodes:
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+ if (toputinode[cnt]) {
/* On remount RO, we keep the inode pointer so that we
- * can reenable quota on the subsequent remount RW.
- * But we have better not keep inode pointer when there
- * is pending delete on the quota file... */
- if (!remount)
+ * can reenable quota on the subsequent remount RW. We
+ * have to check 'flags' variable and not use sb_has_
+ * function because another quotaon / quotaoff could
+ * change global state before we got here. We refuse
+ * to suspend quotas when there is pending delete on
+ * the quota file... */
+ if (!(flags & DQUOT_SUSPENDED))
iput(toputinode[cnt]);
else if (!toputinode[cnt]->i_nlink)
ret = -EBUSY;
}
- if (sb->s_bdev)
- invalidate_bdev(sb->s_bdev);
return ret;
}
+int vfs_quota_off(struct super_block *sb, int type, int remount)
+{
+ return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
+ (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
+}
+
/*
* Turn quotas on on a device
*/
-/* Helper function when we already have the inode */
-static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
+/*
+ * Helper function to turn quotas on when we already have the inode of
+ * quota file and no quota information is loaded.
+ */
+static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+ unsigned int flags)
{
struct quota_format_type *fmt = find_quota_format(format_id);
struct super_block *sb = inode->i_sb;
@@ -1696,27 +1779,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
error = -EINVAL;
goto out_fmt;
}
+ /* Usage always has to be set... */
+ if (!(flags & DQUOT_USAGE_ENABLED)) {
+ error = -EINVAL;
+ goto out_fmt;
+ }
- /* As we bypass the pagecache we must now flush the inode so that
- * we see all the changes from userspace... */
- write_inode_now(inode, 1);
- /* And now flush the block cache so that kernel sees the changes */
- invalidate_bdev(sb->s_bdev);
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+ /* As we bypass the pagecache we must now flush the inode so
+ * that we see all the changes from userspace... */
+ write_inode_now(inode, 1);
+ /* And now flush the block cache so that kernel sees the
+ * changes */
+ invalidate_bdev(sb->s_bdev);
+ }
mutex_lock(&inode->i_mutex);
mutex_lock(&dqopt->dqonoff_mutex);
- if (sb_has_quota_enabled(sb, type) ||
- sb_has_quota_suspended(sb, type)) {
+ if (sb_has_quota_loaded(sb, type)) {
error = -EBUSY;
goto out_lock;
}
- /* We don't want quota and atime on quota files (deadlocks possible)
- * Also nobody should write to the file - we use special IO operations
- * which ignore the immutable bit. */
- down_write(&dqopt->dqptr_sem);
- oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
- inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
- up_write(&dqopt->dqptr_sem);
- sb->dq_op->drop(inode);
+
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+ /* We don't want quota and atime on quota files (deadlocks
+ * possible) Also nobody should write to the file - we use
+ * special IO operations which ignore the immutable bit. */
+ down_write(&dqopt->dqptr_sem);
+ oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
+ inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+ up_write(&dqopt->dqptr_sem);
+ sb->dq_op->drop(inode);
+ }
error = -EIO;
dqopt->files[type] = igrab(inode);
@@ -1737,7 +1830,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
}
mutex_unlock(&dqopt->dqio_mutex);
mutex_unlock(&inode->i_mutex);
- set_enable_flags(dqopt, type);
+ dqopt->flags |= dquot_state_flag(flags, type);
add_dquot_ref(sb, type);
mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1770,20 +1863,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
struct quota_info *dqopt = sb_dqopt(sb);
struct inode *inode;
int ret;
+ unsigned int flags;
mutex_lock(&dqopt->dqonoff_mutex);
if (!sb_has_quota_suspended(sb, type)) {
mutex_unlock(&dqopt->dqonoff_mutex);
return 0;
}
- BUG_ON(sb_has_quota_enabled(sb, type));
-
inode = dqopt->files[type];
dqopt->files[type] = NULL;
- reset_enable_flags(dqopt, type, 0);
+ flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED, type);
+ dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
mutex_unlock(&dqopt->dqonoff_mutex);
- ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id);
+ flags = dquot_generic_flag(flags, type);
+ ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
+ flags);
iput(inode);
return ret;
@@ -1799,12 +1895,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
if (path->mnt->mnt_sb != sb)
error = -EXDEV;
else
- error = vfs_quota_on_inode(path->dentry->d_inode, type,
- format_id);
+ error = vfs_load_quota_inode(path->dentry->d_inode, type,
+ format_id, DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED);
return error;
}
-/* Actual function called from quotactl() */
int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
int remount)
{
@@ -1823,6 +1919,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
}
/*
+ * More powerful function for turning on quotas allowing setting
+ * of individual quota flags
+ */
+int vfs_quota_enable(struct inode *inode, int type, int format_id,
+ unsigned int flags)
+{
+ int ret = 0;
+ struct super_block *sb = inode->i_sb;
+ struct quota_info *dqopt = sb_dqopt(sb);
+
+ /* Just unsuspend quotas? */
+ if (flags & DQUOT_SUSPENDED)
+ return vfs_quota_on_remount(sb, type);
+ if (!flags)
+ return 0;
+ /* Just updating flags needed? */
+ if (sb_has_quota_loaded(sb, type)) {
+ mutex_lock(&dqopt->dqonoff_mutex);
+ /* Now do a reliable test... */
+ if (!sb_has_quota_loaded(sb, type)) {
+ mutex_unlock(&dqopt->dqonoff_mutex);
+ goto load_quota;
+ }
+ if (flags & DQUOT_USAGE_ENABLED &&
+ sb_has_quota_usage_enabled(sb, type)) {
+ ret = -EBUSY;
+ goto out_lock;
+ }
+ if (flags & DQUOT_LIMITS_ENABLED &&
+ sb_has_quota_limits_enabled(sb, type)) {
+ ret = -EBUSY;
+ goto out_lock;
+ }
+ sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
+out_lock:
+ mutex_unlock(&dqopt->dqonoff_mutex);
+ return ret;
+ }
+
+load_quota:
+ return vfs_load_quota_inode(inode, type, format_id, flags);
+}
+
+/*
* This function is used when filesystem needs to initialize quotas
* during mount time.
*/
@@ -1843,7 +1983,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
error = security_quota_on(dentry);
if (!error)
- error = vfs_quota_on_inode(dentry->d_inode, type, format_id);
+ error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
+ DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
out:
dput(dentry);
@@ -1866,14 +2007,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
return ret;
}
+static inline qsize_t qbtos(qsize_t blocks)
+{
+ return blocks << QIF_DQBLKSIZE_BITS;
+}
+
+static inline qsize_t stoqb(qsize_t space)
+{
+ return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+}
+
/* Generic routine for getting common part of quota structure */
static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
{
struct mem_dqblk *dm = &dquot->dq_dqb;
spin_lock(&dq_data_lock);
- di->dqb_bhardlimit = dm->dqb_bhardlimit;
- di->dqb_bsoftlimit = dm->dqb_bsoftlimit;
+ di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
+ di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
di->dqb_curspace = dm->dqb_curspace;
di->dqb_ihardlimit = dm->dqb_ihardlimit;
di->dqb_isoftlimit = dm->dqb_isoftlimit;
@@ -1918,28 +2069,38 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
if (di->dqb_valid & QIF_SPACE) {
dm->dqb_curspace = di->dqb_curspace;
check_blim = 1;
+ __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
}
if (di->dqb_valid & QIF_BLIMITS) {
- dm->dqb_bsoftlimit = di->dqb_bsoftlimit;
- dm->dqb_bhardlimit = di->dqb_bhardlimit;
+ dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
+ dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
check_blim = 1;
+ __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
}
if (di->dqb_valid & QIF_INODES) {
dm->dqb_curinodes = di->dqb_curinodes;
check_ilim = 1;
+ __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
}
if (di->dqb_valid & QIF_ILIMITS) {
dm->dqb_isoftlimit = di->dqb_isoftlimit;
dm->dqb_ihardlimit = di->dqb_ihardlimit;
check_ilim = 1;
+ __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
}
- if (di->dqb_valid & QIF_BTIME)
+ if (di->dqb_valid & QIF_BTIME) {
dm->dqb_btime = di->dqb_btime;
- if (di->dqb_valid & QIF_ITIME)
+ check_blim = 1;
+ __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+ }
+ if (di->dqb_valid & QIF_ITIME) {
dm->dqb_itime = di->dqb_itime;
+ check_ilim = 1;
+ __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+ }
if (check_blim) {
- if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) {
+ if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
dm->dqb_btime = 0;
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}
@@ -1970,12 +2131,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
int rc;
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- if (!(dquot = dqget(sb, id, type))) {
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
- return -ESRCH;
+ dquot = dqget(sb, id, type);
+ if (!dquot) {
+ rc = -ESRCH;
+ goto out;
}
rc = do_set_dqblk(dquot, di);
dqput(dquot);
+out:
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
return rc;
}
@@ -1986,7 +2149,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
struct mem_dqinfo *mi;
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- if (!sb_has_quota_enabled(sb, type)) {
+ if (!sb_has_quota_active(sb, type)) {
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
return -ESRCH;
}
@@ -2005,11 +2168,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
{
struct mem_dqinfo *mi;
+ int err = 0;
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- if (!sb_has_quota_enabled(sb, type)) {
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
- return -ESRCH;
+ if (!sb_has_quota_active(sb, type)) {
+ err = -ESRCH;
+ goto out;
}
mi = sb_dqopt(sb)->info + type;
spin_lock(&dq_data_lock);
@@ -2023,8 +2187,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
mark_info_dirty(sb, type);
/* Force write to disk */
sb->dq_op->write_info(sb, type);
+out:
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
- return 0;
+ return err;
}
struct quotactl_ops vfs_quotactl_ops = {
@@ -2186,10 +2351,13 @@ EXPORT_SYMBOL(register_quota_format);
EXPORT_SYMBOL(unregister_quota_format);
EXPORT_SYMBOL(dqstats);
EXPORT_SYMBOL(dq_data_lock);
+EXPORT_SYMBOL(vfs_quota_enable);
EXPORT_SYMBOL(vfs_quota_on);
EXPORT_SYMBOL(vfs_quota_on_path);
EXPORT_SYMBOL(vfs_quota_on_mount);
+EXPORT_SYMBOL(vfs_quota_disable);
EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_scan_active);
EXPORT_SYMBOL(vfs_quota_sync);
EXPORT_SYMBOL(vfs_get_dqinfo);
EXPORT_SYMBOL(vfs_set_dqinfo);
@@ -2202,7 +2370,11 @@ EXPORT_SYMBOL(dquot_release);
EXPORT_SYMBOL(dquot_mark_dquot_dirty);
EXPORT_SYMBOL(dquot_initialize);
EXPORT_SYMBOL(dquot_drop);
+EXPORT_SYMBOL(dquot_drop_locked);
EXPORT_SYMBOL(vfs_dq_drop);
+EXPORT_SYMBOL(dqget);
+EXPORT_SYMBOL(dqput);
+EXPORT_SYMBOL(dquot_is_cached);
EXPORT_SYMBOL(dquot_alloc_space);
EXPORT_SYMBOL(dquot_alloc_inode);
EXPORT_SYMBOL(dquot_free_space);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 6046239465a..c01e043670e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -175,8 +175,8 @@ out:
*
* Returns zero on success; non-zero on error.
*/
-static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
- loff_t offset)
+int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+ loff_t offset)
{
int rc = 0;
char dst[MD5_DIGEST_SIZE];
@@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags(
crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED;
+ if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+ crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES;
+ if (mount_crypt_stat->flags
+ & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)
+ crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK;
+ else if (mount_crypt_stat->flags
+ & ECRYPTFS_GLOBAL_ENCFN_USE_FEK)
+ crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK;
+ }
}
static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
@@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem {
static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
{0x00000001, ECRYPTFS_ENABLE_HMAC},
{0x00000002, ECRYPTFS_ENCRYPTED},
- {0x00000004, ECRYPTFS_METADATA_IN_XATTR}
+ {0x00000004, ECRYPTFS_METADATA_IN_XATTR},
+ {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES}
};
/**
@@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = {
/**
* ecryptfs_code_for_cipher_string
- * @crypt_stat: The cryptographic context
+ * @cipher_name: The string alias for the cipher
+ * @key_bytes: Length of key in bytes; used for AES code selection
*
* Returns zero on no match, or the cipher code on match
*/
-u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
+u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes)
{
int i;
u8 code = 0;
struct ecryptfs_cipher_code_str_map_elem *map =
ecryptfs_cipher_code_str_map;
- if (strcmp(crypt_stat->cipher, "aes") == 0) {
- switch (crypt_stat->key_size) {
+ if (strcmp(cipher_name, "aes") == 0) {
+ switch (key_bytes) {
case 16:
code = RFC2440_CIPHER_AES_128;
break;
@@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
}
} else {
for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
- if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){
+ if (strcmp(cipher_name, map[i].cipher_str) == 0) {
code = map[i].cipher_code;
break;
}
@@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data,
&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
int rc;
+ if (crypt_stat->extent_size == 0)
+ crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
ecryptfs_inode);
if (rc) {
@@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data,
}
if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
rc = -EINVAL;
- ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n");
}
out:
return rc;
@@ -1628,95 +1640,95 @@ out:
}
/**
- * ecryptfs_encode_filename - converts a plaintext file name to cipher text
- * @crypt_stat: The crypt_stat struct associated with the file anem to encode
- * @name: The plaintext name
- * @length: The length of the plaintext
- * @encoded_name: The encypted name
+ * ecryptfs_encrypt_filename - encrypt filename
*
- * Encrypts and encodes a filename into something that constitutes a
- * valid filename for a filesystem, with printable characters.
+ * CBC-encrypts the filename. We do not want to encrypt the same
+ * filename with the same key and IV, which may happen with hard
+ * links, so we prepend random bits to each filename.
*
- * We assume that we have a properly initialized crypto context,
- * pointed to by crypt_stat->tfm.
- *
- * TODO: Implement filename decoding and decryption here, in place of
- * memcpy. We are keeping the framework around for now to (1)
- * facilitate testing of the components needed to implement filename
- * encryption and (2) to provide a code base from which other
- * developers in the community can easily implement this feature.
- *
- * Returns the length of encoded filename; negative if error
+ * Returns zero on success; non-zero otherwise
*/
-int
-ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
- const char *name, int length, char **encoded_name)
+static int
+ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
{
- int error = 0;
+ int rc = 0;
- (*encoded_name) = kmalloc(length + 2, GFP_KERNEL);
- if (!(*encoded_name)) {
- error = -ENOMEM;
+ filename->encrypted_filename = NULL;
+ filename->encrypted_filename_size = 0;
+ if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+ || (mount_crypt_stat && (mount_crypt_stat->flags
+ & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+ size_t packet_size;
+ size_t remaining_bytes;
+
+ rc = ecryptfs_write_tag_70_packet(
+ NULL, NULL,
+ &filename->encrypted_filename_size,
+ mount_crypt_stat, NULL,
+ filename->filename_size);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to get packet "
+ "size for tag 72; rc = [%d]\n", __func__,
+ rc);
+ filename->encrypted_filename_size = 0;
+ goto out;
+ }
+ filename->encrypted_filename =
+ kmalloc(filename->encrypted_filename_size, GFP_KERNEL);
+ if (!filename->encrypted_filename) {
+ printk(KERN_ERR "%s: Out of memory whilst attempting "
+ "to kmalloc [%zd] bytes\n", __func__,
+ filename->encrypted_filename_size);
+ rc = -ENOMEM;
+ goto out;
+ }
+ remaining_bytes = filename->encrypted_filename_size;
+ rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename,
+ &remaining_bytes,
+ &packet_size,
+ mount_crypt_stat,
+ filename->filename,
+ filename->filename_size);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to generate "
+ "tag 70 packet; rc = [%d]\n", __func__,
+ rc);
+ kfree(filename->encrypted_filename);
+ filename->encrypted_filename = NULL;
+ filename->encrypted_filename_size = 0;
+ goto out;
+ }
+ filename->encrypted_filename_size = packet_size;
+ } else {
+ printk(KERN_ERR "%s: No support for requested filename "
+ "encryption method in this release\n", __func__);
+ rc = -ENOTSUPP;
goto out;
}
- /* TODO: Filename encryption is a scheduled feature for a
- * future version of eCryptfs. This function is here only for
- * the purpose of providing a framework for other developers
- * to easily implement filename encryption. Hint: Replace this
- * memcpy() with a call to encrypt and encode the
- * filename, the set the length accordingly. */
- memcpy((void *)(*encoded_name), (void *)name, length);
- (*encoded_name)[length] = '\0';
- error = length + 1;
out:
- return error;
+ return rc;
}
-/**
- * ecryptfs_decode_filename - converts the cipher text name to plaintext
- * @crypt_stat: The crypt_stat struct associated with the file
- * @name: The filename in cipher text
- * @length: The length of the cipher text name
- * @decrypted_name: The plaintext name
- *
- * Decodes and decrypts the filename.
- *
- * We assume that we have a properly initialized crypto context,
- * pointed to by crypt_stat->tfm.
- *
- * TODO: Implement filename decoding and decryption here, in place of
- * memcpy. We are keeping the framework around for now to (1)
- * facilitate testing of the components needed to implement filename
- * encryption and (2) to provide a code base from which other
- * developers in the community can easily implement this feature.
- *
- * Returns the length of decoded filename; negative if error
- */
-int
-ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
- const char *name, int length, char **decrypted_name)
+static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
+ const char *name, size_t name_size)
{
- int error = 0;
+ int rc = 0;
- (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL);
- if (!(*decrypted_name)) {
- error = -ENOMEM;
+ (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL);
+ if (!(*copied_name)) {
+ rc = -ENOMEM;
goto out;
}
- /* TODO: Filename encryption is a scheduled feature for a
- * future version of eCryptfs. This function is here only for
- * the purpose of providing a framework for other developers
- * to easily implement filename encryption. Hint: Replace this
- * memcpy() with a call to decode and decrypt the
- * filename, the set the length accordingly. */
- memcpy((void *)(*decrypted_name), (void *)name, length);
- (*decrypted_name)[length + 1] = '\0'; /* Only for convenience
+ memcpy((void *)(*copied_name), (void *)name, name_size);
+ (*copied_name)[(name_size)] = '\0'; /* Only for convenience
* in printing out the
* string in debug
* messages */
- error = length;
+ (*copied_name_size) = (name_size + 1);
out:
- return error;
+ return rc;
}
/**
@@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
*key_tfm = NULL;
if (*key_size > ECRYPTFS_MAX_KEY_BYTES) {
rc = -EINVAL;
- printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum "
+ printk(KERN_ERR "Requested key size is [%zd] bytes; maximum "
"allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES);
goto out;
}
@@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
get_random_bytes(dummy_key, *key_size);
rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
if (rc) {
- printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
+ printk(KERN_ERR "Error attempting to set key of size [%zd] for "
"cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc);
rc = -EINVAL;
goto out;
@@ -1910,3 +1922,341 @@ out:
mutex_unlock(&key_tfm_list_mutex);
return rc;
}
+
+/* 64 characters forming a 6-bit target field */
+static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
+ "EFGHIJKLMNOPQRST"
+ "UVWXYZabcdefghij"
+ "klmnopqrstuvwxyz");
+
+/* We could either offset on every reverse map or just pad some 0x00's
+ * at the front here */
+static const unsigned char filename_rev_map[] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */
+ 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */
+ 0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */
+ 0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */
+ 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */
+ 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */
+ 0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */
+ 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
+ 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
+ 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
+ 0x3D, 0x3E, 0x3F
+};
+
+/**
+ * ecryptfs_encode_for_filename
+ * @dst: Destination location for encoded filename
+ * @dst_size: Size of the encoded filename in bytes
+ * @src: Source location for the filename to encode
+ * @src_size: Size of the source in bytes
+ */
+void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
+ unsigned char *src, size_t src_size)
+{
+ size_t num_blocks;
+ size_t block_num = 0;
+ size_t dst_offset = 0;
+ unsigned char last_block[3];
+
+ if (src_size == 0) {
+ (*dst_size) = 0;
+ goto out;
+ }
+ num_blocks = (src_size / 3);
+ if ((src_size % 3) == 0) {
+ memcpy(last_block, (&src[src_size - 3]), 3);
+ } else {
+ num_blocks++;
+ last_block[2] = 0x00;
+ switch (src_size % 3) {
+ case 1:
+ last_block[0] = src[src_size - 1];
+ last_block[1] = 0x00;
+ break;
+ case 2:
+ last_block[0] = src[src_size - 2];
+ last_block[1] = src[src_size - 1];
+ }
+ }
+ (*dst_size) = (num_blocks * 4);
+ if (!dst)
+ goto out;
+ while (block_num < num_blocks) {
+ unsigned char *src_block;
+ unsigned char dst_block[4];
+
+ if (block_num == (num_blocks - 1))
+ src_block = last_block;
+ else
+ src_block = &src[block_num * 3];
+ dst_block[0] = ((src_block[0] >> 2) & 0x3F);
+ dst_block[1] = (((src_block[0] << 4) & 0x30)
+ | ((src_block[1] >> 4) & 0x0F));
+ dst_block[2] = (((src_block[1] << 2) & 0x3C)
+ | ((src_block[2] >> 6) & 0x03));
+ dst_block[3] = (src_block[2] & 0x3F);
+ dst[dst_offset++] = portable_filename_chars[dst_block[0]];
+ dst[dst_offset++] = portable_filename_chars[dst_block[1]];
+ dst[dst_offset++] = portable_filename_chars[dst_block[2]];
+ dst[dst_offset++] = portable_filename_chars[dst_block[3]];
+ block_num++;
+ }
+out:
+ return;
+}
+
+/**
+ * ecryptfs_decode_from_filename
+ * @dst: If NULL, this function only sets @dst_size and returns. If
+ * non-NULL, this function decodes the encoded octets in @src
+ * into the memory that @dst points to.
+ * @dst_size: Set to the size of the decoded string.
+ * @src: The encoded set of octets to decode.
+ * @src_size: The size of the encoded set of octets to decode.
+ */
+static void
+ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
+ const unsigned char *src, size_t src_size)
+{
+ u8 current_bit_offset = 0;
+ size_t src_byte_offset = 0;
+ size_t dst_byte_offset = 0;
+
+ if (dst == NULL) {
+ /* Not exact; conservatively long. Every block of 4
+ * encoded characters decodes into a block of 3
+ * decoded characters. This segment of code provides
+ * the caller with the maximum amount of allocated
+ * space that @dst will need to point to in a
+ * subsequent call. */
+ (*dst_size) = (((src_size + 1) * 3) / 4);
+ goto out;
+ }
+ while (src_byte_offset < src_size) {
+ unsigned char src_byte =
+ filename_rev_map[(int)src[src_byte_offset]];
+
+ switch (current_bit_offset) {
+ case 0:
+ dst[dst_byte_offset] = (src_byte << 2);
+ current_bit_offset = 6;
+ break;
+ case 6:
+ dst[dst_byte_offset++] |= (src_byte >> 4);
+ dst[dst_byte_offset] = ((src_byte & 0xF)
+ << 4);
+ current_bit_offset = 4;
+ break;
+ case 4:
+ dst[dst_byte_offset++] |= (src_byte >> 2);
+ dst[dst_byte_offset] = (src_byte << 6);
+ current_bit_offset = 2;
+ break;
+ case 2:
+ dst[dst_byte_offset++] |= (src_byte);
+ dst[dst_byte_offset] = 0;
+ current_bit_offset = 0;
+ break;
+ }
+ src_byte_offset++;
+ }
+ (*dst_size) = dst_byte_offset;
+out:
+ return;
+}
+
+/**
+ * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
+ * @crypt_stat: The crypt_stat struct associated with the file anem to encode
+ * @name: The plaintext name
+ * @length: The length of the plaintext
+ * @encoded_name: The encypted name
+ *
+ * Encrypts and encodes a filename into something that constitutes a
+ * valid filename for a filesystem, with printable characters.
+ *
+ * We assume that we have a properly initialized crypto context,
+ * pointed to by crypt_stat->tfm.
+ *
+ * Returns zero on success; non-zero on otherwise
+ */
+int ecryptfs_encrypt_and_encode_filename(
+ char **encoded_name,
+ size_t *encoded_name_size,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+ const char *name, size_t name_size)
+{
+ size_t encoded_name_no_prefix_size;
+ int rc = 0;
+
+ (*encoded_name) = NULL;
+ (*encoded_name_size) = 0;
+ if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
+ || (mount_crypt_stat && (mount_crypt_stat->flags
+ & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
+ struct ecryptfs_filename *filename;
+
+ filename = kzalloc(sizeof(*filename), GFP_KERNEL);
+ if (!filename) {
+ printk(KERN_ERR "%s: Out of memory whilst attempting "
+ "to kzalloc [%zd] bytes\n", __func__,
+ sizeof(*filename));
+ rc = -ENOMEM;
+ goto out;
+ }
+ filename->filename = (char *)name;
+ filename->filename_size = name_size;
+ rc = ecryptfs_encrypt_filename(filename, crypt_stat,
+ mount_crypt_stat);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to encrypt "
+ "filename; rc = [%d]\n", __func__, rc);
+ kfree(filename);
+ goto out;
+ }
+ ecryptfs_encode_for_filename(
+ NULL, &encoded_name_no_prefix_size,
+ filename->encrypted_filename,
+ filename->encrypted_filename_size);
+ if ((crypt_stat && (crypt_stat->flags
+ & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+ || (mount_crypt_stat
+ && (mount_crypt_stat->flags
+ & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
+ (*encoded_name_size) =
+ (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+ + encoded_name_no_prefix_size);
+ else
+ (*encoded_name_size) =
+ (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+ + encoded_name_no_prefix_size);
+ (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
+ if (!(*encoded_name)) {
+ printk(KERN_ERR "%s: Out of memory whilst attempting "
+ "to kzalloc [%zd] bytes\n", __func__,
+ (*encoded_name_size));
+ rc = -ENOMEM;
+ kfree(filename->encrypted_filename);
+ kfree(filename);
+ goto out;
+ }
+ if ((crypt_stat && (crypt_stat->flags
+ & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+ || (mount_crypt_stat
+ && (mount_crypt_stat->flags
+ & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+ memcpy((*encoded_name),
+ ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+ ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
+ ecryptfs_encode_for_filename(
+ ((*encoded_name)
+ + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE),
+ &encoded_name_no_prefix_size,
+ filename->encrypted_filename,
+ filename->encrypted_filename_size);
+ (*encoded_name_size) =
+ (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+ + encoded_name_no_prefix_size);
+ (*encoded_name)[(*encoded_name_size)] = '\0';
+ (*encoded_name_size)++;
+ } else {
+ rc = -ENOTSUPP;
+ }
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to encode "
+ "encrypted filename; rc = [%d]\n", __func__,
+ rc);
+ kfree((*encoded_name));
+ (*encoded_name) = NULL;
+ (*encoded_name_size) = 0;
+ }
+ kfree(filename->encrypted_filename);
+ kfree(filename);
+ } else {
+ rc = ecryptfs_copy_filename(encoded_name,
+ encoded_name_size,
+ name, name_size);
+ }
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
+ * @plaintext_name: The plaintext name
+ * @plaintext_name_size: The plaintext name size
+ * @ecryptfs_dir_dentry: eCryptfs directory dentry
+ * @name: The filename in cipher text
+ * @name_size: The cipher text name size
+ *
+ * Decrypts and decodes the filename.
+ *
+ * Returns zero on error; non-zero otherwise
+ */
+int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
+ size_t *plaintext_name_size,
+ struct dentry *ecryptfs_dir_dentry,
+ const char *name, size_t name_size)
+{
+ char *decoded_name;
+ size_t decoded_name_size;
+ size_t packet_size;
+ int rc = 0;
+
+ if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
+ && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+ ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+ &ecryptfs_superblock_to_private(
+ ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
+ const char *orig_name = name;
+ size_t orig_name_size = name_size;
+
+ name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+ name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+ ecryptfs_decode_from_filename(NULL, &decoded_name_size,
+ name, name_size);
+ decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
+ if (!decoded_name) {
+ printk(KERN_ERR "%s: Out of memory whilst attempting "
+ "to kmalloc [%zd] bytes\n", __func__,
+ decoded_name_size);
+ rc = -ENOMEM;
+ goto out;
+ }
+ ecryptfs_decode_from_filename(decoded_name, &decoded_name_size,
+ name, name_size);
+ rc = ecryptfs_parse_tag_70_packet(plaintext_name,
+ plaintext_name_size,
+ &packet_size,
+ mount_crypt_stat,
+ decoded_name,
+ decoded_name_size);
+ if (rc) {
+ printk(KERN_INFO "%s: Could not parse tag 70 packet "
+ "from filename; copying through filename "
+ "as-is\n", __func__);
+ rc = ecryptfs_copy_filename(plaintext_name,
+ plaintext_name_size,
+ orig_name, orig_name_size);
+ goto out_free;
+ }
+ } else {
+ rc = ecryptfs_copy_filename(plaintext_name,
+ plaintext_name_size,
+ name, name_size);
+ goto out;
+ }
+out_free:
+ kfree(decoded_name);
+out:
+ return rc;
+}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a75026d35d1..c11fc95714a 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -51,12 +51,16 @@
#define ECRYPTFS_VERSIONING_XATTR 0x00000010
#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020
#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040
+#define ECRYPTFS_VERSIONING_HMAC 0x00000080
+#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100
+#define ECRYPTFS_VERSIONING_GCM 0x00000200
#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
| ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
| ECRYPTFS_VERSIONING_PUBKEY \
| ECRYPTFS_VERSIONING_XATTR \
| ECRYPTFS_VERSIONING_MULTKEY \
- | ECRYPTFS_VERSIONING_DEVMISC)
+ | ECRYPTFS_VERSIONING_DEVMISC \
+ | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
#define ECRYPTFS_MAX_PASSWORD_LENGTH 64
#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
#define ECRYPTFS_SALT_SIZE 8
@@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key)
#define ECRYPTFS_DEFAULT_CIPHER "aes"
#define ECRYPTFS_DEFAULT_KEY_BYTES 16
#define ECRYPTFS_DEFAULT_HASH "md5"
+#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
@@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key)
#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41
#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42
#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43
+#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename
+ * as dentry name */
+#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in
+ * metadata */
+#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as
+ * dentry name */
+#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
+ * metadata */
+/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
+ * ECRYPTFS_MAX_IV_BYTES */
+#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
+#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
#define MD5_DIGEST_SIZE 16
+#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
+#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
+#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
+#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
+#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24
+#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32)
struct ecryptfs_key_sig {
struct list_head crypt_stat_list;
char keysig[ECRYPTFS_SIG_SIZE_HEX];
};
+struct ecryptfs_filename {
+ struct list_head crypt_stat_list;
+#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001
+ u32 flags;
+ u32 seq_no;
+ char *filename;
+ char *encrypted_filename;
+ size_t filename_size;
+ size_t encrypted_filename_size;
+ char fnek_sig[ECRYPTFS_SIG_SIZE_HEX];
+ char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1];
+};
+
/**
* This is the primary struct associated with each encrypted file.
*
* TODO: cache align/pack?
*/
struct ecryptfs_crypt_stat {
-#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
-#define ECRYPTFS_POLICY_APPLIED 0x00000002
-#define ECRYPTFS_NEW_FILE 0x00000004
-#define ECRYPTFS_ENCRYPTED 0x00000008
-#define ECRYPTFS_SECURITY_WARNING 0x00000010
-#define ECRYPTFS_ENABLE_HMAC 0x00000020
-#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040
-#define ECRYPTFS_KEY_VALID 0x00000080
-#define ECRYPTFS_METADATA_IN_XATTR 0x00000100
-#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200
-#define ECRYPTFS_KEY_SET 0x00000400
+#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
+#define ECRYPTFS_POLICY_APPLIED 0x00000002
+#define ECRYPTFS_NEW_FILE 0x00000004
+#define ECRYPTFS_ENCRYPTED 0x00000008
+#define ECRYPTFS_SECURITY_WARNING 0x00000010
+#define ECRYPTFS_ENABLE_HMAC 0x00000020
+#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040
+#define ECRYPTFS_KEY_VALID 0x00000080
+#define ECRYPTFS_METADATA_IN_XATTR 0x00000100
+#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200
+#define ECRYPTFS_KEY_SET 0x00000400
+#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800
+#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
+#define ECRYPTFS_ENCFN_USE_FEK 0x00002000
u32 flags;
unsigned int file_version;
size_t iv_bytes;
@@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat {
#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002
#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004
#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008
+#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010
+#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020
+#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040
u32 flags;
struct list_head global_auth_tok_list;
struct mutex global_auth_tok_list_mutex;
size_t num_global_auth_toks;
size_t global_default_cipher_key_size;
+ size_t global_default_fn_cipher_key_bytes;
unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
+ 1];
+ unsigned char global_default_fn_cipher_name[
+ ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
+ char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
};
/* superblock private data. */
@@ -571,13 +617,22 @@ struct ecryptfs_open_req {
int ecryptfs_interpose(struct dentry *hidden_dentry,
struct dentry *this_dentry, struct super_block *sb,
u32 flags);
+int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
+ struct dentry *lower_dentry,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct inode *ecryptfs_dir_inode,
+ struct nameidata *ecryptfs_nd);
+int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
+ size_t *decrypted_name_size,
+ struct dentry *ecryptfs_dentry,
+ const char *name, size_t name_size);
int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
-int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
- const char *name, int length,
- char **decrypted_name);
-int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
- const char *name, int length,
- char **encoded_name);
+int ecryptfs_encrypt_and_encode_filename(
+ char **encoded_name,
+ size_t *encoded_name_size,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+ const char *name, size_t name_size);
struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
void ecryptfs_dump_hex(char *data, int bytes);
int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
@@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data,
struct inode *ecryptfs_inode);
int ecryptfs_read_and_validate_xattr_region(char *page_virt,
struct dentry *ecryptfs_dentry);
-u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat);
+u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
int ecryptfs_generate_key_packet_set(char *dest_base,
@@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file,
struct vfsmount *lower_mnt,
const struct cred *cred);
int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
+int
+ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
+ size_t *packet_size,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+ char *filename, size_t filename_size);
+int
+ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
+ size_t *packet_size,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+ char *data, size_t max_packet_size);
+int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+ loff_t offset);
#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index eb3dc4c7ac0..9e944057001 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback {
/* Inspired by generic filldir in fs/readdir.c */
static int
-ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset,
- u64 ino, unsigned int d_type)
+ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
{
- struct ecryptfs_crypt_stat *crypt_stat;
struct ecryptfs_getdents_callback *buf =
(struct ecryptfs_getdents_callback *)dirent;
+ size_t name_size;
+ char *name;
int rc;
- int decoded_length;
- char *decoded_name;
- crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
buf->filldir_called++;
- decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen,
- &decoded_name);
- if (decoded_length < 0) {
- rc = decoded_length;
+ rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
+ buf->dentry, lower_name,
+ lower_namelen);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to decode and decrypt "
+ "filename [%s]; rc = [%d]\n", __func__, lower_name,
+ rc);
goto out;
}
- rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset,
- ino, d_type);
- kfree(decoded_name);
+ rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
+ kfree(name);
if (rc >= 0)
buf->entries_written++;
out:
@@ -106,8 +106,8 @@ out:
/**
* ecryptfs_readdir
- * @file: The ecryptfs file struct
- * @dirent: Directory entry
+ * @file: The eCryptfs directory file
+ * @dirent: Directory entry handle
* @filldir: The filldir callback function
*/
static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
@@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
static int
ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
- struct file *lower_file = ecryptfs_file_to_lower(file);
- struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- struct inode *lower_inode = lower_dentry->d_inode;
- int rc = -EINVAL;
-
- if (lower_inode->i_fop->fsync) {
- mutex_lock(&lower_inode->i_mutex);
- rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
- datasync);
- mutex_unlock(&lower_inode->i_mutex);
- }
- return rc;
+ return vfs_fsync(ecryptfs_file_to_lower(file),
+ ecryptfs_dentry_to_lower(dentry),
+ datasync);
}
static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5e78fc17988..5697899a168 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir)
/**
* ecryptfs_create_underlying_file
* @lower_dir_inode: inode of the parent in the lower fs of the new file
- * @lower_dentry: New file's dentry in the lower fs
- * @ecryptfs_dentry: New file's dentry in ecryptfs
+ * @dentry: New file's dentry
* @mode: The mode of the new file
* @nd: nameidata of ecryptfs' parent's dentry & vfsmount
*
@@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
{
int rc;
- /* ecryptfs_do_create() calls ecryptfs_interpose(), which opens
- * the crypt_stat->lower_file (persistent file) */
+ /* ecryptfs_do_create() calls ecryptfs_interpose() */
rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
if (unlikely(rc)) {
ecryptfs_printk(KERN_WARNING, "Failed to create file in"
@@ -244,141 +242,91 @@ out:
}
/**
- * ecryptfs_lookup
- * @dir: inode
- * @dentry: The dentry
- * @nd: nameidata, may be NULL
- *
- * Find a file on disk. If the file does not exist, then we'll add it to the
- * dentry cache and continue on to read it from the disk.
+ * ecryptfs_lookup_and_interpose_lower - Perform a lookup
*/
-static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
+ struct dentry *lower_dentry,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct inode *ecryptfs_dir_inode,
+ struct nameidata *ecryptfs_nd)
{
- int rc = 0;
struct dentry *lower_dir_dentry;
- struct dentry *lower_dentry;
struct vfsmount *lower_mnt;
- char *encoded_name;
- int encoded_namelen;
- struct ecryptfs_crypt_stat *crypt_stat = NULL;
+ struct inode *lower_inode;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
char *page_virt = NULL;
- struct inode *lower_inode;
u64 file_size;
+ int rc = 0;
- lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
- dentry->d_op = &ecryptfs_dops;
- if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, "."))
- || (dentry->d_name.len == 2
- && !strcmp(dentry->d_name.name, ".."))) {
- d_drop(dentry);
- goto out;
- }
- encoded_namelen = ecryptfs_encode_filename(crypt_stat,
- dentry->d_name.name,
- dentry->d_name.len,
- &encoded_name);
- if (encoded_namelen < 0) {
- rc = encoded_namelen;
- d_drop(dentry);
- goto out;
- }
- ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
- "= [%d]\n", encoded_name, encoded_namelen);
- lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
- encoded_namelen - 1);
- kfree(encoded_name);
- if (IS_ERR(lower_dentry)) {
- ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
- rc = PTR_ERR(lower_dentry);
- d_drop(dentry);
- goto out;
- }
- lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
- ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
- "d_name.name = [%s]\n", lower_dentry,
- lower_dentry->d_name.name);
+ lower_dir_dentry = lower_dentry->d_parent;
+ lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
+ ecryptfs_dentry->d_parent));
lower_inode = lower_dentry->d_inode;
- fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode);
+ fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
BUG_ON(!atomic_read(&lower_dentry->d_count));
- ecryptfs_set_dentry_private(dentry,
+ ecryptfs_set_dentry_private(ecryptfs_dentry,
kmem_cache_alloc(ecryptfs_dentry_info_cache,
GFP_KERNEL));
- if (!ecryptfs_dentry_to_private(dentry)) {
+ if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
rc = -ENOMEM;
- ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting "
- "to allocate ecryptfs_dentry_info struct\n");
+ printk(KERN_ERR "%s: Out of memory whilst attempting "
+ "to allocate ecryptfs_dentry_info struct\n",
+ __func__);
goto out_dput;
}
- ecryptfs_set_dentry_lower(dentry, lower_dentry);
- ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
+ ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
+ ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
if (!lower_dentry->d_inode) {
/* We want to add because we couldn't find in lower */
- d_add(dentry, NULL);
+ d_add(ecryptfs_dentry, NULL);
goto out;
}
- rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb,
- ECRYPTFS_INTERPOSE_FLAG_D_ADD);
+ rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
+ ecryptfs_dir_inode->i_sb, 1);
if (rc) {
- ecryptfs_printk(KERN_ERR, "Error interposing\n");
+ printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
+ __func__, rc);
goto out;
}
- if (S_ISDIR(lower_inode->i_mode)) {
- ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
+ if (S_ISDIR(lower_inode->i_mode))
goto out;
- }
- if (S_ISLNK(lower_inode->i_mode)) {
- ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
+ if (S_ISLNK(lower_inode->i_mode))
goto out;
- }
- if (special_file(lower_inode->i_mode)) {
- ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
+ if (special_file(lower_inode->i_mode))
goto out;
- }
- if (!nd) {
- ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
- "as we *think* we are about to unlink\n");
+ if (!ecryptfs_nd)
goto out;
- }
/* Released in this function */
- page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2,
- GFP_USER);
+ page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
if (!page_virt) {
+ printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
+ __func__);
rc = -ENOMEM;
- ecryptfs_printk(KERN_ERR,
- "Cannot ecryptfs_kmalloc a page\n");
goto out;
}
- crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
- if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
- ecryptfs_set_default_sizes(crypt_stat);
- if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) {
- rc = ecryptfs_init_persistent_file(dentry);
+ if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
+ rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
if (rc) {
printk(KERN_ERR "%s: Error attempting to initialize "
"the persistent file for the dentry with name "
"[%s]; rc = [%d]\n", __func__,
- dentry->d_name.name, rc);
- goto out;
+ ecryptfs_dentry->d_name.name, rc);
+ goto out_free_kmem;
}
}
rc = ecryptfs_read_and_validate_header_region(page_virt,
- dentry->d_inode);
+ ecryptfs_dentry->d_inode);
if (rc) {
- rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry);
+ rc = ecryptfs_read_and_validate_xattr_region(page_virt,
+ ecryptfs_dentry);
if (rc) {
- printk(KERN_DEBUG "Valid metadata not found in header "
- "region or xattr region; treating file as "
- "unencrypted\n");
rc = 0;
- kmem_cache_free(ecryptfs_header_cache_2, page_virt);
- goto out;
+ goto out_free_kmem;
}
crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
}
mount_crypt_stat = &ecryptfs_superblock_to_private(
- dentry->d_sb)->mount_crypt_stat;
+ ecryptfs_dentry->d_sb)->mount_crypt_stat;
if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
file_size = (crypt_stat->num_header_bytes_at_front
@@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
} else {
file_size = get_unaligned_be64(page_virt);
}
- i_size_write(dentry->d_inode, (loff_t)file_size);
+ i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size);
+out_free_kmem:
kmem_cache_free(ecryptfs_header_cache_2, page_virt);
goto out;
-
out_dput:
dput(lower_dentry);
- d_drop(dentry);
+ d_drop(ecryptfs_dentry);
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_lookup
+ * @ecryptfs_dir_inode: The eCryptfs directory inode
+ * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
+ * @ecryptfs_nd: nameidata; may be NULL
+ *
+ * Find a file on disk. If the file does not exist, then we'll add it to the
+ * dentry cache and continue on to read it from the disk.
+ */
+static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
+ struct dentry *ecryptfs_dentry,
+ struct nameidata *ecryptfs_nd)
+{
+ char *encrypted_and_encoded_name = NULL;
+ size_t encrypted_and_encoded_name_size;
+ struct ecryptfs_crypt_stat *crypt_stat = NULL;
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
+ struct ecryptfs_inode_info *inode_info;
+ struct dentry *lower_dir_dentry, *lower_dentry;
+ int rc = 0;
+
+ ecryptfs_dentry->d_op = &ecryptfs_dops;
+ if ((ecryptfs_dentry->d_name.len == 1
+ && !strcmp(ecryptfs_dentry->d_name.name, "."))
+ || (ecryptfs_dentry->d_name.len == 2
+ && !strcmp(ecryptfs_dentry->d_name.name, ".."))) {
+ goto out_d_drop;
+ }
+ lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
+ lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
+ lower_dir_dentry,
+ ecryptfs_dentry->d_name.len);
+ if (IS_ERR(lower_dentry)) {
+ rc = PTR_ERR(lower_dentry);
+ printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
+ "lower_dentry = [%s]\n", __func__, rc,
+ ecryptfs_dentry->d_name.name);
+ goto out_d_drop;
+ }
+ if (lower_dentry->d_inode)
+ goto lookup_and_interpose;
+ inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
+ if (inode_info) {
+ crypt_stat = &inode_info->crypt_stat;
+ /* TODO: lock for crypt_stat comparison */
+ if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+ ecryptfs_set_default_sizes(crypt_stat);
+ }
+ if (crypt_stat)
+ mount_crypt_stat = crypt_stat->mount_crypt_stat;
+ else
+ mount_crypt_stat = &ecryptfs_superblock_to_private(
+ ecryptfs_dentry->d_sb)->mount_crypt_stat;
+ if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
+ && !(mount_crypt_stat && (mount_crypt_stat->flags
+ & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
+ goto lookup_and_interpose;
+ dput(lower_dentry);
+ rc = ecryptfs_encrypt_and_encode_filename(
+ &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
+ crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name,
+ ecryptfs_dentry->d_name.len);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to encrypt and encode "
+ "filename; rc = [%d]\n", __func__, rc);
+ goto out_d_drop;
+ }
+ lower_dentry = lookup_one_len(encrypted_and_encoded_name,
+ lower_dir_dentry,
+ encrypted_and_encoded_name_size - 1);
+ if (IS_ERR(lower_dentry)) {
+ rc = PTR_ERR(lower_dentry);
+ printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
+ "lower_dentry = [%s]\n", __func__, rc,
+ encrypted_and_encoded_name);
+ goto out_d_drop;
+ }
+lookup_and_interpose:
+ rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
+ crypt_stat, ecryptfs_dir_inode,
+ ecryptfs_nd);
+ goto out;
+out_d_drop:
+ d_drop(ecryptfs_dentry);
out:
+ kfree(encrypted_and_encoded_name);
return ERR_PTR(rc);
}
@@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
struct dentry *lower_dentry;
struct dentry *lower_dir_dentry;
char *encoded_symname;
- int encoded_symlen;
- struct ecryptfs_crypt_stat *crypt_stat = NULL;
+ size_t encoded_symlen;
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
dget(lower_dentry);
lower_dir_dentry = lock_parent(lower_dentry);
- encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname,
- strlen(symname),
- &encoded_symname);
- if (encoded_symlen < 0) {
- rc = encoded_symlen;
+ mount_crypt_stat = &ecryptfs_superblock_to_private(
+ dir->i_sb)->mount_crypt_stat;
+ rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
+ &encoded_symlen,
+ NULL,
+ mount_crypt_stat, symname,
+ strlen(symname));
+ if (rc)
goto out_lock;
- }
rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
encoded_symname);
kfree(encoded_symname);
@@ -602,53 +641,54 @@ out_lock:
}
static int
-ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz)
+ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
- int rc;
- struct dentry *lower_dentry;
- char *decoded_name;
char *lower_buf;
- mm_segment_t old_fs;
+ struct dentry *lower_dentry;
struct ecryptfs_crypt_stat *crypt_stat;
+ char *plaintext_name;
+ size_t plaintext_name_size;
+ mm_segment_t old_fs;
+ int rc;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- if (!lower_dentry->d_inode->i_op ||
- !lower_dentry->d_inode->i_op->readlink) {
+ if (!lower_dentry->d_inode->i_op->readlink) {
rc = -EINVAL;
goto out;
}
+ crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
/* Released in this function */
lower_buf = kmalloc(bufsiz, GFP_KERNEL);
if (lower_buf == NULL) {
- ecryptfs_printk(KERN_ERR, "Out of memory\n");
+ printk(KERN_ERR "%s: Out of memory whilst attempting to "
+ "kmalloc [%d] bytes\n", __func__, bufsiz);
rc = -ENOMEM;
goto out;
}
old_fs = get_fs();
set_fs(get_ds());
- ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
- "lower_dentry->d_name.name = [%s]\n",
- lower_dentry->d_name.name);
rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
(char __user *)lower_buf,
bufsiz);
set_fs(old_fs);
if (rc >= 0) {
- crypt_stat = NULL;
- rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc,
- &decoded_name);
- if (rc == -ENOMEM)
+ rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
+ &plaintext_name_size,
+ dentry, lower_buf,
+ rc);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to decode and "
+ "decrypt filename; rc = [%d]\n", __func__,
+ rc);
goto out_free_lower_buf;
- if (rc > 0) {
- ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
- "to userspace: [%*s]\n", rc,
- decoded_name);
- if (copy_to_user(buf, decoded_name, rc))
- rc = -EFAULT;
}
- kfree(decoded_name);
- fsstack_copy_attr_atime(dentry->d_inode,
- lower_dentry->d_inode);
+ rc = copy_to_user(buf, plaintext_name, plaintext_name_size);
+ if (rc)
+ rc = -EFAULT;
+ else
+ rc = plaintext_name_size;
+ kfree(plaintext_name);
+ fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
}
out_free_lower_buf:
kfree(lower_buf);
@@ -670,8 +710,6 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
}
old_fs = get_fs();
set_fs(get_ds());
- ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
- "dentry->d_name.name = [%s]\n", dentry->d_name.name);
rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
set_fs(old_fs);
if (rc < 0)
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 0d713b69194..ff539420cc6 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
/* verify that everything through the encrypted FEK size is present */
if (message_len < 4) {
rc = -EIO;
- printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable "
+ printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable "
"message length is [%d]\n", __func__, message_len, 4);
goto out;
}
@@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
i += data_len;
if (message_len < (i + key_rec->enc_key_size)) {
rc = -EIO;
- printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n",
+ printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n",
__func__, message_len, (i + key_rec->enc_key_size));
goto out;
}
if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
rc = -EIO;
- printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than "
+ printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than "
"the maximum key size [%d]\n", __func__,
key_rec->enc_key_size,
ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES);
@@ -403,6 +403,580 @@ out:
}
static int
+ecryptfs_find_global_auth_tok_for_sig(
+ struct ecryptfs_global_auth_tok **global_auth_tok,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
+{
+ struct ecryptfs_global_auth_tok *walker;
+ int rc = 0;
+
+ (*global_auth_tok) = NULL;
+ mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+ list_for_each_entry(walker,
+ &mount_crypt_stat->global_auth_tok_list,
+ mount_crypt_stat_list) {
+ if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
+ (*global_auth_tok) = walker;
+ goto out;
+ }
+ }
+ rc = -EINVAL;
+out:
+ mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+ return rc;
+}
+
+/**
+ * ecryptfs_find_auth_tok_for_sig
+ * @auth_tok: Set to the matching auth_tok; NULL if not found
+ * @crypt_stat: inode crypt_stat crypto context
+ * @sig: Sig of auth_tok to find
+ *
+ * For now, this function simply looks at the registered auth_tok's
+ * linked off the mount_crypt_stat, so all the auth_toks that can be
+ * used must be registered at mount time. This function could
+ * potentially try a lot harder to find auth_tok's (e.g., by calling
+ * out to ecryptfsd to dynamically retrieve an auth_tok object) so
+ * that static registration of auth_tok's will no longer be necessary.
+ *
+ * Returns zero on no error; non-zero on error
+ */
+static int
+ecryptfs_find_auth_tok_for_sig(
+ struct ecryptfs_auth_tok **auth_tok,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+ char *sig)
+{
+ struct ecryptfs_global_auth_tok *global_auth_tok;
+ int rc = 0;
+
+ (*auth_tok) = NULL;
+ if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
+ mount_crypt_stat, sig)) {
+ struct key *auth_tok_key;
+
+ rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
+ sig);
+ } else
+ (*auth_tok) = global_auth_tok->global_auth_tok;
+ return rc;
+}
+
+/**
+ * write_tag_70_packet can gobble a lot of stack space. We stuff most
+ * of the function's parameters in a kmalloc'd struct to help reduce
+ * eCryptfs' overall stack usage.
+ */
+struct ecryptfs_write_tag_70_packet_silly_stack {
+ u8 cipher_code;
+ size_t max_packet_size;
+ size_t packet_size_len;
+ size_t block_aligned_filename_size;
+ size_t block_size;
+ size_t i;
+ size_t j;
+ size_t num_rand_bytes;
+ struct mutex *tfm_mutex;
+ char *block_aligned_filename;
+ struct ecryptfs_auth_tok *auth_tok;
+ struct scatterlist src_sg;
+ struct scatterlist dst_sg;
+ struct blkcipher_desc desc;
+ char iv[ECRYPTFS_MAX_IV_BYTES];
+ char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
+ char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
+ struct hash_desc hash_desc;
+ struct scatterlist hash_sg;
+};
+
+/**
+ * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
+ * @filename: NULL-terminated filename string
+ *
+ * This is the simplest mechanism for achieving filename encryption in
+ * eCryptfs. It encrypts the given filename with the mount-wide
+ * filename encryption key (FNEK) and stores it in a packet to @dest,
+ * which the callee will encode and write directly into the dentry
+ * name.
+ */
+int
+ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
+ size_t *packet_size,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+ char *filename, size_t filename_size)
+{
+ struct ecryptfs_write_tag_70_packet_silly_stack *s;
+ int rc = 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s) {
+ printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
+ "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+ goto out;
+ }
+ s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ (*packet_size) = 0;
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
+ &s->desc.tfm,
+ &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
+ if (unlikely(rc)) {
+ printk(KERN_ERR "Internal error whilst attempting to get "
+ "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+ mount_crypt_stat->global_default_fn_cipher_name, rc);
+ goto out;
+ }
+ mutex_lock(s->tfm_mutex);
+ s->block_size = crypto_blkcipher_blocksize(s->desc.tfm);
+ /* Plus one for the \0 separator between the random prefix
+ * and the plaintext filename */
+ s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
+ s->block_aligned_filename_size = (s->num_rand_bytes + filename_size);
+ if ((s->block_aligned_filename_size % s->block_size) != 0) {
+ s->num_rand_bytes += (s->block_size
+ - (s->block_aligned_filename_size
+ % s->block_size));
+ s->block_aligned_filename_size = (s->num_rand_bytes
+ + filename_size);
+ }
+ /* Octet 0: Tag 70 identifier
+ * Octets 1-N1: Tag 70 packet size (includes cipher identifier
+ * and block-aligned encrypted filename size)
+ * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
+ * Octet N2-N3: Cipher identifier (1 octet)
+ * Octets N3-N4: Block-aligned encrypted filename
+ * - Consists of a minimum number of random characters, a \0
+ * separator, and then the filename */
+ s->max_packet_size = (1 /* Tag 70 identifier */
+ + 3 /* Max Tag 70 packet size */
+ + ECRYPTFS_SIG_SIZE /* FNEK sig */
+ + 1 /* Cipher identifier */
+ + s->block_aligned_filename_size);
+ if (dest == NULL) {
+ (*packet_size) = s->max_packet_size;
+ goto out_unlock;
+ }
+ if (s->max_packet_size > (*remaining_bytes)) {
+ printk(KERN_WARNING "%s: Require [%zd] bytes to write; only "
+ "[%zd] available\n", __func__, s->max_packet_size,
+ (*remaining_bytes));
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+ s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
+ GFP_KERNEL);
+ if (!s->block_aligned_filename) {
+ printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+ "kzalloc [%zd] bytes\n", __func__,
+ s->block_aligned_filename_size);
+ rc = -ENOMEM;
+ goto out_unlock;
+ }
+ s->i = 0;
+ dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
+ rc = ecryptfs_write_packet_length(&dest[s->i],
+ (ECRYPTFS_SIG_SIZE
+ + 1 /* Cipher code */
+ + s->block_aligned_filename_size),
+ &s->packet_size_len);
+ if (rc) {
+ printk(KERN_ERR "%s: Error generating tag 70 packet "
+ "header; cannot generate packet length; rc = [%d]\n",
+ __func__, rc);
+ goto out_free_unlock;
+ }
+ s->i += s->packet_size_len;
+ ecryptfs_from_hex(&dest[s->i],
+ mount_crypt_stat->global_default_fnek_sig,
+ ECRYPTFS_SIG_SIZE);
+ s->i += ECRYPTFS_SIG_SIZE;
+ s->cipher_code = ecryptfs_code_for_cipher_string(
+ mount_crypt_stat->global_default_fn_cipher_name,
+ mount_crypt_stat->global_default_fn_cipher_key_bytes);
+ if (s->cipher_code == 0) {
+ printk(KERN_WARNING "%s: Unable to generate code for "
+ "cipher [%s] with key bytes [%zd]\n", __func__,
+ mount_crypt_stat->global_default_fn_cipher_name,
+ mount_crypt_stat->global_default_fn_cipher_key_bytes);
+ rc = -EINVAL;
+ goto out_free_unlock;
+ }
+ dest[s->i++] = s->cipher_code;
+ rc = ecryptfs_find_auth_tok_for_sig(
+ &s->auth_tok, mount_crypt_stat,
+ mount_crypt_stat->global_default_fnek_sig);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to find auth tok for "
+ "fnek sig [%s]; rc = [%d]\n", __func__,
+ mount_crypt_stat->global_default_fnek_sig, rc);
+ goto out_free_unlock;
+ }
+ /* TODO: Support other key modules than passphrase for
+ * filename encryption */
+ BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+ sg_init_one(
+ &s->hash_sg,
+ (u8 *)s->auth_tok->token.password.session_key_encryption_key,
+ s->auth_tok->token.password.session_key_encryption_key_bytes);
+ s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(s->hash_desc.tfm)) {
+ rc = PTR_ERR(s->hash_desc.tfm);
+ printk(KERN_ERR "%s: Error attempting to "
+ "allocate hash crypto context; rc = [%d]\n",
+ __func__, rc);
+ goto out_free_unlock;
+ }
+ rc = crypto_hash_init(&s->hash_desc);
+ if (rc) {
+ printk(KERN_ERR
+ "%s: Error initializing crypto hash; rc = [%d]\n",
+ __func__, rc);
+ goto out_release_free_unlock;
+ }
+ rc = crypto_hash_update(
+ &s->hash_desc, &s->hash_sg,
+ s->auth_tok->token.password.session_key_encryption_key_bytes);
+ if (rc) {
+ printk(KERN_ERR
+ "%s: Error updating crypto hash; rc = [%d]\n",
+ __func__, rc);
+ goto out_release_free_unlock;
+ }
+ rc = crypto_hash_final(&s->hash_desc, s->hash);
+ if (rc) {
+ printk(KERN_ERR
+ "%s: Error finalizing crypto hash; rc = [%d]\n",
+ __func__, rc);
+ goto out_release_free_unlock;
+ }
+ for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
+ s->block_aligned_filename[s->j] =
+ s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
+ if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
+ == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
+ sg_init_one(&s->hash_sg, (u8 *)s->hash,
+ ECRYPTFS_TAG_70_DIGEST_SIZE);
+ rc = crypto_hash_init(&s->hash_desc);
+ if (rc) {
+ printk(KERN_ERR
+ "%s: Error initializing crypto hash; "
+ "rc = [%d]\n", __func__, rc);
+ goto out_release_free_unlock;
+ }
+ rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
+ ECRYPTFS_TAG_70_DIGEST_SIZE);
+ if (rc) {
+ printk(KERN_ERR
+ "%s: Error updating crypto hash; "
+ "rc = [%d]\n", __func__, rc);
+ goto out_release_free_unlock;
+ }
+ rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
+ if (rc) {
+ printk(KERN_ERR
+ "%s: Error finalizing crypto hash; "
+ "rc = [%d]\n", __func__, rc);
+ goto out_release_free_unlock;
+ }
+ memcpy(s->hash, s->tmp_hash,
+ ECRYPTFS_TAG_70_DIGEST_SIZE);
+ }
+ if (s->block_aligned_filename[s->j] == '\0')
+ s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
+ }
+ memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename,
+ filename_size);
+ rc = virt_to_scatterlist(s->block_aligned_filename,
+ s->block_aligned_filename_size, &s->src_sg, 1);
+ if (rc != 1) {
+ printk(KERN_ERR "%s: Internal error whilst attempting to "
+ "convert filename memory to scatterlist; "
+ "expected rc = 1; got rc = [%d]. "
+ "block_aligned_filename_size = [%zd]\n", __func__, rc,
+ s->block_aligned_filename_size);
+ goto out_release_free_unlock;
+ }
+ rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
+ &s->dst_sg, 1);
+ if (rc != 1) {
+ printk(KERN_ERR "%s: Internal error whilst attempting to "
+ "convert encrypted filename memory to scatterlist; "
+ "expected rc = 1; got rc = [%d]. "
+ "block_aligned_filename_size = [%zd]\n", __func__, rc,
+ s->block_aligned_filename_size);
+ goto out_release_free_unlock;
+ }
+ /* The characters in the first block effectively do the job
+ * of the IV here, so we just use 0's for the IV. Note the
+ * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
+ * >= ECRYPTFS_MAX_IV_BYTES. */
+ memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
+ s->desc.info = s->iv;
+ rc = crypto_blkcipher_setkey(
+ s->desc.tfm,
+ s->auth_tok->token.password.session_key_encryption_key,
+ mount_crypt_stat->global_default_fn_cipher_key_bytes);
+ if (rc < 0) {
+ printk(KERN_ERR "%s: Error setting key for crypto context; "
+ "rc = [%d]. s->auth_tok->token.password.session_key_"
+ "encryption_key = [0x%p]; mount_crypt_stat->"
+ "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
+ rc,
+ s->auth_tok->token.password.session_key_encryption_key,
+ mount_crypt_stat->global_default_fn_cipher_key_bytes);
+ goto out_release_free_unlock;
+ }
+ rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+ s->block_aligned_filename_size);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to encrypt filename; "
+ "rc = [%d]\n", __func__, rc);
+ goto out_release_free_unlock;
+ }
+ s->i += s->block_aligned_filename_size;
+ (*packet_size) = s->i;
+ (*remaining_bytes) -= (*packet_size);
+out_release_free_unlock:
+ crypto_free_hash(s->hash_desc.tfm);
+out_free_unlock:
+ memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
+ kfree(s->block_aligned_filename);
+out_unlock:
+ mutex_unlock(s->tfm_mutex);
+out:
+ kfree(s);
+ return rc;
+}
+
+struct ecryptfs_parse_tag_70_packet_silly_stack {
+ u8 cipher_code;
+ size_t max_packet_size;
+ size_t packet_size_len;
+ size_t parsed_tag_70_packet_size;
+ size_t block_aligned_filename_size;
+ size_t block_size;
+ size_t i;
+ struct mutex *tfm_mutex;
+ char *decrypted_filename;
+ struct ecryptfs_auth_tok *auth_tok;
+ struct scatterlist src_sg;
+ struct scatterlist dst_sg;
+ struct blkcipher_desc desc;
+ char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
+ char iv[ECRYPTFS_MAX_IV_BYTES];
+ char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
+};
+
+/**
+ * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
+ * @filename: This function kmalloc's the memory for the filename
+ * @filename_size: This function sets this to the amount of memory
+ * kmalloc'd for the filename
+ * @packet_size: This function sets this to the the number of octets
+ * in the packet parsed
+ * @mount_crypt_stat: The mount-wide cryptographic context
+ * @data: The memory location containing the start of the tag 70
+ * packet
+ * @max_packet_size: The maximum legal size of the packet to be parsed
+ * from @data
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int
+ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
+ size_t *packet_size,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+ char *data, size_t max_packet_size)
+{
+ struct ecryptfs_parse_tag_70_packet_silly_stack *s;
+ int rc = 0;
+
+ (*packet_size) = 0;
+ (*filename_size) = 0;
+ (*filename) = NULL;
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s) {
+ printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
+ "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+ goto out;
+ }
+ s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
+ printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
+ "at least [%d]\n", __func__, max_packet_size,
+ (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
+ rc = -EINVAL;
+ goto out;
+ }
+ /* Octet 0: Tag 70 identifier
+ * Octets 1-N1: Tag 70 packet size (includes cipher identifier
+ * and block-aligned encrypted filename size)
+ * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
+ * Octet N2-N3: Cipher identifier (1 octet)
+ * Octets N3-N4: Block-aligned encrypted filename
+ * - Consists of a minimum number of random numbers, a \0
+ * separator, and then the filename */
+ if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) {
+ printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be "
+ "tag [0x%.2x]\n", __func__,
+ data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE);
+ rc = -EINVAL;
+ goto out;
+ }
+ rc = ecryptfs_parse_packet_length(&data[(*packet_size)],
+ &s->parsed_tag_70_packet_size,
+ &s->packet_size_len);
+ if (rc) {
+ printk(KERN_WARNING "%s: Error parsing packet length; "
+ "rc = [%d]\n", __func__, rc);
+ goto out;
+ }
+ s->block_aligned_filename_size = (s->parsed_tag_70_packet_size
+ - ECRYPTFS_SIG_SIZE - 1);
+ if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size)
+ > max_packet_size) {
+ printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet "
+ "size is [%zd]\n", __func__, max_packet_size,
+ (1 + s->packet_size_len + 1
+ + s->block_aligned_filename_size));
+ rc = -EINVAL;
+ goto out;
+ }
+ (*packet_size) += s->packet_size_len;
+ ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)],
+ ECRYPTFS_SIG_SIZE);
+ s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+ (*packet_size) += ECRYPTFS_SIG_SIZE;
+ s->cipher_code = data[(*packet_size)++];
+ rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code);
+ if (rc) {
+ printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n",
+ __func__, s->cipher_code);
+ goto out;
+ }
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
+ &s->tfm_mutex,
+ s->cipher_string);
+ if (unlikely(rc)) {
+ printk(KERN_ERR "Internal error whilst attempting to get "
+ "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+ s->cipher_string, rc);
+ goto out;
+ }
+ mutex_lock(s->tfm_mutex);
+ rc = virt_to_scatterlist(&data[(*packet_size)],
+ s->block_aligned_filename_size, &s->src_sg, 1);
+ if (rc != 1) {
+ printk(KERN_ERR "%s: Internal error whilst attempting to "
+ "convert encrypted filename memory to scatterlist; "
+ "expected rc = 1; got rc = [%d]. "
+ "block_aligned_filename_size = [%zd]\n", __func__, rc,
+ s->block_aligned_filename_size);
+ goto out_unlock;
+ }
+ (*packet_size) += s->block_aligned_filename_size;
+ s->decrypted_filename = kmalloc(s->block_aligned_filename_size,
+ GFP_KERNEL);
+ if (!s->decrypted_filename) {
+ printk(KERN_ERR "%s: Out of memory whilst attempting to "
+ "kmalloc [%zd] bytes\n", __func__,
+ s->block_aligned_filename_size);
+ rc = -ENOMEM;
+ goto out_unlock;
+ }
+ rc = virt_to_scatterlist(s->decrypted_filename,
+ s->block_aligned_filename_size, &s->dst_sg, 1);
+ if (rc != 1) {
+ printk(KERN_ERR "%s: Internal error whilst attempting to "
+ "convert decrypted filename memory to scatterlist; "
+ "expected rc = 1; got rc = [%d]. "
+ "block_aligned_filename_size = [%zd]\n", __func__, rc,
+ s->block_aligned_filename_size);
+ goto out_free_unlock;
+ }
+ /* The characters in the first block effectively do the job of
+ * the IV here, so we just use 0's for the IV. Note the
+ * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
+ * >= ECRYPTFS_MAX_IV_BYTES. */
+ memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
+ s->desc.info = s->iv;
+ rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
+ s->fnek_sig_hex);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to find auth tok for "
+ "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
+ rc);
+ goto out_free_unlock;
+ }
+ /* TODO: Support other key modules than passphrase for
+ * filename encryption */
+ BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+ rc = crypto_blkcipher_setkey(
+ s->desc.tfm,
+ s->auth_tok->token.password.session_key_encryption_key,
+ mount_crypt_stat->global_default_fn_cipher_key_bytes);
+ if (rc < 0) {
+ printk(KERN_ERR "%s: Error setting key for crypto context; "
+ "rc = [%d]. s->auth_tok->token.password.session_key_"
+ "encryption_key = [0x%p]; mount_crypt_stat->"
+ "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
+ rc,
+ s->auth_tok->token.password.session_key_encryption_key,
+ mount_crypt_stat->global_default_fn_cipher_key_bytes);
+ goto out_free_unlock;
+ }
+ rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+ s->block_aligned_filename_size);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to decrypt filename; "
+ "rc = [%d]\n", __func__, rc);
+ goto out_free_unlock;
+ }
+ s->i = 0;
+ while (s->decrypted_filename[s->i] != '\0'
+ && s->i < s->block_aligned_filename_size)
+ s->i++;
+ if (s->i == s->block_aligned_filename_size) {
+ printk(KERN_WARNING "%s: Invalid tag 70 packet; could not "
+ "find valid separator between random characters and "
+ "the filename\n", __func__);
+ rc = -EINVAL;
+ goto out_free_unlock;
+ }
+ s->i++;
+ (*filename_size) = (s->block_aligned_filename_size - s->i);
+ if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) {
+ printk(KERN_WARNING "%s: Filename size is [%zd], which is "
+ "invalid\n", __func__, (*filename_size));
+ rc = -EINVAL;
+ goto out_free_unlock;
+ }
+ (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL);
+ if (!(*filename)) {
+ printk(KERN_ERR "%s: Out of memory whilst attempting to "
+ "kmalloc [%zd] bytes\n", __func__,
+ ((*filename_size) + 1));
+ rc = -ENOMEM;
+ goto out_free_unlock;
+ }
+ memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size));
+ (*filename)[(*filename_size)] = '\0';
+out_free_unlock:
+ kfree(s->decrypted_filename);
+out_unlock:
+ mutex_unlock(s->tfm_mutex);
+out:
+ if (rc) {
+ (*packet_size) = 0;
+ (*filename_size) = 0;
+ (*filename) = NULL;
+ }
+ kfree(s);
+ return rc;
+}
+
+static int
ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok)
{
int rc = 0;
@@ -897,30 +1471,6 @@ out:
return rc;
}
-static int
-ecryptfs_find_global_auth_tok_for_sig(
- struct ecryptfs_global_auth_tok **global_auth_tok,
- struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
-{
- struct ecryptfs_global_auth_tok *walker;
- int rc = 0;
-
- (*global_auth_tok) = NULL;
- mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
- list_for_each_entry(walker,
- &mount_crypt_stat->global_auth_tok_list,
- mount_crypt_stat_list) {
- if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
- (*global_auth_tok) = walker;
- goto out;
- }
- }
- rc = -EINVAL;
-out:
- mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
- return rc;
-}
-
/**
* ecryptfs_verify_version
* @version: The version number to confirm
@@ -990,43 +1540,6 @@ out:
}
/**
- * ecryptfs_find_auth_tok_for_sig
- * @auth_tok: Set to the matching auth_tok; NULL if not found
- * @crypt_stat: inode crypt_stat crypto context
- * @sig: Sig of auth_tok to find
- *
- * For now, this function simply looks at the registered auth_tok's
- * linked off the mount_crypt_stat, so all the auth_toks that can be
- * used must be registered at mount time. This function could
- * potentially try a lot harder to find auth_tok's (e.g., by calling
- * out to ecryptfsd to dynamically retrieve an auth_tok object) so
- * that static registration of auth_tok's will no longer be necessary.
- *
- * Returns zero on no error; non-zero on error
- */
-static int
-ecryptfs_find_auth_tok_for_sig(
- struct ecryptfs_auth_tok **auth_tok,
- struct ecryptfs_crypt_stat *crypt_stat, char *sig)
-{
- struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
- crypt_stat->mount_crypt_stat;
- struct ecryptfs_global_auth_tok *global_auth_tok;
- int rc = 0;
-
- (*auth_tok) = NULL;
- if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
- mount_crypt_stat, sig)) {
- struct key *auth_tok_key;
-
- rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
- sig);
- } else
- (*auth_tok) = global_auth_tok->global_auth_tok;
- return rc;
-}
-
-/**
* decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok.
* @auth_tok: The passphrase authentication token to use to encrypt the FEK
* @crypt_stat: The cryptographic context
@@ -1256,7 +1769,8 @@ find_next_matching_auth_tok:
rc = -EINVAL;
goto out_wipe_list;
}
- ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat,
+ ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
+ crypt_stat->mount_crypt_stat,
candidate_auth_tok_sig);
if (matching_auth_tok) {
found_auth_tok = 1;
@@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
int rc;
rc = write_tag_66_packet(auth_tok->token.private_key.signature,
- ecryptfs_code_for_cipher_string(crypt_stat),
+ ecryptfs_code_for_cipher_string(
+ crypt_stat->cipher,
+ crypt_stat->key_size),
crypt_stat, &payload, &payload_len);
if (rc) {
ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
@@ -1696,7 +2212,8 @@ encrypted_session_key_set:
dest[(*packet_size)++] = 0x04; /* version 4 */
/* TODO: Break from RFC2440 so that arbitrary ciphers can be
* specified with strings */
- cipher_code = ecryptfs_code_for_cipher_string(crypt_stat);
+ cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher,
+ crypt_stat->key_size);
if (cipher_code == 0) {
ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
"cipher [%s]\n", crypt_stat->cipher);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fd630713c5c..789cf2e1be1 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
ecryptfs_opt_ecryptfs_key_bytes,
ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
- ecryptfs_opt_encrypted_view, ecryptfs_opt_err };
+ ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
+ ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
+ ecryptfs_opt_err };
static const match_table_t tokens = {
{ecryptfs_opt_sig, "sig=%s"},
@@ -217,6 +219,9 @@ static const match_table_t tokens = {
{ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
{ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"},
{ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"},
+ {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
+ {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
+ {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
{ecryptfs_opt_err, NULL}
};
@@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
int rc = 0;
int sig_set = 0;
int cipher_name_set = 0;
+ int fn_cipher_name_set = 0;
int cipher_key_bytes;
int cipher_key_bytes_set = 0;
+ int fn_cipher_key_bytes;
+ int fn_cipher_key_bytes_set = 0;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
substring_t args[MAX_OPT_ARGS];
@@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
char *sig_src;
char *cipher_name_dst;
char *cipher_name_src;
+ char *fn_cipher_name_dst;
+ char *fn_cipher_name_src;
+ char *fnek_dst;
+ char *fnek_src;
char *cipher_key_bytes_src;
+ char *fn_cipher_key_bytes_src;
if (!options) {
rc = -EINVAL;
@@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
global_default_cipher_name;
strncpy(cipher_name_dst, cipher_name_src,
ECRYPTFS_MAX_CIPHER_NAME_SIZE);
- ecryptfs_printk(KERN_DEBUG,
- "The mount_crypt_stat "
- "global_default_cipher_name set to: "
- "[%s]\n", cipher_name_dst);
+ cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
cipher_name_set = 1;
break;
case ecryptfs_opt_ecryptfs_key_bytes:
@@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
&cipher_key_bytes_src, 0);
mount_crypt_stat->global_default_cipher_key_size =
cipher_key_bytes;
- ecryptfs_printk(KERN_DEBUG,
- "The mount_crypt_stat "
- "global_default_cipher_key_size "
- "set to: [%d]\n", mount_crypt_stat->
- global_default_cipher_key_size);
cipher_key_bytes_set = 1;
break;
case ecryptfs_opt_passthrough:
@@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
mount_crypt_stat->flags |=
ECRYPTFS_ENCRYPTED_VIEW_ENABLED;
break;
+ case ecryptfs_opt_fnek_sig:
+ fnek_src = args[0].from;
+ fnek_dst =
+ mount_crypt_stat->global_default_fnek_sig;
+ strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX);
+ mount_crypt_stat->global_default_fnek_sig[
+ ECRYPTFS_SIG_SIZE_HEX] = '\0';
+ rc = ecryptfs_add_global_auth_tok(
+ mount_crypt_stat,
+ mount_crypt_stat->global_default_fnek_sig);
+ if (rc) {
+ printk(KERN_ERR "Error attempting to register "
+ "global fnek sig [%s]; rc = [%d]\n",
+ mount_crypt_stat->global_default_fnek_sig,
+ rc);
+ goto out;
+ }
+ mount_crypt_stat->flags |=
+ (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES
+ | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK);
+ break;
+ case ecryptfs_opt_fn_cipher:
+ fn_cipher_name_src = args[0].from;
+ fn_cipher_name_dst =
+ mount_crypt_stat->global_default_fn_cipher_name;
+ strncpy(fn_cipher_name_dst, fn_cipher_name_src,
+ ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+ mount_crypt_stat->global_default_fn_cipher_name[
+ ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
+ fn_cipher_name_set = 1;
+ break;
+ case ecryptfs_opt_fn_cipher_key_bytes:
+ fn_cipher_key_bytes_src = args[0].from;
+ fn_cipher_key_bytes =
+ (int)simple_strtol(fn_cipher_key_bytes_src,
+ &fn_cipher_key_bytes_src, 0);
+ mount_crypt_stat->global_default_fn_cipher_key_bytes =
+ fn_cipher_key_bytes;
+ fn_cipher_key_bytes_set = 1;
+ break;
case ecryptfs_opt_err:
default:
- ecryptfs_printk(KERN_WARNING,
- "eCryptfs: unrecognized option '%s'\n",
- p);
+ printk(KERN_WARNING
+ "%s: eCryptfs: unrecognized option [%s]\n",
+ __func__, p);
}
}
if (!sig_set) {
@@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
-
strcpy(mount_crypt_stat->global_default_cipher_name,
ECRYPTFS_DEFAULT_CIPHER);
}
- if (!cipher_key_bytes_set) {
+ if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+ && !fn_cipher_name_set)
+ strcpy(mount_crypt_stat->global_default_fn_cipher_name,
+ mount_crypt_stat->global_default_cipher_name);
+ if (!cipher_key_bytes_set)
mount_crypt_stat->global_default_cipher_key_size = 0;
- }
+ if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+ && !fn_cipher_key_bytes_set)
+ mount_crypt_stat->global_default_fn_cipher_key_bytes =
+ mount_crypt_stat->global_default_cipher_key_size;
mutex_lock(&key_tfm_list_mutex);
if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name,
- NULL))
+ NULL)) {
rc = ecryptfs_add_new_key_tfm(
NULL, mount_crypt_stat->global_default_cipher_name,
mount_crypt_stat->global_default_cipher_key_size);
- mutex_unlock(&key_tfm_list_mutex);
- if (rc) {
- printk(KERN_ERR "Error attempting to initialize cipher with "
- "name = [%s] and key size = [%td]; rc = [%d]\n",
- mount_crypt_stat->global_default_cipher_name,
- mount_crypt_stat->global_default_cipher_key_size, rc);
- rc = -EINVAL;
- goto out;
+ if (rc) {
+ printk(KERN_ERR "Error attempting to initialize "
+ "cipher with name = [%s] and key size = [%td]; "
+ "rc = [%d]\n",
+ mount_crypt_stat->global_default_cipher_name,
+ mount_crypt_stat->global_default_cipher_key_size,
+ rc);
+ rc = -EINVAL;
+ mutex_unlock(&key_tfm_list_mutex);
+ goto out;
+ }
}
+ if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+ && !ecryptfs_tfm_exists(
+ mount_crypt_stat->global_default_fn_cipher_name, NULL)) {
+ rc = ecryptfs_add_new_key_tfm(
+ NULL, mount_crypt_stat->global_default_fn_cipher_name,
+ mount_crypt_stat->global_default_fn_cipher_key_bytes);
+ if (rc) {
+ printk(KERN_ERR "Error attempting to initialize "
+ "cipher with name = [%s] and key size = [%td]; "
+ "rc = [%d]\n",
+ mount_crypt_stat->global_default_fn_cipher_name,
+ mount_crypt_stat->global_default_fn_cipher_key_bytes,
+ rc);
+ rc = -EINVAL;
+ mutex_unlock(&key_tfm_list_mutex);
+ goto out;
+ }
+ }
+ mutex_unlock(&key_tfm_list_mutex);
rc = ecryptfs_init_global_auth_toks(mount_crypt_stat);
- if (rc) {
+ if (rc)
printk(KERN_WARNING "One or more global auth toks could not "
"properly register; rc = [%d]\n", rc);
- }
out:
return rc;
}
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 6913f727624..96ef51489e0 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
(*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL);
if (!(*daemon)) {
rc = -ENOMEM;
- printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of "
+ printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
"GFP_KERNEL memory\n", __func__, sizeof(**daemon));
goto out;
}
@@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL);
if (!msg_ctx->msg) {
rc = -ENOMEM;
- printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of "
+ printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
"GFP_KERNEL memory\n", __func__, msg_size);
goto unlock;
}
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index efd95a0ed1e..a67fea655f4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
if (!msg_ctx->msg) {
rc = -ENOMEM;
printk(KERN_ERR "%s: Out of memory whilst attempting "
- "to kmalloc(%Zd, GFP_KERNEL)\n", __func__,
+ "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
(sizeof(*msg_ctx->msg) + data_size));
goto out_unlock;
}
@@ -322,7 +322,7 @@ check_list:
if (count < total_length) {
rc = 0;
printk(KERN_WARNING "%s: Only given user buffer of "
- "size [%Zd], but we need [%Zd] to read the "
+ "size [%zd], but we need [%zd] to read the "
"pending message\n", __func__, count, total_length);
goto out_unlock_msg_ctx;
}
@@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size,
if ((sizeof(*msg) + msg->data_len) != data_size) {
printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = "
- "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__,
+ "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__,
(sizeof(*msg) + msg->data_len), data_size);
rc = -EINVAL;
goto out;
@@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
data = kmalloc(count, GFP_KERNEL);
if (!data) {
printk(KERN_ERR "%s: Out of memory whilst attempting to "
- "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count);
+ "kmalloc([%zd], GFP_KERNEL)\n", __func__, count);
goto out;
}
rc = copy_from_user(data, buf, count);
@@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
case ECRYPTFS_MSG_RESPONSE:
if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
printk(KERN_WARNING "%s: Minimum acceptable packet "
- "size is [%Zd], but amount of data written is "
- "only [%Zd]. Discarding response packet.\n",
+ "size is [%zd], but amount of data written is "
+ "only [%zd]. Discarding response packet.\n",
__func__,
(1 + 4 + 1 + sizeof(struct ecryptfs_message)),
count);
@@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
}
i += packet_size_length;
if ((1 + 4 + packet_size_length + packet_size) != count) {
- printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])"
- " + packet_size([%Zd]))([%Zd]) != "
- "count([%Zd]). Invalid packet format.\n",
+ printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
+ " + packet_size([%zd]))([%zd]) != "
+ "count([%zd]). Invalid packet format.\n",
__func__, packet_size_length, packet_size,
(1 + packet_size_length + packet_size), count);
goto out_free;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 04d7b3fa1ac..46cec2b6979 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -288,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
loff_t prev_page_end_size;
int rc = 0;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
diff --git a/fs/exec.c b/fs/exec.c
index 3ef9cf9b187..71a6efe5d8b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -51,6 +51,7 @@
#include <linux/audit.h>
#include <linux/tracehook.h>
#include <linux/kmod.h>
+#include <linux/fsnotify.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -132,6 +133,8 @@ asmlinkage long sys_uselib(const char __user * library)
if (IS_ERR(file))
goto out;
+ fsnotify_open(file->f_path.dentry);
+
error = -ENOEXEC;
if(file->f_op) {
struct linux_binfmt * fmt;
@@ -229,13 +232,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
static int __bprm_mm_init(struct linux_binprm *bprm)
{
- int err = -ENOMEM;
+ int err;
struct vm_area_struct *vma = NULL;
struct mm_struct *mm = bprm->mm;
bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma)
- goto err;
+ return -ENOMEM;
down_write(&mm->mmap_sem);
vma->vm_mm = mm;
@@ -248,28 +251,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
*/
vma->vm_end = STACK_TOP_MAX;
vma->vm_start = vma->vm_end - PAGE_SIZE;
-
vma->vm_flags = VM_STACK_FLAGS;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
err = insert_vm_struct(mm, vma);
- if (err) {
- up_write(&mm->mmap_sem);
+ if (err)
goto err;
- }
mm->stack_vm = mm->total_vm = 1;
up_write(&mm->mmap_sem);
-
bprm->p = vma->vm_end - sizeof(void *);
-
return 0;
-
err:
- if (vma) {
- bprm->vma = NULL;
- kmem_cache_free(vm_area_cachep, vma);
- }
-
+ up_write(&mm->mmap_sem);
+ bprm->vma = NULL;
+ kmem_cache_free(vm_area_cachep, vma);
return err;
}
@@ -684,6 +679,8 @@ struct file *open_exec(const char *name)
if (IS_ERR(file))
return file;
+ fsnotify_open(file->f_path.dentry);
+
err = deny_write_access(file);
if (err) {
fput(file);
@@ -1689,7 +1686,7 @@ int get_dumpable(struct mm_struct *mm)
return (ret >= 2) ? 2 : ret;
}
-int do_coredump(long signr, int exit_code, struct pt_regs * regs)
+void do_coredump(long signr, int exit_code, struct pt_regs *regs)
{
struct core_state core_state;
char corename[CORENAME_MAX_SIZE + 1];
@@ -1773,6 +1770,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
if (ispipe) {
helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
+ if (!helper_argv) {
+ printk(KERN_WARNING "%s failed to allocate memory\n",
+ __func__);
+ goto fail_unlock;
+ }
/* Terminate the string before the first option */
delimit = strchr(corename, ' ');
if (delimit)
@@ -1840,5 +1842,5 @@ fail_unlock:
put_cred(cred);
coredump_finish(mm);
fail:
- return retval;
+ return;
}
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c454d5db28a..66321a877e7 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -565,12 +565,8 @@ got:
inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
memset(ei->i_data, 0, sizeof(ei->i_data));
- ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL;
- if (S_ISLNK(mode))
- ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
- /* dirsync is only applied to directories */
- if (!S_ISDIR(mode))
- ei->i_flags &= ~EXT2_DIRSYNC_FL;
+ ei->i_flags =
+ ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
ei->i_faddr = 0;
ei->i_frag_no = 0;
ei->i_frag_size = 0;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 02b39a5deb7..23fff2f8778 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -498,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode,
* ext2_splice_branch - splice the allocated branch onto inode.
* @inode: owner
* @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- * ext2_alloc_branch)
* @where: location of missing link
* @num: number of indirect blocks we are adding
* @blks: number of direct blocks we are adding
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index de876fa793e..7cb4badef92 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
goto setflags_out;
}
- if (!S_ISDIR(inode->i_mode))
- flags &= ~EXT2_DIRSYNC_FL;
+ flags = ext2_mask_flags(inode->i_mode, flags);
mutex_lock(&inode->i_mutex);
/* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 647cd888ac8..da8bdeaa2e6 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb)
percpu_counter_destroy(&sbi->s_dirs_counter);
brelse (sbi->s_sbh);
sb->s_fs_info = NULL;
+ kfree(sbi->s_blockgroup_lock);
kfree(sbi);
return;
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
+
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock) {
+ kfree(sbi);
+ return -ENOMEM;
+ }
sb->s_fs_info = sbi;
sbi->s_sb_block = sb_block;
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
printk ("EXT2-fs: not enough memory\n");
goto failed_mount;
}
- bgl_lock_init(&sbi->s_blockgroup_lock);
+ bgl_lock_init(sbi->s_blockgroup_lock);
sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
if (!sbi->s_debts) {
printk ("EXT2-fs: not enough memory\n");
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index c30e149fbd2..7d215b4d4f2 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
/* The old legacy hash */
-static __u32 dx_hack_hash (const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
{
- __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ const unsigned char *ucp = (const unsigned char *) name;
+
+ while (len--) {
+ hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+
+ if (hash & 0x80000000)
+ hash -= 0x7fffffff;
+ hash1 = hash0;
+ hash0 = hash;
+ }
+ return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+ __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ const signed char *scp = (const signed char *) name;
+
while (len--) {
- __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+ hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
- if (hash & 0x80000000) hash -= 0x7fffffff;
+ if (hash & 0x80000000)
+ hash -= 0x7fffffff;
hash1 = hash0;
hash0 = hash;
}
- return (hash0 << 1);
+ return hash0 << 1;
}
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
{
__u32 pad, val;
int i;
+ const signed char *scp = (const signed char *) msg;
+
+ pad = (__u32)len | ((__u32)len << 8);
+ pad |= pad << 16;
+
+ val = pad;
+ if (len > num*4)
+ len = num * 4;
+ for (i = 0; i < len; i++) {
+ if ((i % 4) == 0)
+ val = pad;
+ val = ((int) scp[i]) + (val << 8);
+ if ((i % 4) == 3) {
+ *buf++ = val;
+ val = pad;
+ num--;
+ }
+ }
+ if (--num >= 0)
+ *buf++ = val;
+ while (--num >= 0)
+ *buf++ = pad;
+}
+
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
+{
+ __u32 pad, val;
+ int i;
+ const unsigned char *ucp = (const unsigned char *) msg;
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
for (i=0; i < len; i++) {
if ((i % 4) == 0)
val = pad;
- val = msg[i] + (val << 8);
+ val = ((int) ucp[i]) + (val << 8);
if ((i % 4) == 3) {
*buf++ = val;
val = pad;
@@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
const char *p;
int i;
__u32 in[8], buf[4];
+ void (*str2hashbuf)(const char *, int, __u32 *, int) =
+ str2hashbuf_signed;
/* Initialize the default seed for the hash checksum functions */
buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
}
switch (hinfo->hash_version) {
+ case DX_HASH_LEGACY_UNSIGNED:
+ hash = dx_hack_hash_unsigned(name, len);
+ break;
case DX_HASH_LEGACY:
- hash = dx_hack_hash(name, len);
+ hash = dx_hack_hash_signed(name, len);
break;
+ case DX_HASH_HALF_MD4_UNSIGNED:
+ str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
- str2hashbuf(p, len, in, 8);
+ (*str2hashbuf)(p, len, in, 8);
half_md4_transform(buf, in);
len -= 32;
p += 32;
@@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
minor_hash = buf[2];
hash = buf[1];
break;
+ case DX_HASH_TEA_UNSIGNED:
+ str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_TEA:
p = name;
while (len > 0) {
- str2hashbuf(p, len, in, 4);
+ (*str2hashbuf)(p, len, in, 4);
TEA_transform(buf, in);
len -= 16;
p += 16;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 5655fbcbd11..8de6c720e51 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,12 +559,8 @@ got:
ei->i_dir_start_lookup = 0;
ei->i_disksize = 0;
- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL;
- if (S_ISLNK(mode))
- ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
- /* dirsync only applies to directories */
- if (!S_ISDIR(mode))
- ei->i_flags &= ~EXT3_DIRSYNC_FL;
+ ei->i_flags =
+ ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
#ifdef EXT3_FRAGMENTS
ei->i_faddr = 0;
ei->i_frag_no = 0;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index c4bdccf976b..5fa453b49a6 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1161,7 +1161,7 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
to = from + len;
retry:
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index b7394d05ee8..5e86ce9a86e 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
goto flags_out;
}
- if (!S_ISDIR(inode->i_mode))
- flags &= ~EXT3_DIRSYNC_FL;
+ flags = ext3_mask_flags(inode->i_mode, flags);
mutex_lock(&inode->i_mutex);
/* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 297ea8dfac7..69a3d19ca9f 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle,
#define assert(test) J_ASSERT(test)
#endif
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
-
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
@@ -368,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir,
goto fail;
}
hinfo->hash_version = root->info.hash_version;
+ if (hinfo->hash_version <= DX_HASH_TEA)
+ hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
if (entry)
ext3fs_dirhash(entry->name, entry->len, hinfo);
@@ -636,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
dir = dir_file->f_path.dentry->d_inode;
if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+ EXT3_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
start_hash, start_minor_hash);
@@ -1156,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
u32 hash2;
struct dx_map_entry *map;
char *data1 = (*bh)->b_data, *data2;
- unsigned split, move, size, i;
+ unsigned split, move, size;
struct ext3_dir_entry_2 *de = NULL, *de2;
- int err = 0;
+ int err = 0, i;
bh2 = ext3_append (handle, dir, &newblock, &err);
if (!(bh2)) {
@@ -1398,6 +1399,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
/* Initialize as for dx_probe */
hinfo.hash_version = root->info.hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
ext3fs_dirhash(name, namelen, &hinfo);
frame = frames;
@@ -2175,8 +2178,7 @@ retry:
* We have a transaction open. All is sweetness. It also sets
* i_size in generic_commit_write().
*/
- err = __page_symlink(inode, symname, l,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ err = __page_symlink(inode, symname, l, 1);
if (err) {
drop_nlink(inode);
unlock_new_inode(inode);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f6c94f232ec..5d047a030a7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb)
ext3_blkdev_remove(sbi);
}
sb->s_fs_info = NULL;
+ kfree(sbi->s_blockgroup_lock);
kfree(sbi);
return;
}
@@ -682,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
ext3_nfs_get_inode);
}
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device. Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+ gfp_t wait)
+{
+ journal_t *journal = EXT3_SB(sb)->s_journal;
+
+ WARN_ON(PageChecked(page));
+ if (!page_has_buffers(page))
+ return 0;
+ if (journal)
+ return journal_try_to_free_buffers(journal, page,
+ wait & ~__GFP_WAIT);
+ return try_to_free_buffers(page);
+}
+
#ifdef CONFIG_QUOTA
#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -713,7 +734,9 @@ static struct dquot_operations ext3_quota_operations = {
.acquire_dquot = ext3_acquire_dquot,
.release_dquot = ext3_release_dquot,
.mark_dirty = ext3_mark_dquot_dirty,
- .write_info = ext3_write_info
+ .write_info = ext3_write_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
};
static struct quotactl_ops ext3_qctl_operations = {
@@ -746,6 +769,7 @@ static const struct super_operations ext3_sops = {
.quota_read = ext3_quota_read,
.quota_write = ext3_quota_write,
#endif
+ .bdev_try_to_free_page = bdev_try_to_free_page,
};
static const struct export_operations ext3_export_ops = {
@@ -1035,8 +1059,7 @@ static int parse_options (char *options, struct super_block *sb,
case Opt_grpjquota:
qtype = GRPQUOTA;
set_qf_name:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
!sbi->s_qf_names[qtype]) {
printk(KERN_ERR
"EXT3-fs: Cannot change journaled "
@@ -1075,8 +1098,7 @@ set_qf_name:
case Opt_offgrpjquota:
qtype = GRPQUOTA;
clear_qf_name:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
sbi->s_qf_names[qtype]) {
printk(KERN_ERR "EXT3-fs: Cannot change "
"journaled quota options when "
@@ -1095,8 +1117,7 @@ clear_qf_name:
case Opt_jqfmt_vfsv0:
qfmt = QFMT_VFS_V0;
set_qf_format:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
sbi->s_jquota_fmt != qfmt) {
printk(KERN_ERR "EXT3-fs: Cannot change "
"journaled quota options when "
@@ -1115,8 +1136,7 @@ set_qf_format:
set_opt(sbi->s_mount_opt, GRPQUOTA);
break;
case Opt_noquota:
- if (sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) {
+ if (sb_any_quota_loaded(sb)) {
printk(KERN_ERR "EXT3-fs: Cannot change quota "
"options when quota turned on.\n");
return 0;
@@ -1548,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
+
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock) {
+ kfree(sbi);
+ return -ENOMEM;
+ }
sb->s_fs_info = sbi;
sbi->s_mount_opt = 0;
sbi->s_resuid = EXT3_DEF_RESUID;
@@ -1744,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
for (i=0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
+ i = le32_to_cpu(es->s_flags);
+ if (i & EXT2_FLAGS_UNSIGNED_HASH)
+ sbi->s_hash_unsigned = 3;
+ else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+ es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+ sbi->s_hash_unsigned = 3;
+#else
+ es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+ sb->s_dirt = 1;
+ }
if (sbi->s_blocks_per_group > blocksize * 8) {
printk (KERN_ERR
@@ -1788,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- bgl_lock_init(&sbi->s_blockgroup_lock);
+ bgl_lock_init(sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logic_sb_block, i);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38b3acf5683..6bba06b09dd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
#include "ext4.h"
#include "ext4_jbd2.h"
#include "group.h"
+#include "mballoc.h"
/*
* balloc.c contains the blocks allocation and deallocation routines
@@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
ext4_error(sb, __func__,
- "Checksum bad for group %lu\n", block_group);
- gdp->bg_free_blocks_count = 0;
- gdp->bg_free_inodes_count = 0;
- gdp->bg_itable_unused = 0;
+ "Checksum bad for group %u", block_group);
+ ext4_free_blks_set(sb, gdp, 0);
+ ext4_free_inodes_set(sb, gdp, 0);
+ ext4_itable_unused_set(sb, gdp, 0);
memset(bh->b_data, 0xff, sb->s_blocksize);
return 0;
}
@@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head **bh)
{
- unsigned long group_desc;
- unsigned long offset;
+ unsigned int group_desc;
+ unsigned int offset;
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (block_group >= sbi->s_groups_count) {
ext4_error(sb, "ext4_get_group_desc",
"block_group >= groups_count - "
- "block_group = %lu, groups_count = %lu",
+ "block_group = %u, groups_count = %u",
block_group, sbi->s_groups_count);
return NULL;
@@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
if (!sbi->s_group_desc[group_desc]) {
ext4_error(sb, "ext4_get_group_desc",
"Group descriptor not loaded - "
- "block_group = %lu, group_desc = %lu, desc = %lu",
+ "block_group = %u, group_desc = %u, desc = %u",
block_group, group_desc, offset);
return NULL;
}
@@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
if (unlikely(!bh)) {
ext4_error(sb, __func__,
"Cannot read block bitmap - "
- "block_group = %lu, block_bitmap = %llu",
+ "block_group = %u, block_bitmap = %llu",
block_group, bitmap_blk);
return NULL;
}
- if (buffer_uptodate(bh) &&
- !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
+
+ if (bitmap_uptodate(bh))
return bh;
lock_buffer(bh);
+ if (bitmap_uptodate(bh)) {
+ unlock_buffer(bh);
+ return bh;
+ }
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);
+ set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
- unlock_buffer(bh);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ unlock_buffer(bh);
return bh;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ if (buffer_uptodate(bh)) {
+ /*
+ * if not uninit if bh is uptodate,
+ * bitmap is also uptodate
+ */
+ set_bitmap_uptodate(bh);
+ unlock_buffer(bh);
+ return bh;
+ }
+ /*
+ * submit the buffer_head for read. We can
+ * safely mark the bitmap as uptodate now.
+ * We do it here so the bitmap uptodate bit
+ * get set with buffer lock held.
+ */
+ set_bitmap_uptodate(bh);
if (bh_submit_read(bh) < 0) {
put_bh(bh);
ext4_error(sb, __func__,
"Cannot read block bitmap - "
- "block_group = %lu, block_bitmap = %llu",
+ "block_group = %u, block_bitmap = %llu",
block_group, bitmap_blk);
return NULL;
}
@@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
}
/**
- * ext4_free_blocks_sb() -- Free given blocks and update quota
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
* @handle: handle to this transaction
* @sb: super block
- * @block: start physcial block to free
+ * @block: start physcial block to add to the block group
* @count: number of blocks to free
- * @pdquot_freed_blocks: pointer to quota
*
- * XXX This function is only used by the on-line resizing code, which
- * should probably be fixed up to call the mballoc variant. There
- * this needs to be cleaned up later; in fact, I'm not convinced this
- * is 100% correct in the face of the mballoc code. The online resizing
- * code needs to be fixed up to more tightly (and correctly) interlock
- * with the mballoc code.
+ * This marks the blocks as free in the bitmap. We ask the
+ * mballoc to reload the buddy after this by setting group
+ * EXT4_GROUP_INFO_NEED_INIT_BIT flag
*/
-void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count,
- unsigned long *pdquot_freed_blocks)
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count)
{
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *gd_bh;
ext4_group_t block_group;
ext4_grpblk_t bit;
- unsigned long i;
- unsigned long overflow;
+ unsigned int i;
struct ext4_group_desc *desc;
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
- int err = 0, ret;
- ext4_grpblk_t group_freed;
+ int err = 0, ret, blk_free_count;
+ ext4_grpblk_t blocks_freed;
+ struct ext4_group_info *grp;
- *pdquot_freed_blocks = 0;
sbi = EXT4_SB(sb);
es = sbi->s_es;
- if (block < le32_to_cpu(es->s_first_data_block) ||
- block + count < block ||
- block + count > ext4_blocks_count(es)) {
- ext4_error(sb, "ext4_free_blocks",
- "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
- goto error_return;
- }
-
- ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
+ ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-do_more:
- overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ grp = ext4_get_group_info(sb, block_group);
/*
* Check to see if we are freeing blocks across a group
* boundary.
*/
if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
- count -= overflow;
+ goto error_return;
}
- brelse(bitmap_bh);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
@@ -418,18 +422,17 @@ do_more:
in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, desc),
sbi->s_itb_per_group)) {
- ext4_error(sb, "ext4_free_blocks",
- "Freeing blocks in system zones - "
+ ext4_error(sb, __func__,
+ "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
goto error_return;
}
/*
- * We are about to start releasing blocks in the bitmap,
+ * We are about to add blocks to the bitmap,
* so we need undo access.
*/
- /* @@@ check errors */
BUFFER_TRACE(bitmap_bh, "getting undo access");
err = ext4_journal_get_undo_access(handle, bitmap_bh);
if (err)
@@ -444,107 +447,55 @@ do_more:
err = ext4_journal_get_write_access(handle, gd_bh);
if (err)
goto error_return;
-
- jbd_lock_bh_state(bitmap_bh);
-
- for (i = 0, group_freed = 0; i < count; i++) {
- /*
- * An HJ special. This is expensive...
- */
-#ifdef CONFIG_JBD2_DEBUG
- jbd_unlock_bh_state(bitmap_bh);
- {
- struct buffer_head *debug_bh;
- debug_bh = sb_find_get_block(sb, block + i);
- if (debug_bh) {
- BUFFER_TRACE(debug_bh, "Deleted!");
- if (!bh2jh(bitmap_bh)->b_committed_data)
- BUFFER_TRACE(debug_bh,
- "No commited data in bitmap");
- BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
- __brelse(debug_bh);
- }
- }
- jbd_lock_bh_state(bitmap_bh);
-#endif
- if (need_resched()) {
- jbd_unlock_bh_state(bitmap_bh);
- cond_resched();
- jbd_lock_bh_state(bitmap_bh);
- }
- /* @@@ This prevents newly-allocated data from being
- * freed and then reallocated within the same
- * transaction.
- *
- * Ideally we would want to allow that to happen, but to
- * do so requires making jbd2_journal_forget() capable of
- * revoking the queued write of a data block, which
- * implies blocking on the journal lock. *forget()
- * cannot block due to truncate races.
- *
- * Eventually we can fix this by making jbd2_journal_forget()
- * return a status indicating whether or not it was able
- * to revoke the buffer. On successful revoke, it is
- * safe not to set the allocation bit in the committed
- * bitmap, because we know that there is no outstanding
- * activity on the buffer any more and so it is safe to
- * reallocate it.
- */
- BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
- J_ASSERT_BH(bitmap_bh,
- bh2jh(bitmap_bh)->b_committed_data != NULL);
- ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
- bh2jh(bitmap_bh)->b_committed_data);
-
- /*
- * We clear the bit in the bitmap after setting the committed
- * data bit, because this is the reverse order to that which
- * the allocator uses.
- */
+ /*
+ * make sure we don't allow a parallel init on other groups in the
+ * same buddy cache
+ */
+ down_write(&grp->alloc_sem);
+ for (i = 0, blocks_freed = 0; i < count; i++) {
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
bit + i, bitmap_bh->b_data)) {
- jbd_unlock_bh_state(bitmap_bh);
ext4_error(sb, __func__,
"bit already cleared for block %llu",
(ext4_fsblk_t)(block + i));
- jbd_lock_bh_state(bitmap_bh);
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
- group_freed++;
+ blocks_freed++;
}
}
- jbd_unlock_bh_state(bitmap_bh);
-
spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
+ blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+ ext4_free_blks_set(sb, desc, blk_free_count);
desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks += count;
+ sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
spin_unlock(sb_bgl_lock(sbi, flex_group));
}
+ /*
+ * request to reload the buddy with the
+ * new bitmap information
+ */
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+ ext4_mb_update_group_info(grp, blocks_freed);
+ up_write(&grp->alloc_sem);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
/* And the group descriptor block */
BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_journal_dirty_metadata(handle, gd_bh);
- if (!err) err = ret;
- *pdquot_freed_blocks += group_freed;
-
- if (overflow && !err) {
- block += count;
- count = overflow;
- goto do_more;
- }
+ ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+ if (!err)
+ err = ret;
sb->s_dirt = 1;
+
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, err);
@@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
if (dirty_blocks < 0) {
printk(KERN_CRIT "Dirty block accounting "
"went wrong %lld\n",
- dirty_blocks);
+ (long long)dirty_blocks);
}
}
/* Check whether we have space after
@@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
}
-#define EXT4_META_BLOCK 0x1
-
-static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, ext4_fsblk_t goal,
- unsigned long *count, int *errp, int flags)
-{
- struct ext4_allocation_request ar;
- ext4_fsblk_t ret;
-
- memset(&ar, 0, sizeof(ar));
- /* Fill with neighbour allocated blocks */
-
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
- ar.logical = iblock;
-
- if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
- /* enable in-core preallocation for data block allocation */
- ar.flags = EXT4_MB_HINT_DATA;
- else
- /* disable in-core preallocation for non-regular files */
- ar.flags = 0;
-
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
- return ret;
-}
-
/*
* ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
*
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
- * @count: total number of blocks need
+ * @count: pointer to total number of blocks needed
* @errp: error code
*
- * Return 1st allocated block numberon success, *count stores total account
+ * Return 1st allocated block number on success, *count stores total account
* error stores in errp pointer
*/
ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
+ struct ext4_allocation_request ar;
ext4_fsblk_t ret;
- ret = do_blk_alloc(handle, inode, 0, goal,
- count, errp, EXT4_META_BLOCK);
+
+ memset(&ar, 0, sizeof(ar));
+ /* Fill with neighbour allocated blocks */
+ ar.inode = inode;
+ ar.goal = goal;
+ ar.len = count ? *count : 1;
+
+ ret = ext4_mb_new_blocks(handle, &ar, errp);
+ if (count)
+ *count = ar.len;
+
/*
* Account for the allocated meta blocks
*/
if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- EXT4_I(inode)->i_allocated_meta_blocks += *count;
+ EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
return ret;
}
-/*
- * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
- *
- * @handle: handle to this transaction
- * @inode: file inode
- * @goal: given target block(filesystem wide)
- * @errp: error code
- *
- * Return allocated block number on success
- */
-ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
-{
- unsigned long count = 1;
- return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
-}
-
-/*
- * ext4_new_blocks() -- allocate data blocks
- *
- * @handle: handle to this transaction
- * @inode: file inode
- * @goal: given target block(filesystem wide)
- * @count: total number of blocks need
- * @errp: error code
- *
- * Return 1st allocated block numberon success, *count stores total account
- * error stores in errp pointer
- */
-
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, ext4_fsblk_t goal,
- unsigned long *count, int *errp)
-{
- return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
-}
-
/**
* ext4_count_free_blocks() -- count filesystem free blocks
* @sb: superblock
@@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
ext4_fsblk_t bitmap_count;
- unsigned long x;
+ unsigned int x;
struct buffer_head *bitmap_bh = NULL;
es = EXT4_SB(sb)->s_es;
@@ -796,7 +691,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
continue;
x = ext4_count_free(bitmap_bh, sb->s_blocksize);
- printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
+ printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n",
i, le16_to_cpu(gdp->bg_free_blocks_count), x);
bitmap_count += x;
}
@@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+ desc_count += ext4_free_blks_count(sb, gdp);
}
return desc_count;
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 0a7a6663c19..fa3af81ac56 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,10 +15,9 @@
static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
-unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
+unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
{
- unsigned int i;
- unsigned long sum = 0;
+ unsigned int i, sum = 0;
if (!map)
return 0;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index fed5b610df5..2df2e40b01a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
int ext4_check_dir_entry(const char *function, struct inode *dir,
struct ext4_dir_entry_2 *de,
struct buffer_head *bh,
- unsigned long offset)
+ unsigned int offset)
{
const char *error_msg = NULL;
const int rlen = ext4_rec_len_from_disk(de->rec_len);
@@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
if (error_msg != NULL)
ext4_error(dir->i_sb, function,
"bad entry in directory #%lu: %s - "
- "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+ "offset=%u, inode=%u, rec_len=%d, name_len=%d",
dir->i_ino, error_msg, offset,
- (unsigned long) le32_to_cpu(de->inode),
+ le32_to_cpu(de->inode),
rlen, de->name_len);
return error_msg == NULL ? 1 : 0;
}
@@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp,
void *dirent, filldir_t filldir)
{
int error = 0;
- unsigned long offset;
+ unsigned int offset;
int i, stored;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
@@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent,
sb = inode->i_sb;
if (!fname) {
- printk(KERN_ERR "ext4: call_filldir: called with "
+ printk(KERN_ERR "EXT4-fs: call_filldir: called with "
"null fname?!?\n");
return 0;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0537c82702..c668e4377d7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -19,6 +19,7 @@
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
+#include <linux/jbd2.h>
#include "ext4_i.h"
/*
@@ -94,9 +95,9 @@ struct ext4_allocation_request {
/* phys. block for ^^^ */
ext4_fsblk_t pright;
/* how many blocks we want to allocate */
- unsigned long len;
+ unsigned int len;
/* flags. see above EXT4_MB_HINT_* */
- unsigned long flags;
+ unsigned int flags;
};
/*
@@ -156,12 +157,12 @@ struct ext4_group_desc
__le32 bg_block_bitmap_lo; /* Blocks bitmap block */
__le32 bg_inode_bitmap_lo; /* Inodes bitmap block */
__le32 bg_inode_table_lo; /* Inodes table block */
- __le16 bg_free_blocks_count; /* Free blocks count */
- __le16 bg_free_inodes_count; /* Free inodes count */
- __le16 bg_used_dirs_count; /* Directories count */
+ __le16 bg_free_blocks_count_lo;/* Free blocks count */
+ __le16 bg_free_inodes_count_lo;/* Free inodes count */
+ __le16 bg_used_dirs_count_lo; /* Directories count */
__le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
__u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
- __le16 bg_itable_unused; /* Unused inodes count */
+ __le16 bg_itable_unused_lo; /* Unused inodes count */
__le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
__le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
__le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */
@@ -169,7 +170,7 @@ struct ext4_group_desc
__le16 bg_free_blocks_count_hi;/* Free blocks count MSB */
__le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
__le16 bg_used_dirs_count_hi; /* Directories count MSB */
- __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
+ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
__u32 bg_reserved2[3];
};
@@ -328,6 +329,7 @@ struct ext4_mount_options {
uid_t s_resuid;
gid_t s_resgid;
unsigned long s_commit_interval;
+ u32 s_min_batch_time, s_max_batch_time;
#ifdef CONFIG_QUOTA
int s_jquota_fmt;
char *s_qf_names[MAXQUOTAS];
@@ -534,7 +536,6 @@ do { \
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
-#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
@@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
*/
#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
- (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
+ ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
- (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
+ ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
- (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
+ ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
#define EXT4_DEFM_JMODE_WBACK 0x0060
/*
+ * Default journal batch times
+ */
+#define EXT4_DEF_MIN_BATCH_TIME 0
+#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
+
+/*
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
@@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
#define DX_HASH_LEGACY 0
#define DX_HASH_HALF_MD4 1
#define DX_HASH_TEA 2
+#define DX_HASH_LEGACY_UNSIGNED 3
+#define DX_HASH_HALF_MD4_UNSIGNED 4
+#define DX_HASH_TEA_UNSIGNED 5
#ifdef __KERNEL__
@@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
#define ERR_BAD_DX_DIR -75000
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
- unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
+ ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
extern struct proc_dir_entry *ext4_proc_root;
@@ -987,6 +997,9 @@ do { \
# define ATTRIB_NORET __attribute__((noreturn))
# define NORET_AND noreturn,
+/* bitmap.c */
+extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
+
/* balloc.c */
extern unsigned int ext4_block_group(struct super_block *sb,
ext4_fsblk_t blocknr);
@@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
-extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp);
extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, ext4_fsblk_t goal,
- unsigned long *count, int *errp);
extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
-extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count,
- unsigned long *pdquot_freed_blocks);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count);
extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
/* dir.c */
extern int ext4_check_dir_entry(const char *, struct inode *,
struct ext4_dir_entry_2 *,
- struct buffer_head *, unsigned long);
+ struct buffer_head *, unsigned int);
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
struct ext4_dir_entry_2 *dirent);
@@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
extern unsigned long ext4_count_dirs(struct super_block *);
extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
/* mballoc.c */
extern long ext4_mb_stats;
@@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
-extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
ext4_grpblk_t add);
-
-
+extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
+extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
+ ext4_group_t, int);
/* inode.c */
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, unsigned long maxblocks,
- struct buffer_head *bh_result,
- int create, int extend_disksize);
extern struct inode *ext4_iget(struct super_block *, unsigned long);
extern int ext4_write_inode(struct inode *, int);
@@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
extern void ext4_warning(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
+extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
+ const char *, const char *, ...)
+ __attribute__ ((format (printf, 4, 5)));
extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
@@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
struct ext4_group_desc *bg);
+extern __u32 ext4_free_blks_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_free_inodes_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_used_dirs_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_itable_unused_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
extern void ext4_block_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_table_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_free_blks_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_free_inodes_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_used_dirs_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_itable_unused_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
@@ -1225,11 +1247,11 @@ do { \
} while (0)
#ifdef CONFIG_SMP
-/* Each CPU can accumulate FBC_BATCH blocks in their local
+/* Each CPU can accumulate percpu_counter_batch blocks in their local
* counters. So we need to make sure we have free blocks more
- * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
+ * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times.
*/
-#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
#else
#define EXT4_FREEBLOCKS_WATERMARK 0
#endif
@@ -1246,6 +1268,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
return ;
}
+struct ext4_group_info {
+ unsigned long bb_state;
+ struct rb_root bb_free_root;
+ unsigned short bb_first_free;
+ unsigned short bb_free;
+ unsigned short bb_fragments;
+ struct list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+ void *bb_bitmap;
+#endif
+ struct rw_semaphore alloc_sem;
+ unsigned short bb_counters[];
+};
+
+#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
+#define EXT4_GROUP_INFO_LOCKED_BIT 1
+
+#define EXT4_MB_GRP_NEED_INIT(grp) \
+ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+ struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+ bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline void ext4_unlock_group(struct super_block *sb,
+ ext4_group_t group)
+{
+ struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+ bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline int ext4_is_group_locked(struct super_block *sb,
+ ext4_group_t group)
+{
+ struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+ return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
+ &(grinfo->bb_state));
+}
+
/*
* Inodes and files operations
*/
@@ -1271,18 +1337,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
int chunk);
extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock,
- unsigned long max_blocks, struct buffer_head *bh_result,
- int create, int extend_disksize);
+ ext4_lblk_t iblock, unsigned int max_blocks,
+ struct buffer_head *bh_result,
+ int create, int extend_disksize);
extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
- sector_t block, unsigned long max_blocks,
+ sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int create,
int extend_disksize, int flag);
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
+
+/*
+ * Add new method to test wether block and inode bitmaps are properly
+ * initialized. With uninit_bg reading the block from disk is not enough
+ * to mark the bitmap uptodate. We need to also zero-out the bitmap
+ */
+#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
+
+static inline int bitmap_uptodate(struct buffer_head *bh)
+{
+ return (buffer_uptodate(bh) &&
+ test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
+}
+static inline void set_bitmap_uptodate(struct buffer_head *bh)
+{
+ set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
+}
+
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bec7ce59fc0..18cb67b2cbb 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode)
return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
}
-static inline void ext4_ext_tree_changed(struct inode *inode)
-{
- EXT4_I(inode)->i_ext_generation++;
-}
-
static inline void
ext4_ext_invalidate_cache(struct inode *inode)
{
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 5c124c0ac6d..e69acc16f5c 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t;
typedef __u32 ext4_lblk_t;
/* data type for block group number */
-typedef unsigned long ext4_group_t;
+typedef unsigned int ext4_group_t;
#define rsv_start rsv_window._rsv_start
#define rsv_end rsv_window._rsv_end
@@ -100,9 +100,6 @@ struct ext4_inode_info {
*/
loff_t i_disksize;
- /* on-disk additional length */
- __u16 i_extra_isize;
-
/*
* i_data_sem is for serialising ext4_truncate() against
* ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
@@ -117,7 +114,6 @@ struct ext4_inode_info {
struct inode vfs_inode;
struct jbd2_inode jinode;
- unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
/*
* File creation time. Its function is same as that of
@@ -130,10 +126,14 @@ struct ext4_inode_info {
spinlock_t i_prealloc_lock;
/* allocation reservation info for delalloc */
- unsigned long i_reserved_data_blocks;
- unsigned long i_reserved_meta_blocks;
- unsigned long i_allocated_meta_blocks;
+ unsigned int i_reserved_data_blocks;
+ unsigned int i_reserved_meta_blocks;
+ unsigned int i_allocated_meta_blocks;
unsigned short i_delalloc_reserved_flag;
+
+ /* on-disk additional length */
+ __u16 i_extra_isize;
+
spinlock_t i_block_reservation_lock;
};
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index c75384b34f2..ad13a84644e 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,53 +7,96 @@
int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
struct buffer_head *bh)
{
- int err = jbd2_journal_get_undo_access(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, __func__, bh, handle, err);
+ int err = 0;
+
+ if (ext4_handle_valid(handle)) {
+ err = jbd2_journal_get_undo_access(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, __func__, bh,
+ handle, err);
+ }
return err;
}
int __ext4_journal_get_write_access(const char *where, handle_t *handle,
struct buffer_head *bh)
{
- int err = jbd2_journal_get_write_access(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, __func__, bh, handle, err);
+ int err = 0;
+
+ if (ext4_handle_valid(handle)) {
+ err = jbd2_journal_get_write_access(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, __func__, bh,
+ handle, err);
+ }
return err;
}
int __ext4_journal_forget(const char *where, handle_t *handle,
struct buffer_head *bh)
{
- int err = jbd2_journal_forget(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, __func__, bh, handle, err);
+ int err = 0;
+
+ if (ext4_handle_valid(handle)) {
+ err = jbd2_journal_forget(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, __func__, bh,
+ handle, err);
+ }
return err;
}
int __ext4_journal_revoke(const char *where, handle_t *handle,
ext4_fsblk_t blocknr, struct buffer_head *bh)
{
- int err = jbd2_journal_revoke(handle, blocknr, bh);
- if (err)
- ext4_journal_abort_handle(where, __func__, bh, handle, err);
+ int err = 0;
+
+ if (ext4_handle_valid(handle)) {
+ err = jbd2_journal_revoke(handle, blocknr, bh);
+ if (err)
+ ext4_journal_abort_handle(where, __func__, bh,
+ handle, err);
+ }
return err;
}
int __ext4_journal_get_create_access(const char *where,
handle_t *handle, struct buffer_head *bh)
{
- int err = jbd2_journal_get_create_access(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, __func__, bh, handle, err);
+ int err = 0;
+
+ if (ext4_handle_valid(handle)) {
+ err = jbd2_journal_get_create_access(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, __func__, bh,
+ handle, err);
+ }
return err;
}
-int __ext4_journal_dirty_metadata(const char *where,
- handle_t *handle, struct buffer_head *bh)
+int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
+ struct inode *inode, struct buffer_head *bh)
{
- int err = jbd2_journal_dirty_metadata(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, __func__, bh, handle, err);
+ int err = 0;
+
+ if (ext4_handle_valid(handle)) {
+ err = jbd2_journal_dirty_metadata(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, __func__, bh,
+ handle, err);
+ } else {
+ mark_buffer_dirty(bh);
+ if (inode && inode_needs_sync(inode)) {
+ sync_dirty_buffer(bh);
+ if (buffer_req(bh) && !buffer_uptodate(bh)) {
+ ext4_error(inode->i_sb, __func__,
+ "IO error syncing inode, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long) bh->b_blocknr);
+ err = -EIO;
+ }
+ }
+ }
return err;
}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b455c685a98..be2f426f680 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -32,8 +32,8 @@
* 5 levels of tree + root which are stored in the inode. */
#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
- (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
- || test_opt(sb, EXTENTS) ? 27U : 8U)
+ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
+ ? 27U : 8U)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
@@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
* been done yet.
*/
-static inline void ext4_journal_release_buffer(handle_t *handle,
- struct buffer_head *bh)
-{
- jbd2_journal_release_buffer(handle, bh);
-}
-
void ext4_journal_abort_handle(const char *caller, const char *err_fn,
struct buffer_head *bh, handle_t *handle, int err);
@@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
int __ext4_journal_get_create_access(const char *where,
handle_t *handle, struct buffer_head *bh);
-int __ext4_journal_dirty_metadata(const char *where,
- handle_t *handle, struct buffer_head *bh);
+int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
+ struct inode *inode, struct buffer_head *bh);
#define ext4_journal_get_undo_access(handle, bh) \
__ext4_journal_get_undo_access(__func__, (handle), (bh))
@@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where,
__ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
#define ext4_journal_get_create_access(handle, bh) \
__ext4_journal_get_create_access(__func__, (handle), (bh))
-#define ext4_journal_dirty_metadata(handle, bh) \
- __ext4_journal_dirty_metadata(__func__, (handle), (bh))
#define ext4_journal_forget(handle, bh) \
__ext4_journal_forget(__func__, (handle), (bh))
+#define ext4_handle_dirty_metadata(handle, inode, bh) \
+ __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
+#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
+
+static inline int ext4_handle_valid(handle_t *handle)
+{
+ if (handle == EXT4_NOJOURNAL_HANDLE)
+ return 0;
+ return 1;
+}
+
+static inline void ext4_handle_sync(handle_t *handle)
+{
+ if (ext4_handle_valid(handle))
+ handle->h_sync = 1;
+}
+
+static inline void ext4_handle_release_buffer(handle_t *handle,
+ struct buffer_head *bh)
+{
+ if (ext4_handle_valid(handle))
+ jbd2_journal_release_buffer(handle, bh);
+}
+
+static inline int ext4_handle_is_aborted(handle_t *handle)
+{
+ if (ext4_handle_valid(handle))
+ return is_handle_aborted(handle);
+ return 0;
+}
+
+static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+{
+ if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
+ return 0;
+ return 1;
+}
+
+static inline void ext4_journal_release_buffer(handle_t *handle,
+ struct buffer_head *bh)
+{
+ if (ext4_handle_valid(handle))
+ jbd2_journal_release_buffer(handle, bh);
+}
+
static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
{
return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void)
static inline int ext4_journal_extend(handle_t *handle, int nblocks)
{
- return jbd2_journal_extend(handle, nblocks);
+ if (ext4_handle_valid(handle))
+ return jbd2_journal_extend(handle, nblocks);
+ return 0;
}
static inline int ext4_journal_restart(handle_t *handle, int nblocks)
{
- return jbd2_journal_restart(handle, nblocks);
+ if (ext4_handle_valid(handle))
+ return jbd2_journal_restart(handle, nblocks);
+ return 0;
}
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
- return jbd2_journal_blocks_per_page(inode);
+ if (EXT4_JOURNAL(inode) != NULL)
+ return jbd2_journal_blocks_per_page(inode);
+ return 0;
}
static inline int ext4_journal_force_commit(journal_t *journal)
{
- return jbd2_journal_force_commit(journal);
+ if (journal)
+ return jbd2_journal_force_commit(journal);
+ return 0;
}
static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
{
- return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+ if (ext4_handle_valid(handle))
+ return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+ return 0;
}
/* super.c */
@@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb);
static inline int ext4_should_journal_data(struct inode *inode)
{
+ if (EXT4_JOURNAL(inode) == NULL)
+ return 0;
if (!S_ISREG(inode->i_mode))
return 1;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode)
static inline int ext4_should_order_data(struct inode *inode)
{
+ if (EXT4_JOURNAL(inode) == NULL)
+ return 0;
if (!S_ISREG(inode->i_mode))
return 0;
if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
@@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode)
static inline int ext4_should_writeback_data(struct inode *inode)
{
+ if (EXT4_JOURNAL(inode) == NULL)
+ return 0;
if (!S_ISREG(inode->i_mode))
return 0;
if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 445fde603df..039b6ea1a04 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -57,6 +57,7 @@ struct ext4_sb_info {
u32 s_next_generation;
u32 s_hash_seed[4];
int s_def_hash_version;
+ int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
struct percpu_counter s_freeblocks_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
@@ -73,6 +74,8 @@ struct ext4_sb_info {
struct journal_s *s_journal;
struct list_head s_orphan;
unsigned long s_commit_interval;
+ u32 s_max_batch_time;
+ u32 s_min_batch_time;
struct block_device *journal_bdev;
#ifdef CONFIG_JBD2_DEBUG
struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
@@ -101,7 +104,8 @@ struct ext4_sb_info {
spinlock_t s_reserve_lock;
spinlock_t s_md_lock;
tid_t s_last_transaction;
- unsigned short *s_mb_offsets, *s_mb_maxs;
+ unsigned short *s_mb_offsets;
+ unsigned int *s_mb_maxs;
/* tunables */
unsigned long s_stripe;
@@ -146,4 +150,10 @@ struct ext4_sb_info {
struct flex_groups *s_flex_groups;
};
+static inline spinlock_t *
+sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
+{
+ return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+}
+
#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea2ce3c0ae6..54bf0623a9a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
{
int err;
+ if (!ext4_handle_valid(handle))
+ return 0;
if (handle->h_buffer_credits > needed)
return 0;
err = ext4_journal_extend(handle, needed);
@@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
int err;
if (path->p_bh) {
/* path points to block */
- err = ext4_journal_dirty_metadata(handle, path->p_bh);
+ err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
@@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_meta_block(handle, inode, goal, err);
+ newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
return newblock;
}
@@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
set_buffer_uptodate(bh);
unlock_buffer(bh);
- err = ext4_journal_dirty_metadata(handle, bh);
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
if (err)
goto cleanup;
brelse(bh);
@@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
set_buffer_uptodate(bh);
unlock_buffer(bh);
- err = ext4_journal_dirty_metadata(handle, bh);
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
if (err)
goto cleanup;
brelse(bh);
@@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
set_buffer_uptodate(bh);
unlock_buffer(bh);
- err = ext4_journal_dirty_metadata(handle, bh);
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
if (err)
goto out;
@@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
while (--depth >= 0) {
ix = path[depth].p_idx;
if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
- break;
+ goto got_index;
}
- if (depth < 0) {
- /* we've gone up to the root and
- * found no index to the right */
- return 0;
- }
+ /* we've gone up to the root and found no index to the right */
+ return 0;
+got_index:
/* we've found index to the right, let's
* follow it and find the closest allocated
* block to the right */
@@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
*phys = ext_pblock(ex);
put_bh(bh);
return 0;
-
}
/*
@@ -1622,7 +1621,6 @@ cleanup:
ext4_ext_drop_refs(npath);
kfree(npath);
}
- ext4_ext_tree_changed(inode);
ext4_ext_invalidate_cache(inode);
return err;
}
@@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
}
}
out:
- ext4_ext_tree_changed(inode);
ext4_ext_drop_refs(path);
kfree(path);
ext4_journal_stop(handle);
@@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb)
* possible initialization would be here
*/
- if (test_opt(sb, EXTENTS)) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
printk(KERN_INFO "EXT4-fs: file extents enabled");
#ifdef AGGRESSIVE_TEST
printk(", aggressive tests");
@@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb)
*/
void ext4_ext_release(struct super_block *sb)
{
- if (!test_opt(sb, EXTENTS))
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
return;
#ifdef EXTENTS_STATS
@@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t iblock,
- unsigned long max_blocks)
+ unsigned int max_blocks)
{
struct ext4_extent *ex, newex, orig_ex;
struct ext4_extent *ex1 = NULL;
@@ -2536,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
*/
newdepth = ext_depth(inode);
/*
- * update the extent length after successfull insert of the
+ * update the extent length after successful insert of the
* split extent
*/
orig_ex.ee_len = cpu_to_le16(ee_len -
@@ -2678,26 +2675,26 @@ fix_extent_len:
*/
int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock,
- unsigned long max_blocks, struct buffer_head *bh_result,
+ unsigned int max_blocks, struct buffer_head *bh_result,
int create, int extend_disksize)
{
struct ext4_ext_path *path = NULL;
struct ext4_extent_header *eh;
struct ext4_extent newex, *ex;
- ext4_fsblk_t goal, newblock;
- int err = 0, depth, ret;
- unsigned long allocated = 0;
+ ext4_fsblk_t newblock;
+ int err = 0, depth, ret, cache_type;
+ unsigned int allocated = 0;
struct ext4_allocation_request ar;
loff_t disksize;
__clear_bit(BH_New, &bh_result->b_state);
- ext_debug("blocks %u/%lu requested for inode %u\n",
+ ext_debug("blocks %u/%u requested for inode %u\n",
iblock, max_blocks, inode->i_ino);
/* check in cache */
- goal = ext4_ext_in_cache(inode, iblock, &newex);
- if (goal) {
- if (goal == EXT4_EXT_CACHE_GAP) {
+ cache_type = ext4_ext_in_cache(inode, iblock, &newex);
+ if (cache_type) {
+ if (cache_type == EXT4_EXT_CACHE_GAP) {
if (!create) {
/*
* block isn't allocated yet and
@@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
goto out2;
}
/* we should allocate requested block */
- } else if (goal == EXT4_EXT_CACHE_EXTENT) {
+ } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
/* block is already allocated */
newblock = iblock
- le32_to_cpu(newex.ee_block)
@@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
if (!newblock)
goto out2;
ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
- goal, newblock, allocated);
+ ar.goal, newblock, allocated);
/* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock);
@@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode)
* transaction synchronous.
*/
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
out_stop:
up_write(&EXT4_I(inode)->i_data_sem);
@@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
handle_t *handle;
ext4_lblk_t block;
loff_t new_size;
- unsigned long max_blocks;
+ unsigned int max_blocks;
int ret = 0;
int ret2 = 0;
int retries = 0;
@@ -3083,7 +3080,7 @@ retry:
/*
* Callback function called for each extent to gather FIEMAP information.
*/
-int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
struct ext4_ext_cache *newex, struct ext4_extent *ex,
void *data)
{
@@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
/* fiemap flags we can handle specified here */
#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
-int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
+static int ext4_xattr_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo)
{
__u64 physical = 0;
__u64 length;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bd11fba71f..f731cb545a0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len);
-
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 556ca8eba3d..ac8f168c8ab 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
/* The old legacy hash */
-static __u32 dx_hack_hash(const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
{
- __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ const unsigned char *ucp = (const unsigned char *) name;
+
+ while (len--) {
+ hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+
+ if (hash & 0x80000000)
+ hash -= 0x7fffffff;
+ hash1 = hash0;
+ hash0 = hash;
+ }
+ return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+ __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ const signed char *scp = (const signed char *) name;
+
while (len--) {
- __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+ hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
- if (hash & 0x80000000) hash -= 0x7fffffff;
+ if (hash & 0x80000000)
+ hash -= 0x7fffffff;
hash1 = hash0;
hash0 = hash;
}
- return (hash0 << 1);
+ return hash0 << 1;
+}
+
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
+{
+ __u32 pad, val;
+ int i;
+ const signed char *scp = (const signed char *) msg;
+
+ pad = (__u32)len | ((__u32)len << 8);
+ pad |= pad << 16;
+
+ val = pad;
+ if (len > num*4)
+ len = num * 4;
+ for (i = 0; i < len; i++) {
+ if ((i % 4) == 0)
+ val = pad;
+ val = ((int) scp[i]) + (val << 8);
+ if ((i % 4) == 3) {
+ *buf++ = val;
+ val = pad;
+ num--;
+ }
+ }
+ if (--num >= 0)
+ *buf++ = val;
+ while (--num >= 0)
+ *buf++ = pad;
}
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
{
__u32 pad, val;
int i;
+ const unsigned char *ucp = (const unsigned char *) msg;
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
for (i = 0; i < len; i++) {
if ((i % 4) == 0)
val = pad;
- val = msg[i] + (val << 8);
+ val = ((int) ucp[i]) + (val << 8);
if ((i % 4) == 3) {
*buf++ = val;
val = pad;
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
const char *p;
int i;
__u32 in[8], buf[4];
+ void (*str2hashbuf)(const char *, int, __u32 *, int) =
+ str2hashbuf_signed;
/* Initialize the default seed for the hash checksum functions */
buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
}
switch (hinfo->hash_version) {
+ case DX_HASH_LEGACY_UNSIGNED:
+ hash = dx_hack_hash_unsigned(name, len);
+ break;
case DX_HASH_LEGACY:
- hash = dx_hack_hash(name, len);
+ hash = dx_hack_hash_signed(name, len);
break;
+ case DX_HASH_HALF_MD4_UNSIGNED:
+ str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
- str2hashbuf(p, len, in, 8);
+ (*str2hashbuf)(p, len, in, 8);
half_md4_transform(buf, in);
len -= 32;
p += 32;
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
minor_hash = buf[2];
hash = buf[1];
break;
+ case DX_HASH_TEA_UNSIGNED:
+ str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_TEA:
p = name;
while (len > 0) {
- str2hashbuf(p, len, in, 4);
+ (*str2hashbuf)(p, len, in, 4);
TEA_transform(buf, in);
len -= 16;
p += 16;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 6e6052879aa..4fb86a0061d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
/* If checksum is bad mark all blocks and inodes use to prevent
* allocation, essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
- ext4_error(sb, __func__, "Checksum bad for group %lu\n",
+ ext4_error(sb, __func__, "Checksum bad for group %u",
block_group);
- gdp->bg_free_blocks_count = 0;
- gdp->bg_free_inodes_count = 0;
- gdp->bg_itable_unused = 0;
+ ext4_free_blks_set(sb, gdp, 0);
+ ext4_free_inodes_set(sb, gdp, 0);
+ ext4_itable_unused_set(sb, gdp, 0);
memset(bh->b_data, 0xff, sb->s_blocksize);
return 0;
}
memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
- mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+ mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
bh->b_data);
return EXT4_INODES_PER_GROUP(sb);
@@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
if (unlikely(!bh)) {
ext4_error(sb, __func__,
"Cannot read inode bitmap - "
- "block_group = %lu, inode_bitmap = %llu",
+ "block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
return NULL;
}
- if (buffer_uptodate(bh) &&
- !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+ if (bitmap_uptodate(bh))
return bh;
lock_buffer(bh);
+ if (bitmap_uptodate(bh)) {
+ unlock_buffer(bh);
+ return bh;
+ }
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
ext4_init_inode_bitmap(sb, bh, block_group, desc);
+ set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
- unlock_buffer(bh);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ unlock_buffer(bh);
return bh;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ if (buffer_uptodate(bh)) {
+ /*
+ * if not uninit if bh is uptodate,
+ * bitmap is also uptodate
+ */
+ set_bitmap_uptodate(bh);
+ unlock_buffer(bh);
+ return bh;
+ }
+ /*
+ * submit the buffer_head for read. We can
+ * safely mark the bitmap as uptodate now.
+ * We do it here so the bitmap uptodate bit
+ * get set with buffer lock held.
+ */
+ set_bitmap_uptodate(bh);
if (bh_submit_read(bh) < 0) {
put_bh(bh);
ext4_error(sb, __func__,
"Cannot read inode bitmap - "
- "block_group = %lu, inode_bitmap = %llu",
+ "block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
return NULL;
}
@@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
struct ext4_group_desc *gdp;
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
- int fatal = 0, err;
+ int fatal = 0, err, count;
ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
@@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
ino = inode->i_ino;
ext4_debug("freeing inode %lu\n", ino);
+ trace_mark(ext4_free_inode,
+ "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
+ sb->s_id, inode->i_ino, inode->i_mode,
+ (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
+ (unsigned long long) inode->i_blocks);
/*
* Note: we must free any quota before locking the superblock,
@@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (gdp) {
spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&gdp->bg_free_inodes_count, 1);
- if (is_directory)
- le16_add_cpu(&gdp->bg_used_dirs_count, -1);
+ count = ext4_free_inodes_count(sb, gdp) + 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (is_directory) {
+ count = ext4_used_dirs_count(sb, gdp) - 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ }
gdp->bg_checksum = ext4_group_desc_csum(sbi,
block_group, gdp);
spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
spin_unlock(sb_bgl_lock(sbi, flex_group));
}
}
- BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh2);
+ BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, bh2);
if (!fatal) fatal = err;
}
- BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+ BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (!fatal)
fatal = err;
sb->s_dirt = 1;
@@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
for (group = 0; group < ngroups; group++) {
desc = ext4_get_group_desc(sb, group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
+ if (!desc || !ext4_free_inodes_count(sb, desc))
continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+ if (ext4_free_inodes_count(sb, desc) < avefreei)
continue;
if (!best_desc ||
- (le16_to_cpu(desc->bg_free_blocks_count) >
- le16_to_cpu(best_desc->bg_free_blocks_count))) {
+ (ext4_free_blks_count(sb, desc) >
+ ext4_free_blks_count(sb, best_desc))) {
*best_group = group;
best_desc = desc;
ret = 0;
@@ -369,7 +397,7 @@ found_flexbg:
for (i = best_flex * flex_size; i < ngroups &&
i < (best_flex + 1) * flex_size; i++) {
desc = ext4_get_group_desc(sb, i, &bh);
- if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ if (ext4_free_inodes_count(sb, desc)) {
*best_group = i;
goto out;
}
@@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
for (i = 0; i < ngroups; i++) {
grp = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, grp, NULL);
- if (!desc || !desc->bg_free_inodes_count)
+ if (!desc || !ext4_free_inodes_count(sb, desc))
continue;
- if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+ if (ext4_used_dirs_count(sb, desc) >= best_ndir)
continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+ if (ext4_free_inodes_count(sb, desc) < avefreei)
continue;
- if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+ if (ext4_free_blks_count(sb, desc) < avefreeb)
continue;
*group = grp;
ret = 0;
- best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+ best_ndir = ext4_used_dirs_count(sb, desc);
}
if (ret == 0)
return ret;
@@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
for (i = 0; i < ngroups; i++) {
*group = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
+ if (!desc || !ext4_free_inodes_count(sb, desc))
continue;
- if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+ if (ext4_used_dirs_count(sb, desc) >= max_dirs)
continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+ if (ext4_free_inodes_count(sb, desc) < min_inodes)
continue;
- if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
+ if (ext4_free_blks_count(sb, desc) < min_blocks)
continue;
return 0;
}
@@ -494,8 +522,8 @@ fallback:
for (i = 0; i < ngroups; i++) {
*group = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && desc->bg_free_inodes_count &&
- le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_inodes_count(sb, desc) >= avefreei)
return 0;
}
@@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*/
*group = parent_group;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
- le16_to_cpu(desc->bg_free_blocks_count))
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_blks_count(sb, desc))
return 0;
/*
@@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
if (*group >= ngroups)
*group -= ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
- le16_to_cpu(desc->bg_free_blocks_count))
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_blks_count(sb, desc))
return 0;
}
@@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
if (++*group >= ngroups)
*group = 0;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+ if (desc && ext4_free_inodes_count(sb, desc))
return 0;
}
@@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
}
/*
+ * claim the inode from the inode bitmap. If the group
+ * is uninit we need to take the groups's sb_bgl_lock
+ * and clear the uninit flag. The inode bitmap update
+ * and group desc uninit flag clear should be done
+ * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * doesn't race with the ext4_claim_inode
+ */
+static int ext4_claim_inode(struct super_block *sb,
+ struct buffer_head *inode_bitmap_bh,
+ unsigned long ino, ext4_group_t group, int mode)
+{
+ int free = 0, retval = 0, count;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+
+ spin_lock(sb_bgl_lock(sbi, group));
+ if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+ /* not a free inode */
+ retval = 1;
+ goto err_ret;
+ }
+ ino++;
+ if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
+ ino > EXT4_INODES_PER_GROUP(sb)) {
+ spin_unlock(sb_bgl_lock(sbi, group));
+ ext4_error(sb, __func__,
+ "reserved inode or inode > inodes count - "
+ "block_group = %u, inode=%lu", group,
+ ino + group * EXT4_INODES_PER_GROUP(sb));
+ return 1;
+ }
+ /* If we didn't allocate from within the initialized part of the inode
+ * table then we need to initialize up to this inode. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+ /* When marking the block group with
+ * ~EXT4_BG_INODE_UNINIT we don't want to depend
+ * on the value of bg_itable_unused even though
+ * mke2fs could have initialized the same for us.
+ * Instead we calculated the value below
+ */
+
+ free = 0;
+ } else {
+ free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ }
+
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ *
+ */
+ if (ino > free)
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - ino));
+ }
+ count = ext4_free_inodes_count(sb, gdp) - 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (S_ISDIR(mode)) {
+ count = ext4_used_dirs_count(sb, gdp) + 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ }
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+err_ret:
+ spin_unlock(sb_bgl_lock(sbi, group));
+ return retval;
+}
+
+/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
@@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
{
struct super_block *sb;
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *bh2;
+ struct buffer_head *inode_bitmap_bh = NULL;
+ struct buffer_head *group_desc_bh;
ext4_group_t group = 0;
unsigned long ino = 0;
struct inode *inode;
@@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
return ERR_PTR(-EPERM);
sb = dir->i_sb;
+ trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
+ dir->i_ino, mode);
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -631,40 +734,52 @@ got_group:
for (i = 0; i < sbi->s_groups_count; i++) {
err = -EIO;
- gdp = ext4_get_group_desc(sb, group, &bh2);
+ gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp)
goto fail;
- brelse(bitmap_bh);
- bitmap_bh = ext4_read_inode_bitmap(sb, group);
- if (!bitmap_bh)
+ brelse(inode_bitmap_bh);
+ inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+ if (!inode_bitmap_bh)
goto fail;
ino = 0;
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
- bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
+ inode_bitmap_bh->b_data,
+ EXT4_INODES_PER_GROUP(sb), ino);
+
if (ino < EXT4_INODES_PER_GROUP(sb)) {
- BUFFER_TRACE(bitmap_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
+ BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle,
+ inode_bitmap_bh);
if (err)
goto fail;
- if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
- ino, bitmap_bh->b_data)) {
+ BUFFER_TRACE(group_desc_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle,
+ group_desc_bh);
+ if (err)
+ goto fail;
+ if (!ext4_claim_inode(sb, inode_bitmap_bh,
+ ino, group, mode)) {
/* we won it */
- BUFFER_TRACE(bitmap_bh,
- "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle,
- bitmap_bh);
+ BUFFER_TRACE(inode_bitmap_bh,
+ "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle,
+ inode,
+ inode_bitmap_bh);
if (err)
goto fail;
+ /* zero bit is inode number 1*/
+ ino++;
goto got;
}
/* we lost it */
- jbd2_journal_release_buffer(handle, bitmap_bh);
+ ext4_handle_release_buffer(handle, inode_bitmap_bh);
+ ext4_handle_release_buffer(handle, group_desc_bh);
if (++ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
@@ -684,30 +799,16 @@ repeat_in_this_group:
goto out;
got:
- ino++;
- if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
- ino > EXT4_INODES_PER_GROUP(sb)) {
- ext4_error(sb, __func__,
- "reserved inode or inode > inodes count - "
- "block_group = %lu, inode=%lu", group,
- ino + group * EXT4_INODES_PER_GROUP(sb));
- err = -EIO;
- goto fail;
- }
-
- BUFFER_TRACE(bh2, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh2);
- if (err) goto fail;
-
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
+ struct buffer_head *block_bitmap_bh;
- BUFFER_TRACE(block_bh, "get block bitmap access");
- err = ext4_journal_get_write_access(handle, block_bh);
+ block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+ BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
+ err = ext4_journal_get_write_access(handle, block_bitmap_bh);
if (err) {
- brelse(block_bh);
+ brelse(block_bitmap_bh);
goto fail;
}
@@ -715,9 +816,9 @@ got:
spin_lock(sb_bgl_lock(sbi, group));
/* recheck and clear flag under lock if we still need to */
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
free = ext4_free_blocks_after_init(sb, group, gdp);
- gdp->bg_free_blocks_count = cpu_to_le16(free);
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_blks_set(sb, gdp, free);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
gdp);
}
@@ -725,55 +826,19 @@ got:
/* Don't need to dirty bitmap block if we didn't change it */
if (free) {
- BUFFER_TRACE(block_bh, "dirty block bitmap");
- err = ext4_journal_dirty_metadata(handle, block_bh);
+ BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+ err = ext4_handle_dirty_metadata(handle,
+ NULL, block_bitmap_bh);
}
- brelse(block_bh);
+ brelse(block_bitmap_bh);
if (err)
goto fail;
}
-
- spin_lock(sb_bgl_lock(sbi, group));
- /* If we didn't allocate from within the initialized part of the inode
- * table then we need to initialize up to this inode. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-
- /* When marking the block group with
- * ~EXT4_BG_INODE_UNINIT we don't want to depend
- * on the value of bg_itable_unused even though
- * mke2fs could have initialized the same for us.
- * Instead we calculated the value below
- */
-
- free = 0;
- } else {
- free = EXT4_INODES_PER_GROUP(sb) -
- le16_to_cpu(gdp->bg_itable_unused);
- }
-
- /*
- * Check the relative inode number against the last used
- * relative inode number in this group. if it is greater
- * we need to update the bg_itable_unused count
- *
- */
- if (ino > free)
- gdp->bg_itable_unused =
- cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
- }
-
- le16_add_cpu(&gdp->bg_free_inodes_count, -1);
- if (S_ISDIR(mode)) {
- le16_add_cpu(&gdp->bg_used_dirs_count, 1);
- }
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
- spin_unlock(sb_bgl_lock(sbi, group));
- BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh2);
- if (err) goto fail;
+ BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
+ if (err)
+ goto fail;
percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
@@ -825,7 +890,7 @@ got:
ext4_set_inode_flags(inode);
if (IS_DIRSYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
if (insert_inode_locked(inode) < 0) {
err = -EINVAL;
goto fail_drop;
@@ -852,7 +917,7 @@ got:
if (err)
goto fail_free_drop;
- if (test_opt(sb, EXTENTS)) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
/* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
@@ -867,6 +932,8 @@ got:
}
ext4_debug("allocating inode %lu\n", inode->i_ino);
+ trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
+ sb->s_id, inode->i_ino, dir->i_ino, mode);
goto really_out;
fail:
ext4_std_error(sb, err);
@@ -874,7 +941,7 @@ out:
iput(inode);
ret = ERR_PTR(err);
really_out:
- brelse(bitmap_bh);
+ brelse(inode_bitmap_bh);
return ret;
fail_free_drop:
@@ -886,7 +953,7 @@ fail_drop:
inode->i_nlink = 0;
unlock_new_inode(inode);
iput(inode);
- brelse(bitmap_bh);
+ brelse(inode_bitmap_bh);
return ERR_PTR(err);
}
@@ -985,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ desc_count += ext4_free_inodes_count(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_inode_bitmap(sb, i);
if (!bitmap_bh)
@@ -993,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
- i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+ i, ext4_free_inodes_count(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
@@ -1007,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ desc_count += ext4_free_inodes_count(sb, gdp);
cond_resched();
}
return desc_count;
@@ -1024,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- count += le16_to_cpu(gdp->bg_used_dirs_count);
+ count += ext4_used_dirs_count(sb, gdp);
}
return count;
}
-
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c3325e0b00..a6444cee0c7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -72,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
* "bh" may be NULL: a metadata block may have been freed from memory
* but there may still be a record of it in the journal, and that record
* still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling so there's nothing to do.
*/
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr)
{
int err;
+ if (!ext4_handle_valid(handle))
+ return 0;
+
might_sleep();
BUFFER_TRACE(bh, "enter");
@@ -170,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode)
*/
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
- if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+ if (!ext4_handle_valid(handle))
+ return 0;
+ if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
return 0;
if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
return 0;
@@ -184,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
*/
static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
{
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
jbd_debug(2, "restarting handle %p\n", handle);
return ext4_journal_restart(handle, blocks_for_truncate(inode));
}
@@ -216,7 +224,7 @@ void ext4_delete_inode(struct inode *inode)
}
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
inode->i_size = 0;
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
@@ -233,7 +241,7 @@ void ext4_delete_inode(struct inode *inode)
* enough credits left in the handle to remove the inode from
* the orphan list and set the dtime field.
*/
- if (handle->h_buffer_credits < 3) {
+ if (!ext4_handle_has_enough_credits(handle, 3)) {
err = ext4_journal_extend(handle, 3);
if (err > 0)
err = ext4_journal_restart(handle, 3);
@@ -506,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
* return the total number of blocks to be allocate, including the
* direct and indirect blocks.
*/
-static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
int blocks_to_boundary)
{
- unsigned long count = 0;
+ unsigned int count = 0;
/*
* Simple case, [t,d]Indirect block(s) has not allocated yet
@@ -547,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
int indirect_blks, int blks,
ext4_fsblk_t new_blocks[4], int *err)
{
+ struct ext4_allocation_request ar;
int target, i;
unsigned long count = 0, blk_allocated = 0;
int index = 0;
@@ -595,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
if (!target)
goto allocated;
/* Now allocate data blocks */
- count = target;
- /* allocating blocks for data blocks */
- current_block = ext4_new_blocks(handle, inode, iblock,
- goal, &count, err);
+ memset(&ar, 0, sizeof(ar));
+ ar.inode = inode;
+ ar.goal = goal;
+ ar.len = target;
+ ar.logical = iblock;
+ if (S_ISREG(inode->i_mode))
+ /* enable in-core preallocation only for regular files */
+ ar.flags = EXT4_MB_HINT_DATA;
+
+ current_block = ext4_mb_new_blocks(handle, &ar, err);
+
if (*err && (target == blks)) {
/*
* if the allocation failed and we didn't allocate
@@ -614,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
*/
new_blocks[index] = current_block;
}
- blk_allocated += count;
+ blk_allocated += ar.len;
}
allocated:
/* total number of blocks allocated for direct blocks */
@@ -709,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
set_buffer_uptodate(bh);
unlock_buffer(bh);
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
if (err)
goto failed;
}
@@ -792,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
* generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
*/
jbd_debug(5, "splicing indirect only\n");
- BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, where->bh);
+ BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, where->bh);
if (err)
goto err_out;
} else {
@@ -840,10 +856,10 @@ err_out:
* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
* (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
*/
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, unsigned long maxblocks,
- struct buffer_head *bh_result,
- int create, int extend_disksize)
+static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, unsigned int maxblocks,
+ struct buffer_head *bh_result,
+ int create, int extend_disksize)
{
int err = -EIO;
ext4_lblk_t offsets[4];
@@ -1045,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
* It returns the error in case of allocation failure.
*/
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
- unsigned long max_blocks, struct buffer_head *bh,
+ unsigned int max_blocks, struct buffer_head *bh,
int create, int extend_disksize, int flag)
{
int retval;
@@ -1221,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
set_buffer_uptodate(bh);
}
unlock_buffer(bh);
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
if (!fatal)
fatal = err;
} else {
@@ -1335,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
unsigned from, to;
+ trace_mark(ext4_write_begin,
+ "dev %s ino %lu pos %llu len %u flags %u",
+ inode->i_sb->s_id, inode->i_ino,
+ (unsigned long long) pos, len, flags);
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
@@ -1346,7 +1366,7 @@ retry:
goto out;
}
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
ext4_journal_stop(handle);
ret = -ENOMEM;
@@ -1387,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
set_buffer_uptodate(bh);
- return ext4_journal_dirty_metadata(handle, bh);
+ return ext4_handle_dirty_metadata(handle, NULL, bh);
}
/*
@@ -1406,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file,
struct inode *inode = mapping->host;
int ret = 0, ret2;
+ trace_mark(ext4_ordered_write_end,
+ "dev %s ino %lu pos %llu len %u copied %u",
+ inode->i_sb->s_id, inode->i_ino,
+ (unsigned long long) pos, len, copied);
ret = ext4_jbd2_file_inode(handle, inode);
if (ret == 0) {
@@ -1444,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file,
int ret = 0, ret2;
loff_t new_i_size;
+ trace_mark(ext4_writeback_write_end,
+ "dev %s ino %lu pos %llu len %u copied %u",
+ inode->i_sb->s_id, inode->i_ino,
+ (unsigned long long) pos, len, copied);
new_i_size = pos + copied;
if (new_i_size > EXT4_I(inode)->i_disksize) {
ext4_update_i_disksize(inode, new_i_size);
@@ -1479,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file,
unsigned from, to;
loff_t new_i_size;
+ trace_mark(ext4_journalled_write_end,
+ "dev %s ino %lu pos %llu len %u copied %u",
+ inode->i_sb->s_id, inode->i_ino,
+ (unsigned long long) pos, len, copied);
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
@@ -1625,7 +1657,7 @@ struct mpage_da_data {
get_block_t *get_block;
struct writeback_control *wbc;
int io_done;
- long pages_written;
+ int pages_written;
int retval;
};
@@ -1645,35 +1677,39 @@ struct mpage_da_data {
*/
static int mpage_da_submit_io(struct mpage_da_data *mpd)
{
- struct address_space *mapping = mpd->inode->i_mapping;
- int ret = 0, err, nr_pages, i;
- unsigned long index, end;
- struct pagevec pvec;
long pages_skipped;
+ struct pagevec pvec;
+ unsigned long index, end;
+ int ret = 0, err, nr_pages, i;
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
BUG_ON(mpd->next_page <= mpd->first_page);
- pagevec_init(&pvec, 0);
+ /*
+ * We need to start from the first_page to the next_page - 1
+ * to make sure we also write the mapped dirty buffer_heads.
+ * If we look at mpd->lbh.b_blocknr we would only be looking
+ * at the currently mapped buffer_heads.
+ */
index = mpd->first_page;
end = mpd->next_page - 1;
+ pagevec_init(&pvec, 0);
while (index <= end) {
- /*
- * We can use PAGECACHE_TAG_DIRTY lookup here because
- * even though we have cleared the dirty flag on the page
- * We still keep the page in the radix tree with tag
- * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
- * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
- * which is called via the below writepage callback.
- */
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- min(end - index,
- (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+
pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
if (!err && (pages_skipped == mpd->wbc->pages_skipped))
@@ -1831,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode)
ext4_count_free_blocks(inode->i_sb));
printk(KERN_EMERG "Free/Dirty block details\n");
printk(KERN_EMERG "free_blocks=%lld\n",
- percpu_counter_sum(&sbi->s_freeblocks_counter));
+ (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
printk(KERN_EMERG "dirty_blocks=%lld\n",
- percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+ (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
printk(KERN_EMERG "Block reservation details\n");
- printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
+ printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
EXT4_I(inode)->i_reserved_data_blocks);
- printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
+ printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
EXT4_I(inode)->i_reserved_meta_blocks);
return;
}
@@ -2087,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page,
bh = head;
do {
BUG_ON(buffer_locked(bh));
+ /*
+ * We need to try to allocate
+ * unmapped blocks in the same page.
+ * Otherwise we won't make progress
+ * with the page in ext4_da_writepage
+ */
if (buffer_dirty(bh) &&
(!buffer_mapped(bh) || buffer_delay(bh))) {
mpage_add_bh_to_extent(mpd, logical, bh);
if (mpd->io_done)
return MPAGE_DA_EXTENT_TAIL;
+ } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
+ /*
+ * mapped dirty buffer. We need to update
+ * the b_state because we look at
+ * b_state in mpage_da_map_blocks. We don't
+ * update b_size because if we find an
+ * unmapped buffer_head later we need to
+ * use the b_state flag of that buffer_head.
+ */
+ if (mpd->lbh.b_size == 0)
+ mpd->lbh.b_state =
+ bh->b_state & BH_FLAGS;
}
logical++;
} while ((bh = bh->b_this_page) != head);
@@ -2269,10 +2323,13 @@ static int ext4_da_writepage(struct page *page,
{
int ret = 0;
loff_t size;
- unsigned long len;
+ unsigned int len;
struct buffer_head *page_bufs;
struct inode *inode = page->mapping->host;
+ trace_mark(ext4_da_writepage,
+ "dev %s ino %lu page_index %lu",
+ inode->i_sb->s_id, inode->i_ino, page->index);
size = i_size_read(inode);
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
@@ -2378,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping,
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
int no_nrwrite_index_update;
- long pages_written = 0, pages_skipped;
+ int pages_written = 0;
+ long pages_skipped;
int needed_blocks, ret = 0, nr_to_writebump = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+ trace_mark(ext4_da_writepages,
+ "dev %s ino %lu nr_t_write %ld "
+ "pages_skipped %ld range_start %llu "
+ "range_end %llu nonblocking %d "
+ "for_kupdate %d for_reclaim %d "
+ "for_writepages %d range_cyclic %d",
+ inode->i_sb->s_id, inode->i_ino,
+ wbc->nr_to_write, wbc->pages_skipped,
+ (unsigned long long) wbc->range_start,
+ (unsigned long long) wbc->range_end,
+ wbc->nonblocking, wbc->for_kupdate,
+ wbc->for_reclaim, wbc->for_writepages,
+ wbc->range_cyclic);
+
/*
* No pages to write? This is mainly a kludge to avoid starting
* a transaction for special inodes like journal inode on last iput()
@@ -2389,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping,
*/
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
+
+ /*
+ * If the filesystem has aborted, it is read-only, so return
+ * right away instead of dumping stack traces later on that
+ * will obscure the real source of the problem. We test
+ * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
+ * the latter could be true if the filesystem is mounted
+ * read-only, and in that case, ext4_da_writepages should
+ * *never* be called, so if that ever happens, we would want
+ * the stack trace.
+ */
+ if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
+ return -EROFS;
+
/*
* Make sure nr_to_write is >= sbi->s_mb_stream_request
* This make sure small files blocks are allocated in
@@ -2433,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping,
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- printk(KERN_EMERG "%s: jbd2_start: "
+ printk(KERN_CRIT "%s: jbd2_start: "
"%ld pages, ino %lu; err %d\n", __func__,
wbc->nr_to_write, inode->i_ino, ret);
dump_stack();
@@ -2486,6 +2572,14 @@ out_writepages:
if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
wbc->nr_to_write -= nr_to_writebump;
+ trace_mark(ext4_da_writepage_result,
+ "dev %s ino %lu ret %d pages_written %d "
+ "pages_skipped %ld congestion %d "
+ "more_io %d no_nrwrite_index_update %d",
+ inode->i_sb->s_id, inode->i_ino, ret,
+ pages_written, wbc->pages_skipped,
+ wbc->encountered_congestion, wbc->more_io,
+ wbc->no_nrwrite_index_update);
return ret;
}
@@ -2498,7 +2592,7 @@ static int ext4_nonda_switch(struct super_block *sb)
/*
* switch to non delalloc mode if we are running low
* on free block. The free block accounting via percpu
- * counters can get slightly wrong with FBC_BATCH getting
+ * counters can get slightly wrong with percpu_counter_batch getting
* accumulated on each CPU without updating global counters
* Delalloc need an accurate free block accounting. So switch
* to non delalloc when we are near to error range.
@@ -2537,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
len, flags, pagep, fsdata);
}
*fsdata = (void *)0;
+
+ trace_mark(ext4_da_write_begin,
+ "dev %s ino %lu pos %llu len %u flags %u",
+ inode->i_sb->s_id, inode->i_ino,
+ (unsigned long long) pos, len, flags);
retry:
/*
* With delayed allocation, we don't log the i_disksize update
@@ -2550,7 +2649,7 @@ retry:
goto out;
}
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
ext4_journal_stop(handle);
ret = -ENOMEM;
@@ -2626,6 +2725,10 @@ static int ext4_da_write_end(struct file *file,
}
}
+ trace_mark(ext4_da_write_end,
+ "dev %s ino %lu pos %llu len %u copied %u",
+ inode->i_sb->s_id, inode->i_ino,
+ (unsigned long long) pos, len, copied);
start = pos & (PAGE_CACHE_SIZE - 1);
end = start + copied - 1;
@@ -2718,7 +2821,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
filemap_write_and_wait(mapping);
}
- if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+ BUG_ON(!EXT4_JOURNAL(inode) &&
+ EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
+
+ if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
/*
* This is a REALLY heavyweight approach, but the use of
* bmap on dirty files is expected to be extremely rare:
@@ -2836,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page,
loff_t size = i_size_read(inode);
loff_t len;
+ trace_mark(ext4_normal_writepage,
+ "dev %s ino %lu page_index %lu",
+ inode->i_sb->s_id, inode->i_ino, page->index);
J_ASSERT(PageLocked(page));
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
@@ -2921,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page,
loff_t size = i_size_read(inode);
loff_t len;
+ trace_mark(ext4_journalled_writepage,
+ "dev %s ino %lu page_index %lu",
+ inode->i_sb->s_id, inode->i_ino, page->index);
J_ASSERT(PageLocked(page));
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
@@ -2989,7 +3101,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
if (offset == 0)
ClearPageChecked(page);
- jbd2_journal_invalidatepage(journal, page, offset);
+ if (journal)
+ jbd2_journal_invalidatepage(journal, page, offset);
+ else
+ block_invalidatepage(page, offset);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2999,7 +3114,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
WARN_ON(PageChecked(page));
if (!page_has_buffers(page))
return 0;
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ if (journal)
+ return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ else
+ return try_to_free_buffers(page);
}
/*
@@ -3271,7 +3389,7 @@ int ext4_block_truncate_page(handle_t *handle,
err = 0;
if (ext4_should_journal_data(inode)) {
- err = ext4_journal_dirty_metadata(handle, bh);
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
} else {
if (ext4_should_order_data(inode))
err = ext4_jbd2_file_inode(handle, inode);
@@ -3395,8 +3513,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
__le32 *p;
if (try_to_extend_transaction(handle, inode)) {
if (bh) {
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle, inode, bh);
}
ext4_mark_inode_dirty(handle, inode);
ext4_journal_test_restart(handle, inode);
@@ -3496,7 +3614,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
count, block_to_free_p, p);
if (this_bh) {
- BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
+ BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
/*
* The buffer head should have an attached journal head at this
@@ -3505,7 +3623,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
* the block was cleared. Check for this instead of OOPSing.
*/
if (bh2jh(this_bh))
- ext4_journal_dirty_metadata(handle, this_bh);
+ ext4_handle_dirty_metadata(handle, inode, this_bh);
else
ext4_error(inode->i_sb, __func__,
"circular indirect block detected, "
@@ -3535,7 +3653,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
ext4_fsblk_t nr;
__le32 *p;
- if (is_handle_aborted(handle))
+ if (ext4_handle_is_aborted(handle))
return;
if (depth--) {
@@ -3605,7 +3723,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
* will merely complain about releasing a free block,
* rather than leaking blocks.
*/
- if (is_handle_aborted(handle))
+ if (ext4_handle_is_aborted(handle))
return;
if (try_to_extend_transaction(handle, inode)) {
ext4_mark_inode_dirty(handle, inode);
@@ -3624,9 +3742,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
parent_bh)){
*p = 0;
BUFFER_TRACE(parent_bh,
- "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle,
- parent_bh);
+ "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle,
+ inode,
+ parent_bh);
}
}
}
@@ -3814,7 +3933,7 @@ do_indirects:
* synchronous
*/
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
out_stop:
/*
* If this was a simple ftruncate(), and the file will remain alive
@@ -3844,7 +3963,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
ext4_fsblk_t block;
int inodes_per_block, inode_offset;
- iloc->bh = 0;
+ iloc->bh = NULL;
if (!ext4_valid_inum(sb, inode->i_ino))
return -EIO;
@@ -3951,7 +4070,7 @@ make_io:
num = EXT4_INODES_PER_GROUP(sb);
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
- num -= le16_to_cpu(gdp->bg_itable_unused);
+ num -= ext4_itable_unused_count(sb, gdp);
table += num / inodes_per_block;
if (end > table)
end = table;
@@ -4313,8 +4432,8 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_SET_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
sb->s_dirt = 1;
- handle->h_sync = 1;
- err = ext4_journal_dirty_metadata(handle,
+ ext4_handle_sync(handle);
+ err = ext4_handle_dirty_metadata(handle, inode,
EXT4_SB(sb)->s_sbh);
}
}
@@ -4341,9 +4460,8 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
}
-
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- rc = ext4_journal_dirty_metadata(handle, bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ rc = ext4_handle_dirty_metadata(handle, inode, bh);
if (!err)
err = rc;
ei->i_state &= ~EXT4_STATE_NEW;
@@ -4406,6 +4524,25 @@ int ext4_write_inode(struct inode *inode, int wait)
return ext4_force_commit(inode->i_sb);
}
+int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
+{
+ int err = 0;
+
+ mark_buffer_dirty(bh);
+ if (inode && inode_needs_sync(inode)) {
+ sync_dirty_buffer(bh);
+ if (buffer_req(bh) && !buffer_uptodate(bh)) {
+ ext4_error(inode->i_sb, __func__,
+ "IO error syncing inode, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long)bh->b_blocknr);
+ err = -EIO;
+ }
+ }
+ return err;
+}
+
/*
* ext4_setattr()
*
@@ -4710,16 +4847,15 @@ int
ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc)
{
- int err = 0;
- if (handle) {
- err = ext4_get_inode_loc(inode, iloc);
- if (!err) {
- BUFFER_TRACE(iloc->bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, iloc->bh);
- if (err) {
- brelse(iloc->bh);
- iloc->bh = NULL;
- }
+ int err;
+
+ err = ext4_get_inode_loc(inode, iloc);
+ if (!err) {
+ BUFFER_TRACE(iloc->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, iloc->bh);
+ if (err) {
+ brelse(iloc->bh);
+ iloc->bh = NULL;
}
}
ext4_std_error(inode->i_sb, err);
@@ -4791,7 +4927,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
might_sleep();
err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+ if (ext4_handle_valid(handle) &&
+ EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
!(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
/*
* We need extra buffer credits since we may write into EA block
@@ -4843,6 +4980,11 @@ void ext4_dirty_inode(struct inode *inode)
handle_t *current_handle = ext4_journal_current_handle();
handle_t *handle;
+ if (!ext4_handle_valid(current_handle)) {
+ ext4_mark_inode_dirty(current_handle, inode);
+ return;
+ }
+
handle = ext4_journal_start(inode, 2);
if (IS_ERR(handle))
goto out;
@@ -4880,8 +5022,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
BUFFER_TRACE(iloc.bh, "get_write_access");
err = jbd2_journal_get_write_access(handle, iloc.bh);
if (!err)
- err = ext4_journal_dirty_metadata(handle,
- iloc.bh);
+ err = ext4_handle_dirty_metadata(handle,
+ inode,
+ iloc.bh);
brelse(iloc.bh);
}
}
@@ -4907,6 +5050,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
*/
journal = EXT4_JOURNAL(inode);
+ if (!journal)
+ return 0;
if (is_journal_aborted(journal))
return -EROFS;
@@ -4936,7 +5081,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return PTR_ERR(handle);
err = ext4_mark_inode_dirty(handle, inode);
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
ext4_journal_stop(handle);
ext4_std_error(inode->i_sb, err);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dc99b4776d5..42dc83fb247 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
goto flags_out;
}
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
goto flags_err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 444ad998f72..918aec0c8a1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
* inode as:
*
* { page }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information. So for each group we
@@ -330,6 +330,18 @@
* object
*
*/
+static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+
+
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
blocknr += first + i;
blocknr +=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
- ext4_error(sb, __func__, "double-free of inode"
- " %lu's block %llu(bit %u in group %lu)\n",
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ __func__, "double-free of inode"
+ " %lu's block %llu(bit %u in group %u)",
inode ? inode->i_ino : 0, blocknr,
first + i, e4b->bd_group);
}
@@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
b2 = (unsigned char *) bitmap;
for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
if (b1[i] != b2[i]) {
- printk(KERN_ERR "corruption in group %lu "
+ printk(KERN_ERR "corruption in group %u "
"at byte %u(%u): %x in copy != %x "
"on disk/prealloc\n",
e4b->bd_group, i, i * 8, b1[i], b2[i]);
@@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
grp->bb_fragments = fragments;
if (free != grp->bb_free) {
- ext4_error(sb, __func__,
- "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
+ ext4_grp_locked_error(sb, group, __func__,
+ "EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
group, free, grp->bb_free);
/*
* If we intent to continue, we consider group descritor
@@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
* stored in the inode as
*
* { page }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information.
@@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (bh[i] == NULL)
goto out;
- if (buffer_uptodate(bh[i]) &&
- !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
+ if (bitmap_uptodate(bh[i]))
continue;
lock_buffer(bh[i]);
+ if (bitmap_uptodate(bh[i])) {
+ unlock_buffer(bh[i]);
+ continue;
+ }
spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh[i],
first_group + i, desc);
+ set_bitmap_uptodate(bh[i]);
set_buffer_uptodate(bh[i]);
- unlock_buffer(bh[i]);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+ unlock_buffer(bh[i]);
continue;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+ if (buffer_uptodate(bh[i])) {
+ /*
+ * if not uninit if bh is uptodate,
+ * bitmap is also uptodate
+ */
+ set_bitmap_uptodate(bh[i]);
+ unlock_buffer(bh[i]);
+ continue;
+ }
get_bh(bh[i]);
+ /*
+ * submit the buffer_head for read. We can
+ * safely mark the bitmap as uptodate now.
+ * We do it here so the bitmap uptodate bit
+ * get set with buffer lock held.
+ */
+ set_bitmap_uptodate(bh[i]);
bh[i]->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh[i]);
- mb_debug("read bitmap for group %lu\n", first_group + i);
+ mb_debug("read bitmap for group %u\n", first_group + i);
}
/* wait for I/O completion */
@@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
err = 0;
first_block = page->index * blocks_per_page;
+ /* init the page */
+ memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
for (i = 0; i < blocks_per_page; i++) {
int group;
struct ext4_group_info *grinfo;
@@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
BUG_ON(incore == NULL);
mb_debug("put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
- memset(data, 0xff, blocksize);
grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
@@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/*
* incore got set to the group block bitmap below
*/
+ ext4_lock_group(sb, group);
ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_unlock_group(sb, group);
incore = NULL;
} else {
/* this is block of bitmap */
@@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* mark all preallocated blks used in in-core bitmap */
ext4_mb_generate_from_pa(sb, data, group);
+ ext4_mb_generate_from_freelist(sb, data, group);
ext4_unlock_group(sb, group);
/* set incore so that the buddy information can be
@@ -886,18 +922,20 @@ static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
int blocks_per_page;
int block;
int pnum;
int poff;
struct page *page;
int ret;
+ struct ext4_group_info *grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
- mb_debug("load group %lu\n", group);
+ mb_debug("load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ grp = ext4_get_group_info(sb, group);
e4b->bd_blkbits = sb->s_blocksize_bits;
e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
e4b->bd_group = group;
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
+ e4b->alloc_semp = &grp->alloc_sem;
+
+ /* Take the read lock on the group alloc
+ * sem. This would make sure a parallel
+ * ext4_mb_init_group happening on other
+ * groups mapped by the page is blocked
+ * till we are done with allocation
+ */
+ down_read(e4b->alloc_semp);
/*
* the buddy cache inode stores the block bitmap
@@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page = find_get_page(inode->i_mapping, pnum);
if (page == NULL || !PageUptodate(page)) {
if (page)
+ /*
+ * drop the page reference and try
+ * to get the page with lock. If we
+ * are not uptodate that implies
+ * somebody just created the page but
+ * is yet to initialize the same. So
+ * wait for it to initialize.
+ */
page_cache_release(page);
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
@@ -985,6 +1040,9 @@ err:
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
+
+ /* Done with the buddy cache */
+ up_read(e4b->alloc_semp);
return ret;
}
@@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
+ /* Done with the buddy cache */
+ if (e4b->alloc_semp)
+ up_read(e4b->alloc_semp);
}
@@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
cur += 32;
continue;
}
- mb_clear_bit_atomic(lock, cur, bm);
+ if (lock)
+ mb_clear_bit_atomic(lock, cur, bm);
+ else
+ mb_clear_bit(cur, bm);
cur++;
}
}
@@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
cur += 32;
continue;
}
- mb_set_bit_atomic(lock, cur, bm);
+ if (lock)
+ mb_set_bit_atomic(lock, cur, bm);
+ else
+ mb_set_bit(cur, bm);
cur++;
}
}
@@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr += block;
blocknr +=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
- ext4_unlock_group(sb, e4b->bd_group);
- ext4_error(sb, __func__, "double-free of inode"
- " %lu's block %llu(bit %u in group %lu)\n",
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ __func__, "double-free of inode"
+ " %lu's block %llu(bit %u in group %u)",
inode ? inode->i_ino : 0, blocknr, block,
e4b->bd_group);
- ext4_lock_group(sb, e4b->bd_group);
}
mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
e4b->bd_info->bb_counters[order]++;
@@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
ac->ac_tail = ret & 0xffff;
ac->ac_buddy = ret >> 16;
- /* XXXXXXX: SUCH A HORRIBLE **CK */
- /*FIXME!! Why ? */
+ /*
+ * take the page reference. We want the page to be pinned
+ * so that we don't get a ext4_mb_init_cache_call for this
+ * group until we update the bitmap. That would mean we
+ * double allocate blocks. The reference is dropped
+ * in ext4_mb_release_context
+ */
ac->ac_bitmap_page = e4b->bd_bitmap_page;
get_page(ac->ac_bitmap_page);
ac->ac_buddy_page = e4b->bd_buddy_page;
get_page(ac->ac_buddy_page);
-
+ /* on allocation we use ac to track the held semaphore */
+ ac->alloc_semp = e4b->alloc_semp;
+ e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
spin_lock(&sbi->s_md_lock);
@@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
struct ext4_free_extent ex;
int max;
+ if (ac->ac_status == AC_STATUS_FOUND)
+ return;
/*
* We don't want to scan for a whole year
*/
@@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
* free blocks even though group info says we
* we have free blocks
*/
- ext4_error(sb, __func__, "%d free blocks as per "
- "group info. But bitmap says 0\n",
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ __func__, "%d free blocks as per "
+ "group info. But bitmap says 0",
free);
break;
}
@@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
BUG_ON(ex.fe_len <= 0);
if (free < ex.fe_len) {
- ext4_error(sb, __func__, "%d free blocks as per "
- "group info. But got %d blocks\n",
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ __func__, "%d free blocks as per "
+ "group info. But got %d blocks",
free, ex.fe_len);
/*
* The number of free blocks differs. This mostly
@@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}
+/*
+ * lock the group_info alloc_sem of all the groups
+ * belonging to the same buddy cache page. This
+ * make sure other parallel operation on the buddy
+ * cache doesn't happen whild holding the buddy cache
+ * lock
+ */
+int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
+{
+ int i;
+ int block, pnum;
+ int blocks_per_page;
+ int groups_per_page;
+ ext4_group_t first_group;
+ struct ext4_group_info *grp;
+
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ first_group = pnum * blocks_per_page / 2;
+
+ groups_per_page = blocks_per_page >> 1;
+ if (groups_per_page == 0)
+ groups_per_page = 1;
+ /* read all groups the page covers into the cache */
+ for (i = 0; i < groups_per_page; i++) {
+
+ if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+ break;
+ grp = ext4_get_group_info(sb, first_group + i);
+ /* take all groups write allocation
+ * semaphore. This make sure there is
+ * no block allocation going on in any
+ * of that groups
+ */
+ down_write_nested(&grp->alloc_sem, i);
+ }
+ return i;
+}
+
+void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+ ext4_group_t group, int locked_group)
+{
+ int i;
+ int block, pnum;
+ int blocks_per_page;
+ ext4_group_t first_group;
+ struct ext4_group_info *grp;
+
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ first_group = pnum * blocks_per_page / 2;
+ /* release locks on all the groups */
+ for (i = 0; i < locked_group; i++) {
+
+ grp = ext4_get_group_info(sb, first_group + i);
+ /* take all groups write allocation
+ * semaphore. This make sure there is
+ * no block allocation going on in any
+ * of that groups
+ */
+ up_write(&grp->alloc_sem);
+ }
+
+}
+
+static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+ int ret;
+ void *bitmap;
+ int blocks_per_page;
+ int block, pnum, poff;
+ int num_grp_locked = 0;
+ struct ext4_group_info *this_grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ struct page *page = NULL, *bitmap_page = NULL;
+
+ mb_debug("init group %lu\n", group);
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ this_grp = ext4_get_group_info(sb, group);
+ /*
+ * This ensures we don't add group
+ * to this buddy cache via resize
+ */
+ num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
+ if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ /*
+ * somebody initialized the group
+ * return without doing anything
+ */
+ ret = 0;
+ goto err;
+ }
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+ bitmap_page = page;
+ bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+ /* init buddy cache */
+ block++;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page == bitmap_page) {
+ /*
+ * If both the bitmap and buddy are in
+ * the same page we don't need to force
+ * init the buddy
+ */
+ unlock_page(page);
+ } else if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+err:
+ ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+ if (bitmap_page)
+ page_cache_release(bitmap_page);
+ if (page)
+ page_cache_release(page);
+ return ret;
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -1775,7 +2019,7 @@ repeat:
group = 0;
/* quick check to skip empty groups */
- grp = ext4_get_group_info(ac->ac_sb, group);
+ grp = ext4_get_group_info(sb, group);
if (grp->bb_free == 0)
continue;
@@ -1788,10 +2032,9 @@ repeat:
* we need full data about the group
* to make a good selection
*/
- err = ext4_mb_load_buddy(sb, group, &e4b);
+ err = ext4_mb_init_group(sb, group);
if (err)
goto out;
- ext4_mb_release_desc(&e4b);
}
/*
@@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
if (hs->op == EXT4_MB_HISTORY_ALLOC) {
fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
"%-5u %-5s %-5u %-6u\n";
- sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+ sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
hs->result.fe_start, hs->result.fe_len,
hs->result.fe_logical);
- sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+ sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
hs->orig.fe_start, hs->orig.fe_len,
hs->orig.fe_logical);
- sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group,
+ sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
hs->goal.fe_start, hs->goal.fe_len,
hs->goal.fe_logical);
seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
@@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
hs->buddy ? 1 << hs->buddy : 0);
} else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
fmt = "%-5u %-8u %-23s %-23s %-23s\n";
- sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+ sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
hs->result.fe_start, hs->result.fe_len,
hs->result.fe_logical);
- sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+ sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
hs->orig.fe_start, hs->orig.fe_len,
hs->orig.fe_logical);
seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
} else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
- sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+ sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
hs->result.fe_start, hs->result.fe_len);
seq_printf(seq, "%-5u %-8u %-23s discard\n",
hs->pid, hs->ino, buf2);
} else if (hs->op == EXT4_MB_HISTORY_FREE) {
- sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+ sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
hs->result.fe_start, hs->result.fe_len);
seq_printf(seq, "%-5u %-8u %-23s free\n",
hs->pid, hs->ino, buf2);
@@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
return NULL;
group = *pos + 1;
- return (void *) group;
+ return (void *) ((unsigned long) group);
}
static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
if (*pos < 0 || *pos >= sbi->s_groups_count)
return NULL;
group = *pos + 1;
- return (void *) group;;
+ return (void *) ((unsigned long) group);
}
static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
{
struct super_block *sb = seq->private;
- long group = (long) v;
+ ext4_group_t group = (ext4_group_t) ((unsigned long) v);
int i;
int err;
struct ext4_buddy e4b;
@@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
sizeof(struct ext4_group_info);
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
- seq_printf(seq, "#%-5lu: I/O error\n", group);
+ seq_printf(seq, "#%-5u: I/O error\n", group);
return 0;
}
ext4_lock_group(sb, group);
@@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
ext4_unlock_group(sb, group);
ext4_mb_release_desc(&e4b);
- seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
+ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
@@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
ext4_free_blocks_after_init(sb, group, desc);
} else {
meta_group_info[i]->bb_free =
- le16_to_cpu(desc->bg_free_blocks_count);
+ ext4_free_blks_count(sb, desc);
}
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root.rb_node = NULL;;
#ifdef DOUBLE_CHECK
@@ -2327,54 +2571,6 @@ exit_meta_group_info:
} /* ext4_mb_add_groupinfo */
/*
- * Add a group to the existing groups.
- * This function is used for online resize
- */
-int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
- struct ext4_group_desc *desc)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- int blocks_per_page;
- int block;
- int pnum;
- struct page *page;
- int err;
-
- /* Add group based on group descriptor*/
- err = ext4_mb_add_groupinfo(sb, group, desc);
- if (err)
- return err;
-
- /*
- * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
- * datas) are set not up to date so that they will be re-initilaized
- * during the next call to ext4_mb_load_buddy
- */
-
- /* Set buddy page as not up to date */
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- block = group * 2;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
-
- /* Set bitmap page as not up to date */
- block++;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
-
- return 0;
-}
-
-/*
* Update an existing group.
* This function is used for online resize
*/
@@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
printk(KERN_ERR
- "EXT4-fs: can't read descriptor %lu\n", i);
+ "EXT4-fs: can't read descriptor %u\n", i);
goto err_freebuddy;
}
if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
if (sbi->s_mb_offsets == NULL) {
return -ENOMEM;
}
+
+ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
kfree(sbi->s_mb_maxs);
@@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
ext4_mb_init_per_dev_proc(sb);
ext4_mb_history_init(sb);
- sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+ if (sbi->s_journal)
+ sbi->s_journal->j_commit_callback = release_blocks_on_commit;
printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
@@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
list_for_each_safe(l, ltmp, &txn->t_private_list) {
entry = list_entry(l, struct ext4_free_data, list);
- mb_debug("gonna free %u blocks in group %lu (0x%p):",
+ mb_debug("gonna free %u blocks in group %u (0x%p):",
entry->count, entry->group, entry);
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+ entry->start_blk
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
- trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
- (unsigned long long) discard_block, entry->count);
+ trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
+ sb->s_id, (unsigned long long) discard_block,
+ entry->count);
sb_issue_discard(sb, discard_block, entry->count);
kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void)
*/
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
- handle_t *handle, unsigned long reserv_blks)
+ handle_t *handle, unsigned int reserv_blks)
{
struct buffer_head *bitmap_bh = NULL;
struct ext4_super_block *es;
@@ -2824,7 +3024,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!gdp)
goto out_err;
- ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
+ ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
gdp->bg_free_blocks_count);
err = ext4_journal_get_write_access(handle, gdp_bh);
@@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
in_range(block + len - 1, ext4_inode_table(sb, gdp),
EXT4_SB(sb)->s_itb_per_group)) {
ext4_error(sb, __func__,
- "Allocating block in system zone - block = %llu",
- block);
+ "Allocating block %llu in system zone of %d group\n",
+ block, ac->ac_b_ex.fe_group);
/* File system mounted not to panic on error
* Fix the bitmap and repeat the block allocation
* We leak some of the blocks here.
@@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
bitmap_bh->b_data, ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len);
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (!err)
err = -EAGAIN;
goto out_err;
@@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
}
#endif
- mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
- ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
-
spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+ mb_set_bits(NULL, bitmap_bh->b_data,
+ ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- gdp->bg_free_blocks_count =
- cpu_to_le16(ext4_free_blocks_after_init(sb,
- ac->ac_b_ex.fe_group,
- gdp));
+ ext4_free_blks_set(sb, gdp,
+ ext4_free_blocks_after_init(sb,
+ ac->ac_b_ex.fe_group, gdp));
}
- le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
+ len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
+ ext4_free_blks_set(sb, gdp, len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
spin_unlock(sb_bgl_lock(sbi, flex_group));
}
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (err)
goto out_err;
- err = ext4_journal_dirty_metadata(handle, gdp_bh);
+ err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
out_err:
sb->s_dirt = 1;
@@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* check we don't cross already preallocated blocks */
rcu_read_lock();
list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
- unsigned long pa_end;
+ ext4_lblk_t pa_end;
if (pa->pa_deleted)
continue;
@@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* XXX: extra loop to check we really don't overlap preallocations */
rcu_read_lock();
list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
- unsigned long pa_end;
+ ext4_lblk_t pa_end;
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0) {
pa_end = pa->pa_lstart + pa->pa_len;
@@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
}
/*
+ * the function goes through all block freed in the group
+ * but not yet committed and marks them used in in-core bitmap.
+ * buddy must be generated from this bitmap
+ * Need to be called with ext4 group lock (ext4_lock_group)
+ */
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+ ext4_group_t group)
+{
+ struct rb_node *n;
+ struct ext4_group_info *grp;
+ struct ext4_free_data *entry;
+
+ grp = ext4_get_group_info(sb, group);
+ n = rb_first(&(grp->bb_free_root));
+
+ while (n) {
+ entry = rb_entry(n, struct ext4_free_data, node);
+ mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+ bitmap, entry->start_blk,
+ entry->count);
+ n = rb_next(n);
+ }
+ return;
+}
+
+/*
* the function goes through all preallocation in this group and marks them
* used in in-core bitmap. buddy must be generated from this bitmap
* Need to be called with ext4 group lock (ext4_lock_group)
@@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
preallocated += len;
count++;
}
- mb_debug("prellocated %u for group %lu\n", preallocated, group);
+ mb_debug("prellocated %u for group %u\n", preallocated, group);
}
static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
struct super_block *sb, struct ext4_prealloc_space *pa)
{
- unsigned long grp;
+ ext4_group_t grp;
if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
return;
@@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ trace_mark(ext4_mb_new_inode_pa,
+ "dev %s ino %lu pstart %llu len %u lstart %u",
+ sb->s_id, ac->ac_inode->i_ino,
+ pa->pa_pstart, pa->pa_len, pa->pa_lstart);
ext4_mb_use_inode_pa(ac, pa);
atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_linear = 1;
mb_debug("new group pa %p: %llu/%u for %u\n", pa,
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
+ sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
ext4_mb_use_group_pa(ac, pa);
atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
{
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned long end;
- unsigned long next;
+ unsigned int end;
+ unsigned int next;
ext4_group_t group;
ext4_grpblk_t bit;
+ unsigned long long grp_blk_start;
sector_t start;
int err = 0;
int free = 0;
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+ grp_blk_start = pa->pa_pstart - bit;
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
end = bit + pa->pa_len;
@@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
ext4_mb_store_history(ac);
}
+ trace_mark(ext4_mb_release_inode_pa,
+ "dev %s ino %lu block %llu count %u",
+ sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
+ next - bit);
mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
bit = next + 1;
}
@@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
pa, (unsigned long) pa->pa_lstart,
(unsigned long) pa->pa_pstart,
(unsigned long) pa->pa_len);
- ext4_error(sb, __func__, "free %u, pa_free %u\n",
- free, pa->pa_free);
+ ext4_grp_locked_error(sb, group,
+ __func__, "free %u, pa_free %u",
+ free, pa->pa_free);
/*
* pa is already deleted so we use the value obtained
* from the bitmap and continue.
@@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
if (ac)
ac->ac_op = EXT4_MB_HISTORY_DISCARD;
+ trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
+ sb->s_id, pa->pa_pstart, pa->pa_len);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
int busy = 0;
int free = 0;
- mb_debug("discard preallocation for group %lu\n", group);
+ mb_debug("discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
return 0;
@@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
ext4_error(sb, __func__, "Error in reading block "
- "bitmap for %lu\n", group);
+ "bitmap for %u", group);
return 0;
}
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
ext4_error(sb, __func__, "Error in loading buddy "
- "information for %lu\n", group);
+ "information for %u", group);
put_bh(bitmap_bh);
return 0;
}
@@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode)
}
mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+ trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
+ inode->i_ino);
INIT_LIST_HEAD(&list);
@@ -3874,14 +4116,14 @@ repeat:
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
ext4_error(sb, __func__, "Error in loading buddy "
- "information for %lu\n", group);
+ "information for %u", group);
continue;
}
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
ext4_error(sb, __func__, "Error in reading block "
- "bitmap for %lu\n", group);
+ "bitmap for %u", group);
ext4_mb_release_desc(&e4b);
continue;
}
@@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
ext4_group_t group;
- unsigned long len;
- unsigned long goal;
+ unsigned int len;
+ ext4_fsblk_t goal;
ext4_grpblk_t block;
/* we can't allocate > group size */
@@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ac->ac_pa = NULL;
ac->ac_bitmap_page = NULL;
ac->ac_buddy_page = NULL;
+ ac->alloc_semp = NULL;
ac->ac_lg = NULL;
/* we have to define context: we'll we work with a file or
@@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
ext4_error(sb, __func__, "Error in loading buddy "
- "information for %lu\n", group);
+ "information for %u", group);
continue;
}
ext4_lock_group(sb, group);
@@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
}
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
+ if (ac->alloc_semp)
+ up_read(ac->alloc_semp);
if (ac->ac_bitmap_page)
page_cache_release(ac->ac_bitmap_page);
if (ac->ac_buddy_page)
@@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
int ret;
int freed = 0;
+ trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
+ sb->s_id, needed);
for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
ret = ext4_mb_discard_group_preallocations(sb, i, needed);
freed += ret;
@@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block = 0;
- unsigned long inquota;
- unsigned long reserv_blks = 0;
+ unsigned int inquota;
+ unsigned int reserv_blks = 0;
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
+ trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
+ "lblk %llu goal %llu lleft %llu lright %llu "
+ "pleft %llu pright %llu ",
+ sb->s_id, ar->flags, ar->len,
+ ar->inode ? ar->inode->i_ino : 0,
+ (unsigned long long) ar->logical,
+ (unsigned long long) ar->goal,
+ (unsigned long long) ar->lleft,
+ (unsigned long long) ar->lright,
+ (unsigned long long) ar->pleft,
+ (unsigned long long) ar->pright);
+
if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
/*
* With delalloc we already reserved the blocks
@@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
if (ar->len == 0) {
*errp = -EDQUOT;
- return 0;
+ goto out3;
}
inquota = ar->len;
@@ -4348,10 +4607,14 @@ repeat:
ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
ext4_mb_new_preallocation(ac);
}
-
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
if (*errp == -EAGAIN) {
+ /*
+ * drop the reference that we took
+ * in ext4_mb_use_best_found
+ */
+ ext4_mb_release_context(ac);
ac->ac_b_ex.fe_group = 0;
ac->ac_b_ex.fe_start = 0;
ac->ac_b_ex.fe_len = 0;
@@ -4382,6 +4645,26 @@ out2:
out1:
if (ar->len < inquota)
DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
+out3:
+ if (!ar->len) {
+ if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ /* release all the reserved blocks if non delalloc */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ reserv_blks);
+ }
+
+ trace_mark(ext4_allocate_blocks,
+ "dev %s block %llu flags %u len %u ino %lu "
+ "logical %llu goal %llu lleft %llu lright %llu "
+ "pleft %llu pright %llu ",
+ sb->s_id, (unsigned long long) block,
+ ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
+ (unsigned long long) ar->logical,
+ (unsigned long long) ar->goal,
+ (unsigned long long) ar->lleft,
+ (unsigned long long) ar->lright,
+ (unsigned long long) ar->pleft,
+ (unsigned long long) ar->pright);
return block;
}
@@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1,
static noinline_for_stack int
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
- ext4_group_t group, ext4_grpblk_t block, int count)
+ struct ext4_free_data *new_entry)
{
+ ext4_grpblk_t block;
+ struct ext4_free_data *entry;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_free_data *entry, *new_entry;
struct rb_node **n = &db->bb_free_root.rb_node, *node;
struct rb_node *parent = NULL, *new_node;
-
+ BUG_ON(!ext4_handle_valid(handle));
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
- new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
- new_entry->start_blk = block;
- new_entry->group = group;
- new_entry->count = count;
- new_entry->t_tid = handle->h_transaction->t_tid;
new_node = &new_entry->node;
+ block = new_entry->start_blk;
- ext4_lock_group(sb, group);
if (!*n) {
/* first free block exent. We need to
protect buddy cache from being freed,
@@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
else if (block >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
else {
- ext4_unlock_group(sb, group);
- ext4_error(sb, __func__,
- "Double free of blocks %d (%d %d)\n",
- block, entry->start_blk, entry->count);
+ ext4_grp_locked_error(sb, e4b->bd_group, __func__,
+ "Double free of blocks %d (%d %d)",
+ block, entry->start_blk, entry->count);
return 0;
}
}
@@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
spin_lock(&sbi->s_md_lock);
list_add(&new_entry->list, &handle->h_transaction->t_private_list);
spin_unlock(&sbi->s_md_lock);
- ext4_unlock_group(sb, group);
return 0;
}
@@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
struct ext4_allocation_context *ac = NULL;
struct ext4_group_desc *gdp;
struct ext4_super_block *es;
- unsigned long overflow;
+ unsigned int overflow;
ext4_grpblk_t bit;
struct buffer_head *gd_bh;
ext4_group_t block_group;
@@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
}
ext4_debug("freeing block %lu\n", block);
+ trace_mark(ext4_free_blocks,
+ "dev %s block %llu count %lu metadata %d ino %lu",
+ sb->s_id, (unsigned long long) block, count, metadata,
+ inode ? inode->i_ino : 0);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (ac) {
@@ -4581,11 +4862,6 @@ do_more:
err = ext4_journal_get_write_access(handle, gd_bh);
if (err)
goto error_return;
-
- err = ext4_mb_load_buddy(sb, block_group, &e4b);
- if (err)
- goto error_return;
-
#ifdef AGGRESSIVE_CHECK
{
int i;
@@ -4593,13 +4869,6 @@ do_more:
BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
}
#endif
- mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
- bit, count);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
-
if (ac) {
ac->ac_b_ex.fe_group = block_group;
ac->ac_b_ex.fe_start = bit;
@@ -4607,19 +4876,41 @@ do_more:
ext4_mb_store_history(ac);
}
- if (metadata) {
- /* blocks being freed are metadata. these blocks shouldn't
- * be used until this transaction is committed */
- ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_return;
+ if (metadata && ext4_handle_valid(handle)) {
+ struct ext4_free_data *new_entry;
+ /*
+ * blocks being freed are metadata. these blocks shouldn't
+ * be used until this transaction is committed
+ */
+ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry->start_blk = bit;
+ new_entry->group = block_group;
+ new_entry->count = count;
+ new_entry->t_tid = handle->h_transaction->t_tid;
+ ext4_lock_group(sb, block_group);
+ mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+ bit, count);
+ ext4_mb_free_metadata(handle, &e4b, new_entry);
+ ext4_unlock_group(sb, block_group);
} else {
ext4_lock_group(sb, block_group);
+ /* need to update group_info->bb_free and bitmap
+ * with group lock held. generate_buddy look at
+ * them with group lock_held
+ */
+ mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+ bit, count);
mb_free_blocks(inode, &e4b, bit, count);
ext4_mb_return_to_preallocation(inode, &e4b, block, count);
ext4_unlock_group(sb, block_group);
}
spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&gdp->bg_free_blocks_count, count);
+ ret = ext4_free_blks_count(sb, gdp) + count;
+ ext4_free_blks_set(sb, gdp, ret);
gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
@@ -4635,9 +4926,13 @@ do_more:
*freed += count;
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
/* And the group descriptor block */
BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_journal_dirty_metadata(handle, gd_bh);
+ ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
if (!err)
err = ret;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b5dff1fff1e..10a2921baf1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -20,6 +20,7 @@
#include <linux/version.h>
#include <linux/blkdev.h>
#include <linux/marker.h>
+#include <linux/mutex.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "group.h"
@@ -98,9 +99,6 @@
*/
#define MB_DEFAULT_GROUP_PREALLOC 512
-static struct kmem_cache *ext4_pspace_cachep;
-static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
struct ext4_free_data {
/* this links the free block information from group_info */
@@ -120,26 +118,6 @@ struct ext4_free_data {
tid_t t_tid;
};
-struct ext4_group_info {
- unsigned long bb_state;
- struct rb_root bb_free_root;
- unsigned short bb_first_free;
- unsigned short bb_free;
- unsigned short bb_fragments;
- struct list_head bb_prealloc_list;
-#ifdef DOUBLE_CHECK
- void *bb_bitmap;
-#endif
- unsigned short bb_counters[];
-};
-
-#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
-#define EXT4_GROUP_INFO_LOCKED_BIT 1
-
-#define EXT4_MB_GRP_NEED_INIT(grp) \
- (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
-
-
struct ext4_prealloc_space {
struct list_head pa_inode_list;
struct list_head pa_group_list;
@@ -217,6 +195,11 @@ struct ext4_allocation_context {
__u8 ac_op; /* operation, for history only */
struct page *ac_bitmap_page;
struct page *ac_buddy_page;
+ /*
+ * pointer to the held semaphore upon successful
+ * block allocation
+ */
+ struct rw_semaphore *alloc_semp;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
};
@@ -250,6 +233,7 @@ struct ext4_buddy {
struct super_block *bd_sb;
__u16 bd_blkbits;
ext4_group_t bd_group;
+ struct rw_semaphore *alloc_semp;
};
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
@@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
{
return;
}
-#else
-static void ext4_mb_store_history(struct ext4_allocation_context *ac);
#endif
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
-
-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
- ext4_group_t group);
-static void ext4_mb_return_to_preallocation(struct inode *inode,
- struct ext4_buddy *e4b, sector_t block,
- int count);
-static void ext4_mb_put_pa(struct ext4_allocation_context *,
- struct super_block *, struct ext4_prealloc_space *pa);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
-
-
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
-{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline void ext4_unlock_group(struct super_block *sb,
- ext4_group_t group)
-{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline int ext4_is_group_locked(struct super_block *sb,
- ext4_group_t group)
-{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
- &(grinfo->bb_state));
-}
-
-static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
struct ext4_free_extent *fex)
{
ext4_fsblk_t block;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f2a9cf498ec..734abca25e3 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
/*
* Make sure the credit we accumalated is not really high
*/
- if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) {
+ if (needed && ext4_handle_has_enough_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS)) {
retval = ext4_journal_restart(handle, needed);
if (retval)
goto err_out;
@@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
{
int retval = 0, needed;
- if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+ if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
return 0;
/*
* We are freeing a blocks. During this we touch
@@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode)
struct list_blocks_struct lb;
unsigned long max_entries;
- if (!test_opt(inode->i_sb, EXTENTS))
- /*
- * if mounted with noextents we don't allow the migrate
- */
- return -EINVAL;
-
- if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ /*
+ * If the filesystem does not support extents, or the inode
+ * already is extent-based, error out.
+ */
+ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index da98a9012fa..fec0b4c2f5f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle,
#define assert(test) J_ASSERT(test)
#endif
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
-
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
@@ -372,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
goto fail;
}
hinfo->hash_version = root->info.hash_version;
+ if (hinfo->hash_version <= DX_HASH_TEA)
+ hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
if (d_name)
ext4fs_dirhash(d_name->name, d_name->len, hinfo);
@@ -641,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
dir = dir_file->f_path.dentry->d_inode;
if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+ EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
start_hash, start_minor_hash);
@@ -806,7 +807,7 @@ static inline int ext4_match (int len, const char * const name,
static inline int search_dirblock(struct buffer_head *bh,
struct inode *dir,
const struct qstr *d_name,
- unsigned long offset,
+ unsigned int offset,
struct ext4_dir_entry_2 ** res_dir)
{
struct ext4_dir_entry_2 * de;
@@ -1043,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
bh = ext4_find_entry(dir, &dentry->d_name, &de);
inode = NULL;
if (bh) {
- unsigned long ino = le32_to_cpu(de->inode);
+ __u32 ino = le32_to_cpu(de->inode);
brelse(bh);
if (!ext4_valid_inum(dir->i_sb, ino)) {
ext4_error(dir->i_sb, "ext4_lookup",
- "bad inode number: %lu", ino);
+ "bad inode number: %u", ino);
return ERR_PTR(-EIO);
}
inode = ext4_iget(dir->i_sb, ino);
@@ -1060,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
struct dentry *ext4_get_parent(struct dentry *child)
{
- unsigned long ino;
+ __u32 ino;
struct inode *inode;
static const struct qstr dotdot = {
.name = "..",
@@ -1078,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
ext4_error(child->d_inode->i_sb, "ext4_get_parent",
- "bad inode number: %lu", ino);
+ "bad inode number: %u", ino);
return ERR_PTR(-EIO);
}
@@ -1166,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
u32 hash2;
struct dx_map_entry *map;
char *data1 = (*bh)->b_data, *data2;
- unsigned split, move, size, i;
+ unsigned split, move, size;
struct ext4_dir_entry_2 *de = NULL, *de2;
- int err = 0;
+ int err = 0, i;
bh2 = ext4_append (handle, dir, &newblock, &err);
if (!(bh2)) {
@@ -1228,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
de = de2;
}
dx_insert_block(frame, hash2 + continued, newblock);
- err = ext4_journal_dirty_metadata(handle, bh2);
+ err = ext4_handle_dirty_metadata(handle, dir, bh2);
if (err)
goto journal_error;
- err = ext4_journal_dirty_metadata(handle, frame->bh);
+ err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
if (err)
goto journal_error;
brelse(bh2);
@@ -1266,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
struct inode *dir = dentry->d_parent->d_inode;
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- unsigned long offset = 0;
+ unsigned int offset = 0;
unsigned short reclen;
int nlen, rlen, err;
char *top;
@@ -1335,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
ext4_update_dx_flag(dir);
dir->i_version++;
ext4_mark_inode_dirty(handle, dir);
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, dir, bh);
if (err)
ext4_std_error(dir->i_sb, err);
brelse(bh);
@@ -1408,6 +1409,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
/* Initialize as for dx_probe */
hinfo.hash_version = root->info.hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
ext4fs_dirhash(name, namelen, &hinfo);
frame = frames;
@@ -1437,7 +1440,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode)
{
struct inode *dir = dentry->d_parent->d_inode;
- unsigned long offset;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
@@ -1459,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
ext4_mark_inode_dirty(handle, dir);
}
blocks = dir->i_size >> sb->s_blocksize_bits;
- for (block = 0, offset = 0; block < blocks; block++) {
+ for (block = 0; block < blocks; block++) {
bh = ext4_bread(handle, dir, block, 0, &retval);
if(!bh)
return retval;
@@ -1574,7 +1576,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
dxtrace(dx_show_index("node", frames[1].entries));
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
- err = ext4_journal_dirty_metadata(handle, bh2);
+ err = ext4_handle_dirty_metadata(handle, inode, bh2);
if (err)
goto journal_error;
brelse (bh2);
@@ -1600,7 +1602,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (err)
goto journal_error;
}
- ext4_journal_dirty_metadata(handle, frames[0].bh);
+ ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
}
de = do_split(handle, dir, &bh, frame, &hinfo, &err);
if (!de)
@@ -1646,8 +1648,8 @@ static int ext4_delete_entry(handle_t *handle,
else
de->inode = 0;
dir->i_version++;
- BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle, dir, bh);
return 0;
}
i += ext4_rec_len_from_disk(de->rec_len);
@@ -1725,7 +1727,7 @@ retry:
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
inode = ext4_new_inode (handle, dir, mode);
err = PTR_ERR(inode);
@@ -1759,7 +1761,7 @@ retry:
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
inode = ext4_new_inode(handle, dir, mode);
err = PTR_ERR(inode);
@@ -1795,7 +1797,7 @@ retry:
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
err = PTR_ERR(inode);
@@ -1824,8 +1826,8 @@ retry:
strcpy(de->name, "..");
ext4_set_de_type(dir->i_sb, de, S_IFDIR);
inode->i_nlink = 2;
- BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, dir_block);
+ BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle, dir, dir_block);
brelse(dir_block);
ext4_mark_inode_dirty(handle, inode);
err = ext4_add_entry(handle, dentry, inode);
@@ -1854,7 +1856,7 @@ out_stop:
*/
static int empty_dir(struct inode *inode)
{
- unsigned long offset;
+ unsigned int offset;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de, *de1;
struct super_block *sb;
@@ -1899,7 +1901,7 @@ static int empty_dir(struct inode *inode)
if (err)
ext4_error(sb, __func__,
"error %d reading directory"
- " #%lu offset %lu",
+ " #%lu offset %u",
err, inode->i_ino, offset);
offset += sb->s_blocksize;
continue;
@@ -1937,6 +1939,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0, rc;
+ if (!ext4_handle_valid(handle))
+ return 0;
+
lock_super(sb);
if (!list_empty(&EXT4_I(inode)->i_orphan))
goto out_unlock;
@@ -1965,7 +1970,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
/* Insert this inode at the head of the on-disk orphan list... */
NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
err = rc;
@@ -1999,10 +2004,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct list_head *prev;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi;
- unsigned long ino_next;
+ __u32 ino_next;
struct ext4_iloc iloc;
int err = 0;
+ if (!ext4_handle_valid(handle))
+ return 0;
+
lock_super(inode->i_sb);
if (list_empty(&ei->i_orphan)) {
unlock_super(inode->i_sb);
@@ -2021,7 +2029,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
* transaction handle with which to update the orphan list on
* disk, but we still need to remove the inode from the linked
* list in memory. */
- if (!handle)
+ if (sbi->s_journal && !handle)
goto out;
err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2029,19 +2037,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
goto out_err;
if (prev == &sbi->s_orphan) {
- jbd_debug(4, "superblock will point to %lu\n", ino_next);
+ jbd_debug(4, "superblock will point to %u\n", ino_next);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
goto out_brelse;
sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
} else {
struct ext4_iloc iloc2;
struct inode *i_prev =
&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
- jbd_debug(4, "orphan inode %lu will point to %lu\n",
+ jbd_debug(4, "orphan inode %lu will point to %u\n",
i_prev->i_ino, ino_next);
err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
if (err)
@@ -2086,7 +2094,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
goto end_rmdir;
if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
inode = dentry->d_inode;
@@ -2140,7 +2148,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de);
@@ -2197,7 +2205,7 @@ retry:
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
err = PTR_ERR(inode);
@@ -2212,8 +2220,7 @@ retry:
* We have a transaction open. All is sweetness. It also sets
* i_size in generic_commit_write().
*/
- err = __page_symlink(inode, symname, l,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ err = __page_symlink(inode, symname, l, 1);
if (err) {
clear_nlink(inode);
unlock_new_inode(inode);
@@ -2261,7 +2268,7 @@ retry:
return PTR_ERR(handle);
if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
inode->i_ctime = ext4_current_time(inode);
ext4_inc_count(handle, inode);
@@ -2310,7 +2317,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
return PTR_ERR(handle);
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
/*
@@ -2364,8 +2371,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dir->i_ctime = new_dir->i_mtime =
ext4_current_time(new_dir);
ext4_mark_inode_dirty(handle, new_dir);
- BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, new_bh);
+ BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle, new_dir, new_bh);
brelse(new_bh);
new_bh = NULL;
}
@@ -2415,8 +2422,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
BUFFER_TRACE(dir_bh, "get_write_access");
ext4_journal_get_write_access(handle, dir_bh);
PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
- BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, dir_bh);
+ BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
ext4_dec_count(handle, old_dir);
if (new_inode) {
/* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6ec1843a01..c328be5d688 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb,
ext4_get_group_no_and_offset(sb, start, NULL, &offset);
if (group != sbi->s_groups_count)
ext4_warning(sb, __func__,
- "Cannot add at group %u (only %lu groups)",
+ "Cannot add at group %u (only %u groups)",
input->group, sbi->s_groups_count);
else if (offset != 0)
ext4_warning(sb, __func__, "Last group not full");
@@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
{
int err;
- if (handle->h_buffer_credits >= thresh)
+ if (ext4_handle_has_enough_credits(handle, thresh))
return 0;
err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
@@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb,
memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
set_buffer_uptodate(gdb);
unlock_buffer(gdb);
- ext4_journal_dirty_metadata(handle, gdb);
+ ext4_handle_dirty_metadata(handle, NULL, gdb);
ext4_set_bit(bit, bh->b_data);
brelse(gdb);
}
@@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb,
err = PTR_ERR(bh);
goto exit_bh;
}
- ext4_journal_dirty_metadata(handle, gdb);
+ ext4_handle_dirty_metadata(handle, NULL, gdb);
ext4_set_bit(bit, bh->b_data);
brelse(gdb);
}
@@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb,
err = PTR_ERR(it);
goto exit_bh;
}
- ext4_journal_dirty_metadata(handle, it);
+ ext4_handle_dirty_metadata(handle, NULL, it);
brelse(it);
ext4_set_bit(bit, bh->b_data);
}
@@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
if ((err = extend_or_restart_transaction(handle, 2, bh)))
goto exit_bh;
- mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
- bh->b_data);
- ext4_journal_dirty_metadata(handle, bh);
+ mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
+ ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
-
/* Mark unused entries in inode bitmap used */
ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
input->inode_bitmap, input->inode_bitmap - start);
@@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb,
goto exit_journal;
}
- mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+ mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
bh->b_data);
- ext4_journal_dirty_metadata(handle, bh);
+ ext4_handle_dirty_metadata(handle, NULL, bh);
exit_bh:
brelse(bh);
@@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
* reserved inode, and will become GDT blocks (primary and backup).
*/
data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
- ext4_journal_dirty_metadata(handle, dind);
+ ext4_handle_dirty_metadata(handle, NULL, dind);
brelse(dind);
inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
ext4_mark_iloc_dirty(handle, inode, &iloc);
memset((*primary)->b_data, 0, sb->s_blocksize);
- ext4_journal_dirty_metadata(handle, *primary);
+ ext4_handle_dirty_metadata(handle, NULL, *primary);
o_group_desc = EXT4_SB(sb)->s_group_desc;
memcpy(n_group_desc, o_group_desc,
@@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
kfree(o_group_desc);
le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
- ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+ ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
return 0;
@@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
primary[i]->b_blocknr, gdbackups,
blk + primary[i]->b_blocknr); */
data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
- err2 = ext4_journal_dirty_metadata(handle, primary[i]);
+ err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
if (!err)
err = err2;
}
@@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb,
struct buffer_head *bh;
/* Out of journal space, and can't get more - abort - so sad */
- if (handle->h_buffer_credits == 0 &&
+ if (ext4_handle_valid(handle) &&
+ handle->h_buffer_credits == 0 &&
ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
(err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
break;
@@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb,
memset(bh->b_data + size, 0, rest);
set_buffer_uptodate(bh);
unlock_buffer(bh);
- ext4_journal_dirty_metadata(handle, bh);
+ ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
}
if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb,
exit_err:
if (err) {
ext4_warning(sb, __func__,
- "can't update backup for group %lu (err %d), "
+ "can't update backup for group %u (err %d), "
"forcing fsck on next reboot", group, err);
sbi->s_mount_state &= ~EXT4_VALID_FS;
sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
struct inode *inode = NULL;
handle_t *handle;
int gdb_off, gdb_num;
+ int num_grp_locked = 0;
int err, err2;
gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
if (ext4_blocks_count(es) + input->blocks_count <
ext4_blocks_count(es)) {
- ext4_warning(sb, __func__, "blocks_count overflow\n");
+ ext4_warning(sb, __func__, "blocks_count overflow");
return -EINVAL;
}
if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
le32_to_cpu(es->s_inodes_count)) {
- ext4_warning(sb, __func__, "inodes_count overflow\n");
+ ext4_warning(sb, __func__, "inodes_count overflow");
return -EINVAL;
}
@@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
}
}
+
if ((err = verify_group_input(sb, input)))
goto exit_put;
@@ -855,6 +856,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* using the new disk blocks.
*/
+ num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
/* Update group descriptor block for new group */
gdp = (struct ext4_group_desc *)((char *)primary->b_data +
gdb_off * EXT4_DESC_SIZE(sb));
@@ -862,17 +864,20 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
- gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
- gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
+ ext4_free_blks_set(sb, gdp, input->free_blocks_count);
+ ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+ gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
/*
* We can allocate memory for mb_alloc based on the new group
* descriptor
*/
- err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
- if (err)
+ err = ext4_mb_add_groupinfo(sb, input->group, gdp);
+ if (err) {
+ ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
goto exit_journal;
+ }
/*
* Make the new blocks and inodes valid next. We do this before
@@ -914,8 +919,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
/* Update the global fs size fields */
sbi->s_groups_count++;
+ ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
- ext4_journal_dirty_metadata(handle, primary);
+ ext4_handle_dirty_metadata(handle, NULL, primary);
/* Update the reserved block counts only once the new group is
* active. */
@@ -937,7 +943,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
EXT4_INODES_PER_GROUP(sb);
}
- ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+ ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
sb->s_dirt = 1;
exit_journal:
@@ -975,9 +981,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
struct buffer_head *bh;
handle_t *handle;
int err;
- unsigned long freed_blocks;
ext4_group_t group;
- struct ext4_group_info *grp;
/* We don't need to worry about locking wrt other resizers just
* yet: we're going to revalidate es->s_blocks_count after
@@ -997,8 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
" too large to resize to %llu blocks safely\n",
sb->s_id, n_blocks_count);
if (sizeof(sector_t) < 8)
- ext4_warning(sb, __func__,
- "CONFIG_LBD not enabled\n");
+ ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
return -EINVAL;
}
@@ -1071,62 +1074,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
goto exit_put;
}
ext4_blocks_count_set(es, o_blocks_count + add);
- ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+ ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
sb->s_dirt = 1;
unlock_super(sb);
ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
o_blocks_count + add);
- ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
+ /* We add the blocks to the bitmap and set the group need init bit */
+ ext4_add_groupblocks(handle, sb, o_blocks_count, add);
ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
o_blocks_count + add);
if ((err = ext4_journal_stop(handle)))
goto exit_put;
- /*
- * Mark mballoc pages as not up to date so that they will be updated
- * next time they are loaded by ext4_mb_load_buddy.
- *
- * XXX Bad, Bad, BAD!!! We should not be overloading the
- * Uptodate flag, particularly on thte bitmap bh, as way of
- * hinting to ext4_mb_load_buddy() that it needs to be
- * overloaded. A user could take a LVM snapshot, then do an
- * on-line fsck, and clear the uptodate flag, and this would
- * not be a bug in userspace, but a bug in the kernel. FIXME!!!
- */
- {
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- int blocks_per_page;
- int block;
- int pnum;
- struct page *page;
-
- /* Set buddy page as not up to date */
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- block = group * 2;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
-
- /* Set bitmap page as not up to date */
- block++;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
-
- /* Get the info on the last group */
- grp = ext4_get_group_info(sb, group);
-
- /* Update free blocks in group info */
- ext4_mb_update_group_info(grp, add);
- }
-
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04158ad74db..8f7e0be8ab1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -51,8 +51,6 @@ struct proc_dir_entry *ext4_proc_root;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
-static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
- unsigned int);
static void ext4_commit_super(struct super_block *sb,
struct ext4_super_block *es, int sync);
static void ext4_mark_recovery_complete(struct super_block *sb,
@@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
}
+__u32 ext4_free_blks_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_free_blocks_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+}
+
+__u32 ext4_free_inodes_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+}
+
+__u32 ext4_used_dirs_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_used_dirs_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+}
+
+__u32 ext4_itable_unused_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_itable_unused_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+}
+
void ext4_block_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
@@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb,
bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
}
+void ext4_free_blks_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_free_inodes_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_used_dirs_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_itable_unused_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
+}
+
/*
* Wrappers for jbd2_journal_start/end.
*
@@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
* backs (eg. EIO in the commit thread), then we still need to
* take the FS itself readonly cleanly. */
journal = EXT4_SB(sb)->s_journal;
- if (is_journal_aborted(journal)) {
- ext4_abort(sb, __func__,
- "Detected aborted journal");
- return ERR_PTR(-EROFS);
+ if (journal) {
+ if (is_journal_aborted(journal)) {
+ ext4_abort(sb, __func__,
+ "Detected aborted journal");
+ return ERR_PTR(-EROFS);
+ }
+ return jbd2_journal_start(journal, nblocks);
}
-
- return jbd2_journal_start(journal, nblocks);
+ /*
+ * We're not journaling, return the appropriate indication.
+ */
+ current->journal_info = EXT4_NOJOURNAL_HANDLE;
+ return current->journal_info;
}
/*
@@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
int err;
int rc;
+ if (!ext4_handle_valid(handle)) {
+ /*
+ * Do this here since we don't call jbd2_journal_stop() in
+ * no-journal mode.
+ */
+ current->journal_info = NULL;
+ return 0;
+ }
sb = handle->h_transaction->t_journal->j_private;
err = handle->h_err;
rc = jbd2_journal_stop(handle);
@@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
char nbuf[16];
const char *errstr = ext4_decode_error(NULL, err, nbuf);
+ BUG_ON(!ext4_handle_valid(handle));
+
if (bh)
BUFFER_TRACE(bh, "abort");
@@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function,
va_end(args);
}
+void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
+ const char *function, const char *fmt, ...)
+__releases(bitlock)
+__acquires(bitlock)
+{
+ va_list args;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ va_start(args, fmt);
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+ vprintk(fmt, args);
+ printk("\n");
+ va_end(args);
+
+ if (test_opt(sb, ERRORS_CONT)) {
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+ ext4_commit_super(sb, es, 0);
+ return;
+ }
+ ext4_unlock_group(sb, grp);
+ ext4_handle_error(sb);
+ /*
+ * We only get here in the ERRORS_RO case; relocking the group
+ * may be dangerous, but nothing bad will happen since the
+ * filesystem will have already been marked read/only and the
+ * journal has been aborted. We return 1 as a hint to callers
+ * who might what to use the return value from
+ * ext4_grp_locked_error() to distinguish beween the
+ * ERRORS_CONT and ERRORS_RO case, and perhaps return more
+ * aggressively from the ext4 function in question, with a
+ * more appropriate error code.
+ */
+ ext4_lock_group(sb, grp);
+ return;
+}
+
+
void ext4_update_dynamic_rev(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
return bdev;
fail:
- printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
+ printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
__bdevname(dev, b), PTR_ERR(bdev));
return NULL;
}
@@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb)
ext4_mb_release(sb);
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
- err = jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
- if (err < 0)
- ext4_abort(sb, __func__, "Couldn't clean up the journal");
-
+ if (sbi->s_journal) {
+ err = jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ if (err < 0)
+ ext4_abort(sb, __func__,
+ "Couldn't clean up the journal");
+ }
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ /*
+ * Note: We can be called before EXT4_SB(sb)->s_journal is set,
+ * therefore it can be null here. Don't check it, just initialize
+ * jinode.
+ */
jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
@@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode)
}
#endif
ext4_discard_preallocations(inode);
- jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+ if (EXT4_JOURNAL(inode))
+ jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
&EXT4_I(inode)->jinode);
}
@@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
#endif
if (!test_opt(sb, RESERVATION))
seq_puts(seq, ",noreservation");
- if (sbi->s_commit_interval) {
+ if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
seq_printf(seq, ",commit=%u",
(unsigned) (sbi->s_commit_interval / HZ));
}
+ if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
+ seq_printf(seq, ",min_batch_time=%u",
+ (unsigned) sbi->s_min_batch_time);
+ }
+ if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
+ seq_printf(seq, ",max_batch_time=%u",
+ (unsigned) sbi->s_min_batch_time);
+ }
+
/*
* We're changing the default of barrier mount option, so
* let's always display its mount state so it's clear what its
@@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",journal_async_commit");
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
- if (!test_opt(sb, EXTENTS))
- seq_puts(seq, ",noextents");
if (test_opt(sb, I_VERSION))
seq_puts(seq, ",i_version");
if (!test_opt(sb, DELALLOC))
@@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
ext4_nfs_get_inode);
}
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device. Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd2 layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+{
+ journal_t *journal = EXT4_SB(sb)->s_journal;
+
+ WARN_ON(PageChecked(page));
+ if (!page_has_buffers(page))
+ return 0;
+ if (journal)
+ return jbd2_journal_try_to_free_buffers(journal, page,
+ wait & ~__GFP_WAIT);
+ return try_to_free_buffers(page);
+}
+
#ifdef CONFIG_QUOTA
#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -803,7 +953,9 @@ static struct dquot_operations ext4_quota_operations = {
.acquire_dquot = ext4_acquire_dquot,
.release_dquot = ext4_release_dquot,
.mark_dirty = ext4_mark_dquot_dirty,
- .write_info = ext4_write_info
+ .write_info = ext4_write_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
};
static struct quotactl_ops ext4_qctl_operations = {
@@ -836,6 +988,7 @@ static const struct super_operations ext4_sops = {
.quota_read = ext4_quota_read,
.quota_write = ext4_quota_write,
#endif
+ .bdev_try_to_free_page = bdev_try_to_free_page,
};
static const struct export_operations ext4_export_ops = {
@@ -850,16 +1003,17 @@ enum {
Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
- Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+ Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
+ Opt_journal_update, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+ Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc,
- Opt_inode_readahead_blks
+ Opt_inode_readahead_blks, Opt_journal_ioprio
};
static const match_table_t tokens = {
@@ -889,8 +1043,9 @@ static const match_table_t tokens = {
{Opt_nobh, "nobh"},
{Opt_bh, "bh"},
{Opt_commit, "commit=%u"},
+ {Opt_min_batch_time, "min_batch_time=%u"},
+ {Opt_max_batch_time, "max_batch_time=%u"},
{Opt_journal_update, "journal=update"},
- {Opt_journal_inum, "journal=%u"},
{Opt_journal_dev, "journal_dev=%u"},
{Opt_journal_checksum, "journal_checksum"},
{Opt_journal_async_commit, "journal_async_commit"},
@@ -911,14 +1066,13 @@ static const match_table_t tokens = {
{Opt_quota, "quota"},
{Opt_usrquota, "usrquota"},
{Opt_barrier, "barrier=%u"},
- {Opt_extents, "extents"},
- {Opt_noextents, "noextents"},
{Opt_i_version, "i_version"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
{Opt_delalloc, "delalloc"},
{Opt_nodelalloc, "nodelalloc"},
{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
+ {Opt_journal_ioprio, "journal_ioprio=%u"},
{Opt_err, NULL},
};
@@ -943,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data)
return sb_block;
}
+#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
static int parse_options(char *options, struct super_block *sb,
- unsigned int *inum, unsigned long *journal_devnum,
+ unsigned long *journal_devnum,
+ unsigned int *journal_ioprio,
ext4_fsblk_t *n_blocks_count, int is_remount)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -956,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb,
int qtype, qfmt;
char *qname;
#endif
- ext4_fsblk_t last_block;
if (!options)
return 1;
@@ -1068,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
}
set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
break;
- case Opt_journal_inum:
- if (is_remount) {
- printk(KERN_ERR "EXT4-fs: cannot specify "
- "journal on remount\n");
- return 0;
- }
- if (match_int(&args[0], &option))
- return 0;
- *inum = option;
- break;
case Opt_journal_dev:
if (is_remount) {
printk(KERN_ERR "EXT4-fs: cannot specify "
@@ -1107,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb,
option = JBD2_DEFAULT_MAX_COMMIT_AGE;
sbi->s_commit_interval = HZ * option;
break;
+ case Opt_max_batch_time:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option < 0)
+ return 0;
+ if (option == 0)
+ option = EXT4_DEF_MAX_BATCH_TIME;
+ sbi->s_max_batch_time = option;
+ break;
+ case Opt_min_batch_time:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option < 0)
+ return 0;
+ sbi->s_min_batch_time = option;
+ break;
case Opt_data_journal:
data_opt = EXT4_MOUNT_JOURNAL_DATA;
goto datacheck;
@@ -1142,8 +1304,7 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_grpjquota:
qtype = GRPQUOTA;
set_qf_name:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
!sbi->s_qf_names[qtype]) {
printk(KERN_ERR
"EXT4-fs: Cannot change journaled "
@@ -1182,8 +1343,7 @@ set_qf_name:
case Opt_offgrpjquota:
qtype = GRPQUOTA;
clear_qf_name:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
sbi->s_qf_names[qtype]) {
printk(KERN_ERR "EXT4-fs: Cannot change "
"journaled quota options when "
@@ -1202,8 +1362,7 @@ clear_qf_name:
case Opt_jqfmt_vfsv0:
qfmt = QFMT_VFS_V0;
set_qf_format:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
sbi->s_jquota_fmt != qfmt) {
printk(KERN_ERR "EXT4-fs: Cannot change "
"journaled quota options when "
@@ -1222,7 +1381,7 @@ set_qf_format:
set_opt(sbi->s_mount_opt, GRPQUOTA);
break;
case Opt_noquota:
- if (sb_any_quota_enabled(sb)) {
+ if (sb_any_quota_loaded(sb)) {
printk(KERN_ERR "EXT4-fs: Cannot change quota "
"options when quota turned on.\n");
return 0;
@@ -1280,33 +1439,6 @@ set_qf_format:
case Opt_bh:
clear_opt(sbi->s_mount_opt, NOBH);
break;
- case Opt_extents:
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS)) {
- ext4_warning(sb, __func__,
- "extents feature not enabled "
- "on this filesystem, use tune2fs\n");
- return 0;
- }
- set_opt(sbi->s_mount_opt, EXTENTS);
- break;
- case Opt_noextents:
- /*
- * When e2fsprogs support resizing an already existing
- * ext3 file system to greater than 2**32 we need to
- * add support to block allocator to handle growing
- * already existing block mapped inode so that blocks
- * allocated for them fall within 2**32
- */
- last_block = ext4_blocks_count(sbi->s_es) - 1;
- if (last_block > 0xffffffffULL) {
- printk(KERN_ERR "EXT4-fs: Filesystem too "
- "large to mount with "
- "-o noextents options\n");
- return 0;
- }
- clear_opt(sbi->s_mount_opt, EXTENTS);
- break;
case Opt_i_version:
set_opt(sbi->s_mount_opt, I_VERSION);
sb->s_flags |= MS_I_VERSION;
@@ -1331,6 +1463,14 @@ set_qf_format:
return 0;
sbi->s_inode_readahead_blks = option;
break;
+ case Opt_journal_ioprio:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option < 0 || option > 7)
+ break;
+ *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
+ option);
+ break;
default:
printk(KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1406,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
printk(KERN_WARNING
"EXT4-fs warning: checktime reached, "
"running e2fsck is recommended\n");
-#if 0
- /* @@@ We _will_ want to clear the valid bit if we find
- * inconsistencies, to force a fsck at reboot. But for
- * a plain journaled filesystem we can keep it set as
- * valid forever! :)
- */
- es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
-#endif
+ if (!sbi->s_journal)
+ es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
es->s_mtime = cpu_to_le32(get_seconds());
ext4_update_dynamic_rev(sb);
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ if (sbi->s_journal)
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
ext4_commit_super(sb, es, 1);
if (test_opt(sb, DEBUG))
- printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
+ printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
"bpg=%lu, ipg=%lu, mo=%04lx]\n",
sb->s_blocksize,
sbi->s_groups_count,
@@ -1431,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
EXT4_INODES_PER_GROUP(sb),
sbi->s_mount_opt);
- printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
- sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
- "external", EXT4_SB(sb)->s_journal->j_devname);
+ if (EXT4_SB(sb)->s_journal) {
+ printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+ sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+ "external", EXT4_SB(sb)->s_journal->j_devname);
+ } else {
+ printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+ }
return res;
}
@@ -1445,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
ext4_group_t flex_group_count;
ext4_group_t flex_group;
int groups_per_flex = 0;
- __u64 block_bitmap = 0;
int i;
if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1464,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb)
sizeof(struct flex_groups), GFP_KERNEL);
if (sbi->s_flex_groups == NULL) {
printk(KERN_ERR "EXT4-fs: not enough memory for "
- "%lu flex groups\n", flex_group_count);
+ "%u flex groups\n", flex_group_count);
goto failed;
}
- gdp = ext4_get_group_desc(sb, 1, &bh);
- block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
-
for (i = 0; i < sbi->s_groups_count; i++) {
gdp = ext4_get_group_desc(sb, i, &bh);
flex_group = ext4_flex_group(sbi, i);
sbi->s_flex_groups[flex_group].free_inodes +=
- le16_to_cpu(gdp->bg_free_inodes_count);
+ ext4_free_inodes_count(sb, gdp);
sbi->s_flex_groups[flex_group].free_blocks +=
- le16_to_cpu(gdp->bg_free_blocks_count);
+ ext4_free_blks_count(sb, gdp);
}
return 1;
@@ -1552,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb)
block_bitmap = ext4_block_bitmap(sb, gdp);
if (block_bitmap < first_block || block_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
- "Block bitmap for group %lu not in group "
+ "Block bitmap for group %u not in group "
"(block %llu)!\n", i, block_bitmap);
return 0;
}
inode_bitmap = ext4_inode_bitmap(sb, gdp);
if (inode_bitmap < first_block || inode_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
- "Inode bitmap for group %lu not in group "
+ "Inode bitmap for group %u not in group "
"(block %llu)!\n", i, inode_bitmap);
return 0;
}
@@ -1567,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb)
if (inode_table < first_block ||
inode_table + sbi->s_itb_per_group - 1 > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
- "Inode table for group %lu not in group "
+ "Inode table for group %u not in group "
"(block %llu)!\n", i, inode_table);
return 0;
}
spin_lock(sb_bgl_lock(sbi, i));
if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
- "Checksum for group %lu failed (%u!=%u)\n",
+ "Checksum for group %u failed (%u!=%u)\n",
i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
gdp)), le16_to_cpu(gdp->bg_checksum));
if (!(sb->s_flags & MS_RDONLY)) {
@@ -1866,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_fsblk_t sb_block = get_sb_block(&data);
ext4_fsblk_t logical_sb_block;
unsigned long offset = 0;
- unsigned int journal_inum = 0;
unsigned long journal_devnum = 0;
unsigned long def_mount_opts;
struct inode *root;
char *cp;
+ const char *descr;
int ret = -EINVAL;
int blocksize;
- int db_count;
- int i;
+ unsigned int db_count;
+ unsigned int i;
int needs_recovery, has_huge_files;
- __le32 features;
+ int features;
__u64 blocks_count;
int err;
+ unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -1959,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+ sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+ sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+ sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
set_opt(sbi->s_mount_opt, RESERVATION);
set_opt(sbi->s_mount_opt, BARRIER);
/*
- * turn on extents feature by default in ext4 filesystem
- * only if feature flag already set by mkfs or tune2fs.
- * Use -o noextents to turn it off
- */
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
- set_opt(sbi->s_mount_opt, EXTENTS);
- else
- ext4_warning(sb, __func__,
- "extents feature not enabled on this filesystem, "
- "use tune2fs.\n");
-
- /*
* enable delayed allocation by default
* Use -o nodelalloc to turn it off
*/
set_opt(sbi->s_mount_opt, DELALLOC);
- if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
- NULL, 0))
+ if (!parse_options((char *) data, sb, &journal_devnum,
+ &journal_ioprio, NULL, 0))
goto failed_mount;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2005,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
if (features) {
printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ "unsupported optional features (%x).\n", sb->s_id,
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+ ~EXT4_FEATURE_INCOMPAT_SUPP));
goto failed_mount;
}
features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
if (!(sb->s_flags & MS_RDONLY) && features) {
printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ "unsupported optional features (%x).\n", sb->s_id,
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+ ~EXT4_FEATURE_RO_COMPAT_SUPP));
goto failed_mount;
}
has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2118,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
+ i = le32_to_cpu(es->s_flags);
+ if (i & EXT2_FLAGS_UNSIGNED_HASH)
+ sbi->s_hash_unsigned = 3;
+ else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+ es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+ sbi->s_hash_unsigned = 3;
+#else
+ es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+ sb->s_dirt = 1;
+ }
if (sbi->s_blocks_per_group > blocksize * 8) {
printk(KERN_ERR
@@ -2145,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
goto cantfind_ext4;
- /* ensure blocks_count calculation below doesn't sign-extend */
- if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
- le32_to_cpu(es->s_first_data_block) + 1) {
- printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
- "first data block %u, blocks per group %lu\n",
- ext4_blocks_count(es),
- le32_to_cpu(es->s_first_data_block),
- EXT4_BLOCKS_PER_GROUP(sb));
+ /*
+ * It makes no sense for the first data block to be beyond the end
+ * of the filesystem.
+ */
+ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+ printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
+ "block %u is beyond end of filesystem (%llu)\n",
+ le32_to_cpu(es->s_first_data_block),
+ ext4_blocks_count(es));
goto failed_mount;
}
blocks_count = (ext4_blocks_count(es) -
le32_to_cpu(es->s_first_data_block) +
EXT4_BLOCKS_PER_GROUP(sb) - 1);
do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+ if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+ printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
+ "(block count %llu, first data block %u, "
+ "blocks per group %lu)\n", sbi->s_groups_count,
+ ext4_blocks_count(es),
+ le32_to_cpu(es->s_first_data_block),
+ EXT4_BLOCKS_PER_GROUP(sb));
+ goto failed_mount;
+ }
sbi->s_groups_count = blocks_count;
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
@@ -2270,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
ext4_commit_super(sb, es, 1);
- printk(KERN_CRIT
- "EXT4-fs (device %s): mount failed\n",
- sb->s_id);
goto failed_mount4;
}
}
- } else if (journal_inum) {
- if (ext4_create_journal(sb, es, journal_inum))
- goto failed_mount3;
+ } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
+ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+ printk(KERN_ERR "EXT4-fs: required journal recovery "
+ "suppressed and not mounted read-only\n");
+ goto failed_mount4;
} else {
- if (!silent)
- printk(KERN_ERR
- "ext4: No journal on filesystem on %s\n",
- sb->s_id);
- goto failed_mount3;
+ clear_opt(sbi->s_mount_opt, DATA_FLAGS);
+ set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+ sbi->s_journal = NULL;
+ needs_recovery = 0;
+ goto no_journal;
}
if (ext4_blocks_count(es) > 0xffffffffULL &&
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_64BIT)) {
- printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n");
+ printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
goto failed_mount4;
}
@@ -2335,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
default:
break;
}
+ set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+
+no_journal:
if (test_opt(sb, NOBH)) {
if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
@@ -2420,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
- if (needs_recovery)
+ if (needs_recovery) {
printk(KERN_INFO "EXT4-fs: recovery complete.\n");
- ext4_mark_recovery_complete(sb, es);
- printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
- test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
- test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
- "writeback");
+ ext4_mark_recovery_complete(sb, es);
+ }
+ if (EXT4_SB(sb)->s_journal) {
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ descr = " journalled data mode";
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ descr = " ordered data mode";
+ else
+ descr = " writeback data mode";
+ } else
+ descr = "out journal";
+
+ printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
+ sb->s_id, descr);
lock_kernel();
return 0;
@@ -2438,8 +2600,11 @@ cantfind_ext4:
goto failed_mount;
failed_mount4:
- jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
+ if (sbi->s_journal) {
+ jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ }
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -2476,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (sbi->s_commit_interval)
- journal->j_commit_interval = sbi->s_commit_interval;
- /* We could also set up an ext4-specific default for the commit
- * interval here, but for now we'll just fall back to the jbd
- * default. */
+ journal->j_commit_interval = sbi->s_commit_interval;
+ journal->j_min_batch_time = sbi->s_min_batch_time;
+ journal->j_max_batch_time = sbi->s_max_batch_time;
spin_lock(&journal->j_state_lock);
if (test_opt(sb, BARRIER))
@@ -2500,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
struct inode *journal_inode;
journal_t *journal;
+ BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
/* First, test for the existence of a valid inode on disk. Bad
* things happen if we iget() an unused inode, as the subsequent
* iput() will try to delete it. */
@@ -2548,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
struct ext4_super_block *es;
struct block_device *bdev;
+ BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
bdev = ext4_blkdev_get(j_dev);
if (bdev == NULL)
return NULL;
if (bd_claim(bdev, sb)) {
printk(KERN_ERR
- "EXT4: failed to claim external journal device.\n");
+ "EXT4-fs: failed to claim external journal device.\n");
blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
return NULL;
}
@@ -2635,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb,
int err = 0;
int really_read_only;
+ BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
printk(KERN_INFO "EXT4-fs: external journal device major/minor "
@@ -2719,48 +2888,6 @@ static int ext4_load_journal(struct super_block *sb,
return 0;
}
-static int ext4_create_journal(struct super_block *sb,
- struct ext4_super_block *es,
- unsigned int journal_inum)
-{
- journal_t *journal;
- int err;
-
- if (sb->s_flags & MS_RDONLY) {
- printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
- "create journal.\n");
- return -EROFS;
- }
-
- journal = ext4_get_journal(sb, journal_inum);
- if (!journal)
- return -EINVAL;
-
- printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
- journal_inum);
-
- err = jbd2_journal_create(journal);
- if (err) {
- printk(KERN_ERR "EXT4-fs: error creating journal.\n");
- jbd2_journal_destroy(journal);
- return -EIO;
- }
-
- EXT4_SB(sb)->s_journal = journal;
-
- ext4_update_dynamic_rev(sb);
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
- EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
-
- es->s_journal_inum = cpu_to_le32(journal_inum);
- sb->s_dirt = 1;
-
- /* Make sure we flush the recovery flag to disk. */
- ext4_commit_super(sb, es, 1);
-
- return 0;
-}
-
static void ext4_commit_super(struct super_block *sb,
struct ext4_super_block *es, int sync)
{
@@ -2777,20 +2904,23 @@ static void ext4_commit_super(struct super_block *sb,
* be remapped. Nothing we can do but to retry the
* write and hope for the best.
*/
- printk(KERN_ERR "ext4: previous I/O error to "
+ printk(KERN_ERR "EXT4-fs: previous I/O error to "
"superblock detected for %s.\n", sb->s_id);
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
es->s_wtime = cpu_to_le32(get_seconds());
- ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
- es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+ ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+ &EXT4_SB(sb)->s_freeblocks_counter));
+ es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+ &EXT4_SB(sb)->s_freeinodes_counter));
+
BUFFER_TRACE(sbh, "marking dirty");
mark_buffer_dirty(sbh);
if (sync) {
sync_dirty_buffer(sbh);
if (buffer_write_io_error(sbh)) {
- printk(KERN_ERR "ext4: I/O error while writing "
+ printk(KERN_ERR "EXT4-fs: I/O error while writing "
"superblock for %s.\n", sb->s_id);
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
@@ -2809,6 +2939,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
{
journal_t *journal = EXT4_SB(sb)->s_journal;
+ if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+ BUG_ON(journal != NULL);
+ return;
+ }
jbd2_journal_lock_updates(journal);
if (jbd2_journal_flush(journal) < 0)
goto out;
@@ -2838,6 +2972,8 @@ static void ext4_clear_journal_err(struct super_block *sb,
int j_errno;
const char *errstr;
+ BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
journal = EXT4_SB(sb)->s_journal;
/*
@@ -2870,14 +3006,17 @@ static void ext4_clear_journal_err(struct super_block *sb,
int ext4_force_commit(struct super_block *sb)
{
journal_t *journal;
- int ret;
+ int ret = 0;
if (sb->s_flags & MS_RDONLY)
return 0;
journal = EXT4_SB(sb)->s_journal;
- sb->s_dirt = 0;
- ret = ext4_journal_force_commit(journal);
+ if (journal) {
+ sb->s_dirt = 0;
+ ret = ext4_journal_force_commit(journal);
+ }
+
return ret;
}
@@ -2889,9 +3028,13 @@ int ext4_force_commit(struct super_block *sb)
*/
static void ext4_write_super(struct super_block *sb)
{
- if (mutex_trylock(&sb->s_lock) != 0)
- BUG();
- sb->s_dirt = 0;
+ if (EXT4_SB(sb)->s_journal) {
+ if (mutex_trylock(&sb->s_lock) != 0)
+ BUG();
+ sb->s_dirt = 0;
+ } else {
+ ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+ }
}
static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -2900,10 +3043,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
sb->s_dirt = 0;
- if (wait)
- ret = ext4_force_commit(sb);
- else
- jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+ if (EXT4_SB(sb)->s_journal) {
+ if (wait)
+ ret = ext4_force_commit(sb);
+ else
+ jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+ } else {
+ ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
+ }
return ret;
}
@@ -2918,15 +3065,17 @@ static void ext4_write_super_lockfs(struct super_block *sb)
if (!(sb->s_flags & MS_RDONLY)) {
journal_t *journal = EXT4_SB(sb)->s_journal;
- /* Now we set up the journal barrier. */
- jbd2_journal_lock_updates(journal);
+ if (journal) {
+ /* Now we set up the journal barrier. */
+ jbd2_journal_lock_updates(journal);
- /*
- * We don't want to clear needs_recovery flag when we failed
- * to flush the journal.
- */
- if (jbd2_journal_flush(journal) < 0)
- return;
+ /*
+ * We don't want to clear needs_recovery flag when we
+ * failed to flush the journal.
+ */
+ if (jbd2_journal_flush(journal) < 0)
+ return;
+ }
/* Journal blocked and flushed, clear needs_recovery flag. */
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2940,7 +3089,7 @@ static void ext4_write_super_lockfs(struct super_block *sb)
*/
static void ext4_unlockfs(struct super_block *sb)
{
- if (!(sb->s_flags & MS_RDONLY)) {
+ if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
lock_super(sb);
/* Reser the needs_recovery flag before the fs is unlocked. */
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2958,6 +3107,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
unsigned long old_sb_flags;
struct ext4_mount_options old_opts;
ext4_group_t g;
+ unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
int err;
#ifdef CONFIG_QUOTA
int i;
@@ -2969,16 +3119,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
old_opts.s_resuid = sbi->s_resuid;
old_opts.s_resgid = sbi->s_resgid;
old_opts.s_commit_interval = sbi->s_commit_interval;
+ old_opts.s_min_batch_time = sbi->s_min_batch_time;
+ old_opts.s_max_batch_time = sbi->s_max_batch_time;
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
old_opts.s_qf_names[i] = sbi->s_qf_names[i];
#endif
+ if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+ journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
/*
* Allow the "check" option to be passed as a remount option.
*/
- if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
+ if (!parse_options(data, sb, NULL, &journal_ioprio,
+ &n_blocks_count, 1)) {
err = -EINVAL;
goto restore_opts;
}
@@ -2991,7 +3146,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
es = sbi->s_es;
- ext4_init_journal_params(sb, sbi->s_journal);
+ if (sbi->s_journal) {
+ ext4_init_journal_params(sb, sbi->s_journal);
+ set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+ }
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
n_blocks_count > ext4_blocks_count(es)) {
@@ -3020,17 +3178,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
* We have to unlock super so that we can wait for
* transactions.
*/
- unlock_super(sb);
- ext4_mark_recovery_complete(sb, es);
- lock_super(sb);
+ if (sbi->s_journal) {
+ unlock_super(sb);
+ ext4_mark_recovery_complete(sb, es);
+ lock_super(sb);
+ }
} else {
- __le32 ret;
+ int ret;
if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
~EXT4_FEATURE_RO_COMPAT_SUPP))) {
printk(KERN_WARNING "EXT4-fs: %s: couldn't "
"remount RDWR because of unsupported "
- "optional features (%x).\n",
- sb->s_id, le32_to_cpu(ret));
+ "optional features (%x).\n", sb->s_id,
+ (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
+ ~EXT4_FEATURE_RO_COMPAT_SUPP));
err = -EROFS;
goto restore_opts;
}
@@ -3047,7 +3208,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
printk(KERN_ERR
"EXT4-fs: ext4_remount: "
- "Checksum for group %lu failed (%u!=%u)\n",
+ "Checksum for group %u failed (%u!=%u)\n",
g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
le16_to_cpu(gdp->bg_checksum));
err = -EINVAL;
@@ -3076,7 +3237,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
* been changed by e2fsck since we originally mounted
* the partition.)
*/
- ext4_clear_journal_err(sb, es);
+ if (sbi->s_journal)
+ ext4_clear_journal_err(sb, es);
sbi->s_mount_state = le16_to_cpu(es->s_state);
if ((err = ext4_group_extend(sb, es, n_blocks_count)))
goto restore_opts;
@@ -3084,6 +3246,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags &= ~MS_RDONLY;
}
}
+ if (sbi->s_journal == NULL)
+ ext4_commit_super(sb, es, 1);
+
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
@@ -3098,6 +3263,8 @@ restore_opts:
sbi->s_resuid = old_opts.s_resuid;
sbi->s_resgid = old_opts.s_resgid;
sbi->s_commit_interval = old_opts.s_commit_interval;
+ sbi->s_min_batch_time = old_opts.s_min_batch_time;
+ sbi->s_max_batch_time = old_opts.s_max_batch_time;
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
@@ -3360,7 +3527,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
* When we journal data on quota file, we have to flush journal to see
* all updates to the file when we bypass pagecache...
*/
- if (ext4_should_journal_data(path.dentry->d_inode)) {
+ if (EXT4_SB(sb)->s_journal &&
+ ext4_should_journal_data(path.dentry->d_inode)) {
/*
* We don't need to lock updates but journal_flush() could
* otherwise be livelocked...
@@ -3434,7 +3602,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
- if (!handle) {
+ if (EXT4_SB(sb)->s_journal && !handle) {
printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
" cancelled because transaction is not started.\n",
(unsigned long long)off, (unsigned long long)len);
@@ -3459,7 +3627,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
flush_dcache_page(bh->b_page);
unlock_buffer(bh);
if (journal_quota)
- err = ext4_journal_dirty_metadata(handle, bh);
+ err = ext4_handle_dirty_metadata(handle, NULL, bh);
else {
/* Always do at least ordered writes for quotas */
err = ext4_jbd2_file_inode(handle, inode);
@@ -3513,18 +3681,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file)
static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
size_t cnt, loff_t *ppos)
{
- unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
+ unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
char str[32];
- unsigned long value;
if (cnt >= sizeof(str))
return -EINVAL;
if (copy_from_user(str, buf, cnt))
return -EFAULT;
- value = simple_strtol(str, NULL, 0);
- if (value < 0)
- return -ERANGE;
- *p = value;
+
+ *p = simple_strtoul(str, NULL, 0);
return cnt;
}
@@ -3615,7 +3780,7 @@ static void __exit exit_ext4_fs(void)
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+MODULE_DESCRIPTION("Fourth Extended Filesystem");
MODULE_LICENSE("GPL");
module_init(init_ext4_fs)
module_exit(exit_ext4_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 80626d516fe..157ce6589c5 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
sb->s_dirt = 1;
- ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+ ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
}
}
@@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
} else {
le32_add_cpu(&BHDR(bh)->h_refcount, -1);
- error = ext4_journal_dirty_metadata(handle, bh);
+ error = ext4_handle_dirty_metadata(handle, inode, bh);
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
DQUOT_FREE_BLOCK(inode, 1);
ea_bdebug(bh, "refcount now=%d; releasing",
le32_to_cpu(BHDR(bh)->h_refcount));
@@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (error == -EIO)
goto bad_block;
if (!error)
- error = ext4_journal_dirty_metadata(handle,
- bs->bh);
+ error = ext4_handle_dirty_metadata(handle,
+ inode,
+ bs->bh);
if (error)
goto cleanup;
goto inserted;
@@ -794,8 +795,9 @@ inserted:
ea_bdebug(new_bh, "reusing; refcount now=%d",
le32_to_cpu(BHDR(new_bh)->h_refcount));
unlock_buffer(new_bh);
- error = ext4_journal_dirty_metadata(handle,
- new_bh);
+ error = ext4_handle_dirty_metadata(handle,
+ inode,
+ new_bh);
if (error)
goto cleanup_dquot;
}
@@ -810,8 +812,8 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
- goal, &error);
+ ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+ goal, NULL, &error);
if (error)
goto cleanup;
ea_idebug(inode, "creating block %d", block);
@@ -833,7 +835,8 @@ getblk_failed:
set_buffer_uptodate(new_bh);
unlock_buffer(new_bh);
ext4_xattr_cache_insert(new_bh);
- error = ext4_journal_dirty_metadata(handle, new_bh);
+ error = ext4_handle_dirty_metadata(handle,
+ inode, new_bh);
if (error)
goto cleanup;
}
@@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
*/
is.iloc.bh = NULL;
if (IS_SYNC(inode))
- handle->h_sync = 1;
+ ext4_handle_sync(handle);
}
cleanup:
diff --git a/fs/filesystems.c b/fs/filesystems.c
index d0e20ced62d..d488dcd7f2b 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void)
module_init(proc_filesystems_init);
#endif
-struct file_system_type *get_fs_type(const char *name)
+static struct file_system_type *__get_fs_type(const char *name, int len)
{
struct file_system_type *fs;
- const char *dot = strchr(name, '.');
- unsigned len = dot ? dot - name : strlen(name);
read_lock(&file_systems_lock);
fs = *(find_filesystem(name, len));
if (fs && !try_module_get(fs->owner))
fs = NULL;
read_unlock(&file_systems_lock);
- if (!fs && (request_module("%.*s", len, name) == 0)) {
- read_lock(&file_systems_lock);
- fs = *(find_filesystem(name, len));
- if (fs && !try_module_get(fs->owner))
- fs = NULL;
- read_unlock(&file_systems_lock);
- }
+ return fs;
+}
+
+struct file_system_type *get_fs_type(const char *name)
+{
+ struct file_system_type *fs;
+ const char *dot = strchr(name, '.');
+ int len = dot ? dot - name : strlen(name);
+
+ fs = __get_fs_type(name, len);
+ if (!fs && (request_module("%.*s", len, name) == 0))
+ fs = __get_fs_type(name, len);
if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
put_filesystem(fs);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0ff0b8cf30..e5eaa62fd17 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* If we're a pdlfush thread, then implement pdflush collision avoidance
* against the entire list.
*
- * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
- * that it can be located for waiting on in __writeback_single_inode().
- *
* If `bdi' is non-zero then we're being asked to writeback a specific queue.
* This function assumes that the blockdev superblock's inodes are backed by
* a variety of queues, so all inodes are searched. For other superblocks,
@@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
struct writeback_control *wbc)
{
const unsigned long start = jiffies; /* livelock avoidance */
+ int sync = wbc->sync_mode == WB_SYNC_ALL;
spin_lock(&inode_lock);
if (!wbc->for_kupdate || list_empty(&sb->s_io))
@@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb,
__iget(inode);
pages_skipped = wbc->pages_skipped;
__writeback_single_inode(inode, wbc);
- if (wbc->sync_mode == WB_SYNC_HOLD) {
- inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &sb->s_dirty);
- }
if (current_is_pdflush())
writeback_release(bdi);
if (wbc->pages_skipped != pages_skipped) {
@@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb,
if (!list_empty(&sb->s_more_io))
wbc->more_io = 1;
}
- spin_unlock(&inode_lock);
+
+ if (sync) {
+ struct inode *inode, *old_inode = NULL;
+
+ /*
+ * Data integrity sync. Must wait for all pages under writeback,
+ * because there may have been pages dirtied before our sync
+ * call, but which had writeout started before we write it out.
+ * In which case, the inode may not be on the dirty list, but
+ * we still have to wait for that writeout.
+ */
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ struct address_space *mapping;
+
+ if (inode->i_state & (I_FREEING|I_WILL_FREE))
+ continue;
+ mapping = inode->i_mapping;
+ if (mapping->nrpages == 0)
+ continue;
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ /*
+ * We hold a reference to 'inode' so it couldn't have
+ * been removed from s_inodes list while we dropped the
+ * inode_lock. We cannot iput the inode now as we can
+ * be holding the last reference and we cannot iput it
+ * under inode_lock. So we keep the reference and iput
+ * it later.
+ */
+ iput(old_inode);
+ old_inode = inode;
+
+ filemap_fdatawait(mapping);
+
+ cond_resched();
+
+ spin_lock(&inode_lock);
+ }
+ spin_unlock(&inode_lock);
+ iput(old_inode);
+ } else
+ spin_unlock(&inode_lock);
+
return; /* Leave any unwritten inodes on s_io */
}
EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
@@ -588,8 +624,7 @@ restart:
/*
* writeback and wait upon the filesystem's dirty inodes. The caller will
- * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is
- * used to park the written inodes on sb->s_dirty for the wait pass.
+ * do this in two passes - one to write, and one to wait.
*
* A finite limit is set on the number of pages which will be written.
* To prevent infinite livelock of sys_sync().
@@ -600,30 +635,21 @@ restart:
void sync_inodes_sb(struct super_block *sb, int wait)
{
struct writeback_control wbc = {
- .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+ .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
.range_start = 0,
.range_end = LLONG_MAX,
};
- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
- wbc.nr_to_write = nr_dirty + nr_unstable +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
- nr_dirty + nr_unstable;
- wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
- sync_sb_inodes(sb, &wbc);
-}
+ if (!wait) {
+ unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+ unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-/*
- * Rather lame livelock avoidance.
- */
-static void set_sb_syncing(int val)
-{
- struct super_block *sb;
- spin_lock(&sb_lock);
- list_for_each_entry_reverse(sb, &super_blocks, s_list)
- sb->s_syncing = val;
- spin_unlock(&sb_lock);
+ wbc.nr_to_write = nr_dirty + nr_unstable +
+ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+ } else
+ wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
+
+ sync_sb_inodes(sb, &wbc);
}
/**
@@ -652,9 +678,6 @@ static void __sync_inodes(int wait)
spin_lock(&sb_lock);
restart:
list_for_each_entry(sb, &super_blocks, s_list) {
- if (sb->s_syncing)
- continue;
- sb->s_syncing = 1;
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
@@ -672,13 +695,10 @@ restart:
void sync_inodes(int wait)
{
- set_sb_syncing(0);
__sync_inodes(0);
- if (wait) {
- set_sb_syncing(0);
+ if (wait)
__sync_inodes(1);
- }
}
/**
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 4f3cab32141..99c99dfb037 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -1,6 +1,6 @@
/*
FUSE: Filesystem in Userspace
- Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
This program can be distributed under the terms of the GNU GPL.
See the file COPYING.
@@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
size_t size;
if (!*ppos) {
+ long value;
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (!fc)
return 0;
- file->private_data=(void *)(long)atomic_read(&fc->num_waiting);
+ value = atomic_read(&fc->num_waiting);
+ file->private_data = (void *)value;
fuse_conn_put(fc);
}
size = sprintf(tmp, "%ld\n", (long)file->private_data);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fba571648a8..e0c7ada08a1 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1,6 +1,6 @@
/*
FUSE: Filesystem in Userspace
- Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
This program can be distributed under the terms of the GNU GPL.
See the file COPYING.
@@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
* Called with fc->lock, unlocks it
*/
static void request_end(struct fuse_conn *fc, struct fuse_req *req)
- __releases(fc->lock)
+__releases(&fc->lock)
{
void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
req->end = NULL;
@@ -293,13 +293,13 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&req->waitq);
if (end)
end(fc, req);
- else
- fuse_put_request(fc, req);
+ fuse_put_request(fc, req);
}
static void wait_answer_interruptible(struct fuse_conn *fc,
struct fuse_req *req)
- __releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
{
if (signal_pending(current))
return;
@@ -317,7 +317,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
}
static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
- __releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
{
if (!fc->no_interrupt) {
/* Any signal may interrupt this */
@@ -380,7 +381,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
}
}
-void request_send(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
{
req->isreply = 1;
spin_lock(&fc->lock);
@@ -399,8 +400,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
spin_unlock(&fc->lock);
}
-static void request_send_nowait_locked(struct fuse_conn *fc,
- struct fuse_req *req)
+static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
+ struct fuse_req *req)
{
req->background = 1;
fc->num_background++;
@@ -414,11 +415,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc,
flush_bg_queue(fc);
}
-static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
{
spin_lock(&fc->lock);
if (fc->connected) {
- request_send_nowait_locked(fc, req);
+ fuse_request_send_nowait_locked(fc, req);
spin_unlock(&fc->lock);
} else {
req->out.h.error = -ENOTCONN;
@@ -426,16 +427,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
}
}
-void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
{
req->isreply = 0;
- request_send_nowait(fc, req);
+ fuse_request_send_nowait(fc, req);
}
-void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
{
req->isreply = 1;
- request_send_nowait(fc, req);
+ fuse_request_send_nowait(fc, req);
}
/*
@@ -443,10 +444,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
*
* fc->connected must have been checked previously
*/
-void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+ struct fuse_req *req)
{
req->isreply = 1;
- request_send_nowait_locked(fc, req);
+ fuse_request_send_nowait_locked(fc, req);
}
/*
@@ -539,8 +541,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
BUG_ON(!cs->nr_segs);
cs->seglen = cs->iov[0].iov_len;
cs->addr = (unsigned long) cs->iov[0].iov_base;
- cs->iov ++;
- cs->nr_segs --;
+ cs->iov++;
+ cs->nr_segs--;
}
down_read(&current->mm->mmap_sem);
err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
@@ -589,9 +591,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
kunmap_atomic(mapaddr, KM_USER1);
}
while (count) {
- int err;
- if (!cs->len && (err = fuse_copy_fill(cs)))
- return err;
+ if (!cs->len) {
+ int err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ }
if (page) {
void *mapaddr = kmap_atomic(page, KM_USER1);
void *buf = mapaddr + offset;
@@ -631,9 +635,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
{
while (size) {
- int err;
- if (!cs->len && (err = fuse_copy_fill(cs)))
- return err;
+ if (!cs->len) {
+ int err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ }
fuse_copy_do(cs, &val, &size);
}
return 0;
@@ -664,6 +670,8 @@ static int request_pending(struct fuse_conn *fc)
/* Wait until a request is available on the pending list */
static void request_wait(struct fuse_conn *fc)
+__releases(&fc->lock)
+__acquires(&fc->lock)
{
DECLARE_WAITQUEUE(wait, current);
@@ -691,7 +699,7 @@ static void request_wait(struct fuse_conn *fc)
*/
static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
const struct iovec *iov, unsigned long nr_segs)
- __releases(fc->lock)
+__releases(&fc->lock)
{
struct fuse_copy_state cs;
struct fuse_in_header ih;
@@ -813,6 +821,34 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
return err;
}
+static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_poll_wakeup_out outarg;
+ int err;
+
+ if (size != sizeof(outarg))
+ return -EINVAL;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ return err;
+
+ return fuse_notify_poll_wakeup(fc, &outarg);
+}
+
+static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
+ unsigned int size, struct fuse_copy_state *cs)
+{
+ switch (code) {
+ case FUSE_NOTIFY_POLL:
+ return fuse_notify_poll(fc, size, cs);
+
+ default:
+ return -EINVAL;
+ }
+}
+
/* Look up request on processing list by unique ID */
static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
{
@@ -876,9 +912,23 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
err = fuse_copy_one(&cs, &oh, sizeof(oh));
if (err)
goto err_finish;
+
+ err = -EINVAL;
+ if (oh.len != nbytes)
+ goto err_finish;
+
+ /*
+ * Zero oh.unique indicates unsolicited notification message
+ * and error contains notification code.
+ */
+ if (!oh.unique) {
+ err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
+ fuse_copy_finish(&cs);
+ return err ? err : nbytes;
+ }
+
err = -EINVAL;
- if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
- oh.len != nbytes)
+ if (oh.error <= -1000 || oh.error > 0)
goto err_finish;
spin_lock(&fc->lock);
@@ -966,6 +1016,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
* This function releases and reacquires fc->lock
*/
static void end_requests(struct fuse_conn *fc, struct list_head *head)
+__releases(&fc->lock)
+__acquires(&fc->lock)
{
while (!list_empty(head)) {
struct fuse_req *req;
@@ -988,7 +1040,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
* locked).
*/
static void end_io_requests(struct fuse_conn *fc)
- __releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
{
while (!list_empty(&fc->io)) {
struct fuse_req *req =
@@ -1002,11 +1055,11 @@ static void end_io_requests(struct fuse_conn *fc)
wake_up(&req->waitq);
if (end) {
req->end = NULL;
- /* The end function will consume this reference */
__fuse_get_request(req);
spin_unlock(&fc->lock);
wait_event(req->waitq, !req->locked);
end(fc, req);
+ fuse_put_request(fc, req);
spin_lock(&fc->lock);
}
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 95bc22bdd06..fdff346e96f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
/*
FUSE: Filesystem in Userspace
- Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
This program can be distributed under the terms of the GNU GPL.
See the file COPYING.
@@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
parent = dget_parent(entry);
fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
&entry->d_name, &outarg);
- request_send(fc, req);
+ fuse_request_send(fc, req);
dput(parent);
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
return 0;
}
spin_lock(&fc->lock);
- fi->nlookup ++;
+ fi->nlookup++;
spin_unlock(&fc->lock);
}
fuse_put_request(fc, forget_req);
@@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
attr_version = fuse_get_attr_version(fc);
fuse_lookup_init(fc, req, nodeid, name, outarg);
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
/* Zero nodeid is same as -ENOENT, but with valid timeout */
@@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
{
fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
ff->reserved_req->force = 1;
- request_send(fc, ff->reserved_req);
+ fuse_request_send(fc, ff->reserved_req);
fuse_put_request(fc, ff->reserved_req);
kfree(ff);
}
@@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
goto out_put_forget_req;
err = -ENOMEM;
- ff = fuse_file_alloc();
+ ff = fuse_file_alloc(fc);
if (!ff)
goto out_put_request;
@@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
req->out.args[0].value = &outentry;
req->out.args[1].size = sizeof(outopen);
req->out.args[1].value = &outopen;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
if (err) {
if (err == -ENOSYS)
@@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
else
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err)
@@ -631,15 +631,17 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
req->in.numargs = 1;
req->in.args[0].size = entry->d_name.len + 1;
req->in.args[0].value = entry->d_name.name;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err) {
struct inode *inode = entry->d_inode;
- /* Set nlink to zero so the inode can be cleared, if
- the inode does have more links this will be
- discovered at the next lookup/getattr */
+ /*
+ * Set nlink to zero so the inode can be cleared, if the inode
+ * does have more links this will be discovered at the next
+ * lookup/getattr.
+ */
clear_nlink(inode);
fuse_invalidate_attr(inode);
fuse_invalidate_attr(dir);
@@ -662,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
req->in.numargs = 1;
req->in.args[0].size = entry->d_name.len + 1;
req->in.args[0].value = entry->d_name.name;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err) {
@@ -695,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
req->in.args[1].value = oldent->d_name.name;
req->in.args[2].size = newent->d_name.len + 1;
req->in.args[2].value = newent->d_name.name;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err) {
@@ -811,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
else
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err) {
@@ -911,7 +913,7 @@ static int fuse_access(struct inode *inode, int mask)
req->in.numargs = 1;
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS) {
@@ -1033,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
req->num_pages = 1;
req->pages[0] = page;
fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
- request_send(fc, req);
+ fuse_request_send(fc, req);
nbytes = req->out.args[0].size;
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -1067,7 +1069,7 @@ static char *read_link(struct dentry *dentry)
req->out.numargs = 1;
req->out.args[0].size = PAGE_SIZE - 1;
req->out.args[0].value = link;
- request_send(fc, req);
+ fuse_request_send(fc, req);
if (req->out.h.error) {
free_page((unsigned long) link);
link = ERR_PTR(req->out.h.error);
@@ -1273,7 +1275,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
else
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err) {
@@ -1367,7 +1369,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
req->in.args[1].value = name;
req->in.args[2].size = size;
req->in.args[2].value = value;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS) {
@@ -1413,7 +1415,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
}
- request_send(fc, req);
+ fuse_request_send(fc, req);
ret = req->out.h.error;
if (!ret)
ret = size ? req->out.args[0].size : outarg.size;
@@ -1463,7 +1465,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
}
- request_send(fc, req);
+ fuse_request_send(fc, req);
ret = req->out.h.error;
if (!ret)
ret = size ? req->out.args[0].size : outarg.size;
@@ -1496,7 +1498,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
req->in.numargs = 1;
req->in.args[0].size = strlen(name) + 1;
req->in.args[0].value = name;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 34930a964b8..e8162646a9b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1,6 +1,6 @@
/*
FUSE: Filesystem in Userspace
- Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
This program can be distributed under the terms of the GNU GPL.
See the file COPYING.
@@ -39,14 +39,14 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
req->out.numargs = 1;
req->out.args[0].size = sizeof(*outargp);
req->out.args[0].value = outargp;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
return err;
}
-struct fuse_file *fuse_file_alloc(void)
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
{
struct fuse_file *ff;
ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
@@ -58,7 +58,12 @@ struct fuse_file *fuse_file_alloc(void)
} else {
INIT_LIST_HEAD(&ff->write_entry);
atomic_set(&ff->count, 0);
+ spin_lock(&fc->lock);
+ ff->kh = ++fc->khctr;
+ spin_unlock(&fc->lock);
}
+ RB_CLEAR_NODE(&ff->polled_node);
+ init_waitqueue_head(&ff->poll_wait);
}
return ff;
}
@@ -79,7 +84,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
{
dput(req->misc.release.dentry);
mntput(req->misc.release.vfsmount);
- fuse_put_request(fc, req);
}
static void fuse_file_put(struct fuse_file *ff)
@@ -89,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff)
struct inode *inode = req->misc.release.dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
req->end = fuse_release_end;
- request_send_background(fc, req);
+ fuse_request_send_background(fc, req);
kfree(ff);
}
}
@@ -109,6 +113,7 @@ void fuse_finish_open(struct inode *inode, struct file *file,
int fuse_open_common(struct inode *inode, struct file *file, int isdir)
{
+ struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_open_out outarg;
struct fuse_file *ff;
int err;
@@ -121,7 +126,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
if (err)
return err;
- ff = fuse_file_alloc();
+ ff = fuse_file_alloc(fc);
if (!ff)
return -ENOMEM;
@@ -167,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
spin_lock(&fc->lock);
list_del(&ff->write_entry);
+ if (!RB_EMPTY_NODE(&ff->polled_node))
+ rb_erase(&ff->polled_node, &fc->polled_files);
spin_unlock(&fc->lock);
+
+ wake_up_interruptible_sync(&ff->poll_wait);
/*
* Normally this will send the RELEASE request,
* however if some asynchronous READ or WRITE requests
@@ -280,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
req->force = 1;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS) {
@@ -344,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
req->in.numargs = 1;
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS) {
@@ -396,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
inarg->read_flags |= FUSE_READ_LOCKOWNER;
inarg->lock_owner = fuse_lock_owner_id(fc, owner);
}
- request_send(fc, req);
+ fuse_request_send(fc, req);
return req->out.args[0].size;
}
@@ -493,7 +502,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
}
if (req->ff)
fuse_file_put(req->ff);
- fuse_put_request(fc, req);
}
static void fuse_send_readpages(struct fuse_req *req, struct file *file,
@@ -509,10 +517,11 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
struct fuse_file *ff = file->private_data;
req->ff = fuse_file_get(ff);
req->end = fuse_readpages_end;
- request_send_background(fc, req);
+ fuse_request_send_background(fc, req);
} else {
- request_send(fc, req);
+ fuse_request_send(fc, req);
fuse_readpages_end(fc, req);
+ fuse_put_request(fc, req);
}
}
@@ -543,7 +552,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
}
}
req->pages[req->num_pages] = page;
- req->num_pages ++;
+ req->num_pages++;
return 0;
}
@@ -636,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
inarg->lock_owner = fuse_lock_owner_id(fc, owner);
}
- request_send(fc, req);
+ fuse_request_send(fc, req);
return req->misc.write.out.size;
}
@@ -646,7 +655,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
{
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- *pagep = __grab_cache_page(mapping, index);
+ *pagep = grab_cache_page_write_begin(mapping, index, flags);
if (!*pagep)
return -ENOMEM;
return 0;
@@ -779,7 +788,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
break;
err = -ENOMEM;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, 0);
if (!page)
break;
@@ -1042,7 +1051,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
{
__free_page(req->pages[0]);
fuse_file_put(req->ff);
- fuse_put_request(fc, req);
}
static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1060,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
/* Called under fc->lock, may release and reacquire it */
static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
+__releases(&fc->lock)
+__acquires(&fc->lock)
{
struct fuse_inode *fi = get_fuse_inode(req->inode);
loff_t size = i_size_read(req->inode);
@@ -1079,13 +1089,14 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
req->in.args[1].size = inarg->size;
fi->writectr++;
- request_send_background_locked(fc, req);
+ fuse_request_send_background_locked(fc, req);
return;
out_free:
fuse_writepage_finish(fc, req);
spin_unlock(&fc->lock);
fuse_writepage_free(fc, req);
+ fuse_put_request(fc, req);
spin_lock(&fc->lock);
}
@@ -1096,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
* Called with fc->lock
*/
void fuse_flush_writepages(struct inode *inode)
+__releases(&fc->lock)
+__acquires(&fc->lock)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1325,7 +1338,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
req->out.numargs = 1;
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (!err)
@@ -1357,7 +1370,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
return PTR_ERR(req);
fuse_lk_fill(req, file, fl, opcode, pid, flock);
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
/* locking is restartable */
if (err == -EINTR)
@@ -1433,7 +1446,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
req->out.numargs = 1;
req->out.args[0].size = sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
if (err == -ENOSYS)
@@ -1470,6 +1483,406 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
return retval;
}
+static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
+ unsigned int nr_segs, size_t bytes, bool to_user)
+{
+ struct iov_iter ii;
+ int page_idx = 0;
+
+ if (!bytes)
+ return 0;
+
+ iov_iter_init(&ii, iov, nr_segs, bytes, 0);
+
+ while (iov_iter_count(&ii)) {
+ struct page *page = pages[page_idx++];
+ size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
+ void *kaddr, *map;
+
+ kaddr = map = kmap(page);
+
+ while (todo) {
+ char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
+ size_t iov_len = ii.iov->iov_len - ii.iov_offset;
+ size_t copy = min(todo, iov_len);
+ size_t left;
+
+ if (!to_user)
+ left = copy_from_user(kaddr, uaddr, copy);
+ else
+ left = copy_to_user(uaddr, kaddr, copy);
+
+ if (unlikely(left))
+ return -EFAULT;
+
+ iov_iter_advance(&ii, copy);
+ todo -= copy;
+ kaddr += copy;
+ }
+
+ kunmap(map);
+ }
+
+ return 0;
+}
+
+/*
+ * For ioctls, there is no generic way to determine how much memory
+ * needs to be read and/or written. Furthermore, ioctls are allowed
+ * to dereference the passed pointer, so the parameter requires deep
+ * copying but FUSE has no idea whatsoever about what to copy in or
+ * out.
+ *
+ * This is solved by allowing FUSE server to retry ioctl with
+ * necessary in/out iovecs. Let's assume the ioctl implementation
+ * needs to read in the following structure.
+ *
+ * struct a {
+ * char *buf;
+ * size_t buflen;
+ * }
+ *
+ * On the first callout to FUSE server, inarg->in_size and
+ * inarg->out_size will be NULL; then, the server completes the ioctl
+ * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
+ * the actual iov array to
+ *
+ * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
+ *
+ * which tells FUSE to copy in the requested area and retry the ioctl.
+ * On the second round, the server has access to the structure and
+ * from that it can tell what to look for next, so on the invocation,
+ * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
+ *
+ * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
+ * { .iov_base = a.buf, .iov_len = a.buflen } }
+ *
+ * FUSE will copy both struct a and the pointed buffer from the
+ * process doing the ioctl and retry ioctl with both struct a and the
+ * buffer.
+ *
+ * This time, FUSE server has everything it needs and completes ioctl
+ * without FUSE_IOCTL_RETRY which finishes the ioctl call.
+ *
+ * Copying data out works the same way.
+ *
+ * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
+ * automatically initializes in and out iovs by decoding @cmd with
+ * _IOC_* macros and the server is not allowed to request RETRY. This
+ * limits ioctl data transfers to well-formed ioctls and is the forced
+ * behavior for all FUSE servers.
+ */
+static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_ioctl_in inarg = {
+ .fh = ff->fh,
+ .cmd = cmd,
+ .arg = arg,
+ .flags = flags
+ };
+ struct fuse_ioctl_out outarg;
+ struct fuse_req *req = NULL;
+ struct page **pages = NULL;
+ struct page *iov_page = NULL;
+ struct iovec *in_iov = NULL, *out_iov = NULL;
+ unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
+ size_t in_size, out_size, transferred;
+ int err;
+
+ /* assume all the iovs returned by client always fits in a page */
+ BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
+
+ if (!fuse_allow_task(fc, current))
+ return -EACCES;
+
+ err = -EIO;
+ if (is_bad_inode(inode))
+ goto out;
+
+ err = -ENOMEM;
+ pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
+ iov_page = alloc_page(GFP_KERNEL);
+ if (!pages || !iov_page)
+ goto out;
+
+ /*
+ * If restricted, initialize IO parameters as encoded in @cmd.
+ * RETRY from server is not allowed.
+ */
+ if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
+ struct iovec *iov = page_address(iov_page);
+
+ iov->iov_base = (void __user *)arg;
+ iov->iov_len = _IOC_SIZE(cmd);
+
+ if (_IOC_DIR(cmd) & _IOC_WRITE) {
+ in_iov = iov;
+ in_iovs = 1;
+ }
+
+ if (_IOC_DIR(cmd) & _IOC_READ) {
+ out_iov = iov;
+ out_iovs = 1;
+ }
+ }
+
+ retry:
+ inarg.in_size = in_size = iov_length(in_iov, in_iovs);
+ inarg.out_size = out_size = iov_length(out_iov, out_iovs);
+
+ /*
+ * Out data can be used either for actual out data or iovs,
+ * make sure there always is at least one page.
+ */
+ out_size = max_t(size_t, out_size, PAGE_SIZE);
+ max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
+
+ /* make sure there are enough buffer pages and init request with them */
+ err = -ENOMEM;
+ if (max_pages > FUSE_MAX_PAGES_PER_REQ)
+ goto out;
+ while (num_pages < max_pages) {
+ pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (!pages[num_pages])
+ goto out;
+ num_pages++;
+ }
+
+ req = fuse_get_req(fc);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ req = NULL;
+ goto out;
+ }
+ memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
+ req->num_pages = num_pages;
+
+ /* okay, let's send it to the client */
+ req->in.h.opcode = FUSE_IOCTL;
+ req->in.h.nodeid = get_node_id(inode);
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(inarg);
+ req->in.args[0].value = &inarg;
+ if (in_size) {
+ req->in.numargs++;
+ req->in.args[1].size = in_size;
+ req->in.argpages = 1;
+
+ err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
+ false);
+ if (err)
+ goto out;
+ }
+
+ req->out.numargs = 2;
+ req->out.args[0].size = sizeof(outarg);
+ req->out.args[0].value = &outarg;
+ req->out.args[1].size = out_size;
+ req->out.argpages = 1;
+ req->out.argvar = 1;
+
+ fuse_request_send(fc, req);
+ err = req->out.h.error;
+ transferred = req->out.args[1].size;
+ fuse_put_request(fc, req);
+ req = NULL;
+ if (err)
+ goto out;
+
+ /* did it ask for retry? */
+ if (outarg.flags & FUSE_IOCTL_RETRY) {
+ char *vaddr;
+
+ /* no retry if in restricted mode */
+ err = -EIO;
+ if (!(flags & FUSE_IOCTL_UNRESTRICTED))
+ goto out;
+
+ in_iovs = outarg.in_iovs;
+ out_iovs = outarg.out_iovs;
+
+ /*
+ * Make sure things are in boundary, separate checks
+ * are to protect against overflow.
+ */
+ err = -ENOMEM;
+ if (in_iovs > FUSE_IOCTL_MAX_IOV ||
+ out_iovs > FUSE_IOCTL_MAX_IOV ||
+ in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
+ goto out;
+
+ err = -EIO;
+ if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
+ goto out;
+
+ /* okay, copy in iovs and retry */
+ vaddr = kmap_atomic(pages[0], KM_USER0);
+ memcpy(page_address(iov_page), vaddr, transferred);
+ kunmap_atomic(vaddr, KM_USER0);
+
+ in_iov = page_address(iov_page);
+ out_iov = in_iov + in_iovs;
+
+ goto retry;
+ }
+
+ err = -EIO;
+ if (transferred > inarg.out_size)
+ goto out;
+
+ err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
+ out:
+ if (req)
+ fuse_put_request(fc, req);
+ if (iov_page)
+ __free_page(iov_page);
+ while (num_pages)
+ __free_page(pages[--num_pages]);
+ kfree(pages);
+
+ return err ? err : outarg.result;
+}
+
+static long fuse_file_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return fuse_file_do_ioctl(file, cmd, arg, 0);
+}
+
+static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
+}
+
+/*
+ * All files which have been polled are linked to RB tree
+ * fuse_conn->polled_files which is indexed by kh. Walk the tree and
+ * find the matching one.
+ */
+static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
+ struct rb_node **parent_out)
+{
+ struct rb_node **link = &fc->polled_files.rb_node;
+ struct rb_node *last = NULL;
+
+ while (*link) {
+ struct fuse_file *ff;
+
+ last = *link;
+ ff = rb_entry(last, struct fuse_file, polled_node);
+
+ if (kh < ff->kh)
+ link = &last->rb_left;
+ else if (kh > ff->kh)
+ link = &last->rb_right;
+ else
+ return link;
+ }
+
+ if (parent_out)
+ *parent_out = last;
+ return link;
+}
+
+/*
+ * The file is about to be polled. Make sure it's on the polled_files
+ * RB tree. Note that files once added to the polled_files tree are
+ * not removed before the file is released. This is because a file
+ * polled once is likely to be polled again.
+ */
+static void fuse_register_polled_file(struct fuse_conn *fc,
+ struct fuse_file *ff)
+{
+ spin_lock(&fc->lock);
+ if (RB_EMPTY_NODE(&ff->polled_node)) {
+ struct rb_node **link, *parent;
+
+ link = fuse_find_polled_node(fc, ff->kh, &parent);
+ BUG_ON(*link);
+ rb_link_node(&ff->polled_node, parent, link);
+ rb_insert_color(&ff->polled_node, &fc->polled_files);
+ }
+ spin_unlock(&fc->lock);
+}
+
+static unsigned fuse_file_poll(struct file *file, poll_table *wait)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
+ struct fuse_poll_out outarg;
+ struct fuse_req *req;
+ int err;
+
+ if (fc->no_poll)
+ return DEFAULT_POLLMASK;
+
+ poll_wait(file, &ff->poll_wait, wait);
+
+ /*
+ * Ask for notification iff there's someone waiting for it.
+ * The client may ignore the flag and always notify.
+ */
+ if (waitqueue_active(&ff->poll_wait)) {
+ inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
+ fuse_register_polled_file(fc, ff);
+ }
+
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ req->in.h.opcode = FUSE_POLL;
+ req->in.h.nodeid = get_node_id(inode);
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(inarg);
+ req->in.args[0].value = &inarg;
+ req->out.numargs = 1;
+ req->out.args[0].size = sizeof(outarg);
+ req->out.args[0].value = &outarg;
+ fuse_request_send(fc, req);
+ err = req->out.h.error;
+ fuse_put_request(fc, req);
+
+ if (!err)
+ return outarg.revents;
+ if (err == -ENOSYS) {
+ fc->no_poll = 1;
+ return DEFAULT_POLLMASK;
+ }
+ return POLLERR;
+}
+
+/*
+ * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
+ * wakes up the poll waiters.
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+ struct fuse_notify_poll_wakeup_out *outarg)
+{
+ u64 kh = outarg->kh;
+ struct rb_node **link;
+
+ spin_lock(&fc->lock);
+
+ link = fuse_find_polled_node(fc, kh, NULL);
+ if (*link) {
+ struct fuse_file *ff;
+
+ ff = rb_entry(*link, struct fuse_file, polled_node);
+ wake_up_interruptible_sync(&ff->poll_wait);
+ }
+
+ spin_unlock(&fc->lock);
+ return 0;
+}
+
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
.read = do_sync_read,
@@ -1484,6 +1897,9 @@ static const struct file_operations fuse_file_operations = {
.lock = fuse_file_lock,
.flock = fuse_file_flock,
.splice_read = generic_file_splice_read,
+ .unlocked_ioctl = fuse_file_ioctl,
+ .compat_ioctl = fuse_file_compat_ioctl,
+ .poll = fuse_file_poll,
};
static const struct file_operations fuse_direct_io_file_operations = {
@@ -1496,6 +1912,9 @@ static const struct file_operations fuse_direct_io_file_operations = {
.fsync = fuse_fsync,
.lock = fuse_file_lock,
.flock = fuse_file_flock,
+ .unlocked_ioctl = fuse_file_ioctl,
+ .compat_ioctl = fuse_file_compat_ioctl,
+ .poll = fuse_file_poll,
/* no mmap and splice_read */
};
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 35accfdd747..5e64b815a5a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1,6 +1,6 @@
/*
FUSE: Filesystem in Userspace
- Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
This program can be distributed under the terms of the GNU GPL.
See the file COPYING.
@@ -19,6 +19,8 @@
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/poll.h>
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
@@ -100,6 +102,9 @@ struct fuse_file {
/** Request reserved for flush and release */
struct fuse_req *reserved_req;
+ /** Kernel file handle guaranteed to be unique */
+ u64 kh;
+
/** File handle used by userspace */
u64 fh;
@@ -108,6 +113,12 @@ struct fuse_file {
/** Entry on inode's write_files list */
struct list_head write_entry;
+
+ /** RB node to be linked on fuse_conn->polled_files */
+ struct rb_node polled_node;
+
+ /** Wait queue head for poll */
+ wait_queue_head_t poll_wait;
};
/** One input argument of a request */
@@ -322,6 +333,12 @@ struct fuse_conn {
/** The list of requests under I/O */
struct list_head io;
+ /** The next unique kernel file handle */
+ u64 khctr;
+
+ /** rbtree of fuse_files waiting for poll events indexed by ph */
+ struct rb_root polled_files;
+
/** Number of requests currently in the background */
unsigned num_background;
@@ -355,19 +372,19 @@ struct fuse_conn {
/** Connection failed (version mismatch). Cannot race with
setting other bitfields since it is only set once in INIT
reply, before any other request, and never cleared */
- unsigned conn_error : 1;
+ unsigned conn_error:1;
/** Connection successful. Only set in INIT */
- unsigned conn_init : 1;
+ unsigned conn_init:1;
/** Do readpages asynchronously? Only set in INIT */
- unsigned async_read : 1;
+ unsigned async_read:1;
/** Do not send separate SETATTR request before open(O_TRUNC) */
- unsigned atomic_o_trunc : 1;
+ unsigned atomic_o_trunc:1;
/** Filesystem supports NFS exporting. Only set in INIT */
- unsigned export_support : 1;
+ unsigned export_support:1;
/*
* The following bitfields are only for optimization purposes
@@ -375,43 +392,46 @@ struct fuse_conn {
*/
/** Is fsync not implemented by fs? */
- unsigned no_fsync : 1;
+ unsigned no_fsync:1;
/** Is fsyncdir not implemented by fs? */
- unsigned no_fsyncdir : 1;
+ unsigned no_fsyncdir:1;
/** Is flush not implemented by fs? */
- unsigned no_flush : 1;
+ unsigned no_flush:1;
/** Is setxattr not implemented by fs? */
- unsigned no_setxattr : 1;
+ unsigned no_setxattr:1;
/** Is getxattr not implemented by fs? */
- unsigned no_getxattr : 1;
+ unsigned no_getxattr:1;
/** Is listxattr not implemented by fs? */
- unsigned no_listxattr : 1;
+ unsigned no_listxattr:1;
/** Is removexattr not implemented by fs? */
- unsigned no_removexattr : 1;
+ unsigned no_removexattr:1;
/** Are file locking primitives not implemented by fs? */
- unsigned no_lock : 1;
+ unsigned no_lock:1;
/** Is access not implemented by fs? */
- unsigned no_access : 1;
+ unsigned no_access:1;
/** Is create not implemented by fs? */
- unsigned no_create : 1;
+ unsigned no_create:1;
/** Is interrupt not implemented by fs? */
- unsigned no_interrupt : 1;
+ unsigned no_interrupt:1;
/** Is bmap not implemented by fs? */
- unsigned no_bmap : 1;
+ unsigned no_bmap:1;
+
+ /** Is poll not implemented by fs? */
+ unsigned no_poll:1;
/** Do multi-page cached writes */
- unsigned big_writes : 1;
+ unsigned big_writes:1;
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -445,6 +465,9 @@ struct fuse_conn {
/** Version counter for attribute changes */
u64 attr_version;
+
+ /** Called on final put */
+ void (*release)(struct fuse_conn *);
};
static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -499,7 +522,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
*/
int fuse_open_common(struct inode *inode, struct file *file, int isdir);
-struct fuse_file *fuse_file_alloc(void);
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
void fuse_file_free(struct fuse_file *ff);
void fuse_finish_open(struct inode *inode, struct file *file,
struct fuse_file *ff, struct fuse_open_out *outarg);
@@ -519,6 +542,12 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
int isdir);
/**
+ * Notify poll wakeup
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+ struct fuse_notify_poll_wakeup_out *outarg);
+
+/**
* Initialize file operations on a regular file
*/
void fuse_init_file_inode(struct inode *inode);
@@ -593,19 +622,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
/**
* Send a request (synchronous)
*/
-void request_send(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
/**
* Send a request with no reply
*/
-void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
/**
* Send a request in the background
*/
-void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
-void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+ struct fuse_req *req);
/* Abort all requests */
void fuse_abort_conn(struct fuse_conn *fc);
@@ -623,6 +653,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
/**
+ * Initialize fuse_conn
+ */
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
+
+/**
* Release reference to fuse_conn
*/
void fuse_conn_put(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e99f34b443..47c96fdca1a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1,6 +1,6 @@
/*
FUSE: Filesystem in Userspace
- Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
This program can be distributed under the terms of the GNU GPL.
See the file COPYING.
@@ -37,10 +37,10 @@ struct fuse_mount_data {
unsigned rootmode;
unsigned user_id;
unsigned group_id;
- unsigned fd_present : 1;
- unsigned rootmode_present : 1;
- unsigned user_id_present : 1;
- unsigned group_id_present : 1;
+ unsigned fd_present:1;
+ unsigned rootmode_present:1;
+ unsigned user_id_present:1;
+ unsigned group_id_present:1;
unsigned flags;
unsigned max_read;
unsigned blksize;
@@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
req->in.numargs = 1;
req->in.args[0].size = sizeof(struct fuse_forget_in);
req->in.args[0].value = inarg;
- request_send_noreply(fc, req);
+ fuse_request_send_noreply(fc, req);
}
static void fuse_clear_inode(struct inode *inode)
@@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
fi = get_fuse_inode(inode);
spin_lock(&fc->lock);
- fi->nlookup ++;
+ fi->nlookup++;
spin_unlock(&fc->lock);
fuse_change_attributes(inode, attr, attr_valid, attr_version);
@@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)
fc->destroy_req = NULL;
req->in.h.opcode = FUSE_DESTROY;
req->force = 1;
- request_send(fc, req);
+ fuse_request_send(fc, req);
fuse_put_request(fc, req);
}
}
@@ -334,7 +334,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
req->out.args[0].size =
fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
req->out.args[0].value = &outarg;
- request_send(fc, req);
+ fuse_request_send(fc, req);
err = req->out.h.error;
if (!err)
convert_fuse_statfs(buf, &outarg.st);
@@ -462,68 +462,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
return 0;
}
-static struct fuse_conn *new_conn(struct super_block *sb)
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
{
- struct fuse_conn *fc;
int err;
- fc = kzalloc(sizeof(*fc), GFP_KERNEL);
- if (fc) {
- spin_lock_init(&fc->lock);
- mutex_init(&fc->inst_mutex);
- atomic_set(&fc->count, 1);
- init_waitqueue_head(&fc->waitq);
- init_waitqueue_head(&fc->blocked_waitq);
- init_waitqueue_head(&fc->reserved_req_waitq);
- INIT_LIST_HEAD(&fc->pending);
- INIT_LIST_HEAD(&fc->processing);
- INIT_LIST_HEAD(&fc->io);
- INIT_LIST_HEAD(&fc->interrupts);
- INIT_LIST_HEAD(&fc->bg_queue);
- atomic_set(&fc->num_waiting, 0);
- fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
- fc->bdi.unplug_io_fn = default_unplug_io_fn;
- /* fuse does it's own writeback accounting */
- fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
- fc->dev = sb->s_dev;
- err = bdi_init(&fc->bdi);
- if (err)
- goto error_kfree;
- if (sb->s_bdev) {
- err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
- MAJOR(fc->dev), MINOR(fc->dev));
- } else {
- err = bdi_register_dev(&fc->bdi, fc->dev);
- }
- if (err)
- goto error_bdi_destroy;
- /*
- * For a single fuse filesystem use max 1% of dirty +
- * writeback threshold.
- *
- * This gives about 1M of write buffer for memory maps on a
- * machine with 1G and 10% dirty_ratio, which should be more
- * than enough.
- *
- * Privileged users can raise it by writing to
- *
- * /sys/class/bdi/<bdi>/max_ratio
- */
- bdi_set_max_ratio(&fc->bdi, 1);
- fc->reqctr = 0;
- fc->blocked = 1;
- fc->attr_version = 1;
- get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+ memset(fc, 0, sizeof(*fc));
+ spin_lock_init(&fc->lock);
+ mutex_init(&fc->inst_mutex);
+ atomic_set(&fc->count, 1);
+ init_waitqueue_head(&fc->waitq);
+ init_waitqueue_head(&fc->blocked_waitq);
+ init_waitqueue_head(&fc->reserved_req_waitq);
+ INIT_LIST_HEAD(&fc->pending);
+ INIT_LIST_HEAD(&fc->processing);
+ INIT_LIST_HEAD(&fc->io);
+ INIT_LIST_HEAD(&fc->interrupts);
+ INIT_LIST_HEAD(&fc->bg_queue);
+ INIT_LIST_HEAD(&fc->entry);
+ atomic_set(&fc->num_waiting, 0);
+ fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ fc->bdi.unplug_io_fn = default_unplug_io_fn;
+ /* fuse does it's own writeback accounting */
+ fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
+ fc->khctr = 0;
+ fc->polled_files = RB_ROOT;
+ fc->dev = sb->s_dev;
+ err = bdi_init(&fc->bdi);
+ if (err)
+ goto error_mutex_destroy;
+ if (sb->s_bdev) {
+ err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
+ MAJOR(fc->dev), MINOR(fc->dev));
+ } else {
+ err = bdi_register_dev(&fc->bdi, fc->dev);
}
- return fc;
+ if (err)
+ goto error_bdi_destroy;
+ /*
+ * For a single fuse filesystem use max 1% of dirty +
+ * writeback threshold.
+ *
+ * This gives about 1M of write buffer for memory maps on a
+ * machine with 1G and 10% dirty_ratio, which should be more
+ * than enough.
+ *
+ * Privileged users can raise it by writing to
+ *
+ * /sys/class/bdi/<bdi>/max_ratio
+ */
+ bdi_set_max_ratio(&fc->bdi, 1);
+ fc->reqctr = 0;
+ fc->blocked = 1;
+ fc->attr_version = 1;
+ get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
-error_bdi_destroy:
+ return 0;
+
+ error_bdi_destroy:
bdi_destroy(&fc->bdi);
-error_kfree:
+ error_mutex_destroy:
mutex_destroy(&fc->inst_mutex);
- kfree(fc);
- return NULL;
+ return err;
}
+EXPORT_SYMBOL_GPL(fuse_conn_init);
void fuse_conn_put(struct fuse_conn *fc)
{
@@ -532,7 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc)
fuse_request_free(fc->destroy_req);
mutex_destroy(&fc->inst_mutex);
bdi_destroy(&fc->bdi);
- kfree(fc);
+ fc->release(fc);
}
}
@@ -542,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
return fc;
}
-static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
+static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
{
struct fuse_attr attr;
memset(&attr, 0, sizeof(attr));
@@ -553,8 +554,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
return fuse_iget(sb, 1, 0, &attr, 0, 0);
}
-struct fuse_inode_handle
-{
+struct fuse_inode_handle {
u64 nodeid;
u32 generation;
};
@@ -761,7 +761,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->max_write = max_t(unsigned, 4096, fc->max_write);
fc->conn_init = 1;
}
- fuse_put_request(fc, req);
fc->blocked = 0;
wake_up_all(&fc->blocked_waitq);
}
@@ -787,7 +786,12 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
req->out.args[0].size = sizeof(struct fuse_init_out);
req->out.args[0].value = &req->misc.init_out;
req->end = process_init_reply;
- request_send_background(fc, req);
+ fuse_request_send_background(fc, req);
+}
+
+static void fuse_free_conn(struct fuse_conn *fc)
+{
+ kfree(fc);
}
static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -828,10 +832,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (file->f_op != &fuse_dev_operations)
return -EINVAL;
- fc = new_conn(sb);
+ fc = kmalloc(sizeof(*fc), GFP_KERNEL);
if (!fc)
return -ENOMEM;
+ err = fuse_conn_init(fc, sb);
+ if (err) {
+ kfree(fc);
+ return err;
+ }
+
+ fc->release = fuse_free_conn;
fc->flags = d.flags;
fc->user_id = d.user_id;
fc->group_id = d.group_id;
@@ -841,7 +852,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = fc;
err = -ENOMEM;
- root = get_root_inode(sb, d.rootmode);
+ root = fuse_get_root_inode(sb, d.rootmode);
if (!root)
goto err;
@@ -952,7 +963,7 @@ static inline void unregister_fuseblk(void)
static void fuse_inode_init_once(void *foo)
{
- struct inode * inode = foo;
+ struct inode *inode = foo;
inode_init_once(inode);
}
@@ -1031,7 +1042,7 @@ static int __init fuse_init(void)
{
int res;
- printk("fuse init (API version %i.%i)\n",
+ printk(KERN_INFO "fuse init (API version %i.%i)\n",
FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
INIT_LIST_HEAD(&fuse_conn_list);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index ab2f57e3fb8..e563a644981 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
config GFS2_FS
tristate "GFS2 file system support"
- depends on EXPERIMENTAL && (64BIT || (LSF && LBD))
+ depends on EXPERIMENTAL && (64BIT || LBD)
select FS_POSIX_ACL
select CRC32
help
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index ec65851ec80..c1b4ec6a965 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,5 +1,5 @@
obj-$(CONFIG_GFS2_FS) += gfs2.o
-gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
+gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
ops_fstype.o ops_inode.o ops_super.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3e9bd46f27e..e335dceb6a4 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -91,7 +91,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
struct gfs2_ea_location el_this;
int error;
- if (!ip->i_di.di_eattr)
+ if (!ip->i_eattr)
return 0;
memset(&er, 0, sizeof(struct gfs2_ea_request));
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index bec76b1c2bb..11ffc56f1f8 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -75,9 +75,9 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
void *kaddr = kmap(page);
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
- ip->i_di.di_size);
- memset(kaddr + ip->i_di.di_size, 0,
- PAGE_CACHE_SIZE - ip->i_di.di_size);
+ ip->i_disksize);
+ memset(kaddr + ip->i_disksize, 0,
+ PAGE_CACHE_SIZE - ip->i_disksize);
kunmap(page);
SetPageUptodate(page);
@@ -132,7 +132,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
if (error)
goto out;
- if (ip->i_di.di_size) {
+ if (ip->i_disksize) {
/* Get a free block, fill it with the stuffed data,
and write it out to disk */
@@ -159,7 +159,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
di = (struct gfs2_dinode *)dibh->b_data;
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
- if (ip->i_di.di_size) {
+ if (ip->i_disksize) {
*(__be64 *)(di + 1) = cpu_to_be64(block);
gfs2_add_inode_blocks(&ip->i_inode, 1);
di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -926,7 +926,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
}
}
- ip->i_di.di_size = size;
+ ip->i_disksize = size;
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
@@ -1033,7 +1033,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
goto out;
if (gfs2_is_stuffed(ip)) {
- ip->i_di.di_size = size;
+ ip->i_disksize = size;
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
@@ -1045,9 +1045,9 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
if (!error) {
- ip->i_di.di_size = size;
+ ip->i_disksize = size;
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
- ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
+ ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
}
@@ -1114,13 +1114,13 @@ static int trunc_end(struct gfs2_inode *ip)
if (error)
goto out;
- if (!ip->i_di.di_size) {
+ if (!ip->i_disksize) {
ip->i_height = 0;
ip->i_goal = ip->i_no_addr;
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
}
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
- ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
+ ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
@@ -1205,9 +1205,9 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
return -EINVAL;
- if (size > ip->i_di.di_size)
+ if (size > ip->i_disksize)
error = do_grow(ip, size);
- else if (size < ip->i_di.di_size)
+ else if (size < ip->i_disksize)
error = do_shrink(ip, size);
else
/* update time stamps */
@@ -1219,7 +1219,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
int gfs2_truncatei_resume(struct gfs2_inode *ip)
{
int error;
- error = trunc_dealloc(ip, ip->i_di.di_size);
+ error = trunc_dealloc(ip, ip->i_disksize);
if (!error)
error = trunc_end(ip);
return error;
@@ -1231,35 +1231,6 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
}
/**
- * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
- * @ip: the file
- * @len: the number of bytes to be written to the file
- * @data_blocks: returns the number of data blocks required
- * @ind_blocks: returns the number of indirect blocks required
- *
- */
-
-void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
- unsigned int *data_blocks, unsigned int *ind_blocks)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int tmp;
-
- if (gfs2_is_dir(ip)) {
- *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
- *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
- } else {
- *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
- *ind_blocks = 3 * (sdp->sd_max_height - 1);
- }
-
- for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
- tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
- *ind_blocks += tmp;
- }
-}
-
-/**
* gfs2_write_alloc_required - figure out if a write will require an allocation
* @ip: the file being written to
* @offset: the offset to write to
@@ -1276,6 +1247,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
struct buffer_head bh;
unsigned int shift;
u64 lblock, lblock_stop, size;
+ u64 end_of_file;
*alloc_required = 0;
@@ -1291,19 +1263,12 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
*alloc_required = 1;
shift = sdp->sd_sb.sb_bsize_shift;
- if (gfs2_is_dir(ip)) {
- unsigned int bsize = sdp->sd_jbsize;
- lblock = offset;
- do_div(lblock, bsize);
- lblock_stop = offset + len + bsize - 1;
- do_div(lblock_stop, bsize);
- } else {
- u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
- lblock = offset >> shift;
- lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
- if (lblock_stop > end_of_file)
- return 0;
- }
+ BUG_ON(gfs2_is_dir(ip));
+ end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
+ lblock = offset >> shift;
+ lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+ if (lblock_stop > end_of_file)
+ return 0;
size = (lblock_stop - lblock) << shift;
do {
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 4e6cde2943b..c983177e05a 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -10,10 +10,40 @@
#ifndef __BMAP_DOT_H__
#define __BMAP_DOT_H__
+#include "inode.h"
+
struct inode;
struct gfs2_inode;
struct page;
+
+/**
+ * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
+ * @ip: the file
+ * @len: the number of bytes to be written to the file
+ * @data_blocks: returns the number of data blocks required
+ * @ind_blocks: returns the number of indirect blocks required
+ *
+ */
+
+static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
+ unsigned int len,
+ unsigned int *data_blocks,
+ unsigned int *ind_blocks)
+{
+ const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ unsigned int tmp;
+
+ BUG_ON(gfs2_is_dir(ip));
+ *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
+ *ind_blocks = 3 * (sdp->sd_max_height - 1);
+
+ for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
+ tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+ *ind_blocks += tmp;
+ }
+}
+
int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
@@ -21,10 +51,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
int gfs2_truncatei_resume(struct gfs2_inode *ip);
int gfs2_file_dealloc(struct gfs2_inode *ip);
-
-void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
- unsigned int *data_blocks,
- unsigned int *ind_blocks);
int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
unsigned int len, int *alloc_required);
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
deleted file mode 100644
index e51991947d2..00000000000
--- a/fs/gfs2/daemon.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
-#include <linux/freezer.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "daemon.h"
-#include "glock.h"
-#include "log.h"
-#include "quota.h"
-#include "recovery.h"
-#include "super.h"
-#include "util.h"
-
-/* This uses schedule_timeout() instead of msleep() because it's good for
- the daemons to wake up more often than the timeout when unmounting so
- the user's unmount doesn't sit there forever.
-
- The kthread functions used to start these daemons block and flush signals. */
-
-/**
- * gfs2_glockd - Reclaim unused glock structures
- * @sdp: Pointer to GFS2 superblock
- *
- * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
- * Number of daemons can be set by user, with num_glockd mount option.
- */
-
-int gfs2_glockd(void *data)
-{
- struct gfs2_sbd *sdp = data;
-
- while (!kthread_should_stop()) {
- while (atomic_read(&sdp->sd_reclaim_count))
- gfs2_reclaim_glock(sdp);
-
- wait_event_interruptible(sdp->sd_reclaim_wq,
- (atomic_read(&sdp->sd_reclaim_count) ||
- kthread_should_stop()));
- if (freezing(current))
- refrigerator();
- }
-
- return 0;
-}
-
-/**
- * gfs2_recoverd - Recover dead machine's journals
- * @sdp: Pointer to GFS2 superblock
- *
- */
-
-int gfs2_recoverd(void *data)
-{
- struct gfs2_sbd *sdp = data;
- unsigned long t;
-
- while (!kthread_should_stop()) {
- gfs2_check_journals(sdp);
- t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
- if (freezing(current))
- refrigerator();
- schedule_timeout_interruptible(t);
- }
-
- return 0;
-}
-
-/**
- * gfs2_quotad - Write cached quota changes into the quota file
- * @sdp: Pointer to GFS2 superblock
- *
- */
-
-int gfs2_quotad(void *data)
-{
- struct gfs2_sbd *sdp = data;
- unsigned long t;
- int error;
-
- while (!kthread_should_stop()) {
- /* Update the master statfs file */
-
- t = sdp->sd_statfs_sync_time +
- gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
-
- if (time_after_eq(jiffies, t)) {
- error = gfs2_statfs_sync(sdp);
- if (error &&
- error != -EROFS &&
- !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
- fs_err(sdp, "quotad: (1) error=%d\n", error);
- sdp->sd_statfs_sync_time = jiffies;
- }
-
- /* Update quota file */
-
- t = sdp->sd_quota_sync_time +
- gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
-
- if (time_after_eq(jiffies, t)) {
- error = gfs2_quota_sync(sdp);
- if (error &&
- error != -EROFS &&
- !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
- fs_err(sdp, "quotad: (2) error=%d\n", error);
- sdp->sd_quota_sync_time = jiffies;
- }
-
- gfs2_quota_scan(sdp);
-
- t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
- if (freezing(current))
- refrigerator();
- schedule_timeout_interruptible(t);
- }
-
- return 0;
-}
-
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
deleted file mode 100644
index 4be084fb6a6..00000000000
--- a/fs/gfs2/daemon.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __DAEMON_DOT_H__
-#define __DAEMON_DOT_H__
-
-int gfs2_glockd(void *data);
-int gfs2_recoverd(void *data);
-int gfs2_quotad(void *data);
-
-#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index eed040d8ba3..b7c8e5c7079 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -36,7 +36,7 @@
* the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
* beginning of the leaf block. The dirents reside in leaves when
*
- * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
+ * dip->i_diskflags & GFS2_DIF_EXHASH is true
*
* Otherwise, the dirents are "linear", within a single stuffed dinode block.
*
@@ -128,8 +128,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
- if (ip->i_di.di_size < offset + size)
- ip->i_di.di_size = offset + size;
+ if (ip->i_disksize < offset + size)
+ ip->i_disksize = offset + size;
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_dinode_out(ip, dibh->b_data);
@@ -226,8 +226,8 @@ out:
if (error)
return error;
- if (ip->i_di.di_size < offset + copied)
- ip->i_di.di_size = offset + copied;
+ if (ip->i_disksize < offset + copied)
+ ip->i_disksize = offset + copied;
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -277,11 +277,11 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
int copied = 0;
int error = 0;
- if (offset >= ip->i_di.di_size)
+ if (offset >= ip->i_disksize)
return 0;
- if (offset + size > ip->i_di.di_size)
- size = ip->i_di.di_size - offset;
+ if (offset + size > ip->i_disksize)
+ size = ip->i_disksize - offset;
if (!size)
return 0;
@@ -755,12 +755,12 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
struct gfs2_inode *ip = GFS2_I(inode);
int error;
- if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+ if (ip->i_diskflags & GFS2_DIF_EXHASH) {
struct gfs2_leaf *leaf;
unsigned hsize = 1 << ip->i_depth;
unsigned index;
u64 ln;
- if (hsize * sizeof(u64) != ip->i_di.di_size) {
+ if (hsize * sizeof(u64) != ip->i_disksize) {
gfs2_consist_inode(ip);
return ERR_PTR(-EIO);
}
@@ -858,8 +858,8 @@ static int dir_make_exhash(struct inode *inode)
return -ENOSPC;
bn = bh->b_blocknr;
- gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
- leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
+ gfs2_assert(sdp, dip->i_entries < (1 << 16));
+ leaf->lf_entries = cpu_to_be16(dip->i_entries);
/* Copy dirents */
@@ -905,9 +905,9 @@ static int dir_make_exhash(struct inode *inode)
for (x = sdp->sd_hash_ptrs; x--; lp++)
*lp = cpu_to_be64(bn);
- dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
+ dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
gfs2_add_inode_blocks(&dip->i_inode, 1);
- dip->i_di.di_flags |= GFS2_DIF_EXHASH;
+ dip->i_diskflags |= GFS2_DIF_EXHASH;
for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
dip->i_depth = y;
@@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
int error = 0;
hsize = 1 << dip->i_depth;
- if (hsize * sizeof(u64) != dip->i_di.di_size) {
+ if (hsize * sizeof(u64) != dip->i_disksize) {
gfs2_consist_inode(dip);
return -EIO;
}
@@ -1091,7 +1091,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
- for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
+ for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
error = gfs2_dir_read_data(dip, (char *)buf,
block * sdp->sd_hash_bsize,
sdp->sd_hash_bsize, 1);
@@ -1370,7 +1370,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
unsigned depth = 0;
hsize = 1 << dip->i_depth;
- if (hsize * sizeof(u64) != dip->i_di.di_size) {
+ if (hsize * sizeof(u64) != dip->i_disksize) {
gfs2_consist_inode(dip);
return -EIO;
}
@@ -1426,10 +1426,10 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
int copied = 0;
int error;
- if (!dip->i_di.di_entries)
+ if (!dip->i_entries)
return 0;
- if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+ if (dip->i_diskflags & GFS2_DIF_EXHASH)
return dir_e_read(inode, offset, opaque, filldir);
if (!gfs2_is_stuffed(dip)) {
@@ -1453,17 +1453,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
error = PTR_ERR(dent);
goto out;
}
- if (dip->i_di.di_entries != g.offset) {
+ if (dip->i_entries != g.offset) {
fs_warn(sdp, "Number of entries corrupt in dir %llu, "
- "ip->i_di.di_entries (%u) != g.offset (%u)\n",
+ "ip->i_entries (%u) != g.offset (%u)\n",
(unsigned long long)dip->i_no_addr,
- dip->i_di.di_entries,
+ dip->i_entries,
g.offset);
error = -EIO;
goto out;
}
error = do_filldir_main(dip, offset, opaque, filldir, darr,
- dip->i_di.di_entries, &copied);
+ dip->i_entries, &copied);
out:
kfree(darr);
}
@@ -1612,7 +1612,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
dent = gfs2_init_dirent(inode, dent, name, bh);
gfs2_inum_out(nip, dent);
dent->de_type = cpu_to_be16(type);
- if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+ if (ip->i_diskflags & GFS2_DIF_EXHASH) {
leaf = (struct gfs2_leaf *)bh->b_data;
be16_add_cpu(&leaf->lf_entries, 1);
}
@@ -1621,14 +1621,14 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
if (error)
break;
gfs2_trans_add_bh(ip->i_gl, bh, 1);
- ip->i_di.di_entries++;
+ ip->i_entries++;
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
error = 0;
break;
}
- if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+ if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) {
error = dir_make_exhash(inode);
if (error)
break;
@@ -1691,7 +1691,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
}
dirent_del(dip, bh, prev, dent);
- if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+ if (dip->i_diskflags & GFS2_DIF_EXHASH) {
struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
u16 entries = be16_to_cpu(leaf->lf_entries);
if (!entries)
@@ -1704,10 +1704,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
if (error)
return error;
- if (!dip->i_di.di_entries)
+ if (!dip->i_entries)
gfs2_consist_inode(dip);
gfs2_trans_add_bh(dip->i_gl, bh, 1);
- dip->i_di.di_entries--;
+ dip->i_entries--;
dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
gfs2_dinode_out(dip, bh->b_data);
brelse(bh);
@@ -1748,7 +1748,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
gfs2_inum_out(nip, dent);
dent->de_type = cpu_to_be16(new_type);
- if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+ if (dip->i_diskflags & GFS2_DIF_EXHASH) {
brelse(bh);
error = gfs2_meta_inode_buffer(dip, &bh);
if (error)
@@ -1784,7 +1784,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
int error = 0;
hsize = 1 << dip->i_depth;
- if (hsize * sizeof(u64) != dip->i_di.di_size) {
+ if (hsize * sizeof(u64) != dip->i_disksize) {
gfs2_consist_inode(dip);
return -EIO;
}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 8a468cac932..4f919440c3b 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -11,6 +11,7 @@
#define __DIR_DOT_H__
#include <linux/dcache.h>
+#include <linux/crc32.h>
struct inode;
struct gfs2_inode;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index e3f76f451b0..0d1c76d906a 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -114,11 +114,11 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
__be64 *eablk, *end;
int error;
- error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
if (error)
return error;
- if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
+ if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
error = ea_foreach_i(ip, bh, ea_call, data);
goto out;
}
@@ -414,7 +414,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
if (error)
return error;
- if (ip->i_di.di_eattr) {
+ if (ip->i_eattr) {
struct ea_list ei = { .ei_er = er, .ei_size = 0 };
error = ea_foreach(ip, ea_list_i, &ei);
@@ -514,7 +514,7 @@ int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
struct gfs2_ea_location el;
int error;
- if (!ip->i_di.di_eattr)
+ if (!ip->i_eattr)
return -ENODATA;
error = gfs2_ea_find(ip, er, &el);
@@ -741,7 +741,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (error)
return error;
- ip->i_di.di_eattr = bh->b_blocknr;
+ ip->i_eattr = bh->b_blocknr;
error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
brelse(bh);
@@ -935,10 +935,10 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
int error;
int mh_size = sizeof(struct gfs2_meta_header);
- if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+ if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
__be64 *end;
- error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
&indbh);
if (error)
return error;
@@ -972,9 +972,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
gfs2_buffer_clear_tail(indbh, mh_size);
eablk = (__be64 *)(indbh->b_data + mh_size);
- *eablk = cpu_to_be64(ip->i_di.di_eattr);
- ip->i_di.di_eattr = blk;
- ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
+ *eablk = cpu_to_be64(ip->i_eattr);
+ ip->i_eattr = blk;
+ ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
gfs2_add_inode_blocks(&ip->i_inode, 1);
eablk++;
@@ -1015,7 +1015,7 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (error)
return error;
- if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
+ if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
blks++;
if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
@@ -1040,7 +1040,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
struct gfs2_ea_location el;
int error;
- if (!ip->i_di.di_eattr) {
+ if (!ip->i_eattr) {
if (er->er_flags & XATTR_REPLACE)
return -ENODATA;
return ea_init(ip, er);
@@ -1051,7 +1051,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
return error;
if (el.el_ea) {
- if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
+ if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
brelse(el.el_bh);
return -EPERM;
}
@@ -1145,7 +1145,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
struct gfs2_ea_location el;
int error;
- if (!ip->i_di.di_eattr)
+ if (!ip->i_eattr)
return -ENODATA;
error = gfs2_ea_find(ip, er, &el);
@@ -1309,7 +1309,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
- error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh);
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
if (error)
return error;
@@ -1388,7 +1388,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
if (bstart)
gfs2_free_meta(ip, bstart, blen);
- ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
+ ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
@@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
struct buffer_head *dibh;
int error;
- rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
if (!rgd) {
gfs2_consist_inode(ip);
return -EIO;
@@ -1432,9 +1432,9 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
if (error)
goto out_gunlock;
- gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
+ gfs2_free_meta(ip, ip->i_eattr, 1);
- ip->i_di.di_eattr = 0;
+ ip->i_eattr = 0;
gfs2_add_inode_blocks(&ip->i_inode, -1);
error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -1479,7 +1479,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
if (error)
goto out_rindex;
- if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+ if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
error = ea_dealloc_indirect(ip);
if (error)
goto out_rindex;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c962283d4e7..6b983aef785 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -40,6 +40,7 @@
#include "quota.h"
#include "super.h"
#include "util.h"
+#include "bmap.h"
struct gfs2_gl_hash_bucket {
struct hlist_head hb_list;
@@ -61,9 +62,10 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
static DECLARE_RWSEM(gfs2_umount_flush_sem);
static struct dentry *gfs2_root;
-static struct task_struct *scand_process;
-static unsigned int scand_secs = 5;
static struct workqueue_struct *glock_workqueue;
+static LIST_HEAD(lru_list);
+static atomic_t lru_count = ATOMIC_INIT(0);
+static DEFINE_SPINLOCK(lru_lock);
#define GFS2_GL_HASH_SHIFT 15
#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
@@ -174,6 +176,22 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
}
/**
+ * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
+ * @gl: the glock
+ *
+ */
+
+static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+{
+ spin_lock(&lru_lock);
+ if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
+ list_add_tail(&gl->gl_lru, &lru_list);
+ atomic_inc(&lru_count);
+ }
+ spin_unlock(&lru_lock);
+}
+
+/**
* gfs2_glock_put() - Decrement reference count on glock
* @gl: The glock to put
*
@@ -187,14 +205,23 @@ int gfs2_glock_put(struct gfs2_glock *gl)
if (atomic_dec_and_test(&gl->gl_ref)) {
hlist_del(&gl->gl_list);
write_unlock(gl_lock_addr(gl->gl_hash));
+ spin_lock(&lru_lock);
+ if (!list_empty(&gl->gl_lru)) {
+ list_del_init(&gl->gl_lru);
+ atomic_dec(&lru_count);
+ }
+ spin_unlock(&lru_lock);
GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
- GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
+ GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
glock_free(gl);
rv = 1;
goto out;
}
write_unlock(gl_lock_addr(gl->gl_hash));
+ /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
+ if (atomic_read(&gl->gl_ref) == 2)
+ gfs2_glock_schedule_for_reclaim(gl);
out:
return rv;
}
@@ -289,10 +316,13 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
*
- * Returns: true if there is a blocked holder at the head of the list
+ * Returns: 1 if there is a blocked holder at the head of the list, or 2
+ * if a type specific operation is underway.
*/
static int do_promote(struct gfs2_glock *gl)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_holder *gh, *tmp;
@@ -310,6 +340,8 @@ restart:
ret = glops->go_lock(gh);
spin_lock(&gl->gl_spin);
if (ret) {
+ if (ret == 1)
+ return 2;
gh->gh_error = ret;
list_del_init(&gh->gh_list);
gfs2_holder_wake(gh);
@@ -414,6 +446,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_holder *gh;
unsigned state = ret & LM_OUT_ST_MASK;
+ int rv;
spin_lock(&gl->gl_spin);
state_change(gl, state);
@@ -468,7 +501,6 @@ retry:
gfs2_demote_wake(gl);
if (state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
- int rv;
spin_unlock(&gl->gl_spin);
rv = glops->go_xmote_bh(gl, gh);
if (rv == -EAGAIN)
@@ -479,10 +511,13 @@ retry:
goto out;
}
}
- do_promote(gl);
+ rv = do_promote(gl);
+ if (rv == 2)
+ goto out_locked;
}
out:
clear_bit(GLF_LOCK, &gl->gl_flags);
+out_locked:
spin_unlock(&gl->gl_spin);
gfs2_glock_put(gl);
}
@@ -511,6 +546,8 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
*/
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -576,8 +613,11 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
*/
static void run_queue(struct gfs2_glock *gl, const int nonblock)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
{
struct gfs2_holder *gh = NULL;
+ int ret;
if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
return;
@@ -596,8 +636,11 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock)
} else {
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
gfs2_demote_wake(gl);
- if (do_promote(gl) == 0)
+ ret = do_promote(gl);
+ if (ret == 0)
goto out;
+ if (ret == 2)
+ return;
gh = find_first_waiter(gl);
gl->gl_target = gh->gh_state;
if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
@@ -820,7 +863,7 @@ static void wait_on_demote(struct gfs2_glock *gl)
*/
static void handle_callback(struct gfs2_glock *gl, unsigned int state,
- int remote, unsigned long delay)
+ unsigned long delay)
{
int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
@@ -828,9 +871,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
gl->gl_demote_state = state;
gl->gl_demote_time = jiffies;
- if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
- gl->gl_object)
- gfs2_glock_schedule_for_reclaim(gl);
} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != state) {
gl->gl_demote_state = LM_ST_UNLOCKED;
@@ -877,6 +917,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
*/
static inline void add_to_queue(struct gfs2_holder *gh)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -998,7 +1040,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
spin_lock(&gl->gl_spin);
if (gh->gh_flags & GL_NOCACHE)
- handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+ handle_callback(gl, LM_ST_UNLOCKED, 0);
list_del_init(&gh->gh_list);
if (find_first_holder(gl) == NULL) {
@@ -1269,12 +1311,26 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
delay = gl->gl_ops->go_min_hold_time;
spin_lock(&gl->gl_spin);
- handle_callback(gl, state, 1, delay);
+ handle_callback(gl, state, delay);
spin_unlock(&gl->gl_spin);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
gfs2_glock_put(gl);
}
+static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+{
+ struct gfs2_jdesc *jd;
+
+ spin_lock(&sdp->sd_jindex_spin);
+ list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+ if (jd->jd_jid != jid)
+ continue;
+ jd->jd_dirty = 1;
+ break;
+ }
+ spin_unlock(&sdp->sd_jindex_spin);
+}
+
/**
* gfs2_glock_cb - Callback used by locking module
* @sdp: Pointer to the superblock
@@ -1338,80 +1394,83 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
* Returns: 1 if it's ok
*/
-static int demote_ok(struct gfs2_glock *gl)
+static int demote_ok(const struct gfs2_glock *gl)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
- int demote = 1;
-
- if (test_bit(GLF_STICKY, &gl->gl_flags))
- demote = 0;
- else if (glops->go_demote_ok)
- demote = glops->go_demote_ok(gl);
-
- return demote;
-}
-
-/**
- * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
- * @gl: the glock
- *
- */
-
-void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- spin_lock(&sdp->sd_reclaim_lock);
- if (list_empty(&gl->gl_reclaim)) {
- gfs2_glock_hold(gl);
- list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
- atomic_inc(&sdp->sd_reclaim_count);
- spin_unlock(&sdp->sd_reclaim_lock);
- wake_up(&sdp->sd_reclaim_wq);
- } else
- spin_unlock(&sdp->sd_reclaim_lock);
+ if (gl->gl_state == LM_ST_UNLOCKED)
+ return 0;
+ if (!list_empty(&gl->gl_holders))
+ return 0;
+ if (glops->go_demote_ok)
+ return glops->go_demote_ok(gl);
+ return 1;
}
-/**
- * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
- * @sdp: the filesystem
- *
- * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
- * different glock and we notice that there are a lot of glocks in the
- * reclaim list.
- *
- */
-void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
+static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
{
struct gfs2_glock *gl;
- int done_callback = 0;
+ int may_demote;
+ int nr_skipped = 0;
+ int got_ref = 0;
+ LIST_HEAD(skipped);
- spin_lock(&sdp->sd_reclaim_lock);
- if (list_empty(&sdp->sd_reclaim_list)) {
- spin_unlock(&sdp->sd_reclaim_lock);
- return;
- }
- gl = list_entry(sdp->sd_reclaim_list.next,
- struct gfs2_glock, gl_reclaim);
- list_del_init(&gl->gl_reclaim);
- spin_unlock(&sdp->sd_reclaim_lock);
+ if (nr == 0)
+ goto out;
- atomic_dec(&sdp->sd_reclaim_count);
- atomic_inc(&sdp->sd_reclaimed);
+ if (!(gfp_mask & __GFP_FS))
+ return -1;
- spin_lock(&gl->gl_spin);
- if (find_first_holder(gl) == NULL &&
- gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
- handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- done_callback = 1;
+ spin_lock(&lru_lock);
+ while(nr && !list_empty(&lru_list)) {
+ gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
+ list_del_init(&gl->gl_lru);
+ atomic_dec(&lru_count);
+
+ /* Test for being demotable */
+ if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+ gfs2_glock_hold(gl);
+ got_ref = 1;
+ spin_unlock(&lru_lock);
+ spin_lock(&gl->gl_spin);
+ may_demote = demote_ok(gl);
+ spin_unlock(&gl->gl_spin);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ if (may_demote) {
+ handle_callback(gl, LM_ST_UNLOCKED, 0);
+ nr--;
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
+ }
+ spin_lock(&lru_lock);
+ if (may_demote)
+ continue;
+ }
+ if (list_empty(&gl->gl_lru) &&
+ (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
+ nr_skipped++;
+ list_add(&gl->gl_lru, &skipped);
+ }
+ if (got_ref) {
+ spin_unlock(&lru_lock);
+ gfs2_glock_put(gl);
+ spin_lock(&lru_lock);
+ got_ref = 0;
+ }
}
- spin_unlock(&gl->gl_spin);
- if (!done_callback ||
- queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gfs2_glock_put(gl);
+ list_splice(&skipped, &lru_list);
+ atomic_add(nr_skipped, &lru_count);
+ spin_unlock(&lru_lock);
+out:
+ return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
}
+static struct shrinker glock_shrinker = {
+ .shrink = gfs2_shrink_glock_memory,
+ .seeks = DEFAULT_SEEKS,
+};
+
/**
* examine_bucket - Call a function for glock in a hash bucket
* @examiner: the function
@@ -1457,26 +1516,6 @@ out:
}
/**
- * scan_glock - look at a glock and see if we can reclaim it
- * @gl: the glock to look at
- *
- */
-
-static void scan_glock(struct gfs2_glock *gl)
-{
- if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
- return;
- if (test_bit(GLF_LOCK, &gl->gl_flags))
- return;
-
- spin_lock(&gl->gl_spin);
- if (find_first_holder(gl) == NULL &&
- gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
- gfs2_glock_schedule_for_reclaim(gl);
- spin_unlock(&gl->gl_spin);
-}
-
-/**
* clear_glock - look at a glock and see if we can free it from glock cache
* @gl: the glock to look at
*
@@ -1484,23 +1523,16 @@ static void scan_glock(struct gfs2_glock *gl)
static void clear_glock(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- int released;
-
- spin_lock(&sdp->sd_reclaim_lock);
- if (!list_empty(&gl->gl_reclaim)) {
- list_del_init(&gl->gl_reclaim);
- atomic_dec(&sdp->sd_reclaim_count);
- spin_unlock(&sdp->sd_reclaim_lock);
- released = gfs2_glock_put(gl);
- gfs2_assert(sdp, !released);
- } else {
- spin_unlock(&sdp->sd_reclaim_lock);
+ spin_lock(&lru_lock);
+ if (!list_empty(&gl->gl_lru)) {
+ list_del_init(&gl->gl_lru);
+ atomic_dec(&lru_count);
}
+ spin_unlock(&lru_lock);
spin_lock(&gl->gl_spin);
if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
- handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+ handle_callback(gl, LM_ST_UNLOCKED, 0);
spin_unlock(&gl->gl_spin);
gfs2_glock_hold(gl);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -1548,6 +1580,20 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
}
}
+void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
+{
+ struct gfs2_glock *gl = ip->i_gl;
+ int ret;
+
+ ret = gfs2_truncatei_resume(ip);
+ gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
+
+ spin_lock(&gl->gl_spin);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ run_queue(gl, 1);
+ spin_unlock(&gl->gl_spin);
+}
+
static const char *state2str(unsigned state)
{
switch(state) {
@@ -1623,8 +1669,6 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
char *p = buf;
if (test_bit(GLF_LOCK, gflags))
*p++ = 'l';
- if (test_bit(GLF_STICKY, gflags))
- *p++ = 's';
if (test_bit(GLF_DEMOTE, gflags))
*p++ = 'D';
if (test_bit(GLF_PENDING_DEMOTE, gflags))
@@ -1743,34 +1787,6 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
return error;
}
-/**
- * gfs2_scand - Look for cached glocks and inodes to toss from memory
- * @sdp: Pointer to GFS2 superblock
- *
- * One of these daemons runs, finding candidates to add to sd_reclaim_list.
- * See gfs2_glockd()
- */
-
-static int gfs2_scand(void *data)
-{
- unsigned x;
- unsigned delay;
-
- while (!kthread_should_stop()) {
- for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
- examine_bucket(scan_glock, NULL, x);
- if (freezing(current))
- refrigerator();
- delay = scand_secs;
- if (delay < 1)
- delay = 1;
- schedule_timeout_interruptible(delay * HZ);
- }
-
- return 0;
-}
-
-
int __init gfs2_glock_init(void)
{
@@ -1784,28 +1800,21 @@ int __init gfs2_glock_init(void)
}
#endif
- scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
- if (IS_ERR(scand_process))
- return PTR_ERR(scand_process);
-
glock_workqueue = create_workqueue("glock_workqueue");
- if (IS_ERR(glock_workqueue)) {
- kthread_stop(scand_process);
+ if (IS_ERR(glock_workqueue))
return PTR_ERR(glock_workqueue);
- }
+
+ register_shrinker(&glock_shrinker);
return 0;
}
void gfs2_glock_exit(void)
{
+ unregister_shrinker(&glock_shrinker);
destroy_workqueue(glock_workqueue);
- kthread_stop(scand_process);
}
-module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
-
static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
{
struct gfs2_glock *gl;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 695c6b19361..543ec7ecfbd 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -129,9 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
void gfs2_lvb_unhold(struct gfs2_glock *gl);
void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
-void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
int __init gfs2_glock_init(void);
void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c6c318c2a0f..8522d3aa64f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -201,19 +201,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
* Returns: 1 if it's ok
*/
-static int inode_go_demote_ok(struct gfs2_glock *gl)
+static int inode_go_demote_ok(const struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
- int demote = 0;
-
- if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
- demote = 1;
- else if (!sdp->sd_args.ar_localcaching &&
- time_after_eq(jiffies, gl->gl_stamp +
- gfs2_tune_get(sdp, gt_demote_secs) * HZ))
- demote = 1;
-
- return demote;
+ if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
+ return 0;
+ return 1;
}
/**
@@ -227,6 +220,7 @@ static int inode_go_demote_ok(struct gfs2_glock *gl)
static int inode_go_lock(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
struct gfs2_inode *ip = gl->gl_object;
int error = 0;
@@ -239,10 +233,16 @@ static int inode_go_lock(struct gfs2_holder *gh)
return error;
}
- if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
+ if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
(gl->gl_state == LM_ST_EXCLUSIVE) &&
- (gh->gh_state == LM_ST_EXCLUSIVE))
- error = gfs2_truncatei_resume(ip);
+ (gh->gh_state == LM_ST_EXCLUSIVE)) {
+ spin_lock(&sdp->sd_trunc_lock);
+ if (list_empty(&ip->i_trunc_list))
+ list_add(&sdp->sd_trunc_list, &ip->i_trunc_list);
+ spin_unlock(&sdp->sd_trunc_lock);
+ wake_up(&sdp->sd_quota_wait);
+ return 1;
+ }
return error;
}
@@ -260,10 +260,13 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
const struct gfs2_inode *ip = gl->gl_object;
if (ip == NULL)
return 0;
- gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
+ gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
(unsigned long long)ip->i_no_formal_ino,
(unsigned long long)ip->i_no_addr,
- IF2DT(ip->i_inode.i_mode), ip->i_flags);
+ IF2DT(ip->i_inode.i_mode), ip->i_flags,
+ (unsigned int)ip->i_diskflags,
+ (unsigned long long)ip->i_inode.i_size,
+ (unsigned long long)ip->i_disksize);
return 0;
}
@@ -274,7 +277,7 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
* Returns: 1 if it's ok
*/
-static int rgrp_go_demote_ok(struct gfs2_glock *gl)
+static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
{
return !gl->gl_aspace->i_mapping->nrpages;
}
@@ -318,7 +321,9 @@ static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
const struct gfs2_rgrpd *rgd = gl->gl_object;
if (rgd == NULL)
return 0;
- gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr);
+ gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+ (unsigned long long)rgd->rd_addr, rgd->rd_flags,
+ rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
return 0;
}
@@ -374,13 +379,25 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
}
/**
+ * trans_go_demote_ok
+ * @gl: the glock
+ *
+ * Always returns 0
+ */
+
+static int trans_go_demote_ok(const struct gfs2_glock *gl)
+{
+ return 0;
+}
+
+/**
* quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
* @gl: the glock
*
* Returns: 1 if it's ok
*/
-static int quota_go_demote_ok(struct gfs2_glock *gl)
+static int quota_go_demote_ok(const struct gfs2_glock *gl)
{
return !atomic_read(&gl->gl_lvb_count);
}
@@ -414,6 +431,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
const struct gfs2_glock_operations gfs2_trans_glops = {
.go_xmote_th = trans_go_sync,
.go_xmote_bh = trans_go_xmote_bh,
+ .go_demote_ok = trans_go_demote_ok,
.go_type = LM_TYPE_NONDISK,
};
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f566ec1b4e8..608849d0002 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -68,12 +68,6 @@ struct gfs2_bitmap {
u32 bi_len;
};
-struct gfs2_rgrp_host {
- u32 rg_free;
- u32 rg_dinodes;
- u64 rg_igeneration;
-};
-
struct gfs2_rgrpd {
struct list_head rd_list; /* Link with superblock */
struct list_head rd_list_mru;
@@ -83,14 +77,16 @@ struct gfs2_rgrpd {
u32 rd_length; /* length of rgrp header in fs blocks */
u32 rd_data; /* num of data blocks in rgrp */
u32 rd_bitbytes; /* number of bytes in data bitmaps */
- struct gfs2_rgrp_host rd_rg;
+ u32 rd_free;
+ u32 rd_free_clone;
+ u32 rd_dinodes;
+ u64 rd_igeneration;
struct gfs2_bitmap *rd_bits;
- unsigned int rd_bh_count;
struct mutex rd_mutex;
- u32 rd_free_clone;
struct gfs2_log_element rd_le;
- u32 rd_last_alloc;
struct gfs2_sbd *rd_sbd;
+ unsigned int rd_bh_count;
+ u32 rd_last_alloc;
unsigned char rd_flags;
#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */
#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */
@@ -129,7 +125,7 @@ struct gfs2_glock_operations {
void (*go_xmote_th) (struct gfs2_glock *gl);
int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
void (*go_inval) (struct gfs2_glock *gl, int flags);
- int (*go_demote_ok) (struct gfs2_glock *gl);
+ int (*go_demote_ok) (const struct gfs2_glock *gl);
int (*go_lock) (struct gfs2_holder *gh);
void (*go_unlock) (struct gfs2_holder *gh);
int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
@@ -159,7 +155,6 @@ struct gfs2_holder {
enum {
GLF_LOCK = 1,
- GLF_STICKY = 2,
GLF_DEMOTE = 3,
GLF_PENDING_DEMOTE = 4,
GLF_DEMOTE_IN_PROGRESS = 5,
@@ -194,7 +189,7 @@ struct gfs2_glock {
unsigned long gl_tchange;
void *gl_object;
- struct list_head gl_reclaim;
+ struct list_head gl_lru;
struct gfs2_sbd *gl_sbd;
@@ -233,29 +228,24 @@ enum {
GIF_USER = 4, /* user inode, not metadata addr space */
};
-struct gfs2_dinode_host {
- u64 di_size; /* number of bytes in file */
- u64 di_generation; /* generation number for NFS */
- u32 di_flags; /* GFS2_DIF_... */
- /* These only apply to directories */
- u32 di_entries; /* The number of entries in the directory */
- u64 di_eattr; /* extended attribute block number */
-};
struct gfs2_inode {
struct inode i_inode;
u64 i_no_addr;
u64 i_no_formal_ino;
+ u64 i_generation;
+ u64 i_eattr;
+ loff_t i_disksize;
unsigned long i_flags; /* GIF_... */
-
- struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
-
struct gfs2_glock *i_gl; /* Move into i_gh? */
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
struct gfs2_alloc *i_alloc;
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
+ struct list_head i_trunc_list;
+ u32 i_entries;
+ u32 i_diskflags;
u8 i_height;
u8 i_depth;
};
@@ -406,13 +396,11 @@ struct gfs2_args {
struct gfs2_tune {
spinlock_t gt_spin;
- unsigned int gt_demote_secs; /* Cache retention for unheld glock */
unsigned int gt_incore_log_blocks;
unsigned int gt_log_flush_secs;
unsigned int gt_recoverd_secs;
unsigned int gt_logd_secs;
- unsigned int gt_quotad_secs;
unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
@@ -488,10 +476,6 @@ struct gfs2_sbd {
/* Lock Stuff */
struct lm_lockstruct sd_lockstruct;
- struct list_head sd_reclaim_list;
- spinlock_t sd_reclaim_lock;
- wait_queue_head_t sd_reclaim_wq;
- atomic_t sd_reclaim_count;
struct gfs2_holder sd_live_gh;
struct gfs2_glock *sd_rename_gl;
struct gfs2_glock *sd_trans_gl;
@@ -519,7 +503,6 @@ struct gfs2_sbd {
spinlock_t sd_statfs_spin;
struct gfs2_statfs_change_host sd_statfs_master;
struct gfs2_statfs_change_host sd_statfs_local;
- unsigned long sd_statfs_sync_time;
/* Resource group stuff */
@@ -552,8 +535,6 @@ struct gfs2_sbd {
struct task_struct *sd_recoverd_process;
struct task_struct *sd_logd_process;
struct task_struct *sd_quotad_process;
- struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
- unsigned int sd_glockd_num;
/* Quota stuff */
@@ -561,13 +542,15 @@ struct gfs2_sbd {
atomic_t sd_quota_count;
spinlock_t sd_quota_spin;
struct mutex sd_quota_mutex;
+ wait_queue_head_t sd_quota_wait;
+ struct list_head sd_trunc_list;
+ spinlock_t sd_trunc_lock;
unsigned int sd_quota_slots;
unsigned int sd_quota_chunks;
unsigned char **sd_quota_bitmap;
u64 sd_quota_sync_gen;
- unsigned long sd_quota_sync_time;
/* Log stuff */
@@ -624,10 +607,6 @@ struct gfs2_sbd {
struct mutex sd_freeze_lock;
unsigned int sd_freeze_count;
- /* Counters */
-
- atomic_t sd_reclaimed;
-
char sd_fsname[GFS2_FSNAME_LEN];
char sd_table_name[GFS2_FSNAME_LEN];
char sd_proto_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d57616840e8..3b87c188da4 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -32,7 +32,6 @@
#include "log.h"
#include "meta_io.h"
#include "ops_address.h"
-#include "ops_inode.h"
#include "quota.h"
#include "rgrp.h"
#include "trans.h"
@@ -248,7 +247,6 @@ fail:
static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
{
- struct gfs2_dinode_host *di = &ip->i_di;
const struct gfs2_dinode *str = buf;
struct timespec atime;
u16 height, depth;
@@ -274,8 +272,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
* to do that.
*/
ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
- di->di_size = be64_to_cpu(str->di_size);
- i_size_write(&ip->i_inode, di->di_size);
+ ip->i_disksize = be64_to_cpu(str->di_size);
+ i_size_write(&ip->i_inode, ip->i_disksize);
gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
atime.tv_sec = be64_to_cpu(str->di_atime);
atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -287,9 +285,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
ip->i_goal = be64_to_cpu(str->di_goal_meta);
- di->di_generation = be64_to_cpu(str->di_generation);
+ ip->i_generation = be64_to_cpu(str->di_generation);
- di->di_flags = be32_to_cpu(str->di_flags);
+ ip->i_diskflags = be32_to_cpu(str->di_flags);
gfs2_set_inode_flags(&ip->i_inode);
height = be16_to_cpu(str->di_height);
if (unlikely(height > GFS2_MAX_META_HEIGHT))
@@ -300,9 +298,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
goto corrupt;
ip->i_depth = (u8)depth;
- di->di_entries = be32_to_cpu(str->di_entries);
+ ip->i_entries = be32_to_cpu(str->di_entries);
- di->di_eattr = be64_to_cpu(str->di_eattr);
+ ip->i_eattr = be64_to_cpu(str->di_eattr);
if (S_ISREG(ip->i_inode.i_mode))
gfs2_set_aops(&ip->i_inode);
@@ -388,7 +386,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
gfs2_free_di(rgd, ip);
gfs2_trans_end(sdp);
- clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
out_rg_gunlock:
gfs2_glock_dq_uninit(&al->al_rgd_gh);
@@ -690,7 +687,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
return error;
}
- if (dip->i_di.di_entries == (u32)-1)
+ if (dip->i_entries == (u32)-1)
return -EFBIG;
if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
return -EMLINK;
@@ -790,11 +787,11 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
di->di_flags = 0;
if (S_ISREG(mode)) {
- if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
+ if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
gfs2_tune_get(sdp, gt_new_files_jdata))
di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
} else if (S_ISDIR(mode)) {
- di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
+ di->di_flags |= cpu_to_be32(dip->i_diskflags &
GFS2_DIF_INHERIT_JDATA);
}
@@ -1068,7 +1065,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
struct qstr dotname;
int error;
- if (ip->i_di.di_entries != 2) {
+ if (ip->i_entries != 2) {
if (gfs2_consist_inode(ip))
gfs2_dinode_print(ip);
return -EIO;
@@ -1168,7 +1165,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
return error;
}
- if (!ip->i_di.di_size) {
+ if (!ip->i_disksize) {
gfs2_consist_inode(ip);
error = -EIO;
goto out;
@@ -1178,7 +1175,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
if (error)
goto out;
- x = ip->i_di.di_size + 1;
+ x = ip->i_disksize + 1;
if (x > *len) {
*buf = kmalloc(x, GFP_NOFS);
if (!*buf) {
@@ -1242,7 +1239,6 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
{
- const struct gfs2_dinode_host *di = &ip->i_di;
struct gfs2_dinode *str = buf;
str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -1256,7 +1252,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
- str->di_size = cpu_to_be64(di->di_size);
+ str->di_size = cpu_to_be64(ip->i_disksize);
str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1264,17 +1260,17 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_goal_meta = cpu_to_be64(ip->i_goal);
str->di_goal_data = cpu_to_be64(ip->i_goal);
- str->di_generation = cpu_to_be64(di->di_generation);
+ str->di_generation = cpu_to_be64(ip->i_generation);
- str->di_flags = cpu_to_be32(di->di_flags);
+ str->di_flags = cpu_to_be32(ip->i_diskflags);
str->di_height = cpu_to_be16(ip->i_height);
str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
- !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
+ !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
GFS2_FORMAT_DE : 0);
str->di_depth = cpu_to_be16(ip->i_depth);
- str->di_entries = cpu_to_be32(di->di_entries);
+ str->di_entries = cpu_to_be32(ip->i_entries);
- str->di_eattr = cpu_to_be64(di->di_eattr);
+ str->di_eattr = cpu_to_be64(ip->i_eattr);
str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
@@ -1282,22 +1278,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
void gfs2_dinode_print(const struct gfs2_inode *ip)
{
- const struct gfs2_dinode_host *di = &ip->i_di;
-
printk(KERN_INFO " no_formal_ino = %llu\n",
(unsigned long long)ip->i_no_formal_ino);
printk(KERN_INFO " no_addr = %llu\n",
(unsigned long long)ip->i_no_addr);
- printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
+ printk(KERN_INFO " i_disksize = %llu\n",
+ (unsigned long long)ip->i_disksize);
printk(KERN_INFO " blocks = %llu\n",
(unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
printk(KERN_INFO " i_goal = %llu\n",
(unsigned long long)ip->i_goal);
- printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags);
+ printk(KERN_INFO " i_diskflags = 0x%.8X\n", ip->i_diskflags);
printk(KERN_INFO " i_height = %u\n", ip->i_height);
printk(KERN_INFO " i_depth = %u\n", ip->i_depth);
- printk(KERN_INFO " di_entries = %u\n", di->di_entries);
- printk(KERN_INFO " di_eattr = %llu\n",
- (unsigned long long)di->di_eattr);
+ printk(KERN_INFO " i_entries = %u\n", ip->i_entries);
+ printk(KERN_INFO " i_eattr = %llu\n",
+ (unsigned long long)ip->i_eattr);
}
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 2d43f69610a..d5329364cdf 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,6 +10,7 @@
#ifndef __INODE_DOT_H__
#define __INODE_DOT_H__
+#include <linux/fs.h>
#include "util.h"
static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -19,7 +20,7 @@ static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
{
- return ip->i_di.di_flags & GFS2_DIF_JDATA;
+ return ip->i_diskflags & GFS2_DIF_JDATA;
}
static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
@@ -97,5 +98,15 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
void gfs2_dinode_print(const struct gfs2_inode *ip);
+extern const struct inode_operations gfs2_file_iops;
+extern const struct inode_operations gfs2_dir_iops;
+extern const struct inode_operations gfs2_symlink_iops;
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
+
+extern void gfs2_set_inode_flags(struct inode *inode);
+
#endif /* __INODE_DOT_H__ */
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 0c4cbe6c828..1aa7eb6a022 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -194,17 +194,25 @@ out:
static void gdlm_recovery_done(void *lockspace, unsigned int jid,
unsigned int message)
{
+ char env_jid[20];
+ char env_status[20];
+ char *envp[] = { env_jid, env_status, NULL };
struct gdlm_ls *ls = lockspace;
ls->recover_jid_done = jid;
ls->recover_jid_status = message;
- kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+ sprintf(env_jid, "JID=%d", jid);
+ sprintf(env_status, "RECOVERY=%s",
+ message == LM_RD_SUCCESS ? "Done" : "Failed");
+ kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
}
static void gdlm_others_may_mount(void *lockspace)
{
+ char *message = "FIRSTMOUNT=Done";
+ char *envp[] = { message, NULL };
struct gdlm_ls *ls = lockspace;
ls->first_done = 1;
- kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+ kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
}
/* Userspace gets the offline uevent, blocks new gfs locks on
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4ec571c3d8a..9b7edcf7bd4 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -195,9 +195,23 @@ void gdlm_kobject_release(struct gdlm_ls *ls)
kobject_put(&ls->kobj);
}
+static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
+ struct kobj_uevent_env *env)
+{
+ struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
+ add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
+ add_uevent_var(env, "LOCKPROTO=lock_dlm");
+ return 0;
+}
+
+static struct kset_uevent_ops gdlm_uevent_ops = {
+ .uevent = gdlm_uevent,
+};
+
+
int gdlm_sysfs_init(void)
{
- gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj);
+ gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
if (!gdlm_kset) {
printk(KERN_WARNING "%s: can not create kset\n", __func__);
return -ENOMEM;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index bb2cc303ac2..7cacfde3219 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -19,7 +19,7 @@
#include "gfs2.h"
#include "incore.h"
-#include "ops_fstype.h"
+#include "super.h"
#include "sys.h"
#include "util.h"
#include "glock.h"
@@ -30,6 +30,7 @@ static void gfs2_init_inode_once(void *foo)
inode_init_once(&ip->i_inode);
init_rwsem(&ip->i_rw_mutex);
+ INIT_LIST_HEAD(&ip->i_trunc_list);
ip->i_alloc = NULL;
}
@@ -42,7 +43,7 @@ static void gfs2_init_glock_once(void *foo)
INIT_LIST_HEAD(&gl->gl_holders);
gl->gl_lvb = NULL;
atomic_set(&gl->gl_lvb_count, 0);
- INIT_LIST_HEAD(&gl->gl_reclaim);
+ INIT_LIST_HEAD(&gl->gl_lru);
INIT_LIST_HEAD(&gl->gl_ail_list);
atomic_set(&gl->gl_ail_count, 0);
}
@@ -93,6 +94,12 @@ static int __init init_gfs2_fs(void)
if (!gfs2_rgrpd_cachep)
goto fail;
+ gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad",
+ sizeof(struct gfs2_quota_data),
+ 0, 0, NULL);
+ if (!gfs2_quotad_cachep)
+ goto fail;
+
error = register_filesystem(&gfs2_fs_type);
if (error)
goto fail;
@@ -112,6 +119,9 @@ fail_unregister:
fail:
gfs2_glock_exit();
+ if (gfs2_quotad_cachep)
+ kmem_cache_destroy(gfs2_quotad_cachep);
+
if (gfs2_rgrpd_cachep)
kmem_cache_destroy(gfs2_rgrpd_cachep);
@@ -140,6 +150,7 @@ static void __exit exit_gfs2_fs(void)
unregister_filesystem(&gfs2_fs_type);
unregister_filesystem(&gfs2meta_fs_type);
+ kmem_cache_destroy(gfs2_quotad_cachep);
kmem_cache_destroy(gfs2_rgrpd_cachep);
kmem_cache_destroy(gfs2_bufdata_cachep);
kmem_cache_destroy(gfs2_inode_cachep);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index f96eb90a2cf..3cb0a44ba02 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -32,7 +32,6 @@ enum {
Opt_debug,
Opt_nodebug,
Opt_upgrade,
- Opt_num_glockd,
Opt_acl,
Opt_noacl,
Opt_quota_off,
@@ -57,7 +56,6 @@ static const match_table_t tokens = {
{Opt_debug, "debug"},
{Opt_nodebug, "nodebug"},
{Opt_upgrade, "upgrade"},
- {Opt_num_glockd, "num_glockd=%d"},
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_quota_off, "quota=off"},
@@ -87,16 +85,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
int error = 0;
if (!remount) {
- /* If someone preloaded options, use those instead */
- spin_lock(&gfs2_sys_margs_lock);
- if (gfs2_sys_margs) {
- data = gfs2_sys_margs;
- gfs2_sys_margs = NULL;
- }
- spin_unlock(&gfs2_sys_margs_lock);
-
/* Set some defaults */
- args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
args->ar_quota = GFS2_QUOTA_DEFAULT;
args->ar_data = GFS2_DATA_DEFAULT;
}
@@ -105,7 +94,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
process them */
for (options = data; (o = strsep(&options, ",")); ) {
- int token, option;
+ int token;
substring_t tmp[MAX_OPT_ARGS];
if (!*o)
@@ -196,22 +185,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
goto cant_remount;
args->ar_upgrade = 1;
break;
- case Opt_num_glockd:
- if ((error = match_int(&tmp[0], &option))) {
- fs_info(sdp, "problem getting num_glockd\n");
- goto out_error;
- }
-
- if (remount && option != args->ar_num_glockd)
- goto cant_remount;
- if (!option || option > GFS2_GLOCKD_MAX) {
- fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
- GFS2_GLOCKD_MAX, option);
- error = -EINVAL;
- goto out_error;
- }
- args->ar_num_glockd = option;
- break;
case Opt_acl:
args->ar_posix_acl = 1;
sdp->sd_vfs->s_flags |= MS_POSIXACL;
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 27563816e1c..4ddab67867e 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -210,25 +210,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
{
struct inode *inode = page->mapping->host;
struct gfs2_sbd *sdp = GFS2_SB(inode);
- int error;
+ int ret;
int done_trans = 0;
- error = gfs2_writepage_common(page, wbc);
- if (error <= 0)
- return error;
-
if (PageChecked(page)) {
if (wbc->sync_mode != WB_SYNC_ALL)
goto out_ignore;
- error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
- if (error)
+ ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+ if (ret)
goto out_ignore;
done_trans = 1;
}
- error = __gfs2_jdata_writepage(page, wbc);
+ ret = gfs2_writepage_common(page, wbc);
+ if (ret > 0)
+ ret = __gfs2_jdata_writepage(page, wbc);
if (done_trans)
gfs2_trans_end(sdp);
- return error;
+ return ret;
out_ignore:
redirty_page_for_writepage(wbc, page);
@@ -453,8 +451,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
kaddr = kmap_atomic(page, KM_USER0);
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
- ip->i_di.di_size);
- memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
+ ip->i_disksize);
+ memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(page);
brelse(dibh);
@@ -627,7 +625,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
{
struct gfs2_inode *ip = GFS2_I(mapping->host);
struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
- unsigned int data_blocks, ind_blocks, rblocks;
+ unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
int alloc_required;
int error = 0;
struct gfs2_alloc *al;
@@ -641,11 +639,13 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
if (unlikely(error))
goto out_uninit;
- gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
if (error)
goto out_unlock;
+ if (alloc_required || gfs2_is_jdata(ip))
+ gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
+
if (alloc_required) {
al = gfs2_alloc_get(ip);
if (!al) {
@@ -675,7 +675,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
goto out_trans_fail;
error = -ENOMEM;
- page = __grab_cache_page(mapping, index);
+ flags |= AOP_FLAG_NOFS;
+ page = grab_cache_page_write_begin(mapping, index, flags);
*pagep = page;
if (unlikely(!page))
goto out_endtrans;
@@ -782,7 +783,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
if (inode->i_size < to) {
i_size_write(inode, to);
- ip->i_di.di_size = inode->i_size;
+ ip->i_disksize = inode->i_size;
di->di_size = cpu_to_be64(inode->i_size);
mark_inode_dirty(inode);
}
@@ -847,9 +848,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
- if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) {
+ if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) {
di = (struct gfs2_dinode *)dibh->b_data;
- ip->i_di.di_size = inode->i_size;
+ ip->i_disksize = inode->i_size;
di->di_size = cpu_to_be64(inode->i_size);
mark_inode_dirty(inode);
}
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 4a5e676b442..c2ad36330ca 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -19,7 +19,7 @@
#include "incore.h"
#include "dir.h"
#include "glock.h"
-#include "ops_dentry.h"
+#include "super.h"
#include "util.h"
#include "inode.h"
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
deleted file mode 100644
index 5caa3db4d3f..00000000000
--- a/fs/gfs2/ops_dentry.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_DENTRY_DOT_H__
-#define __OPS_DENTRY_DOT_H__
-
-#include <linux/dcache.h>
-
-extern struct dentry_operations gfs2_dops;
-
-#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index bbb8c36403a..7fdeb14ddd1 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -22,8 +22,7 @@
#include "glock.h"
#include "glops.h"
#include "inode.h"
-#include "ops_dentry.h"
-#include "ops_fstype.h"
+#include "super.h"
#include "rgrp.h"
#include "util.h"
@@ -214,7 +213,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
}
error = -EIO;
- if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
+ if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
iput(inode);
goto fail;
}
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3a747f8e218..93fe41b67f9 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -39,7 +39,6 @@
#include "util.h"
#include "eaops.h"
#include "ops_address.h"
-#include "ops_inode.h"
/**
* gfs2_llseek - seek to a location in a file
@@ -158,8 +157,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
if (error)
return error;
- fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
- if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
+ fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
+ if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
fsflags |= FS_JOURNAL_DATA_FL;
if (put_user(fsflags, ptr))
error = -EFAULT;
@@ -172,17 +171,16 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
void gfs2_set_inode_flags(struct inode *inode)
{
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_dinode_host *di = &ip->i_di;
unsigned int flags = inode->i_flags;
flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
- if (di->di_flags & GFS2_DIF_IMMUTABLE)
+ if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
flags |= S_IMMUTABLE;
- if (di->di_flags & GFS2_DIF_APPENDONLY)
+ if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
flags |= S_APPEND;
- if (di->di_flags & GFS2_DIF_NOATIME)
+ if (ip->i_diskflags & GFS2_DIF_NOATIME)
flags |= S_NOATIME;
- if (di->di_flags & GFS2_DIF_SYNC)
+ if (ip->i_diskflags & GFS2_DIF_SYNC)
flags |= S_SYNC;
inode->i_flags = flags;
}
@@ -221,7 +219,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
if (error)
goto out_drop_write;
- flags = ip->i_di.di_flags;
+ flags = ip->i_diskflags;
new_flags = (flags & ~mask) | (reqflags & mask);
if ((new_flags ^ flags) == 0)
goto out;
@@ -260,7 +258,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
if (error)
goto out_trans_end;
gfs2_trans_add_bh(ip->i_gl, bh, 1);
- ip->i_di.di_flags = new_flags;
+ ip->i_diskflags = new_flags;
gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
gfs2_set_inode_flags(inode);
@@ -344,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
unsigned long last_index;
- u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits);
+ u64 pos = page->index << PAGE_CACHE_SHIFT;
unsigned int data_blocks, ind_blocks, rblocks;
int alloc_required = 0;
struct gfs2_holder gh;
@@ -357,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
goto out;
set_bit(GIF_SW_PAGED, &ip->i_flags);
- gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
if (ret || !alloc_required)
goto out_unlock;
@@ -369,6 +366,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
ret = gfs2_quota_lock_check(ip);
if (ret)
goto out_alloc_put;
+ gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
al->al_requested = data_blocks + ind_blocks;
ret = gfs2_inplace_reserve(ip);
if (ret)
@@ -479,7 +477,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
goto fail;
if (!(file->f_flags & O_LARGEFILE) &&
- ip->i_di.di_size > MAX_NON_LFS) {
+ ip->i_disksize > MAX_NON_LFS) {
error = -EOVERFLOW;
goto fail_gunlock;
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b117fcf2c4f..f91eebdde58 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -22,20 +22,18 @@
#include "gfs2.h"
#include "incore.h"
#include "bmap.h"
-#include "daemon.h"
#include "glock.h"
#include "glops.h"
#include "inode.h"
#include "mount.h"
-#include "ops_fstype.h"
-#include "ops_dentry.h"
-#include "ops_super.h"
#include "recovery.h"
#include "rgrp.h"
#include "super.h"
#include "sys.h"
#include "util.h"
#include "log.h"
+#include "quota.h"
+#include "dir.h"
#define DO 0
#define UNDO 1
@@ -58,12 +56,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
{
spin_lock_init(&gt->gt_spin);
- gt->gt_demote_secs = 300;
gt->gt_incore_log_blocks = 1024;
gt->gt_log_flush_secs = 60;
gt->gt_recoverd_secs = 60;
gt->gt_logd_secs = 1;
- gt->gt_quotad_secs = 5;
gt->gt_quota_simul_sync = 64;
gt->gt_quota_warn_period = 10;
gt->gt_quota_scale_num = 1;
@@ -91,10 +87,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
gfs2_tune_init(&sdp->sd_tune);
- INIT_LIST_HEAD(&sdp->sd_reclaim_list);
- spin_lock_init(&sdp->sd_reclaim_lock);
- init_waitqueue_head(&sdp->sd_reclaim_wq);
-
mutex_init(&sdp->sd_inum_mutex);
spin_lock_init(&sdp->sd_statfs_spin);
@@ -110,6 +102,9 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
INIT_LIST_HEAD(&sdp->sd_quota_list);
spin_lock_init(&sdp->sd_quota_spin);
mutex_init(&sdp->sd_quota_mutex);
+ init_waitqueue_head(&sdp->sd_quota_wait);
+ INIT_LIST_HEAD(&sdp->sd_trunc_list);
+ spin_lock_init(&sdp->sd_trunc_lock);
spin_lock_init(&sdp->sd_log_lock);
@@ -443,24 +438,11 @@ out:
static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
int undo)
{
- struct task_struct *p;
int error = 0;
if (undo)
goto fail_trans;
- for (sdp->sd_glockd_num = 0;
- sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
- sdp->sd_glockd_num++) {
- p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
- error = IS_ERR(p);
- if (error) {
- fs_err(sdp, "can't start glockd thread: %d\n", error);
- goto fail;
- }
- sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
- }
-
error = gfs2_glock_nq_num(sdp,
GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
@@ -493,7 +475,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
fs_err(sdp, "can't create transaction glock: %d\n", error);
goto fail_rename;
}
- set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
return 0;
@@ -506,9 +487,6 @@ fail_live:
fail_mount:
gfs2_glock_dq_uninit(mount_gh);
fail:
- while (sdp->sd_glockd_num--)
- kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
-
return error;
}
@@ -620,7 +598,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
prev_db = 0;
- for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) {
+ for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
bh.b_state = 0;
bh.b_blocknr = 0;
bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -661,6 +639,72 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
sdp->sd_lockstruct.ls_lockspace);
}
+/**
+ * gfs2_jindex_hold - Grab a lock on the jindex
+ * @sdp: The GFS2 superblock
+ * @ji_gh: the holder for the jindex glock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
+{
+ struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
+ struct qstr name;
+ char buf[20];
+ struct gfs2_jdesc *jd;
+ int error;
+
+ name.name = buf;
+
+ mutex_lock(&sdp->sd_jindex_mutex);
+
+ for (;;) {
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
+ if (error)
+ break;
+
+ name.len = sprintf(buf, "journal%u", sdp->sd_journals);
+ name.hash = gfs2_disk_hash(name.name, name.len);
+
+ error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
+ if (error == -ENOENT) {
+ error = 0;
+ break;
+ }
+
+ gfs2_glock_dq_uninit(ji_gh);
+
+ if (error)
+ break;
+
+ error = -ENOMEM;
+ jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
+ if (!jd)
+ break;
+
+ INIT_LIST_HEAD(&jd->extent_list);
+ jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
+ if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
+ if (!jd->jd_inode)
+ error = -ENOENT;
+ else
+ error = PTR_ERR(jd->jd_inode);
+ kfree(jd);
+ break;
+ }
+
+ spin_lock(&sdp->sd_jindex_spin);
+ jd->jd_jid = sdp->sd_journals++;
+ list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
+ spin_unlock(&sdp->sd_jindex_spin);
+ }
+
+ mutex_unlock(&sdp->sd_jindex_mutex);
+
+ return error;
+}
+
static int init_journal(struct gfs2_sbd *sdp, int undo)
{
struct inode *master = sdp->sd_master_dir->d_inode;
@@ -681,7 +725,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
return PTR_ERR(sdp->sd_jindex);
}
ip = GFS2_I(sdp->sd_jindex);
- set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
/* Load in the journal index special file */
@@ -832,7 +875,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
goto fail_statfs;
}
ip = GFS2_I(sdp->sd_rindex);
- set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
sdp->sd_rindex_uptodate = 0;
/* Read in the quota inode */
@@ -973,9 +1015,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
}
sdp->sd_logd_process = p;
- sdp->sd_statfs_sync_time = jiffies;
- sdp->sd_quota_sync_time = jiffies;
-
p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
error = IS_ERR(p);
if (error) {
@@ -1224,17 +1263,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
static void gfs2_kill_sb(struct super_block *sb)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
- if (sdp) {
- gfs2_meta_syncfs(sdp);
- dput(sdp->sd_root_dir);
- dput(sdp->sd_master_dir);
- sdp->sd_root_dir = NULL;
- sdp->sd_master_dir = NULL;
+
+ if (sdp == NULL) {
+ kill_block_super(sb);
+ return;
}
+
+ gfs2_meta_syncfs(sdp);
+ dput(sdp->sd_root_dir);
+ dput(sdp->sd_master_dir);
+ sdp->sd_root_dir = NULL;
+ sdp->sd_master_dir = NULL;
shrink_dcache_sb(sb);
kill_block_super(sb);
- if (sdp)
- gfs2_delete_debugfs_file(sdp);
+ gfs2_delete_debugfs_file(sdp);
+ kfree(sdp);
}
struct file_system_type gfs2_fs_type = {
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
deleted file mode 100644
index da849051183..00000000000
--- a/fs/gfs2/ops_fstype.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_FSTYPE_DOT_H__
-#define __OPS_FSTYPE_DOT_H__
-
-#include <linux/fs.h>
-
-extern struct file_system_type gfs2_fs_type;
-extern struct file_system_type gfs2meta_fs_type;
-extern const struct export_operations gfs2_export_ops;
-
-#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d232991b904..49877546beb 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -19,6 +19,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/lm_interface.h>
+#include <linux/fiemap.h>
#include <asm/uaccess.h>
#include "gfs2.h"
@@ -31,12 +32,11 @@
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
-#include "ops_dentry.h"
-#include "ops_inode.h"
#include "quota.h"
#include "rgrp.h"
#include "trans.h"
#include "util.h"
+#include "super.h"
/**
* gfs2_create - Create a file
@@ -185,7 +185,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (!dip->i_inode.i_nlink)
goto out_gunlock;
error = -EFBIG;
- if (dip->i_di.di_entries == (u32)-1)
+ if (dip->i_entries == (u32)-1)
goto out_gunlock;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -371,7 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
ip = ghs[1].gh_gl->gl_object;
- ip->i_di.di_size = size;
+ ip->i_disksize = size;
error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -425,9 +425,9 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
ip = ghs[1].gh_gl->gl_object;
ip->i_inode.i_nlink = 2;
- ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
- ip->i_di.di_flags |= GFS2_DIF_JDATA;
- ip->i_di.di_entries = 2;
+ ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+ ip->i_diskflags |= GFS2_DIF_JDATA;
+ ip->i_entries = 2;
error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -517,13 +517,13 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
if (error)
goto out_gunlock;
- if (ip->i_di.di_entries < 2) {
+ if (ip->i_entries < 2) {
if (gfs2_consist_inode(ip))
gfs2_dinode_print(ip);
error = -EIO;
goto out_gunlock;
}
- if (ip->i_di.di_entries > 2) {
+ if (ip->i_entries > 2) {
error = -ENOTEMPTY;
goto out_gunlock;
}
@@ -726,13 +726,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
goto out_gunlock;
if (S_ISDIR(nip->i_inode.i_mode)) {
- if (nip->i_di.di_entries < 2) {
+ if (nip->i_entries < 2) {
if (gfs2_consist_inode(nip))
gfs2_dinode_print(nip);
error = -EIO;
goto out_gunlock;
}
- if (nip->i_di.di_entries > 2) {
+ if (nip->i_entries > 2) {
error = -ENOTEMPTY;
goto out_gunlock;
}
@@ -758,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
error = -EINVAL;
goto out_gunlock;
}
- if (ndip->i_di.di_entries == (u32)-1) {
+ if (ndip->i_entries == (u32)-1) {
error = -EFBIG;
goto out_gunlock;
}
@@ -990,7 +990,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
struct gfs2_sbd *sdp = GFS2_SB(inode);
int error;
- if (attr->ia_size != ip->i_di.di_size) {
+ if (attr->ia_size != ip->i_disksize) {
error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
if (error)
return error;
@@ -1001,8 +1001,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
}
error = gfs2_truncatei(ip, attr->ia_size);
- if (error && (inode->i_size != ip->i_di.di_size))
- i_size_write(inode, ip->i_di.di_size);
+ if (error && (inode->i_size != ip->i_disksize))
+ i_size_write(inode, ip->i_disksize);
return error;
}
@@ -1212,6 +1212,48 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
}
+static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
+
+ ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ if (ret)
+ goto out;
+
+ if (gfs2_is_stuffed(ip)) {
+ u64 phys = ip->i_no_addr << inode->i_blkbits;
+ u64 size = i_size_read(inode);
+ u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
+ FIEMAP_EXTENT_DATA_INLINE;
+ phys += sizeof(struct gfs2_dinode);
+ phys += start;
+ if (start + len > size)
+ len = size - start;
+ if (start < size)
+ ret = fiemap_fill_next_extent(fieinfo, start, phys,
+ len, flags);
+ if (ret == 1)
+ ret = 0;
+ } else {
+ ret = __generic_block_fiemap(inode, fieinfo, start, len,
+ gfs2_block_map);
+ }
+
+ gfs2_glock_dq_uninit(&gh);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
const struct inode_operations gfs2_file_iops = {
.permission = gfs2_permission,
.setattr = gfs2_setattr,
@@ -1220,6 +1262,7 @@ const struct inode_operations gfs2_file_iops = {
.getxattr = gfs2_getxattr,
.listxattr = gfs2_listxattr,
.removexattr = gfs2_removexattr,
+ .fiemap = gfs2_fiemap,
};
const struct inode_operations gfs2_dir_iops = {
@@ -1239,6 +1282,7 @@ const struct inode_operations gfs2_dir_iops = {
.getxattr = gfs2_getxattr,
.listxattr = gfs2_listxattr,
.removexattr = gfs2_removexattr,
+ .fiemap = gfs2_fiemap,
};
const struct inode_operations gfs2_symlink_iops = {
@@ -1251,5 +1295,6 @@ const struct inode_operations gfs2_symlink_iops = {
.getxattr = gfs2_getxattr,
.listxattr = gfs2_listxattr,
.removexattr = gfs2_removexattr,
+ .fiemap = gfs2_fiemap,
};
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
deleted file mode 100644
index 14b4b797622..00000000000
--- a/fs/gfs2/ops_inode.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_INODE_DOT_H__
-#define __OPS_INODE_DOT_H__
-
-#include <linux/fs.h>
-
-extern const struct inode_operations gfs2_file_iops;
-extern const struct inode_operations gfs2_dir_iops;
-extern const struct inode_operations gfs2_symlink_iops;
-extern const struct file_operations gfs2_file_fops;
-extern const struct file_operations gfs2_dir_fops;
-extern const struct file_operations gfs2_file_fops_nolock;
-extern const struct file_operations gfs2_dir_fops_nolock;
-
-extern void gfs2_set_inode_flags(struct inode *inode);
-
-#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index d5355d9b592..777783deddc 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -28,7 +28,6 @@
#include "inode.h"
#include "log.h"
#include "mount.h"
-#include "ops_super.h"
#include "quota.h"
#include "recovery.h"
#include "rgrp.h"
@@ -143,8 +142,6 @@ static void gfs2_put_super(struct super_block *sb)
kthread_stop(sdp->sd_quotad_process);
kthread_stop(sdp->sd_logd_process);
kthread_stop(sdp->sd_recoverd_process);
- while (sdp->sd_glockd_num--)
- kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
if (!(sb->s_flags & MS_RDONLY)) {
error = gfs2_make_fs_ro(sdp);
@@ -185,7 +182,6 @@ static void gfs2_put_super(struct super_block *sb)
/* At this point, we're through participating in the lockspace */
gfs2_sys_fs_del(sdp);
- kfree(sdp);
}
/**
@@ -260,6 +256,137 @@ static void gfs2_unlockfs(struct super_block *sb)
}
/**
+ * statfs_fill - fill in the sg for a given RG
+ * @rgd: the RG
+ * @sc: the sc structure
+ *
+ * Returns: 0 on success, -ESTALE if the LVB is invalid
+ */
+
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+ struct gfs2_statfs_change_host *sc)
+{
+ gfs2_rgrp_verify(rgd);
+ sc->sc_total += rgd->rd_data;
+ sc->sc_free += rgd->rd_free;
+ sc->sc_dinodes += rgd->rd_dinodes;
+ return 0;
+}
+
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ *
+ * FIXME: This really shouldn't busy wait like this.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+ struct gfs2_holder ri_gh;
+ struct gfs2_rgrpd *rgd_next;
+ struct gfs2_holder *gha, *gh;
+ unsigned int slots = 64;
+ unsigned int x;
+ int done;
+ int error = 0, err;
+
+ memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
+ gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+ if (!gha)
+ return -ENOMEM;
+
+ error = gfs2_rindex_hold(sdp, &ri_gh);
+ if (error)
+ goto out;
+
+ rgd_next = gfs2_rgrpd_get_first(sdp);
+
+ for (;;) {
+ done = 1;
+
+ for (x = 0; x < slots; x++) {
+ gh = gha + x;
+
+ if (gh->gh_gl && gfs2_glock_poll(gh)) {
+ err = gfs2_glock_wait(gh);
+ if (err) {
+ gfs2_holder_uninit(gh);
+ error = err;
+ } else {
+ if (!error)
+ error = statfs_slow_fill(
+ gh->gh_gl->gl_object, sc);
+ gfs2_glock_dq_uninit(gh);
+ }
+ }
+
+ if (gh->gh_gl)
+ done = 0;
+ else if (rgd_next && !error) {
+ error = gfs2_glock_nq_init(rgd_next->rd_gl,
+ LM_ST_SHARED,
+ GL_ASYNC,
+ gh);
+ rgd_next = gfs2_rgrpd_get_next(rgd_next);
+ done = 0;
+ }
+
+ if (signal_pending(current))
+ error = -ERESTARTSYS;
+ }
+
+ if (done)
+ break;
+
+ yield();
+ }
+
+ gfs2_glock_dq_uninit(&ri_gh);
+
+out:
+ kfree(gha);
+ return error;
+}
+
+/**
+ * gfs2_statfs_i - Do a statfs
+ * @sdp: the filesystem
+ * @sg: the sg structure
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+ struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+ struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+
+ spin_lock(&sdp->sd_statfs_spin);
+
+ *sc = *m_sc;
+ sc->sc_total += l_sc->sc_total;
+ sc->sc_free += l_sc->sc_free;
+ sc->sc_dinodes += l_sc->sc_dinodes;
+
+ spin_unlock(&sdp->sd_statfs_spin);
+
+ if (sc->sc_free < 0)
+ sc->sc_free = 0;
+ if (sc->sc_free > sc->sc_total)
+ sc->sc_free = sc->sc_total;
+ if (sc->sc_dinodes < 0)
+ sc->sc_dinodes = 0;
+
+ return 0;
+}
+
+/**
* gfs2_statfs - Gather and return stats about the filesystem
* @sb: The superblock
* @statfsbuf: The buffer
@@ -370,7 +497,6 @@ static void gfs2_clear_inode(struct inode *inode)
*/
if (test_bit(GIF_USER, &ip->i_flags)) {
ip->i_gl->gl_object = NULL;
- gfs2_glock_schedule_for_reclaim(ip->i_gl);
gfs2_glock_put(ip->i_gl);
ip->i_gl = NULL;
if (ip->i_iopen_gh.gh_gl) {
@@ -423,8 +549,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
seq_printf(s, ",debug");
if (args->ar_upgrade)
seq_printf(s, ",upgrade");
- if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
- seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
if (args->ar_posix_acl)
seq_printf(s, ",acl");
if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
@@ -494,16 +618,16 @@ static void gfs2_delete_inode(struct inode *inode)
gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
error = gfs2_glock_nq(&ip->i_iopen_gh);
if (error)
- goto out_uninit;
+ goto out_truncate;
if (S_ISDIR(inode->i_mode) &&
- (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+ (ip->i_diskflags & GFS2_DIF_EXHASH)) {
error = gfs2_dir_exhash_dealloc(ip);
if (error)
goto out_unlock;
}
- if (ip->i_di.di_eattr) {
+ if (ip->i_eattr) {
error = gfs2_ea_dealloc(ip);
if (error)
goto out_unlock;
@@ -519,6 +643,7 @@ static void gfs2_delete_inode(struct inode *inode)
if (error)
goto out_unlock;
+out_truncate:
error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
if (error)
goto out_unlock;
@@ -527,8 +652,8 @@ static void gfs2_delete_inode(struct inode *inode)
gfs2_trans_end(sdp);
out_unlock:
- gfs2_glock_dq(&ip->i_iopen_gh);
-out_uninit:
+ if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+ gfs2_glock_dq(&ip->i_iopen_gh);
gfs2_holder_uninit(&ip->i_iopen_gh);
gfs2_glock_dq_uninit(&gh);
if (error && error != GLR_TRYFAILED)
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
deleted file mode 100644
index 442a274c627..00000000000
--- a/fs/gfs2/ops_super.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_SUPER_DOT_H__
-#define __OPS_SUPER_DOT_H__
-
-#include <linux/fs.h>
-
-extern const struct super_operations gfs2_super_ops;
-
-#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e073f5144f..b08d09696b3 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -46,6 +46,8 @@
#include <linux/bio.h>
#include <linux/gfs2_ondisk.h>
#include <linux/lm_interface.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include "gfs2.h"
#include "incore.h"
@@ -94,7 +96,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
struct gfs2_quota_data *qd;
int error;
- qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS);
+ qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
if (!qd)
return -ENOMEM;
@@ -119,7 +121,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
return 0;
fail:
- kfree(qd);
+ kmem_cache_free(gfs2_quotad_cachep, qd);
return error;
}
@@ -158,7 +160,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
if (qd || !create) {
if (new_qd) {
gfs2_lvb_unhold(new_qd->qd_gl);
- kfree(new_qd);
+ kmem_cache_free(gfs2_quotad_cachep, new_qd);
}
*qdp = qd;
return 0;
@@ -1013,7 +1015,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
return;
- if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
+ if (ip->i_diskflags & GFS2_DIF_SYSTEM)
return;
for (x = 0; x < al->al_qd_num; x++) {
@@ -1100,15 +1102,15 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
int gfs2_quota_init(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
- unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+ unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
unsigned int x, slot = 0;
unsigned int found = 0;
u64 dblock;
u32 extlen = 0;
int error;
- if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) ||
- ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
+ if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
+ ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
gfs2_consist_inode(ip);
return -EIO;
}
@@ -1195,7 +1197,7 @@ fail:
return error;
}
-void gfs2_quota_scan(struct gfs2_sbd *sdp)
+static void gfs2_quota_scan(struct gfs2_sbd *sdp)
{
struct gfs2_quota_data *qd, *safe;
LIST_HEAD(dead);
@@ -1222,7 +1224,7 @@ void gfs2_quota_scan(struct gfs2_sbd *sdp)
gfs2_assert_warn(sdp, !qd->qd_bh_count);
gfs2_lvb_unhold(qd->qd_gl);
- kfree(qd);
+ kmem_cache_free(gfs2_quotad_cachep, qd);
}
}
@@ -1257,7 +1259,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
gfs2_assert_warn(sdp, !qd->qd_bh_count);
gfs2_lvb_unhold(qd->qd_gl);
- kfree(qd);
+ kmem_cache_free(gfs2_quotad_cachep, qd);
spin_lock(&sdp->sd_quota_spin);
}
@@ -1272,3 +1274,94 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
}
}
+static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
+{
+ if (error == 0 || error == -EROFS)
+ return;
+ if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
+}
+
+static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
+ int (*fxn)(struct gfs2_sbd *sdp),
+ unsigned long t, unsigned long *timeo,
+ unsigned int *new_timeo)
+{
+ if (t >= *timeo) {
+ int error = fxn(sdp);
+ quotad_error(sdp, msg, error);
+ *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
+ } else {
+ *timeo -= t;
+ }
+}
+
+static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
+{
+ struct gfs2_inode *ip;
+
+ while(1) {
+ ip = NULL;
+ spin_lock(&sdp->sd_trunc_lock);
+ if (!list_empty(&sdp->sd_trunc_list)) {
+ ip = list_entry(sdp->sd_trunc_list.next,
+ struct gfs2_inode, i_trunc_list);
+ list_del_init(&ip->i_trunc_list);
+ }
+ spin_unlock(&sdp->sd_trunc_lock);
+ if (ip == NULL)
+ return;
+ gfs2_glock_finish_truncate(ip);
+ }
+}
+
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_quotad(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+ struct gfs2_tune *tune = &sdp->sd_tune;
+ unsigned long statfs_timeo = 0;
+ unsigned long quotad_timeo = 0;
+ unsigned long t = 0;
+ DEFINE_WAIT(wait);
+ int empty;
+
+ while (!kthread_should_stop()) {
+
+ /* Update the master statfs file */
+ quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+ &statfs_timeo, &tune->gt_statfs_quantum);
+
+ /* Update quota file */
+ quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
+ &quotad_timeo, &tune->gt_quota_quantum);
+
+ /* FIXME: This should be turned into a shrinker */
+ gfs2_quota_scan(sdp);
+
+ /* Check for & recover partially truncated inodes */
+ quotad_check_trunc_list(sdp);
+
+ if (freezing(current))
+ refrigerator();
+ t = min(quotad_timeo, statfs_timeo);
+
+ prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_lock(&sdp->sd_trunc_lock);
+ empty = list_empty(&sdp->sd_trunc_list);
+ spin_unlock(&sdp->sd_trunc_lock);
+ if (empty)
+ t -= schedule_timeout(t);
+ else
+ t = 0;
+ finish_wait(&sdp->sd_quota_wait, &wait);
+ }
+
+ return 0;
+}
+
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 3b7f4b0e5df..cec9032be97 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,22 +15,22 @@ struct gfs2_sbd;
#define NO_QUOTA_CHANGE ((u32)-1)
-int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_unhold(struct gfs2_inode *ip);
+extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern void gfs2_quota_unhold(struct gfs2_inode *ip);
-int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_unlock(struct gfs2_inode *ip);
+extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern void gfs2_quota_unlock(struct gfs2_inode *ip);
-int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
- u32 uid, u32 gid);
+extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+ u32 uid, u32 gid);
-int gfs2_quota_sync(struct gfs2_sbd *sdp);
-int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
+extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
+extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
-int gfs2_quota_init(struct gfs2_sbd *sdp);
-void gfs2_quota_scan(struct gfs2_sbd *sdp);
-void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+extern int gfs2_quota_init(struct gfs2_sbd *sdp);
+extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+extern int gfs2_quotad(void *data);
static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
{
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index d5e91f4f6a0..efd09c3d2b2 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,6 +14,8 @@
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/lm_interface.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include "gfs2.h"
#include "incore.h"
@@ -583,13 +585,35 @@ fail:
return error;
}
+static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
+{
+ struct gfs2_jdesc *jd;
+ int found = 0;
+
+ spin_lock(&sdp->sd_jindex_spin);
+
+ list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+ if (jd->jd_dirty) {
+ jd->jd_dirty = 0;
+ found = 1;
+ break;
+ }
+ }
+ spin_unlock(&sdp->sd_jindex_spin);
+
+ if (!found)
+ jd = NULL;
+
+ return jd;
+}
+
/**
* gfs2_check_journals - Recover any dirty journals
* @sdp: the filesystem
*
*/
-void gfs2_check_journals(struct gfs2_sbd *sdp)
+static void gfs2_check_journals(struct gfs2_sbd *sdp)
{
struct gfs2_jdesc *jd;
@@ -603,3 +627,25 @@ void gfs2_check_journals(struct gfs2_sbd *sdp)
}
}
+/**
+ * gfs2_recoverd - Recover dead machine's journals
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_recoverd(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+ unsigned long t;
+
+ while (!kthread_should_stop()) {
+ gfs2_check_journals(sdp);
+ t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
+ if (freezing(current))
+ refrigerator();
+ schedule_timeout_interruptible(t);
+ }
+
+ return 0;
+}
+
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index f7235e61c72..a8218ea15b5 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -18,17 +18,17 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
*blk = 0;
}
-int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
struct buffer_head **bh);
-int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
-int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
-void gfs2_revoke_clean(struct gfs2_sbd *sdp);
+extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
-int gfs2_find_jhead(struct gfs2_jdesc *jd,
+extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head);
-int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
-void gfs2_check_journals(struct gfs2_sbd *sdp);
+extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
+extern int gfs2_recoverd(void *data);
#endif /* __RECOVERY_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2d90fb25350..8b01c635d92 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -269,16 +269,14 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
bi->bi_len, x);
}
- if (count[0] != rgd->rd_rg.rg_free) {
+ if (count[0] != rgd->rd_free) {
if (gfs2_consist_rgrpd(rgd))
fs_err(sdp, "free data mismatch: %u != %u\n",
- count[0], rgd->rd_rg.rg_free);
+ count[0], rgd->rd_free);
return;
}
- tmp = rgd->rd_data -
- rgd->rd_rg.rg_free -
- rgd->rd_rg.rg_dinodes;
+ tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
if (count[1] + count[2] != tmp) {
if (gfs2_consist_rgrpd(rgd))
fs_err(sdp, "used data mismatch: %u != %u\n",
@@ -286,10 +284,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
return;
}
- if (count[3] != rgd->rd_rg.rg_dinodes) {
+ if (count[3] != rgd->rd_dinodes) {
if (gfs2_consist_rgrpd(rgd))
fs_err(sdp, "used metadata mismatch: %u != %u\n",
- count[3], rgd->rd_rg.rg_dinodes);
+ count[3], rgd->rd_dinodes);
return;
}
@@ -501,7 +499,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
for (rgrps = 0;; rgrps++) {
loff_t pos = rgrps * sizeof(struct gfs2_rindex);
- if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size)
+ if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
break;
error = gfs2_internal_read(ip, &ra_state, buf, &pos,
sizeof(struct gfs2_rindex));
@@ -590,7 +588,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct inode *inode = &ip->i_inode;
struct file_ra_state ra_state;
- u64 rgrp_count = ip->i_di.di_size;
+ u64 rgrp_count = ip->i_disksize;
int error;
if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) {
@@ -634,7 +632,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
/* Ignore partials */
if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
- ip->i_di.di_size)
+ ip->i_disksize)
break;
error = read_rindex_entry(ip, &ra_state);
if (error) {
@@ -692,7 +690,6 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
{
const struct gfs2_rgrp *str = buf;
- struct gfs2_rgrp_host *rg = &rgd->rd_rg;
u32 rg_flags;
rg_flags = be32_to_cpu(str->rg_flags);
@@ -700,24 +697,23 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
rgd->rd_flags |= GFS2_RDF_NOALLOC;
else
rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
- rg->rg_free = be32_to_cpu(str->rg_free);
- rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
- rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
+ rgd->rd_free = be32_to_cpu(str->rg_free);
+ rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
+ rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
}
static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
{
struct gfs2_rgrp *str = buf;
- struct gfs2_rgrp_host *rg = &rgd->rd_rg;
u32 rg_flags = 0;
if (rgd->rd_flags & GFS2_RDF_NOALLOC)
rg_flags |= GFS2_RGF_NOALLOC;
str->rg_flags = cpu_to_be32(rg_flags);
- str->rg_free = cpu_to_be32(rg->rg_free);
- str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
+ str->rg_free = cpu_to_be32(rgd->rd_free);
+ str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
str->__pad = cpu_to_be32(0);
- str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
+ str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
}
@@ -776,7 +772,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
}
spin_lock(&sdp->sd_rindex_spin);
- rgd->rd_free_clone = rgd->rd_rg.rg_free;
+ rgd->rd_free_clone = rgd->rd_free;
rgd->rd_bh_count++;
spin_unlock(&sdp->sd_rindex_spin);
@@ -850,7 +846,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
}
spin_lock(&sdp->sd_rindex_spin);
- rgd->rd_free_clone = rgd->rd_rg.rg_free;
+ rgd->rd_free_clone = rgd->rd_free;
spin_unlock(&sdp->sd_rindex_spin);
}
@@ -1403,8 +1399,8 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
block = rgd->rd_data0 + blk;
ip->i_goal = block;
- gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n);
- rgd->rd_rg.rg_free -= *n;
+ gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
+ rgd->rd_free -= *n;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1445,10 +1441,10 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
block = rgd->rd_data0 + blk;
- gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
- rgd->rd_rg.rg_free--;
- rgd->rd_rg.rg_dinodes++;
- *generation = rgd->rd_rg.rg_igeneration++;
+ gfs2_assert_withdraw(sdp, rgd->rd_free);
+ rgd->rd_free--;
+ rgd->rd_dinodes++;
+ *generation = rgd->rd_igeneration++;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1481,7 +1477,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
if (!rgd)
return;
- rgd->rd_rg.rg_free += blen;
+ rgd->rd_free += blen;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1509,7 +1505,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
if (!rgd)
return;
- rgd->rd_rg.rg_free += blen;
+ rgd->rd_free += blen;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1546,10 +1542,10 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
return;
gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
- if (!rgd->rd_rg.rg_dinodes)
+ if (!rgd->rd_dinodes)
gfs2_consist_rgrpd(rgd);
- rgd->rd_rg.rg_dinodes--;
- rgd->rd_rg.rg_free++;
+ rgd->rd_dinodes--;
+ rgd->rd_free++;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c3ba3d9d0aa..141b781f2fc 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -34,76 +34,6 @@
#include "util.h"
/**
- * gfs2_jindex_hold - Grab a lock on the jindex
- * @sdp: The GFS2 superblock
- * @ji_gh: the holder for the jindex glock
- *
- * This is very similar to the gfs2_rindex_hold() function, except that
- * in general we hold the jindex lock for longer periods of time and
- * we grab it far less frequently (in general) then the rgrp lock.
- *
- * Returns: errno
- */
-
-int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
-{
- struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
- struct qstr name;
- char buf[20];
- struct gfs2_jdesc *jd;
- int error;
-
- name.name = buf;
-
- mutex_lock(&sdp->sd_jindex_mutex);
-
- for (;;) {
- error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
- if (error)
- break;
-
- name.len = sprintf(buf, "journal%u", sdp->sd_journals);
- name.hash = gfs2_disk_hash(name.name, name.len);
-
- error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
- if (error == -ENOENT) {
- error = 0;
- break;
- }
-
- gfs2_glock_dq_uninit(ji_gh);
-
- if (error)
- break;
-
- error = -ENOMEM;
- jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
- if (!jd)
- break;
-
- INIT_LIST_HEAD(&jd->extent_list);
- jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
- if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
- if (!jd->jd_inode)
- error = -ENOENT;
- else
- error = PTR_ERR(jd->jd_inode);
- kfree(jd);
- break;
- }
-
- spin_lock(&sdp->sd_jindex_spin);
- jd->jd_jid = sdp->sd_journals++;
- list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
- spin_unlock(&sdp->sd_jindex_spin);
- }
-
- mutex_unlock(&sdp->sd_jindex_mutex);
-
- return error;
-}
-
-/**
* gfs2_jindex_free - Clear all the journal index information
* @sdp: The GFS2 superblock
*
@@ -166,39 +96,6 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
return jd;
}
-void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
-{
- struct gfs2_jdesc *jd;
-
- spin_lock(&sdp->sd_jindex_spin);
- jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
- if (jd)
- jd->jd_dirty = 1;
- spin_unlock(&sdp->sd_jindex_spin);
-}
-
-struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
-{
- struct gfs2_jdesc *jd;
- int found = 0;
-
- spin_lock(&sdp->sd_jindex_spin);
-
- list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
- if (jd->jd_dirty) {
- jd->jd_dirty = 0;
- found = 1;
- break;
- }
- }
- spin_unlock(&sdp->sd_jindex_spin);
-
- if (!found)
- jd = NULL;
-
- return jd;
-}
-
int gfs2_jdesc_check(struct gfs2_jdesc *jd)
{
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -206,14 +103,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
int ar;
int error;
- if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
- (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
+ if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
+ (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
gfs2_consist_inode(ip);
return -EIO;
}
- jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+ jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
- error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
+ error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
if (!error && ar) {
gfs2_consist_inode(ip);
error = -EIO;
@@ -423,137 +320,6 @@ out:
return error;
}
-/**
- * gfs2_statfs_i - Do a statfs
- * @sdp: the filesystem
- * @sg: the sg structure
- *
- * Returns: errno
- */
-
-int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
- struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
- struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
-
- spin_lock(&sdp->sd_statfs_spin);
-
- *sc = *m_sc;
- sc->sc_total += l_sc->sc_total;
- sc->sc_free += l_sc->sc_free;
- sc->sc_dinodes += l_sc->sc_dinodes;
-
- spin_unlock(&sdp->sd_statfs_spin);
-
- if (sc->sc_free < 0)
- sc->sc_free = 0;
- if (sc->sc_free > sc->sc_total)
- sc->sc_free = sc->sc_total;
- if (sc->sc_dinodes < 0)
- sc->sc_dinodes = 0;
-
- return 0;
-}
-
-/**
- * statfs_fill - fill in the sg for a given RG
- * @rgd: the RG
- * @sc: the sc structure
- *
- * Returns: 0 on success, -ESTALE if the LVB is invalid
- */
-
-static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
- struct gfs2_statfs_change_host *sc)
-{
- gfs2_rgrp_verify(rgd);
- sc->sc_total += rgd->rd_data;
- sc->sc_free += rgd->rd_rg.rg_free;
- sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
- return 0;
-}
-
-/**
- * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
- * @sdp: the filesystem
- * @sc: the sc info that will be returned
- *
- * Any error (other than a signal) will cause this routine to fall back
- * to the synchronous version.
- *
- * FIXME: This really shouldn't busy wait like this.
- *
- * Returns: errno
- */
-
-int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
- struct gfs2_holder ri_gh;
- struct gfs2_rgrpd *rgd_next;
- struct gfs2_holder *gha, *gh;
- unsigned int slots = 64;
- unsigned int x;
- int done;
- int error = 0, err;
-
- memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
- gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
- if (!gha)
- return -ENOMEM;
-
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- goto out;
-
- rgd_next = gfs2_rgrpd_get_first(sdp);
-
- for (;;) {
- done = 1;
-
- for (x = 0; x < slots; x++) {
- gh = gha + x;
-
- if (gh->gh_gl && gfs2_glock_poll(gh)) {
- err = gfs2_glock_wait(gh);
- if (err) {
- gfs2_holder_uninit(gh);
- error = err;
- } else {
- if (!error)
- error = statfs_slow_fill(
- gh->gh_gl->gl_object, sc);
- gfs2_glock_dq_uninit(gh);
- }
- }
-
- if (gh->gh_gl)
- done = 0;
- else if (rgd_next && !error) {
- error = gfs2_glock_nq_init(rgd_next->rd_gl,
- LM_ST_SHARED,
- GL_ASYNC,
- gh);
- rgd_next = gfs2_rgrpd_get_next(rgd_next);
- done = 0;
- }
-
- if (signal_pending(current))
- error = -ERESTARTSYS;
- }
-
- if (done)
- break;
-
- yield();
- }
-
- gfs2_glock_dq_uninit(&ri_gh);
-
-out:
- kfree(gha);
- return error;
-}
-
struct lfcc {
struct list_head list;
struct gfs2_holder gh;
@@ -580,10 +346,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
struct gfs2_log_header_host lh;
int error;
- error = gfs2_jindex_hold(sdp, &ji_gh);
- if (error)
- return error;
-
list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
if (!lfcc) {
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 50a4c9b1215..f6b8b00ad88 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -10,6 +10,8 @@
#ifndef __SUPER_DOT_H__
#define __SUPER_DOT_H__
+#include <linux/fs.h>
+#include <linux/dcache.h>
#include "incore.h"
void gfs2_lm_unmount(struct gfs2_sbd *sdp);
@@ -23,12 +25,9 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
return x;
}
-int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
void gfs2_jindex_free(struct gfs2_sbd *sdp);
struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
-void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
-struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
int gfs2_jdesc_check(struct gfs2_jdesc *jd);
int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
@@ -40,11 +39,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp);
void gfs2_statfs_change(struct gfs2_sbd *sdp,
s64 total, s64 free, s64 dinodes);
int gfs2_statfs_sync(struct gfs2_sbd *sdp);
-int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
-int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
int gfs2_freeze_fs(struct gfs2_sbd *sdp);
void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
+extern struct file_system_type gfs2_fs_type;
+extern struct file_system_type gfs2meta_fs_type;
+extern const struct export_operations gfs2_export_ops;
+extern const struct super_operations gfs2_super_ops;
+extern struct dentry_operations gfs2_dops;
+
#endif /* __SUPER_DOT_H__ */
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7e1879f1a02..26c1fa777a9 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,9 +26,6 @@
#include "quota.h"
#include "util.h"
-char *gfs2_sys_margs;
-spinlock_t gfs2_sys_margs_lock;
-
static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
{
return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -263,7 +260,6 @@ ARGS_ATTR(localcaching, "%d\n");
ARGS_ATTR(localflocks, "%d\n");
ARGS_ATTR(debug, "%d\n");
ARGS_ATTR(upgrade, "%d\n");
-ARGS_ATTR(num_glockd, "%u\n");
ARGS_ATTR(posix_acl, "%d\n");
ARGS_ATTR(quota, "%u\n");
ARGS_ATTR(suiddir, "%d\n");
@@ -279,7 +275,6 @@ static struct attribute *args_attrs[] = {
&args_attr_localflocks.attr,
&args_attr_debug.attr,
&args_attr_upgrade.attr,
- &args_attr_num_glockd.attr,
&args_attr_posix_acl.attr,
&args_attr_quota.attr,
&args_attr_suiddir.attr,
@@ -288,30 +283,6 @@ static struct attribute *args_attrs[] = {
};
/*
- * display counters from superblock
- */
-
-struct counters_attr {
- struct attribute attr;
- ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-
-#define COUNTERS_ATTR(name, fmt) \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
-{ \
- return snprintf(buf, PAGE_SIZE, fmt, \
- (unsigned int)atomic_read(&sdp->sd_##name)); \
-} \
-static struct counters_attr counters_attr_##name = __ATTR_RO(name)
-
-COUNTERS_ATTR(reclaimed, "%u\n");
-
-static struct attribute *counters_attrs[] = {
- &counters_attr_reclaimed.attr,
- NULL,
-};
-
-/*
* get and set struct gfs2_tune fields
*/
@@ -393,7 +364,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
} \
TUNE_ATTR_2(name, name##_store)
-TUNE_ATTR(demote_secs, 0);
TUNE_ATTR(incore_log_blocks, 0);
TUNE_ATTR(log_flush_secs, 0);
TUNE_ATTR(quota_warn_period, 0);
@@ -408,11 +378,9 @@ TUNE_ATTR(stall_secs, 1);
TUNE_ATTR(statfs_quantum, 1);
TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
TUNE_ATTR_DAEMON(logd_secs, logd_process);
-TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
static struct attribute *tune_attrs[] = {
- &tune_attr_demote_secs.attr,
&tune_attr_incore_log_blocks.attr,
&tune_attr_log_flush_secs.attr,
&tune_attr_quota_warn_period.attr,
@@ -426,7 +394,6 @@ static struct attribute *tune_attrs[] = {
&tune_attr_statfs_quantum.attr,
&tune_attr_recoverd_secs.attr,
&tune_attr_logd_secs.attr,
- &tune_attr_quotad_secs.attr,
&tune_attr_quota_scale.attr,
&tune_attr_new_files_jdata.attr,
NULL,
@@ -437,11 +404,6 @@ static struct attribute_group lockstruct_group = {
.attrs = lockstruct_attrs,
};
-static struct attribute_group counters_group = {
- .name = "counters",
- .attrs = counters_attrs,
-};
-
static struct attribute_group args_group = {
.name = "args",
.attrs = args_attrs,
@@ -466,13 +428,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
if (error)
goto fail_reg;
- error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
- if (error)
- goto fail_lockstruct;
-
error = sysfs_create_group(&sdp->sd_kobj, &args_group);
if (error)
- goto fail_counters;
+ goto fail_lockstruct;
error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
if (error)
@@ -483,8 +441,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
fail_args:
sysfs_remove_group(&sdp->sd_kobj, &args_group);
-fail_counters:
- sysfs_remove_group(&sdp->sd_kobj, &counters_group);
fail_lockstruct:
sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
fail_reg:
@@ -498,16 +454,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
{
sysfs_remove_group(&sdp->sd_kobj, &tune_group);
sysfs_remove_group(&sdp->sd_kobj, &args_group);
- sysfs_remove_group(&sdp->sd_kobj, &counters_group);
sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
kobject_put(&sdp->sd_kobj);
}
+static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
+ struct kobj_uevent_env *env)
+{
+ struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+ add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
+ add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+ return 0;
+}
+
+static struct kset_uevent_ops gfs2_uevent_ops = {
+ .uevent = gfs2_uevent,
+};
+
+
int gfs2_sys_init(void)
{
- gfs2_sys_margs = NULL;
- spin_lock_init(&gfs2_sys_margs_lock);
- gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
+ gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
if (!gfs2_kset)
return -ENOMEM;
return 0;
@@ -515,7 +482,6 @@ int gfs2_sys_init(void)
void gfs2_sys_uninit(void)
{
- kfree(gfs2_sys_margs);
kset_unregister(gfs2_kset);
}
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index 1ca8cdac530..e94560e836d 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -13,10 +13,6 @@
#include <linux/spinlock.h>
struct gfs2_sbd;
-/* Allow args to be passed to GFS2 when using an initial ram disk */
-extern char *gfs2_sys_margs;
-extern spinlock_t gfs2_sys_margs_lock;
-
int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index d31e355c61f..374f50e9549 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -25,6 +25,7 @@ struct kmem_cache *gfs2_glock_cachep __read_mostly;
struct kmem_cache *gfs2_inode_cachep __read_mostly;
struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
+struct kmem_cache *gfs2_quotad_cachep __read_mostly;
void gfs2_assert_i(struct gfs2_sbd *sdp)
{
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 7f48576289c..33e96b0ce9a 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -148,6 +148,7 @@ extern struct kmem_cache *gfs2_glock_cachep;
extern struct kmem_cache *gfs2_inode_cachep;
extern struct kmem_cache *gfs2_bufdata_cachep;
extern struct kmem_cache *gfs2_rgrpd_cachep;
+extern struct kmem_cache *gfs2_quotad_cachep;
static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
unsigned int *p)
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a31451ac17..5c538e0ec14 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -501,7 +501,7 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
{
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- *pagep = __grab_cache_page(mapping, index);
+ *pagep = grab_cache_page_write_begin(mapping, index, flags);
if (!*pagep)
return -ENOMEM;
return 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7d479ce3ace..6903d37af03 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
for (;;) {
struct page *page;
unsigned long nr, ret;
+ int ra;
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
@@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
*/
ret = len < nr ? len : nr;
if (clear_user(buf, ret))
- ret = -EFAULT;
+ ra = -EFAULT;
+ else
+ ra = 0;
} else {
/*
* We have the page, copy it to user space buffer.
*/
- ret = hugetlbfs_read_actor(page, offset, buf, len, nr);
+ ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
+ ret = ra;
}
- if (ret < 0) {
+ if (ra < 0) {
if (retval == 0)
- retval = ret;
+ retval = ra;
if (page)
page_cache_release(page);
goto out;
@@ -506,7 +510,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
inode->i_mode = mode;
inode->i_uid = uid;
inode->i_gid = gid;
- inode->i_blocks = 0;
inode->i_mapping->a_ops = &hugetlbfs_aops;
inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index 7de1cda9248..913ab2d9a5d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
#include <linux/bootmem.h>
#include <linux/inotify.h>
#include <linux/mount.h>
+#include <linux/async.h>
/*
* This is needed for the following functions:
@@ -110,8 +111,8 @@ static void wake_up_inode(struct inode *inode)
/**
* inode_init_always - perform inode structure intialisation
- * @sb - superblock inode belongs to.
- * @inode - inode to initialise
+ * @sb: superblock inode belongs to
+ * @inode: inode to initialise
*
* These are initializations that need to be done on every inode
* allocation as the fields are not initialised by slab allocation.
@@ -131,6 +132,8 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_op = &empty_iops;
inode->i_fop = &empty_fops;
inode->i_nlink = 1;
+ inode->i_uid = 0;
+ inode->i_gid = 0;
atomic_set(&inode->i_writecount, 0);
inode->i_size = 0;
inode->i_blocks = 0;
@@ -164,7 +167,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
- mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
+ mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->assoc_mapping = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
mapping->writeback_index = 0;
@@ -574,8 +577,8 @@ __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
/**
* inode_add_to_lists - add a new inode to relevant lists
- * @sb - superblock inode belongs to.
- * @inode - inode to mark in use
+ * @sb: superblock inode belongs to
+ * @inode: inode to mark in use
*
* When an inode is allocated it needs to be accounted for, added to the in use
* list, the owning superblock and the inode hash. This needs to be done under
@@ -599,7 +602,7 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
* @sb: superblock
*
* Allocates a new inode for given superblock. The default gfp_mask
- * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE.
+ * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
* If HIGHMEM pages are unsuitable or it is known that pages allocated
* for the page cache are not reclaimable or migratable,
* mapping_set_gfp_mask() must be called with suitable flags on the
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 43e8b2c0664..cc3f1aa1cf7 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -231,7 +231,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
-/*
+/**
+ * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
* @inode - the inode to map
* @arg - the pointer to userspace where we copy everything to
* @get_block - the fs's get_block function
@@ -242,11 +243,15 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
*
* If it is possible to have data blocks beyond a hole past @inode->i_size, then
* please do not use this function, it will stop at the first unmapped block
- * beyond i_size
+ * beyond i_size.
+ *
+ * If you use this function directly, you need to do your own locking. Use
+ * generic_block_fiemap if you want the locking done for you.
*/
-int generic_block_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo, u64 start,
- u64 len, get_block_t *get_block)
+
+int __generic_block_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo, u64 start,
+ u64 len, get_block_t *get_block)
{
struct buffer_head tmp;
unsigned int start_blk;
@@ -260,9 +265,6 @@ int generic_block_fiemap(struct inode *inode,
start_blk = logical_to_blk(inode, start);
- /* guard against change */
- mutex_lock(&inode->i_mutex);
-
length = (long long)min_t(u64, len, i_size_read(inode));
map_len = length;
@@ -334,14 +336,36 @@ int generic_block_fiemap(struct inode *inode,
cond_resched();
} while (1);
- mutex_unlock(&inode->i_mutex);
-
/* if ret is 1 then we just hit the end of the extent array */
if (ret == 1)
ret = 0;
return ret;
}
+EXPORT_SYMBOL(__generic_block_fiemap);
+
+/**
+ * generic_block_fiemap - FIEMAP for block based inodes
+ * @inode: The inode to map
+ * @fieinfo: The mapping information
+ * @start: The initial block to map
+ * @len: The length of the extect to attempt to map
+ * @get_block: The block mapping function for the fs
+ *
+ * Calls __generic_block_fiemap to map the inode, after taking
+ * the inode's mutex lock.
+ */
+
+int generic_block_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo, u64 start,
+ u64 len, get_block_t *get_block)
+{
+ int ret;
+ mutex_lock(&inode->i_mutex);
+ ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
EXPORT_SYMBOL(generic_block_fiemap);
#endif /* CONFIG_BLOCK */
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 3569e0ad86a..1a39ac37094 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -27,7 +27,7 @@
#include <linux/security.h>
#include <linux/pid_namespace.h>
-static int set_task_ioprio(struct task_struct *task, int ioprio)
+int set_task_ioprio(struct task_struct *task, int ioprio)
{
int err;
struct io_context *ioc;
@@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
task_unlock(task);
return err;
}
+EXPORT_SYMBOL_GPL(set_task_ioprio);
asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
{
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f8af0f1505..6147ec3643a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -855,10 +855,6 @@ root_found:
}
sbi->s_joliet_level = joliet_level;
- /* check the root inode */
- if (!inode->i_op)
- goto out_bad_root;
-
/* Make sure the root inode is a directory */
if (!S_ISDIR(inode->i_mode)) {
printk(KERN_WARNING
@@ -886,8 +882,6 @@ root_found:
/*
* Display error messages and free resources.
*/
-out_bad_root:
- printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
out_iput:
iput(inode);
goto out_no_inode;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c5..3fbffb1ea71 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
int flags;
int err;
unsigned long blocknr;
+ ktime_t start_time;
+ u64 commit_time;
char *tagp = NULL;
journal_header_t *header;
journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
journal->j_running_transaction = NULL;
+ start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
wake_up(&journal->j_wait_transaction_locked);
spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
+ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+
+ /*
+ * weight the commit time higher than the average time so we don't
+ * react too strongly to vast changes in commit time
+ */
+ if (likely(journal->j_average_commit_time))
+ journal->j_average_commit_time = (commit_time*3 +
+ journal->j_average_commit_time) / 4;
+ else
+ journal->j_average_commit_time = commit_time;
+
spin_unlock(&journal->j_state_lock);
if (commit_transaction->t_checkpoint_list == NULL &&
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d4c32c880..e6a11743127 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/hrtimer.h>
static void __journal_temp_unlink_buffer(struct journal_head *jh);
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
{
transaction->t_journal = journal;
transaction->t_state = T_RUNNING;
+ transaction->t_start_time = ktime_get();
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
@@ -752,7 +754,6 @@ out:
* int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
* @handle: transaction to add buffer modifications to
* @bh: bh to be used for metadata writes
- * @credits: variable that will receive credits for the buffer
*
* Returns an error code or 0 on success.
*
@@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
- int old_handle_count, err;
+ int err;
pid_t pid;
J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle)
* on IO anyway. Speeds up many-threaded, many-dir operations
* by 30x or more...
*
+ * We try and optimize the sleep time against what the underlying disk
+ * can do, instead of having a static sleep time. This is usefull for
+ * the case where our storage is so fast that it is more optimal to go
+ * ahead and force a flush and wait for the transaction to be committed
+ * than it is to wait for an arbitrary amount of time for new writers to
+ * join the transaction. We acheive this by measuring how long it takes
+ * to commit a transaction, and compare it with how long this
+ * transaction has been running, and if run time < commit time then we
+ * sleep for the delta and commit. This greatly helps super fast disks
+ * that would see slowdowns as more threads started doing fsyncs.
+ *
* But don't do this if this process was the most recent one to
* perform a synchronous write. We do this to detect the case where a
* single process is doing a stream of sync writes. No point in waiting
@@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle)
*/
pid = current->pid;
if (handle->h_sync && journal->j_last_sync_writer != pid) {
+ u64 commit_time, trans_time;
+
journal->j_last_sync_writer = pid;
- do {
- old_handle_count = transaction->t_handle_count;
- schedule_timeout_uninterruptible(1);
- } while (old_handle_count != transaction->t_handle_count);
+
+ spin_lock(&journal->j_state_lock);
+ commit_time = journal->j_average_commit_time;
+ spin_unlock(&journal->j_state_lock);
+
+ trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+ transaction->t_start_time));
+
+ commit_time = min_t(u64, commit_time,
+ 1000*jiffies_to_usecs(1));
+
+ if (trans_time < commit_time) {
+ ktime_t expires = ktime_add_ns(ktime_get(),
+ commit_time);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ }
}
current->journal_info = NULL;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9497718fe92..17159cacbd9 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -249,16 +249,14 @@ restart:
return ret;
}
-#define NR_BATCH 64
-
static void
-__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
+__flush_batch(journal_t *journal, int *batch_count)
{
int i;
- ll_rw_block(SWRITE, *batch_count, bhs);
+ ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
for (i = 0; i < *batch_count; i++) {
- struct buffer_head *bh = bhs[i];
+ struct buffer_head *bh = journal->j_chkpt_bhs[i];
clear_buffer_jwrite(bh);
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __process_buffer(journal_t *journal, struct journal_head *jh,
- struct buffer_head **bhs, int *batch_count,
- transaction_t *transaction)
+ int *batch_count, transaction_t *transaction)
{
struct buffer_head *bh = jh2bh(jh);
int ret = 0;
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
get_bh(bh);
J_ASSERT_BH(bh, !buffer_jwrite(bh));
set_buffer_jwrite(bh);
- bhs[*batch_count] = bh;
+ journal->j_chkpt_bhs[*batch_count] = bh;
__buffer_relink_io(jh);
jbd_unlock_bh_state(bh);
transaction->t_chp_stats.cs_written++;
(*batch_count)++;
- if (*batch_count == NR_BATCH) {
+ if (*batch_count == JBD2_NR_BATCH) {
spin_unlock(&journal->j_list_lock);
- __flush_batch(journal, bhs, batch_count);
+ __flush_batch(journal, batch_count);
ret = 1;
}
}
@@ -388,7 +385,6 @@ restart:
if (journal->j_checkpoint_transactions == transaction &&
transaction->t_tid == this_tid) {
int batch_count = 0;
- struct buffer_head *bhs[NR_BATCH];
struct journal_head *jh;
int retry = 0, err;
@@ -402,7 +398,7 @@ restart:
retry = 1;
break;
}
- retry = __process_buffer(journal, jh, bhs, &batch_count,
+ retry = __process_buffer(journal, jh, &batch_count,
transaction);
if (retry < 0 && !result)
result = retry;
@@ -419,7 +415,7 @@ restart:
spin_unlock(&journal->j_list_lock);
retry = 1;
}
- __flush_batch(journal, bhs, &batch_count);
+ __flush_batch(journal, &batch_count);
}
if (retry) {
@@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
safely remove this transaction from the log */
__jbd2_journal_drop_transaction(journal, transaction);
+ kfree(transaction);
/* Just in case anybody was waiting for more transactions to be
checkpointed... */
@@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(journal->j_running_transaction != transaction);
jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
- kfree(transaction);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index ebc667bc54a..62804e57a44 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
+#include <linux/bio.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
set_buffer_ordered(bh);
barrier_done = 1;
}
- ret = submit_bh(WRITE, bh);
+ ret = submit_bh(WRITE_SYNC, bh);
if (barrier_done)
clear_buffer_ordered(bh);
@@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
lock_buffer(bh);
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
- ret = submit_bh(WRITE, bh);
+ ret = submit_bh(WRITE_SYNC, bh);
}
*cbh = bh;
return ret;
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal,
* This function along with journal_submit_commit_record
* allows to write the commit record asynchronously.
*/
-static int journal_wait_on_commit_record(struct buffer_head *bh)
+static int journal_wait_on_commit_record(journal_t *journal,
+ struct buffer_head *bh)
{
int ret = 0;
+retry:
clear_buffer_dirty(bh);
wait_on_buffer(bh);
+ if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
+ printk(KERN_WARNING
+ "JBD2: wait_on_commit_record: sync failed on %s - "
+ "disabling barriers\n", journal->j_devname);
+ spin_lock(&journal->j_state_lock);
+ journal->j_flags &= ~JBD2_BARRIER;
+ spin_unlock(&journal->j_state_lock);
+
+ lock_buffer(bh);
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ bh->b_end_io = journal_end_buffer_io_sync;
+
+ ret = submit_bh(WRITE_SYNC, bh);
+ if (ret) {
+ unlock_buffer(bh);
+ return ret;
+ }
+ goto retry;
+ }
if (unlikely(!buffer_uptodate(bh)))
ret = -EIO;
@@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
int flags;
int err;
unsigned long long blocknr;
+ ktime_t start_time;
+ u64 commit_time;
char *tagp = NULL;
journal_header_t *header;
journal_block_tag_t *tag = NULL;
int space_left = 0;
int first_tag = 0;
int tag_flag;
- int i;
+ int i, to_free = 0;
int tag_bytes = journal_tag_bytes(journal);
struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0;
@@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
journal->j_running_transaction = NULL;
+ start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
wake_up(&journal->j_wait_transaction_locked);
spin_unlock(&journal->j_state_lock);
@@ -509,6 +535,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (is_journal_aborted(journal)) {
clear_buffer_jbddirty(jh2bh(jh));
JBUFFER_TRACE(jh, "journal is aborting: refile");
+ jbd2_buffer_abort_trigger(jh,
+ jh->b_frozen_data ?
+ jh->b_frozen_triggers :
+ jh->b_triggers);
jbd2_journal_refile_buffer(journal, jh);
/* If that was the last one, we need to clean up
* any descriptor buffers which may have been
@@ -799,7 +829,7 @@ wait_for_iobuf:
__jbd2_journal_abort_hard(journal);
}
if (!err && !is_journal_aborted(journal))
- err = journal_wait_on_commit_record(cbh);
+ err = journal_wait_on_commit_record(journal, cbh);
if (err)
jbd2_journal_abort(journal, err);
@@ -844,6 +874,9 @@ restart_loop:
* data.
*
* Otherwise, we can just throw away the frozen data now.
+ *
+ * We also know that the frozen data has already fired
+ * its triggers if they exist, so we can clear that too.
*/
if (jh->b_committed_data) {
jbd2_free(jh->b_committed_data, bh->b_size);
@@ -851,10 +884,12 @@ restart_loop:
if (jh->b_frozen_data) {
jh->b_committed_data = jh->b_frozen_data;
jh->b_frozen_data = NULL;
+ jh->b_frozen_triggers = NULL;
}
} else if (jh->b_frozen_data) {
jbd2_free(jh->b_frozen_data, bh->b_size);
jh->b_frozen_data = NULL;
+ jh->b_frozen_triggers = NULL;
}
spin_lock(&journal->j_list_lock);
@@ -972,14 +1007,23 @@ restart_loop:
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
- spin_unlock(&journal->j_state_lock);
+ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
- if (journal->j_commit_callback)
- journal->j_commit_callback(journal, commit_transaction);
+ /*
+ * weight the commit time higher than the average time so we don't
+ * react too strongly to vast changes in the commit time
+ */
+ if (likely(journal->j_average_commit_time))
+ journal->j_average_commit_time = (commit_time +
+ journal->j_average_commit_time*3) / 4;
+ else
+ journal->j_average_commit_time = commit_time;
+ spin_unlock(&journal->j_state_lock);
if (commit_transaction->t_checkpoint_list == NULL &&
commit_transaction->t_checkpoint_io_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
+ to_free = 1;
} else {
if (journal->j_checkpoint_transactions == NULL) {
journal->j_checkpoint_transactions = commit_transaction;
@@ -998,11 +1042,16 @@ restart_loop:
}
spin_unlock(&journal->j_list_lock);
+ if (journal->j_commit_callback)
+ journal->j_commit_callback(journal, commit_transaction);
+
trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
- journal->j_devname, journal->j_commit_sequence,
+ journal->j_devname, commit_transaction->t_tid,
journal->j_tail_sequence);
jbd_debug(1, "JBD: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
+ if (to_free)
+ kfree(commit_transaction);
wake_up(&journal->j_wait_done_commit);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e70d657a19f..56675306ed8 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -40,6 +40,7 @@
#include <asm/uaccess.h>
#include <asm/page.h>
+#include <asm/div64.h>
EXPORT_SYMBOL(jbd2_journal_start);
EXPORT_SYMBOL(jbd2_journal_restart);
@@ -50,6 +51,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
+EXPORT_SYMBOL(jbd2_journal_set_triggers);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_release_buffer);
EXPORT_SYMBOL(jbd2_journal_forget);
@@ -65,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format);
EXPORT_SYMBOL(jbd2_journal_check_used_features);
EXPORT_SYMBOL(jbd2_journal_check_available_features);
EXPORT_SYMBOL(jbd2_journal_set_features);
-EXPORT_SYMBOL(jbd2_journal_create);
EXPORT_SYMBOL(jbd2_journal_load);
EXPORT_SYMBOL(jbd2_journal_destroy);
EXPORT_SYMBOL(jbd2_journal_abort);
@@ -131,8 +132,9 @@ static int kjournald2(void *arg)
journal->j_task = current;
wake_up(&journal->j_wait_done_commit);
- printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n",
- journal->j_commit_interval / HZ);
+ printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
+ "commit interval %ld seconds\n", current->pid,
+ journal->j_devname, journal->j_commit_interval / HZ);
/*
* And now, wait forever for commit wakeup events.
@@ -290,6 +292,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
+ struct jbd2_buffer_trigger_type *triggers;
/*
* The buffer really shouldn't be locked: only the current committing
@@ -314,13 +317,23 @@ repeat:
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
new_offset = offset_in_page(jh_in->b_frozen_data);
+ triggers = jh_in->b_frozen_triggers;
} else {
new_page = jh2bh(jh_in)->b_page;
new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+ triggers = jh_in->b_triggers;
}
mapped_data = kmap_atomic(new_page, KM_USER0);
/*
+ * Fire any commit trigger. Do this before checking for escaping,
+ * as the trigger may modify the magic offset. If a copy-out
+ * happens afterwards, it will have the correct data in the buffer.
+ */
+ jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
+ triggers);
+
+ /*
* Check for escaping
*/
if (*((__be32 *)(mapped_data + new_offset)) ==
@@ -352,6 +365,13 @@ repeat:
new_page = virt_to_page(tmp);
new_offset = offset_in_page(tmp);
done_copy_out = 1;
+
+ /*
+ * This isn't strictly necessary, as we're using frozen
+ * data for the escaping, but it keeps consistency with
+ * b_frozen_data usage.
+ */
+ jh_in->b_frozen_triggers = jh_in->b_triggers;
}
/*
@@ -631,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
return NULL;
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ if (!bh)
+ return NULL;
lock_buffer(bh);
memset(bh->b_data, 0, journal->j_blocksize);
set_buffer_uptodate(bh);
@@ -824,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
seq_printf(seq, " %ums logging transaction\n",
jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
+ seq_printf(seq, " %luus average transaction commit time\n",
+ do_div(s->journal->j_average_commit_time, 1000));
seq_printf(seq, " %lu handles per transaction\n",
s->stats->u.run.rs_handle_count / s->stats->ts_tid);
seq_printf(seq, " %lu blocks per transaction\n",
@@ -961,6 +985,8 @@ static journal_t * journal_init_common (void)
spin_lock_init(&journal->j_state_lock);
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
+ journal->j_min_batch_time = 0;
+ journal->j_max_batch_time = 15000; /* 15ms */
/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT;
@@ -1016,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
/* journal descriptor can store up to n blocks -bzzz */
journal->j_blocksize = blocksize;
+ jbd2_stats_proc_init(journal);
n = journal->j_blocksize / sizeof(journal_block_tag_t);
journal->j_wbufsize = n;
journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
if (!journal->j_wbuf) {
printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
__func__);
- kfree(journal);
- journal = NULL;
- goto out;
+ goto out_err;
}
journal->j_dev = bdev;
journal->j_fs_dev = fs_dev;
@@ -1034,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
p = journal->j_devname;
while ((p = strchr(p, '/')))
*p = '!';
- jbd2_stats_proc_init(journal);
bh = __getblk(journal->j_dev, start, journal->j_blocksize);
- J_ASSERT(bh != NULL);
+ if (!bh) {
+ printk(KERN_ERR
+ "%s: Cannot get buffer for journal superblock\n",
+ __func__);
+ goto out_err;
+ }
journal->j_sb_buffer = bh;
journal->j_superblock = (journal_superblock_t *)bh->b_data;
-out:
+
return journal;
+out_err:
+ jbd2_stats_proc_exit(journal);
+ kfree(journal);
+ return NULL;
}
/**
@@ -1089,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
if (!journal->j_wbuf) {
printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
__func__);
- jbd2_stats_proc_exit(journal);
- kfree(journal);
- return NULL;
+ goto out_err;
}
err = jbd2_journal_bmap(journal, 0, &blocknr);
@@ -1099,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
if (err) {
printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
__func__);
- jbd2_stats_proc_exit(journal);
- kfree(journal);
- return NULL;
+ goto out_err;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
- J_ASSERT(bh != NULL);
+ if (!bh) {
+ printk(KERN_ERR
+ "%s: Cannot get buffer for journal superblock\n",
+ __func__);
+ goto out_err;
+ }
journal->j_sb_buffer = bh;
journal->j_superblock = (journal_superblock_t *)bh->b_data;
return journal;
+out_err:
+ jbd2_stats_proc_exit(journal);
+ kfree(journal);
+ return NULL;
}
/*
@@ -1158,77 +1196,6 @@ static int journal_reset(journal_t *journal)
}
/**
- * int jbd2_journal_create() - Initialise the new journal file
- * @journal: Journal to create. This structure must have been initialised
- *
- * Given a journal_t structure which tells us which disk blocks we can
- * use, create a new journal superblock and initialise all of the
- * journal fields from scratch.
- **/
-int jbd2_journal_create(journal_t *journal)
-{
- unsigned long long blocknr;
- struct buffer_head *bh;
- journal_superblock_t *sb;
- int i, err;
-
- if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
- printk (KERN_ERR "Journal length (%d blocks) too short.\n",
- journal->j_maxlen);
- journal_fail_superblock(journal);
- return -EINVAL;
- }
-
- if (journal->j_inode == NULL) {
- /*
- * We don't know what block to start at!
- */
- printk(KERN_EMERG
- "%s: creation of journal on external device!\n",
- __func__);
- BUG();
- }
-
- /* Zero out the entire journal on disk. We cannot afford to
- have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
- jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
- for (i = 0; i < journal->j_maxlen; i++) {
- err = jbd2_journal_bmap(journal, i, &blocknr);
- if (err)
- return err;
- bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
- lock_buffer(bh);
- memset (bh->b_data, 0, journal->j_blocksize);
- BUFFER_TRACE(bh, "marking dirty");
- mark_buffer_dirty(bh);
- BUFFER_TRACE(bh, "marking uptodate");
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- __brelse(bh);
- }
-
- sync_blockdev(journal->j_dev);
- jbd_debug(1, "JBD: journal cleared.\n");
-
- /* OK, fill in the initial static fields in the new superblock */
- sb = journal->j_superblock;
-
- sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
- sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
-
- sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
- sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
- sb->s_first = cpu_to_be32(1);
-
- journal->j_transaction_sequence = 1;
-
- journal->j_flags &= ~JBD2_ABORT;
- journal->j_format_version = 2;
-
- return journal_reset(journal);
-}
-
-/**
* void jbd2_journal_update_superblock() - Update journal sb on disk.
* @journal: The journal to update.
* @wait: Set to '0' if you don't want to wait for IO completion.
@@ -1472,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal)
spin_lock(&journal->j_list_lock);
while (journal->j_checkpoint_transactions != NULL) {
spin_unlock(&journal->j_list_lock);
+ mutex_lock(&journal->j_checkpoint_mutex);
jbd2_log_do_checkpoint(journal);
+ mutex_unlock(&journal->j_checkpoint_mutex);
spin_lock(&journal->j_list_lock);
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805a599..46b4e347ed7 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/hrtimer.h>
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
{
transaction->t_journal = journal;
transaction->t_state = T_RUNNING;
+ transaction->t_start_time = ktime_get();
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
@@ -741,6 +743,12 @@ done:
source = kmap_atomic(page, KM_USER0);
memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
kunmap_atomic(source, KM_USER0);
+
+ /*
+ * Now that the frozen data is saved off, we need to store
+ * any matching triggers.
+ */
+ jh->b_frozen_triggers = jh->b_triggers;
}
jbd_unlock_bh_state(bh);
@@ -944,6 +952,47 @@ out:
}
/**
+ * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * @bh: buffer to trigger on
+ * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
+ *
+ * Set any triggers on this journal_head. This is always safe, because
+ * triggers for a committing buffer will be saved off, and triggers for
+ * a running transaction will match the buffer in that transaction.
+ *
+ * Call with NULL to clear the triggers.
+ */
+void jbd2_journal_set_triggers(struct buffer_head *bh,
+ struct jbd2_buffer_trigger_type *type)
+{
+ struct journal_head *jh = bh2jh(bh);
+
+ jh->b_triggers = type;
+}
+
+void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+ struct jbd2_buffer_trigger_type *triggers)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (!triggers || !triggers->t_commit)
+ return;
+
+ triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+}
+
+void jbd2_buffer_abort_trigger(struct journal_head *jh,
+ struct jbd2_buffer_trigger_type *triggers)
+{
+ if (!triggers || !triggers->t_abort)
+ return;
+
+ triggers->t_abort(triggers, jh2bh(jh));
+}
+
+
+
+/**
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
@@ -1193,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
- int old_handle_count, err;
+ int err;
pid_t pid;
J_ASSERT(journal_current_handle() == handle);
@@ -1216,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
/*
* Implement synchronous transaction batching. If the handle
* was synchronous, don't force a commit immediately. Let's
- * yield and let another thread piggyback onto this transaction.
- * Keep doing that while new threads continue to arrive.
- * It doesn't cost much - we're about to run a commit and sleep
- * on IO anyway. Speeds up many-threaded, many-dir operations
- * by 30x or more...
+ * yield and let another thread piggyback onto this
+ * transaction. Keep doing that while new threads continue to
+ * arrive. It doesn't cost much - we're about to run a commit
+ * and sleep on IO anyway. Speeds up many-threaded, many-dir
+ * operations by 30x or more...
*
- * But don't do this if this process was the most recent one to
- * perform a synchronous write. We do this to detect the case where a
- * single process is doing a stream of sync writes. No point in waiting
- * for joiners in that case.
+ * We try and optimize the sleep time against what the
+ * underlying disk can do, instead of having a static sleep
+ * time. This is useful for the case where our storage is so
+ * fast that it is more optimal to go ahead and force a flush
+ * and wait for the transaction to be committed than it is to
+ * wait for an arbitrary amount of time for new writers to
+ * join the transaction. We achieve this by measuring how
+ * long it takes to commit a transaction, and compare it with
+ * how long this transaction has been running, and if run time
+ * < commit time then we sleep for the delta and commit. This
+ * greatly helps super fast disks that would see slowdowns as
+ * more threads started doing fsyncs.
+ *
+ * But don't do this if this process was the most recent one
+ * to perform a synchronous write. We do this to detect the
+ * case where a single process is doing a stream of sync
+ * writes. No point in waiting for joiners in that case.
*/
pid = current->pid;
if (handle->h_sync && journal->j_last_sync_writer != pid) {
+ u64 commit_time, trans_time;
+
journal->j_last_sync_writer = pid;
- do {
- old_handle_count = transaction->t_handle_count;
- schedule_timeout_uninterruptible(1);
- } while (old_handle_count != transaction->t_handle_count);
+
+ spin_lock(&journal->j_state_lock);
+ commit_time = journal->j_average_commit_time;
+ spin_unlock(&journal->j_state_lock);
+
+ trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+ transaction->t_start_time));
+
+ commit_time = max_t(u64, commit_time,
+ 1000*journal->j_min_batch_time);
+ commit_time = min_t(u64, commit_time,
+ 1000*journal->j_max_batch_time);
+
+ if (trans_time < commit_time) {
+ ktime_t expires = ktime_add_ns(ktime_get(),
+ commit_time);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ }
}
current->journal_info = NULL;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index c73fa89b5f8..170d289ac78 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -22,9 +22,7 @@
#define BIT_DIVIDER_MIPS 1043
-static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */
-
-#include <linux/errno.h>
+static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241};
struct pushpull {
unsigned char *buf;
@@ -43,7 +41,9 @@ struct rubin_state {
int bits[8];
};
-static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve)
+static inline void init_pushpull(struct pushpull *pp, char *buf,
+ unsigned buflen, unsigned ofs,
+ unsigned reserve)
{
pp->buf = buf;
pp->buflen = buflen;
@@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen
static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
{
- if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) {
+ if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve))
return -ENOSPC;
- }
- if (bit) {
- pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7)));
- }
- else {
- pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7)));
- }
+ if (bit)
+ pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7)));
+ else
+ pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7)));
+
pp->ofs++;
return 0;
@@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits)
rs->p = (long) (2 * UPPER_BIT_RUBIN);
rs->bit_number = (long) 0;
rs->bit_divider = div;
+
for (c=0; c<8; c++)
rs->bits[c] = bits[c];
}
@@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
long i0, i1;
int ret;
- while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) {
+ while ((rs->q >= UPPER_BIT_RUBIN) ||
+ ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) {
rs->bit_number++;
ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0);
@@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
rs->p <<= 1;
}
i0 = A * rs->p / (A + B);
- if (i0 <= 0) {
+ if (i0 <= 0)
i0 = 1;
- }
- if (i0 >= rs->p) {
+
+ if (i0 >= rs->p)
i0 = rs->p - 1;
- }
+
i1 = rs->p - i0;
if (symbol == 0)
@@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits)
/* behalve lower */
rs->rec_q = 0;
- for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp)))
+ for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE;
+ rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp)))
;
}
-static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q)
+static void __do_decode(struct rubin_state *rs, unsigned long p,
+ unsigned long q)
{
register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN;
unsigned long rec_q;
@@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B)
__do_decode(rs, p, q);
i0 = A * rs->p / (A + B);
- if (i0 <= 0) {
+ if (i0 <= 0)
i0 = 1;
- }
- if (i0 >= rs->p) {
+
+ if (i0 >= rs->p)
i0 = rs->p - 1;
- }
threshold = rs->q + i0;
symbol = rs->rec_q >= threshold;
@@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte)
struct rubin_state rs_copy;
rs_copy = *rs;
- for (i=0;i<8;i++) {
- ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1);
+ for (i=0; i<8; i++) {
+ ret = encode(rs, rs->bit_divider-rs->bits[i],
+ rs->bits[i], byte & 1);
if (ret) {
/* Failed. Restore old state */
*rs = rs_copy;
return ret;
}
- byte=byte>>1;
+ byte >>= 1 ;
}
return 0;
}
@@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs)
int i, result = 0, bit_divider = rs->bit_divider;
for (i = 0; i < 8; i++)
- result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i;
+ result |= decode(rs, bit_divider - rs->bits[i],
+ rs->bits[i]) << i;
return result;
}
@@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs)
static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
- unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen)
+ unsigned char *cpage_out, uint32_t *sourcelen,
+ uint32_t *dstlen)
{
int outpos = 0;
int pos=0;
@@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
uint32_t *sourcelen, uint32_t *dstlen, void *model)
{
- return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen);
+ return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
+ cpage_out, sourcelen, dstlen);
}
#endif
static int jffs2_dynrubin_compress(unsigned char *data_in,
@@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
return -1;
memset(histo, 0, 256);
- for (i=0; i<mysrclen; i++) {
+ for (i=0; i<mysrclen; i++)
histo[data_in[i]]++;
- }
memset(bits, 0, sizeof(int)*8);
for (i=0; i<256; i++) {
if (i&128)
@@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
cpage_out[i] = bits[i];
}
- ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen);
+ ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen,
+ &mydstlen);
if (ret)
return ret;
@@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
return 0;
}
-static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in,
- unsigned char *page_out, uint32_t srclen, uint32_t destlen)
+static void rubin_do_decompress(int bit_divider, int *bits,
+ unsigned char *cdata_in,
+ unsigned char *page_out, uint32_t srclen,
+ uint32_t destlen)
{
int outpos = 0;
struct rubin_state rs;
@@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata
init_pushpull(&rs.pp, cdata_in, srclen, 0, 0);
init_decode(&rs, bit_divider, bits);
- while (outpos < destlen) {
+ while (outpos < destlen)
page_out[outpos++] = in_byte(&rs);
- }
}
@@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
uint32_t sourcelen, uint32_t dstlen,
void *model)
{
- rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen);
+ rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
+ cpage_out, sourcelen, dstlen);
return 0;
}
@@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in,
for (c=0; c<8; c++)
bits[c] = data_in[c];
- rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen);
+ rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8,
+ dstlen);
return 0;
}
static struct jffs2_compressor jffs2_rubinmips_comp = {
- .priority = JFFS2_RUBINMIPS_PRIORITY,
- .name = "rubinmips",
- .compr = JFFS2_COMPR_DYNRUBIN,
- .compress = NULL, /*&jffs2_rubinmips_compress,*/
- .decompress = &jffs2_rubinmips_decompress,
+ .priority = JFFS2_RUBINMIPS_PRIORITY,
+ .name = "rubinmips",
+ .compr = JFFS2_COMPR_DYNRUBIN,
+ .compress = NULL, /*&jffs2_rubinmips_compress,*/
+ .decompress = &jffs2_rubinmips_decompress,
#ifdef JFFS2_RUBINMIPS_DISABLED
- .disabled = 1,
+ .disabled = 1,
#else
- .disabled = 0,
+ .disabled = 0,
#endif
};
int jffs2_rubinmips_init(void)
{
- return jffs2_register_compressor(&jffs2_rubinmips_comp);
+ return jffs2_register_compressor(&jffs2_rubinmips_comp);
}
void jffs2_rubinmips_exit(void)
{
- jffs2_unregister_compressor(&jffs2_rubinmips_comp);
+ jffs2_unregister_compressor(&jffs2_rubinmips_comp);
}
static struct jffs2_compressor jffs2_dynrubin_comp = {
- .priority = JFFS2_DYNRUBIN_PRIORITY,
- .name = "dynrubin",
- .compr = JFFS2_COMPR_RUBINMIPS,
- .compress = jffs2_dynrubin_compress,
- .decompress = &jffs2_dynrubin_decompress,
+ .priority = JFFS2_DYNRUBIN_PRIORITY,
+ .name = "dynrubin",
+ .compr = JFFS2_COMPR_RUBINMIPS,
+ .compress = jffs2_dynrubin_compress,
+ .decompress = &jffs2_dynrubin_decompress,
#ifdef JFFS2_DYNRUBIN_DISABLED
- .disabled = 1,
+ .disabled = 1,
#else
- .disabled = 0,
+ .disabled = 0,
#endif
};
int jffs2_dynrubin_init(void)
{
- return jffs2_register_compressor(&jffs2_dynrubin_comp);
+ return jffs2_register_compressor(&jffs2_dynrubin_comp);
}
void jffs2_dynrubin_exit(void)
{
- jffs2_unregister_compressor(&jffs2_dynrubin_comp);
+ jffs2_unregister_compressor(&jffs2_dynrubin_comp);
}
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 259461b910a..c32b4a1ad6c 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
{
/* For NAND, if the failure did not occur at the device level for a
specific physical page, don't bother updating the bad block table. */
- if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) {
+ if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) {
/* We had a device-level failure to erase. Let's see if we've
failed too many times. */
if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
@@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr)
struct erase_priv_struct *priv = (void *)instr->priv;
if(instr->state != MTD_ERASE_DONE) {
- printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state);
+ printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
+ (unsigned long long)instr->addr, instr->state);
jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr);
} else {
jffs2_erase_succeeded(priv->c, priv->jeb);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5a98aa87c85..5edc2bf2058 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -132,7 +132,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
uint32_t pageofs = index << PAGE_CACHE_SHIFT;
int ret = 0;
- pg = __grab_cache_page(mapping, index);
+ pg = grab_cache_page_write_begin(mapping, index, flags);
if (!pg)
return -ENOMEM;
*pagep = pg;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 1750445556c..507ed6ec184 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -366,9 +366,6 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c);
void jffs2_free_raw_node_refs(struct jffs2_sb_info *c);
struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset);
void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete);
-struct rb_node *rb_next(struct rb_node *);
-struct rb_node *rb_prev(struct rb_node *);
-void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root);
int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index d6363d8309d..0f94381ca6d 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -58,9 +58,9 @@
/*
* __mark_inode_dirty expects inodes to be hashed. Since we don't want
- * special inodes in the fileset inode space, we hash them to a dummy head
+ * special inodes in the fileset inode space, we make them appear hashed,
+ * but do not put on any lists.
*/
-static HLIST_HEAD(aggregate_hash);
/*
* imap locks
@@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
/* release the page */
release_metapage(mp);
- hlist_add_head(&ip->i_hash, &aggregate_hash);
+ /*
+ * that will look hashed, but won't be on any list; hlist_del()
+ * will work fine and require no locking.
+ */
+ ip->i_hash.pprev = &ip->i_hash.next;
return (ip);
}
diff --git a/fs/libfs.c b/fs/libfs.c
index e960a832190..49b44099dab 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
*/
root->i_ino = 1;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
- root->i_uid = root->i_gid = 0;
root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
dentry = d_alloc(NULL, &d_name);
if (!dentry) {
@@ -360,7 +359,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
*/
inode->i_ino = 1;
inode->i_mode = S_IFDIR | 0755;
- inode->i_uid = inode->i_gid = 0;
- inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
@@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
if (!inode)
goto out;
inode->i_mode = S_IFREG | files->mode;
- inode->i_uid = inode->i_gid = 0;
- inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_fop = files->ops;
inode->i_ino = i;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 31668b690e0..dd7957064a8 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -16,7 +16,6 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
#define NLMDBG_FACILITY NLMDBG_CLIENT
#define NLMCLNT_GRACE_WAIT (5*HZ)
@@ -518,11 +517,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
unsigned char fl_type;
int status = -ENOLCK;
- if (nsm_monitor(host) < 0) {
- printk(KERN_NOTICE "lockd: failed to monitor %s\n",
- host->h_name);
+ if (nsm_monitor(host) < 0)
goto out;
- }
+
fl->fl_flags |= FL_ACCESS;
status = do_vfs_lock(fl);
fl->fl_flags = fl_flags;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index abdebf76b82..99d737bd432 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -15,7 +15,6 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
#include <linux/mutex.h>
#include <net/ipv6.h>
@@ -32,11 +31,6 @@ static int nrhosts;
static DEFINE_MUTEX(nlm_host_mutex);
static void nlm_gc_hosts(void);
-static struct nsm_handle *nsm_find(const struct sockaddr *sap,
- const size_t salen,
- const char *hostname,
- const size_t hostname_len,
- const int create);
struct nlm_lookup_host_info {
const int server; /* search for server|client */
@@ -105,32 +99,6 @@ static void nlm_clear_port(struct sockaddr *sap)
}
}
-static void nlm_display_address(const struct sockaddr *sap,
- char *buf, const size_t len)
-{
- const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
- const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-
- switch (sap->sa_family) {
- case AF_UNSPEC:
- snprintf(buf, len, "unspecified");
- break;
- case AF_INET:
- snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
- break;
- case AF_INET6:
- if (ipv6_addr_v4mapped(&sin6->sin6_addr))
- snprintf(buf, len, "%pI4",
- &sin6->sin6_addr.s6_addr32[3]);
- else
- snprintf(buf, len, "%pI6", &sin6->sin6_addr);
- break;
- default:
- snprintf(buf, len, "unsupported address family");
- break;
- }
-}
-
/*
* Common host lookup routine for server & client
*/
@@ -190,8 +158,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
atomic_inc(&nsm->sm_count);
else {
host = NULL;
- nsm = nsm_find(ni->sap, ni->salen,
- ni->hostname, ni->hostname_len, 1);
+ nsm = nsm_get_handle(ni->sap, ni->salen,
+ ni->hostname, ni->hostname_len);
if (!nsm) {
dprintk("lockd: nlm_lookup_host failed; "
"no nsm handle\n");
@@ -206,6 +174,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
goto out;
}
host->h_name = nsm->sm_name;
+ host->h_addrbuf = nsm->sm_addrbuf;
memcpy(nlm_addr(host), ni->sap, ni->salen);
host->h_addrlen = ni->salen;
nlm_clear_port(nlm_addr(host));
@@ -232,11 +201,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
nrhosts++;
- nlm_display_address((struct sockaddr *)&host->h_addr,
- host->h_addrbuf, sizeof(host->h_addrbuf));
- nlm_display_address((struct sockaddr *)&host->h_srcaddr,
- host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
-
dprintk("lockd: nlm_lookup_host created host %s\n",
host->h_name);
@@ -256,10 +220,8 @@ nlm_destroy_host(struct nlm_host *host)
BUG_ON(!list_empty(&host->h_lockowners));
BUG_ON(atomic_read(&host->h_count));
- /*
- * Release NSM handle and unmonitor host.
- */
nsm_unmonitor(host);
+ nsm_release(host->h_nsmhandle);
clnt = host->h_rpcclnt;
if (clnt != NULL)
@@ -378,8 +340,8 @@ nlm_bind_host(struct nlm_host *host)
{
struct rpc_clnt *clnt;
- dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
- host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
+ dprintk("lockd: nlm_bind_host %s (%s)\n",
+ host->h_name, host->h_addrbuf);
/* Lock host handle */
mutex_lock(&host->h_mutex);
@@ -481,35 +443,23 @@ void nlm_release_host(struct nlm_host *host)
}
}
-/*
- * We were notified that the host indicated by address &sin
- * has rebooted.
- * Release all resources held by that peer.
+/**
+ * nlm_host_rebooted - Release all resources held by rebooted host
+ * @info: pointer to decoded results of NLM_SM_NOTIFY call
+ *
+ * We were notified that the specified host has rebooted. Release
+ * all resources held by that peer.
*/
-void nlm_host_rebooted(const struct sockaddr_in *sin,
- const char *hostname,
- unsigned int hostname_len,
- u32 new_state)
+void nlm_host_rebooted(const struct nlm_reboot *info)
{
struct hlist_head *chain;
struct hlist_node *pos;
struct nsm_handle *nsm;
struct nlm_host *host;
- nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
- hostname, hostname_len, 0);
- if (nsm == NULL) {
- dprintk("lockd: never saw rebooted peer '%.*s' before\n",
- hostname_len, hostname);
+ nsm = nsm_reboot_lookup(info);
+ if (unlikely(nsm == NULL))
return;
- }
-
- dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
- hostname_len, hostname, nsm->sm_addrbuf);
-
- /* When reclaiming locks on this peer, make sure that
- * we set up a new notification */
- nsm->sm_monitored = 0;
/* Mark all hosts tied to this NSM state as having rebooted.
* We run the loop repeatedly, because we drop the host table
@@ -520,8 +470,8 @@ again: mutex_lock(&nlm_host_mutex);
for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
hlist_for_each_entry(host, pos, chain, h_hash) {
if (host->h_nsmhandle == nsm
- && host->h_nsmstate != new_state) {
- host->h_nsmstate = new_state;
+ && host->h_nsmstate != info->state) {
+ host->h_nsmstate = info->state;
host->h_state++;
nlm_get_host(host);
@@ -629,89 +579,3 @@ nlm_gc_hosts(void)
next_gc = jiffies + NLM_HOST_COLLECT;
}
-
-
-/*
- * Manage NSM handles
- */
-static LIST_HEAD(nsm_handles);
-static DEFINE_SPINLOCK(nsm_lock);
-
-static struct nsm_handle *nsm_find(const struct sockaddr *sap,
- const size_t salen,
- const char *hostname,
- const size_t hostname_len,
- const int create)
-{
- struct nsm_handle *nsm = NULL;
- struct nsm_handle *pos;
-
- if (!sap)
- return NULL;
-
- if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
- if (printk_ratelimit()) {
- printk(KERN_WARNING "Invalid hostname \"%.*s\" "
- "in NFS lock request\n",
- (int)hostname_len, hostname);
- }
- return NULL;
- }
-
-retry:
- spin_lock(&nsm_lock);
- list_for_each_entry(pos, &nsm_handles, sm_link) {
-
- if (hostname && nsm_use_hostnames) {
- if (strlen(pos->sm_name) != hostname_len
- || memcmp(pos->sm_name, hostname, hostname_len))
- continue;
- } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
- continue;
- atomic_inc(&pos->sm_count);
- kfree(nsm);
- nsm = pos;
- goto found;
- }
- if (nsm) {
- list_add(&nsm->sm_link, &nsm_handles);
- goto found;
- }
- spin_unlock(&nsm_lock);
-
- if (!create)
- return NULL;
-
- nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
- if (nsm == NULL)
- return NULL;
-
- memcpy(nsm_addr(nsm), sap, salen);
- nsm->sm_addrlen = salen;
- nsm->sm_name = (char *) (nsm + 1);
- memcpy(nsm->sm_name, hostname, hostname_len);
- nsm->sm_name[hostname_len] = '\0';
- nlm_display_address((struct sockaddr *)&nsm->sm_addr,
- nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
- atomic_set(&nsm->sm_count, 1);
- goto retry;
-
-found:
- spin_unlock(&nsm_lock);
- return nsm;
-}
-
-/*
- * Release an NSM handle
- */
-void
-nsm_release(struct nsm_handle *nsm)
-{
- if (!nsm)
- return;
- if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
- list_del(&nsm->sm_link);
- spin_unlock(&nsm_lock);
- kfree(nsm);
- }
-}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index ffd3461f75e..5e2c4d5ac82 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -9,35 +9,123 @@
#include <linux/types.h>
#include <linux/utsname.h>
#include <linux/kernel.h>
+#include <linux/ktime.h>
+
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xprtsock.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
-
#define NLMDBG_FACILITY NLMDBG_MONITOR
+#define NSM_PROGRAM 100024
+#define NSM_VERSION 1
+
+enum {
+ NSMPROC_NULL,
+ NSMPROC_STAT,
+ NSMPROC_MON,
+ NSMPROC_UNMON,
+ NSMPROC_UNMON_ALL,
+ NSMPROC_SIMU_CRASH,
+ NSMPROC_NOTIFY,
+};
+
+struct nsm_args {
+ struct nsm_private *priv;
+ u32 prog; /* RPC callback info */
+ u32 vers;
+ u32 proc;
-#define XDR_ADDRBUF_LEN (20)
+ char *mon_name;
+};
-static struct rpc_clnt * nsm_create(void);
+struct nsm_res {
+ u32 status;
+ u32 state;
+};
static struct rpc_program nsm_program;
+static LIST_HEAD(nsm_handles);
+static DEFINE_SPINLOCK(nsm_lock);
/*
* Local NSM state
*/
-int nsm_local_state;
+int __read_mostly nsm_local_state;
+int __read_mostly nsm_use_hostnames;
-/*
- * Common procedure for SM_MON/SM_UNMON calls
- */
-static int
-nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
+static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
+{
+ return (struct sockaddr *)&nsm->sm_addr;
+}
+
+static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
+ const size_t len)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
+}
+
+static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
+ const size_t len)
+{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+
+ if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+ snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
+ else if (sin6->sin6_scope_id != 0)
+ snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
+ sin6->sin6_scope_id);
+ else
+ snprintf(buf, len, "%pI6", &sin6->sin6_addr);
+}
+
+static void nsm_display_address(const struct sockaddr *sap,
+ char *buf, const size_t len)
+{
+ switch (sap->sa_family) {
+ case AF_INET:
+ nsm_display_ipv4_address(sap, buf, len);
+ break;
+ case AF_INET6:
+ nsm_display_ipv6_address(sap, buf, len);
+ break;
+ default:
+ snprintf(buf, len, "unsupported address family");
+ break;
+ }
+}
+
+static struct rpc_clnt *nsm_create(void)
+{
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+ };
+ struct rpc_create_args args = {
+ .protocol = XPRT_TRANSPORT_UDP,
+ .address = (struct sockaddr *)&sin,
+ .addrsize = sizeof(sin),
+ .servername = "rpc.statd",
+ .program = &nsm_program,
+ .version = NSM_VERSION,
+ .authflavor = RPC_AUTH_NULL,
+ };
+
+ return rpc_create(&args);
+}
+
+static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
{
struct rpc_clnt *clnt;
int status;
- struct nsm_args args;
+ struct nsm_args args = {
+ .priv = &nsm->sm_priv,
+ .prog = NLM_PROGRAM,
+ .vers = 3,
+ .proc = NLMPROC_NSM_NOTIFY,
+ .mon_name = nsm->sm_mon_name,
+ };
struct rpc_message msg = {
.rpc_argp = &args,
.rpc_resp = res,
@@ -46,22 +134,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
clnt = nsm_create();
if (IS_ERR(clnt)) {
status = PTR_ERR(clnt);
+ dprintk("lockd: failed to create NSM upcall transport, "
+ "status=%d\n", status);
goto out;
}
- memset(&args, 0, sizeof(args));
- args.mon_name = nsm->sm_name;
- args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
- args.prog = NLM_PROGRAM;
- args.vers = 3;
- args.proc = NLMPROC_NSM_NOTIFY;
memset(res, 0, sizeof(*res));
msg.rpc_proc = &clnt->cl_procinfo[proc];
status = rpc_call_sync(clnt, &msg, 0);
if (status < 0)
- printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n",
- status);
+ dprintk("lockd: NSM upcall RPC failed, status=%d\n",
+ status);
else
status = 0;
rpc_shutdown_client(clnt);
@@ -69,82 +153,272 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
return status;
}
-/*
- * Set up monitoring of a remote host
+/**
+ * nsm_monitor - Notify a peer in case we reboot
+ * @host: pointer to nlm_host of peer to notify
+ *
+ * If this peer is not already monitored, this function sends an
+ * upcall to the local rpc.statd to record the name/address of
+ * the peer to notify in case we reboot.
+ *
+ * Returns zero if the peer is monitored by the local rpc.statd;
+ * otherwise a negative errno value is returned.
*/
-int
-nsm_monitor(struct nlm_host *host)
+int nsm_monitor(const struct nlm_host *host)
{
struct nsm_handle *nsm = host->h_nsmhandle;
struct nsm_res res;
int status;
- dprintk("lockd: nsm_monitor(%s)\n", host->h_name);
- BUG_ON(nsm == NULL);
+ dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
if (nsm->sm_monitored)
return 0;
- status = nsm_mon_unmon(nsm, SM_MON, &res);
+ /*
+ * Choose whether to record the caller_name or IP address of
+ * this peer in the local rpc.statd's database.
+ */
+ nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
- if (status < 0 || res.status != 0)
- printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name);
+ status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
+ if (res.status != 0)
+ status = -EIO;
+ if (status < 0)
+ printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
else
nsm->sm_monitored = 1;
return status;
}
-/*
- * Cease to monitor remote host
+/**
+ * nsm_unmonitor - Unregister peer notification
+ * @host: pointer to nlm_host of peer to stop monitoring
+ *
+ * If this peer is monitored, this function sends an upcall to
+ * tell the local rpc.statd not to send this peer a notification
+ * when we reboot.
*/
-int
-nsm_unmonitor(struct nlm_host *host)
+void nsm_unmonitor(const struct nlm_host *host)
{
struct nsm_handle *nsm = host->h_nsmhandle;
struct nsm_res res;
- int status = 0;
-
- if (nsm == NULL)
- return 0;
- host->h_nsmhandle = NULL;
+ int status;
if (atomic_read(&nsm->sm_count) == 1
&& nsm->sm_monitored && !nsm->sm_sticky) {
- dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name);
+ dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
- status = nsm_mon_unmon(nsm, SM_UNMON, &res);
+ status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res);
+ if (res.status != 0)
+ status = -EIO;
if (status < 0)
printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
- host->h_name);
+ nsm->sm_name);
else
nsm->sm_monitored = 0;
}
- nsm_release(nsm);
- return status;
+}
+
+static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
+ const size_t len)
+{
+ struct nsm_handle *nsm;
+
+ list_for_each_entry(nsm, &nsm_handles, sm_link)
+ if (strlen(nsm->sm_name) == len &&
+ memcmp(nsm->sm_name, hostname, len) == 0)
+ return nsm;
+ return NULL;
+}
+
+static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
+{
+ struct nsm_handle *nsm;
+
+ list_for_each_entry(nsm, &nsm_handles, sm_link)
+ if (nlm_cmp_addr(nsm_addr(nsm), sap))
+ return nsm;
+ return NULL;
+}
+
+static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
+{
+ struct nsm_handle *nsm;
+
+ list_for_each_entry(nsm, &nsm_handles, sm_link)
+ if (memcmp(nsm->sm_priv.data, priv->data,
+ sizeof(priv->data)) == 0)
+ return nsm;
+ return NULL;
}
/*
- * Create NSM client for the local host
+ * Construct a unique cookie to match this nsm_handle to this monitored
+ * host. It is passed to the local rpc.statd via NSMPROC_MON, and
+ * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these
+ * requests.
+ *
+ * The NSM protocol requires that these cookies be unique while the
+ * system is running. We prefer a stronger requirement of making them
+ * unique across reboots. If user space bugs cause a stale cookie to
+ * be sent to the kernel, it could cause the wrong host to lose its
+ * lock state if cookies were not unique across reboots.
+ *
+ * The cookies are exposed only to local user space via loopback. They
+ * do not appear on the physical network. If we want greater security
+ * for some reason, nsm_init_private() could perform a one-way hash to
+ * obscure the contents of the cookie.
*/
-static struct rpc_clnt *
-nsm_create(void)
+static void nsm_init_private(struct nsm_handle *nsm)
{
- struct sockaddr_in sin = {
- .sin_family = AF_INET,
- .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
- .sin_port = 0,
- };
- struct rpc_create_args args = {
- .protocol = XPRT_TRANSPORT_UDP,
- .address = (struct sockaddr *)&sin,
- .addrsize = sizeof(sin),
- .servername = "localhost",
- .program = &nsm_program,
- .version = SM_VERSION,
- .authflavor = RPC_AUTH_NULL,
- };
+ u64 *p = (u64 *)&nsm->sm_priv.data;
+ struct timespec ts;
- return rpc_create(&args);
+ ktime_get_ts(&ts);
+ *p++ = timespec_to_ns(&ts);
+ *p = (unsigned long)nsm;
+}
+
+static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
+ const size_t salen,
+ const char *hostname,
+ const size_t hostname_len)
+{
+ struct nsm_handle *new;
+
+ new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL);
+ if (unlikely(new == NULL))
+ return NULL;
+
+ atomic_set(&new->sm_count, 1);
+ new->sm_name = (char *)(new + 1);
+ memcpy(nsm_addr(new), sap, salen);
+ new->sm_addrlen = salen;
+ nsm_init_private(new);
+ nsm_display_address((const struct sockaddr *)&new->sm_addr,
+ new->sm_addrbuf, sizeof(new->sm_addrbuf));
+ memcpy(new->sm_name, hostname, hostname_len);
+ new->sm_name[hostname_len] = '\0';
+
+ return new;
+}
+
+/**
+ * nsm_get_handle - Find or create a cached nsm_handle
+ * @sap: pointer to socket address of handle to find
+ * @salen: length of socket address
+ * @hostname: pointer to C string containing hostname to find
+ * @hostname_len: length of C string
+ *
+ * Behavior is modulated by the global nsm_use_hostnames variable.
+ *
+ * Returns a cached nsm_handle after bumping its ref count, or
+ * returns a fresh nsm_handle if a handle that matches @sap and/or
+ * @hostname cannot be found in the handle cache. Returns NULL if
+ * an error occurs.
+ */
+struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+ const size_t salen, const char *hostname,
+ const size_t hostname_len)
+{
+ struct nsm_handle *cached, *new = NULL;
+
+ if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING "Invalid hostname \"%.*s\" "
+ "in NFS lock request\n",
+ (int)hostname_len, hostname);
+ }
+ return NULL;
+ }
+
+retry:
+ spin_lock(&nsm_lock);
+
+ if (nsm_use_hostnames && hostname != NULL)
+ cached = nsm_lookup_hostname(hostname, hostname_len);
+ else
+ cached = nsm_lookup_addr(sap);
+
+ if (cached != NULL) {
+ atomic_inc(&cached->sm_count);
+ spin_unlock(&nsm_lock);
+ kfree(new);
+ dprintk("lockd: found nsm_handle for %s (%s), "
+ "cnt %d\n", cached->sm_name,
+ cached->sm_addrbuf,
+ atomic_read(&cached->sm_count));
+ return cached;
+ }
+
+ if (new != NULL) {
+ list_add(&new->sm_link, &nsm_handles);
+ spin_unlock(&nsm_lock);
+ dprintk("lockd: created nsm_handle for %s (%s)\n",
+ new->sm_name, new->sm_addrbuf);
+ return new;
+ }
+
+ spin_unlock(&nsm_lock);
+
+ new = nsm_create_handle(sap, salen, hostname, hostname_len);
+ if (unlikely(new == NULL))
+ return NULL;
+ goto retry;
+}
+
+/**
+ * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
+ * @info: pointer to NLMPROC_SM_NOTIFY arguments
+ *
+ * Returns a matching nsm_handle if found in the nsm cache; the returned
+ * nsm_handle's reference count is bumped and sm_monitored is cleared.
+ * Otherwise returns NULL if some error occurred.
+ */
+struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
+{
+ struct nsm_handle *cached;
+
+ spin_lock(&nsm_lock);
+
+ cached = nsm_lookup_priv(&info->priv);
+ if (unlikely(cached == NULL)) {
+ spin_unlock(&nsm_lock);
+ dprintk("lockd: never saw rebooted peer '%.*s' before\n",
+ info->len, info->mon);
+ return cached;
+ }
+
+ atomic_inc(&cached->sm_count);
+ spin_unlock(&nsm_lock);
+
+ /*
+ * During subsequent lock activity, force a fresh
+ * notification to be set up for this host.
+ */
+ cached->sm_monitored = 0;
+
+ dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
+ cached->sm_name, cached->sm_addrbuf,
+ atomic_read(&cached->sm_count));
+ return cached;
+}
+
+/**
+ * nsm_release - Release an NSM handle
+ * @nsm: pointer to handle to be released
+ *
+ */
+void nsm_release(struct nsm_handle *nsm)
+{
+ if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
+ list_del(&nsm->sm_link);
+ spin_unlock(&nsm_lock);
+ dprintk("lockd: destroyed nsm_handle for %s (%s)\n",
+ nsm->sm_name, nsm->sm_addrbuf);
+ kfree(nsm);
+ }
}
/*
@@ -154,127 +428,132 @@ nsm_create(void)
* Status Monitor wire protocol.
*/
-static __be32 *xdr_encode_nsm_string(__be32 *p, char *string)
+static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
{
- size_t len = strlen(string);
-
- if (len > SM_MAXSTRLEN)
- len = SM_MAXSTRLEN;
- return xdr_encode_opaque(p, string, len);
+ const u32 len = strlen(string);
+ __be32 *p;
+
+ if (unlikely(len > SM_MAXSTRLEN))
+ return -EIO;
+ p = xdr_reserve_space(xdr, sizeof(u32) + len);
+ if (unlikely(p == NULL))
+ return -EIO;
+ xdr_encode_opaque(p, string, len);
+ return 0;
}
/*
* "mon_name" specifies the host to be monitored.
- *
- * Linux uses a text version of the IP address of the remote
- * host as the host identifier (the "mon_name" argument).
- *
- * Linux statd always looks up the canonical hostname first for
- * whatever remote hostname it receives, so this works alright.
*/
-static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp)
+static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
{
- char buffer[XDR_ADDRBUF_LEN + 1];
- char *name = argp->mon_name;
-
- if (!nsm_use_hostnames) {
- snprintf(buffer, XDR_ADDRBUF_LEN,
- "%pI4", &argp->addr);
- name = buffer;
- }
-
- return xdr_encode_nsm_string(p, name);
+ return encode_nsm_string(xdr, argp->mon_name);
}
/*
* The "my_id" argument specifies the hostname and RPC procedure
* to be called when the status manager receives notification
- * (via the SM_NOTIFY call) that the state of host "mon_name"
+ * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
* has changed.
*/
-static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp)
+static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
{
- p = xdr_encode_nsm_string(p, utsname()->nodename);
- if (!p)
- return ERR_PTR(-EIO);
-
+ int status;
+ __be32 *p;
+
+ status = encode_nsm_string(xdr, utsname()->nodename);
+ if (unlikely(status != 0))
+ return status;
+ p = xdr_reserve_space(xdr, 3 * sizeof(u32));
+ if (unlikely(p == NULL))
+ return -EIO;
*p++ = htonl(argp->prog);
*p++ = htonl(argp->vers);
*p++ = htonl(argp->proc);
-
- return p;
+ return 0;
}
/*
* The "mon_id" argument specifies the non-private arguments
- * of an SM_MON or SM_UNMON call.
+ * of an NSMPROC_MON or NSMPROC_UNMON call.
*/
-static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp)
+static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
{
- p = xdr_encode_mon_name(p, argp);
- if (!p)
- return ERR_PTR(-EIO);
+ int status;
- return xdr_encode_my_id(p, argp);
+ status = encode_mon_name(xdr, argp);
+ if (unlikely(status != 0))
+ return status;
+ return encode_my_id(xdr, argp);
}
/*
* The "priv" argument may contain private information required
- * by the SM_MON call. This information will be supplied in the
- * SM_NOTIFY call.
- *
- * Linux provides the raw IP address of the monitored host,
- * left in network byte order.
+ * by the NSMPROC_MON call. This information will be supplied in the
+ * NLMPROC_SM_NOTIFY call.
*/
-static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp)
+static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
{
- *p++ = argp->addr;
- *p++ = 0;
- *p++ = 0;
- *p++ = 0;
+ __be32 *p;
- return p;
+ p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
+ if (unlikely(p == NULL))
+ return -EIO;
+ xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
+ return 0;
}
-static int
-xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
+static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
+ const struct nsm_args *argp)
{
- p = xdr_encode_mon_id(p, argp);
- if (IS_ERR(p))
- return PTR_ERR(p);
-
- p = xdr_encode_priv(p, argp);
- if (IS_ERR(p))
- return PTR_ERR(p);
-
- rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
- return 0;
+ struct xdr_stream xdr;
+ int status;
+
+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+ status = encode_mon_id(&xdr, argp);
+ if (unlikely(status))
+ return status;
+ return encode_priv(&xdr, argp);
}
-static int
-xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
+static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
+ const struct nsm_args *argp)
{
- p = xdr_encode_mon_id(p, argp);
- if (IS_ERR(p))
- return PTR_ERR(p);
- rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
- return 0;
+ struct xdr_stream xdr;
+
+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+ return encode_mon_id(&xdr, argp);
}
-static int
-xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
+static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
+ struct nsm_res *resp)
{
+ struct xdr_stream xdr;
+
+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+ p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
+ if (unlikely(p == NULL))
+ return -EIO;
resp->status = ntohl(*p++);
- resp->state = ntohl(*p++);
- dprintk("nsm: xdr_decode_stat_res status %d state %d\n",
+ resp->state = ntohl(*p);
+
+ dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
resp->status, resp->state);
return 0;
}
-static int
-xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
+static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
+ struct nsm_res *resp)
{
- resp->state = ntohl(*p++);
+ struct xdr_stream xdr;
+
+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+ p = xdr_inline_decode(&xdr, sizeof(u32));
+ if (unlikely(p == NULL))
+ return -EIO;
+ resp->state = ntohl(*p);
+
+ dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
return 0;
}
@@ -288,22 +567,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
#define SM_unmonres_sz 1
static struct rpc_procinfo nsm_procedures[] = {
-[SM_MON] = {
- .p_proc = SM_MON,
- .p_encode = (kxdrproc_t) xdr_encode_mon,
- .p_decode = (kxdrproc_t) xdr_decode_stat_res,
+[NSMPROC_MON] = {
+ .p_proc = NSMPROC_MON,
+ .p_encode = (kxdrproc_t)xdr_enc_mon,
+ .p_decode = (kxdrproc_t)xdr_dec_stat_res,
.p_arglen = SM_mon_sz,
.p_replen = SM_monres_sz,
- .p_statidx = SM_MON,
+ .p_statidx = NSMPROC_MON,
.p_name = "MONITOR",
},
-[SM_UNMON] = {
- .p_proc = SM_UNMON,
- .p_encode = (kxdrproc_t) xdr_encode_unmon,
- .p_decode = (kxdrproc_t) xdr_decode_stat,
+[NSMPROC_UNMON] = {
+ .p_proc = NSMPROC_UNMON,
+ .p_encode = (kxdrproc_t)xdr_enc_unmon,
+ .p_decode = (kxdrproc_t)xdr_dec_stat,
.p_arglen = SM_mon_id_sz,
.p_replen = SM_unmonres_sz,
- .p_statidx = SM_UNMON,
+ .p_statidx = NSMPROC_UNMON,
.p_name = "UNMONITOR",
},
};
@@ -322,7 +601,7 @@ static struct rpc_stat nsm_stats;
static struct rpc_program nsm_program = {
.name = "statd",
- .number = SM_PROGRAM,
+ .number = NSM_PROGRAM,
.nrvers = ARRAY_SIZE(nsm_version),
.version = nsm_version,
.stats = &nsm_stats
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 252d80163d0..64f1c31b585 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -35,7 +35,6 @@
#include <linux/sunrpc/svcsock.h>
#include <net/ip.h>
#include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
#include <linux/nfs.h>
#define NLMDBG_FACILITY NLMDBG_SVC
@@ -54,13 +53,26 @@ static struct svc_rqst *nlmsvc_rqst;
unsigned long nlmsvc_timeout;
/*
+ * If the kernel has IPv6 support available, always listen for
+ * both AF_INET and AF_INET6 requests.
+ */
+#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
+ defined(CONFIG_SUNRPC_REGISTER_V4)
+static const sa_family_t nlmsvc_family = AF_INET6;
+#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
+static const sa_family_t nlmsvc_family = AF_INET;
+#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
+
+/*
* These can be set at insmod time (useful for NFS as root filesystem),
* and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
*/
static unsigned long nlm_grace_period;
static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO;
static int nlm_udpport, nlm_tcpport;
-int nsm_use_hostnames = 0;
+
+/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
+static unsigned int nlm_max_connections = 1024;
/*
* Constants needed for the sysctl interface.
@@ -143,6 +155,9 @@ lockd(void *vrqstp)
long timeout = MAX_SCHEDULE_TIMEOUT;
RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+ /* update sv_maxconn if it has changed */
+ rqstp->rq_server->sv_maxconn = nlm_max_connections;
+
if (signalled()) {
flush_signals(current);
if (nlmsvc_ops) {
@@ -189,6 +204,19 @@ lockd(void *vrqstp)
return 0;
}
+static int create_lockd_listener(struct svc_serv *serv, char *name,
+ unsigned short port)
+{
+ struct svc_xprt *xprt;
+
+ xprt = svc_find_xprt(serv, name, 0, 0);
+ if (xprt == NULL)
+ return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
+
+ svc_xprt_put(xprt);
+ return 0;
+}
+
/*
* Ensure there are active UDP and TCP listeners for lockd.
*
@@ -202,29 +230,23 @@ lockd(void *vrqstp)
static int make_socks(struct svc_serv *serv)
{
static int warned;
- struct svc_xprt *xprt;
- int err = 0;
+ int err;
- xprt = svc_find_xprt(serv, "udp", 0, 0);
- if (!xprt)
- err = svc_create_xprt(serv, "udp", nlm_udpport,
- SVC_SOCK_DEFAULTS);
- else
- svc_xprt_put(xprt);
- if (err >= 0) {
- xprt = svc_find_xprt(serv, "tcp", 0, 0);
- if (!xprt)
- err = svc_create_xprt(serv, "tcp", nlm_tcpport,
- SVC_SOCK_DEFAULTS);
- else
- svc_xprt_put(xprt);
- }
- if (err >= 0) {
- warned = 0;
- err = 0;
- } else if (warned++ == 0)
+ err = create_lockd_listener(serv, "udp", nlm_udpport);
+ if (err < 0)
+ goto out_err;
+
+ err = create_lockd_listener(serv, "tcp", nlm_tcpport);
+ if (err < 0)
+ goto out_err;
+
+ warned = 0;
+ return 0;
+
+out_err:
+ if (warned++ == 0)
printk(KERN_WARNING
- "lockd_up: makesock failed, error=%d\n", err);
+ "lockd_up: makesock failed, error=%d\n", err);
return err;
}
@@ -252,7 +274,7 @@ int lockd_up(void)
"lockd_up: no pid, %d users??\n", nlmsvc_users);
error = -ENOMEM;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
goto out;
@@ -276,6 +298,7 @@ int lockd_up(void)
}
svc_sock_update_bufs(serv);
+ serv->sv_maxconn = nlm_max_connections;
nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
if (IS_ERR(nlmsvc_task)) {
@@ -485,6 +508,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
module_param_call(nlm_tcpport, param_set_port, param_get_int,
&nlm_tcpport, 0644);
module_param(nsm_use_hostnames, bool, 0644);
+module_param(nlm_max_connections, uint, 0644);
/*
* Initialising and terminating the module.
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4dfdcbc6bf6..1725037374c 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -16,8 +16,6 @@
#include <linux/nfsd/nfsd.h>
#include <linux/lockd/lockd.h>
#include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
-
#define NLMDBG_FACILITY NLMDBG_CLIENT
@@ -419,8 +417,6 @@ static __be32
nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
void *resp)
{
- struct sockaddr_in saddr;
-
dprintk("lockd: SM_NOTIFY called\n");
if (!nlm_privileged_requester(rqstp)) {
@@ -430,14 +426,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
return rpc_system_err;
}
- /* Obtain the host pointer for this NFS server and try to
- * reclaim all locks we hold on this server.
- */
- memset(&saddr, 0, sizeof(saddr));
- saddr.sin_family = AF_INET;
- saddr.sin_addr.s_addr = argp->addr;
- nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
-
+ nlm_host_rebooted(argp);
return rpc_success;
}
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3ca89e2a938..3688e55901f 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -16,8 +16,6 @@
#include <linux/nfsd/nfsd.h>
#include <linux/lockd/lockd.h>
#include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
-
#define NLMDBG_FACILITY NLMDBG_CLIENT
@@ -451,8 +449,6 @@ static __be32
nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
void *resp)
{
- struct sockaddr_in saddr;
-
dprintk("lockd: SM_NOTIFY called\n");
if (!nlm_privileged_requester(rqstp)) {
@@ -462,14 +458,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
return rpc_system_err;
}
- /* Obtain the host pointer for this NFS server and try to
- * reclaim all locks we hold on this server.
- */
- memset(&saddr, 0, sizeof(saddr));
- saddr.sin_family = AF_INET;
- saddr.sin_addr.s_addr = argp->addr;
- nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
-
+ nlm_host_rebooted(argp);
return rpc_success;
}
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 34c2766e27c..9e4d6aab611 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -17,7 +17,6 @@
#include <linux/nfsd/export.h>
#include <linux/lockd/lockd.h>
#include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
#include <linux/module.h>
#include <linux/mount.h>
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 1f226290c67..0336f2beacd 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -16,7 +16,6 @@
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/stats.h>
#include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
#define NLMDBG_FACILITY NLMDBG_XDR
@@ -349,8 +348,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
return 0;
argp->state = ntohl(*p++);
- /* Preserve the address in network byte order */
- argp->addr = *p++;
+ memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+ p += XDR_QUADLEN(SM_PRIV_SIZE);
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 50c493a8ad8..e1d52865319 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -17,7 +17,6 @@
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/stats.h>
#include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
#define NLMDBG_FACILITY NLMDBG_XDR
@@ -356,8 +355,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
return 0;
argp->state = ntohl(*p++);
- /* Preserve the address in network byte order */
- argp->addr = *p++;
+ memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+ p += XDR_QUADLEN(SM_PRIV_SIZE);
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index f70433816a3..d4946c4c90e 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
return -EINVAL;
got_it:
- pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page);
+ pos = page_offset(page) + p - (char *)page_address(page);
err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize,
AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
if (err)
diff --git a/fs/mpage.c b/fs/mpage.c
index 552b80b3fac..16c3ef37eae 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
first_hole = page_block;
page_block++;
block_in_file++;
- clear_buffer_mapped(map_bh);
continue;
}
@@ -308,7 +307,10 @@ alloc_new:
goto alloc_new;
}
- if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
+ relative_block = block_in_file - *first_logical_block;
+ nblocks = map_bh->b_size >> blkbits;
+ if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
+ (first_hole != blocks_per_page))
bio = mpage_bio_submit(READ, bio);
else
*last_block_in_bio = blocks[blocks_per_page - 1];
diff --git a/fs/namei.c b/fs/namei.c
index dd5c9f0bf82..f05bed24242 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -257,7 +257,7 @@ int inode_permission(struct inode *inode, int mask)
return -EACCES;
}
- if (inode->i_op && inode->i_op->permission)
+ if (inode->i_op->permission)
retval = inode->i_op->permission(inode, mask);
else
retval = generic_permission(inode, mask, NULL);
@@ -432,7 +432,7 @@ static int exec_permission_lite(struct inode *inode)
{
umode_t mode = inode->i_mode;
- if (inode->i_op && inode->i_op->permission)
+ if (inode->i_op->permission)
return -EAGAIN;
if (current_fsuid() == inode->i_uid)
@@ -908,9 +908,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
inode = next.dentry->d_inode;
if (!inode)
goto out_dput;
- err = -ENOTDIR;
- if (!inode->i_op)
- goto out_dput;
if (inode->i_op->follow_link) {
err = do_follow_link(&next, nd);
@@ -920,9 +917,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
inode = nd->path.dentry->d_inode;
if (!inode)
break;
- err = -ENOTDIR;
- if (!inode->i_op)
- break;
} else
path_to_nameidata(&next, nd);
err = -ENOTDIR;
@@ -961,7 +955,7 @@ last_component:
break;
inode = next.dentry->d_inode;
if ((lookup_flags & LOOKUP_FOLLOW)
- && inode && inode->i_op && inode->i_op->follow_link) {
+ && inode && inode->i_op->follow_link) {
err = do_follow_link(&next, nd);
if (err)
goto return_err;
@@ -973,7 +967,7 @@ last_component:
break;
if (lookup_flags & LOOKUP_DIRECTORY) {
err = -ENOTDIR;
- if (!inode->i_op || !inode->i_op->lookup)
+ if (!inode->i_op->lookup)
break;
}
goto return_base;
@@ -1469,7 +1463,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
if (error)
return error;
- if (!dir->i_op || !dir->i_op->create)
+ if (!dir->i_op->create)
return -EACCES; /* shouldn't it be ENOSYS? */
mode &= S_IALLUGO;
mode |= S_IFREG;
@@ -1752,7 +1746,7 @@ do_last:
error = -ENOENT;
if (!path.dentry->d_inode)
goto exit_dput;
- if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
+ if (path.dentry->d_inode->i_op->follow_link)
goto do_link;
path_to_nameidata(&path, &nd);
@@ -1933,7 +1927,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
return -EPERM;
- if (!dir->i_op || !dir->i_op->mknod)
+ if (!dir->i_op->mknod)
return -EPERM;
error = devcgroup_inode_mknod(mode, dev);
@@ -2035,7 +2029,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
if (error)
return error;
- if (!dir->i_op || !dir->i_op->mkdir)
+ if (!dir->i_op->mkdir)
return -EPERM;
mode &= (S_IRWXUGO|S_ISVTX);
@@ -2126,7 +2120,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
if (error)
return error;
- if (!dir->i_op || !dir->i_op->rmdir)
+ if (!dir->i_op->rmdir)
return -EPERM;
DQUOT_INIT(dir);
@@ -2213,7 +2207,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
if (error)
return error;
- if (!dir->i_op || !dir->i_op->unlink)
+ if (!dir->i_op->unlink)
return -EPERM;
DQUOT_INIT(dir);
@@ -2320,7 +2314,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
if (error)
return error;
- if (!dir->i_op || !dir->i_op->symlink)
+ if (!dir->i_op->symlink)
return -EPERM;
error = security_inode_symlink(dir, dentry, oldname);
@@ -2401,7 +2395,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return -EPERM;
- if (!dir->i_op || !dir->i_op->link)
+ if (!dir->i_op->link)
return -EPERM;
if (S_ISDIR(inode->i_mode))
return -EPERM;
@@ -2608,7 +2602,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (error)
return error;
- if (!old_dir->i_op || !old_dir->i_op->rename)
+ if (!old_dir->i_op->rename)
return -EPERM;
DQUOT_INIT(old_dir);
@@ -2817,18 +2811,23 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
}
}
-int __page_symlink(struct inode *inode, const char *symname, int len,
- gfp_t gfp_mask)
+/*
+ * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
+ */
+int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
void *fsdata;
int err;
char *kaddr;
+ unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
+ if (nofs)
+ flags |= AOP_FLAG_NOFS;
retry:
err = pagecache_write_begin(NULL, mapping, 0, len-1,
- AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+ flags, &page, &fsdata);
if (err)
goto fail;
@@ -2852,7 +2851,7 @@ fail:
int page_symlink(struct inode *inode, const char *symname, int len)
{
return __page_symlink(inode, symname, len,
- mapping_gfp_mask(inode->i_mapping));
+ !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
}
const struct inode_operations page_symlink_inode_operations = {
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 335b003dddf..0af3349de85 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -16,7 +16,6 @@
* @opts: an array of &struct option entries controlling parser operations
* @optopt: output; will contain the current option
* @optarg: output; will contain the value (if one exists)
- * @flag: output; may be NULL; should point to a long for or'ing flags
* @value: output; may be NULL; will be overwritten with the integer value
* of the current argument.
*
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 6d04e050c74..f54360f50a9 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl
{
s32 auth_type;
u32 object_name_len;
- compat_caddr_t object_name; /* an userspace data, in most cases user name */
+ compat_caddr_t object_name; /* a userspace data, in most cases user name */
};
struct compat_ncp_fs_info_v2 {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d319b49f8f0..90f292b520d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file->f_path.dentry->d_name.name,
mapping->host->i_ino, len, (long long) pos);
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 0184fe9b514..c903e04aa21 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -76,10 +76,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
ret = set_groups(new, gi);
put_group_info(gi);
- if (!ret)
+ if (ret < 0)
goto error;
- if (new->uid)
+ if (new->fsuid)
new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
else
new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 6d7d8c02c19..c464181b599 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -53,9 +53,6 @@
#define NFSPROC4_CB_NULL 0
#define NFSPROC4_CB_COMPOUND 1
-/* declarations */
-static const struct rpc_call_ops nfs4_cb_null_ops;
-
/* Index of predefined Linux callback client operations */
enum {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 669461e291a..9fa60a3ad48 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -946,6 +946,11 @@ encode_op:
nfsd4_encode_operation(resp, op);
status = op->status;
}
+
+ dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n",
+ args->ops, args->opcnt, resp->opcnt, op->opnum,
+ be32_to_cpu(status));
+
if (cstate->replay_owner) {
nfs4_put_stateowner(cstate->replay_owner);
cstate->replay_owner = NULL;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 0f9d6efaa62..74f7b67567f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
md5_to_hex(dname, cksum.data);
- kfree(cksum.data);
status = nfs_ok;
out:
+ kfree(cksum.data);
crypto_free_hash(desc.tfm);
out_no_tfm:
return status;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 13e0e074dbb..88db7d3ec12 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2416,6 +2416,26 @@ out:
#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS)
#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1)
+static inline u64
+end_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ end = start + len;
+ return end >= start ? end: NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ BUG_ON(!len);
+ end = start + len;
+ return end > start ? end - 1: NFS4_MAX_UINT64;
+}
+
#define lockownerid_hashval(id) \
((id) & LOCK_HASH_MASK)
@@ -2435,13 +2455,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
static struct nfs4_stateid *
find_stateid(stateid_t *stid, int flags)
{
- struct nfs4_stateid *local = NULL;
+ struct nfs4_stateid *local;
u32 st_id = stid->si_stateownerid;
u32 f_id = stid->si_fileid;
unsigned int hashval;
dprintk("NFSD: find_stateid flags 0x%x\n",flags);
- if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) {
+ if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
hashval = stateid_hashval(st_id, f_id);
list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2449,7 +2469,8 @@ find_stateid(stateid_t *stid, int flags)
return local;
}
}
- if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) {
+
+ if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
hashval = stateid_hashval(st_id, f_id);
list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2518,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
deny->ld_clientid.cl_id = 0;
}
deny->ld_start = fl->fl_start;
- deny->ld_length = ~(u64)0;
- if (fl->fl_end != ~(u64)0)
+ deny->ld_length = NFS4_MAX_UINT64;
+ if (fl->fl_end != NFS4_MAX_UINT64)
deny->ld_length = fl->fl_end - fl->fl_start + 1;
deny->ld_type = NFS4_READ_LT;
if (fl->fl_type != F_RDLCK)
@@ -2616,7 +2637,7 @@ out:
static int
check_lock_length(u64 offset, u64 length)
{
- return ((length == 0) || ((length != ~(u64)0) &&
+ return ((length == 0) || ((length != NFS4_MAX_UINT64) &&
LOFF_OVERFLOW(offset, length)));
}
@@ -2736,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
file_lock.fl_lmops = &nfsd_posix_mng_ops;
file_lock.fl_start = lock->lk_offset;
- if ((lock->lk_length == ~(u64)0) ||
- LOFF_OVERFLOW(lock->lk_offset, lock->lk_length))
- file_lock.fl_end = ~(u64)0;
- else
- file_lock.fl_end = lock->lk_offset + lock->lk_length - 1;
+ file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
nfs4_transform_lock_offset(&file_lock);
/*
@@ -2781,6 +2798,25 @@ out:
}
/*
+ * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
+ * so we do a temporary open here just to get an open file to pass to
+ * vfs_test_lock. (Arguably perhaps test_lock should be done with an
+ * inode operation.)
+ */
+static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
+{
+ struct file *file;
+ int err;
+
+ err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+ if (err)
+ return err;
+ err = vfs_test_lock(file, lock);
+ nfsd_close(file);
+ return err;
+}
+
+/*
* LOCKT operation
*/
__be32
@@ -2788,7 +2824,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_lockt *lockt)
{
struct inode *inode;
- struct file file;
struct file_lock file_lock;
int error;
__be32 status;
@@ -2839,23 +2874,12 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
file_lock.fl_lmops = &nfsd_posix_mng_ops;
file_lock.fl_start = lockt->lt_offset;
- if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length))
- file_lock.fl_end = ~(u64)0;
- else
- file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1;
+ file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
nfs4_transform_lock_offset(&file_lock);
- /* vfs_test_lock uses the struct file _only_ to resolve the inode.
- * since LOCKT doesn't require an OPEN, and therefore a struct
- * file may not exist, pass vfs_test_lock a struct file with
- * only the dentry:inode set.
- */
- memset(&file, 0, sizeof (struct file));
- file.f_path.dentry = cstate->current_fh.fh_dentry;
-
status = nfs_ok;
- error = vfs_test_lock(&file, &file_lock);
+ error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
if (error) {
status = nfserrno(error);
goto out;
@@ -2906,10 +2930,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
file_lock.fl_lmops = &nfsd_posix_mng_ops;
file_lock.fl_start = locku->lu_offset;
- if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length))
- file_lock.fl_end = ~(u64)0;
- else
- file_lock.fl_end = locku->lu_offset + locku->lu_length - 1;
+ file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length);
nfs4_transform_lock_offset(&file_lock);
/*
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index afcdf4b7684..f65953be39c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1,6 +1,4 @@
/*
- * fs/nfs/nfs4xdr.c
- *
* Server-side XDR for NFSv4
*
* Copyright (c) 2002 The Regents of the University of Michigan.
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77d7b8c531a..3d93b2064ce 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size);
static ssize_t write_getfd(struct file *file, char *buf, size_t size);
static ssize_t write_getfs(struct file *file, char *buf, size_t size);
static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
static ssize_t write_threads(struct file *file, char *buf, size_t size);
static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
static ssize_t write_versions(struct file *file, char *buf, size_t size);
@@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
#endif
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size);
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size);
-
static ssize_t (*write_op[])(struct file *, char *, size_t) = {
[NFSD_Svc] = write_svc,
[NFSD_Add] = write_add,
@@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
[NFSD_Getfd] = write_getfd,
[NFSD_Getfs] = write_getfs,
[NFSD_Fh] = write_filehandle,
- [NFSD_FO_UnlockIP] = failover_unlock_ip,
- [NFSD_FO_UnlockFS] = failover_unlock_fs,
+ [NFSD_FO_UnlockIP] = write_unlock_ip,
+ [NFSD_FO_UnlockFS] = write_unlock_fs,
[NFSD_Threads] = write_threads,
[NFSD_Pool_Threads] = write_pool_threads,
[NFSD_Versions] = write_versions,
@@ -176,10 +175,24 @@ static const struct file_operations exports_operations = {
/*----------------------------------------------------------------------------*/
/*
* payload - write methods
- * If the method has a response, the response should be put in buf,
- * and the length returned. Otherwise return 0 or and -error.
*/
+/**
+ * write_svc - Start kernel's NFSD server
+ *
+ * Deprecated. /proc/fs/nfsd/threads is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ * buf: struct nfsctl_svc
+ * svc_port: port number of this
+ * server's listener
+ * svc_nthreads: number of threads to start
+ * size: size in bytes of passed in nfsctl_svc
+ * Output:
+ * On success: returns zero
+ * On error: return code is negative errno value
+ */
static ssize_t write_svc(struct file *file, char *buf, size_t size)
{
struct nfsctl_svc *data;
@@ -189,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size)
return nfsd_svc(data->svc_port, data->svc_nthreads);
}
+/**
+ * write_add - Add or modify client entry in auth unix cache
+ *
+ * Deprecated. /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ * buf: struct nfsctl_client
+ * cl_ident: '\0'-terminated C string
+ * containing domain name
+ * of client
+ * cl_naddr: no. of items in cl_addrlist
+ * cl_addrlist: array of client addresses
+ * cl_fhkeytype: ignored
+ * cl_fhkeylen: ignored
+ * cl_fhkey: ignored
+ * size: size in bytes of passed in nfsctl_client
+ * Output:
+ * On success: returns zero
+ * On error: return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
static ssize_t write_add(struct file *file, char *buf, size_t size)
{
struct nfsctl_client *data;
@@ -198,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size)
return exp_addclient(data);
}
+/**
+ * write_del - Remove client from auth unix cache
+ *
+ * Deprecated. /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ * buf: struct nfsctl_client
+ * cl_ident: '\0'-terminated C string
+ * containing domain name
+ * of client
+ * cl_naddr: ignored
+ * cl_addrlist: ignored
+ * cl_fhkeytype: ignored
+ * cl_fhkeylen: ignored
+ * cl_fhkey: ignored
+ * size: size in bytes of passed in nfsctl_client
+ * Output:
+ * On success: returns zero
+ * On error: return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
static ssize_t write_del(struct file *file, char *buf, size_t size)
{
struct nfsctl_client *data;
@@ -207,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size)
return exp_delclient(data);
}
+/**
+ * write_export - Export part or all of a local file system
+ *
+ * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ * buf: struct nfsctl_export
+ * ex_client: '\0'-terminated C string
+ * containing domain name
+ * of client allowed to access
+ * this export
+ * ex_path: '\0'-terminated C string
+ * containing pathname of
+ * directory in local file system
+ * ex_dev: fsid to use for this export
+ * ex_ino: ignored
+ * ex_flags: export flags for this export
+ * ex_anon_uid: UID to use for anonymous
+ * requests
+ * ex_anon_gid: GID to use for anonymous
+ * requests
+ * size: size in bytes of passed in nfsctl_export
+ * Output:
+ * On success: returns zero
+ * On error: return code is negative errno value
+ */
static ssize_t write_export(struct file *file, char *buf, size_t size)
{
struct nfsctl_export *data;
@@ -216,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size)
return exp_export(data);
}
+/**
+ * write_unexport - Unexport a previously exported file system
+ *
+ * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ * buf: struct nfsctl_export
+ * ex_client: '\0'-terminated C string
+ * containing domain name
+ * of client no longer allowed
+ * to access this export
+ * ex_path: '\0'-terminated C string
+ * containing pathname of
+ * directory in local file system
+ * ex_dev: ignored
+ * ex_ino: ignored
+ * ex_flags: ignored
+ * ex_anon_uid: ignored
+ * ex_anon_gid: ignored
+ * size: size in bytes of passed in nfsctl_export
+ * Output:
+ * On success: returns zero
+ * On error: return code is negative errno value
+ */
static ssize_t write_unexport(struct file *file, char *buf, size_t size)
{
struct nfsctl_export *data;
@@ -226,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size)
return exp_unexport(data);
}
+/**
+ * write_getfs - Get a variable-length NFS file handle by path
+ *
+ * Deprecated. /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ * buf: struct nfsctl_fsparm
+ * gd_addr: socket address of client
+ * gd_path: '\0'-terminated C string
+ * containing pathname of
+ * directory in local file system
+ * gd_maxlen: maximum size of returned file
+ * handle
+ * size: size in bytes of passed in nfsctl_fsparm
+ * Output:
+ * On success: passed-in buffer filled with a knfsd_fh structure
+ * (a variable-length raw NFS file handle);
+ * return code is the size in bytes of the file handle
+ * On error: return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
static ssize_t write_getfs(struct file *file, char *buf, size_t size)
{
struct nfsctl_fsparm *data;
@@ -265,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
return err;
}
+/**
+ * write_getfd - Get a fixed-length NFS file handle by path (used by mountd)
+ *
+ * Deprecated. /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ * buf: struct nfsctl_fdparm
+ * gd_addr: socket address of client
+ * gd_path: '\0'-terminated C string
+ * containing pathname of
+ * directory in local file system
+ * gd_version: fdparm structure version
+ * size: size in bytes of passed in nfsctl_fdparm
+ * Output:
+ * On success: passed-in buffer filled with nfsctl_res
+ * (a fixed-length raw NFS file handle);
+ * return code is the size in bytes of the file handle
+ * On error: return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
static ssize_t write_getfd(struct file *file, char *buf, size_t size)
{
struct nfsctl_fdparm *data;
@@ -309,7 +469,23 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
return err;
}
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
+/**
+ * write_unlock_ip - Release all locks used by a client
+ *
+ * Experimental.
+ *
+ * Input:
+ * buf: '\n'-terminated C string containing a
+ * presentation format IPv4 address
+ * size: length of C string in @buf
+ * Output:
+ * On success: returns zero if all specified locks were released;
+ * returns one if one or more locks were not released
+ * On error: return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in
+ */
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
{
struct sockaddr_in sin = {
.sin_family = AF_INET,
@@ -339,7 +515,21 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
}
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
+/**
+ * write_unlock_fs - Release all locks on a local file system
+ *
+ * Experimental.
+ *
+ * Input:
+ * buf: '\n'-terminated C string containing the
+ * absolute pathname of a local file system
+ * size: length of C string in @buf
+ * Output:
+ * On success: returns zero if all specified locks were released;
+ * returns one if one or more locks were not released
+ * On error: return code is negative errno value
+ */
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
{
struct path path;
char *fo_path;
@@ -360,21 +550,44 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
if (error)
return error;
+ /*
+ * XXX: Needs better sanity checking. Otherwise we could end up
+ * releasing locks on the wrong file system.
+ *
+ * For example:
+ * 1. Does the path refer to a directory?
+ * 2. Is that directory a mount point, or
+ * 3. Is that directory the root of an exported file system?
+ */
error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
path_put(&path);
return error;
}
+/**
+ * write_filehandle - Get a variable-length NFS file handle by path
+ *
+ * On input, the buffer contains a '\n'-terminated C string comprised of
+ * three alphanumeric words separated by whitespace. The string may
+ * contain escape sequences.
+ *
+ * Input:
+ * buf:
+ * domain: client domain name
+ * path: export pathname
+ * maxsize: numeric maximum size of
+ * @buf
+ * size: length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C
+ * string containing a ASCII hex text version
+ * of the NFS file handle;
+ * return code is the size in bytes of the string
+ * On error: return code is negative errno value
+ */
static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
{
- /* request is:
- * domain path maxsize
- * response is
- * filehandle
- *
- * qword quoting is used, so filehandle will be \x....
- */
char *dname, *path;
int uninitialized_var(maxsize);
char *mesg = buf;
@@ -391,11 +604,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
dname = mesg;
len = qword_get(&mesg, dname, size);
- if (len <= 0) return -EINVAL;
+ if (len <= 0)
+ return -EINVAL;
path = dname+len+1;
len = qword_get(&mesg, path, size);
- if (len <= 0) return -EINVAL;
+ if (len <= 0)
+ return -EINVAL;
len = get_int(&mesg, &maxsize);
if (len)
@@ -419,17 +634,43 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
if (len)
return len;
- mesg = buf; len = SIMPLE_TRANSACTION_LIMIT;
+ mesg = buf;
+ len = SIMPLE_TRANSACTION_LIMIT;
qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
mesg[-1] = '\n';
return mesg - buf;
}
+/**
+ * write_threads - Start NFSD, or report the current number of running threads
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C
+ * string numeric value representing the number of
+ * running NFSD threads;
+ * return code is the size in bytes of the string
+ * On error: return code is zero
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing an unsigned
+ * integer value representing the
+ * number of NFSD threads to start
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: NFS service is started;
+ * passed-in buffer filled with '\n'-terminated C
+ * string numeric value representing the number of
+ * running NFSD threads;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
static ssize_t write_threads(struct file *file, char *buf, size_t size)
{
- /* if size > 0, look for a number of threads and call nfsd_svc
- * then write out number of threads as reply
- */
char *mesg = buf;
int rv;
if (size > 0) {
@@ -437,9 +678,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
rv = get_int(&mesg, &newthreads);
if (rv)
return rv;
- if (newthreads <0)
+ if (newthreads < 0)
return -EINVAL;
- rv = nfsd_svc(2049, newthreads);
+ rv = nfsd_svc(NFS_PORT, newthreads);
if (rv)
return rv;
}
@@ -447,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
return strlen(buf);
}
+/**
+ * write_pool_threads - Set or report the current number of threads per pool
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing whitespace-
+ * separated unsigned integer values
+ * representing the number of NFSD
+ * threads to start in each pool
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C
+ * string containing integer values representing the
+ * number of NFSD threads in each pool;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
{
/* if size > 0, look for an array of number of threads per node
@@ -517,10 +780,6 @@ out_free:
static ssize_t __write_versions(struct file *file, char *buf, size_t size)
{
- /*
- * Format:
- * [-/+]vers [-/+]vers ...
- */
char *mesg = buf;
char *vers, sign;
int len, num;
@@ -578,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
return len;
}
+/**
+ * write_versions - Set or report the available NFS protocol versions
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C
+ * string containing positive or negative integer
+ * values representing the current status of each
+ * protocol version;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing whitespace-
+ * separated positive or negative
+ * integer values representing NFS
+ * protocol versions to enable ("+n")
+ * or disable ("-n")
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: status of zero or more protocol versions has
+ * been updated; passed-in buffer filled with
+ * '\n'-terminated C string containing positive
+ * or negative integer values representing the
+ * current status of each protocol version;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
static ssize_t write_versions(struct file *file, char *buf, size_t size)
{
ssize_t rv;
@@ -687,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
return -EINVAL;
}
+/**
+ * write_ports - Pass a socket file descriptor or transport name to listen on
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ * Output:
+ * On success: passed-in buffer filled with a '\n'-terminated C
+ * string containing a whitespace-separated list of
+ * named NFSD listeners;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing an unsigned
+ * integer value representing a bound
+ * but unconnected socket that is to be
+ * used as an NFSD listener
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: NFS service is started;
+ * passed-in buffer filled with a '\n'-terminated C
+ * string containing a unique alphanumeric name of
+ * the listener;
+ * return code is the size in bytes of the string
+ * On error: return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing a "-" followed
+ * by an integer value representing a
+ * previously passed in socket file
+ * descriptor
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: NFS service no longer listens on that socket;
+ * passed-in buffer filled with a '\n'-terminated C
+ * string containing a unique name of the listener;
+ * return code is the size in bytes of the string
+ * On error: return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing a transport
+ * name and an unsigned integer value
+ * representing the port to listen on,
+ * separated by whitespace
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: returns zero; NFS service is started
+ * On error: return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing a "-" followed
+ * by a transport name and an unsigned
+ * integer value representing the port
+ * to listen on, separated by whitespace
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: returns zero; NFS service no longer listens
+ * on that transport
+ * On error: return code is a negative errno value
+ */
static ssize_t write_ports(struct file *file, char *buf, size_t size)
{
ssize_t rv;
@@ -700,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
int nfsd_max_blksize;
+/**
+ * write_maxblksize - Set or report the current NFS blksize
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing an unsigned
+ * integer value representing the new
+ * NFS blksize
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C string
+ * containing numeric value of the current NFS blksize
+ * setting;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
@@ -752,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
return strlen(buf);
}
+/**
+ * write_leasetime - Set or report the current NFSv4 lease time
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing an unsigned
+ * integer value representing the new
+ * NFSv4 lease expiry time
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C
+ * string containing unsigned integer value of the
+ * current lease expiry time;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
{
ssize_t rv;
@@ -788,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
return strlen(buf);
}
+/**
+ * write_recoverydir - Set or report the pathname of the recovery directory
+ *
+ * Input:
+ * buf: ignored
+ * size: zero
+ *
+ * OR
+ *
+ * Input:
+ * buf: C string containing the pathname
+ * of the directory on a local file
+ * system containing permanent NFSv4
+ * recovery data
+ * size: non-zero length of C string in @buf
+ * Output:
+ * On success: passed-in buffer filled with '\n'-terminated C string
+ * containing the current recovery pathname setting;
+ * return code is the size in bytes of the string
+ * On error: return code is zero or a negative errno value
+ */
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
{
ssize_t rv;
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index f0da7d9c3a9..9f1ca17293d 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -258,14 +258,32 @@ out:
return error;
}
-/*
- * Perform sanity checks on the dentry in a client's file handle.
+/**
+ * fh_verify - filehandle lookup and access checking
+ * @rqstp: pointer to current rpc request
+ * @fhp: filehandle to be verified
+ * @type: expected type of object pointed to by filehandle
+ * @access: type of access needed to object
+ *
+ * Look up a dentry from the on-the-wire filehandle, check the client's
+ * access to the export, and set the current task's credentials.
+ *
+ * Regardless of success or failure of fh_verify(), fh_put() should be
+ * called on @fhp when the caller is finished with the filehandle.
*
- * Note that the file handle dentry may need to be freed even after
- * an error return.
+ * fh_verify() may be called multiple times on a given filehandle, for
+ * example, when processing an NFSv4 compound. The first call will look
+ * up a dentry using the on-the-wire filehandle. Subsequent calls will
+ * skip the lookup and just perform the other checks and possibly change
+ * the current task's credentials.
*
- * This is only called at the start of an nfsproc call, so fhp points to
- * a svc_fh which is all 0 except for the over-the-wire file handle.
+ * @type specifies the type of object expected using one of the S_IF*
+ * constants defined in include/linux/stat.h. The caller may use zero
+ * to indicate that it doesn't care, or a negative integer to indicate
+ * that it expects something not of the given type.
+ *
+ * @access is formed from the NFSD_MAY_* constants defined in
+ * include/linux/nfsd/nfsd.h.
*/
__be32
fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
@@ -466,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
goto retry;
break;
}
+ } else if (exp->ex_flags & NFSEXP_FSID) {
+ fsid_type = FSID_NUM;
} else if (exp->ex_uuid) {
if (fhp->fh_maxsize >= 64) {
if (root_export)
@@ -478,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
else
fsid_type = FSID_UUID4_INUM;
}
- } else if (exp->ex_flags & NFSEXP_FSID)
- fsid_type = FSID_NUM;
- else if (!old_valid_dev(ex_dev))
+ } else if (!old_valid_dev(ex_dev))
/* for newer device numbers, we must use a newer fsid format */
fsid_type = FSID_ENCODE_DEV;
else
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cffeca7ace..6f7f2635122 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -622,6 +622,7 @@ nfserrno (int errno)
{ nfserr_badname, -ESRCH },
{ nfserr_io, -ETXTBSY },
{ nfserr_notsupp, -EOPNOTSUPP },
+ { nfserr_toosmall, -ETOOSMALL },
};
int i;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d1c5f787b36..6e50aaa56ca 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -764,7 +764,6 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
return err;
}
-
static int
nfsd_sync(struct file *filp)
@@ -1211,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
dirp = dentry->d_inode;
err = nfserr_notdir;
- if(!dirp->i_op || !dirp->i_op->lookup)
+ if (!dirp->i_op->lookup)
goto out;
/*
* Check whether the response file handle has been verified yet.
@@ -1347,7 +1346,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
/* Get all the sanity checks out of the way before
* we lock the parent. */
err = nfserr_notdir;
- if(!dirp->i_op || !dirp->i_op->lookup)
+ if (!dirp->i_op->lookup)
goto out;
fh_lock_nested(fhp, I_MUTEX_PARENT);
@@ -1482,7 +1481,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
inode = dentry->d_inode;
err = nfserr_inval;
- if (!inode->i_op || !inode->i_op->readlink)
+ if (!inode->i_op->readlink)
goto out;
touch_atime(fhp->fh_export->ex_path.mnt, dentry);
@@ -2162,7 +2161,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
size_t size;
int error;
- if (!IS_POSIXACL(inode) || !inode->i_op ||
+ if (!IS_POSIXACL(inode) ||
!inode->i_op->setxattr || !inode->i_op->removexattr)
return -EOPNOTSUPP;
switch(type) {
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 400f8064a54..81b8644b013 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -704,7 +704,7 @@ fput_and_out:
return ret;
}
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd)
{
struct file *filp;
struct inotify_device *dev;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index e9da092e277..86bef156cf0 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
ni->allocated_size = sle64_to_cpu(
a->data.non_resident.allocated_size);
}
- /* Setup the operations for this attribute inode. */
- vi->i_op = NULL;
- vi->i_fop = NULL;
if (NInoMstProtected(ni))
vi->i_mapping->a_ops = &ntfs_mst_aops;
else
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 589dcdfdfe3..01596079dd6 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
ocfs2-objs := \
alloc.o \
aops.o \
+ blockcheck.o \
buffer_head_io.o \
dcache.o \
dir.o \
@@ -35,8 +36,14 @@ ocfs2-objs := \
sysfile.o \
uptodate.o \
ver.o \
+ quota_local.o \
+ quota_global.o \
xattr.o
+ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
+ocfs2-objs += acl.o
+endif
+
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
ocfs2_stack_user-objs := stack_user.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644
index 00000000000..12dfb44c22e
--- /dev/null
+++ b/fs/ocfs2/acl.c
@@ -0,0 +1,479 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copy from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from xattr value to acl struct.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+ int n, count;
+ struct posix_acl *acl;
+
+ if (!value)
+ return NULL;
+ if (size < sizeof(struct posix_acl_entry))
+ return ERR_PTR(-EINVAL);
+
+ count = size / sizeof(struct posix_acl_entry);
+ if (count < 0)
+ return ERR_PTR(-EINVAL);
+ if (count == 0)
+ return NULL;
+
+ acl = posix_acl_alloc(count, GFP_NOFS);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+ for (n = 0; n < count; n++) {
+ struct ocfs2_acl_entry *entry =
+ (struct ocfs2_acl_entry *)value;
+
+ acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
+ acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+ acl->a_entries[n].e_id = le32_to_cpu(entry->e_id);
+ value += sizeof(struct posix_acl_entry);
+
+ }
+ return acl;
+}
+
+/*
+ * Convert acl struct to xattr value.
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+ struct ocfs2_acl_entry *entry = NULL;
+ char *ocfs2_acl;
+ size_t n;
+
+ *size = acl->a_count * sizeof(struct posix_acl_entry);
+
+ ocfs2_acl = kmalloc(*size, GFP_NOFS);
+ if (!ocfs2_acl)
+ return ERR_PTR(-ENOMEM);
+
+ entry = (struct ocfs2_acl_entry *)ocfs2_acl;
+ for (n = 0; n < acl->a_count; n++, entry++) {
+ entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
+ entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+ entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
+ }
+ return ocfs2_acl;
+}
+
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+ int type,
+ struct buffer_head *di_bh)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int name_index;
+ char *value = NULL;
+ struct posix_acl *acl;
+ int retval;
+
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return NULL;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0);
+ if (retval > 0) {
+ value = kmalloc(retval, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+ "", value, retval);
+ }
+
+ if (retval > 0)
+ acl = ocfs2_acl_from_xattr(value, retval);
+ else if (retval == -ENODATA || retval == 0)
+ acl = NULL;
+ else
+ acl = ERR_PTR(retval);
+
+ kfree(value);
+
+ return acl;
+}
+
+
+/*
+ * Get posix acl.
+ */
+static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *di_bh = NULL;
+ struct posix_acl *acl;
+ int ret;
+
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return NULL;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ acl = ERR_PTR(ret);
+ return acl;
+ }
+
+ acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+
+ ocfs2_inode_unlock(inode, 0);
+
+ brelse(di_bh);
+
+ return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ */
+static int ocfs2_set_acl(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ int type,
+ struct posix_acl *acl,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ int name_index;
+ void *value = NULL;
+ size_t size = 0;
+ int ret;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ mode_t mode = inode->i_mode;
+ ret = posix_acl_equiv_mode(acl, &mode);
+ if (ret < 0)
+ return ret;
+ else {
+ inode->i_mode = mode;
+ if (ret == 0)
+ acl = NULL;
+ }
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ value = ocfs2_acl_to_xattr(acl, &size);
+ if (IS_ERR(value))
+ return (int)PTR_ERR(value);
+ }
+
+ if (handle)
+ ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+ "", value, size, 0,
+ meta_ac, data_ac);
+ else
+ ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
+
+ kfree(value);
+
+ return ret;
+}
+
+int ocfs2_check_acl(struct inode *inode, int mask)
+{
+ struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl) {
+ int ret = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ return ret;
+ }
+
+ return -EAGAIN;
+}
+
+int ocfs2_acl_chmod(struct inode *inode)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl, *clone;
+ int ret;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return 0;
+
+ acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+ clone = posix_acl_clone(acl, GFP_KERNEL);
+ posix_acl_release(acl);
+ if (!clone)
+ return -ENOMEM;
+ ret = posix_acl_chmod_masq(clone, inode->i_mode);
+ if (!ret)
+ ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+ clone, NULL, NULL);
+ posix_acl_release(clone);
+ return ret;
+}
+
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ */
+int ocfs2_init_acl(handle_t *handle,
+ struct inode *inode,
+ struct inode *dir,
+ struct buffer_head *di_bh,
+ struct buffer_head *dir_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl = NULL;
+ int ret = 0;
+
+ if (!S_ISLNK(inode->i_mode)) {
+ if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+ dir_bh);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ if (!acl)
+ inode->i_mode &= ~current->fs->umask;
+ }
+ if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+ struct posix_acl *clone;
+ mode_t mode;
+
+ if (S_ISDIR(inode->i_mode)) {
+ ret = ocfs2_set_acl(handle, inode, di_bh,
+ ACL_TYPE_DEFAULT, acl,
+ meta_ac, data_ac);
+ if (ret)
+ goto cleanup;
+ }
+ clone = posix_acl_clone(acl, GFP_NOFS);
+ ret = -ENOMEM;
+ if (!clone)
+ goto cleanup;
+
+ mode = inode->i_mode;
+ ret = posix_acl_create_masq(clone, &mode);
+ if (ret >= 0) {
+ inode->i_mode = mode;
+ if (ret > 0) {
+ ret = ocfs2_set_acl(handle, inode,
+ di_bh, ACL_TYPE_ACCESS,
+ clone, meta_ac, data_ac);
+ }
+ }
+ posix_acl_release(clone);
+ }
+cleanup:
+ posix_acl_release(acl);
+ return ret;
+}
+
+static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
+ char *list,
+ size_t list_len,
+ const char *name,
+ size_t name_len)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return 0;
+
+ if (list && size <= list_len)
+ memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
+ return size;
+}
+
+static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
+ char *list,
+ size_t list_len,
+ const char *name,
+ size_t name_len)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return 0;
+
+ if (list && size <= list_len)
+ memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
+ return size;
+}
+
+static int ocfs2_xattr_get_acl(struct inode *inode,
+ int type,
+ void *buffer,
+ size_t size)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl;
+ int ret;
+
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return -EOPNOTSUPP;
+
+ acl = ocfs2_get_acl(inode, type);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl == NULL)
+ return -ENODATA;
+ ret = posix_acl_to_xattr(acl, buffer, size);
+ posix_acl_release(acl);
+
+ return ret;
+}
+
+static int ocfs2_xattr_get_acl_access(struct inode *inode,
+ const char *name,
+ void *buffer,
+ size_t size)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int ocfs2_xattr_get_acl_default(struct inode *inode,
+ const char *name,
+ void *buffer,
+ size_t size)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+static int ocfs2_xattr_set_acl(struct inode *inode,
+ int type,
+ const void *value,
+ size_t size)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl;
+ int ret = 0;
+
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return -EOPNOTSUPP;
+
+ if (!is_owner_or_cap(inode))
+ return -EPERM;
+
+ if (value) {
+ acl = posix_acl_from_xattr(value, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ else if (acl) {
+ ret = posix_acl_valid(acl);
+ if (ret)
+ goto cleanup;
+ }
+ } else
+ acl = NULL;
+
+ ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+
+cleanup:
+ posix_acl_release(acl);
+ return ret;
+}
+
+static int ocfs2_xattr_set_acl_access(struct inode *inode,
+ const char *name,
+ const void *value,
+ size_t size,
+ int flags)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int ocfs2_xattr_set_acl_default(struct inode *inode,
+ const char *name,
+ const void *value,
+ size_t size,
+ int flags)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+struct xattr_handler ocfs2_xattr_acl_access_handler = {
+ .prefix = POSIX_ACL_XATTR_ACCESS,
+ .list = ocfs2_xattr_list_acl_access,
+ .get = ocfs2_xattr_get_acl_access,
+ .set = ocfs2_xattr_set_acl_access,
+};
+
+struct xattr_handler ocfs2_xattr_acl_default_handler = {
+ .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .list = ocfs2_xattr_list_acl_default,
+ .get = ocfs2_xattr_get_acl_default,
+ .set = ocfs2_xattr_set_acl_default,
+};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644
index 00000000000..8f6389ed4da
--- /dev/null
+++ b/fs/ocfs2/acl.h
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+
+#include <linux/posix_acl_xattr.h>
+
+struct ocfs2_acl_entry {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+};
+
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+
+extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_acl_chmod(struct inode *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+ struct buffer_head *, struct buffer_head *,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
+
+#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#define ocfs2_check_acl NULL
+static inline int ocfs2_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+static inline int ocfs2_init_acl(handle_t *handle,
+ struct inode *inode,
+ struct inode *dir,
+ struct buffer_head *di_bh,
+ struct buffer_head *dir_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ return 0;
+}
+
+#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0cc2deb9394..d861096c9d8 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/swap.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
@@ -36,6 +37,7 @@
#include "alloc.h"
#include "aops.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
@@ -46,6 +48,7 @@
#include "file.h"
#include "super.h"
#include "uptodate.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
static int ocfs2_dinode_sanity_check(struct inode *inode,
struct ocfs2_extent_tree *et)
{
- int ret = 0;
- struct ocfs2_dinode *di;
+ struct ocfs2_dinode *di = et->et_object;
BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
- di = et->et_object;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- ret = -EIO;
- ocfs2_error(inode->i_sb,
- "Inode %llu has invalid path root",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- }
-
- return ret;
+ return 0;
}
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
@@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
{
- struct ocfs2_xattr_value_root *xv = et->et_object;
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
- et->et_root_el = &xv->xr_list;
+ et->et_root_el = &vb->vb_xv->xr_list;
}
static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
u64 blkno)
{
- struct ocfs2_xattr_value_root *xv =
- (struct ocfs2_xattr_value_root *)et->et_object;
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
- xv->xr_last_eb_blk = cpu_to_le64(blkno);
+ vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
}
static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
- struct ocfs2_xattr_value_root *xv =
- (struct ocfs2_xattr_value_root *) et->et_object;
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
- return le64_to_cpu(xv->xr_last_eb_blk);
+ return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
}
static void ocfs2_xattr_value_update_clusters(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 clusters)
{
- struct ocfs2_xattr_value_root *xv =
- (struct ocfs2_xattr_value_root *)et->et_object;
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
- le32_add_cpu(&xv->xr_clusters, clusters);
+ le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
}
static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
@@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
struct buffer_head *bh,
+ ocfs2_journal_access_func access,
void *obj,
struct ocfs2_extent_tree_operations *ops)
{
et->et_ops = ops;
et->et_root_bh = bh;
+ et->et_root_journal_access = access;
if (!obj)
obj = (void *)bh->b_data;
et->et_object = obj;
@@ -324,23 +318,23 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
struct buffer_head *bh)
{
- __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
+ __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
+ NULL, &ocfs2_dinode_et_ops);
}
void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
struct buffer_head *bh)
{
- __ocfs2_init_extent_tree(et, inode, bh, NULL,
- &ocfs2_xattr_tree_et_ops);
+ __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
+ NULL, &ocfs2_xattr_tree_et_ops);
}
void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
- struct buffer_head *bh,
- struct ocfs2_xattr_value_root *xv)
+ struct ocfs2_xattr_value_buf *vb)
{
- __ocfs2_init_extent_tree(et, inode, bh, xv,
+ __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
&ocfs2_xattr_value_et_ops);
}
@@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
et->et_ops->eo_update_clusters(inode, et, clusters);
}
+static inline int ocfs2_et_root_journal_access(handle_t *handle,
+ struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ int type)
+{
+ return et->et_root_journal_access(handle, inode, et->et_root_bh,
+ type);
+}
+
static inline int ocfs2_et_insert_check(struct inode *inode,
struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec)
@@ -402,12 +405,14 @@ struct ocfs2_path_item {
#define OCFS2_MAX_PATH_DEPTH 5
struct ocfs2_path {
- int p_tree_depth;
- struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
+ int p_tree_depth;
+ ocfs2_journal_access_func p_root_access;
+ struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
};
#define path_root_bh(_path) ((_path)->p_node[0].bh)
#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
#define path_num_items(_path) ((_path)->p_tree_depth + 1)
@@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
*/
if (keep_root)
depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+ else
+ path_root_access(path) = NULL;
path->p_tree_depth = depth;
}
@@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
BUG_ON(path_root_bh(dest) != path_root_bh(src));
BUG_ON(path_root_el(dest) != path_root_el(src));
+ BUG_ON(path_root_access(dest) != path_root_access(src));
ocfs2_reinit_path(dest, 1);
@@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
int i;
BUG_ON(path_root_bh(dest) != path_root_bh(src));
+ BUG_ON(path_root_access(dest) != path_root_access(src));
for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
brelse(dest->p_node[i].bh);
@@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
}
static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
- struct ocfs2_extent_list *root_el)
+ struct ocfs2_extent_list *root_el,
+ ocfs2_journal_access_func access)
{
struct ocfs2_path *path;
@@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
get_bh(root_bh);
path_root_bh(path) = root_bh;
path_root_el(path) = root_el;
+ path_root_access(path) = access;
}
return path;
}
+static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+{
+ return ocfs2_new_path(path_root_bh(path), path_root_el(path),
+ path_root_access(path));
+}
+
+static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+{
+ return ocfs2_new_path(et->et_root_bh, et->et_root_el,
+ et->et_root_journal_access);
+}
+
+/*
+ * Journal the buffer at depth idx. All idx>0 are extent_blocks,
+ * otherwise it's the root_access function.
+ *
+ * I don't like the way this function's name looks next to
+ * ocfs2_journal_access_path(), but I don't have a better one.
+ */
+static int ocfs2_path_bh_journal_access(handle_t *handle,
+ struct inode *inode,
+ struct ocfs2_path *path,
+ int idx)
+{
+ ocfs2_journal_access_func access = path_root_access(path);
+
+ if (!access)
+ access = ocfs2_journal_access;
+
+ if (idx)
+ access = ocfs2_journal_access_eb;
+
+ return access(handle, inode, path->p_node[idx].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+}
+
/*
* Convenience function to journal all components in a path.
*/
@@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
goto out;
for(i = 0; i < path_num_items(path); i++) {
- ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt {
int c_split_covers_rec;
};
+static int ocfs2_validate_extent_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_extent_block *eb =
+ (struct ocfs2_extent_block *)bh->b_data;
+
+ mlog(0, "Validating extent block %llu\n",
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
+ if (rc) {
+ mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ return rc;
+ }
+
+ /*
+ * Errors after here are fatal.
+ */
+
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ ocfs2_error(sb,
+ "Extent block #%llu has bad signature %.*s",
+ (unsigned long long)bh->b_blocknr, 7,
+ eb->h_signature);
+ return -EINVAL;
+ }
+
+ if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
+ ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_blkno "
+ "of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+ ocfs2_error(sb,
+ "Extent block #%llu has an invalid "
+ "h_fs_generation of #%u",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(eb->h_fs_generation));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+ struct buffer_head **bh)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_block(inode, eb_blkno, &tmp,
+ ocfs2_validate_extent_block);
+
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
+
/*
* How many free extents have we got before we need more meta data?
*/
@@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
last_eb_blk = ocfs2_et_get_last_eb_blk(et);
if (last_eb_blk) {
- retval = ocfs2_read_block(inode, last_eb_blk,
- &eb_bh);
+ retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
if (retval < 0) {
mlog_errno(retval);
goto bail;
@@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
}
ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
- status = ocfs2_journal_access(handle, inode, bhs[i],
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_eb(handle, inode, bhs[i],
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
for(i = 0; i < new_blocks; i++) {
bh = new_eb_bhs[i];
eb = (struct ocfs2_extent_block *) bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
- }
+ /* ocfs2_create_new_meta_bhs() should create it right! */
+ BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
eb_el = &eb->h_list;
- status = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_eb(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
* journal_dirty erroring as it won't unless we've aborted the
* handle (in which case we would never be here) so reserving
* the write with journal_access is all we need to do. */
- status = ocfs2_journal_access(handle, inode, *last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_journal_access(handle, inode, et->et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
if (eb_bh) {
- status = ocfs2_journal_access(handle, inode, eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_eb(handle, inode, eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
}
eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
- }
+ /* ocfs2_create_new_meta_bhs() should create it right! */
+ BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
eb_el = &eb->h_list;
root_el = et->et_root_el;
- status = ocfs2_journal_access(handle, inode, new_eb_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_journal_access(handle, inode, et->et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
brelse(bh);
bh = NULL;
- status = ocfs2_read_block(inode, blkno, &bh);
+ status = ocfs2_read_extent_block(inode, blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
eb = (struct ocfs2_extent_block *) bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
- }
el = &eb->h_list;
if (le16_to_cpu(el->l_next_free_rec) <
@@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode,
brelse(bh);
bh = NULL;
- ret = ocfs2_read_block(inode, blkno, &bh);
+ ret = ocfs2_read_extent_block(inode, blkno, &bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode,
eb = (struct ocfs2_extent_block *) bh->b_data;
el = &eb->h_list;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- ret = -EIO;
- goto out;
- }
if (le16_to_cpu(el->l_next_free_rec) >
le16_to_cpu(el->l_count)) {
@@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ subtree_index);
if (ret) {
mlog_errno(ret);
goto out;
}
for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
- ret = ocfs2_journal_access(handle, inode,
- right_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode,
- left_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ left_path, i);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
*ret_left_path = NULL;
- left_path = ocfs2_new_path(path_root_bh(right_path),
- path_root_el(right_path));
+ left_path = ocfs2_new_path_from_path(right_path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
return -EAGAIN;
if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
- ret = ocfs2_journal_access(handle, inode,
- path_leaf_bh(right_path),
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_eb(handle, inode,
+ path_leaf_bh(right_path),
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
* We have to update i_last_eb_blk during the meta
* data delete.
*/
- ret = ocfs2_journal_access(handle, inode, et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
*/
BUG_ON(right_has_empty && !del_right_subtree);
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ subtree_index);
if (ret) {
mlog_errno(ret);
goto out;
}
for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
- ret = ocfs2_journal_access(handle, inode,
- right_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode,
- left_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ left_path, i);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2596,16 +2694,17 @@ out:
static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
handle_t *handle,
- struct buffer_head *bh,
- struct ocfs2_extent_list *el)
+ struct ocfs2_path *path)
{
int ret;
+ struct buffer_head *bh = path_leaf_bh(path);
+ struct ocfs2_extent_list *el = path_leaf_el(path);
if (!ocfs2_is_empty_extent(&el->l_recs[0]))
return 0;
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, path,
+ path_num_items(path) - 1);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
goto out;
}
- left_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ left_path = ocfs2_new_path_from_path(path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
ocfs2_cp_path(left_path, path);
- right_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ right_path = ocfs2_new_path_from_path(path);
if (!right_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
* Caller might still want to make changes to the
* tree root, so re-add it to the journal here.
*/
- ret = ocfs2_journal_access(handle, inode,
- path_root_bh(left_path),
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ left_path, 0);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
* We have a path to the left of this one - it needs
* an update too.
*/
- left_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ left_path = ocfs2_new_path_from_path(path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -2875,8 +2970,7 @@ rightmost_no_delete:
* it up front.
*/
ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
- path_leaf_bh(path),
- path_leaf_el(path));
+ path);
if (ret)
mlog_errno(ret);
goto out;
@@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode,
/* This function shouldn't be called for the rightmost leaf. */
BUG_ON(right_cpos == 0);
- right_path = ocfs2_new_path(path_root_bh(left_path),
- path_root_el(left_path));
+ right_path = ocfs2_new_path_from_path(left_path);
if (!right_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ subtree_index);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode,
for (i = subtree_index + 1;
i < path_num_items(right_path); i++) {
- ret = ocfs2_journal_access(handle, inode,
- right_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode,
- left_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ left_path, i);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
right_rec = &el->l_recs[index + 1];
}
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
+ path_num_items(left_path) - 1);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode,
/* This function shouldn't be called for the leftmost leaf. */
BUG_ON(left_cpos == 0);
- left_path = ocfs2_new_path(path_root_bh(right_path),
- path_root_el(right_path));
+ left_path = ocfs2_new_path_from_path(right_path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ subtree_index);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
for (i = subtree_index + 1;
i < path_num_items(right_path); i++) {
- ret = ocfs2_journal_access(handle, inode,
- right_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode,
- left_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ left_path, i);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
has_empty_extent = 1;
}
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ path_num_items(right_path) - 1);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
* leftmost leaf.
*/
if (left_cpos) {
- left_path = ocfs2_new_path(path_root_bh(right_path),
- path_root_el(right_path));
+ left_path = ocfs2_new_path_from_path(right_path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -3781,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode,
struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
struct ocfs2_extent_rec *rec, *tmprec;
- right_el = path_leaf_el(right_path);;
+ right_el = path_leaf_el(right_path);
if (left_path)
left_el = path_leaf_el(left_path);
@@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
el = et->et_root_el;
- ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
goto out_update_clusters;
}
- right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+ right_path = ocfs2_new_path_from_et(et);
if (!right_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
* ocfs2_rotate_tree_right() might have extended the
* transaction without re-journaling our tree root.
*/
- ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
goto out;
if (left_cpos != 0) {
- left_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ left_path = ocfs2_new_path_from_path(path);
if (!left_path)
goto out;
@@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
le16_to_cpu(new_el->l_count)) {
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
- eb);
+ ocfs2_error(inode->i_sb,
+ "Extent block #%llu has an "
+ "invalid l_next_free_rec of "
+ "%d. It should have "
+ "matched the l_count of %d",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec),
+ le16_to_cpu(new_el->l_count));
+ status = -EINVAL;
goto out;
}
rec = &new_el->l_recs[
@@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (right_cpos == 0)
goto out;
- right_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ right_path = ocfs2_new_path_from_path(path);
if (!right_path)
goto out;
@@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
- eb);
+ ocfs2_error(inode->i_sb,
+ "Extent block #%llu has an "
+ "invalid l_next_free_rec of %d",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec));
+ status = -EINVAL;
goto out;
}
rec = &new_el->l_recs[1];
@@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* ocfs2_figure_insert_type() and ocfs2_add_branch()
* may want it later.
*/
- ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
+ ret = ocfs2_read_extent_block(inode,
+ ocfs2_et_get_last_eb_blk(et),
+ &bh);
if (ret) {
mlog_exit(ret);
goto out;
@@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
return 0;
}
- path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+ path = ocfs2_new_path_from_et(et);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
BUG_ON(num_bits > clusters_to_add);
- /* reserve our write early -- insert_extent may update the inode */
- status = ocfs2_journal_access(handle, inode, et->et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ /* reserve our write early -- insert_extent may update the tree root */
+ status = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
if (path->p_tree_depth) {
struct ocfs2_extent_block *eb;
- ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
- &last_eb_bh);
+ ret = ocfs2_read_extent_block(inode,
+ ocfs2_et_get_last_eb_blk(et),
+ &last_eb_bh);
if (ret) {
mlog_exit(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- ret = -EROFS;
- goto out;
- }
-
rightmost_el = &eb->h_list;
} else
rightmost_el = path_root_el(path);
@@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
if (et->et_ops == &ocfs2_dinode_et_ops)
ocfs2_extent_map_trunc(inode, 0);
- left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+ left_path = ocfs2_new_path_from_et(et);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
depth = path->p_tree_depth;
if (depth > 0) {
- ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
- &last_eb_bh);
+ ret = ocfs2_read_extent_block(inode,
+ ocfs2_et_get_last_eb_blk(et),
+ &last_eb_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
}
if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
- left_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ left_path = ocfs2_new_path_from_path(path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode,
ocfs2_extent_map_trunc(inode, 0);
- path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+ path = ocfs2_new_path_from_et(et);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -5255,6 +5348,78 @@ out:
return ret;
}
+int ocfs2_remove_btree_range(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 phys_cpos, u32 len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+ u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ handle_t *handle;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+
+ ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_et_update_clusters(inode, et, -len);
+
+ ret = ocfs2_journal_dirty(handle, et->et_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ mutex_unlock(&tl_inode->i_mutex);
+
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ return ret;
+}
+
int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
{
struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -5308,13 +5473,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
di = (struct ocfs2_dinode *) tl_bh->b_data;
- tl = &di->id2.i_dealloc;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
- status = -EIO;
- goto bail;
- }
+ /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
+ * by the underlying call to ocfs2_read_inode_block(), so any
+ * corruption is a code bug */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ tl = &di->id2.i_dealloc;
tl_count = le16_to_cpu(tl->tl_count);
mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
tl_count == 0,
@@ -5332,8 +5497,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_journal_access(handle, tl_inode, tl_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -5394,8 +5559,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
while (i >= 0) {
/* Caller has given us at least enough credits to
* update the truncate log dinode */
- status = ocfs2_journal_access(handle, tl_inode, tl_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -5464,13 +5629,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
BUG_ON(mutex_trylock(&tl_inode->i_mutex));
di = (struct ocfs2_dinode *) tl_bh->b_data;
- tl = &di->id2.i_dealloc;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
- status = -EIO;
- goto out;
- }
+ /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
+ * by the underlying call to ocfs2_read_inode_block(), so any
+ * corruption is a code bug */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ tl = &di->id2.i_dealloc;
num_to_flush = le16_to_cpu(tl->tl_used);
mlog(0, "Flush %u records from truncate log #%llu\n",
num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
@@ -5586,7 +5751,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+ status = ocfs2_read_inode_block(inode, &bh);
if (status < 0) {
iput(inode);
mlog_errno(status);
@@ -5625,13 +5790,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
}
di = (struct ocfs2_dinode *) tl_bh->b_data;
- tl = &di->id2.i_dealloc;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
- status = -EIO;
- goto bail;
- }
+ /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's
+ * validated by the underlying call to ocfs2_read_inode_block(),
+ * so any corruption is a code bug */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ tl = &di->id2.i_dealloc;
if (le16_to_cpu(tl->tl_used)) {
mlog(0, "We'll have %u logs to recover\n",
le16_to_cpu(tl->tl_used));
@@ -5651,6 +5816,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
* tl_used. */
tl->tl_used = 0;
+ ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
status = ocfs2_write_block(osb, tl_bh, tl_inode);
if (status < 0) {
mlog_errno(status);
@@ -5800,7 +5966,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
*/
/*
- * Describes a single block free from a suballocator
+ * Describe a single bit freed from a suballocator. For the block
+ * suballocators, it represents one block. For the global cluster
+ * allocator, it represents some clusters and free_bit indicates
+ * clusters number.
*/
struct ocfs2_cached_block_free {
struct ocfs2_cached_block_free *free_next;
@@ -5815,10 +5984,10 @@ struct ocfs2_per_slot_free_list {
struct ocfs2_cached_block_free *f_first;
};
-static int ocfs2_free_cached_items(struct ocfs2_super *osb,
- int sysfile_type,
- int slot,
- struct ocfs2_cached_block_free *head)
+static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
+ int sysfile_type,
+ int slot,
+ struct ocfs2_cached_block_free *head)
{
int ret;
u64 bg_blkno;
@@ -5893,6 +6062,82 @@ out:
return ret;
}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ u64 blkno, unsigned int bit)
+{
+ int ret = 0;
+ struct ocfs2_cached_block_free *item;
+
+ item = kmalloc(sizeof(*item), GFP_NOFS);
+ if (item == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ return ret;
+ }
+
+ mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
+ bit, (unsigned long long)blkno);
+
+ item->free_blk = blkno;
+ item->free_bit = bit;
+ item->free_next = ctxt->c_global_allocator;
+
+ ctxt->c_global_allocator = item;
+ return ret;
+}
+
+static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
+ struct ocfs2_cached_block_free *head)
+{
+ struct ocfs2_cached_block_free *tmp;
+ struct inode *tl_inode = osb->osb_tl_inode;
+ handle_t *handle;
+ int ret = 0;
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ while (head) {
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
+ head->free_bit);
+
+ ocfs2_commit_trans(osb, handle);
+ tmp = head;
+ head = head->free_next;
+ kfree(tmp);
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ mutex_unlock(&tl_inode->i_mutex);
+
+ while (head) {
+ /* Premature exit may have left some dangling items. */
+ tmp = head;
+ head = head->free_next;
+ kfree(tmp);
+ }
+
+ return ret;
+}
+
int ocfs2_run_deallocs(struct ocfs2_super *osb,
struct ocfs2_cached_dealloc_ctxt *ctxt)
{
@@ -5908,8 +6153,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
if (fl->f_first) {
mlog(0, "Free items: (type %u, slot %d)\n",
fl->f_inode_type, fl->f_slot);
- ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
- fl->f_slot, fl->f_first);
+ ret2 = ocfs2_free_cached_blocks(osb,
+ fl->f_inode_type,
+ fl->f_slot,
+ fl->f_first);
if (ret2)
mlog_errno(ret2);
if (!ret)
@@ -5920,6 +6167,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
kfree(fl);
}
+ if (ctxt->c_global_allocator) {
+ ret2 = ocfs2_free_cached_clusters(osb,
+ ctxt->c_global_allocator);
+ if (ret2)
+ mlog_errno(ret2);
+ if (!ret)
+ ret = ret2;
+
+ ctxt->c_global_allocator = NULL;
+ }
+
return ret;
}
@@ -6075,11 +6333,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
eb = (struct ocfs2_extent_block *) bh->b_data;
el = &eb->h_list;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- ret = -EROFS;
- goto out;
- }
+
+ /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
+ * Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
*new_last_eb = bh;
get_bh(*new_last_eb);
@@ -6326,8 +6583,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
}
if (last_eb_bh) {
- status = ocfs2_journal_access(handle, inode, last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -6350,6 +6607,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
goto bail;
}
+ vfs_dq_free_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
spin_lock(&OCFS2_I(inode)->ip_lock);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
clusters_to_del;
@@ -6436,11 +6695,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
mlog_errno(ret);
else if (ocfs2_should_order_data(inode)) {
ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, &partial,
- ocfs2_journal_dirty_data);
-#endif
if (ret < 0)
mlog_errno(ret);
}
@@ -6663,6 +6917,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct page **pages = NULL;
loff_t end = osb->s_clustersize;
struct ocfs2_extent_tree et;
+ int did_quota = 0;
has_data = i_size_read(inode) ? 1 : 0;
@@ -6682,15 +6937,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
}
- handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
+ handle = ocfs2_start_trans(osb,
+ ocfs2_inline_to_extents_credits(osb->sb));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out_unlock;
}
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -6701,6 +6957,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
unsigned int page_end;
u64 phys;
+ if (vfs_dq_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1))) {
+ ret = -EDQUOT;
+ goto out_commit;
+ }
+ did_quota = 1;
+
ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
&num);
if (ret) {
@@ -6774,6 +7037,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
out_commit:
+ if (ret < 0 && did_quota)
+ vfs_dq_free_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+
ocfs2_commit_trans(osb, handle);
out_unlock:
@@ -6813,7 +7080,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
i_size_read(inode));
- path = ocfs2_new_path(fe_bh, &di->id2.i_list);
+ path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+ ocfs2_journal_access_di);
if (!path) {
status = -ENOMEM;
mlog_errno(status);
@@ -6984,20 +7252,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
if (fe->id2.i_list.l_tree_depth) {
- status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
- &last_eb_bh);
+ status = ocfs2_read_extent_block(inode,
+ le64_to_cpu(fe->i_last_eb_blk),
+ &last_eb_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-
- brelse(last_eb_bh);
- status = -EIO;
- goto bail;
- }
}
(*tc)->tc_last_eb_bh = last_eb_bh;
@@ -7052,8 +7314,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
goto out;
}
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 70257c84cfb..cceff5c37f4 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,9 @@
*
* ocfs2_extent_tree contains info for the root of the b-tree, it must have a
* root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
- * functions.
+ * functions. With metadata ecc, we now call different journal_access
+ * functions for each type of metadata, so it must have the
+ * root_journal_access function.
* ocfs2_extent_tree_operations abstract the normal operations we do for
* the root of extent b-tree.
*/
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
struct ocfs2_extent_tree_operations *et_ops;
struct buffer_head *et_root_bh;
struct ocfs2_extent_list *et_root_el;
+ ocfs2_journal_access_func et_root_journal_access;
void *et_object;
unsigned int et_max_leaf_clusters;
};
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
struct buffer_head *bh);
+struct ocfs2_xattr_value_buf;
void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
- struct buffer_head *bh,
- struct ocfs2_xattr_value_root *xv);
+ struct ocfs2_xattr_value_buf *vb);
+
+/*
+ * Read an extent block into *bh. If *bh is NULL, a bh will be
+ * allocated. This is a cached read. The extent block will be validated
+ * with ocfs2_validate_extent_block().
+ */
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+ struct buffer_head **bh);
struct ocfs2_alloc_context;
int ocfs2_insert_extent(struct ocfs2_super *osb,
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode,
u32 cpos, u32 len, handle_t *handle,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_btree_range(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 phys_cpos, u32 len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
struct ocfs2_extent_tree *et);
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
*/
struct ocfs2_cached_dealloc_ctxt {
struct ocfs2_per_slot_free_list *c_first_suballocator;
+ struct ocfs2_cached_block_free *c_global_allocator;
};
static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
{
c->c_first_suballocator = NULL;
+ c->c_global_allocator = NULL;
+}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ u64 blkno, unsigned int bit);
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+ return c->c_global_allocator != NULL;
}
int ocfs2_run_deallocs(struct ocfs2_super *osb,
struct ocfs2_cached_dealloc_ctxt *ctxt);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c22543b3342..a067a6cffb0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -27,6 +27,7 @@
#include <linux/swap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mpage.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
- status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+ status = ocfs2_read_inode_block(inode, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
- fe->i_signature);
- goto bail;
- }
-
if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(fe->i_clusters))) {
mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
BUG_ON(!PageLocked(page));
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
- ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+ ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
if (ocfs2_should_order_data(inode)) {
ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
- ret = walk_page_buffers(handle,
- page_buffers(page),
- from, to, NULL,
- ocfs2_journal_dirty_data);
-#endif
if (ret < 0)
mlog_errno(ret);
}
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode,
tmppage = wc->w_pages[i];
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode)) {
+ if (ocfs2_should_order_data(inode))
ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
- walk_page_buffers(wc->w_handle,
- page_buffers(tmppage),
- from, to, NULL,
- ocfs2_journal_dirty_data);
-#endif
- }
block_commit_write(tmppage, from, to);
}
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
goto out;
}
- ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
ocfs2_commit_trans(osb, handle);
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
wc->w_handle = handle;
+ if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
+ ret = -EDQUOT;
+ goto out_commit;
+ }
/*
* We don't want this to fail in ocfs2_write_end(), so do it
* here.
*/
- ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out_quota;
}
/*
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
mmap_page);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out_quota;
}
ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
len);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out_quota;
}
if (data_ac)
@@ -1790,6 +1776,10 @@ success:
*pagep = wc->w_target_page;
*fsdata = wc;
return 0;
+out_quota:
+ if (clusters_to_alloc)
+ vfs_dq_free_space(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
out_commit:
ocfs2_commit_trans(osb, handle);
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode)) {
+ if (ocfs2_should_order_data(inode))
ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
- walk_page_buffers(wc->w_handle,
- page_buffers(tmppage),
- from, to, NULL,
- ocfs2_journal_dirty_data);
-#endif
- }
block_commit_write(tmppage, from, to);
}
}
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644
index 00000000000..2a947c44e59
--- /dev/null
+++ b/fs/ocfs2/blockcheck.c
@@ -0,0 +1,477 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.c
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2006, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "blockcheck.h"
+
+
+/*
+ * We use the following conventions:
+ *
+ * d = # data bits
+ * p = # parity bits
+ * c = # total code bits (d + p)
+ */
+
+
+/*
+ * Calculate the bit offset in the hamming code buffer based on the bit's
+ * offset in the data buffer. Since the hamming code reserves all
+ * power-of-two bits for parity, the data bit number and the code bit
+ * number are offest by all the parity bits beforehand.
+ *
+ * Recall that bit numbers in hamming code are 1-based. This function
+ * takes the 0-based data bit from the caller.
+ *
+ * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0),
+ * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit.
+ * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3
+ * in the code buffer.
+ *
+ * The caller can pass in *p if it wants to keep track of the most recent
+ * number of parity bits added. This allows the function to start the
+ * calculation at the last place.
+ */
+static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
+{
+ unsigned int b, p = 0;
+
+ /*
+ * Data bits are 0-based, but we're talking code bits, which
+ * are 1-based.
+ */
+ b = i + 1;
+
+ /* Use the cache if it is there */
+ if (p_cache)
+ p = *p_cache;
+ b += p;
+
+ /*
+ * For every power of two below our bit number, bump our bit.
+ *
+ * We compare with (b + 1) because we have to compare with what b
+ * would be _if_ it were bumped up by the parity bit. Capice?
+ *
+ * p is set above.
+ */
+ for (; (1 << p) < (b + 1); p++)
+ b++;
+
+ if (p_cache)
+ *p_cache = p;
+
+ return b;
+}
+
+/*
+ * This is the low level encoder function. It can be called across
+ * multiple hunks just like the crc32 code. 'd' is the number of bits
+ * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
+{
+ unsigned int i, b, p = 0;
+
+ BUG_ON(!d);
+
+ /*
+ * b is the hamming code bit number. Hamming code specifies a
+ * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is
+ * for the algorithm.
+ *
+ * The i++ in the for loop is so that the start offset passed
+ * to ocfs2_find_next_bit_set() is one greater than the previously
+ * found bit.
+ */
+ for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
+ {
+ /*
+ * i is the offset in this hunk, nr + i is the total bit
+ * offset.
+ */
+ b = calc_code_bit(nr + i, &p);
+
+ /*
+ * Data bits in the resultant code are checked by
+ * parity bits that are part of the bit number
+ * representation. Huh?
+ *
+ * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+ * In other words, the parity bit at position 2^k
+ * checks bits in positions having bit k set in
+ * their binary representation. Conversely, for
+ * instance, bit 13, i.e. 1101(2), is checked by
+ * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
+ * </wikipedia>
+ *
+ * Note that 'k' is the _code_ bit number. 'b' in
+ * our loop.
+ */
+ parity ^= b;
+ }
+
+ /* While the data buffer was treated as little endian, the
+ * return value is in host endian. */
+ return parity;
+}
+
+u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
+{
+ return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
+}
+
+/*
+ * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
+ * offset of the current hunk. If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one hunk, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+ unsigned int fix)
+{
+ unsigned int i, b;
+
+ BUG_ON(!d);
+
+ /*
+ * If the bit to fix has an hweight of 1, it's a parity bit. One
+ * busted parity bit is its own error. Nothing to do here.
+ */
+ if (hweight32(fix) == 1)
+ return;
+
+ /*
+ * nr + d is the bit right past the data hunk we're looking at.
+ * If fix after that, nothing to do
+ */
+ if (fix >= calc_code_bit(nr + d, NULL))
+ return;
+
+ /*
+ * nr is the offset in the data hunk we're starting at. Let's
+ * start b at the offset in the code buffer. See hamming_encode()
+ * for a more detailed description of 'b'.
+ */
+ b = calc_code_bit(nr, NULL);
+ /* If the fix is before this hunk, nothing to do */
+ if (fix < b)
+ return;
+
+ for (i = 0; i < d; i++, b++)
+ {
+ /* Skip past parity bits */
+ while (hweight32(b) == 1)
+ b++;
+
+ /*
+ * i is the offset in this data hunk.
+ * nr + i is the offset in the total data buffer.
+ * b is the offset in the total code buffer.
+ *
+ * Thus, when b == fix, bit i in the current hunk needs
+ * fixing.
+ */
+ if (b == fix)
+ {
+ if (ocfs2_test_bit(i, data))
+ ocfs2_clear_bit(i, data);
+ else
+ ocfs2_set_bit(i, data);
+ break;
+ }
+ }
+}
+
+void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+ unsigned int fix)
+{
+ ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
+}
+
+/*
+ * This function generates check information for a block.
+ * data is the block to be checked. bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information. If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc)
+{
+ u32 crc;
+ u32 ecc;
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+ crc = crc32_le(~0, data, blocksize);
+ ecc = ocfs2_hamming_encode_block(data, blocksize);
+
+ /*
+ * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+ * larger than 16 bits.
+ */
+ BUG_ON(ecc > USHORT_MAX);
+
+ bc->bc_crc32e = cpu_to_le32(crc);
+ bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information. Like _compute,
+ * the function will take care of zeroing bc before calculating check codes.
+ * If bc is not a pointer inside data, the caller must have zeroed any
+ * inline ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc)
+{
+ int rc = 0;
+ struct ocfs2_block_check check;
+ u32 crc, ecc;
+
+ check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+ /* Fast path - if the crc32 validates, we're good to go */
+ crc = crc32_le(~0, data, blocksize);
+ if (crc == check.bc_crc32e)
+ goto out;
+
+ mlog(ML_ERROR,
+ "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
+ (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+ /* Ok, try ECC fixups */
+ ecc = ocfs2_hamming_encode_block(data, blocksize);
+ ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+
+ /* And check the crc32 again */
+ crc = crc32_le(~0, data, blocksize);
+ if (crc == check.bc_crc32e)
+ goto out;
+
+ mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+ (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+ rc = -EIO;
+
+out:
+ bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+ return rc;
+}
+
+/*
+ * This function generates check information for a list of buffer_heads.
+ * bhs is the blocks to be checked. bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information. If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc)
+{
+ int i;
+ u32 crc, ecc;
+
+ BUG_ON(nr < 0);
+
+ if (!nr)
+ return;
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+ for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
+ crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+ /*
+ * The number of bits in a buffer is obviously b_size*8.
+ * The offset of this buffer is b_size*i, so the bit offset
+ * of this buffer is b_size*8*i.
+ */
+ ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+ bhs[i]->b_size * 8,
+ bhs[i]->b_size * 8 * i);
+ }
+
+ /*
+ * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+ * larger than 16 bits.
+ */
+ BUG_ON(ecc > USHORT_MAX);
+
+ bc->bc_crc32e = cpu_to_le32(crc);
+ bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information on a list of
+ * buffer_heads. Like _compute_bhs, the function will take care of
+ * zeroing bc before calculating check codes. If bc is not a pointer
+ * inside data, the caller must have zeroed any inline
+ * ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc)
+{
+ int i, rc = 0;
+ struct ocfs2_block_check check;
+ u32 crc, ecc, fix;
+
+ BUG_ON(nr < 0);
+
+ if (!nr)
+ return 0;
+
+ check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+ /* Fast path - if the crc32 validates, we're good to go */
+ for (i = 0, crc = ~0; i < nr; i++)
+ crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+ if (crc == check.bc_crc32e)
+ goto out;
+
+ mlog(ML_ERROR,
+ "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
+ (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+ /* Ok, try ECC fixups */
+ for (i = 0, ecc = 0; i < nr; i++) {
+ /*
+ * The number of bits in a buffer is obviously b_size*8.
+ * The offset of this buffer is b_size*i, so the bit offset
+ * of this buffer is b_size*8*i.
+ */
+ ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+ bhs[i]->b_size * 8,
+ bhs[i]->b_size * 8 * i);
+ }
+ fix = ecc ^ check.bc_ecc;
+ for (i = 0; i < nr; i++) {
+ /*
+ * Try the fix against each buffer. It will only affect
+ * one of them.
+ */
+ ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
+ bhs[i]->b_size * 8 * i, fix);
+ }
+
+ /* And check the crc32 again */
+ for (i = 0, crc = ~0; i < nr; i++)
+ crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+ if (crc == check.bc_crc32e)
+ goto out;
+
+ mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+ (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+ rc = -EIO;
+
+out:
+ bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+ return rc;
+}
+
+/*
+ * These are the main API. They check the superblock flag before
+ * calling the underlying operations.
+ *
+ * They expect the buffer(s) to be in disk format.
+ */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc)
+{
+ if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+ ocfs2_block_check_compute(data, sb->s_blocksize, bc);
+}
+
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc)
+{
+ int rc = 0;
+
+ if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+ rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+
+ return rc;
+}
+
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc)
+{
+ if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+ ocfs2_block_check_compute_bhs(bhs, nr, bc);
+}
+
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc)
+{
+ int rc = 0;
+
+ if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+ rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
+
+ return rc;
+}
+
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644
index 00000000000..70ec3feda32
--- /dev/null
+++ b/fs/ocfs2/blockcheck.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.h
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_BLOCKCHECK_H
+#define OCFS2_BLOCKCHECK_H
+
+
+/* High level block API */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc);
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc);
+
+/* Lower level API */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc);
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc);
+
+/*
+ * Hamming code functions
+ */
+
+/*
+ * Encoding hamming code parity bits for a buffer.
+ *
+ * This is the low level encoder function. It can be called across
+ * multiple hunks just like the crc32 code. 'd' is the number of bits
+ * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
+ unsigned int nr);
+/*
+ * Fix a buffer with a bit error. The 'fix' is the original parity
+ * xor'd with the parity calculated now.
+ *
+ * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
+ * offset of the current hunk. If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one buffer, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+ unsigned int fix);
+
+/* Convenience wrappers for a single buffer of data */
+extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
+extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+ unsigned int fix);
+#endif
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 3a178ec48d7..15c8e6deee2 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -39,6 +39,18 @@
#include "buffer_head_io.h"
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart.
+ */
+enum ocfs2_state_bits {
+ BH_NeedsValidate = BH_JBDPrivateStart,
+};
+
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
+
int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
struct inode *inode)
{
@@ -166,7 +178,9 @@ bail:
}
int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
- struct buffer_head *bhs[], int flags)
+ struct buffer_head *bhs[], int flags,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
{
int status = 0;
int i, ignore_cache = 0;
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
+ if (validate)
+ set_buffer_needs_validate(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh);
continue;
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
bhs[i] = NULL;
continue;
}
+
+ if (buffer_needs_validate(bh)) {
+ /* We never set NeedsValidate if the
+ * buffer was held by the journal, so
+ * that better not have changed */
+ BUG_ON(buffer_jbd(bh));
+ clear_buffer_needs_validate(bh);
+ status = validate(inode->i_sb, bh);
+ if (status) {
+ put_bh(bh);
+ bhs[i] = NULL;
+ continue;
+ }
+ }
}
/* Always set the buffer in the cache, even if it was
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 75e1dcb1ade..c75d682dadd 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,21 +31,24 @@
void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
int uptodate);
-static inline int ocfs2_read_block(struct inode *inode,
- u64 off,
- struct buffer_head **bh);
-
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
struct inode *inode);
-int ocfs2_read_blocks(struct inode *inode,
- u64 block,
- int nr,
- struct buffer_head *bhs[],
- int flags);
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[]);
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk. It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+ struct buffer_head *bhs[], int flags,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh));
+
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh);
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
#define OCFS2_BH_READAHEAD 8
static inline int ocfs2_read_block(struct inode *inode, u64 off,
- struct buffer_head **bh)
+ struct buffer_head **bh,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
{
int status = 0;
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
goto bail;
}
- status = ocfs2_read_blocks(inode, off, 1, bh, 0);
+ status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
bail:
return status;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6ebaa58e2c0..04697ba7f73 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -854,7 +854,7 @@ static int o2hb_thread(void *data)
while (!kthread_should_stop() && !reg->hr_unclean_stop) {
/* We track the time spent inside
- * o2hb_do_disk_heartbeat so that we avoid more then
+ * o2hb_do_disk_heartbeat so that we avoid more than
* hr_timeout_ms between disk writes. On busy systems
* this should result in a heartbeat which is less
* likely to time itself out. */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d8a0cb92cef..96df5416993 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(QUORUM),
define_mask(EXPORT),
define_mask(XATTR),
+ define_mask(QUOTA),
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 57670c68047..7e72a81bc2d 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
+#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
/* bits that are infrequently given and frequently matched in the high word */
#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 026e6eb8518..f2c4098cf33 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -40,6 +40,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
@@ -47,6 +48,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **new_bh);
-static struct buffer_head *ocfs2_bread(struct inode *inode,
- int block, int *err, int reada)
+/*
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_dir_has_trailer(struct inode *dir)
{
- struct buffer_head *bh = NULL;
- int tmperr;
- u64 p_blkno;
- int readflags = 0;
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
- if (reada)
- readflags |= OCFS2_BH_READAHEAD;
+ return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+}
- if (((u64)block << inode->i_sb->s_blocksize_bits) >=
- i_size_read(inode)) {
- BUG_ON(!reada);
- return NULL;
- }
+static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+{
+ return ocfs2_meta_ecc(osb);
+}
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
- NULL);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- if (tmperr < 0) {
- mlog_errno(tmperr);
- goto fail;
- }
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
+{
+ return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
+}
- tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
- if (tmperr < 0)
- goto fail;
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
- tmperr = 0;
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+ void *data)
+{
+ char *p = data;
- *err = 0;
- return bh;
+ p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
+ return (struct ocfs2_dir_block_trailer *)p;
+}
-fail:
- brelse(bh);
- bh = NULL;
+/*
+ * XXX: This is executed once on every dirent. We should consider optimizing
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+ struct ocfs2_dir_entry *de,
+ unsigned long offset,
+ unsigned long blklen)
+{
+ unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
- *err = -EIO;
- return NULL;
+ if (!ocfs2_dir_has_trailer(dir))
+ return 0;
+
+ if (offset != toff)
+ return 0;
+
+ return 1;
+}
+
+static void ocfs2_init_dir_trailer(struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct ocfs2_dir_block_trailer *trailer;
+
+ trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+ strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+ trailer->db_compat_rec_len =
+ cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+ trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+ trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
}
/*
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
- ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+ ret = ocfs2_read_inode_block(dir, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -250,6 +277,108 @@ out:
return NULL;
}
+static int ocfs2_validate_dir_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_dir_block_trailer *trailer =
+ ocfs2_trailer_from_bh(bh, sb);
+
+
+ /*
+ * We don't validate dirents here, that's handled
+ * in-place when the code walks them.
+ */
+ mlog(0, "Validating dirblock %llu\n",
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ *
+ * Note that we are safe to call this even if the directory
+ * doesn't have a trailer. Filesystems without metaecc will do
+ * nothing, and filesystems with it will have one.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+ if (rc)
+ mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+ (unsigned long long)bh->b_blocknr);
+
+ return rc;
+}
+
+/*
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread(). We haven't audited what returning the
+ * real error codes would do to callers. We log the real codes with
+ * mlog_errno() before we squash them.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+ struct buffer_head **bh, int flags)
+{
+ int rc = 0;
+ struct buffer_head *tmp = *bh;
+ struct ocfs2_dir_block_trailer *trailer;
+
+ rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+ ocfs2_validate_dir_block);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ /*
+ * We check the trailer here rather than in
+ * ocfs2_validate_dir_block() because that function doesn't have
+ * the inode to test.
+ */
+ if (!(flags & OCFS2_BH_READAHEAD) &&
+ ocfs2_dir_has_trailer(inode)) {
+ trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
+ if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+ rc = -EINVAL;
+ ocfs2_error(inode->i_sb,
+ "Invalid dirblock #%llu: "
+ "signature = %.*s\n",
+ (unsigned long long)tmp->b_blocknr, 7,
+ trailer->db_signature);
+ goto out;
+ }
+ if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
+ rc = -EINVAL;
+ ocfs2_error(inode->i_sb,
+ "Directory block #%llu has an invalid "
+ "db_blkno of %llu",
+ (unsigned long long)tmp->b_blocknr,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ goto out;
+ }
+ if (le64_to_cpu(trailer->db_parent_dinode) !=
+ OCFS2_I(inode)->ip_blkno) {
+ rc = -EINVAL;
+ ocfs2_error(inode->i_sb,
+ "Directory block #%llu on dinode "
+ "#%llu has an invalid parent_dinode "
+ "of %llu",
+ (unsigned long long)tmp->b_blocknr,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ goto out;
+ }
+ }
+
+ /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+ if (!*bh)
+ *bh = tmp;
+
+out:
+ return rc ? -EIO : 0;
+}
+
static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
struct inode *dir,
struct ocfs2_dir_entry **res_dir)
@@ -296,15 +425,17 @@ restart:
}
num++;
- bh = ocfs2_bread(dir, b++, &err, 1);
+ bh = NULL;
+ err = ocfs2_read_dir_block(dir, b++, &bh,
+ OCFS2_BH_READAHEAD);
bh_use[ra_max] = bh;
}
}
if ((bh = bh_use[ra_ptr++]) == NULL)
goto next;
- if (ocfs2_read_block(dir, block, &bh)) {
+ if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
/* read error, skip block & hope for the best.
- * ocfs2_read_block() has released the bh. */
+ * ocfs2_read_dir_block() has released the bh. */
ocfs2_error(dir->i_sb, "reading directory %llu, "
"offset %lu\n",
(unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
struct inode *new_entry_inode)
{
int ret;
+ ocfs2_journal_access_func access = ocfs2_journal_access_db;
/*
* The same code works fine for both inline-data and extent
- * based directories, so no need to split this up.
+ * based directories, so no need to split this up. The only
+ * difference is the journal_access function.
*/
- ret = ocfs2_journal_access(handle, dir, de_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ access = ocfs2_journal_access_di;
+
+ ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
{
struct ocfs2_dir_entry *de, *pde;
int i, status = -ENOENT;
+ ocfs2_journal_access_func access = ocfs2_journal_access_db;
mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ access = ocfs2_journal_access_di;
+
i = 0;
pde = NULL;
de = (struct ocfs2_dir_entry *) first_de;
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
goto bail;
}
if (de == de_del) {
- status = ocfs2_journal_access(handle, dir, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = access(handle, dir, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
status = -EIO;
mlog_errno(status);
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
- ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+ ret = ocfs2_read_inode_block(dir, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle,
goto bail;
}
+ /* We're guaranteed that we should have space, so we
+ * can't possibly have hit the trailer...right? */
+ mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+ "Hit dir trailer trying to insert %.*s "
+ "(namelen %d) into directory %llu. "
+ "offset is %lu, trailer offset is %d\n",
+ namelen, name, namelen,
+ (unsigned long long)parent_fe_bh->b_blocknr,
+ offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
+
if (ocfs2_dirent_would_fit(de, rec_len)) {
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle,
goto bail;
}
- status = ocfs2_journal_access(handle, dir, insert_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ if (insert_bh == parent_fe_bh)
+ status = ocfs2_journal_access_di(handle, dir,
+ insert_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ else
+ status = ocfs2_journal_access_db(handle, dir,
+ insert_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
/* By now the buffer is marked for journaling */
offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle,
retval = 0;
goto bail;
}
+
offset += le16_to_cpu(de->rec_len);
de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
}
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
struct ocfs2_inline_data *data;
struct ocfs2_dir_entry *de;
- ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+ ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
int i, stored;
struct buffer_head * bh, * tmp;
struct ocfs2_dir_entry * de;
- int err;
struct super_block * sb = inode->i_sb;
unsigned int ra_sectors = 16;
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
while (!error && !stored && *f_pos < i_size_read(inode)) {
blk = (*f_pos) >> sb->s_blocksize_bits;
- bh = ocfs2_bread(inode, blk, &err, 0);
- if (!bh) {
- mlog(ML_ERROR,
- "directory #%llu contains a hole at offset %lld\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- *f_pos);
+ if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
+ /* Skip the corrupt dirblock and keep trying */
*f_pos += sb->s_blocksize - offset;
continue;
}
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
|| (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
i > 0; i--) {
- tmp = ocfs2_bread(inode, ++blk, &err, 1);
- brelse(tmp);
+ tmp = NULL;
+ if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+ OCFS2_BH_READAHEAD))
+ brelse(tmp);
}
last_ra_blk = blk;
ra_sectors = 8;
@@ -828,6 +981,7 @@ revalidate:
}
offset = 0;
brelse(bh);
+ bh = NULL;
}
stored = 0;
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode)
return !priv.seen_other;
}
-static void ocfs2_fill_initial_dirents(struct inode *inode,
- struct inode *parent,
- char *start, unsigned int size)
+/*
+ * Fills "." and ".." dirents in a new directory block. Returns dirent for
+ * "..", which might be used during creation of a directory with a trailing
+ * header. It is otherwise safe to ignore the return code.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+ struct inode *parent,
+ char *start,
+ unsigned int size)
{
struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
de->name_len = 2;
strcpy(de->name, "..");
ocfs2_set_de_type(de, S_IFDIR);
+
+ return de;
}
/*
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
struct ocfs2_inline_data *data = &di->id2.i_data;
unsigned int size = le16_to_cpu(data->id_count);
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
struct ocfs2_alloc_context *data_ac)
{
int status;
+ unsigned int size = osb->sb->s_blocksize;
struct buffer_head *new_bh = NULL;
+ struct ocfs2_dir_entry *de;
mlog_entry_void();
+ if (ocfs2_supports_dir_trailer(osb))
+ size = ocfs2_dir_trailer_blk_off(parent->i_sb);
+
status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
data_ac, NULL, &new_bh);
if (status < 0) {
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
ocfs2_set_new_buffer_uptodate(inode, new_bh);
- status = ocfs2_journal_access(handle, inode, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_db(handle, inode, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
memset(new_bh->b_data, 0, osb->sb->s_blocksize);
- ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
- osb->sb->s_blocksize);
+ de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
+ if (ocfs2_supports_dir_trailer(osb))
+ ocfs2_init_dir_trailer(inode, new_bh);
status = ocfs2_journal_dirty(handle, new_bh);
if (status < 0) {
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
data_ac);
}
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
- unsigned int new_size)
+ struct super_block *sb)
{
struct ocfs2_dir_entry *de;
struct ocfs2_dir_entry *prev_de;
char *de_buf, *limit;
- unsigned int bytes = new_size - old_size;
+ unsigned int new_size = sb->s_blocksize;
+ unsigned int bytes;
+
+ if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+ new_size = ocfs2_dir_trailer_blk_off(sb);
+
+ bytes = new_size - old_size;
limit = start + old_size;
de_buf = start;
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
unsigned int blocks_wanted,
struct buffer_head **first_block_bh)
{
- int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
u32 alloc, bit_off, len;
struct super_block *sb = dir->i_sb;
+ int ret, credits = ocfs2_inline_to_extents_credits(sb);
u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
handle_t *handle;
struct ocfs2_extent_tree et;
+ int did_quota = 0;
ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
@@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
goto out_sem;
}
+ if (vfs_dq_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+ ret = -EDQUOT;
+ goto out_commit;
+ }
+ did_quota = 1;
/*
* Try to claim as many clusters as the bitmap can give though
* if we only get one now, that's enough to continue. The rest
@@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
- ret = ocfs2_journal_access(handle, dir, dirdata_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
memset(dirdata_bh->b_data + i_size_read(dir), 0,
sb->s_blocksize - i_size_read(dir));
- ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
- sb->s_blocksize);
+ ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
+ if (ocfs2_supports_dir_trailer(osb))
+ ocfs2_init_dir_trailer(dir, dirdata_bh);
ret = ocfs2_journal_dirty(handle, dirdata_bh);
if (ret) {
@@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* We let the later dirent insert modify c/mtime - to the user
* the data hasn't changed.
*/
- ret = ocfs2_journal_access(handle, dir, di_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ ret = ocfs2_journal_access_di(handle, dir, di_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
dirdata_bh = NULL;
out_commit:
+ if (ret < 0 && did_quota)
+ vfs_dq_free_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, 2));
ocfs2_commit_trans(osb, handle);
out_sem:
@@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
struct buffer_head **new_bh)
{
int status;
- int extend;
+ int extend, did_quota = 0;
u64 p_blkno, v_blkno;
spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
if (extend) {
u32 offset = OCFS2_I(dir)->ip_clusters;
+ if (vfs_dq_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(sb, 1))) {
+ status = -EDQUOT;
+ goto bail;
+ }
+ did_quota = 1;
+
status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
1, 0, parent_fe_bh, handle,
data_ac, meta_ac, NULL);
@@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
}
status = 0;
bail:
+ if (did_quota && status < 0)
+ vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
mlog_exit(status);
return status;
}
@@ -1569,16 +1771,22 @@ do_extend:
ocfs2_set_new_buffer_uptodate(dir, new_bh);
- status = ocfs2_journal_access(handle, dir, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_db(handle, dir, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
memset(new_bh->b_data, 0, sb->s_blocksize);
+
de = (struct ocfs2_dir_entry *) new_bh->b_data;
de->inode = 0;
- de->rec_len = cpu_to_le16(sb->s_blocksize);
+ if (ocfs2_dir_has_trailer(dir)) {
+ de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
+ ocfs2_init_dir_trailer(dir, new_bh);
+ } else {
+ de->rec_len = cpu_to_le16(sb->s_blocksize);
+ }
status = ocfs2_journal_dirty(handle, new_bh);
if (status < 0) {
mlog_errno(status);
@@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
unsigned int *blocks_wanted)
{
int ret;
+ struct super_block *sb = dir->i_sb;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_dir_entry *de, *last_de = NULL;
char *de_buf, *limit;
unsigned long offset = 0;
- unsigned int rec_len, new_rec_len;
+ unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+
+ /*
+ * This calculates how many free bytes we'd have in block zero, should
+ * this function force expansion to an extent tree.
+ */
+ if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+ free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
+ else
+ free_space = dir->i_sb->s_blocksize - i_size_read(dir);
de_buf = di->id2.i_data.id_data;
limit = de_buf + i_size_read(dir);
@@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
ret = -EEXIST;
goto out;
}
+ /*
+ * No need to check for a trailing dirent record here as
+ * they're not used for inline dirs.
+ */
+
if (ocfs2_dirent_would_fit(de, rec_len)) {
/* Ok, we found a spot. Return this bh and let
* the caller actually fill it in. */
@@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
* dirent can be found.
*/
*blocks_wanted = 1;
- new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir));
+ new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
*blocks_wanted = 2;
@@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
struct ocfs2_dir_entry *de;
struct super_block *sb = dir->i_sb;
int status;
+ int blocksize = dir->i_sb->s_blocksize;
- bh = ocfs2_bread(dir, 0, &status, 0);
- if (!bh) {
+ status = ocfs2_read_dir_block(dir, 0, &bh, 0);
+ if (status) {
mlog_errno(status);
goto bail;
}
@@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
status = -ENOSPC;
goto bail;
}
- bh = ocfs2_bread(dir,
- offset >> sb->s_blocksize_bits,
- &status,
- 0);
- if (!bh) {
+ status = ocfs2_read_dir_block(dir,
+ offset >> sb->s_blocksize_bits,
+ &bh, 0);
+ if (status) {
mlog_errno(status);
goto bail;
}
@@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
status = -EEXIST;
goto bail;
}
+
+ if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
+ blocksize))
+ goto next;
+
if (ocfs2_dirent_would_fit(de, rec_len)) {
/* Ok, we found a spot. Return this bh and let
* the caller actually fill it in. */
@@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
status = 0;
goto bail;
}
+next:
offset += le16_to_cpu(de->rec_len);
de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index ce48b9080d8..c511e2e18e9 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
struct ocfs2_alloc_context *data_ac);
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+ void *data);
#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 644bee55d8b..d07ddbe4b28 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
struct list_head *iter, *head=NULL;
u64 cookie;
u32 flags;
+ u8 node;
if (!dlm_grab(dlm)) {
dlm_error(DLM_REJECTED);
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
name = past->name;
locklen = past->namelen;
- cookie = be64_to_cpu(past->cookie);
+ cookie = past->cookie;
flags = be32_to_cpu(past->flags);
+ node = past->node_idx;
if (locklen > DLM_LOCKID_NAME_MAX) {
ret = DLM_IVBUFLEN;
- mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
+ mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
+ "handler!\n", locklen);
goto leave;
}
if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
(LKM_PUT_LVB|LKM_GET_LVB)) {
- mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+ mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
+ flags);
ret = DLM_BADARGS;
goto leave;
}
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
if (past->type != DLM_AST &&
past->type != DLM_BAST) {
mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
- "name=%.*s\n", past->type,
- dlm_get_lock_cookie_node(cookie),
- dlm_get_lock_cookie_seq(cookie),
- locklen, name);
+ "name=%.*s, node=%u\n", past->type,
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ locklen, name, node);
ret = DLM_IVLOCKID;
goto leave;
}
res = dlm_lookup_lockres(dlm, name, locklen);
if (!res) {
- mlog(0, "got %sast for unknown lockres! "
- "cookie=%u:%llu, name=%.*s, namelen=%u\n",
- past->type == DLM_AST ? "" : "b",
- dlm_get_lock_cookie_node(cookie),
- dlm_get_lock_cookie_seq(cookie),
- locklen, name, locklen);
+ mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
+ "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ locklen, name, node);
ret = DLM_IVLOCKID;
goto leave;
}
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
- mlog(0, "responding with DLM_RECOVERING!\n");
+ mlog(0, "Responding with DLM_RECOVERING!\n");
ret = DLM_RECOVERING;
goto unlock_out;
}
if (res->state & DLM_LOCK_RES_MIGRATING) {
- mlog(0, "responding with DLM_MIGRATING!\n");
+ mlog(0, "Responding with DLM_MIGRATING!\n");
ret = DLM_MIGRATING;
goto unlock_out;
}
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
lock = NULL;
list_for_each(iter, head) {
lock = list_entry (iter, struct dlm_lock, list);
- if (be64_to_cpu(lock->ml.cookie) == cookie)
+ if (lock->ml.cookie == cookie)
goto do_ast;
}
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
list_for_each(iter, head) {
lock = list_entry (iter, struct dlm_lock, list);
- if (be64_to_cpu(lock->ml.cookie) == cookie)
+ if (lock->ml.cookie == cookie)
goto do_ast;
}
- mlog(0, "got %sast for unknown lock! cookie=%u:%llu, "
- "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b",
- dlm_get_lock_cookie_node(cookie),
- dlm_get_lock_cookie_seq(cookie),
- locklen, name, locklen);
+ mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
+ "node=%u\n", past->type == DLM_AST ? "" : "b",
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ locklen, name, node);
ret = DLM_NORMAL;
unlock_out:
@@ -383,8 +386,8 @@ do_ast:
if (past->type == DLM_AST) {
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->granted);
- mlog(0, "ast: adding to granted list... type=%d, "
- "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+ mlog(0, "ast: Adding to granted list... type=%d, "
+ "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
if (lock->ml.convert_type != LKM_IVMODE) {
lock->ml.type = lock->ml.convert_type;
lock->ml.convert_type = LKM_IVMODE;
@@ -408,7 +411,6 @@ do_ast:
dlm_do_local_bast(dlm, res, lock, past->blocked_type);
leave:
-
if (res)
dlm_lockres_put(res);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d5a86fb81a4..bb53714813a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
unsigned int purge_count;
spinlock_t spinlock;
spinlock_t ast_lock;
+ spinlock_t track_lock;
char *name;
u8 node_num;
u32 key;
@@ -316,6 +317,8 @@ struct dlm_lock_resource
* put on a list for the dlm thread to run. */
unsigned long last_used;
+ struct dlm_ctxt *dlm;
+
unsigned migration_pending:1;
atomic_t asts_reserved;
spinlock_t spinlock;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 1b81dcba175..b32f60a5acf 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
{
struct debug_lockres *dl = m->private;
struct dlm_ctxt *dlm = dl->dl_ctxt;
+ struct dlm_lock_resource *oldres = dl->dl_res;
struct dlm_lock_resource *res = NULL;
+ struct list_head *track_list;
- spin_lock(&dlm->spinlock);
+ spin_lock(&dlm->track_lock);
+ if (oldres)
+ track_list = &oldres->tracking;
+ else
+ track_list = &dlm->tracking_list;
- if (dl->dl_res) {
- list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
- if (dl->dl_res) {
- dlm_lockres_put(dl->dl_res);
- dl->dl_res = NULL;
- }
- if (&res->tracking == &dlm->tracking_list) {
- mlog(0, "End of list found, %p\n", res);
- dl = NULL;
- break;
- }
+ list_for_each_entry(res, track_list, tracking) {
+ if (&res->tracking == &dlm->tracking_list)
+ res = NULL;
+ else
dlm_lockres_get(res);
- dl->dl_res = res;
- break;
- }
- } else {
- if (!list_empty(&dlm->tracking_list)) {
- list_for_each_entry(res, &dlm->tracking_list, tracking)
- break;
- dlm_lockres_get(res);
- dl->dl_res = res;
- } else
- dl = NULL;
+ break;
}
+ spin_unlock(&dlm->track_lock);
- if (dl) {
- spin_lock(&dl->dl_res->spinlock);
- dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
- spin_unlock(&dl->dl_res->spinlock);
- }
+ if (oldres)
+ dlm_lockres_put(oldres);
- spin_unlock(&dlm->spinlock);
+ dl->dl_res = res;
+
+ if (res) {
+ spin_lock(&res->spinlock);
+ dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
+ spin_unlock(&res->spinlock);
+ } else
+ dl = NULL;
+ /* passed to seq_show */
return dl;
}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 63f8125824e..d8d578f4561 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
spin_lock_init(&dlm->spinlock);
spin_lock_init(&dlm->master_lock);
spin_lock_init(&dlm->ast_lock);
+ spin_lock_init(&dlm->track_lock);
INIT_LIST_HEAD(&dlm->list);
INIT_LIST_HEAD(&dlm->dirty_list);
INIT_LIST_HEAD(&dlm->reco.resources);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 6f7a77d5402..1c9efb406a9 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_blocks = 0;
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inc_nlink(inode);
@@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_blocks = 0;
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 44f87caf368..54e182a27ca 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
static void dlm_lockres_release(struct kref *kref)
{
struct dlm_lock_resource *res;
+ struct dlm_ctxt *dlm;
res = container_of(kref, struct dlm_lock_resource, refs);
+ dlm = res->dlm;
/* This should not happen -- all lockres' have a name
* associated with them at init time. */
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
+ spin_lock(&dlm->track_lock);
if (!list_empty(&res->tracking))
list_del_init(&res->tracking);
else {
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref)
res->lockname.len, res->lockname.name);
dlm_print_one_lock_resource(res);
}
+ spin_unlock(&dlm->track_lock);
+
+ dlm_put(dlm);
if (!hlist_unhashed(&res->hash_node) ||
!list_empty(&res->granted) ||
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->migration_pending = 0;
res->inflight_locks = 0;
+ /* put in dlm_lockres_release */
+ dlm_grab(dlm);
+ res->dlm = dlm;
+
kref_init(&res->refs);
/* just for consistency */
@@ -722,14 +732,21 @@ lookup:
if (tmpres) {
int dropping_ref = 0;
+ spin_unlock(&dlm->spinlock);
+
spin_lock(&tmpres->spinlock);
+ /* We wait for the other thread that is mastering the resource */
+ if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+ __dlm_wait_on_lockres(tmpres);
+ BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+ }
+
if (tmpres->owner == dlm->node_num) {
BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
dlm_lockres_grab_inflight_ref(dlm, tmpres);
} else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
dropping_ref = 1;
spin_unlock(&tmpres->spinlock);
- spin_unlock(&dlm->spinlock);
/* wait until done messaging the master, drop our ref to allow
* the lockres to be purged, start over. */
@@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
struct dlm_node_iter *iter)
{
struct dlm_migrate_request migrate;
- int ret, status = 0;
+ int ret, skip, status = 0;
int nodenum;
memset(&migrate, 0, sizeof(migrate));
@@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
nodenum == new_master)
continue;
+ /* We could race exit domain. If exited, skip. */
+ spin_lock(&dlm->spinlock);
+ skip = (!test_bit(nodenum, dlm->domain_map));
+ spin_unlock(&dlm->spinlock);
+ if (skip) {
+ clear_bit(nodenum, iter->node_map);
+ continue;
+ }
+
ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
&migrate, sizeof(migrate), nodenum,
&status);
- if (ret < 0)
- mlog_errno(ret);
- else if (status < 0) {
+ if (ret < 0) {
+ mlog(0, "migrate_request returned %d!\n", ret);
+ if (!dlm_is_host_down(ret)) {
+ mlog(ML_ERROR, "unhandled error=%d!\n", ret);
+ BUG();
+ }
+ clear_bit(nodenum, iter->node_map);
+ ret = 0;
+ } else if (status < 0) {
mlog(0, "migrate request (node %u) returned %d!\n",
nodenum, status);
ret = status;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc..d1295203029 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
/* This ensures that clear refmap is sent after the set */
- __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+ __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
+ DLM_LOCK_RES_MIGRATING));
spin_unlock(&res->spinlock);
/* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e6cc0a2e5f..b0c4cadd4c4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -32,6 +32,7 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_DLM_GLUE
#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
#include "slot_map.h"
#include "super.h"
#include "uptodate.h"
+#include "quota.h"
#include "buffer_head_io.h"
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter {
static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
/*
* Return value from ->downconvert_worker functions.
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
@@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
unsigned int line,
struct ocfs2_lock_res *lockres)
{
- struct ocfs2_meta_lvb *lvb =
- (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+ struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
mlog(level, "LVB information for %s (called from %s:%u):\n",
lockres->l_name, function, line);
@@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
+ .set_lvb = ocfs2_set_qinfo_lvb,
+ .get_osb = ocfs2_get_qinfo_osb,
+ .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
+};
+
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
return (struct ocfs2_dentry_lock *)lockres->l_priv;
}
+static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
+{
+ BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
+
+ return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
+}
+
static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
{
if (lockres->l_ops->get_osb)
@@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
return OCFS2_SB(inode->i_sb);
}
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
+{
+ struct ocfs2_mem_dqinfo *info = lockres->l_priv;
+
+ return OCFS2_SB(info->dqi_gi.dqi_sb);
+}
+
static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
{
struct ocfs2_file_private *fp = lockres->l_priv;
@@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
lockres->l_flags |= OCFS2_LOCK_NOCACHE;
}
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_mem_dqinfo *info)
+{
+ ocfs2_lock_res_init_once(lockres);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
+ 0, lockres->l_name);
+ ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
+ OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
+ info);
+}
+
void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
{
mlog_entry_void();
@@ -1290,7 +1324,7 @@ again:
goto out;
}
- mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
+ mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
lockres->l_name);
/* At this point we've gone inside the dlm and need to
@@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
mlog_entry_void();
- lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
/*
* Invalidate the LVB of a deleted inode - this way other
@@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
mlog_meta_lvb(0, lockres);
- lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
/* We're safe here without the lockres lock... */
spin_lock(&oi->ip_lock);
@@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
struct ocfs2_lock_res *lockres)
{
- struct ocfs2_meta_lvb *lvb =
- (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+ struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
if (lvb->lvb_version == OCFS2_LVB_VERSION
&& be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
} else {
/* Boo, we have to go to disk. */
/* read bh, cast, ocfs2_refresh_inode */
- status = ocfs2_read_block(inode, oi->ip_blkno, bh);
+ status = ocfs2_read_inode_block(inode, bh);
if (status < 0) {
mlog_errno(status);
goto bail_refresh;
@@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode,
fe = (struct ocfs2_dinode *) (*bh)->b_data;
/* This is a good chance to make sure we're not
- * locking an invalid object.
+ * locking an invalid object. ocfs2_read_inode_block()
+ * already checked that the inode block is sane.
*
* We bug on a stale inode here because we checked
* above whether it was wiped from disk. The wiping
* node provides a guarantee that we receive that
* message and can mark the inode before dropping any
* locks associated with it. */
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto bail_refresh;
- }
mlog_bug_on_msg(inode->i_generation !=
le32_to_cpu(fe->i_generation),
"Invalid dinode %llu disk generation: %u "
@@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode,
return 0;
}
- status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
+ status = ocfs2_read_inode_block(inode, ret_bh);
if (status < 0)
mlog_errno(status);
@@ -2922,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
ocfs2_dlm_dump_lksb(&lockres->l_lksb);
BUG();
}
- mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
+ mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
lockres->l_name);
ocfs2_wait_on_busy_lock(lockres);
@@ -3449,6 +3478,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
return UNBLOCK_CONTINUE_POST;
}
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
+{
+ struct ocfs2_qinfo_lvb *lvb;
+ struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
+ struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+ oinfo->dqi_gi.dqi_type);
+
+ mlog_entry_void();
+
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
+ lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
+ lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
+ lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
+ lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
+ lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
+ lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
+
+ mlog_exit_void();
+}
+
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+ struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+ struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+ mlog_entry_void();
+ if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(osb, lockres, level);
+ mlog_exit_void();
+}
+
+static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
+{
+ struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+ oinfo->dqi_gi.dqi_type);
+ struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+ struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ struct buffer_head *bh = NULL;
+ struct ocfs2_global_disk_dqinfo *gdinfo;
+ int status = 0;
+
+ if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
+ info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
+ info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
+ oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
+ oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
+ oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
+ oinfo->dqi_gi.dqi_free_entry =
+ be32_to_cpu(lvb->lvb_free_entry);
+ } else {
+ status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ gdinfo = (struct ocfs2_global_disk_dqinfo *)
+ (bh->b_data + OCFS2_GLOBAL_INFO_OFF);
+ info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
+ info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
+ oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
+ oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
+ oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
+ oinfo->dqi_gi.dqi_free_entry =
+ le32_to_cpu(gdinfo->dqi_free_entry);
+ brelse(bh);
+ ocfs2_track_lock_refresh(lockres);
+ }
+
+bail:
+ return status;
+}
+
+/* Lock quota info, this function expects at least shared lock on the quota file
+ * so that we can safely refresh quota info from disk. */
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+ struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+ struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ int status = 0;
+
+ mlog_entry_void();
+
+ /* On RO devices, locking really isn't needed... */
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (ex)
+ status = -EROFS;
+ goto bail;
+ }
+ if (ocfs2_mount_local(osb))
+ goto bail;
+
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ if (!ocfs2_should_refresh_lock_res(lockres))
+ goto bail;
+ /* OK, we have the lock but we need to refresh the quota info */
+ status = ocfs2_refresh_qinfo(oinfo);
+ if (status)
+ ocfs2_qinfo_unlock(oinfo, ex);
+ ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+ mlog_exit(status);
+ return status;
+}
+
/*
* This is the filesystem locking protocol. It provides the lock handling
* hooks for the underlying DLM. It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 2bb01f09c1b..3f8d9986b8e 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb {
__be32 lvb_reserved2;
};
+#define OCFS2_QINFO_LVB_VERSION 1
+
+struct ocfs2_qinfo_lvb {
+ __u8 lvb_version;
+ __u8 lvb_reserved[3];
+ __be32 lvb_bgrace;
+ __be32 lvb_igrace;
+ __be32 lvb_syncms;
+ __be32 lvb_blocks;
+ __be32 lvb_free_blk;
+ __be32 lvb_free_entry;
+};
+
/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
struct ocfs2_file_private;
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp);
+struct ocfs2_mem_dqinfo;
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_mem_dqinfo *info);
void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
int ocfs2_create_new_inode_locks(struct inode *inode);
int ocfs2_drop_inode_locks(struct inode *inode);
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex);
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
int ocfs2_file_lock(struct file *file, int ex, int trylock);
void ocfs2_file_unlock(struct file *file);
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2baedac5823..f2bb1a04d25 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
- ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
+ ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- ret = -EROFS;
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- goto out;
- }
-
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in "
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
goto no_more_extents;
- ret = ocfs2_read_block(inode,
- le64_to_cpu(eb->h_next_leaf_blk),
- &next_eb_bh);
+ ret = ocfs2_read_extent_block(inode,
+ le64_to_cpu(eb->h_next_leaf_blk),
+ &next_eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
- next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
-
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
- ret = -EROFS;
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
- goto out;
- }
+ next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
el = &next_eb->h_list;
-
i = ocfs2_search_for_hole_index(el, v_cluster);
}
@@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
if (ret == 0)
goto out;
- ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+ ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -819,3 +806,74 @@ out:
return ret;
}
+
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+ struct buffer_head *bhs[], int flags,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
+{
+ int rc = 0;
+ u64 p_block, p_count;
+ int i, count, done = 0;
+
+ mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
+ "flags = %x, validate = %p)\n",
+ inode, (unsigned long long)v_block, nr, bhs, flags,
+ validate);
+
+ if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
+ i_size_read(inode)) {
+ BUG_ON(!(flags & OCFS2_BH_READAHEAD));
+ goto out;
+ }
+
+ while (done < nr) {
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
+ &p_block, &p_count, NULL);
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ if (rc) {
+ mlog_errno(rc);
+ break;
+ }
+
+ if (!p_block) {
+ rc = -EIO;
+ mlog(ML_ERROR,
+ "Inode #%llu contains a hole at offset %llu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)(v_block + done) <<
+ inode->i_sb->s_blocksize_bits);
+ break;
+ }
+
+ count = nr - done;
+ if (p_count < count)
+ count = p_count;
+
+ /*
+ * If the caller passed us bhs, they should have come
+ * from a previous readahead call to this function. Thus,
+ * they should have the right b_blocknr.
+ */
+ for (i = 0; i < count; i++) {
+ if (!bhs[done + i])
+ continue;
+ BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
+ }
+
+ rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
+ flags, validate);
+ if (rc) {
+ mlog_errno(rc);
+ break;
+ }
+ done += count;
+ }
+
+out:
+ mlog_exit(rc);
+ return rc;
+}
+
+
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1c4aa8b06f3..b7dd9731b46 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
struct ocfs2_extent_list *el);
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+ struct buffer_head *bhs[], int flags,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh));
+static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
+ struct buffer_head **bh,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
+{
+ int status = 0;
+
+ if (bh == NULL) {
+ printk("ocfs2: bh == NULL\n");
+ status = -EINVAL;
+ goto bail;
+ }
+
+ status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
+
+bail:
+ return status;
+}
+
+
#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e2570a3bc2b..a5887df2cd8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -35,6 +35,7 @@
#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/falloc.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
@@ -56,6 +57,8 @@
#include "suballoc.h"
#include "super.h"
#include "xattr.h"
+#include "acl.h"
+#include "quota.h"
#include "buffer_head_io.h"
@@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
goto out;
}
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -303,9 +306,9 @@ bail:
return status;
}
-static int ocfs2_simple_size_update(struct inode *inode,
- struct buffer_head *di_bh,
- u64 new_i_size)
+int ocfs2_simple_size_update(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size)
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_commit;
@@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)new_i_size);
+ /* We trust di_bh because it comes from ocfs2_inode_lock(), which
+ * already validated it */
fe = (struct ocfs2_dinode *) di_bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
"Inode %llu, inode i_size = %lld != di "
@@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
enum ocfs2_alloc_restarted why;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_tree et;
+ int did_quota = 0;
mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
@@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
*/
BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
- status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+ status = ocfs2_read_inode_block(inode, &bh);
if (status < 0) {
mlog_errno(status);
goto leave;
}
-
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto leave;
- }
restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
@@ -585,11 +580,18 @@ restart_all:
}
restarted_transaction:
+ if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
+ clusters_to_add))) {
+ status = -EDQUOT;
+ goto leave;
+ }
+ did_quota = 1;
+
/* reserve a write to the file entry early on - that we if we
* run out of credits in the allocation path, we can still
* update i_size. */
- status = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -622,6 +624,10 @@ restarted_transaction:
spin_lock(&OCFS2_I(inode)->ip_lock);
clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
spin_unlock(&OCFS2_I(inode)->ip_lock);
+ /* Release unused quota reservation */
+ vfs_dq_free_space(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+ did_quota = 0;
if (why != RESTART_NONE && clusters_to_add) {
if (why == RESTART_META) {
@@ -654,6 +660,9 @@ restarted_transaction:
OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
leave:
+ if (status < 0 && did_quota)
+ vfs_dq_free_space(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
if (handle) {
ocfs2_commit_trans(osb, handle);
handle = NULL;
@@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
struct ocfs2_super *osb = OCFS2_SB(sb);
struct buffer_head *bh = NULL;
handle_t *handle = NULL;
+ int locked[MAXQUOTAS] = {0, 0};
+ int credits, qtype;
+ struct ocfs2_mem_dqinfo *oinfo;
mlog_entry("(0x%p, '%.*s')\n", dentry,
dentry->d_name.len, dentry->d_name.name);
@@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto bail_unlock;
+ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+ (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ credits = OCFS2_INODE_UPDATE_CREDITS;
+ if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
+ && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+ oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto bail_unlock;
+ credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+ ocfs2_calc_qdel_credits(sb, USRQUOTA);
+ locked[USRQUOTA] = 1;
+ }
+ if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
+ && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+ oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto bail_unlock;
+ credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
+ ocfs2_calc_qdel_credits(sb, GRPQUOTA);
+ locked[GRPQUOTA] = 1;
+ }
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail_unlock;
+ }
+ status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ if (status < 0)
+ goto bail_commit;
+ } else {
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail_unlock;
+ }
}
/*
@@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
+ for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+ if (!locked[qtype])
+ continue;
+ oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
+ ocfs2_unlock_global_qf(oinfo, 1);
+ }
ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
if (size_change)
@@ -989,6 +1043,12 @@ bail_unlock_rw:
bail:
brelse(bh);
+ if (!status && attr->ia_valid & ATTR_MODE) {
+ status = ocfs2_acl_chmod(inode);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
mlog_exit(status);
return status;
}
@@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask)
goto out;
}
- ret = generic_permission(inode, mask, NULL);
+ ret = generic_permission(inode, mask, ocfs2_check_acl);
ocfs2_inode_unlock(inode, 0);
out:
@@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
goto out;
}
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_trans;
@@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode)
{
int ret;
struct buffer_head *bh = NULL;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
+ ret = ocfs2_read_inode_block(inode, &bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
struct buffer_head *di_bh = NULL;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
- &di_bh);
+ ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1226,83 +1284,6 @@ out:
return ret;
}
-static int __ocfs2_remove_inode_range(struct inode *inode,
- struct buffer_head *di_bh,
- u32 cpos, u32 phys_cpos, u32 len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
-{
- int ret;
- u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct inode *tl_inode = osb->osb_tl_inode;
- handle_t *handle;
- struct ocfs2_alloc_context *meta_ac = NULL;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
- struct ocfs2_extent_tree et;
-
- ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
-
- ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
- if (ret) {
- mlog_errno(ret);
- return ret;
- }
-
- mutex_lock(&tl_inode->i_mutex);
-
- if (ocfs2_truncate_log_needs_flush(osb)) {
- ret = __ocfs2_flush_truncate_log(osb);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
- dealloc);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- OCFS2_I(inode)->ip_clusters -= len;
- di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
-
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
- if (ret)
- mlog_errno(ret);
-
-out_commit:
- ocfs2_commit_trans(osb, handle);
-out:
- mutex_unlock(&tl_inode->i_mutex);
-
- if (meta_ac)
- ocfs2_free_alloc_context(meta_ac);
-
- return ret;
-}
-
/*
* Truncate a byte range, avoiding pages within partial clusters. This
* preserves those pages for the zeroing code to write to.
@@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_cached_dealloc_ctxt dealloc;
struct address_space *mapping = inode->i_mapping;
+ struct ocfs2_extent_tree et;
+ ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
ocfs2_init_dealloc_ctxt(&dealloc);
if (byte_len == 0)
@@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
/* Only do work for non-holes */
if (phys_cpos != 0) {
- ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
- phys_cpos, alloc_size,
- &dealloc);
+ ret = ocfs2_remove_btree_range(inode, &et, cpos,
+ phys_cpos, alloc_size,
+ &dealloc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1622,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
struct ocfs2_space_resv *sr)
{
struct inode *inode = file->f_path.dentry->d_inode;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
!ocfs2_writes_unwritten_extents(osb))
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e92382cbca5..172f9fbc9fc 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_simple_size_update(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size);
int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
u64 zero_to);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7aa00d51187..229e707bc05 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/quotaops.h>
#include <asm/byteorder.h>
@@ -37,6 +38,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
@@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
return 0;
}
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
- int create_ino)
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+ int create_ino)
{
struct super_block *sb;
struct ocfs2_super *osb;
- int status = -EINVAL;
int use_plocks = 1;
mlog_entry("(0x%p, size:%llu)\n", inode,
@@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
use_plocks = 0;
- /* this means that read_inode cannot create a superblock inode
- * today. change if needed. */
- if (!OCFS2_IS_VALID_DINODE(fe) ||
- !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
- mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, "
- "signature = %.*s, flags = 0x%x\n",
- inode->i_ino,
- (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
- fe->i_signature, le32_to_cpu(fe->i_flags));
- goto bail;
- }
+ /*
+ * These have all been checked by ocfs2_read_inode_block() or set
+ * by ocfs2_mknod_locked(), so a failure is a code bug.
+ */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode
+ cannot create a superblock
+ inode today. change if
+ that is needed. */
+ BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
+ BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
- if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
- mlog(ML_ERROR, "file entry generation does not match "
- "superblock! osb->fs_generation=%x, "
- "fe->i_fs_generation=%x\n",
- osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
- goto bail;
- }
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
@@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_nlink = le16_to_cpu(fe->i_links_count);
- if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+ if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+ inode->i_flags |= S_NOQUOTA;
+ }
if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+ } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
+ inode->i_flags |= S_NOQUOTA;
} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
/* we can't actually hit this as read_inode can't
@@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
ocfs2_set_inode_flags(inode);
- status = 0;
-bail:
- mlog_exit(status);
- return status;
+ mlog_exit_void();
}
static int ocfs2_read_locked_inode(struct inode *inode,
@@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
}
- if (can_lock)
- status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
- OCFS2_BH_IGNORE_CACHE);
- else
+ if (can_lock) {
+ status = ocfs2_read_inode_block_full(inode, &bh,
+ OCFS2_BH_IGNORE_CACHE);
+ } else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
+ if (!status)
+ status = ocfs2_validate_inode_block(osb->sb, bh);
+ }
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
status = -EINVAL;
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)args->fi_blkno, 7,
- fe->i_signature);
- goto bail;
- }
/*
* This is a code bug. Right now the caller needs to
@@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
S_ISBLK(le16_to_cpu(fe->i_mode)))
- inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+ inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
- if (ocfs2_populate_inode(inode, fe, 0) < 0)
- goto bail;
+ ocfs2_populate_inode(inode, fe, 0);
BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
@@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail;
}
- handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS);
+ handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
+ ocfs2_quota_trans_credits(inode->i_sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode,
}
/* set the inodes dtime */
- status = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail_commit;
@@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
}
ocfs2_remove_from_cache(inode, di_bh);
+ vfs_dq_free_inode(inode);
status = ocfs2_free_dinode(handle, inode_alloc_inode,
inode_alloc_bh, di);
@@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode)
mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
- if (is_bad_inode(inode)) {
+ /* When we fail in read_inode() we mark inode as bad. The second test
+ * catches the case when inode allocation fails before allocating
+ * a block for inode. */
+ if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
mlog(0, "Skipping delete of bad inode\n");
goto bail;
}
@@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
mlog_entry("(inode %llu)\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode,
spin_unlock(&OCFS2_I(inode)->ip_lock);
}
+
+int ocfs2_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ mlog(0, "Validating dinode %llu\n",
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+ if (rc) {
+ mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ goto bail;
+ }
+
+ /*
+ * Errors after here are fatal.
+ */
+
+ rc = -EINVAL;
+
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ di->i_signature);
+ goto bail;
+ }
+
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ goto bail;
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ ocfs2_error(sb,
+ "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+ (unsigned long long)bh->b_blocknr);
+ goto bail;
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ ocfs2_error(sb,
+ "Invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ goto bail;
+ }
+
+ rc = 0;
+
+bail:
+ return rc;
+}
+
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+ int flags)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
+ flags, ocfs2_validate_inode_block);
+
+ /* If ocfs2_read_blocks() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
+{
+ return ocfs2_read_inode_block_full(inode, bh, 0);
+}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 2f37af9bcc4..eb3c302b38d 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
- int create_ino);
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+ int create_ino);
void ocfs2_read_inode(struct inode *inode);
void ocfs2_read_inode2(struct inode *inode, void *opaque);
ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
struct buffer_head *bh);
int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+struct buffer_head *ocfs2_bread(struct inode *inode,
+ int block, int *err, int reada);
void ocfs2_set_inode_flags(struct inode *inode);
void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
@@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
}
+/* Validate that a bh contains a valid inode */
+int ocfs2_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+/*
+ * Read an inode block into *bh. If *bh is NULL, a bh will be allocated.
+ * This is a cached read. The inode will be validated with
+ * ocfs2_validate_inode_block().
+ */
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
+/* The same, but can be passed OCFS2_BH_* flags */
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+ int flags);
#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 99fe9d584f3..57d7d25a2b9 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,6 +35,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
@@ -45,6 +46,7 @@
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
+#include "quota.h"
#include "buffer_head_io.h"
@@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock);
static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
- int node_num);
+ int node_num, int slot_num);
static int __ocfs2_recovery_thread(void *arg);
static int ocfs2_commit_cache(struct ocfs2_super *osb);
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
int dirty, int replayed);
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
@@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
int slot);
static int ocfs2_commit_thread(void *arg);
+static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{
+ return __ocfs2_wait_on_mount(osb, 0);
+}
+
+static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
+{
+ return __ocfs2_wait_on_mount(osb, 1);
+}
+
+
/*
* The recovery_list is a simple linked list of node numbers to recover.
@@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
BUG_ON(max_buffs <= 0);
- /* JBD might support this, but our journalling code doesn't yet. */
- if (journal_current_handle()) {
- mlog(ML_ERROR, "Recursive transaction attempted!\n");
- BUG();
- }
+ /* Nested transaction? Just return the handle... */
+ if (journal_current_handle())
+ return jbd2_journal_start(journal, max_buffs);
down_read(&osb->journal->j_trans_barrier);
@@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle)
{
- int ret;
+ int ret, nested;
struct ocfs2_journal *journal = osb->journal;
BUG_ON(!handle);
+ nested = handle->h_ref > 1;
ret = jbd2_journal_stop(handle);
if (ret < 0)
mlog_errno(ret);
- up_read(&journal->j_trans_barrier);
+ if (!nested)
+ up_read(&journal->j_trans_barrier);
return ret;
}
@@ -357,10 +370,137 @@ bail:
return status;
}
-int ocfs2_journal_access(handle_t *handle,
- struct inode *inode,
- struct buffer_head *bh,
- int type)
+struct ocfs2_triggers {
+ struct jbd2_buffer_trigger_type ot_triggers;
+ int ot_offset;
+};
+
+static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
+{
+ return container_of(triggers, struct ocfs2_triggers, ot_triggers);
+}
+
+static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
+
+ /*
+ * We aren't guaranteed to have the superblock here, so we
+ * must unconditionally compute the ecc data.
+ * __ocfs2_journal_access() will only set the triggers if
+ * metaecc is enabled.
+ */
+ ocfs2_block_check_compute(data, size, data + ot->ot_offset);
+}
+
+/*
+ * Quota blocks have their own trigger because the struct ocfs2_block_check
+ * offset depends on the blocksize.
+ */
+static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct ocfs2_disk_dqtrailer *dqt =
+ ocfs2_block_dqtrailer(size, data);
+
+ /*
+ * We aren't guaranteed to have the superblock here, so we
+ * must unconditionally compute the ecc data.
+ * __ocfs2_journal_access() will only set the triggers if
+ * metaecc is enabled.
+ */
+ ocfs2_block_check_compute(data, size, &dqt->dq_check);
+}
+
+/*
+ * Directory blocks also have their own trigger because the
+ * struct ocfs2_block_check offset depends on the blocksize.
+ */
+static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct ocfs2_dir_block_trailer *trailer =
+ ocfs2_dir_trailer_from_size(size, data);
+
+ /*
+ * We aren't guaranteed to have the superblock here, so we
+ * must unconditionally compute the ecc data.
+ * __ocfs2_journal_access() will only set the triggers if
+ * metaecc is enabled.
+ */
+ ocfs2_block_check_compute(data, size, &trailer->db_check);
+}
+
+static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh)
+{
+ mlog(ML_ERROR,
+ "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, "
+ "bh->b_blocknr = %llu\n",
+ (unsigned long)bh,
+ (unsigned long long)bh->b_blocknr);
+
+ /* We aren't guaranteed to have the superblock here - but if we
+ * don't, it'll just crash. */
+ ocfs2_error(bh->b_assoc_map->host->i_sb,
+ "JBD2 has aborted our journal, ocfs2 cannot continue\n");
+}
+
+static struct ocfs2_triggers di_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_dinode, i_check),
+};
+
+static struct ocfs2_triggers eb_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
+};
+
+static struct ocfs2_triggers gd_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
+};
+
+static struct ocfs2_triggers db_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_db_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+};
+
+static struct ocfs2_triggers xb_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
+};
+
+static struct ocfs2_triggers dq_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_dq_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+};
+
+static int __ocfs2_journal_access(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct ocfs2_triggers *triggers,
+ int type)
{
int status;
@@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle,
status = -EINVAL;
mlog(ML_ERROR, "Uknown access type!\n");
}
+ if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
+ jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
if (status < 0)
@@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle,
return status;
}
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
+ type);
+}
+
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
+ type);
+}
+
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
+ type);
+}
+
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
+ type);
+}
+
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
+ type);
+}
+
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
+ type);
+}
+
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, NULL, type);
+}
+
int ocfs2_journal_dirty(handle_t *handle,
struct buffer_head *bh)
{
@@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle,
return status;
}
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int ocfs2_journal_dirty_data(handle_t *handle,
- struct buffer_head *bh)
-{
- int err = journal_dirty_data(handle, bh);
- if (err)
- mlog_errno(err);
- /* TODO: When we can handle it, abort the handle and go RO on
- * error here. */
-
- return err;
-}
-#endif
-
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
void ocfs2_set_journal_params(struct ocfs2_super *osb)
@@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
mlog_entry_void();
fe = (struct ocfs2_dinode *)bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- /* This is called from startup/shutdown which will
- * handle the errors in a specific manner, so no need
- * to call ocfs2_error() here. */
- mlog(ML_ERROR, "Journal dinode %llu has invalid "
- "signature: %.*s",
- (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
- fe->i_signature);
- status = -EIO;
- goto out;
- }
+
+ /* The journal bh on the osb always comes from ocfs2_journal_init()
+ * and was validated there inside ocfs2_inode_lock_full(). It's a
+ * code bug if we mess it up. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
flags = le32_to_cpu(fe->id1.journal1.ij_flags);
if (dirty)
@@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
if (replayed)
ocfs2_bump_recovery_generation(fe);
+ ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
status = ocfs2_write_block(osb, bh, journal->j_inode);
if (status < 0)
mlog_errno(status);
-out:
mlog_exit(status);
return status;
}
@@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item {
int lri_slot;
struct ocfs2_dinode *lri_la_dinode;
struct ocfs2_dinode *lri_tl_dinode;
+ struct ocfs2_quota_recovery *lri_qrec;
};
/* Does the second half of the recovery process. By this point, the
@@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
struct ocfs2_super *osb = journal->j_osb;
struct ocfs2_dinode *la_dinode, *tl_dinode;
struct ocfs2_la_recovery_item *item, *n;
+ struct ocfs2_quota_recovery *qrec;
LIST_HEAD(tmp_la_list);
mlog_entry_void();
@@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
+ ocfs2_wait_on_quotas(osb);
+
la_dinode = item->lri_la_dinode;
if (la_dinode) {
mlog(0, "Clean up local alloc %llu\n",
@@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work)
if (ret < 0)
mlog_errno(ret);
+ qrec = item->lri_qrec;
+ if (qrec) {
+ mlog(0, "Recovering quota files");
+ ret = ocfs2_finish_quota_recovery(osb, qrec,
+ item->lri_slot);
+ if (ret < 0)
+ mlog_errno(ret);
+ /* Recovery info is already freed now */
+ }
+
kfree(item);
}
@@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
int slot_num,
struct ocfs2_dinode *la_dinode,
- struct ocfs2_dinode *tl_dinode)
+ struct ocfs2_dinode *tl_dinode,
+ struct ocfs2_quota_recovery *qrec)
{
struct ocfs2_la_recovery_item *item;
@@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
if (tl_dinode)
kfree(tl_dinode);
+ if (qrec)
+ ocfs2_free_quota_recovery(qrec);
+
mlog_errno(-ENOMEM);
return;
}
@@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
item->lri_la_dinode = la_dinode;
item->lri_slot = slot_num;
item->lri_tl_dinode = tl_dinode;
+ item->lri_qrec = qrec;
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
ocfs2_queue_recovery_completion(journal,
osb->slot_num,
osb->local_alloc_copy,
+ NULL,
NULL);
ocfs2_schedule_truncate_log_flush(osb, 0);
@@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
}
}
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
+{
+ if (osb->quota_rec) {
+ ocfs2_queue_recovery_completion(osb->journal,
+ osb->slot_num,
+ NULL,
+ NULL,
+ osb->quota_rec);
+ osb->quota_rec = NULL;
+ }
+}
+
static int __ocfs2_recovery_thread(void *arg)
{
- int status, node_num;
+ int status, node_num, slot_num;
struct ocfs2_super *osb = arg;
struct ocfs2_recovery_map *rm = osb->recovery_map;
+ int *rm_quota = NULL;
+ int rm_quota_used = 0, i;
+ struct ocfs2_quota_recovery *qrec;
mlog_entry_void();
@@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg)
goto bail;
}
+ rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
+ if (!rm_quota) {
+ status = -ENOMEM;
+ goto bail;
+ }
restart:
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
@@ -1032,8 +1242,28 @@ restart:
* clear it until ocfs2_recover_node() has succeeded. */
node_num = rm->rm_entries[0];
spin_unlock(&osb->osb_lock);
-
- status = ocfs2_recover_node(osb, node_num);
+ mlog(0, "checking node %d\n", node_num);
+ slot_num = ocfs2_node_num_to_slot(osb, node_num);
+ if (slot_num == -ENOENT) {
+ status = 0;
+ mlog(0, "no slot for this node, so no recovery"
+ "required.\n");
+ goto skip_recovery;
+ }
+ mlog(0, "node %d was using slot %d\n", node_num, slot_num);
+
+ /* It is a bit subtle with quota recovery. We cannot do it
+ * immediately because we have to obtain cluster locks from
+ * quota files and we also don't want to just skip it because
+ * then quota usage would be out of sync until some node takes
+ * the slot. So we remember which nodes need quota recovery
+ * and when everything else is done, we recover quotas. */
+ for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
+ if (i == rm_quota_used)
+ rm_quota[rm_quota_used++] = slot_num;
+
+ status = ocfs2_recover_node(osb, node_num, slot_num);
+skip_recovery:
if (!status) {
ocfs2_recovery_map_clear(osb, node_num);
} else {
@@ -1055,13 +1285,27 @@ restart:
if (status < 0)
mlog_errno(status);
+ /* Now it is right time to recover quotas... We have to do this under
+ * superblock lock so that noone can start using the slot (and crash)
+ * before we recover it */
+ for (i = 0; i < rm_quota_used; i++) {
+ qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+ if (IS_ERR(qrec)) {
+ status = PTR_ERR(qrec);
+ mlog_errno(status);
+ continue;
+ }
+ ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
+ NULL, NULL, qrec);
+ }
+
ocfs2_super_unlock(osb, 1);
/* We always run recovery on our own orphan dir - the dead
* node(s) may have disallowd a previos inode delete. Re-processing
* is therefore required. */
ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
- NULL);
+ NULL, NULL);
bail:
mutex_lock(&osb->recovery_lock);
@@ -1076,6 +1320,9 @@ bail:
mutex_unlock(&osb->recovery_lock);
+ if (rm_quota)
+ kfree(rm_quota);
+
mlog_exit(status);
/* no one is callint kthread_stop() for us so the kthread() api
* requires that we call do_exit(). And it isn't exported, but
@@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
}
SET_INODE_JOURNAL(inode);
- status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
- OCFS2_BH_IGNORE_CACHE);
+ status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
osb->slot_recovery_generations[slot_num] =
ocfs2_get_recovery_generation(fe);
+ ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
status = ocfs2_write_block(osb, bh, inode);
if (status < 0)
mlog_errno(status);
@@ -1304,31 +1551,19 @@ done:
* far less concerning.
*/
static int ocfs2_recover_node(struct ocfs2_super *osb,
- int node_num)
+ int node_num, int slot_num)
{
int status = 0;
- int slot_num;
struct ocfs2_dinode *la_copy = NULL;
struct ocfs2_dinode *tl_copy = NULL;
- mlog_entry("(node_num=%d, osb->node_num = %d)\n",
- node_num, osb->node_num);
-
- mlog(0, "checking node %d\n", node_num);
+ mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
+ node_num, slot_num, osb->node_num);
/* Should not ever be called to recover ourselves -- in that
* case we should've called ocfs2_journal_load instead. */
BUG_ON(osb->node_num == node_num);
- slot_num = ocfs2_node_num_to_slot(osb, node_num);
- if (slot_num == -ENOENT) {
- status = 0;
- mlog(0, "no slot for this node, so no recovery required.\n");
- goto done;
- }
-
- mlog(0, "node %d was using slot %d\n", node_num, slot_num);
-
status = ocfs2_replay_journal(osb, node_num, slot_num);
if (status < 0) {
if (status == -EBUSY) {
@@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
/* This will kfree the memory pointed to by la_copy and tl_copy */
ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
- tl_copy);
+ tl_copy, NULL);
status = 0;
done:
@@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
return ret;
}
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
{
/* This check is good because ocfs2 will wait on our recovery
* thread before changing it to something other than MOUNTED
* or DISABLED. */
wait_event(osb->osb_mount_event,
- atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+ (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
+ atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
atomic_read(&osb->vol_state) == VOLUME_DISABLED);
/* If there's an error on mount, then we may never get to the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d4d14e9a3ce..3c3532e1307 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,12 +27,7 @@
#define OCFS2_JOURNAL_H
#include <linux/fs.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-# include "ocfs2_jbd_compat.h"
-#endif
+#include <linux/jbd2.h>
enum ocfs2_journal_state {
OCFS2_JOURNAL_FREE = 0,
@@ -173,6 +168,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb,
int node_num);
int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
{
@@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
* ocfs2_extend_trans - Extend a handle by nblocks credits. This may
* commit the handle to disk in the process, but will
* not release any locks taken during the transaction.
- * ocfs2_journal_access - Notify the handle that we want to journal this
+ * ocfs2_journal_access* - Notify the handle that we want to journal this
* buffer. Will have to call ocfs2_journal_dirty once
* we've actually dirtied it. Type is one of . or .
+ * Always call the specific flavor of
+ * ocfs2_journal_access_*() unless you intend to
+ * manage the checksum by hand.
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
* ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
* the current handle commits.
@@ -248,10 +247,29 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks);
#define OCFS2_JOURNAL_ACCESS_WRITE 1
#define OCFS2_JOURNAL_ACCESS_UNDO 2
-int ocfs2_journal_access(handle_t *handle,
- struct inode *inode,
- struct buffer_head *bh,
- int type);
+
+/* ocfs2_inode */
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* ocfs2_extent_block */
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* ocfs2_group_desc */
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* ocfs2_xattr_block */
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* quota blocks */
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* dirblock */
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* Anything that has no ecc */
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+
/*
* A word about the journal_access/journal_dirty "dance". It is
* entirely legal to journal_access a buffer more than once (as long
@@ -273,10 +291,6 @@ int ocfs2_journal_access(handle_t *handle,
*/
int ocfs2_journal_dirty(handle_t *handle,
struct buffer_head *bh);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int ocfs2_journal_dirty_data(handle_t *handle,
- struct buffer_head *bh);
-#endif
/*
* Credit Macros:
@@ -293,6 +307,37 @@ int ocfs2_journal_dirty_data(handle_t *handle,
/* extended attribute block update */
#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+/* global quotafile inode update, data block */
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/*
+ * The two writes below can accidentally see global info dirty due
+ * to set_info() quotactl so make them prepared for the writes.
+ */
+/* quota data block, global info */
+/* Write to local quota file */
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+
+/* global quota data block, local quota data block, global quota inode,
+ * global quota info */
+#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+
+static inline int ocfs2_quota_trans_credits(struct super_block *sb)
+{
+ int credits = 0;
+
+ if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
+ credits += OCFS2_QWRITE_CREDITS;
+ if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
+ credits += OCFS2_QWRITE_CREDITS;
+ return credits;
+}
+
+/* Number of credits needed for removing quota structure from file */
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
+/* Number of credits needed for initialization of new quota structure */
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
+
/* group extend. inode update and last group update. */
#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
@@ -303,8 +348,11 @@ int ocfs2_journal_dirty_data(handle_t *handle,
* prev. group desc. if we relink. */
#define OCFS2_SUBALLOC_ALLOC (3)
-#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC \
- + OCFS2_INODE_UPDATE_CREDITS)
+static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
+{
+ return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
+ ocfs2_quota_trans_credits(sb);
+}
/* dinode + group descriptor update. We don't relink on free yet. */
#define OCFS2_SUBALLOC_FREE (2)
@@ -313,16 +361,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
+ OCFS2_TRUNCATE_LOG_UPDATE)
-#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
+static inline int ocfs2_remove_extent_credits(struct super_block *sb)
+{
+ return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
+ ocfs2_quota_trans_credits(sb);
+}
/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
* bitmap block for the new bit) */
#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks */
-#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \
- + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
+ * group descriptor + mkdir/symlink blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb)
+{
+ return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+ ocfs2_quota_trans_credits(sb);
+}
/* local alloc metadata change + main bitmap updates */
#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
@@ -332,13 +387,21 @@ int ocfs2_journal_dirty_data(handle_t *handle,
* for the dinode, one for the new block. */
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
-/* file update (nlink, etc) + directory mtime/ctime + dir entry block */
-#define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1)
+/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
+ * update on dir */
+static inline int ocfs2_link_credits(struct super_block *sb)
+{
+ return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+ ocfs2_quota_trans_credits(sb);
+}
/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
* dir inode link */
-#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \
- + OCFS2_LINK_CREDITS)
+static inline int ocfs2_unlink_credits(struct super_block *sb)
+{
+ /* The quota update from ocfs2_link_credits is unused here... */
+ return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+}
/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
* inode alloc group descriptor */
@@ -347,8 +410,10 @@ int ocfs2_journal_dirty_data(handle_t *handle,
/* dinode update, old dir dinode update, new dir dinode update, old
* dir dir entry, new dir dir entry, dir entry update for renaming
* directory + target unlink */
-#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
- + OCFS2_UNLINK_CREDITS)
+static inline int ocfs2_rename_credits(struct super_block *sb)
+{
+ return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+}
/* global bitmap dinode, group desc., relinked group,
* suballocator dinode, group desc., relinked group,
@@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
* credit for the dinode there. */
extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
- return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
+ return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
+ ocfs2_quota_trans_credits(sb);
}
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
{
- int blocks = OCFS2_MKNOD_CREDITS;
+ int blocks = ocfs2_mknod_credits(sb);
/* links can be longer than one block so we may update many
* within our single allocated extent. */
blocks += ocfs2_clusters_to_blocks(sb, 1);
- return blocks;
+ return blocks + ocfs2_quota_trans_credits(sb);
}
static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
/* update to the truncate log. */
credits += OCFS2_TRUNCATE_LOG_UPDATE;
+ credits += ocfs2_quota_trans_credits(sb);
+
return credits;
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 687b28713c3..ec70cdbe77f 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -36,6 +36,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
@@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
- &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+ status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+ OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
}
memcpy(alloc_copy, alloc, bh->b_size);
- status = ocfs2_journal_access(handle, local_alloc_inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_commit;
@@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
mutex_lock(&inode->i_mutex);
- status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
- &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+ status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+ OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
ocfs2_clear_local_alloc(alloc);
+ ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
status = ocfs2_write_block(osb, alloc_bh, inode);
if (status < 0)
mlog_errno(status);
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
* delete bits from it! */
*num_bits = bits_wanted;
- status = ocfs2_journal_access(handle, local_alloc_inode,
- osb->local_alloc_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, local_alloc_inode,
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
}
memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
- status = ocfs2_journal_access(handle, local_alloc_inode,
- osb->local_alloc_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, local_alloc_inode,
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2545e7402ef..084aba86c3b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -40,6 +40,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
@@ -61,17 +62,18 @@
#include "sysfile.h"
#include "uptodate.h"
#include "xattr.h"
+#include "acl.h"
#include "buffer_head_io.h"
static int ocfs2_mknod_locked(struct ocfs2_super *osb,
struct inode *dir,
- struct dentry *dentry, int mode,
+ struct inode *inode,
+ struct dentry *dentry,
dev_t dev,
struct buffer_head **new_fe_bh,
struct buffer_head *parent_fe_bh,
handle_t *handle,
- struct inode **ret_inode,
struct ocfs2_alloc_context *inode_ac);
static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
@@ -186,6 +188,35 @@ bail:
return ret;
}
+static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
+{
+ struct inode *inode;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode) {
+ mlog(ML_ERROR, "new_inode failed!\n");
+ return NULL;
+ }
+
+ /* populate as many fields early on as possible - many of
+ * these are used by the support functions here and in
+ * callers. */
+ if (S_ISDIR(mode))
+ inode->i_nlink = 2;
+ else
+ inode->i_nlink = 1;
+ inode->i_uid = current_fsuid();
+ if (dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else
+ inode->i_gid = current_fsgid();
+ inode->i_mode = mode;
+ vfs_dq_init(inode);
+ return inode;
+}
+
static int ocfs2_mknod(struct inode *dir,
struct dentry *dentry,
int mode,
@@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir,
struct inode *inode = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *xattr_ac = NULL;
+ int want_clusters = 0;
+ int xattr_credits = 0;
+ struct ocfs2_security_xattr_info si = {
+ .enable = 1,
+ };
+ int did_quota_inode = 0;
mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
(unsigned long)dev, dentry->d_name.len,
@@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- /* Reserve a cluster if creating an extent based directory. */
- if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
- status = ocfs2_reserve_clusters(osb, 1, &data_ac);
- if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
+ inode = ocfs2_get_init_inode(dir, mode);
+ if (!inode) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* get security xattr */
+ status = ocfs2_init_security_get(inode, dir, &si);
+ if (status) {
+ if (status == -EOPNOTSUPP)
+ si.enable = 0;
+ else {
+ mlog_errno(status);
goto leave;
}
}
- handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS);
+ /* calculate meta data/clusters for setting security and acl xattr */
+ status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
+ &si, &want_clusters,
+ &xattr_credits, &xattr_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* Reserve a cluster if creating an extent based directory. */
+ if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+ want_clusters += 1;
+
+ status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
+ xattr_credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ /* We don't use standard VFS wrapper because we don't want vfs_dq_init
+ * to be called. */
+ if (sb_any_quota_active(osb->sb) &&
+ osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+ status = -EDQUOT;
+ goto leave;
+ }
+ did_quota_inode = 1;
+
/* do the real work now. */
- status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+ status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
&new_fe_bh, parent_fe_bh, handle,
- &inode, inode_ac);
+ inode_ac);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- status = ocfs2_journal_access(handle, dir, parent_fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir,
inc_nlink(dir);
}
+ status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
+ xattr_ac, data_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ if (si.enable) {
+ status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+ xattr_ac, data_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+ }
+
status = ocfs2_add_entry(handle, dentry, inode,
OCFS2_I(inode)->ip_blkno, parent_fe_bh,
de_bh);
@@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir,
d_instantiate(dentry, inode);
status = 0;
leave:
+ if (status < 0 && did_quota_inode)
+ vfs_dq_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
@@ -331,9 +425,13 @@ leave:
brelse(new_fe_bh);
brelse(de_bh);
brelse(parent_fe_bh);
+ kfree(si.name);
+ kfree(si.value);
- if ((status < 0) && inode)
+ if ((status < 0) && inode) {
+ clear_nlink(inode);
iput(inode);
+ }
if (inode_ac)
ocfs2_free_alloc_context(inode_ac);
@@ -341,6 +439,9 @@ leave:
if (data_ac)
ocfs2_free_alloc_context(data_ac);
+ if (xattr_ac)
+ ocfs2_free_alloc_context(xattr_ac);
+
mlog_exit(status);
return status;
@@ -348,12 +449,12 @@ leave:
static int ocfs2_mknod_locked(struct ocfs2_super *osb,
struct inode *dir,
- struct dentry *dentry, int mode,
+ struct inode *inode,
+ struct dentry *dentry,
dev_t dev,
struct buffer_head **new_fe_bh,
struct buffer_head *parent_fe_bh,
handle_t *handle,
- struct inode **ret_inode,
struct ocfs2_alloc_context *inode_ac)
{
int status = 0;
@@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
struct ocfs2_extent_list *fel;
u64 fe_blkno = 0;
u16 suballoc_bit;
- struct inode *inode = NULL;
- mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
- (unsigned long)dev, dentry->d_name.len,
+ mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
+ inode->i_mode, (unsigned long)dev, dentry->d_name.len,
dentry->d_name.name);
*new_fe_bh = NULL;
- *ret_inode = NULL;
status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
&fe_blkno);
@@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
goto leave;
}
- inode = new_inode(dir->i_sb);
- if (!inode) {
- status = -ENOMEM;
- mlog(ML_ERROR, "new_inode failed!\n");
- goto leave;
- }
-
/* populate as many fields early on as possible - many of
* these are used by the support functions here and in
* callers. */
inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
OCFS2_I(inode)->ip_blkno = fe_blkno;
- if (S_ISDIR(mode))
- inode->i_nlink = 2;
- else
- inode->i_nlink = 1;
- inode->i_mode = mode;
spin_lock(&osb->osb_lock);
inode->i_generation = osb->s_next_generation++;
spin_unlock(&osb->osb_lock);
@@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
}
ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
- status = ocfs2_journal_access(handle, inode, *new_fe_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fe->i_blkno = cpu_to_le64(fe_blkno);
fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
- fe->i_uid = cpu_to_le32(current_fsuid());
- if (dir->i_mode & S_ISGID) {
- fe->i_gid = cpu_to_le32(dir->i_gid);
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- fe->i_gid = cpu_to_le32(current_fsgid());
- fe->i_mode = cpu_to_le16(mode);
- if (S_ISCHR(mode) || S_ISBLK(mode))
+ fe->i_uid = cpu_to_le32(inode->i_uid);
+ fe->i_gid = cpu_to_le32(inode->i_gid);
+ fe->i_mode = cpu_to_le16(inode->i_mode);
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
-
fe->i_links_count = cpu_to_le16(inode->i_nlink);
fe->i_last_eb_blk = 0;
@@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
/*
* If supported, directories start with inline data.
*/
- if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) {
+ if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
u16 feat = le16_to_cpu(fe->i_dyn_features);
fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
@@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
goto leave;
}
- if (ocfs2_populate_inode(inode, fe, 1) < 0) {
- mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
- "i_blkno=%llu, i_ino=%lu\n",
- (unsigned long long)(*new_fe_bh)->b_blocknr,
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- inode->i_ino);
- BUG();
- }
-
+ ocfs2_populate_inode(inode, fe, 1);
ocfs2_inode_set_new(osb, inode);
if (!ocfs2_mount_local(osb)) {
status = ocfs2_create_new_inode_locks(inode);
@@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
status = 0; /* error in ocfs2_create_new_inode_locks is not
* critical */
- *ret_inode = inode;
leave:
if (status < 0) {
if (*new_fe_bh) {
brelse(*new_fe_bh);
*new_fe_bh = NULL;
}
- if (inode) {
- clear_nlink(inode);
- iput(inode);
- }
}
mlog_exit(status);
@@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out_unlock_inode;
}
- handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS);
+ handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
handle = NULL;
@@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out_unlock_inode;
}
- err = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ err = ocfs2_journal_access_di(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (err < 0) {
mlog_errno(err);
goto out_commit;
@@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir,
}
}
- handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS);
+ handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir,
goto leave;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir,
}
}
- handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS);
+ handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
}
- status = ocfs2_journal_access(handle, new_inode, newfe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir,
old_inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(old_inode);
- status = ocfs2_journal_access(handle, old_inode, old_inode_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status >= 0) {
old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
@@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir,
(int)old_dir_nlink, old_dir->i_nlink);
} else {
struct ocfs2_dinode *fe;
- status = ocfs2_journal_access(handle, old_dir,
- old_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, old_dir,
+ old_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir,
handle_t *handle = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *xattr_ac = NULL;
+ int want_clusters = 0;
+ int xattr_credits = 0;
+ struct ocfs2_security_xattr_info si = {
+ .enable = 1,
+ };
+ int did_quota = 0, did_quota_inode = 0;
mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
- /* don't reserve bitmap space for fast symlinks. */
- if (l > ocfs2_fast_symlink_chars(sb)) {
- status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+ inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
+ if (!inode) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* get security xattr */
+ status = ocfs2_init_security_get(inode, dir, &si);
+ if (status) {
+ if (status == -EOPNOTSUPP)
+ si.enable = 0;
+ else {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ /* calculate meta data/clusters for setting security xattr */
+ if (si.enable) {
+ status = ocfs2_calc_security_init(dir, &si, &want_clusters,
+ &xattr_credits, &xattr_ac);
if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
+ mlog_errno(status);
goto bail;
}
}
- handle = ocfs2_start_trans(osb, credits);
+ /* don't reserve bitmap space for fast symlinks. */
+ if (l > ocfs2_fast_symlink_chars(sb))
+ want_clusters += 1;
+
+ status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+
+ handle = ocfs2_start_trans(osb, credits + xattr_credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
- status = ocfs2_mknod_locked(osb, dir, dentry,
- S_IFLNK | S_IRWXUGO, 0,
- &new_fe_bh, parent_fe_bh, handle,
- &inode, inode_ac);
+ /* We don't use standard VFS wrapper because we don't want vfs_dq_init
+ * to be called. */
+ if (sb_any_quota_active(osb->sb) &&
+ osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+ status = -EDQUOT;
+ goto bail;
+ }
+ did_quota_inode = 1;
+
+ status = ocfs2_mknod_locked(osb, dir, inode, dentry,
+ 0, &new_fe_bh, parent_fe_bh, handle,
+ inode_ac);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir,
u32 offset = 0;
inode->i_op = &ocfs2_symlink_inode_operations;
+ if (vfs_dq_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1))) {
+ status = -EDQUOT;
+ goto bail;
+ }
+ did_quota = 1;
status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
new_fe_bh,
handle, data_ac, NULL,
@@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir,
}
}
+ if (si.enable) {
+ status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+ xattr_ac, data_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
status = ocfs2_add_entry(handle, dentry, inode,
le64_to_cpu(fe->i_blkno), parent_fe_bh,
de_bh);
@@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir,
dentry->d_op = &ocfs2_dentry_ops;
d_instantiate(dentry, inode);
bail:
+ if (status < 0 && did_quota)
+ vfs_dq_free_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (status < 0 && did_quota_inode)
+ vfs_dq_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
@@ -1640,12 +1772,18 @@ bail:
brelse(new_fe_bh);
brelse(parent_fe_bh);
brelse(de_bh);
+ kfree(si.name);
+ kfree(si.value);
if (inode_ac)
ocfs2_free_alloc_context(inode_ac);
if (data_ac)
ocfs2_free_alloc_context(data_ac);
- if ((status < 0) && inode)
+ if (xattr_ac)
+ ocfs2_free_alloc_context(xattr_ac);
+ if ((status < 0) && inode) {
+ clear_nlink(inode);
iput(inode);
+ }
mlog_exit(status);
@@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
- status = ocfs2_read_block(orphan_dir_inode,
- OCFS2_I(orphan_dir_inode)->ip_blkno,
- &orphan_dir_bh);
+ status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
goto leave;
}
- status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3fed9e3d899..ad5c24a29ed 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -161,6 +161,7 @@ enum ocfs2_vol_state
{
VOLUME_INIT = 0,
VOLUME_MOUNTED,
+ VOLUME_MOUNTED_QUOTAS,
VOLUME_DISMOUNTED,
VOLUME_DISABLED
};
@@ -195,6 +196,9 @@ enum ocfs2_mount_options
OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
+ OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */
+ OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
+ OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -205,6 +209,7 @@ enum ocfs2_mount_options
struct ocfs2_journal;
struct ocfs2_slot_info;
struct ocfs2_recovery_map;
+struct ocfs2_quota_recovery;
struct ocfs2_super
{
struct task_struct *commit_task;
@@ -286,10 +291,11 @@ struct ocfs2_super
char *local_alloc_debug_buf;
#endif
- /* Next two fields are for local node slot recovery during
+ /* Next three fields are for local node slot recovery during
* mount. */
int dirty;
struct ocfs2_dinode *local_alloc_copy;
+ struct ocfs2_quota_recovery *quota_rec;
struct ocfs2_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
@@ -333,6 +339,10 @@ struct ocfs2_super
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
+/* Useful typedef for passing around journal access functions */
+typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+
static inline int ocfs2_should_order_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
@@ -376,6 +386,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
return 0;
}
+static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
+ return 1;
+ return 0;
+}
+
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
@@ -443,39 +460,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
#define OCFS2_IS_VALID_DINODE(ptr) \
(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \
- typeof(__di) ____di = (__di); \
- ocfs2_error((__sb), \
- "Dinode # %llu has bad signature %.*s", \
- (unsigned long long)le64_to_cpu((____di)->i_blkno), 7, \
- (____di)->i_signature); \
-} while (0)
-
#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \
- typeof(__eb) ____eb = (__eb); \
- ocfs2_error((__sb), \
- "Extent Block # %llu has bad signature %.*s", \
- (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7, \
- (____eb)->h_signature); \
-} while (0)
-
#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \
- typeof(__gd) ____gd = (__gd); \
- ocfs2_error((__sb), \
- "Group Descriptor # %llu has bad signature %.*s", \
- (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
- (____gd)->bg_signature); \
-} while (0)
#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \
(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
+#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
+ (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
+
static inline unsigned long ino_from_blkno(struct super_block *sb,
u64 blkno)
{
@@ -632,5 +629,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
#define ocfs2_clear_bit ext2_clear_bit
#define ocfs2_test_bit ext2_test_bit
#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+#define ocfs2_find_next_bit ext2_find_next_bit
#endif /* OCFS2_H */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5e0c0d0aef7..c7ae45aaa36 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -65,6 +65,7 @@
#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
+#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
/* Compatibility flags */
#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -93,8 +94,11 @@
| OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
| OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
| OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
- | OCFS2_FEATURE_INCOMPAT_XATTR)
-#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
+ | OCFS2_FEATURE_INCOMPAT_XATTR \
+ | OCFS2_FEATURE_INCOMPAT_META_ECC)
+#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
+ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
+ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
/*
* Heartbeat-only devices are missing journals and other files. The
@@ -147,6 +151,9 @@
/* Support for extended attributes */
#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
+/* Metadata checksum and error correction */
+#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
+
/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
@@ -163,6 +170,12 @@
*/
#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
+/*
+ * Maintain quota information for this filesystem
+ */
+#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
+#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
+
/* The byte offset of the first backup block will be 1G.
* The following will be 4G, 16G, 64G, 256G and 1T.
*/
@@ -192,6 +205,7 @@
#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
+#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
/*
* Flags on ocfs2_dinode.i_dyn_features
@@ -329,13 +343,17 @@ enum {
#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
HEARTBEAT_SYSTEM_INODE,
GLOBAL_BITMAP_SYSTEM_INODE,
-#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+ USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
ORPHAN_DIR_SYSTEM_INODE,
EXTENT_ALLOC_SYSTEM_INODE,
INODE_ALLOC_SYSTEM_INODE,
JOURNAL_SYSTEM_INODE,
LOCAL_ALLOC_SYSTEM_INODE,
TRUNCATE_LOG_SYSTEM_INODE,
+ LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE,
NUM_SYSTEM_INODES
};
@@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
[SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
[HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
[GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
+ [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+ [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
/* Slot-specific system inodes (one copy per slot) */
[ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
[INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
[JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
[LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
- [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+ [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
+ [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+ [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
};
/* Parameter passed from mount.ocfs2 to module */
@@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
/*
+ * Block checking structure. This is used in metadata to validate the
+ * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
+ * zeros.
+ */
+struct ocfs2_block_check {
+/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */
+ __le16 bc_ecc; /* Single-error-correction parity vector.
+ This is a simple Hamming code dependant
+ on the blocksize. OCFS2's maximum
+ blocksize, 4K, requires 16 parity bits,
+ so we fit in __le16. */
+ __le16 bc_reserved1;
+/*08*/
+};
+
+/*
* On disk extent record for OCFS2
* It describes a range of clusters on disk.
*
@@ -496,7 +534,7 @@ struct ocfs2_truncate_log {
struct ocfs2_extent_block
{
/*00*/ __u8 h_signature[8]; /* Signature for verification */
- __le64 h_reserved1;
+ struct ocfs2_block_check h_check; /* Error checking */
/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
extent_header belongs to */
__le16 h_suballoc_bit; /* Bit offset in suballocator
@@ -666,7 +704,8 @@ struct ocfs2_dinode {
was set in i_flags */
__le16 i_dyn_features;
__le64 i_xattr_loc;
-/*80*/ __le64 i_reserved2[7];
+/*80*/ struct ocfs2_block_check i_check; /* Error checking */
+/*88*/ __le64 i_reserved2[6];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
@@ -715,6 +754,34 @@ struct ocfs2_dir_entry {
} __attribute__ ((packed));
/*
+ * Per-block record for the unindexed directory btree. This is carefully
+ * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
+ * mirrored. That way, the directory manipulation code needs a minimal amount
+ * of update.
+ *
+ * NOTE: Keep this structure aligned to a multiple of 4 bytes.
+ */
+struct ocfs2_dir_block_trailer {
+/*00*/ __le64 db_compat_inode; /* Always zero. Was inode */
+
+ __le16 db_compat_rec_len; /* Backwards compatible with
+ * ocfs2_dir_entry. */
+ __u8 db_compat_name_len; /* Always zero. Was name_len */
+ __u8 db_reserved0;
+ __le16 db_reserved1;
+ __le16 db_free_rec_len; /* Size of largest empty hole
+ * in this block. (unused) */
+/*10*/ __u8 db_signature[8]; /* Signature for verification */
+ __le64 db_reserved2;
+ __le64 db_free_next; /* Next block in list (unused) */
+/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */
+ __le64 db_parent_dinode; /* dinode which owns me, in
+ blocks */
+/*30*/ struct ocfs2_block_check db_check; /* Error checking */
+/*40*/
+};
+
+/*
* On disk allocator group structure for OCFS2
*/
struct ocfs2_group_desc
@@ -733,7 +800,8 @@ struct ocfs2_group_desc
/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
blocks */
__le64 bg_blkno; /* Offset on disk, in blocks */
-/*30*/ __le64 bg_reserved2[2];
+/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
+ __le64 bg_reserved2;
/*40*/ __u8 bg_bitmap[0];
};
@@ -776,7 +844,12 @@ struct ocfs2_xattr_header {
in this extent record,
only valid in the first
bucket. */
- __le64 xh_csum;
+ struct ocfs2_block_check xh_check; /* Error checking
+ (Note, this is only
+ used for xattr
+ buckets. A block uses
+ xb_check and sets
+ this field to zero.) */
struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
};
@@ -827,7 +900,7 @@ struct ocfs2_xattr_block {
block group */
__le32 xb_fs_generation; /* Must match super block */
/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
- __le64 xb_csum;
+ struct ocfs2_block_check xb_check; /* Error checking */
/*20*/ __le16 xb_flags; /* Indicates whether this block contains
real xattr or a xattr tree. */
__le16 xb_reserved0;
@@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
}
+/*
+ * On disk structures for global quota file
+ */
+
+/* Magic numbers and known versions for global quota files */
+#define OCFS2_GLOBAL_QMAGICS {\
+ 0x0cf52470, /* USRQUOTA */ \
+ 0x0cf52471 /* GRPQUOTA */ \
+}
+
+#define OCFS2_GLOBAL_QVERSIONS {\
+ 0, \
+ 0, \
+}
+
+
+/* Each block of each quota file has a certain fixed number of bytes reserved
+ * for OCFS2 internal use at its end. OCFS2 can use it for things like
+ * checksums, etc. */
+#define OCFS2_QBLK_RESERVED_SPACE 8
+
+/* Generic header of all quota files */
+struct ocfs2_disk_dqheader {
+ __le32 dqh_magic; /* Magic number identifying file */
+ __le32 dqh_version; /* Quota format version */
+};
+
+#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of global quota file (immediately follows the generic
+ * header) */
+struct ocfs2_global_disk_dqinfo {
+/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */
+ __le32 dqi_igrace; /* Grace time for inode softlimit excess */
+ __le32 dqi_syncms; /* Time after which we sync local changes to
+ * global quota file */
+ __le32 dqi_blocks; /* Number of blocks in quota file */
+/*10*/ __le32 dqi_free_blk; /* First free block in quota file */
+ __le32 dqi_free_entry; /* First block with free dquot entry in quota
+ * file */
+};
+
+/* Structure with global user / group information. We reserve some space
+ * for future use. */
+struct ocfs2_global_disk_dqblk {
+/*00*/ __le32 dqb_id; /* ID the structure belongs to */
+ __le32 dqb_use_count; /* Number of nodes having reference to this structure */
+ __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */
+/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */
+ __le64 dqb_curinodes; /* current # allocated inodes */
+/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */
+ __le64 dqb_bsoftlimit; /* preferred limit on disk space */
+/*30*/ __le64 dqb_curspace; /* current space occupied */
+ __le64 dqb_btime; /* time limit for excessive disk use */
+/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */
+ __le64 dqb_pad1;
+/*50*/ __le64 dqb_pad2;
+};
+
+/*
+ * On-disk structures for local quota file
+ */
+
+/* Magic numbers and known versions for local quota files */
+#define OCFS2_LOCAL_QMAGICS {\
+ 0x0cf524c0, /* USRQUOTA */ \
+ 0x0cf524c1 /* GRPQUOTA */ \
+}
+
+#define OCFS2_LOCAL_QVERSIONS {\
+ 0, \
+ 0, \
+}
+
+/* Quota flags in dqinfo header */
+#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\
+ * quota has been cleanly turned off) */
+
+#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of local quota file (immediately follows the generic
+ * header) */
+struct ocfs2_local_disk_dqinfo {
+ __le32 dqi_flags; /* Flags for quota file */
+ __le32 dqi_chunks; /* Number of chunks of quota structures
+ * with a bitmap */
+ __le32 dqi_blocks; /* Number of blocks allocated for quota file */
+};
+
+/* Header of one chunk of a quota file */
+struct ocfs2_local_disk_chunk {
+ __le32 dqc_free; /* Number of free entries in the bitmap */
+ u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
+ * chunk of quota file */
+};
+
+/* One entry in local quota file */
+struct ocfs2_local_disk_dqblk {
+/*00*/ __le64 dqb_id; /* id this quota applies to */
+ __le64 dqb_spacemod; /* Change in the amount of used space */
+/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */
+};
+
+
+/*
+ * The quota trailer lives at the end of each quota block.
+ */
+
+struct ocfs2_disk_dqtrailer {
+/*00*/ struct ocfs2_block_check dq_check; /* Error checking */
+/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
+};
+
+static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
+ void *buf)
+{
+ char *ptr = buf;
+ ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE;
+
+ return (struct ocfs2_disk_dqtrailer *)ptr;
+}
+
#ifdef __KERNEL__
static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
{
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
deleted file mode 100644
index b91c78f8f55..00000000000
--- a/fs/ocfs2/ocfs2_jbd_compat.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ocfs2_jbd_compat.h
- *
- * Compatibility defines for JBD.
- *
- * Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-
-#ifndef OCFS2_JBD_COMPAT_H
-#define OCFS2_JBD_COMPAT_H
-
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# error Should not have been included
-#endif
-
-struct jbd2_inode {
- unsigned int dummy;
-};
-
-#define JBD2_BARRIER JFS_BARRIER
-#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
-
-#define jbd2_journal_ack_err journal_ack_err
-#define jbd2_journal_clear_err journal_clear_err
-#define jbd2_journal_destroy journal_destroy
-#define jbd2_journal_dirty_metadata journal_dirty_metadata
-#define jbd2_journal_errno journal_errno
-#define jbd2_journal_extend journal_extend
-#define jbd2_journal_flush journal_flush
-#define jbd2_journal_force_commit journal_force_commit
-#define jbd2_journal_get_write_access journal_get_write_access
-#define jbd2_journal_get_undo_access journal_get_undo_access
-#define jbd2_journal_init_inode journal_init_inode
-#define jbd2_journal_invalidatepage journal_invalidatepage
-#define jbd2_journal_load journal_load
-#define jbd2_journal_lock_updates journal_lock_updates
-#define jbd2_journal_restart journal_restart
-#define jbd2_journal_start journal_start
-#define jbd2_journal_start_commit journal_start_commit
-#define jbd2_journal_stop journal_stop
-#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
-#define jbd2_journal_unlock_updates journal_unlock_updates
-#define jbd2_journal_wipe journal_wipe
-#define jbd2_log_wait_commit log_wait_commit
-
-static inline int jbd2_journal_file_inode(handle_t *handle,
- struct jbd2_inode *inode)
-{
- return 0;
-}
-
-static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
- loff_t new_size)
-{
- return 0;
-}
-
-static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
- struct inode *inode)
-{
- return;
-}
-
-static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
- struct jbd2_inode *jinode)
-{
- return;
-}
-
-
-#endif /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 82c200f7a8f..eb6f50c9cec 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_DENTRY,
OCFS2_LOCK_TYPE_OPEN,
OCFS2_LOCK_TYPE_FLOCK,
+ OCFS2_LOCK_TYPE_QINFO,
OCFS2_NUM_LOCK_TYPES
};
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_FLOCK:
c = 'F';
break;
+ case OCFS2_LOCK_TYPE_QINFO:
+ c = 'Q';
+ break;
default:
c = '\0';
}
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
[OCFS2_LOCK_TYPE_OPEN] = "Open",
[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
+ [OCFS2_LOCK_TYPE_QINFO] = "Quota",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
new file mode 100644
index 00000000000..7365e2e0870
--- /dev/null
+++ b/fs/ocfs2/quota.h
@@ -0,0 +1,119 @@
+/*
+ * quota.h for OCFS2
+ *
+ * On disk quota structures for local and global quota file, in-memory
+ * structures.
+ *
+ */
+
+#ifndef _OCFS2_QUOTA_H
+#define _OCFS2_QUOTA_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/list.h>
+#include <linux/dqblk_qtree.h>
+
+#include "ocfs2.h"
+
+/* Common stuff */
+/* id number of quota format */
+#define QFMT_OCFS2 3
+
+/*
+ * In-memory structures
+ */
+struct ocfs2_dquot {
+ struct dquot dq_dquot; /* Generic VFS dquot */
+ loff_t dq_local_off; /* Offset in the local quota file */
+ struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
+ unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
+ s64 dq_origspace; /* Last globally synced space usage */
+ s64 dq_originodes; /* Last globally synced inode usage */
+};
+
+/* Description of one chunk to recover in memory */
+struct ocfs2_recovery_chunk {
+ struct list_head rc_list; /* List of chunks */
+ int rc_chunk; /* Chunk number */
+ unsigned long *rc_bitmap; /* Bitmap of entries to recover */
+};
+
+struct ocfs2_quota_recovery {
+ struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */
+};
+
+/* In-memory structure with quota header information */
+struct ocfs2_mem_dqinfo {
+ unsigned int dqi_type; /* Quota type this structure describes */
+ unsigned int dqi_chunks; /* Number of chunks in local quota file */
+ unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
+ unsigned int dqi_syncms; /* How often should we sync with other nodes */
+ unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */
+ struct list_head dqi_chunk; /* List of chunks */
+ struct inode *dqi_gqinode; /* Global quota file inode */
+ struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
+ struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
+ int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
+ struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
+ struct buffer_head *dqi_ibh; /* Buffer with information header */
+ struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
+ struct delayed_work dqi_sync_work; /* Work for syncing dquots */
+ struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
+ * information, in case we
+ * enable quotas on file
+ * needing it */
+};
+
+static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
+{
+ return container_of(dquot, struct ocfs2_dquot, dq_dquot);
+}
+
+struct ocfs2_quota_chunk {
+ struct list_head qc_chunk; /* List of quotafile chunks */
+ int qc_num; /* Number of quota chunk */
+ struct buffer_head *qc_headerbh; /* Buffer head with chunk header */
+};
+
+extern struct kmem_cache *ocfs2_dquot_cachep;
+extern struct kmem_cache *ocfs2_qf_chunk_cachep;
+
+extern struct qtree_fmt_operations ocfs2_global_ops;
+
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+ struct ocfs2_super *osb, int slot_num);
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+ struct ocfs2_quota_recovery *rec,
+ int slot_num);
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+ size_t len, loff_t off);
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off);
+int ocfs2_global_read_info(struct super_block *sb, int type);
+int ocfs2_global_write_info(struct super_block *sb, int type);
+int ocfs2_global_read_dquot(struct dquot *dquot);
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
+static inline int ocfs2_sync_dquot(struct dquot *dquot)
+{
+ return __ocfs2_sync_dquot(dquot, 0);
+}
+static inline int ocfs2_global_release_dquot(struct dquot *dquot)
+{
+ return __ocfs2_sync_dquot(dquot, 1);
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+ struct buffer_head **bh);
+
+extern struct dquot_operations ocfs2_quota_operations;
+extern struct quota_format_type ocfs2_quota_format;
+
+int ocfs2_quota_setup(void);
+void ocfs2_quota_shutdown(void);
+
+#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
new file mode 100644
index 00000000000..6aff8f2d3e4
--- /dev/null
+++ b/fs/ocfs2/quota_global.c
@@ -0,0 +1,1025 @@
+/*
+ * Implementation of operations over global quota file
+ */
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/dqblk_qtree.h>
+#include <linux/jiffies.h>
+#include <linux/writeback.h>
+#include <linux/workqueue.h>
+
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "alloc.h"
+#include "blockcheck.h"
+#include "inode.h"
+#include "journal.h"
+#include "file.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "uptodate.h"
+#include "quota.h"
+
+static struct workqueue_struct *ocfs2_quota_wq = NULL;
+
+static void qsync_work_fn(struct work_struct *work);
+
+static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
+{
+ struct ocfs2_global_disk_dqblk *d = dp;
+ struct mem_dqblk *m = &dquot->dq_dqb;
+
+ /* Update from disk only entries not set by the admin */
+ if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
+ m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
+ m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
+ }
+ if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+ m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
+ if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
+ m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit);
+ m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit);
+ }
+ if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+ m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+ if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
+ m->dqb_btime = le64_to_cpu(d->dqb_btime);
+ if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
+ m->dqb_itime = le64_to_cpu(d->dqb_itime);
+ OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count);
+}
+
+static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+ struct ocfs2_global_disk_dqblk *d = dp;
+ struct mem_dqblk *m = &dquot->dq_dqb;
+
+ d->dqb_id = cpu_to_le32(dquot->dq_id);
+ d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
+ d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
+ d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
+ d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
+ d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
+ d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
+ d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+ d->dqb_btime = cpu_to_le64(m->dqb_btime);
+ d->dqb_itime = cpu_to_le64(m->dqb_itime);
+}
+
+static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
+{
+ struct ocfs2_global_disk_dqblk *d = dp;
+ struct ocfs2_mem_dqinfo *oinfo =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+ if (qtree_entry_unused(&oinfo->dqi_gi, dp))
+ return 0;
+ return le32_to_cpu(d->dqb_id) == dquot->dq_id;
+}
+
+struct qtree_fmt_operations ocfs2_global_ops = {
+ .mem2disk_dqblk = ocfs2_global_mem2diskdqb,
+ .disk2mem_dqblk = ocfs2_global_disk2memdqb,
+ .is_id = ocfs2_global_is_id,
+};
+
+static int ocfs2_validate_quota_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ struct ocfs2_disk_dqtrailer *dqt =
+ ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
+
+ mlog(0, "Validating quota block %llu\n",
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
+}
+
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+ struct buffer_head **bh)
+{
+ int rc = 0;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
+ ocfs2_validate_quota_block);
+ if (rc)
+ mlog_errno(rc);
+
+ /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
+static int ocfs2_get_quota_block(struct inode *inode, int block,
+ struct buffer_head **bh)
+{
+ u64 pblock, pcount;
+ int err;
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ if (err) {
+ mlog_errno(err);
+ return err;
+ }
+ *bh = sb_getblk(inode->i_sb, pblock);
+ if (!*bh) {
+ err = -EIO;
+ mlog_errno(err);
+ }
+ return err;;
+}
+
+/* Read data from global quotafile - avoid pagecache and such because we cannot
+ * afford acquiring the locks... We use quota cluster lock to serialize
+ * operations. Caller is responsible for acquiring it. */
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+ size_t len, loff_t off)
+{
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ struct inode *gqinode = oinfo->dqi_gqinode;
+ loff_t i_size = i_size_read(gqinode);
+ int offset = off & (sb->s_blocksize - 1);
+ sector_t blk = off >> sb->s_blocksize_bits;
+ int err = 0;
+ struct buffer_head *bh;
+ size_t toread, tocopy;
+
+ if (off > i_size)
+ return 0;
+ if (off + len > i_size)
+ len = i_size - off;
+ toread = len;
+ while (toread > 0) {
+ tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
+ bh = NULL;
+ err = ocfs2_read_quota_block(gqinode, blk, &bh);
+ if (err) {
+ mlog_errno(err);
+ return err;
+ }
+ memcpy(data, bh->b_data + offset, tocopy);
+ brelse(bh);
+ offset = 0;
+ toread -= tocopy;
+ data += tocopy;
+ blk++;
+ }
+ return len;
+}
+
+/* Write to quotafile (we know the transaction is already started and has
+ * enough credits) */
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct inode *gqinode = oinfo->dqi_gqinode;
+ int offset = off & (sb->s_blocksize - 1);
+ sector_t blk = off >> sb->s_blocksize_bits;
+ int err = 0, new = 0, ja_type;
+ struct buffer_head *bh = NULL;
+ handle_t *handle = journal_current_handle();
+
+ if (!handle) {
+ mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
+ "because transaction was not started.\n",
+ (unsigned long long)off, (unsigned long long)len);
+ return -EIO;
+ }
+ if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
+ WARN_ON(1);
+ len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
+ }
+
+ mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
+ if (gqinode->i_size < off + len) {
+ down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+ err = ocfs2_extend_no_holes(gqinode, off + len, off);
+ up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+ if (err < 0)
+ goto out;
+ err = ocfs2_simple_size_update(gqinode,
+ oinfo->dqi_gqi_bh,
+ off + len);
+ if (err < 0)
+ goto out;
+ new = 1;
+ }
+ /* Not rewriting whole block? */
+ if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
+ !new) {
+ err = ocfs2_read_quota_block(gqinode, blk, &bh);
+ ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
+ } else {
+ err = ocfs2_get_quota_block(gqinode, blk, &bh);
+ ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
+ }
+ if (err) {
+ mlog_errno(err);
+ return err;
+ }
+ lock_buffer(bh);
+ if (new)
+ memset(bh->b_data, 0, sb->s_blocksize);
+ memcpy(bh->b_data + offset, data, len);
+ flush_dcache_page(bh->b_page);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ ocfs2_set_buffer_uptodate(gqinode, bh);
+ err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
+ if (err < 0) {
+ brelse(bh);
+ goto out;
+ }
+ err = ocfs2_journal_dirty(handle, bh);
+ brelse(bh);
+ if (err < 0)
+ goto out;
+out:
+ if (err) {
+ mutex_unlock(&gqinode->i_mutex);
+ mlog_errno(err);
+ return err;
+ }
+ gqinode->i_version++;
+ ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
+ mutex_unlock(&gqinode->i_mutex);
+ return len;
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+ int status;
+ struct buffer_head *bh = NULL;
+
+ status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex);
+ if (status < 0)
+ return status;
+ spin_lock(&dq_data_lock);
+ if (!oinfo->dqi_gqi_count++)
+ oinfo->dqi_gqi_bh = bh;
+ else
+ WARN_ON(bh != oinfo->dqi_gqi_bh);
+ spin_unlock(&dq_data_lock);
+ return 0;
+}
+
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+ ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
+ brelse(oinfo->dqi_gqi_bh);
+ spin_lock(&dq_data_lock);
+ if (!--oinfo->dqi_gqi_count)
+ oinfo->dqi_gqi_bh = NULL;
+ spin_unlock(&dq_data_lock);
+}
+
+/* Read information header from global quota file */
+int ocfs2_global_read_info(struct super_block *sb, int type)
+{
+ struct inode *gqinode = NULL;
+ unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE };
+ struct ocfs2_global_disk_dqinfo dinfo;
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ int status;
+
+ mlog_entry_void();
+
+ /* Read global header */
+ gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+ OCFS2_INVALID_SLOT);
+ if (!gqinode) {
+ mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
+ type);
+ status = -EINVAL;
+ goto out_err;
+ }
+ oinfo->dqi_gi.dqi_sb = sb;
+ oinfo->dqi_gi.dqi_type = type;
+ ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+ oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+ oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+ oinfo->dqi_gqi_bh = NULL;
+ oinfo->dqi_gqi_count = 0;
+ oinfo->dqi_gqinode = gqinode;
+ status = ocfs2_lock_global_qf(oinfo, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+ status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+ sizeof(struct ocfs2_global_disk_dqinfo),
+ OCFS2_GLOBAL_INFO_OFF);
+ ocfs2_unlock_global_qf(oinfo, 0);
+ if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
+ mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
+ status);
+ if (status >= 0)
+ status = -EIO;
+ mlog_errno(status);
+ goto out_err;
+ }
+ info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+ info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+ oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
+ oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
+ oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+ oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+ oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+ oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
+ oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
+ OCFS2_QBLK_RESERVED_SPACE;
+ oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
+ INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
+ queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+ oinfo->dqi_syncjiff);
+
+out_err:
+ mlog_exit(status);
+ return status;
+}
+
+/* Write information to global quota file. Expects exlusive lock on quota
+ * file inode and quota info */
+static int __ocfs2_global_write_info(struct super_block *sb, int type)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_global_disk_dqinfo dinfo;
+ ssize_t size;
+
+ spin_lock(&dq_data_lock);
+ info->dqi_flags &= ~DQF_INFO_DIRTY;
+ dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
+ dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
+ spin_unlock(&dq_data_lock);
+ dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
+ dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
+ dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
+ dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
+ size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
+ sizeof(struct ocfs2_global_disk_dqinfo),
+ OCFS2_GLOBAL_INFO_OFF);
+ if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
+ mlog(ML_ERROR, "Cannot write global quota info structure\n");
+ if (size >= 0)
+ size = -EIO;
+ return size;
+ }
+ return 0;
+}
+
+int ocfs2_global_write_info(struct super_block *sb, int type)
+{
+ int err;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+
+ err = ocfs2_qinfo_lock(info, 1);
+ if (err < 0)
+ return err;
+ err = __ocfs2_global_write_info(sb, type);
+ ocfs2_qinfo_unlock(info, 1);
+ return err;
+}
+
+/* Read in information from global quota file and acquire a reference to it.
+ * dquot_acquire() has already started the transaction and locked quota file */
+int ocfs2_global_read_dquot(struct dquot *dquot)
+{
+ int err, err2, ex = 0;
+ struct ocfs2_mem_dqinfo *info =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+ err = ocfs2_qinfo_lock(info, 0);
+ if (err < 0)
+ goto out;
+ err = qtree_read_dquot(&info->dqi_gi, dquot);
+ if (err < 0)
+ goto out_qlock;
+ OCFS2_DQUOT(dquot)->dq_use_count++;
+ OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+ OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+ if (!dquot->dq_off) { /* No real quota entry? */
+ /* Upgrade to exclusive lock for allocation */
+ err = ocfs2_qinfo_lock(info, 1);
+ if (err < 0)
+ goto out_qlock;
+ ex = 1;
+ }
+ err = qtree_write_dquot(&info->dqi_gi, dquot);
+ if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
+ err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
+ if (!err)
+ err = err2;
+ }
+out_qlock:
+ if (ex)
+ ocfs2_qinfo_unlock(info, 1);
+ ocfs2_qinfo_unlock(info, 0);
+out:
+ if (err < 0)
+ mlog_errno(err);
+ return err;
+}
+
+/* Sync local information about quota modifications with global quota file.
+ * Caller must have started the transaction and obtained exclusive lock for
+ * global quota file inode */
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
+{
+ int err, err2;
+ struct super_block *sb = dquot->dq_sb;
+ int type = dquot->dq_type;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ struct ocfs2_global_disk_dqblk dqblk;
+ s64 spacechange, inodechange;
+ time_t olditime, oldbtime;
+
+ err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+ sizeof(struct ocfs2_global_disk_dqblk),
+ dquot->dq_off);
+ if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
+ if (err >= 0) {
+ mlog(ML_ERROR, "Short read from global quota file "
+ "(%u read)\n", err);
+ err = -EIO;
+ }
+ goto out;
+ }
+
+ /* Update space and inode usage. Get also other information from
+ * global quota file so that we don't overwrite any changes there.
+ * We are */
+ spin_lock(&dq_data_lock);
+ spacechange = dquot->dq_dqb.dqb_curspace -
+ OCFS2_DQUOT(dquot)->dq_origspace;
+ inodechange = dquot->dq_dqb.dqb_curinodes -
+ OCFS2_DQUOT(dquot)->dq_originodes;
+ olditime = dquot->dq_dqb.dqb_itime;
+ oldbtime = dquot->dq_dqb.dqb_btime;
+ ocfs2_global_disk2memdqb(dquot, &dqblk);
+ mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
+ dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
+ dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
+ if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+ dquot->dq_dqb.dqb_curspace += spacechange;
+ if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+ dquot->dq_dqb.dqb_curinodes += inodechange;
+ /* Set properly space grace time... */
+ if (dquot->dq_dqb.dqb_bsoftlimit &&
+ dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
+ if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
+ oldbtime > 0) {
+ if (dquot->dq_dqb.dqb_btime > 0)
+ dquot->dq_dqb.dqb_btime =
+ min(dquot->dq_dqb.dqb_btime, oldbtime);
+ else
+ dquot->dq_dqb.dqb_btime = oldbtime;
+ }
+ } else {
+ dquot->dq_dqb.dqb_btime = 0;
+ clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+ }
+ /* Set properly inode grace time... */
+ if (dquot->dq_dqb.dqb_isoftlimit &&
+ dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
+ if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
+ olditime > 0) {
+ if (dquot->dq_dqb.dqb_itime > 0)
+ dquot->dq_dqb.dqb_itime =
+ min(dquot->dq_dqb.dqb_itime, olditime);
+ else
+ dquot->dq_dqb.dqb_itime = olditime;
+ }
+ } else {
+ dquot->dq_dqb.dqb_itime = 0;
+ clear_bit(DQ_INODES_B, &dquot->dq_flags);
+ }
+ /* All information is properly updated, clear the flags */
+ __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+ OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+ OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+ spin_unlock(&dq_data_lock);
+ err = ocfs2_qinfo_lock(info, freeing);
+ if (err < 0) {
+ mlog(ML_ERROR, "Failed to lock quota info, loosing quota write"
+ " (type=%d, id=%u)\n", dquot->dq_type,
+ (unsigned)dquot->dq_id);
+ goto out;
+ }
+ if (freeing)
+ OCFS2_DQUOT(dquot)->dq_use_count--;
+ err = qtree_write_dquot(&info->dqi_gi, dquot);
+ if (err < 0)
+ goto out_qlock;
+ if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
+ err = qtree_release_dquot(&info->dqi_gi, dquot);
+ if (info_dirty(sb_dqinfo(sb, type))) {
+ err2 = __ocfs2_global_write_info(sb, type);
+ if (!err)
+ err = err2;
+ }
+ }
+out_qlock:
+ ocfs2_qinfo_unlock(info, freeing);
+out:
+ if (err < 0)
+ mlog_errno(err);
+ return err;
+}
+
+/*
+ * Functions for periodic syncing of dquots with global file
+ */
+static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
+{
+ handle_t *handle;
+ struct super_block *sb = dquot->dq_sb;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ int status = 0;
+
+ mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
+ dquot->dq_type, type, sb->s_id);
+ if (type != dquot->dq_type)
+ goto out;
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+
+ handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ status = ocfs2_sync_dquot(dquot);
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ if (status < 0)
+ mlog_errno(status);
+ /* We have to write local structure as well... */
+ dquot_mark_dquot_dirty(dquot);
+ status = dquot_commit(dquot);
+ if (status < 0)
+ mlog_errno(status);
+ ocfs2_commit_trans(osb, handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+static void qsync_work_fn(struct work_struct *work)
+{
+ struct ocfs2_mem_dqinfo *oinfo = container_of(work,
+ struct ocfs2_mem_dqinfo,
+ dqi_sync_work.work);
+ struct super_block *sb = oinfo->dqi_gqinode->i_sb;
+
+ dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
+ queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+ oinfo->dqi_syncjiff);
+}
+
+/*
+ * Wrappers for generic quota functions
+ */
+
+static int ocfs2_write_dquot(struct dquot *dquot)
+{
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+ int status = 0;
+
+ mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+
+ handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
+ status = dquot_commit(dquot);
+ ocfs2_commit_trans(osb, handle);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+{
+ struct ocfs2_mem_dqinfo *oinfo;
+ int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
+ return 0;
+
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ /* We modify tree, leaf block, global info, local chunk header,
+ * global and local inode */
+ return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
+ 2 * OCFS2_INODE_UPDATE_CREDITS;
+}
+
+static int ocfs2_release_dquot(struct dquot *dquot)
+{
+ handle_t *handle;
+ struct ocfs2_mem_dqinfo *oinfo =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+ struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+ int status = 0;
+
+ mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ handle = ocfs2_start_trans(osb,
+ ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ status = dquot_release(dquot);
+ ocfs2_commit_trans(osb, handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
+{
+ struct ocfs2_mem_dqinfo *oinfo;
+ int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+ struct ocfs2_dinode *lfe, *gfe;
+
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
+ return 0;
+
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
+ lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
+ /* We can extend local file + global file. In local file we
+ * can modify info, chunk header block and dquot block. In
+ * global file we can modify info, tree and leaf block */
+ return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
+ ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
+ 3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
+}
+
+static int ocfs2_acquire_dquot(struct dquot *dquot)
+{
+ handle_t *handle;
+ struct ocfs2_mem_dqinfo *oinfo =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+ struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+ int status = 0;
+
+ mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+ /* We need an exclusive lock, because we're going to update use count
+ * and instantiate possibly new dquot structure */
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ handle = ocfs2_start_trans(osb,
+ ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ status = dquot_acquire(dquot);
+ ocfs2_commit_trans(osb, handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
+{
+ unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
+ (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
+ (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
+ (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
+ (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
+ (1 << (DQ_LASTSET_B + QIF_ITIME_B));
+ int sync = 0;
+ int status;
+ struct super_block *sb = dquot->dq_sb;
+ int type = dquot->dq_type;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+
+ mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+ dquot_mark_dquot_dirty(dquot);
+
+ /* In case user set some limits, sync dquot immediately to global
+ * quota file so that information propagates quicker */
+ spin_lock(&dq_data_lock);
+ if (dquot->dq_flags & mask)
+ sync = 1;
+ spin_unlock(&dq_data_lock);
+ if (!sync) {
+ status = ocfs2_write_dquot(dquot);
+ goto out;
+ }
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ status = ocfs2_sync_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ /* Now write updated local dquot structure */
+ status = dquot_commit(dquot);
+out_trans:
+ ocfs2_commit_trans(osb, handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/* This should happen only after set_dqinfo(). */
+static int ocfs2_write_info(struct super_block *sb, int type)
+{
+ handle_t *handle;
+ int status = 0;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+ mlog_entry_void();
+
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ status = dquot_commit_info(sb, type);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/* This is difficult. We have to lock quota inode and start transaction
+ * in this function but we don't want to take the penalty of exlusive
+ * quota file lock when we are just going to use cached structures. So
+ * we just take read lock check whether we have dquot cached and if so,
+ * we don't have to take the write lock... */
+static int ocfs2_dquot_initialize(struct inode *inode, int type)
+{
+ handle_t *handle = NULL;
+ int status = 0;
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_mem_dqinfo *oinfo;
+ int exclusive = 0;
+ int cnt;
+ qid_t id;
+
+ mlog_entry_void();
+
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (type != -1 && cnt != type)
+ continue;
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
+ oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+ status = ocfs2_lock_global_qf(oinfo, 0);
+ if (status < 0)
+ goto out;
+ /* This is just a performance optimization not a reliable test.
+ * Since we hold an inode lock, noone can actually release
+ * the structure until we are finished with initialization. */
+ if (inode->i_dquot[cnt] != NODQUOT) {
+ ocfs2_unlock_global_qf(oinfo, 0);
+ continue;
+ }
+ /* When we have inode lock, we know that no dquot_release() can
+ * run and thus we can safely check whether we need to
+ * read+modify global file to get quota information or whether
+ * our node already has it. */
+ if (cnt == USRQUOTA)
+ id = inode->i_uid;
+ else if (cnt == GRPQUOTA)
+ id = inode->i_gid;
+ else
+ BUG();
+ /* Obtain exclusion from quota off... */
+ down_write(&sb_dqopt(sb)->dqptr_sem);
+ exclusive = !dquot_is_cached(sb, id, cnt);
+ up_write(&sb_dqopt(sb)->dqptr_sem);
+ if (exclusive) {
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0) {
+ exclusive = 0;
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ ocfs2_calc_qinit_credits(sb, cnt));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ }
+ dquot_initialize(inode, cnt);
+ if (exclusive) {
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ ocfs2_unlock_global_qf(oinfo, 1);
+ }
+ ocfs2_unlock_global_qf(oinfo, 0);
+ }
+ mlog_exit(0);
+ return 0;
+out_ilock:
+ if (exclusive)
+ ocfs2_unlock_global_qf(oinfo, 1);
+ ocfs2_unlock_global_qf(oinfo, 0);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+static int ocfs2_dquot_drop_slow(struct inode *inode)
+{
+ int status = 0;
+ int cnt;
+ int got_lock[MAXQUOTAS] = {0, 0};
+ handle_t *handle;
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_mem_dqinfo *oinfo;
+
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
+ oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ got_lock[cnt] = 1;
+ }
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+ ocfs2_calc_qinit_credits(sb, GRPQUOTA));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
+ dquot_drop(inode);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+ if (got_lock[cnt]) {
+ oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+ ocfs2_unlock_global_qf(oinfo, 1);
+ }
+ return status;
+}
+
+/* See the comment before ocfs2_dquot_initialize. */
+static int ocfs2_dquot_drop(struct inode *inode)
+{
+ int status = 0;
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_mem_dqinfo *oinfo;
+ int exclusive = 0;
+ int cnt;
+ int got_lock[MAXQUOTAS] = {0, 0};
+
+ mlog_entry_void();
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
+ oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+ status = ocfs2_lock_global_qf(oinfo, 0);
+ if (status < 0)
+ goto out;
+ got_lock[cnt] = 1;
+ }
+ /* Lock against anyone releasing references so that when when we check
+ * we know we are not going to be last ones to release dquot */
+ down_write(&sb_dqopt(sb)->dqptr_sem);
+ /* Urgh, this is a terrible hack :( */
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (inode->i_dquot[cnt] != NODQUOT &&
+ atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) {
+ exclusive = 1;
+ break;
+ }
+ }
+ if (!exclusive)
+ dquot_drop_locked(inode);
+ up_write(&sb_dqopt(sb)->dqptr_sem);
+out:
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+ if (got_lock[cnt]) {
+ oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+ ocfs2_unlock_global_qf(oinfo, 0);
+ }
+ /* In case we bailed out because we had to do expensive locking
+ * do it now... */
+ if (exclusive)
+ status = ocfs2_dquot_drop_slow(inode);
+ mlog_exit(status);
+ return status;
+}
+
+static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
+{
+ struct ocfs2_dquot *dquot =
+ kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
+
+ if (!dquot)
+ return NULL;
+ return &dquot->dq_dquot;
+}
+
+static void ocfs2_destroy_dquot(struct dquot *dquot)
+{
+ kmem_cache_free(ocfs2_dquot_cachep, dquot);
+}
+
+struct dquot_operations ocfs2_quota_operations = {
+ .initialize = ocfs2_dquot_initialize,
+ .drop = ocfs2_dquot_drop,
+ .alloc_space = dquot_alloc_space,
+ .alloc_inode = dquot_alloc_inode,
+ .free_space = dquot_free_space,
+ .free_inode = dquot_free_inode,
+ .transfer = dquot_transfer,
+ .write_dquot = ocfs2_write_dquot,
+ .acquire_dquot = ocfs2_acquire_dquot,
+ .release_dquot = ocfs2_release_dquot,
+ .mark_dirty = ocfs2_mark_dquot_dirty,
+ .write_info = ocfs2_write_info,
+ .alloc_dquot = ocfs2_alloc_dquot,
+ .destroy_dquot = ocfs2_destroy_dquot,
+};
+
+int ocfs2_quota_setup(void)
+{
+ ocfs2_quota_wq = create_workqueue("o2quot");
+ if (!ocfs2_quota_wq)
+ return -ENOMEM;
+ return 0;
+}
+
+void ocfs2_quota_shutdown(void)
+{
+ if (ocfs2_quota_wq) {
+ flush_workqueue(ocfs2_quota_wq);
+ destroy_workqueue(ocfs2_quota_wq);
+ ocfs2_quota_wq = NULL;
+ }
+}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
new file mode 100644
index 00000000000..07deec5e972
--- /dev/null
+++ b/fs/ocfs2/quota_local.c
@@ -0,0 +1,1253 @@
+/*
+ * Implementation of operations over local quota file
+ */
+
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "file.h"
+#include "buffer_head_io.h"
+#include "journal.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "quota.h"
+
+/* Number of local quota structures per block */
+static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
+{
+ return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) /
+ sizeof(struct ocfs2_local_disk_dqblk));
+}
+
+/* Number of blocks with entries in one chunk */
+static inline unsigned int ol_chunk_blocks(struct super_block *sb)
+{
+ return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+ OCFS2_QBLK_RESERVED_SPACE) << 3) /
+ ol_quota_entries_per_block(sb);
+}
+
+/* Number of entries in a chunk bitmap */
+static unsigned int ol_chunk_entries(struct super_block *sb)
+{
+ return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb);
+}
+
+/* Offset of the chunk in quota file */
+static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
+{
+ /* 1 block for local quota file info, 1 block per chunk for chunk info */
+ return 1 + (ol_chunk_blocks(sb) + 1) * c;
+}
+
+static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
+{
+ int epb = ol_quota_entries_per_block(sb);
+
+ return ol_quota_chunk_block(sb, c) + 1 + off / epb;
+}
+
+static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
+{
+ int epb = ol_quota_entries_per_block(sb);
+
+ return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Offset of the dquot structure in the quota file */
+static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
+{
+ return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
+ ol_dqblk_block_off(sb, c, off);
+}
+
+/* Compute block number from given offset */
+static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
+{
+ return off >> sb->s_blocksize_bits;
+}
+
+static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
+{
+ return off & ((1 << sb->s_blocksize_bits) - 1);
+}
+
+/* Compute offset in the chunk of a structure with the given offset */
+static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
+{
+ int epb = ol_quota_entries_per_block(sb);
+
+ return ((off >> sb->s_blocksize_bits) -
+ ol_quota_chunk_block(sb, c) - 1) * epb
+ + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) /
+ sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Write bufferhead into the fs */
+static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
+ void (*modify)(struct buffer_head *, void *), void *private)
+{
+ struct super_block *sb = inode->i_sb;
+ handle_t *handle;
+ int status;
+
+ handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ return status;
+ }
+ status = ocfs2_journal_access_dq(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ return status;
+ }
+ lock_buffer(bh);
+ modify(bh, private);
+ unlock_buffer(bh);
+ status = ocfs2_journal_dirty(handle, bh);
+ if (status < 0) {
+ mlog_errno(status);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ return status;
+ }
+ status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ if (status < 0) {
+ mlog_errno(status);
+ return status;
+ }
+ return 0;
+}
+
+/* Check whether we understand format of quota files */
+static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
+{
+ unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
+ unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
+ unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
+ unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
+ unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE };
+ struct buffer_head *bh = NULL;
+ struct inode *linode = sb_dqopt(sb)->files[type];
+ struct inode *ginode = NULL;
+ struct ocfs2_disk_dqheader *dqhead;
+ int status, ret = 0;
+
+ /* First check whether we understand local quota file */
+ status = ocfs2_read_quota_block(linode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
+ type);
+ goto out_err;
+ }
+ dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+ if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
+ mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
+ " type=%d\n", le32_to_cpu(dqhead->dqh_magic),
+ lmagics[type], type);
+ goto out_err;
+ }
+ if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
+ mlog(ML_ERROR, "quota file version does not match (%u != %u),"
+ " type=%d\n", le32_to_cpu(dqhead->dqh_version),
+ lversions[type], type);
+ goto out_err;
+ }
+ brelse(bh);
+ bh = NULL;
+
+ /* Next check whether we understand global quota file */
+ ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+ OCFS2_INVALID_SLOT);
+ if (!ginode) {
+ mlog(ML_ERROR, "cannot get global quota file inode "
+ "(type=%d)\n", type);
+ goto out_err;
+ }
+ /* Since the header is read only, we don't care about locking */
+ status = ocfs2_read_quota_block(ginode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read global quota file header "
+ "(type=%d)\n", type);
+ goto out_err;
+ }
+ dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+ if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
+ mlog(ML_ERROR, "global quota file magic does not match "
+ "(%u != %u), type=%d\n",
+ le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
+ goto out_err;
+ }
+ if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
+ mlog(ML_ERROR, "global quota file version does not match "
+ "(%u != %u), type=%d\n",
+ le32_to_cpu(dqhead->dqh_version), gversions[type],
+ type);
+ goto out_err;
+ }
+
+ ret = 1;
+out_err:
+ brelse(bh);
+ iput(ginode);
+ return ret;
+}
+
+/* Release given list of quota file chunks */
+static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
+{
+ struct ocfs2_quota_chunk *pos, *next;
+
+ list_for_each_entry_safe(pos, next, head, qc_chunk) {
+ list_del(&pos->qc_chunk);
+ brelse(pos->qc_headerbh);
+ kmem_cache_free(ocfs2_qf_chunk_cachep, pos);
+ }
+}
+
+/* Load quota bitmaps into memory */
+static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
+ struct ocfs2_local_disk_dqinfo *ldinfo,
+ struct list_head *head)
+{
+ struct ocfs2_quota_chunk *newchunk;
+ int i, status;
+
+ INIT_LIST_HEAD(head);
+ for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) {
+ newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+ if (!newchunk) {
+ ocfs2_release_local_quota_bitmaps(head);
+ return -ENOMEM;
+ }
+ newchunk->qc_num = i;
+ newchunk->qc_headerbh = NULL;
+ status = ocfs2_read_quota_block(inode,
+ ol_quota_chunk_block(inode->i_sb, i),
+ &newchunk->qc_headerbh);
+ if (status) {
+ mlog_errno(status);
+ kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk);
+ ocfs2_release_local_quota_bitmaps(head);
+ return status;
+ }
+ list_add_tail(&newchunk->qc_chunk, head);
+ }
+ return 0;
+}
+
+static void olq_update_info(struct buffer_head *bh, void *private)
+{
+ struct mem_dqinfo *info = private;
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_local_disk_dqinfo *ldinfo;
+
+ ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+ spin_lock(&dq_data_lock);
+ ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+ ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
+ ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
+ spin_unlock(&dq_data_lock);
+}
+
+static int ocfs2_add_recovery_chunk(struct super_block *sb,
+ struct ocfs2_local_disk_chunk *dchunk,
+ int chunk,
+ struct list_head *head)
+{
+ struct ocfs2_recovery_chunk *rc;
+
+ rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS);
+ if (!rc)
+ return -ENOMEM;
+ rc->rc_chunk = chunk;
+ rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+ if (!rc->rc_bitmap) {
+ kfree(rc);
+ return -ENOMEM;
+ }
+ memcpy(rc->rc_bitmap, dchunk->dqc_bitmap,
+ (ol_chunk_entries(sb) + 7) >> 3);
+ list_add_tail(&rc->rc_list, head);
+ return 0;
+}
+
+static void free_recovery_list(struct list_head *head)
+{
+ struct ocfs2_recovery_chunk *next;
+ struct ocfs2_recovery_chunk *rchunk;
+
+ list_for_each_entry_safe(rchunk, next, head, rc_list) {
+ list_del(&rchunk->rc_list);
+ kfree(rchunk->rc_bitmap);
+ kfree(rchunk);
+ }
+}
+
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
+{
+ int type;
+
+ for (type = 0; type < MAXQUOTAS; type++)
+ free_recovery_list(&(rec->r_list[type]));
+ kfree(rec);
+}
+
+/* Load entries in our quota file we have to recover*/
+static int ocfs2_recovery_load_quota(struct inode *lqinode,
+ struct ocfs2_local_disk_dqinfo *ldinfo,
+ int type,
+ struct list_head *head)
+{
+ struct super_block *sb = lqinode->i_sb;
+ struct buffer_head *hbh;
+ struct ocfs2_local_disk_chunk *dchunk;
+ int i, chunks = le32_to_cpu(ldinfo->dqi_chunks);
+ int status = 0;
+
+ for (i = 0; i < chunks; i++) {
+ hbh = NULL;
+ status = ocfs2_read_quota_block(lqinode,
+ ol_quota_chunk_block(sb, i),
+ &hbh);
+ if (status) {
+ mlog_errno(status);
+ break;
+ }
+ dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+ if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb))
+ status = ocfs2_add_recovery_chunk(sb, dchunk, i, head);
+ brelse(hbh);
+ if (status < 0)
+ break;
+ }
+ if (status < 0)
+ free_recovery_list(head);
+ return status;
+}
+
+static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
+{
+ int type;
+ struct ocfs2_quota_recovery *rec;
+
+ rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
+ if (!rec)
+ return NULL;
+ for (type = 0; type < MAXQUOTAS; type++)
+ INIT_LIST_HEAD(&(rec->r_list[type]));
+ return rec;
+}
+
+/* Load information we need for quota recovery into memory */
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+ struct ocfs2_super *osb,
+ int slot_num)
+{
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ struct super_block *sb = osb->sb;
+ struct ocfs2_local_disk_dqinfo *ldinfo;
+ struct inode *lqinode;
+ struct buffer_head *bh;
+ int type;
+ int status = 0;
+ struct ocfs2_quota_recovery *rec;
+
+ mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+ rec = ocfs2_alloc_quota_recovery();
+ if (!rec)
+ return ERR_PTR(-ENOMEM);
+ /* First init... */
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ continue;
+ /* At this point, journal of the slot is already replayed so
+ * we can trust metadata and data of the quota file */
+ lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+ if (!lqinode) {
+ status = -ENOENT;
+ goto out;
+ }
+ status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+ OCFS2_META_LOCK_RECOVERY);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_put;
+ }
+ /* Now read local header */
+ bh = NULL;
+ status = ocfs2_read_quota_block(lqinode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file info header "
+ "(slot=%d type=%d)\n", slot_num, type);
+ goto out_lock;
+ }
+ ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+ status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+ &rec->r_list[type]);
+ brelse(bh);
+out_lock:
+ ocfs2_inode_unlock(lqinode, 1);
+out_put:
+ iput(lqinode);
+ if (status < 0)
+ break;
+ }
+out:
+ if (status < 0) {
+ ocfs2_free_quota_recovery(rec);
+ rec = ERR_PTR(status);
+ }
+ return rec;
+}
+
+/* Sync changes in local quota file into global quota file and
+ * reinitialize local quota file.
+ * The function expects local quota file to be already locked and
+ * dqonoff_mutex locked. */
+static int ocfs2_recover_local_quota_file(struct inode *lqinode,
+ int type,
+ struct ocfs2_quota_recovery *rec)
+{
+ struct super_block *sb = lqinode->i_sb;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ struct ocfs2_local_disk_chunk *dchunk;
+ struct ocfs2_local_disk_dqblk *dqblk;
+ struct dquot *dquot;
+ handle_t *handle;
+ struct buffer_head *hbh = NULL, *qbh = NULL;
+ int status = 0;
+ int bit, chunk;
+ struct ocfs2_recovery_chunk *rchunk, *next;
+ qsize_t spacechange, inodechange;
+
+ mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
+
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+
+ list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
+ chunk = rchunk->rc_chunk;
+ hbh = NULL;
+ status = ocfs2_read_quota_block(lqinode,
+ ol_quota_chunk_block(sb, chunk),
+ &hbh);
+ if (status) {
+ mlog_errno(status);
+ break;
+ }
+ dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+ for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+ qbh = NULL;
+ status = ocfs2_read_quota_block(lqinode,
+ ol_dqblk_block(sb, chunk, bit),
+ &qbh);
+ if (status) {
+ mlog_errno(status);
+ break;
+ }
+ dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
+ ol_dqblk_block_off(sb, chunk, bit));
+ dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
+ if (!dquot) {
+ status = -EIO;
+ mlog(ML_ERROR, "Failed to get quota structure "
+ "for id %u, type %d. Cannot finish quota "
+ "file recovery.\n",
+ (unsigned)le64_to_cpu(dqblk->dqb_id),
+ type);
+ goto out_put_bh;
+ }
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_QSYNC_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_put_dquot;
+ }
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ spin_lock(&dq_data_lock);
+ /* Add usage from quota entry into quota changes
+ * of our node. Auxiliary variables are important
+ * due to signedness */
+ spacechange = le64_to_cpu(dqblk->dqb_spacemod);
+ inodechange = le64_to_cpu(dqblk->dqb_inodemod);
+ dquot->dq_dqb.dqb_curspace += spacechange;
+ dquot->dq_dqb.dqb_curinodes += inodechange;
+ spin_unlock(&dq_data_lock);
+ /* We want to drop reference held by the crashed
+ * node. Since we have our own reference we know
+ * global structure actually won't be freed. */
+ status = ocfs2_global_release_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+ /* Release local quota file entry */
+ status = ocfs2_journal_access_dq(handle, lqinode,
+ qbh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+ lock_buffer(qbh);
+ WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
+ ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+ le32_add_cpu(&dchunk->dqc_free, 1);
+ unlock_buffer(qbh);
+ status = ocfs2_journal_dirty(handle, qbh);
+ if (status < 0)
+ mlog_errno(status);
+out_commit:
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_put_dquot:
+ dqput(dquot);
+out_put_bh:
+ brelse(qbh);
+ if (status < 0)
+ break;
+ }
+ brelse(hbh);
+ list_del(&rchunk->rc_list);
+ kfree(rchunk->rc_bitmap);
+ kfree(rchunk);
+ if (status < 0)
+ break;
+ }
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ if (status < 0)
+ free_recovery_list(&(rec->r_list[type]));
+ mlog_exit(status);
+ return status;
+}
+
+/* Recover local quota files for given node different from us */
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+ struct ocfs2_quota_recovery *rec,
+ int slot_num)
+{
+ unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ struct super_block *sb = osb->sb;
+ struct ocfs2_local_disk_dqinfo *ldinfo;
+ struct buffer_head *bh;
+ handle_t *handle;
+ int type;
+ int status = 0;
+ struct inode *lqinode;
+ unsigned int flags;
+
+ mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+ mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (list_empty(&(rec->r_list[type])))
+ continue;
+ mlog(0, "Recovering quota in slot %d\n", slot_num);
+ lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+ if (!lqinode) {
+ status = -ENOENT;
+ goto out;
+ }
+ status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+ OCFS2_META_LOCK_NOQUEUE);
+ /* Someone else is holding the lock? Then he must be
+ * doing the recovery. Just skip the file... */
+ if (status == -EAGAIN) {
+ mlog(ML_NOTICE, "skipping quota recovery for slot %d "
+ "because quota file is locked.\n", slot_num);
+ status = 0;
+ goto out_put;
+ } else if (status < 0) {
+ mlog_errno(status);
+ goto out_put;
+ }
+ /* Now read local header */
+ bh = NULL;
+ status = ocfs2_read_quota_block(lqinode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file info header "
+ "(slot=%d type=%d)\n", slot_num, type);
+ goto out_lock;
+ }
+ ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+ /* Is recovery still needed? */
+ flags = le32_to_cpu(ldinfo->dqi_flags);
+ if (!(flags & OLQF_CLEAN))
+ status = ocfs2_recover_local_quota_file(lqinode,
+ type,
+ rec);
+ /* We don't want to mark file as clean when it is actually
+ * active */
+ if (slot_num == osb->slot_num)
+ goto out_bh;
+ /* Mark quota file as clean if we are recovering quota file of
+ * some other node. */
+ handle = ocfs2_start_trans(osb, 1);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_bh;
+ }
+ status = ocfs2_journal_access_dq(handle, lqinode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ lock_buffer(bh);
+ ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
+ unlock_buffer(bh);
+ status = ocfs2_journal_dirty(handle, bh);
+ if (status < 0)
+ mlog_errno(status);
+out_trans:
+ ocfs2_commit_trans(osb, handle);
+out_bh:
+ brelse(bh);
+out_lock:
+ ocfs2_inode_unlock(lqinode, 1);
+out_put:
+ iput(lqinode);
+ if (status < 0)
+ break;
+ }
+out:
+ mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+ kfree(rec);
+ return status;
+}
+
+/* Read information header from quota file */
+static int ocfs2_local_read_info(struct super_block *sb, int type)
+{
+ struct ocfs2_local_disk_dqinfo *ldinfo;
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo;
+ struct inode *lqinode = sb_dqopt(sb)->files[type];
+ int status;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_quota_recovery *rec;
+ int locked = 0;
+
+ info->dqi_maxblimit = 0x7fffffffffffffffLL;
+ info->dqi_maxilimit = 0x7fffffffffffffffLL;
+ oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
+ if (!oinfo) {
+ mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
+ " info.");
+ goto out_err;
+ }
+ info->dqi_priv = oinfo;
+ oinfo->dqi_type = type;
+ INIT_LIST_HEAD(&oinfo->dqi_chunk);
+ oinfo->dqi_rec = NULL;
+ oinfo->dqi_lqi_bh = NULL;
+ oinfo->dqi_ibh = NULL;
+
+ status = ocfs2_global_read_info(sb, type);
+ if (status < 0)
+ goto out_err;
+
+ status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+ locked = 1;
+
+ /* Now read local header */
+ status = ocfs2_read_quota_block(lqinode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file info header "
+ "(type=%d)\n", type);
+ goto out_err;
+ }
+ ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+ info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+ oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
+ oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
+ oinfo->dqi_ibh = bh;
+
+ /* We crashed when using local quota file? */
+ if (!(info->dqi_flags & OLQF_CLEAN)) {
+ rec = OCFS2_SB(sb)->quota_rec;
+ if (!rec) {
+ rec = ocfs2_alloc_quota_recovery();
+ if (!rec) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_err;
+ }
+ OCFS2_SB(sb)->quota_rec = rec;
+ }
+
+ status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+ &rec->r_list[type]);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+ }
+
+ status = ocfs2_load_local_quota_bitmaps(lqinode,
+ ldinfo,
+ &oinfo->dqi_chunk);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ /* Now mark quota file as used */
+ info->dqi_flags &= ~OLQF_CLEAN;
+ status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ return 0;
+out_err:
+ if (oinfo) {
+ iput(oinfo->dqi_gqinode);
+ ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+ ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+ brelse(oinfo->dqi_lqi_bh);
+ if (locked)
+ ocfs2_inode_unlock(lqinode, 1);
+ ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+ kfree(oinfo);
+ }
+ brelse(bh);
+ return -1;
+}
+
+/* Write local info to quota file */
+static int ocfs2_local_write_info(struct super_block *sb, int type)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
+ ->dqi_ibh;
+ int status;
+
+ status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
+ info);
+ if (status < 0) {
+ mlog_errno(status);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Release info from memory */
+static int ocfs2_local_free_info(struct super_block *sb, int type)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_quota_chunk *chunk;
+ struct ocfs2_local_disk_chunk *dchunk;
+ int mark_clean = 1, len;
+ int status;
+
+ /* At this point we know there are no more dquots and thus
+ * even if there's some sync in the pdflush queue, it won't
+ * find any dquots and return without doing anything */
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ iput(oinfo->dqi_gqinode);
+ ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+ ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+ list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+ dchunk = (struct ocfs2_local_disk_chunk *)
+ (chunk->qc_headerbh->b_data);
+ if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+ len = ol_chunk_entries(sb);
+ } else {
+ len = (oinfo->dqi_blocks -
+ ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+ * ol_quota_entries_per_block(sb);
+ }
+ /* Not all entries free? Bug! */
+ if (le32_to_cpu(dchunk->dqc_free) != len) {
+ mlog(ML_ERROR, "releasing quota file with used "
+ "entries (type=%d)\n", type);
+ mark_clean = 0;
+ }
+ }
+ ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+
+ /* dqonoff_mutex protects us against racing with recovery thread... */
+ if (oinfo->dqi_rec) {
+ ocfs2_free_quota_recovery(oinfo->dqi_rec);
+ mark_clean = 0;
+ }
+
+ if (!mark_clean)
+ goto out;
+
+ /* Mark local file as clean */
+ info->dqi_flags |= OLQF_CLEAN;
+ status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
+ oinfo->dqi_ibh,
+ olq_update_info,
+ info);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+out:
+ ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
+ brelse(oinfo->dqi_ibh);
+ brelse(oinfo->dqi_lqi_bh);
+ kfree(oinfo);
+ return 0;
+}
+
+static void olq_set_dquot(struct buffer_head *bh, void *private)
+{
+ struct ocfs2_dquot *od = private;
+ struct ocfs2_local_disk_dqblk *dqblk;
+ struct super_block *sb = od->dq_dquot.dq_sb;
+
+ dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
+ + ol_dqblk_block_offset(sb, od->dq_local_off));
+
+ dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id);
+ spin_lock(&dq_data_lock);
+ dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
+ od->dq_origspace);
+ dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
+ od->dq_originodes);
+ spin_unlock(&dq_data_lock);
+ mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
+ od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
+ (long long)le64_to_cpu(dqblk->dqb_inodemod));
+}
+
+/* Write dquot to local quota file */
+static int ocfs2_local_write_dquot(struct dquot *dquot)
+{
+ struct super_block *sb = dquot->dq_sb;
+ struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+ struct buffer_head *bh = NULL;
+ int status;
+
+ status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
+ ol_dqblk_file_block(sb, od->dq_local_off),
+ &bh);
+ if (status) {
+ mlog_errno(status);
+ goto out;
+ }
+ status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh,
+ olq_set_dquot, od);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+out:
+ brelse(bh);
+ return status;
+}
+
+/* Find free entry in local quota file */
+static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
+ int type,
+ int *offset)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_quota_chunk *chunk;
+ struct ocfs2_local_disk_chunk *dchunk;
+ int found = 0, len;
+
+ list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+ dchunk = (struct ocfs2_local_disk_chunk *)
+ chunk->qc_headerbh->b_data;
+ if (le32_to_cpu(dchunk->dqc_free) > 0) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return NULL;
+
+ if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+ len = ol_chunk_entries(sb);
+ } else {
+ len = (oinfo->dqi_blocks -
+ ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+ * ol_quota_entries_per_block(sb);
+ }
+
+ found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
+ /* We failed? */
+ if (found == len) {
+ mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
+ " entries free (type=%d)\n", chunk->qc_num,
+ le32_to_cpu(dchunk->dqc_free), type);
+ return ERR_PTR(-EIO);
+ }
+ *offset = found;
+ return chunk;
+}
+
+/* Add new chunk to the local quota file */
+static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
+ struct super_block *sb,
+ int type,
+ int *offset)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct inode *lqinode = sb_dqopt(sb)->files[type];
+ struct ocfs2_quota_chunk *chunk = NULL;
+ struct ocfs2_local_disk_chunk *dchunk;
+ int status;
+ handle_t *handle;
+ struct buffer_head *bh = NULL;
+ u64 p_blkno;
+
+ /* We are protected by dqio_sem so no locking needed */
+ status = ocfs2_extend_no_holes(lqinode,
+ lqinode->i_size + 2 * sb->s_blocksize,
+ lqinode->i_size);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+ lqinode->i_size + 2 * sb->s_blocksize);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+ if (!chunk) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out;
+ }
+
+ down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+ status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+ &p_blkno, NULL, NULL);
+ up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ bh = sb_getblk(sb, p_blkno);
+ if (!bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out;
+ }
+ dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+
+ handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_journal_access_dq(handle, lqinode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ lock_buffer(bh);
+ dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
+ memset(dchunk->dqc_bitmap, 0,
+ sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+ OCFS2_QBLK_RESERVED_SPACE);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ status = ocfs2_journal_dirty(handle, bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+
+ oinfo->dqi_blocks += 2;
+ oinfo->dqi_chunks++;
+ status = ocfs2_local_write_info(sb, type);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
+ chunk->qc_num = list_entry(chunk->qc_chunk.prev,
+ struct ocfs2_quota_chunk,
+ qc_chunk)->qc_num + 1;
+ chunk->qc_headerbh = bh;
+ *offset = 0;
+ return chunk;
+out_trans:
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+ brelse(bh);
+ kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+ return ERR_PTR(status);
+}
+
+/* Find free entry in local quota file */
+static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
+ struct super_block *sb,
+ int type,
+ int *offset)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_quota_chunk *chunk;
+ struct inode *lqinode = sb_dqopt(sb)->files[type];
+ struct ocfs2_local_disk_chunk *dchunk;
+ int epb = ol_quota_entries_per_block(sb);
+ unsigned int chunk_blocks;
+ int status;
+ handle_t *handle;
+
+ if (list_empty(&oinfo->dqi_chunk))
+ return ocfs2_local_quota_add_chunk(sb, type, offset);
+ /* Is the last chunk full? */
+ chunk = list_entry(oinfo->dqi_chunk.prev,
+ struct ocfs2_quota_chunk, qc_chunk);
+ chunk_blocks = oinfo->dqi_blocks -
+ ol_quota_chunk_block(sb, chunk->qc_num) - 1;
+ if (ol_chunk_blocks(sb) == chunk_blocks)
+ return ocfs2_local_quota_add_chunk(sb, type, offset);
+
+ /* We are protected by dqio_sem so no locking needed */
+ status = ocfs2_extend_no_holes(lqinode,
+ lqinode->i_size + sb->s_blocksize,
+ lqinode->i_size);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+ lqinode->i_size + sb->s_blocksize);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
+ status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+
+ dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data;
+ lock_buffer(chunk->qc_headerbh);
+ le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
+ unlock_buffer(chunk->qc_headerbh);
+ status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ oinfo->dqi_blocks++;
+ status = ocfs2_local_write_info(sb, type);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+
+ status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ *offset = chunk_blocks * epb;
+ return chunk;
+out_trans:
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+ return ERR_PTR(status);
+}
+
+static void olq_alloc_dquot(struct buffer_head *bh, void *private)
+{
+ int *offset = private;
+ struct ocfs2_local_disk_chunk *dchunk;
+
+ dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+ ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
+ le32_add_cpu(&dchunk->dqc_free, -1);
+}
+
+/* Create dquot in the local file for given id */
+static int ocfs2_create_local_dquot(struct dquot *dquot)
+{
+ struct super_block *sb = dquot->dq_sb;
+ int type = dquot->dq_type;
+ struct inode *lqinode = sb_dqopt(sb)->files[type];
+ struct ocfs2_quota_chunk *chunk;
+ struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+ int offset;
+ int status;
+
+ chunk = ocfs2_find_free_entry(sb, type, &offset);
+ if (!chunk) {
+ chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
+ if (IS_ERR(chunk))
+ return PTR_ERR(chunk);
+ } else if (IS_ERR(chunk)) {
+ return PTR_ERR(chunk);
+ }
+ od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
+ od->dq_chunk = chunk;
+
+ /* Initialize dquot structure on disk */
+ status = ocfs2_local_write_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ /* Mark structure as allocated */
+ status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
+ &offset);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+out:
+ return status;
+}
+
+/* Create entry in local file for dquot, load data from the global file */
+static int ocfs2_local_read_dquot(struct dquot *dquot)
+{
+ int status;
+
+ mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
+
+ status = ocfs2_global_read_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ /* Now create entry in the local quota file */
+ status = ocfs2_create_local_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+ mlog_exit(0);
+ return 0;
+out_err:
+ mlog_exit(status);
+ return status;
+}
+
+/* Release dquot structure from local quota file. ocfs2_release_dquot() has
+ * already started a transaction and obtained exclusive lock for global
+ * quota file. */
+static int ocfs2_local_release_dquot(struct dquot *dquot)
+{
+ int status;
+ int type = dquot->dq_type;
+ struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+ struct super_block *sb = dquot->dq_sb;
+ struct ocfs2_local_disk_chunk *dchunk;
+ int offset;
+ handle_t *handle = journal_current_handle();
+
+ BUG_ON(!handle);
+ /* First write all local changes to global file */
+ status = ocfs2_global_release_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
+ od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
+ od->dq_local_off);
+ dchunk = (struct ocfs2_local_disk_chunk *)
+ (od->dq_chunk->qc_headerbh->b_data);
+ /* Mark structure as freed */
+ lock_buffer(od->dq_chunk->qc_headerbh);
+ ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
+ le32_add_cpu(&dchunk->dqc_free, 1);
+ unlock_buffer(od->dq_chunk->qc_headerbh);
+ status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ status = 0;
+out:
+ /* Clear the read bit so that next time someone uses this
+ * dquot he reads fresh info from disk and allocates local
+ * dquot structure */
+ clear_bit(DQ_READ_B, &dquot->dq_flags);
+ return status;
+}
+
+static struct quota_format_ops ocfs2_format_ops = {
+ .check_quota_file = ocfs2_local_check_quota_file,
+ .read_file_info = ocfs2_local_read_info,
+ .write_file_info = ocfs2_global_write_info,
+ .free_file_info = ocfs2_local_free_info,
+ .read_dqblk = ocfs2_local_read_dquot,
+ .commit_dqblk = ocfs2_local_write_dquot,
+ .release_dqblk = ocfs2_local_release_dquot,
+};
+
+struct quota_format_type ocfs2_quota_format = {
+ .qf_fmt_id = QFMT_OCFS2,
+ .qf_ops = &ocfs2_format_ops,
+ .qf_owner = THIS_MODULE
+};
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ffd48db229a..424adaa5f90 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
new_clusters, first_new_cluster);
- ret = ocfs2_journal_access(handle, bm_inode, group_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
}
/* update the inode accordingly. */
- ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_rollback;
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+ /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
+ * so any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
ocfs2_group_bitmap_size(osb->sb) * 8) {
mlog(ML_ERROR, "The disk is too old and small. "
@@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
goto out_unlock;
}
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
- ret = -EIO;
- goto out_unlock;
- }
-
first_new_cluster = le32_to_cpu(fe->i_clusters);
lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
first_new_cluster - 1);
- ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
+ ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
+ &group_bh);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
}
-
group = (struct ocfs2_group_desc *)group_bh->b_data;
- ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
- if (ret) {
- mlog_errno(ret);
- goto out_unlock;
- }
-
cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
@@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode,
struct buffer_head *group_bh)
{
int ret;
- struct ocfs2_group_desc *gd;
+ struct ocfs2_group_desc *gd =
+ (struct ocfs2_group_desc *)group_bh->b_data;
u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
- unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
- le16_to_cpu(di->id2.i_chain.cl_bpc);
-
- gd = (struct ocfs2_group_desc *)group_bh->b_data;
+ ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
+ if (ret)
+ goto out;
- ret = -EIO;
- if (!OCFS2_IS_VALID_GROUP_DESC(gd))
- mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno));
- else if (di->i_blkno != gd->bg_parent_dinode)
- mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
- "pointer (%llu, expected %llu)\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
- (unsigned long long)le64_to_cpu(di->i_blkno));
- else if (le16_to_cpu(gd->bg_bits) > max_bits)
- mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits));
- else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
- mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
- "claims that %u are free\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- le16_to_cpu(gd->bg_free_bits_count));
- else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
- mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
- "max bitmap bits of %u\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- 8 * le16_to_cpu(gd->bg_size));
- else if (le16_to_cpu(gd->bg_chain) != input->chain)
+ ret = -EINVAL;
+ if (le16_to_cpu(gd->bg_chain) != input->chain)
mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
"while input has %u set.\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
@@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode,
else
ret = 0;
+out:
return ret;
}
@@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
cl = &fe->id2.i_chain;
cr = &cl->cl_recs[input->chain];
- ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
@@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
goto out_commit;
}
- ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bdda2d8f850..40661e7824e 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
* this is not true, the read of -1 (UINT64_MAX) will fail.
*/
ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
- OCFS2_BH_IGNORE_CACHE);
+ OCFS2_BH_IGNORE_CACHE, NULL);
if (ret == 0) {
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
bh = NULL; /* Acquire a fresh bh */
status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
- OCFS2_BH_IGNORE_CACHE);
+ OCFS2_BH_IGNORE_CACHE, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c5ff18b46b5..a69628603e1 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -35,6 +35,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
@@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
}
-/* somewhat more expensive than our other checks, so use sparingly. */
-int ocfs2_check_group_descriptor(struct super_block *sb,
- struct ocfs2_dinode *di,
- struct ocfs2_group_desc *gd)
+#define do_error(fmt, ...) \
+ do{ \
+ if (clean_error) \
+ mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
+ else \
+ ocfs2_error(sb, fmt, ##__VA_ARGS__); \
+ } while (0)
+
+static int ocfs2_validate_gd_self(struct super_block *sb,
+ struct buffer_head *bh,
+ int clean_error)
{
- unsigned int max_bits;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
- return -EIO;
+ do_error("Group descriptor #%llu has bad signature %.*s",
+ (unsigned long long)bh->b_blocknr, 7,
+ gd->bg_signature);
+ return -EINVAL;
}
+ if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
+ do_error("Group descriptor #%llu has an invalid bg_blkno "
+ "of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(gd->bg_blkno));
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
+ do_error("Group descriptor #%llu has an invalid "
+ "fs_generation of #%u",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(gd->bg_generation));
+ return -EINVAL;
+ }
+
+ if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+ do_error("Group descriptor #%llu has bit count %u but "
+ "claims that %u are free",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_bits),
+ le16_to_cpu(gd->bg_free_bits_count));
+ return -EINVAL;
+ }
+
+ if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+ do_error("Group descriptor #%llu has bit count %u but "
+ "max bitmap bits of %u",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_bits),
+ 8 * le16_to_cpu(gd->bg_size));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ocfs2_validate_gd_parent(struct super_block *sb,
+ struct ocfs2_dinode *di,
+ struct buffer_head *bh,
+ int clean_error)
+{
+ unsigned int max_bits;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
if (di->i_blkno != gd->bg_parent_dinode) {
- ocfs2_error(sb, "Group descriptor # %llu has bad parent "
- "pointer (%llu, expected %llu)",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
- (unsigned long long)le64_to_cpu(di->i_blkno));
- return -EIO;
+ do_error("Group descriptor #%llu has bad parent "
+ "pointer (%llu, expected %llu)",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ return -EINVAL;
}
max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
if (le16_to_cpu(gd->bg_bits) > max_bits) {
- ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits));
- return -EIO;
+ do_error("Group descriptor #%llu has bit count of %u",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_bits));
+ return -EINVAL;
}
if (le16_to_cpu(gd->bg_chain) >=
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
- ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_chain));
- return -EIO;
+ do_error("Group descriptor #%llu has bad chain %u",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_chain));
+ return -EINVAL;
}
- if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
- ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
- "claims that %u are free",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- le16_to_cpu(gd->bg_free_bits_count));
- return -EIO;
- }
+ return 0;
+}
- if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
- ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
- "max bitmap bits of %u",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- 8 * le16_to_cpu(gd->bg_size));
- return -EIO;
+#undef do_error
+
+/*
+ * This version only prints errors. It does not fail the filesystem, and
+ * exists only for resize.
+ */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+ struct ocfs2_dinode *di,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+ if (rc) {
+ mlog(ML_ERROR,
+ "Checksum failed for group descriptor %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ } else
+ rc = ocfs2_validate_gd_self(sb, bh, 1);
+ if (!rc)
+ rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
+
+ return rc;
+}
+
+static int ocfs2_validate_group_descriptor(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+ mlog(0, "Validating group descriptor %llu\n",
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+ if (rc)
+ return rc;
+
+ /*
+ * Errors after here are fatal.
+ */
+
+ return ocfs2_validate_gd_self(sb, bh, 0);
+}
+
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+ u64 gd_blkno, struct buffer_head **bh)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+ ocfs2_validate_group_descriptor);
+ if (rc)
+ goto out;
+
+ rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
+ if (rc) {
+ brelse(tmp);
+ goto out;
}
- return 0;
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!*bh)
+ *bh = tmp;
+
+out:
+ return rc;
}
static int ocfs2_block_group_fill(handle_t *handle,
@@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
goto bail;
}
- status = ocfs2_journal_access(handle,
- alloc_inode,
- bg_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_gd(handle,
+ alloc_inode,
+ bg_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
bg = (struct ocfs2_group_desc *) bg_bh->b_data;
- status = ocfs2_journal_access(handle, alloc_inode,
- bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, alloc_inode,
+ bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
ac->ac_alloc_slot = slot;
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
+
+ /* The bh was validated by the inode read inside
+ * ocfs2_inode_lock(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
(unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
int offset, start, found, status = 0;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
- return -EIO;
- }
+ /* Callers got this descriptor from
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
found = start = best_offset = best_size = 0;
bitmap = bg->bg_bitmap;
@@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
mlog_entry_void();
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
- status = -EIO;
- goto bail;
- }
+ /* All callers get the descriptor via
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
@@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
if (ocfs2_is_cluster_bitmap(alloc_inode))
journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
- status = ocfs2_journal_access(handle,
- alloc_inode,
- group_bh,
- journal_type);
+ status = ocfs2_journal_access_gd(handle,
+ alloc_inode,
+ group_bh,
+ journal_type);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
- status = -EIO;
- goto out;
- }
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
- status = -EIO;
- goto out;
- }
- if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
- status = -EIO;
- goto out;
- }
+ /* The caller got these descriptors from
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
@@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
bg_ptr = le64_to_cpu(bg->bg_next_group);
prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
- status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_rollback;
@@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
goto out_rollback;
}
- status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_rollback;
@@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
goto out_rollback;
}
- status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_rollback;
@@ -1008,7 +1116,7 @@ out_rollback:
bg->bg_next_group = cpu_to_le64(bg_ptr);
prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
}
-out:
+
mlog_exit(status);
return status;
}
@@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
u16 found;
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *gd;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
struct inode *alloc_inode = ac->ac_inode;
- ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
+ ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
+ &group_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
}
gd = (struct ocfs2_group_desc *) group_bh->b_data;
- if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
- ret = -EIO;
- goto out;
- }
-
ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
ac->ac_max_block, bit_off, &found);
if (ret < 0) {
@@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
bits_wanted, chain,
(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
- status = ocfs2_read_block(alloc_inode,
- le64_to_cpu(cl->cl_recs[chain].c_blkno),
- &group_bh);
+ status = ocfs2_read_group_descriptor(alloc_inode, fe,
+ le64_to_cpu(cl->cl_recs[chain].c_blkno),
+ &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
bg = (struct ocfs2_group_desc *) group_bh->b_data;
- status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
status = -ENOSPC;
/* for now, the chain search is a bit simplistic. We just use
@@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
next_group = le64_to_cpu(bg->bg_next_group);
prev_group_bh = group_bh;
group_bh = NULL;
- status = ocfs2_read_block(alloc_inode,
- next_group, &group_bh);
+ status = ocfs2_read_group_descriptor(alloc_inode, fe,
+ next_group, &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
bg = (struct ocfs2_group_desc *) group_bh->b_data;
- status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
}
if (status < 0) {
if (status != -ENOSPC)
@@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
/* Ok, claim our bits now: set the info on dinode, chainlist
* and then the group */
- status = ocfs2_journal_access(handle,
- alloc_inode,
- ac->ac_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle,
+ alloc_inode,
+ ac->ac_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
BUG_ON(!ac->ac_bh);
fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
- status = -EIO;
- goto bail;
- }
+
+ /* The bh was validated by the inode read during
+ * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
@@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
mlog_entry_void();
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
- status = -EIO;
- goto bail;
- }
+ /* The caller got this descriptor from
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
if (ocfs2_is_cluster_bitmap(alloc_inode))
journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
- status = ocfs2_journal_access(handle, alloc_inode, group_bh,
- journal_type);
+ status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
+ journal_type);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
mlog_entry_void();
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
+ /* The alloc_bh comes from ocfs2_free_dinode() or
+ * ocfs2_free_clusters(). The callers have all locked the
+ * allocator and gotten alloc_bh from the lock call. This
+ * validates the dinode buffer. Any corruption that has happended
+ * is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
(unsigned long long)bg_blkno, start_bit);
- status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
+ status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
+ &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
-
group = (struct ocfs2_group_desc *) group_bh->b_data;
- status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
+
BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
status = ocfs2_block_group_clear_bits(handle, alloc_inode,
@@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
goto bail;
}
- status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 4df159d8f45..e3c13c77f9e 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
* and return that block offset. */
u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
-/* somewhat more expensive than our other checks, so use sparingly. */
+/*
+ * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
+ * finds a problem. A caller that wants to check a group descriptor
+ * without going readonly should read the block with ocfs2_read_block[s]()
+ * and then checking it with this function. This is only resize, really.
+ * Everyone else should be using ocfs2_read_group_descriptor().
+ */
int ocfs2_check_group_descriptor(struct super_block *sb,
struct ocfs2_dinode *di,
- struct ocfs2_group_desc *gd);
+ struct buffer_head *bh);
+/*
+ * Read a group descriptor block into *bh. If *bh is NULL, a bh will be
+ * allocated. This is a cached read. The descriptor will be validated with
+ * ocfs2_validate_group_descriptor().
+ */
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+ u64 gd_blkno, struct buffer_head **bh);
+
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
u32 clusters_to_add, u32 extents_to_split,
struct ocfs2_alloc_context **data_ac,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 304b63ac78c..43ed11345b5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
#include <linux/debugfs.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
#include "ocfs1_fs_compat.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "export.h"
#include "extent_map.h"
@@ -65,10 +67,13 @@
#include "uptodate.h"
#include "ver.h"
#include "xattr.h"
+#include "quota.h"
#include "buffer_head_io.h"
static struct kmem_cache *ocfs2_inode_cachep = NULL;
+struct kmem_cache *ocfs2_dquot_cachep;
+struct kmem_cache *ocfs2_qf_chunk_cachep;
/* OCFS2 needs to schedule several differnt types of work which
* require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb,
static void ocfs2_write_super(struct super_block *sb);
static struct inode *ocfs2_alloc_inode(struct super_block *sb);
static void ocfs2_destroy_inode(struct inode *inode);
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
+static int ocfs2_enable_quotas(struct ocfs2_super *osb);
+static void ocfs2_disable_quotas(struct ocfs2_super *osb);
static const struct super_operations ocfs2_sops = {
.statfs = ocfs2_statfs,
@@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = {
.put_super = ocfs2_put_super,
.remount_fs = ocfs2_remount,
.show_options = ocfs2_show_options,
+ .quota_read = ocfs2_quota_read,
+ .quota_write = ocfs2_quota_write,
};
enum {
@@ -158,6 +168,10 @@ enum {
Opt_user_xattr,
Opt_nouser_xattr,
Opt_inode64,
+ Opt_acl,
+ Opt_noacl,
+ Opt_usrquota,
+ Opt_grpquota,
Opt_err,
};
@@ -180,6 +194,10 @@ static const match_table_t tokens = {
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_inode64, "inode64"},
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_usrquota, "usrquota"},
+ {Opt_grpquota, "grpquota"},
{Opt_err, NULL}
};
@@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
return 0;
}
+static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
+{
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+ && (ino == USER_QUOTA_SYSTEM_INODE
+ || ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
+ return 0;
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+ && (ino == GROUP_QUOTA_SYSTEM_INODE
+ || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
+ return 0;
+ return 1;
+}
+
static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
{
struct inode *new = NULL;
@@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+ if (!ocfs2_need_system_inode(osb, i))
+ continue;
new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs2_release_system_inodes(osb);
@@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
i < NUM_SYSTEM_INODES;
i++) {
+ if (!ocfs2_need_system_inode(osb, i))
+ continue;
new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs2_release_system_inodes(osb);
@@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
/* We're going to/from readonly mode. */
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+ /* Disable quota accounting before remounting RO */
+ if (*flags & MS_RDONLY) {
+ ret = ocfs2_susp_quotas(osb, 0);
+ if (ret < 0)
+ goto out;
+ }
/* Lock here so the check of HARD_RO and the potential
* setting of SOFT_RO is atomic. */
spin_lock(&osb->osb_lock);
@@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
}
unlock_osb:
spin_unlock(&osb->osb_lock);
+ /* Enable quota accounting after remounting RW */
+ if (!ret && !(*flags & MS_RDONLY)) {
+ if (sb_any_quota_suspended(sb))
+ ret = ocfs2_susp_quotas(osb, 1);
+ else
+ ret = ocfs2_enable_quotas(osb);
+ if (ret < 0) {
+ /* Return back changes... */
+ spin_lock(&osb->osb_lock);
+ sb->s_flags |= MS_RDONLY;
+ osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+ spin_unlock(&osb->osb_lock);
+ goto out;
+ }
+ }
}
if (!ret) {
/* Only save off the new mount options in case of a successful
* remount. */
+ if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+ parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
@@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
return 0;
}
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
+{
+ int type;
+ struct super_block *sb = osb->sb;
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ int status = 0;
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ continue;
+ if (unsuspend)
+ status = vfs_quota_enable(
+ sb_dqopt(sb)->files[type],
+ type, QFMT_OCFS2,
+ DQUOT_SUSPENDED);
+ else
+ status = vfs_quota_disable(sb, type,
+ DQUOT_SUSPENDED);
+ if (status < 0)
+ break;
+ }
+ if (status < 0)
+ mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
+ "remount (error = %d).\n", status);
+ return status;
+}
+
+static int ocfs2_enable_quotas(struct ocfs2_super *osb)
+{
+ struct inode *inode[MAXQUOTAS] = { NULL, NULL };
+ struct super_block *sb = osb->sb;
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ int status;
+ int type;
+
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ continue;
+ inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
+ osb->slot_num);
+ if (!inode[type]) {
+ status = -ENOENT;
+ goto out_quota_off;
+ }
+ status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
+ DQUOT_USAGE_ENABLED);
+ if (status < 0)
+ goto out_quota_off;
+ }
+
+ for (type = 0; type < MAXQUOTAS; type++)
+ iput(inode[type]);
+ return 0;
+out_quota_off:
+ ocfs2_disable_quotas(osb);
+ for (type = 0; type < MAXQUOTAS; type++)
+ iput(inode[type]);
+ mlog_errno(status);
+ return status;
+}
+
+static void ocfs2_disable_quotas(struct ocfs2_super *osb)
+{
+ int type;
+ struct inode *inode;
+ struct super_block *sb = osb->sb;
+
+ /* We mostly ignore errors in this function because there's not much
+ * we can do when we see them */
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!sb_has_quota_loaded(sb, type))
+ continue;
+ inode = igrab(sb->s_dquot.files[type]);
+ /* Turn off quotas. This will remove all dquot structures from
+ * memory and so they will be automatically synced to global
+ * quota files */
+ vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED);
+ if (!inode)
+ continue;
+ iput(inode);
+ }
+}
+
+/* Handle quota on quotactl */
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
+ char *path, int remount)
+{
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ return -EINVAL;
+
+ if (remount)
+ return 0; /* Just ignore it has been handled in
+ * ocfs2_remount() */
+ return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
+ format_id, DQUOT_LIMITS_ENABLED);
+}
+
+/* Handle quota off quotactl */
+static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+{
+ if (remount)
+ return 0; /* Ignore now and handle later in
+ * ocfs2_remount() */
+ return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+}
+
+static struct quotactl_ops ocfs2_quotactl_ops = {
+ .quota_on = ocfs2_quota_on,
+ .quota_off = ocfs2_quota_off,
+ .quota_sync = vfs_quota_sync,
+ .get_info = vfs_get_dqinfo,
+ .set_info = vfs_set_dqinfo,
+ .get_dqblk = vfs_get_dqblk,
+ .set_dqblk = vfs_set_dqblk,
+};
+
static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
{
struct dentry *root;
@@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
}
brelse(bh);
bh = NULL;
+
+ if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+ parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
osb->osb_commit_interval = parsed_options.commit_interval;
osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
osb->local_alloc_bits = osb->local_alloc_default_bits;
+ if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
+ !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "User quotas were requested, but this "
+ "filesystem does not have the feature enabled.\n");
+ goto read_super_error;
+ }
+ if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+ !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "Group quotas were requested, but this "
+ "filesystem does not have the feature enabled.\n");
+ goto read_super_error;
+ }
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
@@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = OCFS2_SUPER_MAGIC;
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
* heartbeat=none */
if (bdev_read_only(sb->s_bdev)) {
@@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
atomic_set(&osb->vol_state, VOLUME_MOUNTED);
wake_up(&osb->osb_mount_event);
+ /* Now we can initialize quotas because we can afford to wait
+ * for cluster locks recovery now. That also means that truncation
+ * log recovery can happen but that waits for proper quota setup */
+ if (!(sb->s_flags & MS_RDONLY)) {
+ status = ocfs2_enable_quotas(osb);
+ if (status < 0) {
+ /* We have to err-out specially here because
+ * s_root is already set */
+ mlog_errno(status);
+ atomic_set(&osb->vol_state, VOLUME_DISABLED);
+ wake_up(&osb->osb_mount_event);
+ mlog_exit(status);
+ return status;
+ }
+ }
+
+ ocfs2_complete_quota_recovery(osb);
+
+ /* Now we wake up again for processes waiting for quotas */
+ atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
+ wake_up(&osb->osb_mount_event);
+
mlog_exit(status);
return status;
@@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_inode64:
mopt->mount_opt |= OCFS2_MOUNT_INODE64;
break;
+ case Opt_usrquota:
+ /* We check only on remount, otherwise features
+ * aren't yet initialized. */
+ if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+ mlog(ML_ERROR, "User quota requested but "
+ "filesystem feature is not set\n");
+ status = 0;
+ goto bail;
+ }
+ mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
+ break;
+ case Opt_grpquota:
+ if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+ mlog(ML_ERROR, "Group quota requested but "
+ "filesystem feature is not set\n");
+ status = 0;
+ goto bail;
+ }
+ mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
+ break;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+ case Opt_acl:
+ mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+ break;
+ case Opt_noacl:
+ mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+ break;
+#else
+ case Opt_acl:
+ case Opt_noacl:
+ printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
+ break;
+#endif
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (osb->osb_cluster_stack[0])
seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
osb->osb_cluster_stack);
+ if (opts & OCFS2_MOUNT_USRQUOTA)
+ seq_printf(s, ",usrquota");
+ if (opts & OCFS2_MOUNT_GRPQUOTA)
+ seq_printf(s, ",grpquota");
if (opts & OCFS2_MOUNT_NOUSERXATTR)
seq_printf(s, ",nouser_xattr");
@@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (opts & OCFS2_MOUNT_INODE64)
seq_printf(s, ",inode64");
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+ if (opts & OCFS2_MOUNT_POSIX_ACL)
+ seq_printf(s, ",acl");
+ else
+ seq_printf(s, ",noacl");
+#endif
+
return 0;
}
@@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void)
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
}
+ status = ocfs2_quota_setup();
+ if (status)
+ goto leave;
+
ocfs2_set_locking_protocol();
+ status = register_quota_format(&ocfs2_quota_format);
leave:
if (status < 0) {
+ ocfs2_quota_shutdown();
ocfs2_free_mem_caches();
exit_ocfs2_uptodate_cache();
}
@@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void)
{
mlog_entry_void();
+ ocfs2_quota_shutdown();
+
if (ocfs2_wq) {
flush_workqueue(ocfs2_wq);
destroy_workqueue(ocfs2_wq);
}
+ unregister_quota_format(&ocfs2_quota_format);
+
debugfs_remove(ocfs2_debugfs_root);
ocfs2_free_mem_caches();
@@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void)
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
ocfs2_inode_init_once);
- if (!ocfs2_inode_cachep)
+ ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
+ sizeof(struct ocfs2_dquot),
+ 0,
+ (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ NULL);
+ ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
+ sizeof(struct ocfs2_quota_chunk),
+ 0,
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ NULL);
+ if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
+ !ocfs2_qf_chunk_cachep) {
+ if (ocfs2_inode_cachep)
+ kmem_cache_destroy(ocfs2_inode_cachep);
+ if (ocfs2_dquot_cachep)
+ kmem_cache_destroy(ocfs2_dquot_cachep);
+ if (ocfs2_qf_chunk_cachep)
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
return -ENOMEM;
+ }
return 0;
}
@@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void)
{
if (ocfs2_inode_cachep)
kmem_cache_destroy(ocfs2_inode_cachep);
-
ocfs2_inode_cachep = NULL;
+
+ if (ocfs2_dquot_cachep)
+ kmem_cache_destroy(ocfs2_dquot_cachep);
+ ocfs2_dquot_cachep = NULL;
+
+ if (ocfs2_qf_chunk_cachep)
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+ ocfs2_qf_chunk_cachep = NULL;
}
static int ocfs2_get_sector(struct super_block *sb,
@@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
osb = OCFS2_SB(sb);
BUG_ON(!osb);
+ ocfs2_disable_quotas(osb);
+
ocfs2_shutdown_local_alloc(osb);
ocfs2_truncate_log_shutdown(osb);
@@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
sb->s_export_op = &ocfs2_export_ops;
+ sb->s_qcop = &ocfs2_quotactl_ops;
+ sb->dq_op = &ocfs2_quota_operations;
sb->s_xattr = ocfs2_xattr_handlers;
sb->s_time_gran = 1;
sb->s_flags |= MS_NOATIME;
@@ -1676,6 +1990,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+ /* We have to do a raw check of the feature here */
+ if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
+ OCFS2_FEATURE_INCOMPAT_META_ECC) {
+ status = ocfs2_block_check_validate(bh->b_data,
+ bh->b_size,
+ &di->i_check);
+ if (status)
+ goto out;
+ }
status = -EINVAL;
if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
mlog(ML_ERROR, "found superblock with incorrect block "
@@ -1717,6 +2040,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
}
}
+out:
mlog_exit(status);
return status;
}
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index cbd03dfdc7b..ed0a0cfd68d 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
mlog_entry_void();
- status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
+ status = ocfs2_read_inode_block(inode, bh);
if (status < 0) {
mlog_errno(status);
link = ERR_PTR(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d7367ade1..e1d638af6ac 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -35,12 +35,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
+#include <linux/security.h>
#define MLOG_MASK_PREFIX ML_XATTR
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "file.h"
#include "symlink.h"
@@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root {
};
struct ocfs2_xattr_bucket {
- struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
- struct ocfs2_xattr_header *xh;
+ /* The inode these xattrs are associated with */
+ struct inode *bu_inode;
+
+ /* The actual buffers that make up the bucket */
+ struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+
+ /* How many blocks make up one bucket for this filesystem */
+ int bu_blocks;
+};
+
+struct ocfs2_xattr_set_ctxt {
+ handle_t *handle;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_alloc_context *data_ac;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
};
#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
#define OCFS2_XATTR_INLINE_SIZE 80
+#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \
+ - sizeof(struct ocfs2_xattr_header) \
+ - sizeof(__u32))
+#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \
+ - sizeof(struct ocfs2_xattr_block) \
+ - sizeof(struct ocfs2_xattr_header) \
+ - sizeof(__u32))
static struct ocfs2_xattr_def_value_root def_xv = {
.xv.xr_list.l_count = cpu_to_le16(1),
@@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = {
struct xattr_handler *ocfs2_xattr_handlers[] = {
&ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+ &ocfs2_xattr_acl_access_handler,
+ &ocfs2_xattr_acl_default_handler,
+#endif
&ocfs2_xattr_trusted_handler,
+ &ocfs2_xattr_security_handler,
NULL
};
static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
[OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+ [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
+ = &ocfs2_xattr_acl_access_handler,
+ [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
+ = &ocfs2_xattr_acl_default_handler,
+#endif
[OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
+ [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
};
struct ocfs2_xattr_info {
@@ -98,7 +132,7 @@ struct ocfs2_xattr_search {
*/
struct buffer_head *xattr_bh;
struct ocfs2_xattr_header *header;
- struct ocfs2_xattr_bucket bucket;
+ struct ocfs2_xattr_bucket *bucket;
void *base;
void *end;
struct ocfs2_xattr_entry *here;
@@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
size_t buffer_size);
static int ocfs2_xattr_create_index_block(struct inode *inode,
- struct ocfs2_xattr_search *xs);
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt);
static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs);
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt);
static int ocfs2_delete_xattr_index_block(struct inode *inode,
struct buffer_head *xb_bh);
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+ u64 src_blk, u64 last_blk, u64 to_blk,
+ unsigned int start_bucket,
+ u32 *first_hash);
static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
{
@@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
return len / sizeof(struct ocfs2_xattr_entry);
}
+#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
+#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
+#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
+
+static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
+{
+ struct ocfs2_xattr_bucket *bucket;
+ int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
+
+ bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS);
+ if (bucket) {
+ bucket->bu_inode = inode;
+ bucket->bu_blocks = blks;
+ }
+
+ return bucket;
+}
+
+static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
+{
+ int i;
+
+ for (i = 0; i < bucket->bu_blocks; i++) {
+ brelse(bucket->bu_bhs[i]);
+ bucket->bu_bhs[i] = NULL;
+ }
+}
+
+static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
+{
+ if (bucket) {
+ ocfs2_xattr_bucket_relse(bucket);
+ bucket->bu_inode = NULL;
+ kfree(bucket);
+ }
+}
+
+/*
+ * A bucket that has never been written to disk doesn't need to be
+ * read. We just need the buffer_heads. Don't call this for
+ * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes
+ * them fully.
+ */
+static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+ u64 xb_blkno)
+{
+ int i, rc = 0;
+
+ for (i = 0; i < bucket->bu_blocks; i++) {
+ bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
+ xb_blkno + i);
+ if (!bucket->bu_bhs[i]) {
+ rc = -EIO;
+ mlog_errno(rc);
+ break;
+ }
+
+ if (!ocfs2_buffer_uptodate(bucket->bu_inode,
+ bucket->bu_bhs[i]))
+ ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
+ bucket->bu_bhs[i]);
+ }
+
+ if (rc)
+ ocfs2_xattr_bucket_relse(bucket);
+ return rc;
+}
+
+/* Read the xattr bucket at xb_blkno */
+static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+ u64 xb_blkno)
+{
+ int rc;
+
+ rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
+ bucket->bu_blocks, bucket->bu_bhs, 0,
+ NULL);
+ if (!rc) {
+ rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
+ bucket->bu_bhs,
+ bucket->bu_blocks,
+ &bucket_xh(bucket)->xh_check);
+ if (rc)
+ mlog_errno(rc);
+ }
+
+ if (rc)
+ ocfs2_xattr_bucket_relse(bucket);
+ return rc;
+}
+
+static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
+ struct ocfs2_xattr_bucket *bucket,
+ int type)
+{
+ int i, rc = 0;
+
+ for (i = 0; i < bucket->bu_blocks; i++) {
+ rc = ocfs2_journal_access(handle, bucket->bu_inode,
+ bucket->bu_bhs[i], type);
+ if (rc) {
+ mlog_errno(rc);
+ break;
+ }
+ }
+
+ return rc;
+}
+
+static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
+ struct ocfs2_xattr_bucket *bucket)
+{
+ int i;
+
+ ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
+ bucket->bu_bhs, bucket->bu_blocks,
+ &bucket_xh(bucket)->xh_check);
+
+ for (i = 0; i < bucket->bu_blocks; i++)
+ ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
+}
+
+static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
+ struct ocfs2_xattr_bucket *src)
+{
+ int i;
+ int blocksize = src->bu_inode->i_sb->s_blocksize;
+
+ BUG_ON(dest->bu_blocks != src->bu_blocks);
+ BUG_ON(dest->bu_inode != src->bu_inode);
+
+ for (i = 0; i < src->bu_blocks; i++) {
+ memcpy(bucket_block(dest, i), bucket_block(src, i),
+ blocksize);
+ }
+}
+
+static int ocfs2_validate_xattr_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)bh->b_data;
+
+ mlog(0, "Validating xattr block %llu\n",
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
+ if (rc)
+ return rc;
+
+ /*
+ * Errors after here are fatal
+ */
+
+ if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+ ocfs2_error(sb,
+ "Extended attribute block #%llu has bad "
+ "signature %.*s",
+ (unsigned long long)bh->b_blocknr, 7,
+ xb->xb_signature);
+ return -EINVAL;
+ }
+
+ if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
+ ocfs2_error(sb,
+ "Extended attribute block #%llu has an "
+ "invalid xb_blkno of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(xb->xb_blkno));
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+ ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid "
+ "xb_fs_generation of #%u",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(xb->xb_fs_generation));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
+ struct buffer_head **bh)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_block(inode, xb_blkno, &tmp,
+ ocfs2_validate_xattr_block);
+
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
static inline const char *ocfs2_xattr_prefix(int name_index)
{
struct xattr_handler *handler = NULL;
@@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
return;
}
+static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
+{
+ int size = 0;
+
+ if (value_len <= OCFS2_XATTR_INLINE_SIZE)
+ size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
+ else
+ size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+ size += sizeof(struct ocfs2_xattr_entry);
+
+ return size;
+}
+
+int ocfs2_calc_security_init(struct inode *dir,
+ struct ocfs2_security_xattr_info *si,
+ int *want_clusters,
+ int *xattr_credits,
+ struct ocfs2_alloc_context **xattr_ac)
+{
+ int ret = 0;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ int s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+ si->value_len);
+
+ /*
+ * The max space of security xattr taken inline is
+ * 256(name) + 80(value) + 16(entry) = 352 bytes,
+ * So reserve one metadata block for it is ok.
+ */
+ if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+ s_size > OCFS2_XATTR_FREE_IN_IBODY) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+ *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+ }
+
+ /* reserve clusters for xattr value which will be set in B tree*/
+ if (si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+ si->value_len);
+
+ *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+ new_clusters);
+ *want_clusters += new_clusters;
+ }
+ return ret;
+}
+
+int ocfs2_calc_xattr_init(struct inode *dir,
+ struct buffer_head *dir_bh,
+ int mode,
+ struct ocfs2_security_xattr_info *si,
+ int *want_clusters,
+ int *xattr_credits,
+ struct ocfs2_alloc_context **xattr_ac)
+{
+ int ret = 0;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
+
+ if (si->enable)
+ s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+ si->value_len);
+
+ if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
+ OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+ "", NULL, 0);
+ if (acl_len > 0) {
+ a_size = ocfs2_xattr_entry_real_size(0, acl_len);
+ if (S_ISDIR(mode))
+ a_size <<= 1;
+ } else if (acl_len != 0 && acl_len != -ENODATA) {
+ mlog_errno(ret);
+ return ret;
+ }
+ }
+
+ if (!(s_size + a_size))
+ return ret;
+
+ /*
+ * The max space of security xattr taken inline is
+ * 256(name) + 80(value) + 16(entry) = 352 bytes,
+ * The max space of acl xattr taken inline is
+ * 80(value) + 16(entry) * 2(if directory) = 192 bytes,
+ * when blocksize = 512, may reserve one more cluser for
+ * xattr bucket, otherwise reserve one metadata block
+ * for them is ok.
+ */
+ if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+ (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+ *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+ }
+
+ if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
+ (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
+ *want_clusters += 1;
+ *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
+ }
+
+ /*
+ * reserve credits and clusters for xattrs which has large value
+ * and have to be set outside
+ */
+ if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+ si->value_len);
+ *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+ new_clusters);
+ *want_clusters += new_clusters;
+ }
+ if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL &&
+ acl_len > OCFS2_XATTR_INLINE_SIZE) {
+ /* for directory, it has DEFAULT and ACCESS two types of acls */
+ new_clusters = (S_ISDIR(mode) ? 2 : 1) *
+ ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
+ *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+ new_clusters);
+ *want_clusters += new_clusters;
+ }
+
+ return ret;
+}
+
static int ocfs2_xattr_extend_allocation(struct inode *inode,
u32 clusters_to_add,
- struct buffer_head *xattr_bh,
- struct ocfs2_xattr_value_root *xv)
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int status = 0;
- int restart_func = 0;
- int credits = 0;
- handle_t *handle = NULL;
- struct ocfs2_alloc_context *data_ac = NULL;
- struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = ctxt->handle;
enum ocfs2_alloc_restarted why;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+ u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
struct ocfs2_extent_tree et;
mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
- ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
-
-restart_all:
-
- status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
- &data_ac, &meta_ac);
- if (status) {
- mlog_errno(status);
- goto leave;
- }
-
- credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
- clusters_to_add);
- handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- handle = NULL;
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
-restarted_transaction:
- status = ocfs2_journal_access(handle, inode, xattr_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = vb->vb_access(handle, inode, vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- prev_clusters = le32_to_cpu(xv->xr_clusters);
+ prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
status = ocfs2_add_clusters_in_btree(osb,
inode,
&logical_start,
@@ -255,157 +614,84 @@ restarted_transaction:
0,
&et,
handle,
- data_ac,
- meta_ac,
+ ctxt->data_ac,
+ ctxt->meta_ac,
&why);
- if ((status < 0) && (status != -EAGAIN)) {
- if (status != -ENOSPC)
- mlog_errno(status);
+ if (status < 0) {
+ mlog_errno(status);
goto leave;
}
- status = ocfs2_journal_dirty(handle, xattr_bh);
+ status = ocfs2_journal_dirty(handle, vb->vb_bh);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
+ clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
- if (why != RESTART_NONE && clusters_to_add) {
- if (why == RESTART_META) {
- mlog(0, "restarting function.\n");
- restart_func = 1;
- } else {
- BUG_ON(why != RESTART_TRANS);
-
- mlog(0, "restarting transaction.\n");
- /* TODO: This can be more intelligent. */
- credits = ocfs2_calc_extend_credits(osb->sb,
- et.et_root_el,
- clusters_to_add);
- status = ocfs2_extend_trans(handle, credits);
- if (status < 0) {
- /* handle still has to be committed at
- * this point. */
- status = -ENOMEM;
- mlog_errno(status);
- goto leave;
- }
- goto restarted_transaction;
- }
- }
+ /*
+ * We should have already allocated enough space before the transaction,
+ * so no need to restart.
+ */
+ BUG_ON(why != RESTART_NONE || clusters_to_add);
leave:
- if (handle) {
- ocfs2_commit_trans(osb, handle);
- handle = NULL;
- }
- if (data_ac) {
- ocfs2_free_alloc_context(data_ac);
- data_ac = NULL;
- }
- if (meta_ac) {
- ocfs2_free_alloc_context(meta_ac);
- meta_ac = NULL;
- }
- if ((!status) && restart_func) {
- restart_func = 0;
- goto restart_all;
- }
return status;
}
static int __ocfs2_remove_xattr_range(struct inode *inode,
- struct buffer_head *root_bh,
- struct ocfs2_xattr_value_root *xv,
+ struct ocfs2_xattr_value_buf *vb,
u32 cpos, u32 phys_cpos, u32 len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct inode *tl_inode = osb->osb_tl_inode;
- handle_t *handle;
- struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = ctxt->handle;
struct ocfs2_extent_tree et;
- ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
+ ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
- ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+ ret = vb->vb_access(handle, inode, vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- return ret;
- }
-
- mutex_lock(&tl_inode->i_mutex);
-
- if (ocfs2_truncate_log_needs_flush(osb)) {
- ret = __ocfs2_flush_truncate_log(osb);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
- dealloc);
+ ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
+ &ctxt->dealloc);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
- le32_add_cpu(&xv->xr_clusters, -len);
+ le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
- ret = ocfs2_journal_dirty(handle, root_bh);
+ ret = ocfs2_journal_dirty(handle, vb->vb_bh);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
- ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+ ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
if (ret)
mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(osb, handle);
out:
- mutex_unlock(&tl_inode->i_mutex);
-
- if (meta_ac)
- ocfs2_free_alloc_context(meta_ac);
-
return ret;
}
static int ocfs2_xattr_shrink_size(struct inode *inode,
u32 old_clusters,
u32 new_clusters,
- struct buffer_head *root_bh,
- struct ocfs2_xattr_value_root *xv)
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret = 0;
u32 trunc_len, cpos, phys_cpos, alloc_size;
u64 block;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_cached_dealloc_ctxt dealloc;
-
- ocfs2_init_dealloc_ctxt(&dealloc);
if (old_clusters <= new_clusters)
return 0;
@@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
trunc_len = old_clusters - new_clusters;
while (trunc_len) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
- &alloc_size, &xv->xr_list);
+ &alloc_size,
+ &vb->vb_xv->xr_list);
if (ret) {
mlog_errno(ret);
goto out;
@@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
if (alloc_size > trunc_len)
alloc_size = trunc_len;
- ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+ ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
phys_cpos, alloc_size,
- &dealloc);
+ ctxt);
if (ret) {
mlog_errno(ret);
goto out;
@@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
}
out:
- ocfs2_schedule_truncate_log_flush(osb, 1);
- ocfs2_run_deallocs(osb, &dealloc);
-
return ret;
}
static int ocfs2_xattr_value_truncate(struct inode *inode,
- struct buffer_head *root_bh,
- struct ocfs2_xattr_value_root *xv,
- int len)
+ struct ocfs2_xattr_value_buf *vb,
+ int len,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret;
u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
- u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+ u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
if (new_clusters == old_clusters)
return 0;
@@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
if (new_clusters > old_clusters)
ret = ocfs2_xattr_extend_allocation(inode,
new_clusters - old_clusters,
- root_bh, xv);
+ vb, ctxt);
else
ret = ocfs2_xattr_shrink_size(inode,
old_clusters, new_clusters,
- root_bh, xv);
+ vb, ctxt);
return ret;
}
@@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode,
if (!di->i_xattr_loc)
return ret;
- ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+ ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
}
xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
- if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ret = -EIO;
- goto cleanup;
- }
-
if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
ret = ocfs2_xattr_list_entries(inode, header,
@@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
ret = ocfs2_xattr_tree_list_index_block(inode, xt,
buffer, buffer_size);
}
-cleanup:
+
brelse(blk_bh);
return ret;
@@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
/* Copy ocfs2_xattr_value */
for (i = 0; i < num_clusters * bpc; i++, blkno++) {
- ret = ocfs2_read_block(inode, blkno, &bh);
+ ret = ocfs2_read_block(inode, blkno, &bh, NULL);
if (ret) {
mlog_errno(ret);
goto out;
@@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode,
size_t size;
int ret = -ENODATA, name_offset, name_len, block_off, i;
- memset(&xs->bucket, 0, sizeof(xs->bucket));
+ xs->bucket = ocfs2_xattr_bucket_new(inode);
+ if (!xs->bucket) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto cleanup;
+ }
ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
if (ret) {
@@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
ret = ocfs2_xattr_bucket_get_name_value(inode,
- xs->bucket.xh,
+ bucket_xh(xs->bucket),
i,
&block_off,
&name_offset);
- xs->base = xs->bucket.bhs[block_off]->b_data;
+ xs->base = bucket_block(xs->bucket, block_off);
}
if (ocfs2_xattr_is_local(xs->here)) {
memcpy(buffer, (void *)xs->base +
@@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode,
}
ret = size;
cleanup:
- for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
- brelse(xs->bucket.bhs[i]);
- memset(&xs->bucket, 0, sizeof(xs->bucket));
+ ocfs2_xattr_bucket_free(xs->bucket);
brelse(xs->xattr_bh);
xs->xattr_bh = NULL;
return ret;
}
-/* ocfs2_xattr_get()
- *
- * Copy an extended attribute into the buffer provided.
- * Buffer is NULL to compute the size of buffer required.
- */
-static int ocfs2_xattr_get(struct inode *inode,
+int ocfs2_xattr_get_nolock(struct inode *inode,
+ struct buffer_head *di_bh,
int name_index,
const char *name,
void *buffer,
@@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode,
{
int ret;
struct ocfs2_dinode *di = NULL;
- struct buffer_head *di_bh = NULL;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_xattr_search xis = {
.not_found = -ENODATA,
@@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode,
if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
ret = -ENODATA;
- ret = ocfs2_inode_lock(inode, &di_bh, 0);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
xis.inode_bh = xbs.inode_bh = di_bh;
di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode,
ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
buffer_size, &xbs);
up_read(&oi->ip_xattr_sem);
+
+ return ret;
+}
+
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+static int ocfs2_xattr_get(struct inode *inode,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+ name, buffer, buffer_size);
+
ocfs2_inode_unlock(inode, 0);
brelse(di_bh);
@@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode,
}
static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_value_root *xv,
const void *value,
int value_len)
{
- int ret = 0, i, cp_len, credits;
+ int ret = 0, i, cp_len;
u16 blocksize = inode->i_sb->s_blocksize;
u32 p_cluster, num_clusters;
u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
u64 blkno;
struct buffer_head *bh = NULL;
- handle_t *handle;
BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
- credits = clusters * bpc;
- handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
-
while (cpos < clusters) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
&num_clusters, &xv->xr_list);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
for (i = 0; i < num_clusters * bpc; i++, blkno++) {
- ret = ocfs2_read_block(inode, blkno, &bh);
+ ret = ocfs2_read_block(inode, blkno, &bh, NULL);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
ret = ocfs2_journal_access(handle,
@@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
cp_len = value_len > blocksize ? blocksize : value_len;
@@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
ret = ocfs2_journal_dirty(handle, bh);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
brelse(bh);
bh = NULL;
@@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
}
cpos += num_clusters;
}
-out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
brelse(bh);
@@ -960,28 +1249,22 @@ out:
}
static int ocfs2_xattr_cleanup(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_info *xi,
struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_value_buf *vb,
size_t offs)
{
- handle_t *handle = NULL;
int ret = 0;
size_t name_len = strlen(xi->name);
void *val = xs->base + offs;
size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
- handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
- OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
- ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = vb->vb_access(handle, inode, vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
/* Decrease xattr count */
le16_add_cpu(&xs->header->xh_count, -1);
@@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
memset(val, 0, size);
- ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+ ret = ocfs2_journal_dirty(handle, vb->vb_bh);
if (ret < 0)
mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
return ret;
}
static int ocfs2_xattr_update_entry(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_info *xi,
struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_value_buf *vb,
size_t offs)
{
- handle_t *handle = NULL;
- int ret = 0;
+ int ret;
- handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
- OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
- ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = vb->vb_access(handle, inode, vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
xs->here->xe_name_offset = cpu_to_le16(offs);
@@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
ocfs2_xattr_set_local(xs->here, 0);
ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
- ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+ ret = ocfs2_journal_dirty(handle, vb->vb_bh);
if (ret < 0)
mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
return ret;
}
@@ -1045,6 +1318,8 @@ out:
static int ocfs2_xattr_set_value_outside(struct inode *inode,
struct ocfs2_xattr_info *xi,
struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt,
+ struct ocfs2_xattr_value_buf *vb,
size_t offs)
{
size_t name_len = strlen(xi->name);
@@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
xv->xr_list.l_tree_depth = 0;
xv->xr_list.l_count = cpu_to_le16(1);
xv->xr_list.l_next_free_rec = 0;
+ vb->vb_xv = xv;
- ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
- xi->value_len);
+ ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
if (ret < 0) {
mlog_errno(ret);
return ret;
}
- ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
- xi->value_len);
+ ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
if (ret < 0) {
mlog_errno(ret);
return ret;
}
- ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
+ ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
+ xi->value, xi->value_len);
if (ret < 0)
mlog_errno(ret);
@@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
static int ocfs2_xattr_set_entry(struct inode *inode,
struct ocfs2_xattr_info *xi,
struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt,
int flag)
{
struct ocfs2_xattr_entry *last;
@@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
size_t size_l = 0;
- handle_t *handle = NULL;
+ handle_t *handle = ctxt->handle;
int free, i, ret;
struct ocfs2_xattr_info xi_l = {
.name_index = xi->name_index,
@@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
.value = xi->value,
.value_len = xi->value_len,
};
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = xs->xattr_bh,
+ .vb_access = ocfs2_journal_access_di,
+ };
+
+ if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+ BUG_ON(xs->xattr_bh == xs->inode_bh);
+ vb.vb_access = ocfs2_journal_access_xb;
+ } else
+ BUG_ON(xs->xattr_bh != xs->inode_bh);
/* Compute min_offs, last and free space. */
last = xs->header->xh_entries;
@@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
/* Replace existing local xattr with tree root */
ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
- offs);
+ ctxt, &vb, offs);
if (ret < 0)
mlog_errno(ret);
goto out;
} else if (!ocfs2_xattr_is_local(xs->here)) {
/* For existing xattr which has value outside */
- struct ocfs2_xattr_value_root *xv = NULL;
- xv = (struct ocfs2_xattr_value_root *)(val +
- OCFS2_XATTR_SIZE(name_len));
+ vb.vb_xv = (struct ocfs2_xattr_value_root *)
+ (val + OCFS2_XATTR_SIZE(name_len));
if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
/*
@@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
* then set new value with set_value_outside().
*/
ret = ocfs2_xattr_value_truncate(inode,
- xs->xattr_bh,
- xv,
- xi->value_len);
+ &vb,
+ xi->value_len,
+ ctxt);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- ret = __ocfs2_xattr_set_value_outside(inode,
- xv,
- xi->value,
- xi->value_len);
+ ret = ocfs2_xattr_update_entry(inode,
+ handle,
+ xi,
+ xs,
+ &vb,
+ offs);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_xattr_update_entry(inode,
- xi,
- xs,
- offs);
+ ret = __ocfs2_xattr_set_value_outside(inode,
+ handle,
+ vb.vb_xv,
+ xi->value,
+ xi->value_len);
if (ret < 0)
mlog_errno(ret);
goto out;
@@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
* just trucate old value to zero.
*/
ret = ocfs2_xattr_value_truncate(inode,
- xs->xattr_bh,
- xv,
- 0);
+ &vb,
+ 0,
+ ctxt);
if (ret < 0)
mlog_errno(ret);
}
}
}
- handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
- OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
if (!(flag & OCFS2_INLINE_XATTR_FL)) {
- /* set extended attribute in external block. */
- ret = ocfs2_extend_trans(handle,
- OCFS2_INODE_UPDATE_CREDITS +
- OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
- ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = vb.vb_access(handle, inode, vb.vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
}
@@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
}
@@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
oi->ip_dyn_features |= flag;
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
- /* Update inode ctime */
- inode->i_ctime = CURRENT_TIME;
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
ret = ocfs2_journal_dirty(handle, xs->inode_bh);
if (ret < 0)
mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
-
if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
/*
* Set value outside in B tree.
* This is the second step for value size > INLINE_SIZE.
*/
size_t offs = le16_to_cpu(xs->here->xe_name_offset);
- ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+ ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
+ &vb, offs);
if (ret < 0) {
int ret2;
@@ -1418,41 +1684,56 @@ out_commit:
* If set value outside failed, we have to clean
* the junk tree root we have already set in local.
*/
- ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
+ ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
+ xi, xs, &vb, offs);
if (ret2 < 0)
mlog_errno(ret2);
}
}
out:
return ret;
-
}
static int ocfs2_remove_value_outside(struct inode*inode,
- struct buffer_head *bh,
+ struct ocfs2_xattr_value_buf *vb,
struct ocfs2_xattr_header *header)
{
int ret = 0, i;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+
+ ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
+
+ ctxt.handle = ocfs2_start_trans(osb,
+ ocfs2_remove_extent_credits(osb->sb));
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ goto out;
+ }
for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
if (!ocfs2_xattr_is_local(entry)) {
- struct ocfs2_xattr_value_root *xv;
void *val;
val = (void *)header +
le16_to_cpu(entry->xe_name_offset);
- xv = (struct ocfs2_xattr_value_root *)
+ vb->vb_xv = (struct ocfs2_xattr_value_root *)
(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
- ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+ ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ break;
}
}
}
+ ocfs2_commit_trans(osb, ctxt.handle);
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &ctxt.dealloc);
+out:
return ret;
}
@@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_xattr_header *header;
int ret;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = di_bh,
+ .vb_access = ocfs2_journal_access_di,
+ };
header = (struct ocfs2_xattr_header *)
((void *)di + inode->i_sb->s_blocksize -
le16_to_cpu(di->i_xattr_inline_size));
- ret = ocfs2_remove_value_outside(inode, di_bh, header);
+ ret = ocfs2_remove_value_outside(inode, &vb, header);
return ret;
}
@@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
{
struct ocfs2_xattr_block *xb;
int ret = 0;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = blk_bh,
+ .vb_access = ocfs2_journal_access_xb,
+ };
xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
- ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+ ret = ocfs2_remove_value_outside(inode, &vb, header);
} else
ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
@@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode,
u64 blk, bg_blkno;
u16 bit;
- ret = ocfs2_read_block(inode, block, &blk_bh);
+ ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
- if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ret = -EIO;
- goto out;
- }
-
ret = ocfs2_xattr_block_remove(inode, blk_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
blk = le64_to_cpu(xb->xb_blkno);
bit = le16_to_cpu(xb->xb_suballoc_bit);
bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
*/
static int ocfs2_xattr_ibody_set(struct inode *inode,
struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs)
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
@@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
}
}
- ret = ocfs2_xattr_set_entry(inode, xi, xs,
+ ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
out:
up_write(&oi->ip_alloc_sem);
@@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode,
if (!di->i_xattr_loc)
return ret;
- ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+ ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
}
- xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
- if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ret = -EIO;
- goto cleanup;
- }
-
xs->xattr_bh = blk_bh;
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
xs->header = &xb->xb_attrs.xb_header;
@@ -1804,13 +2085,13 @@ cleanup:
*/
static int ocfs2_xattr_block_set(struct inode *inode,
struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs)
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
struct buffer_head *new_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
- struct ocfs2_alloc_context *meta_ac = NULL;
- handle_t *handle = NULL;
+ handle_t *handle = ctxt->handle;
struct ocfs2_xattr_block *xblk = NULL;
u16 suballoc_bit_start;
u32 num_got;
@@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode,
int ret;
if (!xs->xattr_bh) {
- /*
- * Alloc one external block for extended attribute
- * outside of inode.
- */
- ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+ ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (ret < 0) {
mlog_errno(ret);
- goto out;
- }
- handle = ocfs2_start_trans(osb,
- OCFS2_XATTR_BLOCK_CREATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
- ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
+ goto end;
}
- ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+ ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
&suballoc_bit_start, &num_got,
&first_blkno);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto end;
}
new_bh = sb_getblk(inode->i_sb, first_blkno);
ocfs2_set_new_buffer_uptodate(inode, new_bh);
- ret = ocfs2_journal_access(handle, inode, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ ret = ocfs2_journal_access_xb(handle, inode, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto end;
}
/* Initialize ocfs2_xattr_block */
@@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode,
xs->end = (void *)xblk + inode->i_sb->s_blocksize;
xs->here = xs->header->xh_entries;
-
ret = ocfs2_journal_dirty(handle, new_bh);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto end;
}
di->i_xattr_loc = cpu_to_le64(first_blkno);
- ret = ocfs2_journal_dirty(handle, xs->inode_bh);
- if (ret < 0)
- mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(osb, handle);
-out:
- if (meta_ac)
- ocfs2_free_alloc_context(meta_ac);
- if (ret < 0)
- return ret;
+ ocfs2_journal_dirty(handle, xs->inode_bh);
} else
xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
/* Set extended attribute into external block */
- ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+ ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
+ OCFS2_HAS_XATTR_FL);
if (!ret || ret != -ENOSPC)
goto end;
- ret = ocfs2_xattr_create_index_block(inode, xs);
+ ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
if (ret)
goto end;
}
- ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+ ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
end:
return ret;
}
+/* Check whether the new xattr can be inserted into the inode. */
+static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs)
+{
+ u64 value_size;
+ struct ocfs2_xattr_entry *last;
+ int free, i;
+ size_t min_offs = xs->end - xs->base;
+
+ if (!xs->header)
+ return 0;
+
+ last = xs->header->xh_entries;
+
+ for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+ size_t offs = le16_to_cpu(last->xe_name_offset);
+ if (offs < min_offs)
+ min_offs = offs;
+ last += 1;
+ }
+
+ free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+ if (free < 0)
+ return 0;
+
+ BUG_ON(!xs->not_found);
+
+ if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+ value_size = OCFS2_XATTR_ROOT_SIZE;
+ else
+ value_size = OCFS2_XATTR_SIZE(xi->value_len);
+
+ if (free >= sizeof(struct ocfs2_xattr_entry) +
+ OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
+ return 1;
+
+ return 0;
+}
+
+static int ocfs2_calc_xattr_set_need(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ int *clusters_need,
+ int *meta_need,
+ int *credits_need)
+{
+ int ret = 0, old_in_xb = 0;
+ int clusters_add = 0, meta_add = 0, credits = 0;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_xattr_block *xb = NULL;
+ struct ocfs2_xattr_entry *xe = NULL;
+ struct ocfs2_xattr_value_root *xv = NULL;
+ char *base = NULL;
+ int name_offset, name_len = 0;
+ u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+ xi->value_len);
+ u64 value_size;
+
+ /*
+ * Calculate the clusters we need to write.
+ * No matter whether we replace an old one or add a new one,
+ * we need this for writing.
+ */
+ if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+ credits += new_clusters *
+ ocfs2_clusters_to_blocks(inode->i_sb, 1);
+
+ if (xis->not_found && xbs->not_found) {
+ credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ clusters_add += new_clusters;
+ credits += ocfs2_calc_extend_credits(inode->i_sb,
+ &def_xv.xv.xr_list,
+ new_clusters);
+ }
+
+ goto meta_guess;
+ }
+
+ if (!xis->not_found) {
+ xe = xis->here;
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ base = xis->base;
+ credits += OCFS2_INODE_UPDATE_CREDITS;
+ } else {
+ int i, block_off = 0;
+ xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+ xe = xbs->here;
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ i = xbs->here - xbs->header->xh_entries;
+ old_in_xb = 1;
+
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+ ret = ocfs2_xattr_bucket_get_name_value(inode,
+ bucket_xh(xbs->bucket),
+ i, &block_off,
+ &name_offset);
+ base = bucket_block(xbs->bucket, block_off);
+ credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ } else {
+ base = xbs->base;
+ credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
+ }
+ }
+
+ /*
+ * delete a xattr doesn't need metadata and cluster allocation.
+ * so just calculate the credits and return.
+ *
+ * The credits for removing the value tree will be extended
+ * by ocfs2_remove_extent itself.
+ */
+ if (!xi->value) {
+ if (!ocfs2_xattr_is_local(xe))
+ credits += ocfs2_remove_extent_credits(inode->i_sb);
+
+ goto out;
+ }
+
+ /* do cluster allocation guess first. */
+ value_size = le64_to_cpu(xe->xe_value_size);
+
+ if (old_in_xb) {
+ /*
+ * In xattr set, we always try to set the xe in inode first,
+ * so if it can be inserted into inode successfully, the old
+ * one will be removed from the xattr block, and this xattr
+ * will be inserted into inode as a new xattr in inode.
+ */
+ if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
+ clusters_add += new_clusters;
+ credits += ocfs2_remove_extent_credits(inode->i_sb) +
+ OCFS2_INODE_UPDATE_CREDITS;
+ if (!ocfs2_xattr_is_local(xe))
+ credits += ocfs2_calc_extend_credits(
+ inode->i_sb,
+ &def_xv.xv.xr_list,
+ new_clusters);
+ goto out;
+ }
+ }
+
+ if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ /* the new values will be stored outside. */
+ u32 old_clusters = 0;
+
+ if (!ocfs2_xattr_is_local(xe)) {
+ old_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+ value_size);
+ xv = (struct ocfs2_xattr_value_root *)
+ (base + name_offset + name_len);
+ value_size = OCFS2_XATTR_ROOT_SIZE;
+ } else
+ xv = &def_xv.xv;
+
+ if (old_clusters >= new_clusters) {
+ credits += ocfs2_remove_extent_credits(inode->i_sb);
+ goto out;
+ } else {
+ meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
+ clusters_add += new_clusters - old_clusters;
+ credits += ocfs2_calc_extend_credits(inode->i_sb,
+ &xv->xr_list,
+ new_clusters -
+ old_clusters);
+ if (value_size >= OCFS2_XATTR_ROOT_SIZE)
+ goto out;
+ }
+ } else {
+ /*
+ * Now the new value will be stored inside. So if the new
+ * value is smaller than the size of value root or the old
+ * value, we don't need any allocation, otherwise we have
+ * to guess metadata allocation.
+ */
+ if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
+ (!ocfs2_xattr_is_local(xe) &&
+ OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
+ goto out;
+ }
+
+meta_guess:
+ /* calculate metadata allocation. */
+ if (di->i_xattr_loc) {
+ if (!xbs->xattr_bh) {
+ ret = ocfs2_read_xattr_block(inode,
+ le64_to_cpu(di->i_xattr_loc),
+ &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xb = (struct ocfs2_xattr_block *)bh->b_data;
+ } else
+ xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+
+ /*
+ * If there is already an xattr tree, good, we can calculate
+ * like other b-trees. Otherwise we may have the chance of
+ * create a tree, the credit calculation is borrowed from
+ * ocfs2_calc_extend_credits with root_el = NULL. And the
+ * new tree will be cluster based, so no meta is needed.
+ */
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+ struct ocfs2_extent_list *el =
+ &xb->xb_attrs.xb_root.xt_list;
+ meta_add += ocfs2_extend_meta_needed(el);
+ credits += ocfs2_calc_extend_credits(inode->i_sb,
+ el, 1);
+ } else
+ credits += OCFS2_SUBALLOC_ALLOC + 1;
+
+ /*
+ * This cluster will be used either for new bucket or for
+ * new xattr block.
+ * If the cluster size is the same as the bucket size, one
+ * more is needed since we may need to extend the bucket
+ * also.
+ */
+ clusters_add += 1;
+ credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ if (OCFS2_XATTR_BUCKET_SIZE ==
+ OCFS2_SB(inode->i_sb)->s_clustersize) {
+ credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ clusters_add += 1;
+ }
+ } else {
+ meta_add += 1;
+ credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+ }
+out:
+ if (clusters_need)
+ *clusters_need = clusters_add;
+ if (meta_need)
+ *meta_need = meta_add;
+ if (credits_need)
+ *credits_need = credits;
+ brelse(bh);
+ return ret;
+}
+
+static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ struct ocfs2_xattr_set_ctxt *ctxt,
+ int *credits)
+{
+ int clusters_add, meta_add, ret;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
+
+ ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
+
+ ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
+ &clusters_add, &meta_add, credits);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
+ "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
+
+ if (meta_add) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
+ &ctxt->meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (clusters_add) {
+ ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac);
+ if (ret)
+ mlog_errno(ret);
+ }
+out:
+ if (ret) {
+ if (ctxt->meta_ac) {
+ ocfs2_free_alloc_context(ctxt->meta_ac);
+ ctxt->meta_ac = NULL;
+ }
+
+ /*
+ * We cannot have an error and a non null ctxt->data_ac.
+ */
+ }
+
+ return ret;
+}
+
+static int __ocfs2_xattr_set_handle(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret = 0, credits, old_found;
+
+ if (!xi->value) {
+ /* Remove existing extended attribute */
+ if (!xis->not_found)
+ ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+ else if (!xbs->not_found)
+ ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+ } else {
+ /* We always try to set extended attribute into inode first*/
+ ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+ if (!ret && !xbs->not_found) {
+ /*
+ * If succeed and that extended attribute existing in
+ * external block, then we will remove it.
+ */
+ xi->value = NULL;
+ xi->value_len = 0;
+
+ old_found = xis->not_found;
+ xis->not_found = -ENODATA;
+ ret = ocfs2_calc_xattr_set_need(inode,
+ di,
+ xi,
+ xis,
+ xbs,
+ NULL,
+ NULL,
+ &credits);
+ xis->not_found = old_found;
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_extend_trans(ctxt->handle, credits +
+ ctxt->handle->h_buffer_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+ } else if (ret == -ENOSPC) {
+ if (di->i_xattr_loc && !xbs->xattr_bh) {
+ ret = ocfs2_xattr_block_find(inode,
+ xi->name_index,
+ xi->name, xbs);
+ if (ret)
+ goto out;
+
+ old_found = xis->not_found;
+ xis->not_found = -ENODATA;
+ ret = ocfs2_calc_xattr_set_need(inode,
+ di,
+ xi,
+ xis,
+ xbs,
+ NULL,
+ NULL,
+ &credits);
+ xis->not_found = old_found;
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_extend_trans(ctxt->handle, credits +
+ ctxt->handle->h_buffer_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ /*
+ * If no space in inode, we will set extended attribute
+ * into external block.
+ */
+ ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+ if (ret)
+ goto out;
+ if (!xis->not_found) {
+ /*
+ * If succeed and that extended attribute
+ * existing in inode, we will remove it.
+ */
+ xi->value = NULL;
+ xi->value_len = 0;
+ xbs->not_found = -ENODATA;
+ ret = ocfs2_calc_xattr_set_need(inode,
+ di,
+ xi,
+ xis,
+ xbs,
+ NULL,
+ NULL,
+ &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_extend_trans(ctxt->handle, credits +
+ ctxt->handle->h_buffer_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_xattr_ibody_set(inode, xi,
+ xis, ctxt);
+ }
+ }
+ }
+
+ if (!ret) {
+ /* Update inode ctime. */
+ ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ inode->i_ctime = CURRENT_TIME;
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
+ }
+out:
+ return ret;
+}
+
+/*
+ * This function only called duing creating inode
+ * for init security/acl xattrs of the new inode.
+ * All transanction credits have been reserved in mknod.
+ */
+int ocfs2_xattr_set_handle(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ int name_index,
+ const char *name,
+ const void *value,
+ size_t value_len,
+ int flags,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ struct ocfs2_dinode *di;
+ int ret;
+
+ struct ocfs2_xattr_info xi = {
+ .name_index = name_index,
+ .name = name,
+ .value = value,
+ .value_len = value_len,
+ };
+
+ struct ocfs2_xattr_search xis = {
+ .not_found = -ENODATA,
+ };
+
+ struct ocfs2_xattr_search xbs = {
+ .not_found = -ENODATA,
+ };
+
+ struct ocfs2_xattr_set_ctxt ctxt = {
+ .handle = handle,
+ .meta_ac = meta_ac,
+ .data_ac = data_ac,
+ };
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ /*
+ * In extreme situation, may need xattr bucket when
+ * block size is too small. And we have already reserved
+ * the credits for bucket in mknod.
+ */
+ if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
+ xbs.bucket = ocfs2_xattr_bucket_new(inode);
+ if (!xbs.bucket) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+ }
+
+ xis.inode_bh = xbs.inode_bh = di_bh;
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+ ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+ if (ret)
+ goto cleanup;
+ if (xis.not_found) {
+ ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+ if (ret)
+ goto cleanup;
+ }
+
+ ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+
+cleanup:
+ up_write(&OCFS2_I(inode)->ip_xattr_sem);
+ brelse(xbs.xattr_bh);
+ ocfs2_xattr_bucket_free(xbs.bucket);
+
+ return ret;
+}
+
/*
* ocfs2_xattr_set()
*
@@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode,
{
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
- int ret;
- u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int ret, credits;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
struct ocfs2_xattr_info xi = {
.name_index = name_index,
@@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode,
if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
+ /*
+ * Only xbs will be used on indexed trees. xis doesn't need a
+ * bucket.
+ */
+ xbs.bucket = ocfs2_xattr_bucket_new(inode);
+ if (!xbs.bucket) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto cleanup_nolock;
}
xis.inode_bh = xbs.inode_bh = di_bh;
di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode,
goto cleanup;
}
- if (!value) {
- /* Remove existing extended attribute */
- if (!xis.not_found)
- ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
- else if (!xbs.not_found)
- ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
- } else {
- /* We always try to set extended attribute into inode first*/
- ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
- if (!ret && !xbs.not_found) {
- /*
- * If succeed and that extended attribute existing in
- * external block, then we will remove it.
- */
- xi.value = NULL;
- xi.value_len = 0;
- ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
- } else if (ret == -ENOSPC) {
- if (di->i_xattr_loc && !xbs.xattr_bh) {
- ret = ocfs2_xattr_block_find(inode, name_index,
- name, &xbs);
- if (ret)
- goto cleanup;
- }
- /*
- * If no space in inode, we will set extended attribute
- * into external block.
- */
- ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
- if (ret)
- goto cleanup;
- if (!xis.not_found) {
- /*
- * If succeed and that extended attribute
- * existing in inode, we will remove it.
- */
- xi.value = NULL;
- xi.value_len = 0;
- ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
- }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mutex_unlock(&tl_inode->i_mutex);
+ mlog_errno(ret);
+ goto cleanup;
}
}
+ mutex_unlock(&tl_inode->i_mutex);
+
+ ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
+ &xbs, &ctxt, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+
+ /* we need to update inode's ctime field, so add credit for it. */
+ credits += OCFS2_INODE_UPDATE_CREDITS;
+ ctxt.handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ goto cleanup;
+ }
+
+ ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+
+ ocfs2_commit_trans(osb, ctxt.handle);
+
+ if (ctxt.data_ac)
+ ocfs2_free_alloc_context(ctxt.data_ac);
+ if (ctxt.meta_ac)
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &ctxt.dealloc);
cleanup:
up_write(&OCFS2_I(inode)->ip_xattr_sem);
ocfs2_inode_unlock(inode, 1);
+cleanup_nolock:
brelse(di_bh);
brelse(xbs.xattr_bh);
- for (i = 0; i < blk_per_bucket; i++)
- brelse(xbs.bucket.bhs[i]);
+ ocfs2_xattr_bucket_free(xbs.bucket);
return ret;
}
@@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode,
void *para);
static int ocfs2_find_xe_in_bucket(struct inode *inode,
- struct buffer_head *header_bh,
+ struct ocfs2_xattr_bucket *bucket,
int name_index,
const char *name,
u32 name_hash,
@@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
int *found)
{
int i, ret = 0, cmp = 1, block_off, new_offset;
- struct ocfs2_xattr_header *xh =
- (struct ocfs2_xattr_header *)header_bh->b_data;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
size_t name_len = strlen(name);
struct ocfs2_xattr_entry *xe = NULL;
- struct buffer_head *name_bh = NULL;
char *xe_name;
/*
@@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
break;
}
- ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
- &name_bh);
- if (ret) {
- mlog_errno(ret);
- break;
- }
- xe_name = name_bh->b_data + new_offset;
- cmp = memcmp(name, xe_name, name_len);
- brelse(name_bh);
- name_bh = NULL;
-
- if (cmp == 0) {
+ xe_name = bucket_block(bucket, block_off) + new_offset;
+ if (!memcmp(name, xe_name, name_len)) {
*xe_index = i;
*found = 1;
ret = 0;
@@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
struct ocfs2_xattr_search *xs)
{
int ret, found = 0;
- struct buffer_head *bh = NULL;
- struct buffer_head *lower_bh = NULL;
struct ocfs2_xattr_header *xh = NULL;
struct ocfs2_xattr_entry *xe = NULL;
u16 index = 0;
u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
int low_bucket = 0, bucket, high_bucket;
+ struct ocfs2_xattr_bucket *search;
u32 last_hash;
- u64 blkno;
+ u64 blkno, lower_blkno = 0;
- ret = ocfs2_read_block(inode, p_blkno, &bh);
+ search = ocfs2_xattr_bucket_new(inode);
+ if (!search) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_xattr_bucket(search, p_blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
- xh = (struct ocfs2_xattr_header *)bh->b_data;
+ xh = bucket_xh(search);
high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
-
while (low_bucket <= high_bucket) {
- brelse(bh);
- bh = NULL;
- bucket = (low_bucket + high_bucket) / 2;
+ ocfs2_xattr_bucket_relse(search);
+ bucket = (low_bucket + high_bucket) / 2;
blkno = p_blkno + bucket * blk_per_bucket;
-
- ret = ocfs2_read_block(inode, blkno, &bh);
+ ret = ocfs2_read_xattr_bucket(search, blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
- xh = (struct ocfs2_xattr_header *)bh->b_data;
+ xh = bucket_xh(search);
xe = &xh->xh_entries[0];
if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
high_bucket = bucket - 1;
@@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
last_hash = le32_to_cpu(xe->xe_name_hash);
- /* record lower_bh which may be the insert place. */
- brelse(lower_bh);
- lower_bh = bh;
- bh = NULL;
+ /* record lower_blkno which may be the insert place. */
+ lower_blkno = blkno;
if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
low_bucket = bucket + 1;
@@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
}
/* the searched xattr should reside in this bucket if exists. */
- ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
+ ret = ocfs2_find_xe_in_bucket(inode, search,
name_index, name, name_hash,
&index, &found);
if (ret) {
@@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
* When the xattr's hash value is in the gap of 2 buckets, we will
* always set it to the previous bucket.
*/
- if (!lower_bh) {
- /*
- * We can't find any bucket whose first name_hash is less
- * than the find name_hash.
- */
- BUG_ON(bh->b_blocknr != p_blkno);
- lower_bh = bh;
- bh = NULL;
+ if (!lower_blkno)
+ lower_blkno = p_blkno;
+
+ /* This should be in cache - we just read it during the search */
+ ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
- xs->bucket.bhs[0] = lower_bh;
- xs->bucket.xh = (struct ocfs2_xattr_header *)
- xs->bucket.bhs[0]->b_data;
- lower_bh = NULL;
- xs->header = xs->bucket.xh;
- xs->base = xs->bucket.bhs[0]->b_data;
+ xs->header = bucket_xh(xs->bucket);
+ xs->base = bucket_block(xs->bucket, 0);
xs->end = xs->base + inode->i_sb->s_blocksize;
if (found) {
- /*
- * If we have found the xattr enty, read all the blocks in
- * this bucket.
- */
- ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
- blk_per_bucket - 1, &xs->bucket.bhs[1],
- 0);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
xs->here = &xs->header->xh_entries[index];
mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
- (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
+ (unsigned long long)bucket_blkno(xs->bucket), index);
} else
ret = -ENODATA;
out:
- brelse(bh);
- brelse(lower_bh);
+ ocfs2_xattr_bucket_free(search);
return ret;
}
@@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
xattr_bucket_func *func,
void *para)
{
- int i, j, ret = 0;
- int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int i, ret = 0;
u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
u32 num_buckets = clusters * bpc;
- struct ocfs2_xattr_bucket bucket;
+ struct ocfs2_xattr_bucket *bucket;
- memset(&bucket, 0, sizeof(bucket));
+ bucket = ocfs2_xattr_bucket_new(inode);
+ if (!bucket) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
clusters, (unsigned long long)blkno);
- for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
- ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
- bucket.bhs, 0);
+ for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
+ ret = ocfs2_read_xattr_bucket(bucket, blkno);
if (ret) {
mlog_errno(ret);
- goto out;
+ break;
}
- bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
/*
* The real bucket num in this series of blocks is stored
* in the 1st bucket.
*/
if (i == 0)
- num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
+ num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
mlog(0, "iterating xattr bucket %llu, first hash %u\n",
(unsigned long long)blkno,
- le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
+ le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
if (func) {
- ret = func(inode, &bucket, para);
- if (ret) {
+ ret = func(inode, bucket, para);
+ if (ret)
mlog_errno(ret);
- break;
- }
+ /* Fall through to bucket_relse() */
}
- for (j = 0; j < blk_per_bucket; j++)
- brelse(bucket.bhs[j]);
- memset(&bucket, 0, sizeof(bucket));
+ ocfs2_xattr_bucket_relse(bucket);
+ if (ret)
+ break;
}
-out:
- for (j = 0; j < blk_per_bucket; j++)
- brelse(bucket.bhs[j]);
-
+ ocfs2_xattr_bucket_free(bucket);
return ret;
}
@@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
int i, block_off, new_offset;
const char *prefix, *name;
- for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
- struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
+ for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
+ struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
prefix = ocfs2_xattr_prefix(type);
if (prefix) {
ret = ocfs2_xattr_bucket_get_name_value(inode,
- bucket->xh,
+ bucket_xh(bucket),
i,
&block_off,
&new_offset);
if (ret)
break;
- name = (const char *)bucket->bhs[block_off]->b_data +
+ name = (const char *)bucket_block(bucket, block_off) +
new_offset;
ret = ocfs2_xattr_list_entry(xl->buffer,
xl->buffer_size,
@@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size)
/*
* When the ocfs2_xattr_block is filled up, new bucket will be created
* and all the xattr entries will be moved to the new bucket.
+ * The header goes at the start of the bucket, and the names+values are
+ * filled from the end. This is why *target starts as the last buffer.
* Note: we need to sort the entries since they are not saved in order
* in the ocfs2_xattr_block.
*/
static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
struct buffer_head *xb_bh,
- struct buffer_head *xh_bh,
- struct buffer_head *data_bh)
+ struct ocfs2_xattr_bucket *bucket)
{
int i, blocksize = inode->i_sb->s_blocksize;
+ int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
u16 offset, size, off_change;
struct ocfs2_xattr_entry *xe;
struct ocfs2_xattr_block *xb =
(struct ocfs2_xattr_block *)xb_bh->b_data;
struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
- struct ocfs2_xattr_header *xh =
- (struct ocfs2_xattr_header *)xh_bh->b_data;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
u16 count = le16_to_cpu(xb_xh->xh_count);
- char *target = xh_bh->b_data, *src = xb_bh->b_data;
+ char *src = xb_bh->b_data;
+ char *target = bucket_block(bucket, blks - 1);
mlog(0, "cp xattr from block %llu to bucket %llu\n",
(unsigned long long)xb_bh->b_blocknr,
- (unsigned long long)xh_bh->b_blocknr);
+ (unsigned long long)bucket_blkno(bucket));
+
+ for (i = 0; i < blks; i++)
+ memset(bucket_block(bucket, i), 0, blocksize);
- memset(xh_bh->b_data, 0, blocksize);
- if (data_bh)
- memset(data_bh->b_data, 0, blocksize);
/*
* Since the xe_name_offset is based on ocfs2_xattr_header,
* there is a offset change corresponding to the change of
@@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
size = blocksize - offset;
/* copy all the names and values. */
- if (data_bh)
- target = data_bh->b_data;
memcpy(target + offset, src + offset, size);
/* Init new header now. */
@@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
/* copy all the entries. */
- target = xh_bh->b_data;
+ target = bucket_block(bucket, 0);
offset = offsetof(struct ocfs2_xattr_header, xh_entries);
size = count * sizeof(struct ocfs2_xattr_entry);
memcpy(target + offset, (char *)xb_xh + offset, size);
@@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
* While if the entry is in index b-tree, "bucket" indicates the
* real place of the xattr.
*/
-static int ocfs2_xattr_update_xattr_search(struct inode *inode,
- struct ocfs2_xattr_search *xs,
- struct buffer_head *old_bh,
- struct buffer_head *new_bh)
+static void ocfs2_xattr_update_xattr_search(struct inode *inode,
+ struct ocfs2_xattr_search *xs,
+ struct buffer_head *old_bh)
{
- int ret = 0;
char *buf = old_bh->b_data;
struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
- int i, blocksize = inode->i_sb->s_blocksize;
- u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
- xs->bucket.bhs[0] = new_bh;
- get_bh(new_bh);
- xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
- xs->header = xs->bucket.xh;
+ int i;
- xs->base = new_bh->b_data;
+ xs->header = bucket_xh(xs->bucket);
+ xs->base = bucket_block(xs->bucket, 0);
xs->end = xs->base + inode->i_sb->s_blocksize;
- if (!xs->not_found) {
- if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
- ret = ocfs2_read_blocks(inode,
- xs->bucket.bhs[0]->b_blocknr + 1,
- blk_per_bucket - 1, &xs->bucket.bhs[1],
- 0);
- if (ret) {
- mlog_errno(ret);
- return ret;
- }
-
- }
- i = xs->here - old_xh->xh_entries;
- xs->here = &xs->header->xh_entries[i];
- }
+ if (xs->not_found)
+ return;
- return ret;
+ i = xs->here - old_xh->xh_entries;
+ xs->here = &xs->header->xh_entries[i];
}
static int ocfs2_xattr_create_index_block(struct inode *inode,
- struct ocfs2_xattr_search *xs)
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
- int ret, credits = OCFS2_SUBALLOC_ALLOC;
+ int ret;
u32 bit_off, len;
u64 blkno;
- handle_t *handle;
+ handle_t *handle = ctxt->handle;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_alloc_context *data_ac;
- struct buffer_head *xh_bh = NULL, *data_bh = NULL;
struct buffer_head *xb_bh = xs->xattr_bh;
struct ocfs2_xattr_block *xb =
(struct ocfs2_xattr_block *)xb_bh->b_data;
struct ocfs2_xattr_tree_root *xr;
u16 xb_flags = le16_to_cpu(xb->xb_flags);
- u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
mlog(0, "create xattr index block for %llu\n",
(unsigned long long)xb_bh->b_blocknr);
BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
-
- ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ BUG_ON(!xs->bucket);
/*
* XXX:
@@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
*/
down_write(&oi->ip_alloc_sem);
- /*
- * 3 more credits, one for xattr block update, one for the 1st block
- * of the new xattr bucket and one for the value/data.
- */
- credits += 3;
- handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out_sem;
- }
-
- ret = ocfs2_journal_access(handle, inode, xb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
- ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+ ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+ 1, 1, &bit_off, &len);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
/*
@@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
mlog(0, "allocate 1 cluster from %llu to xattr block\n",
(unsigned long long)blkno);
- xh_bh = sb_getblk(inode->i_sb, blkno);
- if (!xh_bh) {
- ret = -EIO;
+ ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
+ if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
- ocfs2_set_new_buffer_uptodate(inode, xh_bh);
-
- ret = ocfs2_journal_access(handle, inode, xh_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
- }
-
- if (bpb > 1) {
- data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
- if (!data_bh) {
- ret = -EIO;
- mlog_errno(ret);
- goto out_commit;
- }
-
- ocfs2_set_new_buffer_uptodate(inode, data_bh);
-
- ret = ocfs2_journal_access(handle, inode, data_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ goto out;
}
- ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
+ ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
+ ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
- ocfs2_journal_dirty(handle, xh_bh);
- if (data_bh)
- ocfs2_journal_dirty(handle, data_bh);
-
- ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
/* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
- ret = ocfs2_journal_dirty(handle, xb_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
-out_commit:
- ocfs2_commit_trans(osb, handle);
-
-out_sem:
- up_write(&oi->ip_alloc_sem);
+ ocfs2_journal_dirty(handle, xb_bh);
out:
- if (data_ac)
- ocfs2_free_alloc_context(data_ac);
-
- brelse(xh_bh);
- brelse(data_bh);
+ up_write(&oi->ip_alloc_sem);
return ret;
}
@@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b)
* so that we can spare some space for insertion.
*/
static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_bucket *bucket)
{
int ret, i;
size_t end, offset, len, value_len;
struct ocfs2_xattr_header *xh;
char *entries, *buf, *bucket_buf = NULL;
- u64 blkno = bucket->bhs[0]->b_blocknr;
- u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ u64 blkno = bucket_blkno(bucket);
u16 xh_free_start;
size_t blocksize = inode->i_sb->s_blocksize;
- handle_t *handle;
- struct buffer_head **bhs;
struct ocfs2_xattr_entry *xe;
- bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
- GFP_NOFS);
- if (!bhs)
- return -ENOMEM;
-
- ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
- if (ret)
- goto out;
-
/*
* In order to make the operation more efficient and generic,
* we copy all the blocks into a contiguous memory and do the
@@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
}
buf = bucket_buf;
- for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
- memcpy(buf, bhs[i]->b_data, blocksize);
+ for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+ memcpy(buf, bucket_block(bucket, i), blocksize);
- handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
mlog_errno(ret);
goto out;
}
- for (i = 0; i < blk_per_bucket; i++) {
- ret = ocfs2_journal_access(handle, inode, bhs[i],
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto commit;
- }
- }
-
xh = (struct ocfs2_xattr_header *)bucket_buf;
entries = (char *)xh->xh_entries;
xh_free_start = le16_to_cpu(xh->xh_free_start);
@@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
"bucket %llu\n", (unsigned long long)blkno);
if (xh_free_start == end)
- goto commit;
+ goto out;
memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
xh->xh_free_start = cpu_to_le16(end);
@@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
cmp_xe, swap_xe);
buf = bucket_buf;
- for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
- memcpy(bhs[i]->b_data, buf, blocksize);
- ocfs2_journal_dirty(handle, bhs[i]);
- }
+ for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+ memcpy(bucket_block(bucket, i), buf, blocksize);
+ ocfs2_xattr_bucket_journal_dirty(handle, bucket);
-commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
-
- if (bhs) {
- for (i = 0; i < blk_per_bucket; i++)
- brelse(bhs[i]);
- }
- kfree(bhs);
-
kfree(bucket_buf);
return ret;
}
/*
- * Move half nums of the xattr bucket in the previous cluster to this new
- * cluster. We only touch the last cluster of the previous extend record.
+ * prev_blkno points to the start of an existing extent. new_blkno
+ * points to a newly allocated extent. Because we know each of our
+ * clusters contains more than bucket, we can easily split one cluster
+ * at a bucket boundary. So we take the last cluster of the existing
+ * extent and split it down the middle. We move the last half of the
+ * buckets in the last cluster of the existing extent over to the new
+ * extent.
+ *
+ * first_bh is the buffer at prev_blkno so we can update the existing
+ * extent's bucket count. header_bh is the bucket were we were hoping
+ * to insert our xattr. If the bucket move places the target in the new
+ * extent, we'll update first_bh and header_bh after modifying the old
+ * extent.
*
- * first_bh is the first buffer_head of a series of bucket in the same
- * extent rec and header_bh is the header of one bucket in this cluster.
- * They will be updated if we move the data header_bh contains to the new
- * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
+ * first_hash will be set as the 1st xe's name_hash in the new extent.
*/
static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
handle_t *handle,
- struct buffer_head **first_bh,
- struct buffer_head **header_bh,
+ struct ocfs2_xattr_bucket *first,
+ struct ocfs2_xattr_bucket *target,
u64 new_blkno,
- u64 prev_blkno,
u32 num_clusters,
u32 *first_hash)
{
- int i, ret, credits;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
- int blocksize = inode->i_sb->s_blocksize;
- struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
- struct ocfs2_xattr_header *new_xh;
- struct ocfs2_xattr_header *xh =
- (struct ocfs2_xattr_header *)((*first_bh)->b_data);
-
- BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
- BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
-
- prev_bh = *first_bh;
- get_bh(prev_bh);
- xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
+ int ret;
+ struct super_block *sb = inode->i_sb;
+ int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
+ int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
+ int to_move = num_buckets / 2;
+ u64 src_blkno;
+ u64 last_cluster_blkno = bucket_blkno(first) +
+ ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
- prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
+ BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
+ BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
mlog(0, "move half of xattrs in cluster %llu to %llu\n",
- (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
+ (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
- /*
- * We need to update the 1st half of the new cluster and
- * 1 more for the update of the 1st bucket of the previous
- * extent record.
- */
- credits = bpc / 2 + 1;
- ret = ocfs2_extend_trans(handle, credits);
+ ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
+ last_cluster_blkno, new_blkno,
+ to_move, first_hash);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, prev_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ /* This is the first bucket that got moved */
+ src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
- for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
- old_bh = new_bh = NULL;
- new_bh = sb_getblk(inode->i_sb, new_blkno);
- if (!new_bh) {
- ret = -EIO;
- mlog_errno(ret);
- goto out;
- }
+ /*
+ * If the target bucket was part of the moved buckets, we need to
+ * update first and target.
+ */
+ if (bucket_blkno(target) >= src_blkno) {
+ /* Find the block for the new target bucket */
+ src_blkno = new_blkno +
+ (bucket_blkno(target) - src_blkno);
- ocfs2_set_new_buffer_uptodate(inode, new_bh);
+ ocfs2_xattr_bucket_relse(first);
+ ocfs2_xattr_bucket_relse(target);
- ret = ocfs2_journal_access(handle, inode, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
- if (ret < 0) {
+ /*
+ * These shouldn't fail - the buffers are in the
+ * journal from ocfs2_cp_xattr_bucket().
+ */
+ ret = ocfs2_read_xattr_bucket(first, new_blkno);
+ if (ret) {
mlog_errno(ret);
- brelse(new_bh);
goto out;
}
-
- ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
- if (ret < 0) {
+ ret = ocfs2_read_xattr_bucket(target, src_blkno);
+ if (ret)
mlog_errno(ret);
- brelse(new_bh);
- goto out;
- }
- memcpy(new_bh->b_data, old_bh->b_data, blocksize);
-
- if (i == 0) {
- new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
- new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
-
- if (first_hash)
- *first_hash = le32_to_cpu(
- new_xh->xh_entries[0].xe_name_hash);
- new_first_bh = new_bh;
- get_bh(new_first_bh);
- }
-
- ocfs2_journal_dirty(handle, new_bh);
-
- if (*header_bh == old_bh) {
- brelse(*header_bh);
- *header_bh = new_bh;
- get_bh(*header_bh);
-
- brelse(*first_bh);
- *first_bh = new_first_bh;
- get_bh(*first_bh);
- }
- brelse(new_bh);
- brelse(old_bh);
}
- le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
-
- ocfs2_journal_dirty(handle, prev_bh);
out:
- brelse(prev_bh);
- brelse(new_first_bh);
- return ret;
-}
-
-static int ocfs2_read_xattr_bucket(struct inode *inode,
- u64 blkno,
- struct buffer_head **bhs,
- int new)
-{
- int ret = 0;
- u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
- if (!new)
- return ocfs2_read_blocks(inode, blkno,
- blk_per_bucket, bhs, 0);
-
- for (i = 0; i < blk_per_bucket; i++) {
- bhs[i] = sb_getblk(inode->i_sb, blkno + i);
- if (bhs[i] == NULL) {
- ret = -EIO;
- mlog_errno(ret);
- break;
- }
- ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
- }
-
return ret;
}
@@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
{
int ret, i;
int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
- u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- struct buffer_head **s_bhs, **t_bhs = NULL;
+ struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
struct ocfs2_xattr_header *xh;
struct ocfs2_xattr_entry *xe;
int blocksize = inode->i_sb->s_blocksize;
@@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
mlog(0, "move some of xattrs from bucket %llu to %llu\n",
(unsigned long long)blk, (unsigned long long)new_blk);
- s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
- if (!s_bhs)
- return -ENOMEM;
-
- ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
- if (ret) {
+ s_bucket = ocfs2_xattr_bucket_new(inode);
+ t_bucket = ocfs2_xattr_bucket_new(inode);
+ if (!s_bucket || !t_bucket) {
+ ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, s_bhs[0],
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_read_xattr_bucket(s_bucket, blk);
if (ret) {
mlog_errno(ret);
goto out;
}
- t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
- if (!t_bhs) {
- ret = -ENOMEM;
+ ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
goto out;
}
- ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
+ /*
+ * Even if !new_bucket_head, we're overwriting t_bucket. Thus,
+ * there's no need to read it.
+ */
+ ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
if (ret) {
mlog_errno(ret);
goto out;
}
- for (i = 0; i < blk_per_bucket; i++) {
- ret = ocfs2_journal_access(handle, inode, t_bhs[i],
- new_bucket_head ?
- OCFS2_JOURNAL_ACCESS_CREATE :
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ /*
+ * Hey, if we're overwriting t_bucket, what difference does
+ * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the
+ * same part of ocfs2_cp_xattr_bucket().
+ */
+ ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+ new_bucket_head ?
+ OCFS2_JOURNAL_ACCESS_CREATE :
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
- xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+ xh = bucket_xh(s_bucket);
count = le16_to_cpu(xh->xh_count);
start = ocfs2_xattr_find_divide_pos(xh);
@@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
* The hash value is set as one larger than
* that of the last entry in the previous bucket.
*/
- for (i = 0; i < blk_per_bucket; i++)
- memset(t_bhs[i]->b_data, 0, blocksize);
+ for (i = 0; i < t_bucket->bu_blocks; i++)
+ memset(bucket_block(t_bucket, i), 0, blocksize);
- xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+ xh = bucket_xh(t_bucket);
xh->xh_free_start = cpu_to_le16(blocksize);
xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
@@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
}
/* copy the whole bucket to the new first. */
- for (i = 0; i < blk_per_bucket; i++)
- memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+ ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
/* update the new bucket. */
- xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+ xh = bucket_xh(t_bucket);
/*
* Calculate the total name/value len and xh_free_start for
@@ -3319,11 +3902,7 @@ set_num_buckets:
else
xh->xh_num_buckets = 0;
- for (i = 0; i < blk_per_bucket; i++) {
- ocfs2_journal_dirty(handle, t_bhs[i]);
- if (ret)
- mlog_errno(ret);
- }
+ ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
/* store the first_hash of the new bucket. */
if (first_hash)
@@ -3337,29 +3916,18 @@ set_num_buckets:
if (start == count)
goto out;
- xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+ xh = bucket_xh(s_bucket);
memset(&xh->xh_entries[start], 0,
sizeof(struct ocfs2_xattr_entry) * (count - start));
xh->xh_count = cpu_to_le16(start);
xh->xh_free_start = cpu_to_le16(name_offset);
xh->xh_name_value_len = cpu_to_le16(name_value_len);
- ocfs2_journal_dirty(handle, s_bhs[0]);
- if (ret)
- mlog_errno(ret);
+ ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
out:
- if (s_bhs) {
- for (i = 0; i < blk_per_bucket; i++)
- brelse(s_bhs[i]);
- }
- kfree(s_bhs);
-
- if (t_bhs) {
- for (i = 0; i < blk_per_bucket; i++)
- brelse(t_bhs[i]);
- }
- kfree(t_bhs);
+ ocfs2_xattr_bucket_free(s_bucket);
+ ocfs2_xattr_bucket_free(t_bucket);
return ret;
}
@@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
u64 t_blkno,
int t_is_new)
{
- int ret, i;
- int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- int blocksize = inode->i_sb->s_blocksize;
- struct buffer_head **s_bhs, **t_bhs = NULL;
+ int ret;
+ struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
BUG_ON(s_blkno == t_blkno);
@@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
(unsigned long long)s_blkno, (unsigned long long)t_blkno,
t_is_new);
- s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
- GFP_NOFS);
- if (!s_bhs)
- return -ENOMEM;
+ s_bucket = ocfs2_xattr_bucket_new(inode);
+ t_bucket = ocfs2_xattr_bucket_new(inode);
+ if (!s_bucket || !t_bucket) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
- ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+ ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
if (ret)
goto out;
- t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
- GFP_NOFS);
- if (!t_bhs) {
- ret = -ENOMEM;
+ /*
+ * Even if !t_is_new, we're overwriting t_bucket. Thus,
+ * there's no need to read it.
+ */
+ ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
+ if (ret)
goto out;
- }
- ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+ /*
+ * Hey, if we're overwriting t_bucket, what difference does
+ * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new
+ * cluster to fill, we came here from
+ * ocfs2_mv_xattr_buckets(), and it is really new -
+ * ACCESS_CREATE is required. But we also might have moved data
+ * out of t_bucket before extending back into it.
+ * ocfs2_add_new_xattr_bucket() can do this - its call to
+ * ocfs2_add_new_xattr_cluster() may have created a new extent
+ * and copied out the end of the old extent. Then it re-extends
+ * the old extent back to create space for new xattrs. That's
+ * how we get here, and the bucket isn't really new.
+ */
+ ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+ t_is_new ?
+ OCFS2_JOURNAL_ACCESS_CREATE :
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret)
goto out;
- for (i = 0; i < blk_per_bucket; i++) {
- ret = ocfs2_journal_access(handle, inode, t_bhs[i],
- t_is_new ?
- OCFS2_JOURNAL_ACCESS_CREATE :
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret)
- goto out;
- }
-
- for (i = 0; i < blk_per_bucket; i++) {
- memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
- ocfs2_journal_dirty(handle, t_bhs[i]);
- }
+ ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
+ ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
out:
- if (s_bhs) {
- for (i = 0; i < blk_per_bucket; i++)
- brelse(s_bhs[i]);
- }
- kfree(s_bhs);
-
- if (t_bhs) {
- for (i = 0; i < blk_per_bucket; i++)
- brelse(t_bhs[i]);
- }
- kfree(t_bhs);
+ ocfs2_xattr_bucket_free(t_bucket);
+ ocfs2_xattr_bucket_free(s_bucket);
return ret;
}
/*
- * Copy one xattr cluster from src_blk to to_blk.
- * The to_blk will become the first bucket header of the cluster, so its
- * xh_num_buckets will be initialized as the bucket num in the cluster.
+ * src_blk points to the start of an existing extent. last_blk points to
+ * last cluster in that extent. to_blk points to a newly allocated
+ * extent. We copy the buckets from the cluster at last_blk to the new
+ * extent. If start_bucket is non-zero, we skip that many buckets before
+ * we start copying. The new extent's xh_num_buckets gets set to the
+ * number of buckets we copied. The old extent's xh_num_buckets shrinks
+ * by the same amount.
*/
-static int ocfs2_cp_xattr_cluster(struct inode *inode,
- handle_t *handle,
- struct buffer_head *first_bh,
- u64 src_blk,
- u64 to_blk,
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+ u64 src_blk, u64 last_blk, u64 to_blk,
+ unsigned int start_bucket,
u32 *first_hash)
{
int i, ret, credits;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
- struct buffer_head *bh = NULL;
- struct ocfs2_xattr_header *xh;
- u64 to_blk_start = to_blk;
+ struct ocfs2_xattr_bucket *old_first, *new_first;
+
+ mlog(0, "mv xattrs from cluster %llu to %llu\n",
+ (unsigned long long)last_blk, (unsigned long long)to_blk);
+
+ BUG_ON(start_bucket >= num_buckets);
+ if (start_bucket) {
+ num_buckets -= start_bucket;
+ last_blk += (start_bucket * blks_per_bucket);
+ }
+
+ /* The first bucket of the original extent */
+ old_first = ocfs2_xattr_bucket_new(inode);
+ /* The first bucket of the new extent */
+ new_first = ocfs2_xattr_bucket_new(inode);
+ if (!old_first || !new_first) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
- mlog(0, "cp xattrs from cluster %llu to %llu\n",
- (unsigned long long)src_blk, (unsigned long long)to_blk);
+ ret = ocfs2_read_xattr_bucket(old_first, src_blk);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
- * We need to update the new cluster and 1 more for the update of
- * the 1st bucket of the previous extent rec.
+ * We need to update the first bucket of the old extent and all
+ * the buckets going to the new extent.
*/
- credits = bpc + 1;
+ credits = ((num_buckets + 1) * blks_per_bucket) +
+ handle->h_buffer_credits;
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, first_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
for (i = 0; i < num_buckets; i++) {
ret = ocfs2_cp_xattr_bucket(inode, handle,
- src_blk, to_blk, 1);
+ last_blk + (i * blks_per_bucket),
+ to_blk + (i * blks_per_bucket),
+ 1);
if (ret) {
mlog_errno(ret);
goto out;
}
-
- src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
}
- /* update the old bucket header. */
- xh = (struct ocfs2_xattr_header *)first_bh->b_data;
- le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
-
- ocfs2_journal_dirty(handle, first_bh);
-
- /* update the new bucket header. */
- ret = ocfs2_read_block(inode, to_blk_start, &bh);
- if (ret < 0) {
+ /*
+ * Get the new bucket ready before we dirty anything
+ * (This actually shouldn't fail, because we already dirtied
+ * it once in ocfs2_cp_xattr_bucket()).
+ */
+ ret = ocfs2_read_xattr_bucket(new_first, to_blk);
+ if (ret) {
mlog_errno(ret);
goto out;
}
-
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
- xh = (struct ocfs2_xattr_header *)bh->b_data;
- xh->xh_num_buckets = cpu_to_le16(num_buckets);
+ /* Now update the headers */
+ le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
+ ocfs2_xattr_bucket_journal_dirty(handle, old_first);
- ocfs2_journal_dirty(handle, bh);
+ bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
+ ocfs2_xattr_bucket_journal_dirty(handle, new_first);
if (first_hash)
- *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+ *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
+
out:
- brelse(bh);
+ ocfs2_xattr_bucket_free(new_first);
+ ocfs2_xattr_bucket_free(old_first);
return ret;
}
@@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
u32 *first_hash)
{
u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- int ret, credits = 2 * blk_per_bucket;
+ int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
@@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
*/
static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
handle_t *handle,
- struct buffer_head **first_bh,
- struct buffer_head **header_bh,
+ struct ocfs2_xattr_bucket *first,
+ struct ocfs2_xattr_bucket *target,
u64 new_blk,
- u64 prev_blk,
u32 prev_clusters,
u32 *v_start,
int *extend)
{
- int ret = 0;
- int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ int ret;
mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
- (unsigned long long)prev_blk, prev_clusters,
+ (unsigned long long)bucket_blkno(first), prev_clusters,
(unsigned long long)new_blk);
- if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+ if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
handle,
- first_bh,
- header_bh,
+ first, target,
new_blk,
- prev_blk,
prev_clusters,
v_start);
- else {
- u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
-
- if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
- ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
- last_blk, new_blk,
+ if (ret)
+ mlog_errno(ret);
+ } else {
+ /* The start of the last cluster in the first extent */
+ u64 last_blk = bucket_blkno(first) +
+ ((prev_clusters - 1) *
+ ocfs2_clusters_to_blocks(inode->i_sb, 1));
+
+ if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
+ ret = ocfs2_mv_xattr_buckets(inode, handle,
+ bucket_blkno(first),
+ last_blk, new_blk, 0,
v_start);
- else {
+ if (ret)
+ mlog_errno(ret);
+ } else {
ret = ocfs2_divide_xattr_cluster(inode, handle,
last_blk, new_blk,
v_start);
+ if (ret)
+ mlog_errno(ret);
- if ((*header_bh)->b_blocknr == last_blk && extend)
+ if ((bucket_blkno(target) == last_blk) && extend)
*extend = 0;
}
}
@@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
*/
static int ocfs2_add_new_xattr_cluster(struct inode *inode,
struct buffer_head *root_bh,
- struct buffer_head **first_bh,
- struct buffer_head **header_bh,
+ struct ocfs2_xattr_bucket *first,
+ struct ocfs2_xattr_bucket *target,
u32 *num_clusters,
u32 prev_cpos,
- u64 prev_blkno,
- int *extend)
+ int *extend,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
- int ret, credits;
+ int ret;
u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
u32 prev_clusters = *num_clusters;
u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
u64 block;
- handle_t *handle = NULL;
- struct ocfs2_alloc_context *data_ac = NULL;
- struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = ctxt->handle;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_tree et;
mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
"previous xattr blkno = %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
- prev_cpos, (unsigned long long)prev_blkno);
+ prev_cpos, (unsigned long long)bucket_blkno(first));
ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
- ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
- &data_ac, &meta_ac);
- if (ret) {
- mlog_errno(ret);
- goto leave;
- }
-
- credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
- clusters_to_add);
- handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
- mlog_errno(ret);
- goto leave;
- }
-
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto leave;
}
- ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+ ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
clusters_to_add, &bit_off, &num_bits);
if (ret < 0) {
if (ret != -ENOSPC)
@@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
- if (prev_blkno + prev_clusters * bpc == block &&
+ if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
(prev_clusters + num_bits) << osb->s_clustersize_bits <=
OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
/*
@@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
} else {
ret = ocfs2_adjust_xattr_cross_cluster(inode,
handle,
- first_bh,
- header_bh,
+ first,
+ target,
block,
- prev_blkno,
prev_clusters,
&v_start,
extend);
@@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
}
}
- if (handle->h_buffer_credits < credits) {
- /*
- * The journal has been restarted before, and don't
- * have enough space for the insertion, so extend it
- * here.
- */
- ret = ocfs2_extend_trans(handle, credits);
- if (ret) {
- mlog_errno(ret);
- goto leave;
- }
- }
mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
num_bits, (unsigned long long)block, v_start);
ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
- num_bits, 0, meta_ac);
+ num_bits, 0, ctxt->meta_ac);
if (ret < 0) {
mlog_errno(ret);
goto leave;
}
ret = ocfs2_journal_dirty(handle, root_bh);
- if (ret < 0) {
+ if (ret < 0)
mlog_errno(ret);
- goto leave;
- }
leave:
- if (handle)
- ocfs2_commit_trans(osb, handle);
- if (data_ac)
- ocfs2_free_alloc_context(data_ac);
- if (meta_ac)
- ocfs2_free_alloc_context(meta_ac);
-
return ret;
}
/*
- * Extend a new xattr bucket and move xattrs to the end one by one until
- * We meet with start_bh. Only move half of the xattrs to the bucket after it.
+ * We are given an extent. 'first' is the bucket at the very front of
+ * the extent. The extent has space for an additional bucket past
+ * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number
+ * of the target bucket. We wish to shift every bucket past the target
+ * down one, filling in that additional space. When we get back to the
+ * target, we split the target between itself and the now-empty bucket
+ * at target+1 (aka, target_blkno + blks_per_bucket).
*/
static int ocfs2_extend_xattr_bucket(struct inode *inode,
- struct buffer_head *first_bh,
- struct buffer_head *start_bh,
+ handle_t *handle,
+ struct ocfs2_xattr_bucket *first,
+ u64 target_blk,
u32 num_clusters)
{
int ret, credits;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- u64 start_blk = start_bh->b_blocknr, end_blk;
- u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
- handle_t *handle;
- struct ocfs2_xattr_header *first_xh =
- (struct ocfs2_xattr_header *)first_bh->b_data;
- u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
+ u64 end_blk;
+ u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
- "from %llu, len = %u\n", (unsigned long long)start_blk,
- (unsigned long long)first_bh->b_blocknr, num_clusters);
+ "from %llu, len = %u\n", (unsigned long long)target_blk,
+ (unsigned long long)bucket_blkno(first), num_clusters);
- BUG_ON(bucket >= num_buckets);
+ /* The extent must have room for an additional bucket */
+ BUG_ON(new_bucket >=
+ (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
- end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+ /* end_blk points to the last existing bucket */
+ end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
/*
- * We will touch all the buckets after the start_bh(include it).
- * Add one more bucket and modify the first_bh.
+ * end_blk is the start of the last existing bucket.
+ * Thus, (end_blk - target_blk) covers the target bucket and
+ * every bucket after it up to, but not including, the last
+ * existing bucket. Then we add the last existing bucket, the
+ * new bucket, and the first bucket (3 * blk_per_bucket).
*/
- credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
- handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
+ handle->h_buffer_credits;
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, first_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_xattr_bucket_journal_access(handle, first,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto commit;
+ goto out;
}
- while (end_blk != start_blk) {
+ while (end_blk != target_blk) {
ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
end_blk + blk_per_bucket, 0);
if (ret)
- goto commit;
+ goto out;
end_blk -= blk_per_bucket;
}
- /* Move half of the xattr in start_blk to the next bucket. */
- ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk,
- start_blk + blk_per_bucket, NULL, 0);
+ /* Move half of the xattr in target_blkno to the next bucket. */
+ ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
+ target_blk + blk_per_bucket, NULL, 0);
- le16_add_cpu(&first_xh->xh_num_buckets, 1);
- ocfs2_journal_dirty(handle, first_bh);
+ le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
+ ocfs2_xattr_bucket_journal_dirty(handle, first);
-commit:
- ocfs2_commit_trans(osb, handle);
out:
return ret;
}
/*
- * Add new xattr bucket in an extent record and adjust the buckets accordingly.
- * xb_bh is the ocfs2_xattr_block.
- * We will move all the buckets starting from header_bh to the next place. As
- * for this one, half num of its xattrs will be moved to the next one.
+ * Add new xattr bucket in an extent record and adjust the buckets
+ * accordingly. xb_bh is the ocfs2_xattr_block, and target is the
+ * bucket we want to insert into.
+ *
+ * In the easy case, we will move all the buckets after target down by
+ * one. Half of target's xattrs will be moved to the next bucket.
*
- * We will allocate a new cluster if current cluster is full and adjust
- * header_bh and first_bh if the insert place is moved to the new cluster.
+ * If current cluster is full, we'll allocate a new one. This may not
+ * be contiguous. The underlying calls will make sure that there is
+ * space for the insert, shifting buckets around if necessary.
+ * 'target' may be moved by those calls.
*/
static int ocfs2_add_new_xattr_bucket(struct inode *inode,
struct buffer_head *xb_bh,
- struct buffer_head *header_bh)
+ struct ocfs2_xattr_bucket *target,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
- struct ocfs2_xattr_header *first_xh = NULL;
- struct buffer_head *first_bh = NULL;
struct ocfs2_xattr_block *xb =
(struct ocfs2_xattr_block *)xb_bh->b_data;
struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
struct ocfs2_extent_list *el = &xb_root->xt_list;
- struct ocfs2_xattr_header *xh =
- (struct ocfs2_xattr_header *)header_bh->b_data;
- u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
- struct super_block *sb = inode->i_sb;
- struct ocfs2_super *osb = OCFS2_SB(sb);
+ u32 name_hash =
+ le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int ret, num_buckets, extend = 1;
u64 p_blkno;
u32 e_cpos, num_clusters;
+ /* The bucket at the front of the extent */
+ struct ocfs2_xattr_bucket *first;
- mlog(0, "Add new xattr bucket starting form %llu\n",
- (unsigned long long)header_bh->b_blocknr);
+ mlog(0, "Add new xattr bucket starting from %llu\n",
+ (unsigned long long)bucket_blkno(target));
- /*
- * Add refrence for header_bh here because it may be
- * changed in ocfs2_add_new_xattr_cluster and we need
- * to free it in the end.
- */
- get_bh(header_bh);
+ /* The first bucket of the original extent */
+ first = ocfs2_xattr_bucket_new(inode);
+ if (!first) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
&num_clusters, el);
@@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
goto out;
}
- ret = ocfs2_read_block(inode, p_blkno, &first_bh);
+ ret = ocfs2_read_xattr_bucket(first, p_blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
- first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
-
- if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
+ if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
+ /*
+ * This can move first+target if the target bucket moves
+ * to the new extent.
+ */
ret = ocfs2_add_new_xattr_cluster(inode,
xb_bh,
- &first_bh,
- &header_bh,
+ first,
+ target,
&num_clusters,
e_cpos,
- p_blkno,
- &extend);
+ &extend,
+ ctxt);
if (ret) {
mlog_errno(ret);
goto out;
}
}
- if (extend)
+ if (extend) {
ret = ocfs2_extend_xattr_bucket(inode,
- first_bh,
- header_bh,
+ ctxt->handle,
+ first,
+ bucket_blkno(target),
num_clusters);
- if (ret)
- mlog_errno(ret);
+ if (ret)
+ mlog_errno(ret);
+ }
+
out:
- brelse(first_bh);
- brelse(header_bh);
+ ocfs2_xattr_bucket_free(first);
+
return ret;
}
@@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
int block_off = offs >> inode->i_sb->s_blocksize_bits;
offs = offs % inode->i_sb->s_blocksize;
- return bucket->bhs[block_off]->b_data + offs;
+ return bucket_block(bucket, block_off) + offs;
}
/*
@@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
xe->xe_value_size = 0;
val = ocfs2_xattr_bucket_get_val(inode,
- &xs->bucket, offs);
+ xs->bucket, offs);
memset(val + OCFS2_XATTR_SIZE(name_len), 0,
size - OCFS2_XATTR_SIZE(name_len));
if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
@@ -4062,8 +4630,7 @@ set_new_name_value:
xh->xh_free_start = cpu_to_le16(offs);
}
- val = ocfs2_xattr_bucket_get_val(inode,
- &xs->bucket, offs - size);
+ val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
xe->xe_name_offset = cpu_to_le16(offs - size);
memset(val, 0, size);
@@ -4079,125 +4646,45 @@ set_new_name_value:
return;
}
-static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
- handle_t *handle,
- struct ocfs2_xattr_search *xs,
- struct buffer_head **bhs,
- u16 bh_num)
-{
- int ret = 0, off, block_off;
- struct ocfs2_xattr_entry *xe = xs->here;
-
- /*
- * First calculate all the blocks we should journal_access
- * and journal_dirty. The first block should always be touched.
- */
- ret = ocfs2_journal_dirty(handle, bhs[0]);
- if (ret)
- mlog_errno(ret);
-
- /* calc the data. */
- off = le16_to_cpu(xe->xe_name_offset);
- block_off = off >> inode->i_sb->s_blocksize_bits;
- ret = ocfs2_journal_dirty(handle, bhs[block_off]);
- if (ret)
- mlog_errno(ret);
-
- return ret;
-}
-
/*
* Set the xattr entry in the specified bucket.
* The bucket is indicated by xs->bucket and it should have the enough
* space for the xattr insertion.
*/
static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_info *xi,
struct ocfs2_xattr_search *xs,
u32 name_hash,
int local)
{
- int i, ret;
- handle_t *handle = NULL;
- u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int ret;
+ u64 blkno;
mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
(unsigned long)xi->value_len, xi->name_index,
- (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
+ (unsigned long long)bucket_blkno(xs->bucket));
- if (!xs->bucket.bhs[1]) {
- ret = ocfs2_read_blocks(inode,
- xs->bucket.bhs[0]->b_blocknr + 1,
- blk_per_bucket - 1, &xs->bucket.bhs[1],
- 0);
+ if (!xs->bucket->bu_bhs[1]) {
+ blkno = bucket_blkno(xs->bucket);
+ ocfs2_xattr_bucket_relse(xs->bucket);
+ ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
}
- handle = ocfs2_start_trans(osb, blk_per_bucket);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
mlog_errno(ret);
goto out;
}
- for (i = 0; i < blk_per_bucket; i++) {
- ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
+ ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
- /*Only dirty the blocks we have touched in set xattr. */
- ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
- xs->bucket.bhs, blk_per_bucket);
- if (ret)
- mlog_errno(ret);
-out:
- ocfs2_commit_trans(osb, handle);
-
- return ret;
-}
-
-static int ocfs2_xattr_value_update_size(struct inode *inode,
- struct buffer_head *xe_bh,
- struct ocfs2_xattr_entry *xe,
- u64 new_size)
-{
- int ret;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- handle_t *handle = NULL;
-
- handle = ocfs2_start_trans(osb, 1);
- if (IS_ERR(handle)) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_journal_access(handle, inode, xe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- xe->xe_value_size = cpu_to_le64(new_size);
-
- ret = ocfs2_journal_dirty(handle, xe_bh);
- if (ret < 0)
- mlog_errno(ret);
-
-out_commit:
- ocfs2_commit_trans(osb, handle);
out:
return ret;
}
@@ -4210,18 +4697,19 @@ out:
* Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
*/
static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
- struct buffer_head *header_bh,
+ struct ocfs2_xattr_bucket *bucket,
int xe_off,
- int len)
+ int len,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret, offset;
u64 value_blk;
- struct buffer_head *value_bh = NULL;
- struct ocfs2_xattr_value_root *xv;
struct ocfs2_xattr_entry *xe;
- struct ocfs2_xattr_header *xh =
- (struct ocfs2_xattr_header *)header_bh->b_data;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
size_t blocksize = inode->i_sb->s_blocksize;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_access = ocfs2_journal_access,
+ };
xe = &xh->xh_entries[xe_off];
@@ -4234,49 +4722,58 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
/* We don't allow ocfs2_xattr_value to be stored in different block. */
BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
- value_blk += header_bh->b_blocknr;
- ret = ocfs2_read_block(inode, value_blk, &value_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ vb.vb_bh = bucket->bu_bhs[value_blk];
+ BUG_ON(!vb.vb_bh);
- xv = (struct ocfs2_xattr_value_root *)
- (value_bh->b_data + offset % blocksize);
+ vb.vb_xv = (struct ocfs2_xattr_value_root *)
+ (vb.vb_bh->b_data + offset % blocksize);
- mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
- xe_off, (unsigned long long)header_bh->b_blocknr, len);
- ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+ ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
+ /*
+ * From here on out we have to dirty the bucket. The generic
+ * value calls only modify one of the bucket's bhs, but we need
+ * to send the bucket at once. So if they error, they *could* have
+ * modified something. We have to assume they did, and dirty
+ * the whole bucket. This leaves us in a consistent state.
+ */
+ mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
+ xe_off, (unsigned long long)bucket_blkno(bucket), len);
+ ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_dirty;
}
+ xe->xe_value_size = cpu_to_le64(len);
+
+out_dirty:
+ ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
+
out:
- brelse(value_bh);
return ret;
}
static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
- struct ocfs2_xattr_search *xs,
- int len)
+ struct ocfs2_xattr_search *xs,
+ int len,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret, offset;
struct ocfs2_xattr_entry *xe = xs->here;
struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
- BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
+ BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
offset = xe - xh->xh_entries;
- ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
- offset, len);
+ ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
+ offset, len, ctxt);
if (ret)
mlog_errno(ret);
@@ -4284,6 +4781,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
}
static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_search *xs,
char *val,
int value_len)
@@ -4299,7 +4797,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
- return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
+ return __ocfs2_xattr_set_value_outside(inode, handle,
+ xv, val, value_len);
}
static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -4343,15 +4842,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
}
}
- handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+ handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
if (IS_ERR(handle)) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -4392,26 +4891,19 @@ out:
}
static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_search *xs)
{
- handle_t *handle = NULL;
- struct ocfs2_xattr_header *xh = xs->bucket.xh;
+ struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
struct ocfs2_xattr_entry *last = &xh->xh_entries[
le16_to_cpu(xh->xh_count) - 1];
int ret = 0;
- handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- return;
- }
-
- ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ return;
}
/* Remove the old entry. */
@@ -4420,11 +4912,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
memset(last, 0, sizeof(struct ocfs2_xattr_entry));
le16_add_cpu(&xh->xh_count, -1);
- ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
- if (ret < 0)
- mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+ ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
}
/*
@@ -4440,7 +4928,8 @@ out_commit:
*/
static int ocfs2_xattr_set_in_bucket(struct inode *inode,
struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs)
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret, local = 1;
size_t value_len;
@@ -4468,7 +4957,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
value_len = 0;
ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
- value_len);
+ value_len,
+ ctxt);
if (ret)
goto out;
@@ -4488,7 +4978,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
xi->value_len = OCFS2_XATTR_ROOT_SIZE;
}
- ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
+ ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
+ name_hash, local);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4499,7 +4990,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
/* allocate the space now for the outside block storage. */
ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
- value_len);
+ value_len, ctxt);
if (ret) {
mlog_errno(ret);
@@ -4509,13 +5000,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
* storage and we have allocated xattr already,
* so need to remove it.
*/
- ocfs2_xattr_bucket_remove_xs(inode, xs);
+ ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
}
goto out;
}
set_value_outside:
- ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+ ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
+ xs, val, value_len);
out:
return ret;
}
@@ -4530,7 +5022,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
struct ocfs2_xattr_bucket *bucket,
const char *name)
{
- struct ocfs2_xattr_header *xh = bucket->xh;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
@@ -4540,7 +5032,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
xh->xh_entries[0].xe_name_hash) {
mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
"hash = %u\n",
- (unsigned long long)bucket->bhs[0]->b_blocknr,
+ (unsigned long long)bucket_blkno(bucket),
le32_to_cpu(xh->xh_entries[0].xe_name_hash));
return -ENOSPC;
}
@@ -4550,16 +5042,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs)
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
struct ocfs2_xattr_header *xh;
struct ocfs2_xattr_entry *xe;
u16 count, header_size, xh_free_start;
- int i, free, max_free, need, old;
+ int free, max_free, need, old;
size_t value_size = 0, name_len = strlen(xi->name);
size_t blocksize = inode->i_sb->s_blocksize;
int ret, allocation = 0;
- u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
mlog_entry("Set xattr %s in xattr index block\n", xi->name);
@@ -4574,7 +5066,7 @@ try_again:
mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
"of %u which exceed block size\n",
- (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+ (unsigned long long)bucket_blkno(xs->bucket),
header_size);
if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4614,11 +5106,13 @@ try_again:
mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
"need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
" %u\n", xs->not_found,
- (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+ (unsigned long long)bucket_blkno(xs->bucket),
free, need, max_free, le16_to_cpu(xh->xh_free_start),
le16_to_cpu(xh->xh_name_value_len));
- if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+ if (free < need ||
+ (xs->not_found &&
+ count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
if (need <= max_free &&
count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
/*
@@ -4626,7 +5120,8 @@ try_again:
* name/value will be moved, the xe shouldn't be changed
* in xs.
*/
- ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
+ ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
+ xs->bucket);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4658,7 +5153,7 @@ try_again:
* add a new bucket for the insert.
*/
ret = ocfs2_check_xattr_bucket_collision(inode,
- &xs->bucket,
+ xs->bucket,
xi->name);
if (ret) {
mlog_errno(ret);
@@ -4667,17 +5162,21 @@ try_again:
ret = ocfs2_add_new_xattr_bucket(inode,
xs->xattr_bh,
- xs->bucket.bhs[0]);
+ xs->bucket,
+ ctxt);
if (ret) {
mlog_errno(ret);
goto out;
}
- for (i = 0; i < blk_per_bucket; i++)
- brelse(xs->bucket.bhs[i]);
-
- memset(&xs->bucket, 0, sizeof(xs->bucket));
-
+ /*
+ * ocfs2_add_new_xattr_bucket() will have updated
+ * xs->bucket if it moved, but it will not have updated
+ * any of the other search fields. Thus, we drop it and
+ * re-search. Everything should be cached, so it'll be
+ * quick.
+ */
+ ocfs2_xattr_bucket_relse(xs->bucket);
ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
xi->name_index,
xi->name, xs);
@@ -4689,7 +5188,7 @@ try_again:
}
xattr_set:
- ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+ ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
out:
mlog_exit(ret);
return ret;
@@ -4700,24 +5199,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
void *para)
{
int ret = 0;
- struct ocfs2_xattr_header *xh = bucket->xh;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
u16 i;
struct ocfs2_xattr_entry *xe;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
+ int credits = ocfs2_remove_extent_credits(osb->sb) +
+ ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+
+ ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
xe = &xh->xh_entries[i];
if (ocfs2_xattr_is_local(xe))
continue;
- ret = ocfs2_xattr_bucket_value_truncate(inode,
- bucket->bhs[0],
- i, 0);
+ ctxt.handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
+ i, 0, &ctxt);
+
+ ocfs2_commit_trans(osb, ctxt.handle);
if (ret) {
mlog_errno(ret);
break;
}
}
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &ctxt.dealloc);
return ret;
}
@@ -4768,6 +5284,74 @@ out:
}
/*
+ * 'security' attributes support
+ */
+static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
+ size_t list_size, const char *name,
+ size_t name_len)
+{
+ const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ const size_t total_len = prefix_len + name_len + 1;
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+ memcpy(list + prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
+ buffer, size);
+}
+
+static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
+ size, flags);
+}
+
+int ocfs2_init_security_get(struct inode *inode,
+ struct inode *dir,
+ struct ocfs2_security_xattr_info *si)
+{
+ /* check whether ocfs2 support feature xattr */
+ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
+ return -EOPNOTSUPP;
+ return security_inode_init_security(inode, dir, &si->name, &si->value,
+ &si->value_len);
+}
+
+int ocfs2_init_security_set(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ struct ocfs2_security_xattr_info *si,
+ struct ocfs2_alloc_context *xattr_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ return ocfs2_xattr_set_handle(handle, inode, di_bh,
+ OCFS2_XATTR_INDEX_SECURITY,
+ si->name, si->value, si->value_len, 0,
+ xattr_ac, data_ac);
+}
+
+struct xattr_handler ocfs2_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = ocfs2_xattr_security_list,
+ .get = ocfs2_xattr_security_get,
+ .set = ocfs2_xattr_security_set,
+};
+
+/*
* 'trusted' attributes support
*/
static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1d8314c7656..5a1ebc789f7 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,13 +30,58 @@ enum ocfs2_xattr_type {
OCFS2_XATTR_MAX
};
+struct ocfs2_security_xattr_info {
+ int enable;
+ char *name;
+ void *value;
+ size_t value_len;
+};
+
extern struct xattr_handler ocfs2_xattr_user_handler;
extern struct xattr_handler ocfs2_xattr_trusted_handler;
+extern struct xattr_handler ocfs2_xattr_security_handler;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+extern struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern struct xattr_handler ocfs2_xattr_acl_default_handler;
+#endif
extern struct xattr_handler *ocfs2_xattr_handlers[];
ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
+ const char *, void *, size_t);
int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
size_t, int);
+int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
+ int, const char *, const void *, size_t, int,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
+int ocfs2_init_security_get(struct inode *, struct inode *,
+ struct ocfs2_security_xattr_info *);
+int ocfs2_init_security_set(handle_t *, struct inode *,
+ struct buffer_head *,
+ struct ocfs2_security_xattr_info *,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
+int ocfs2_calc_security_init(struct inode *,
+ struct ocfs2_security_xattr_info *,
+ int *, int *, struct ocfs2_alloc_context **);
+int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
+ int, struct ocfs2_security_xattr_info *,
+ int *, int *, struct ocfs2_alloc_context **);
+
+/*
+ * xattrs can live inside an inode, as part of an external xattr block,
+ * or inside an xattr bucket, which is the leaf of a tree rooted in an
+ * xattr block. Some of the xattr calls, especially the value setting
+ * functions, want to treat each of these locations as equal. Let's wrap
+ * them in a structure that we can pass around instead of raw buffer_heads.
+ */
+struct ocfs2_xattr_value_buf {
+ struct buffer_head *vb_bh;
+ ocfs2_journal_access_func vb_access;
+ struct ocfs2_xattr_value_root *vb_xv;
+};
+
#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 6afe57c84f8..633e9dc972b 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_blocks = 0;
inode->i_mapping->a_ops = &omfs_aops;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/open.c b/fs/open.c
index 1cd7d40e999..d882fd2351d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -412,7 +412,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
goto out_fput;
- if (inode->i_op && inode->i_op->fallocate)
+ if (inode->i_op->fallocate)
ret = inode->i_op->fallocate(inode, mode, offset, len);
else
ret = -EOPNOTSUPP;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d41bdc784de..ffcd04f0012 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,9 +256,6 @@ found:
break;
}
- inode->i_gid = 0;
- inode->i_uid = 0;
-
d_add(dentry, inode);
return NULL;
}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d5b213b8a9..6d720243f5f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno)
blk_free_devt(part_devt(part));
rcu_assign_pointer(ptbl->part[partno], NULL);
+ rcu_assign_pointer(ptbl->last_lookup, NULL);
kobject_put(part->holder_dir);
device_del(part_to_dev(part));
@@ -384,9 +385,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
dname = dev_name(ddev);
if (isdigit(dname[strlen(dname) - 1]))
- snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
+ dev_set_name(pdev, "%sp%d", dname, partno);
else
- snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
+ dev_set_name(pdev, "%s%d", dname, partno);
device_initialize(pdev);
pdev->class = &block_class;
@@ -447,16 +448,11 @@ void register_disk(struct gendisk *disk)
struct block_device *bdev;
struct disk_part_iter piter;
struct hd_struct *part;
- char *s;
int err;
ddev->parent = disk->driverfs_dev;
- strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
- /* ewww... some of these buggers have / in the name... */
- s = strchr(ddev->bus_id, '/');
- if (s)
- *s = '!';
+ dev_set_name(ddev, disk->disk_name);
/* delay uevents, until we scanned partition table */
ddev->uevent_suppress = 1;
diff --git a/fs/pipe.c b/fs/pipe.c
index aaf797bd57b..891697112f6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1016,10 +1016,7 @@ int do_pipe_flags(int *fd, int flags)
goto err_fdr;
fdw = error;
- error = audit_fd_pair(fdr, fdw);
- if (error < 0)
- goto err_fdw;
-
+ audit_fd_pair(fdr, fdw);
fd_install(fdr, fr);
fd_install(fdw, fw);
fd[0] = fdr;
@@ -1027,8 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
return 0;
- err_fdw:
- put_unused_fd(fdw);
err_fdr:
put_unused_fd(fdr);
err_read_pipe:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cad92c1ac2b..0c9de19a163 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
#include <linux/mount.h>
@@ -109,25 +110,22 @@ struct pid_entry {
.op = OP, \
}
-#define DIR(NAME, MODE, OTYPE) \
- NOD(NAME, (S_IFDIR|(MODE)), \
- &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations, \
- {} )
-#define LNK(NAME, OTYPE) \
+#define DIR(NAME, MODE, iops, fops) \
+ NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
+#define LNK(NAME, get_link) \
NOD(NAME, (S_IFLNK|S_IRWXUGO), \
&proc_pid_link_inode_operations, NULL, \
- { .proc_get_link = &proc_##OTYPE##_link } )
-#define REG(NAME, MODE, OTYPE) \
- NOD(NAME, (S_IFREG|(MODE)), NULL, \
- &proc_##OTYPE##_operations, {})
-#define INF(NAME, MODE, OTYPE) \
+ { .proc_get_link = get_link } )
+#define REG(NAME, MODE, fops) \
+ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
+#define INF(NAME, MODE, read) \
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_info_file_operations, \
- { .proc_read = &proc_##OTYPE } )
-#define ONE(NAME, MODE, OTYPE) \
+ { .proc_read = read } )
+#define ONE(NAME, MODE, show) \
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
- { .proc_show = &proc_##OTYPE } )
+ { .proc_show = show } )
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
@@ -308,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
struct mm_struct *mm = get_task_mm(task);
if (mm) {
unsigned int nwords = 0;
- do
+ do {
nwords += 2;
- while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
+ } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
res = nwords * sizeof(mm->saved_auxv[0]);
if (res > PAGE_SIZE)
res = PAGE_SIZE;
@@ -340,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
}
#endif /* CONFIG_KALLSYMS */
+#ifdef CONFIG_STACKTRACE
+
+#define MAX_STACK_TRACE_DEPTH 64
+
+static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct stack_trace trace;
+ unsigned long *entries;
+ int i;
+
+ entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+
+ trace.nr_entries = 0;
+ trace.max_entries = MAX_STACK_TRACE_DEPTH;
+ trace.entries = entries;
+ trace.skip = 0;
+ save_stack_trace_tsk(task, &trace);
+
+ for (i = 0; i < trace.nr_entries; i++) {
+ seq_printf(m, "[<%p>] %pS\n",
+ (void *)entries[i], (void *)entries[i]);
+ }
+ kfree(entries);
+
+ return 0;
+}
+#endif
+
#ifdef CONFIG_SCHEDSTATS
/*
* Provides /proc/PID/schedstat
@@ -1186,8 +1215,6 @@ static int sched_show(struct seq_file *m, void *v)
struct inode *inode = m->private;
struct task_struct *p;
- WARN_ON(!inode);
-
p = get_proc_task(inode);
if (!p)
return -ESRCH;
@@ -1205,8 +1232,6 @@ sched_write(struct file *file, const char __user *buf,
struct inode *inode = file->f_path.dentry->d_inode;
struct task_struct *p;
- WARN_ON(!inode);
-
p = get_proc_task(inode);
if (!p)
return -ESRCH;
@@ -1426,8 +1451,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
if (!ei->pid)
goto out_unlock;
- inode->i_uid = 0;
- inode->i_gid = 0;
if (task_dumpable(task)) {
rcu_read_lock();
cred = __task_cred(task);
@@ -1976,13 +1999,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
const struct pid_entry *ents,
unsigned int nents)
{
- struct inode *inode;
struct dentry *error;
struct task_struct *task = get_proc_task(dir);
const struct pid_entry *p, *last;
error = ERR_PTR(-ENOENT);
- inode = NULL;
if (!task)
goto out_no_task;
@@ -2138,12 +2159,12 @@ static const struct file_operations proc_pid_attr_operations = {
};
static const struct pid_entry attr_dir_stuff[] = {
- REG("current", S_IRUGO|S_IWUGO, pid_attr),
- REG("prev", S_IRUGO, pid_attr),
- REG("exec", S_IRUGO|S_IWUGO, pid_attr),
- REG("fscreate", S_IRUGO|S_IWUGO, pid_attr),
- REG("keycreate", S_IRUGO|S_IWUGO, pid_attr),
- REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr),
+ REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+ REG("prev", S_IRUGO, proc_pid_attr_operations),
+ REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+ REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+ REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+ REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
};
static int proc_attr_dir_readdir(struct file * filp,
@@ -2349,8 +2370,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
if (!ei->pid)
goto out_iput;
- inode->i_uid = 0;
- inode->i_gid = 0;
inode->i_mode = p->mode;
if (S_ISDIR(inode->i_mode))
inode->i_nlink = 2;
@@ -2465,74 +2484,77 @@ static const struct file_operations proc_task_operations;
static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
- DIR("task", S_IRUGO|S_IXUGO, task),
- DIR("fd", S_IRUSR|S_IXUSR, fd),
- DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo),
+ DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
+ DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+ DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
#ifdef CONFIG_NET
- DIR("net", S_IRUGO|S_IXUGO, net),
+ DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
- REG("environ", S_IRUSR, environ),
- INF("auxv", S_IRUSR, pid_auxv),
- ONE("status", S_IRUGO, pid_status),
- ONE("personality", S_IRUSR, pid_personality),
- INF("limits", S_IRUSR, pid_limits),
+ REG("environ", S_IRUSR, proc_environ_operations),
+ INF("auxv", S_IRUSR, proc_pid_auxv),
+ ONE("status", S_IRUGO, proc_pid_status),
+ ONE("personality", S_IRUSR, proc_pid_personality),
+ INF("limits", S_IRUSR, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
- REG("sched", S_IRUGO|S_IWUSR, pid_sched),
+ REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- INF("syscall", S_IRUSR, pid_syscall),
+ INF("syscall", S_IRUSR, proc_pid_syscall),
#endif
- INF("cmdline", S_IRUGO, pid_cmdline),
- ONE("stat", S_IRUGO, tgid_stat),
- ONE("statm", S_IRUGO, pid_statm),
- REG("maps", S_IRUGO, maps),
+ INF("cmdline", S_IRUGO, proc_pid_cmdline),
+ ONE("stat", S_IRUGO, proc_tgid_stat),
+ ONE("statm", S_IRUGO, proc_pid_statm),
+ REG("maps", S_IRUGO, proc_maps_operations),
#ifdef CONFIG_NUMA
- REG("numa_maps", S_IRUGO, numa_maps),
+ REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
#endif
- REG("mem", S_IRUSR|S_IWUSR, mem),
- LNK("cwd", cwd),
- LNK("root", root),
- LNK("exe", exe),
- REG("mounts", S_IRUGO, mounts),
- REG("mountinfo", S_IRUGO, mountinfo),
- REG("mountstats", S_IRUSR, mountstats),
+ REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
+ LNK("cwd", proc_cwd_link),
+ LNK("root", proc_root_link),
+ LNK("exe", proc_exe_link),
+ REG("mounts", S_IRUGO, proc_mounts_operations),
+ REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
+ REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
- REG("clear_refs", S_IWUSR, clear_refs),
- REG("smaps", S_IRUGO, smaps),
- REG("pagemap", S_IRUSR, pagemap),
+ REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+ REG("smaps", S_IRUGO, proc_smaps_operations),
+ REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
- DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
+ DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
- INF("wchan", S_IRUGO, pid_wchan),
+ INF("wchan", S_IRUGO, proc_pid_wchan),
+#endif
+#ifdef CONFIG_STACKTRACE
+ ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
- INF("schedstat", S_IRUGO, pid_schedstat),
+ INF("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
- REG("latency", S_IRUGO, lstats),
+ REG("latency", S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
- REG("cpuset", S_IRUGO, cpuset),
+ REG("cpuset", S_IRUGO, proc_cpuset_operations),
#endif
#ifdef CONFIG_CGROUPS
- REG("cgroup", S_IRUGO, cgroup),
+ REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
- INF("oom_score", S_IRUGO, oom_score),
- REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust),
+ INF("oom_score", S_IRUGO, proc_oom_score),
+ REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
#ifdef CONFIG_AUDITSYSCALL
- REG("loginuid", S_IWUSR|S_IRUGO, loginuid),
- REG("sessionid", S_IRUGO, sessionid),
+ REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
+ REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
- REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
+ REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
#endif
#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
- REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
+ REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- INF("io", S_IRUGO, tgid_io_accounting),
+ INF("io", S_IRUGO, proc_tgid_io_accounting),
#endif
};
@@ -2805,66 +2827,69 @@ out_no_task:
* Tasks
*/
static const struct pid_entry tid_base_stuff[] = {
- DIR("fd", S_IRUSR|S_IXUSR, fd),
- DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo),
- REG("environ", S_IRUSR, environ),
- INF("auxv", S_IRUSR, pid_auxv),
- ONE("status", S_IRUGO, pid_status),
- ONE("personality", S_IRUSR, pid_personality),
- INF("limits", S_IRUSR, pid_limits),
+ DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+ DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
+ REG("environ", S_IRUSR, proc_environ_operations),
+ INF("auxv", S_IRUSR, proc_pid_auxv),
+ ONE("status", S_IRUGO, proc_pid_status),
+ ONE("personality", S_IRUSR, proc_pid_personality),
+ INF("limits", S_IRUSR, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
- REG("sched", S_IRUGO|S_IWUSR, pid_sched),
+ REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- INF("syscall", S_IRUSR, pid_syscall),
+ INF("syscall", S_IRUSR, proc_pid_syscall),
#endif
- INF("cmdline", S_IRUGO, pid_cmdline),
- ONE("stat", S_IRUGO, tid_stat),
- ONE("statm", S_IRUGO, pid_statm),
- REG("maps", S_IRUGO, maps),
+ INF("cmdline", S_IRUGO, proc_pid_cmdline),
+ ONE("stat", S_IRUGO, proc_tid_stat),
+ ONE("statm", S_IRUGO, proc_pid_statm),
+ REG("maps", S_IRUGO, proc_maps_operations),
#ifdef CONFIG_NUMA
- REG("numa_maps", S_IRUGO, numa_maps),
+ REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
#endif
- REG("mem", S_IRUSR|S_IWUSR, mem),
- LNK("cwd", cwd),
- LNK("root", root),
- LNK("exe", exe),
- REG("mounts", S_IRUGO, mounts),
- REG("mountinfo", S_IRUGO, mountinfo),
+ REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
+ LNK("cwd", proc_cwd_link),
+ LNK("root", proc_root_link),
+ LNK("exe", proc_exe_link),
+ REG("mounts", S_IRUGO, proc_mounts_operations),
+ REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
- REG("clear_refs", S_IWUSR, clear_refs),
- REG("smaps", S_IRUGO, smaps),
- REG("pagemap", S_IRUSR, pagemap),
+ REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+ REG("smaps", S_IRUGO, proc_smaps_operations),
+ REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
- DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
+ DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
- INF("wchan", S_IRUGO, pid_wchan),
+ INF("wchan", S_IRUGO, proc_pid_wchan),
+#endif
+#ifdef CONFIG_STACKTRACE
+ ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
- INF("schedstat", S_IRUGO, pid_schedstat),
+ INF("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
- REG("latency", S_IRUGO, lstats),
+ REG("latency", S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
- REG("cpuset", S_IRUGO, cpuset),
+ REG("cpuset", S_IRUGO, proc_cpuset_operations),
#endif
#ifdef CONFIG_CGROUPS
- REG("cgroup", S_IRUGO, cgroup),
+ REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
- INF("oom_score", S_IRUGO, oom_score),
- REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust),
+ INF("oom_score", S_IRUGO, proc_oom_score),
+ REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
#ifdef CONFIG_AUDITSYSCALL
- REG("loginuid", S_IWUSR|S_IRUGO, loginuid),
- REG("sessionid", S_IRUSR, sessionid),
+ REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
+ REG("sessionid", S_IRUSR, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
- REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
+ REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- INF("io", S_IRUGO, tid_io_accounting),
+ INF("io", S_IRUGO, proc_tid_io_accounting),
#endif
};
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 60a359b3558..db7fa5cab98 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -14,7 +14,6 @@
#include <linux/stat.h>
#include <linux/module.h>
#include <linux/mount.h>
-#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/idr.h>
#include <linux/namei.h>
@@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
struct inode *inode = NULL;
int error = -ENOENT;
- lock_kernel();
spin_lock(&proc_subdir_lock);
for (de = de->subdir; de ; de = de->next) {
if (de->namelen != dentry->d_name.len)
@@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
}
spin_unlock(&proc_subdir_lock);
out_unlock:
- unlock_kernel();
if (inode) {
dentry->d_op = &proc_dentry_operations;
@@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
struct inode *inode = filp->f_path.dentry->d_inode;
int ret = 0;
- lock_kernel();
-
ino = inode->i_ino;
i = filp->f_pos;
switch (i) {
@@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
spin_unlock(&proc_subdir_lock);
}
ret = 1;
-out: unlock_kernel();
+out:
return ret;
}
@@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
* the /proc directory.
*/
static const struct file_operations proc_dir_operations = {
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = proc_readdir,
};
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2543fd00c65..3e76bb9b3ad 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de)
*/
void de_put(struct proc_dir_entry *de)
{
- lock_kernel();
if (!atomic_read(&de->count)) {
printk("de_put: entry %s already free!\n", de->name);
- unlock_kernel();
return;
}
if (atomic_dec_and_test(&de->count))
free_proc_entry(de);
- unlock_kernel();
}
/*
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3e8aeb8b61c..cd53ff83849 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -41,8 +41,6 @@ do { \
(vmi)->used = 0; \
(vmi)->largest_chunk = 0; \
} while(0)
-
-extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
#endif
extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b1675c4e66d..43d23948384 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -74,6 +74,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
"LowTotal: %8lu kB\n"
"LowFree: %8lu kB\n"
#endif
+#ifndef CONFIG_MMU
+ "MmapCopy: %8lu kB\n"
+#endif
"SwapTotal: %8lu kB\n"
"SwapFree: %8lu kB\n"
"Dirty: %8lu kB\n"
@@ -116,6 +119,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(i.totalram-i.totalhigh),
K(i.freeram-i.freehigh),
#endif
+#ifndef CONFIG_MMU
+ K((unsigned long) atomic_read(&mmap_pages_allocated)),
+#endif
K(i.totalswap),
K(i.freeswap),
K(global_page_state(NR_FILE_DIRTY)),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 3f87d263294..b446d7ad0b0 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -33,33 +33,33 @@
#include "internal.h"
/*
- * display a single VMA to a sequenced file
+ * display a single region to a sequenced file
*/
-int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+static int nommu_region_show(struct seq_file *m, struct vm_region *region)
{
unsigned long ino = 0;
struct file *file;
dev_t dev = 0;
int flags, len;
- flags = vma->vm_flags;
- file = vma->vm_file;
+ flags = region->vm_flags;
+ file = region->vm_file;
if (file) {
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = region->vm_file->f_path.dentry->d_inode;
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
}
seq_printf(m,
"%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
- vma->vm_start,
- vma->vm_end,
+ region->vm_start,
+ region->vm_end,
flags & VM_READ ? 'r' : '-',
flags & VM_WRITE ? 'w' : '-',
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
- ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
+ ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
MAJOR(dev), MINOR(dev), ino, &len);
if (file) {
@@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
}
/*
- * display a list of all the VMAs the kernel knows about
+ * display a list of all the REGIONs the kernel knows about
* - nommu kernals have a single flat list
*/
-static int nommu_vma_list_show(struct seq_file *m, void *v)
+static int nommu_region_list_show(struct seq_file *m, void *_p)
{
- struct vm_area_struct *vma;
+ struct rb_node *p = _p;
- vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb);
- return nommu_vma_show(m, vma);
+ return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb));
}
-static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos)
+static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos)
{
- struct rb_node *_rb;
+ struct rb_node *p;
loff_t pos = *_pos;
- void *next = NULL;
- down_read(&nommu_vma_sem);
+ down_read(&nommu_region_sem);
- for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) {
- if (pos == 0) {
- next = _rb;
- break;
- }
- pos--;
- }
-
- return next;
+ for (p = rb_first(&nommu_region_tree); p; p = rb_next(p))
+ if (pos-- == 0)
+ return p;
+ return NULL;
}
-static void nommu_vma_list_stop(struct seq_file *m, void *v)
+static void nommu_region_list_stop(struct seq_file *m, void *v)
{
- up_read(&nommu_vma_sem);
+ up_read(&nommu_region_sem);
}
-static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos)
+static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
{
(*pos)++;
return rb_next((struct rb_node *) v);
}
-static const struct seq_operations proc_nommu_vma_list_seqop = {
- .start = nommu_vma_list_start,
- .next = nommu_vma_list_next,
- .stop = nommu_vma_list_stop,
- .show = nommu_vma_list_show
+static struct seq_operations proc_nommu_region_list_seqop = {
+ .start = nommu_region_list_start,
+ .next = nommu_region_list_next,
+ .stop = nommu_region_list_stop,
+ .show = nommu_region_list_show
};
-static int proc_nommu_vma_list_open(struct inode *inode, struct file *file)
+static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &proc_nommu_vma_list_seqop);
+ return seq_open(file, &proc_nommu_region_list_seqop);
}
-static const struct file_operations proc_nommu_vma_list_operations = {
- .open = proc_nommu_vma_list_open,
+static const struct file_operations proc_nommu_region_list_operations = {
+ .open = proc_nommu_region_list_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
@@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = {
static int __init proc_nommu_init(void)
{
- proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations);
+ proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
return 0;
}
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 7bc296f424a..04d1270f1c3 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -18,7 +18,6 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/bitops.h>
-#include <linux/smp_lock.h>
#include <linux/mount.h>
#include <linux/nsproxy.h>
#include <net/net_namespace.h>
@@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
}
const struct file_operations proc_net_operations = {
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = proc_tgid_net_readdir,
};
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 06ed10b7da9..94fcfff6863 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
inode->i_mode = table->mode;
- inode->i_uid = inode->i_gid = 0;
if (!table->child) {
inode->i_mode |= S_IFREG;
inode->i_op = &proc_sys_inode_operations;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 7761602af9d..f6299a25594 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,7 +16,6 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/bitops.h>
-#include <linux/smp_lock.h>
#include <linux/mount.h>
#include <linux/pid_namespace.h>
@@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp,
unsigned int nr = filp->f_pos;
int ret;
- lock_kernel();
-
if (nr < FIRST_PROCESS_ENTRY) {
int error = proc_readdir(filp, dirent, filldir);
- if (error <= 0) {
- unlock_kernel();
+ if (error <= 0)
return error;
- }
filp->f_pos = FIRST_PROCESS_ENTRY;
}
- unlock_kernel();
ret = proc_pid_readdir(filp, dirent, filldir);
return ret;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3a8bdd7f575..94063840832 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v)
"Private_Clean: %8lu kB\n"
"Private_Dirty: %8lu kB\n"
"Referenced: %8lu kB\n"
- "Swap: %8lu kB\n",
+ "Swap: %8lu kB\n"
+ "KernelPageSize: %8lu kB\n"
+ "MMUPageSize: %8lu kB\n",
(vma->vm_end - vma->vm_start) >> 10,
mss.resident >> 10,
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v)
mss.private_clean >> 10,
mss.private_dirty >> 10,
mss.referenced >> 10,
- mss.swap >> 10);
+ mss.swap >> 10,
+ vma_kernel_pagesize(vma) >> 10,
+ vma_mmu_pagesize(vma) >> 10);
if (m->count < m->size) /* vma is copied successfully */
m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 219bd79ea89..343ea1216bc 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -9,31 +9,38 @@
/*
* Logic: we've got two memory sums for each process, "shared", and
- * "non-shared". Shared memory may get counted more then once, for
+ * "non-shared". Shared memory may get counted more than once, for
* each process that owns it. Non-shared memory is counted
* accurately.
*/
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
- struct vm_list_struct *vml;
- unsigned long bytes = 0, sbytes = 0, slack = 0;
+ struct vm_area_struct *vma;
+ struct vm_region *region;
+ struct rb_node *p;
+ unsigned long bytes = 0, sbytes = 0, slack = 0, size;
down_read(&mm->mmap_sem);
- for (vml = mm->context.vmlist; vml; vml = vml->next) {
- if (!vml->vma)
- continue;
+ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+ vma = rb_entry(p, struct vm_area_struct, vm_rb);
+
+ bytes += kobjsize(vma);
+
+ region = vma->vm_region;
+ if (region) {
+ size = kobjsize(region);
+ size += region->vm_end - region->vm_start;
+ } else {
+ size = vma->vm_end - vma->vm_start;
+ }
- bytes += kobjsize(vml);
if (atomic_read(&mm->mm_count) > 1 ||
- atomic_read(&vml->vma->vm_usage) > 1
- ) {
- sbytes += kobjsize((void *) vml->vma->vm_start);
- sbytes += kobjsize(vml->vma);
+ vma->vm_flags & VM_MAYSHARE) {
+ sbytes += size;
} else {
- bytes += kobjsize((void *) vml->vma->vm_start);
- bytes += kobjsize(vml->vma);
- slack += kobjsize((void *) vml->vma->vm_start) -
- (vml->vma->vm_end - vml->vma->vm_start);
+ bytes += size;
+ if (region)
+ slack = region->vm_end - vma->vm_end;
}
}
@@ -70,13 +77,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
unsigned long task_vsize(struct mm_struct *mm)
{
- struct vm_list_struct *tbp;
+ struct vm_area_struct *vma;
+ struct rb_node *p;
unsigned long vsize = 0;
down_read(&mm->mmap_sem);
- for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
- if (tbp->vma)
- vsize += kobjsize((void *) tbp->vma->vm_start);
+ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+ vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ vsize += vma->vm_end - vma->vm_start;
}
up_read(&mm->mmap_sem);
return vsize;
@@ -85,15 +93,19 @@ unsigned long task_vsize(struct mm_struct *mm)
int task_statm(struct mm_struct *mm, int *shared, int *text,
int *data, int *resident)
{
- struct vm_list_struct *tbp;
+ struct vm_area_struct *vma;
+ struct vm_region *region;
+ struct rb_node *p;
int size = kobjsize(mm);
down_read(&mm->mmap_sem);
- for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
- size += kobjsize(tbp);
- if (tbp->vma) {
- size += kobjsize(tbp->vma);
- size += kobjsize((void *) tbp->vma->vm_start);
+ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+ vma = rb_entry(p, struct vm_area_struct, vm_rb);
+ size += kobjsize(vma);
+ region = vma->vm_region;
+ if (region) {
+ size += kobjsize(region);
+ size += region->vm_end - region->vm_start;
}
}
@@ -105,20 +117,62 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
}
/*
+ * display a single VMA to a sequenced file
+ */
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+{
+ unsigned long ino = 0;
+ struct file *file;
+ dev_t dev = 0;
+ int flags, len;
+
+ flags = vma->vm_flags;
+ file = vma->vm_file;
+
+ if (file) {
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ dev = inode->i_sb->s_dev;
+ ino = inode->i_ino;
+ }
+
+ seq_printf(m,
+ "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+ vma->vm_start,
+ vma->vm_end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+ flags & VM_EXEC ? 'x' : '-',
+ flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
+ vma->vm_pgoff << PAGE_SHIFT,
+ MAJOR(dev), MINOR(dev), ino, &len);
+
+ if (file) {
+ len = 25 + sizeof(void *) * 6 - len;
+ if (len < 1)
+ len = 1;
+ seq_printf(m, "%*c", len, ' ');
+ seq_path(m, &file->f_path, "");
+ }
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+/*
* display mapping lines for a particular process's /proc/pid/maps
*/
-static int show_map(struct seq_file *m, void *_vml)
+static int show_map(struct seq_file *m, void *_p)
{
- struct vm_list_struct *vml = _vml;
+ struct rb_node *p = _p;
- return nommu_vma_show(m, vml->vma);
+ return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
}
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
- struct vm_list_struct *vml;
struct mm_struct *mm;
+ struct rb_node *p;
loff_t n = *pos;
/* pin the task and mm whilst we play with them */
@@ -134,9 +188,9 @@ static void *m_start(struct seq_file *m, loff_t *pos)
}
/* start from the Nth VMA */
- for (vml = mm->context.vmlist; vml; vml = vml->next)
+ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
if (n-- == 0)
- return vml;
+ return p;
return NULL;
}
@@ -152,12 +206,12 @@ static void m_stop(struct seq_file *m, void *_vml)
}
}
-static void *m_next(struct seq_file *m, void *_vml, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
{
- struct vm_list_struct *vml = _vml;
+ struct rb_node *p = _p;
(*pos)++;
- return vml ? vml->next : NULL;
+ return p ? rb_next(p) : NULL;
}
static const struct seq_operations proc_pid_maps_ops = {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 03ec5950490..5edcc3f92ba 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
offset = (unsigned long)(*ppos % PAGE_SIZE);
pfn = (unsigned long)(*ppos / PAGE_SIZE);
- if (pfn > saved_max_pfn)
- return -EINVAL;
do {
if (count > (PAGE_SIZE - offset))
diff --git a/fs/quota.c b/fs/quota.c
index b7fe44e0161..4a8c94f05f7 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
case Q_SETQUOTA:
case Q_GETQUOTA:
/* This is just informative test so we are satisfied without a lock */
- if (!sb_has_quota_enabled(sb, type))
+ if (!sb_has_quota_active(sb, type))
return -ESRCH;
}
@@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
int cnt;
sb->s_qcop->quota_sync(sb, type);
+
+ if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
+ return;
/* This is not very clever (and fast) but currently I don't know about
* any other simple way of getting quota data to disk and we must get
* them there for userspace to be visible... */
@@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
continue;
- if (!sb_has_quota_enabled(sb, cnt))
+ if (!sb_has_quota_active(sb, cnt))
continue;
mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -201,7 +204,7 @@ restart:
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && type != cnt)
continue;
- if (!sb_has_quota_enabled(sb, cnt))
+ if (!sb_has_quota_active(sb, cnt))
continue;
if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
@@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
__u32 fmt;
down_read(&sb_dqopt(sb)->dqptr_sem);
- if (!sb_has_quota_enabled(sb, type)) {
+ if (!sb_has_quota_active(sb, type)) {
up_read(&sb_dqopt(sb)->dqptr_sem);
return -ESRCH;
}
diff --git a/fs/quota_tree.c b/fs/quota_tree.c
new file mode 100644
index 00000000000..953404c95b1
--- /dev/null
+++ b/fs/quota_tree.c
@@ -0,0 +1,645 @@
+/*
+ * vfsv0 quota IO operations on file
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/dqblk_v2.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/quotaops.h>
+
+#include <asm/byteorder.h>
+
+#include "quota_tree.h"
+
+MODULE_AUTHOR("Jan Kara");
+MODULE_DESCRIPTION("Quota trie support");
+MODULE_LICENSE("GPL");
+
+#define __QUOTA_QT_PARANOIA
+
+typedef char *dqbuf_t;
+
+static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
+{
+ unsigned int epb = info->dqi_usable_bs >> 2;
+
+ depth = info->dqi_qtree_depth - depth - 1;
+ while (depth--)
+ id /= epb;
+ return id % epb;
+}
+
+/* Number of entries in one blocks */
+static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
+{
+ return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
+ / info->dqi_entry_size;
+}
+
+static dqbuf_t getdqbuf(size_t size)
+{
+ dqbuf_t buf = kmalloc(size, GFP_NOFS);
+ if (!buf)
+ printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
+ return buf;
+}
+
+static inline void freedqbuf(dqbuf_t buf)
+{
+ kfree(buf);
+}
+
+static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+ struct super_block *sb = info->dqi_sb;
+
+ memset(buf, 0, info->dqi_usable_bs);
+ return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
+ info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+}
+
+static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+ struct super_block *sb = info->dqi_sb;
+
+ return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
+ info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+}
+
+/* Remove empty block from list and return it */
+static int get_free_dqblk(struct qtree_mem_dqinfo *info)
+{
+ dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+ struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+ int ret, blk;
+
+ if (!buf)
+ return -ENOMEM;
+ if (info->dqi_free_blk) {
+ blk = info->dqi_free_blk;
+ ret = read_blk(info, blk, buf);
+ if (ret < 0)
+ goto out_buf;
+ info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
+ }
+ else {
+ memset(buf, 0, info->dqi_usable_bs);
+ /* Assure block allocation... */
+ ret = write_blk(info, info->dqi_blocks, buf);
+ if (ret < 0)
+ goto out_buf;
+ blk = info->dqi_blocks++;
+ }
+ mark_info_dirty(info->dqi_sb, info->dqi_type);
+ ret = blk;
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
+/* Insert empty block to the list */
+static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+ struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+ int err;
+
+ dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
+ dh->dqdh_prev_free = cpu_to_le32(0);
+ dh->dqdh_entries = cpu_to_le16(0);
+ err = write_blk(info, blk, buf);
+ if (err < 0)
+ return err;
+ info->dqi_free_blk = blk;
+ mark_info_dirty(info->dqi_sb, info->dqi_type);
+ return 0;
+}
+
+/* Remove given block from the list of blocks with free entries */
+static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+ dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+ struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+ uint nextblk = le32_to_cpu(dh->dqdh_next_free);
+ uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
+ int err;
+
+ if (!tmpbuf)
+ return -ENOMEM;
+ if (nextblk) {
+ err = read_blk(info, nextblk, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+ dh->dqdh_prev_free;
+ err = write_blk(info, nextblk, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ }
+ if (prevblk) {
+ err = read_blk(info, prevblk, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
+ dh->dqdh_next_free;
+ err = write_blk(info, prevblk, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ } else {
+ info->dqi_free_entry = nextblk;
+ mark_info_dirty(info->dqi_sb, info->dqi_type);
+ }
+ freedqbuf(tmpbuf);
+ dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
+ /* No matter whether write succeeds block is out of list */
+ if (write_blk(info, blk, buf) < 0)
+ printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
+ return 0;
+out_buf:
+ freedqbuf(tmpbuf);
+ return err;
+}
+
+/* Insert given block to the beginning of list with free entries */
+static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+ dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+ struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+ int err;
+
+ if (!tmpbuf)
+ return -ENOMEM;
+ dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
+ dh->dqdh_prev_free = cpu_to_le32(0);
+ err = write_blk(info, blk, buf);
+ if (err < 0)
+ goto out_buf;
+ if (info->dqi_free_entry) {
+ err = read_blk(info, info->dqi_free_entry, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+ cpu_to_le32(blk);
+ err = write_blk(info, info->dqi_free_entry, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ }
+ freedqbuf(tmpbuf);
+ info->dqi_free_entry = blk;
+ mark_info_dirty(info->dqi_sb, info->dqi_type);
+ return 0;
+out_buf:
+ freedqbuf(tmpbuf);
+ return err;
+}
+
+/* Is the entry in the block free? */
+int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
+{
+ int i;
+
+ for (i = 0; i < info->dqi_entry_size; i++)
+ if (disk[i])
+ return 0;
+ return 1;
+}
+EXPORT_SYMBOL(qtree_entry_unused);
+
+/* Find space for dquot */
+static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
+ struct dquot *dquot, int *err)
+{
+ uint blk, i;
+ struct qt_disk_dqdbheader *dh;
+ dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+ char *ddquot;
+
+ *err = 0;
+ if (!buf) {
+ *err = -ENOMEM;
+ return 0;
+ }
+ dh = (struct qt_disk_dqdbheader *)buf;
+ if (info->dqi_free_entry) {
+ blk = info->dqi_free_entry;
+ *err = read_blk(info, blk, buf);
+ if (*err < 0)
+ goto out_buf;
+ } else {
+ blk = get_free_dqblk(info);
+ if ((int)blk < 0) {
+ *err = blk;
+ freedqbuf(buf);
+ return 0;
+ }
+ memset(buf, 0, info->dqi_usable_bs);
+ /* This is enough as block is already zeroed and entry list is empty... */
+ info->dqi_free_entry = blk;
+ mark_info_dirty(dquot->dq_sb, dquot->dq_type);
+ }
+ /* Block will be full? */
+ if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
+ *err = remove_free_dqentry(info, buf, blk);
+ if (*err < 0) {
+ printk(KERN_ERR "VFS: find_free_dqentry(): Can't "
+ "remove block (%u) from entry free list.\n",
+ blk);
+ goto out_buf;
+ }
+ }
+ le16_add_cpu(&dh->dqdh_entries, 1);
+ /* Find free structure in block */
+ for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+ i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
+ i++, ddquot += info->dqi_entry_size);
+#ifdef __QUOTA_QT_PARANOIA
+ if (i == qtree_dqstr_in_blk(info)) {
+ printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
+ "but it shouldn't.\n");
+ *err = -EIO;
+ goto out_buf;
+ }
+#endif
+ *err = write_blk(info, blk, buf);
+ if (*err < 0) {
+ printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
+ "data block %u.\n", blk);
+ goto out_buf;
+ }
+ dquot->dq_off = (blk << info->dqi_blocksize_bits) +
+ sizeof(struct qt_disk_dqdbheader) +
+ i * info->dqi_entry_size;
+ freedqbuf(buf);
+ return blk;
+out_buf:
+ freedqbuf(buf);
+ return 0;
+}
+
+/* Insert reference to structure into the trie */
+static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+ uint *treeblk, int depth)
+{
+ dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+ int ret = 0, newson = 0, newact = 0;
+ __le32 *ref;
+ uint newblk;
+
+ if (!buf)
+ return -ENOMEM;
+ if (!*treeblk) {
+ ret = get_free_dqblk(info);
+ if (ret < 0)
+ goto out_buf;
+ *treeblk = ret;
+ memset(buf, 0, info->dqi_usable_bs);
+ newact = 1;
+ } else {
+ ret = read_blk(info, *treeblk, buf);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Can't read tree quota block "
+ "%u.\n", *treeblk);
+ goto out_buf;
+ }
+ }
+ ref = (__le32 *)buf;
+ newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ if (!newblk)
+ newson = 1;
+ if (depth == info->dqi_qtree_depth - 1) {
+#ifdef __QUOTA_QT_PARANOIA
+ if (newblk) {
+ printk(KERN_ERR "VFS: Inserting already present quota "
+ "entry (block %u).\n",
+ le32_to_cpu(ref[get_index(info,
+ dquot->dq_id, depth)]));
+ ret = -EIO;
+ goto out_buf;
+ }
+#endif
+ newblk = find_free_dqentry(info, dquot, &ret);
+ } else {
+ ret = do_insert_tree(info, dquot, &newblk, depth+1);
+ }
+ if (newson && ret >= 0) {
+ ref[get_index(info, dquot->dq_id, depth)] =
+ cpu_to_le32(newblk);
+ ret = write_blk(info, *treeblk, buf);
+ } else if (newact && ret < 0) {
+ put_free_dqblk(info, buf, *treeblk);
+ }
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
+/* Wrapper for inserting quota structure into tree */
+static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
+ struct dquot *dquot)
+{
+ int tmp = QT_TREEOFF;
+ return do_insert_tree(info, dquot, &tmp, 0);
+}
+
+/*
+ * We don't have to be afraid of deadlocks as we never have quotas on quota files...
+ */
+int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+ int type = dquot->dq_type;
+ struct super_block *sb = dquot->dq_sb;
+ ssize_t ret;
+ dqbuf_t ddquot = getdqbuf(info->dqi_entry_size);
+
+ if (!ddquot)
+ return -ENOMEM;
+
+ /* dq_off is guarded by dqio_mutex */
+ if (!dquot->dq_off) {
+ ret = dq_insert_tree(info, dquot);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Error %zd occurred while "
+ "creating quota.\n", ret);
+ freedqbuf(ddquot);
+ return ret;
+ }
+ }
+ spin_lock(&dq_data_lock);
+ info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
+ spin_unlock(&dq_data_lock);
+ ret = sb->s_op->quota_write(sb, type, (char *)ddquot,
+ info->dqi_entry_size, dquot->dq_off);
+ if (ret != info->dqi_entry_size) {
+ printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
+ sb->s_id);
+ if (ret >= 0)
+ ret = -ENOSPC;
+ } else {
+ ret = 0;
+ }
+ dqstats.writes++;
+ freedqbuf(ddquot);
+
+ return ret;
+}
+EXPORT_SYMBOL(qtree_write_dquot);
+
+/* Free dquot entry in data block */
+static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+ uint blk)
+{
+ struct qt_disk_dqdbheader *dh;
+ dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+ int ret = 0;
+
+ if (!buf)
+ return -ENOMEM;
+ if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
+ printk(KERN_ERR "VFS: Quota structure has offset to other "
+ "block (%u) than it should (%u).\n", blk,
+ (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+ goto out_buf;
+ }
+ ret = read_blk(info, blk, buf);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+ goto out_buf;
+ }
+ dh = (struct qt_disk_dqdbheader *)buf;
+ le16_add_cpu(&dh->dqdh_entries, -1);
+ if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
+ ret = remove_free_dqentry(info, buf, blk);
+ if (ret >= 0)
+ ret = put_free_dqblk(info, buf, blk);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Can't move quota data block (%u) "
+ "to free list.\n", blk);
+ goto out_buf;
+ }
+ } else {
+ memset(buf +
+ (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
+ 0, info->dqi_entry_size);
+ if (le16_to_cpu(dh->dqdh_entries) ==
+ qtree_dqstr_in_blk(info) - 1) {
+ /* Insert will write block itself */
+ ret = insert_free_dqentry(info, buf, blk);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Can't insert quota data "
+ "block (%u) to free entry list.\n", blk);
+ goto out_buf;
+ }
+ } else {
+ ret = write_blk(info, blk, buf);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Can't write quota data "
+ "block %u\n", blk);
+ goto out_buf;
+ }
+ }
+ }
+ dquot->dq_off = 0; /* Quota is now unattached */
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
+/* Remove reference to dquot from tree */
+static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+ uint *blk, int depth)
+{
+ dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+ int ret = 0;
+ uint newblk;
+ __le32 *ref = (__le32 *)buf;
+
+ if (!buf)
+ return -ENOMEM;
+ ret = read_blk(info, *blk, buf);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+ goto out_buf;
+ }
+ newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ if (depth == info->dqi_qtree_depth - 1) {
+ ret = free_dqentry(info, dquot, newblk);
+ newblk = 0;
+ } else {
+ ret = remove_tree(info, dquot, &newblk, depth+1);
+ }
+ if (ret >= 0 && !newblk) {
+ int i;
+ ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
+ /* Block got empty? */
+ for (i = 0;
+ i < (info->dqi_usable_bs >> 2) && !ref[i];
+ i++);
+ /* Don't put the root block into the free block list */
+ if (i == (info->dqi_usable_bs >> 2)
+ && *blk != QT_TREEOFF) {
+ put_free_dqblk(info, buf, *blk);
+ *blk = 0;
+ } else {
+ ret = write_blk(info, *blk, buf);
+ if (ret < 0)
+ printk(KERN_ERR "VFS: Can't write quota tree "
+ "block %u.\n", *blk);
+ }
+ }
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
+/* Delete dquot from tree */
+int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+ uint tmp = QT_TREEOFF;
+
+ if (!dquot->dq_off) /* Even not allocated? */
+ return 0;
+ return remove_tree(info, dquot, &tmp, 0);
+}
+EXPORT_SYMBOL(qtree_delete_dquot);
+
+/* Find entry in block */
+static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
+ struct dquot *dquot, uint blk)
+{
+ dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+ loff_t ret = 0;
+ int i;
+ char *ddquot;
+
+ if (!buf)
+ return -ENOMEM;
+ ret = read_blk(info, blk, buf);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+ goto out_buf;
+ }
+ for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+ i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
+ i++, ddquot += info->dqi_entry_size);
+ if (i == qtree_dqstr_in_blk(info)) {
+ printk(KERN_ERR "VFS: Quota for id %u referenced "
+ "but not present.\n", dquot->dq_id);
+ ret = -EIO;
+ goto out_buf;
+ } else {
+ ret = (blk << info->dqi_blocksize_bits) + sizeof(struct
+ qt_disk_dqdbheader) + i * info->dqi_entry_size;
+ }
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
+/* Find entry for given id in the tree */
+static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
+ struct dquot *dquot, uint blk, int depth)
+{
+ dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+ loff_t ret = 0;
+ __le32 *ref = (__le32 *)buf;
+
+ if (!buf)
+ return -ENOMEM;
+ ret = read_blk(info, blk, buf);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+ goto out_buf;
+ }
+ ret = 0;
+ blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ if (!blk) /* No reference? */
+ goto out_buf;
+ if (depth < info->dqi_qtree_depth - 1)
+ ret = find_tree_dqentry(info, dquot, blk, depth+1);
+ else
+ ret = find_block_dqentry(info, dquot, blk);
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
+/* Find entry for given id in the tree - wrapper function */
+static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
+ struct dquot *dquot)
+{
+ return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
+}
+
+int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+ int type = dquot->dq_type;
+ struct super_block *sb = dquot->dq_sb;
+ loff_t offset;
+ dqbuf_t ddquot;
+ int ret = 0;
+
+#ifdef __QUOTA_QT_PARANOIA
+ /* Invalidated quota? */
+ if (!sb_dqopt(dquot->dq_sb)->files[type]) {
+ printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
+ return -EIO;
+ }
+#endif
+ /* Do we know offset of the dquot entry in the quota file? */
+ if (!dquot->dq_off) {
+ offset = find_dqentry(info, dquot);
+ if (offset <= 0) { /* Entry not present? */
+ if (offset < 0)
+ printk(KERN_ERR "VFS: Can't read quota "
+ "structure for id %u.\n", dquot->dq_id);
+ dquot->dq_off = 0;
+ set_bit(DQ_FAKE_B, &dquot->dq_flags);
+ memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+ ret = offset;
+ goto out;
+ }
+ dquot->dq_off = offset;
+ }
+ ddquot = getdqbuf(info->dqi_entry_size);
+ if (!ddquot)
+ return -ENOMEM;
+ ret = sb->s_op->quota_read(sb, type, (char *)ddquot,
+ info->dqi_entry_size, dquot->dq_off);
+ if (ret != info->dqi_entry_size) {
+ if (ret >= 0)
+ ret = -EIO;
+ printk(KERN_ERR "VFS: Error while reading quota "
+ "structure for id %u.\n", dquot->dq_id);
+ set_bit(DQ_FAKE_B, &dquot->dq_flags);
+ memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+ freedqbuf(ddquot);
+ goto out;
+ }
+ spin_lock(&dq_data_lock);
+ info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
+ if (!dquot->dq_dqb.dqb_bhardlimit &&
+ !dquot->dq_dqb.dqb_bsoftlimit &&
+ !dquot->dq_dqb.dqb_ihardlimit &&
+ !dquot->dq_dqb.dqb_isoftlimit)
+ set_bit(DQ_FAKE_B, &dquot->dq_flags);
+ spin_unlock(&dq_data_lock);
+ freedqbuf(ddquot);
+out:
+ dqstats.reads++;
+ return ret;
+}
+EXPORT_SYMBOL(qtree_read_dquot);
+
+/* Check whether dquot should not be deleted. We know we are
+ * the only one operating on dquot (thanks to dq_lock) */
+int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+ if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
+ return qtree_delete_dquot(info, dquot);
+ return 0;
+}
+EXPORT_SYMBOL(qtree_release_dquot);
diff --git a/fs/quota_tree.h b/fs/quota_tree.h
new file mode 100644
index 00000000000..a1ab8db81a5
--- /dev/null
+++ b/fs/quota_tree.h
@@ -0,0 +1,25 @@
+/*
+ * Definitions of structures for vfsv0 quota format
+ */
+
+#ifndef _LINUX_QUOTA_TREE_H
+#define _LINUX_QUOTA_TREE_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/*
+ * Structure of header of block with quota structures. It is padded to 16 bytes so
+ * there will be space for exactly 21 quota-entries in a block
+ */
+struct qt_disk_dqdbheader {
+ __le32 dqdh_next_free; /* Number of next block with free entry */
+ __le32 dqdh_prev_free; /* Number of previous block with free entry */
+ __le16 dqdh_entries; /* Number of valid entries in block */
+ __le16 dqdh_pad1;
+ __le32 dqdh_pad2;
+};
+
+#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
+
+#endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index 5ae15b13eeb..b4af1c69ad1 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -3,25 +3,39 @@
#include <linux/quota.h>
#include <linux/quotaops.h>
#include <linux/dqblk_v1.h>
-#include <linux/quotaio_v1.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <asm/byteorder.h>
+#include "quotaio_v1.h"
+
MODULE_AUTHOR("Jan Kara");
MODULE_DESCRIPTION("Old quota format support");
MODULE_LICENSE("GPL");
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+static inline qsize_t v1_stoqb(qsize_t space)
+{
+ return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v1_qbtos(qsize_t blocks)
+{
+ return blocks << QUOTABLOCK_BITS;
+}
+
static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
{
m->dqb_ihardlimit = d->dqb_ihardlimit;
m->dqb_isoftlimit = d->dqb_isoftlimit;
m->dqb_curinodes = d->dqb_curinodes;
- m->dqb_bhardlimit = d->dqb_bhardlimit;
- m->dqb_bsoftlimit = d->dqb_bsoftlimit;
- m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS;
+ m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit);
+ m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit);
+ m->dqb_curspace = v1_qbtos(d->dqb_curblocks);
m->dqb_itime = d->dqb_itime;
m->dqb_btime = d->dqb_btime;
}
@@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
d->dqb_ihardlimit = m->dqb_ihardlimit;
d->dqb_isoftlimit = m->dqb_isoftlimit;
d->dqb_curinodes = m->dqb_curinodes;
- d->dqb_bhardlimit = m->dqb_bhardlimit;
- d->dqb_bsoftlimit = m->dqb_bsoftlimit;
- d->dqb_curblocks = toqb(m->dqb_curspace);
+ d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit);
+ d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit);
+ d->dqb_curblocks = v1_stoqb(m->dqb_curspace);
d->dqb_itime = m->dqb_itime;
d->dqb_btime = m->dqb_btime;
}
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index b53827dc02d..b618b563635 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -6,7 +6,6 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/dqblk_v2.h>
-#include <linux/quotaio_v2.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -15,16 +14,37 @@
#include <asm/byteorder.h>
+#include "quota_tree.h"
+#include "quotaio_v2.h"
+
MODULE_AUTHOR("Jan Kara");
MODULE_DESCRIPTION("Quota format v2 support");
MODULE_LICENSE("GPL");
#define __QUOTA_V2_PARANOIA
-typedef char *dqbuf_t;
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2_is_id(void *dp, struct dquot *dquot);
+
+static struct qtree_fmt_operations v2_qtree_ops = {
+ .mem2disk_dqblk = v2_mem2diskdqb,
+ .disk2mem_dqblk = v2_disk2memdqb,
+ .is_id = v2_is_id,
+};
+
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
-#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
+static inline qsize_t v2_stoqb(qsize_t space)
+{
+ return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v2_qbtos(qsize_t blocks)
+{
+ return blocks << QUOTABLOCK_BITS;
+}
/* Check whether given file is really vfsv0 quotafile */
static int v2_check_quota_file(struct super_block *sb, int type)
@@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
static int v2_read_file_info(struct super_block *sb, int type)
{
struct v2_disk_dqinfo dinfo;
- struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct qtree_mem_dqinfo *qinfo;
ssize_t size;
size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
@@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type)
sb->s_id);
return -1;
}
+ info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+ if (!info->dqi_priv) {
+ printk(KERN_WARNING
+ "Not enough memory for quota information structure.\n");
+ return -1;
+ }
+ qinfo = info->dqi_priv;
/* limits are stored as unsigned 32-bit data */
info->dqi_maxblimit = 0xffffffff;
info->dqi_maxilimit = 0xffffffff;
info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
- info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
- info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
- info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+ qinfo->dqi_sb = sb;
+ qinfo->dqi_type = type;
+ qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+ qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+ qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+ qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
+ qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
+ qinfo->dqi_qtree_depth = qtree_depth(qinfo);
+ qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
+ qinfo->dqi_ops = &v2_qtree_ops;
return 0;
}
@@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type)
static int v2_write_file_info(struct super_block *sb, int type)
{
struct v2_disk_dqinfo dinfo;
- struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
ssize_t size;
spin_lock(&dq_data_lock);
@@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
spin_unlock(&dq_data_lock);
- dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks);
- dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk);
- dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry);
+ dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
+ dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
+ dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
if (size != sizeof(struct v2_disk_dqinfo)) {
@@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type)
return 0;
}
-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
+static void v2_disk2memdqb(struct dquot *dquot, void *dp)
{
+ struct v2_disk_dqblk *d = dp, empty;
+ struct mem_dqblk *m = &dquot->dq_dqb;
+
m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
m->dqb_itime = le64_to_cpu(d->dqb_itime);
- m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
- m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
+ m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit));
+ m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
m->dqb_btime = le64_to_cpu(d->dqb_btime);
+ /* We need to escape back all-zero structure */
+ memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+ empty.dqb_itime = cpu_to_le64(1);
+ if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
+ m->dqb_itime = 0;
}
-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
{
+ struct v2_disk_dqblk *d = dp;
+ struct mem_dqblk *m = &dquot->dq_dqb;
+ struct qtree_mem_dqinfo *info =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
d->dqb_itime = cpu_to_le64(m->dqb_itime);
- d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
- d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
+ d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
+ d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
d->dqb_btime = cpu_to_le64(m->dqb_btime);
- d->dqb_id = cpu_to_le32(id);
-}
-
-static dqbuf_t getdqbuf(void)
-{
- dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS);
- if (!buf)
- printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
- return buf;
-}
-
-static inline void freedqbuf(dqbuf_t buf)
-{
- kfree(buf);
-}
-
-static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
- memset(buf, 0, V2_DQBLKSIZE);
- return sb->s_op->quota_read(sb, type, (char *)buf,
- V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
- return sb->s_op->quota_write(sb, type, (char *)buf,
- V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-/* Remove empty block from list and return it */
-static int get_free_dqblk(struct super_block *sb, int type)
-{
- dqbuf_t buf = getdqbuf();
- struct mem_dqinfo *info = sb_dqinfo(sb, type);
- struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
- int ret, blk;
-
- if (!buf)
- return -ENOMEM;
- if (info->u.v2_i.dqi_free_blk) {
- blk = info->u.v2_i.dqi_free_blk;
- if ((ret = read_blk(sb, type, blk, buf)) < 0)
- goto out_buf;
- info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
- }
- else {
- memset(buf, 0, V2_DQBLKSIZE);
- /* Assure block allocation... */
- if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0)
- goto out_buf;
- blk = info->u.v2_i.dqi_blocks++;
- }
- mark_info_dirty(sb, type);
- ret = blk;
-out_buf:
- freedqbuf(buf);
- return ret;
-}
-
-/* Insert empty block to the list */
-static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
- struct mem_dqinfo *info = sb_dqinfo(sb, type);
- struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
- int err;
-
- dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk);
- dh->dqdh_prev_free = cpu_to_le32(0);
- dh->dqdh_entries = cpu_to_le16(0);
- info->u.v2_i.dqi_free_blk = blk;
- mark_info_dirty(sb, type);
- /* Some strange block. We had better leave it... */
- if ((err = write_blk(sb, type, blk, buf)) < 0)
- return err;
- return 0;
+ d->dqb_id = cpu_to_le32(dquot->dq_id);
+ if (qtree_entry_unused(info, dp))
+ d->dqb_itime = cpu_to_le64(1);
}
-/* Remove given block from the list of blocks with free entries */
-static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
+static int v2_is_id(void *dp, struct dquot *dquot)
{
- dqbuf_t tmpbuf = getdqbuf();
- struct mem_dqinfo *info = sb_dqinfo(sb, type);
- struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
- uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free);
- int err;
+ struct v2_disk_dqblk *d = dp;
+ struct qtree_mem_dqinfo *info =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
- if (!tmpbuf)
- return -ENOMEM;
- if (nextblk) {
- if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0)
- goto out_buf;
- ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free;
- if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0)
- goto out_buf;
- }
- if (prevblk) {
- if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0)
- goto out_buf;
- ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free;
- if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0)
- goto out_buf;
- }
- else {
- info->u.v2_i.dqi_free_entry = nextblk;
- mark_info_dirty(sb, type);
- }
- freedqbuf(tmpbuf);
- dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
- /* No matter whether write succeeds block is out of list */
- if (write_blk(sb, type, blk, buf) < 0)
- printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
- return 0;
-out_buf:
- freedqbuf(tmpbuf);
- return err;
-}
-
-/* Insert given block to the beginning of list with free entries */
-static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
- dqbuf_t tmpbuf = getdqbuf();
- struct mem_dqinfo *info = sb_dqinfo(sb, type);
- struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
- int err;
-
- if (!tmpbuf)
- return -ENOMEM;
- dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry);
- dh->dqdh_prev_free = cpu_to_le32(0);
- if ((err = write_blk(sb, type, blk, buf)) < 0)
- goto out_buf;
- if (info->u.v2_i.dqi_free_entry) {
- if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
- goto out_buf;
- ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk);
- if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
- goto out_buf;
- }
- freedqbuf(tmpbuf);
- info->u.v2_i.dqi_free_entry = blk;
- mark_info_dirty(sb, type);
- return 0;
-out_buf:
- freedqbuf(tmpbuf);
- return err;
-}
-
-/* Find space for dquot */
-static uint find_free_dqentry(struct dquot *dquot, int *err)
-{
- struct super_block *sb = dquot->dq_sb;
- struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
- uint blk, i;
- struct v2_disk_dqdbheader *dh;
- struct v2_disk_dqblk *ddquot;
- struct v2_disk_dqblk fakedquot;
- dqbuf_t buf;
-
- *err = 0;
- if (!(buf = getdqbuf())) {
- *err = -ENOMEM;
+ if (qtree_entry_unused(info, dp))
return 0;
- }
- dh = (struct v2_disk_dqdbheader *)buf;
- ddquot = GETENTRIES(buf);
- if (info->u.v2_i.dqi_free_entry) {
- blk = info->u.v2_i.dqi_free_entry;
- if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0)
- goto out_buf;
- }
- else {
- blk = get_free_dqblk(sb, dquot->dq_type);
- if ((int)blk < 0) {
- *err = blk;
- freedqbuf(buf);
- return 0;
- }
- memset(buf, 0, V2_DQBLKSIZE);
- /* This is enough as block is already zeroed and entry list is empty... */
- info->u.v2_i.dqi_free_entry = blk;
- mark_info_dirty(sb, dquot->dq_type);
- }
- if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */
- if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
- printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
- goto out_buf;
- }
- le16_add_cpu(&dh->dqdh_entries, 1);
- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
- /* Find free structure in block */
- for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
-#ifdef __QUOTA_V2_PARANOIA
- if (i == V2_DQSTRINBLK) {
- printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
- *err = -EIO;
- goto out_buf;
- }
-#endif
- if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
- goto out_buf;
- }
- dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
- freedqbuf(buf);
- return blk;
-out_buf:
- freedqbuf(buf);
- return 0;
-}
-
-/* Insert reference to structure into the trie */
-static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth)
-{
- struct super_block *sb = dquot->dq_sb;
- dqbuf_t buf;
- int ret = 0, newson = 0, newact = 0;
- __le32 *ref;
- uint newblk;
-
- if (!(buf = getdqbuf()))
- return -ENOMEM;
- if (!*treeblk) {
- ret = get_free_dqblk(sb, dquot->dq_type);
- if (ret < 0)
- goto out_buf;
- *treeblk = ret;
- memset(buf, 0, V2_DQBLKSIZE);
- newact = 1;
- }
- else {
- if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk);
- goto out_buf;
- }
- }
- ref = (__le32 *)buf;
- newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
- if (!newblk)
- newson = 1;
- if (depth == V2_DQTREEDEPTH-1) {
-#ifdef __QUOTA_V2_PARANOIA
- if (newblk) {
- printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]));
- ret = -EIO;
- goto out_buf;
- }
-#endif
- newblk = find_free_dqentry(dquot, &ret);
- }
- else
- ret = do_insert_tree(dquot, &newblk, depth+1);
- if (newson && ret >= 0) {
- ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
- ret = write_blk(sb, dquot->dq_type, *treeblk, buf);
- }
- else if (newact && ret < 0)
- put_free_dqblk(sb, dquot->dq_type, buf, *treeblk);
-out_buf:
- freedqbuf(buf);
- return ret;
+ return le32_to_cpu(d->dqb_id) == dquot->dq_id;
}
-/* Wrapper for inserting quota structure into tree */
-static inline int dq_insert_tree(struct dquot *dquot)
+static int v2_read_dquot(struct dquot *dquot)
{
- int tmp = V2_DQTREEOFF;
- return do_insert_tree(dquot, &tmp, 0);
+ return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
}
-/*
- * We don't have to be afraid of deadlocks as we never have quotas on quota files...
- */
static int v2_write_dquot(struct dquot *dquot)
{
- int type = dquot->dq_type;
- ssize_t ret;
- struct v2_disk_dqblk ddquot, empty;
-
- /* dq_off is guarded by dqio_mutex */
- if (!dquot->dq_off)
- if ((ret = dq_insert_tree(dquot)) < 0) {
- printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret);
- return ret;
- }
- spin_lock(&dq_data_lock);
- mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
- /* Argh... We may need to write structure full of zeroes but that would be
- * treated as an empty place by the rest of the code. Format change would
- * be definitely cleaner but the problems probably are not worth it */
- memset(&empty, 0, sizeof(struct v2_disk_dqblk));
- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
- ddquot.dqb_itime = cpu_to_le64(1);
- spin_unlock(&dq_data_lock);
- ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
- (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
- if (ret != sizeof(struct v2_disk_dqblk)) {
- printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
- if (ret >= 0)
- ret = -ENOSPC;
- }
- else
- ret = 0;
- dqstats.writes++;
-
- return ret;
+ return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
}
-/* Free dquot entry in data block */
-static int free_dqentry(struct dquot *dquot, uint blk)
-{
- struct super_block *sb = dquot->dq_sb;
- int type = dquot->dq_type;
- struct v2_disk_dqdbheader *dh;
- dqbuf_t buf = getdqbuf();
- int ret = 0;
-
- if (!buf)
- return -ENOMEM;
- if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) {
- printk(KERN_ERR "VFS: Quota structure has offset to other "
- "block (%u) than it should (%u).\n", blk,
- (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS));
- goto out_buf;
- }
- if ((ret = read_blk(sb, type, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
- goto out_buf;
- }
- dh = (struct v2_disk_dqdbheader *)buf;
- le16_add_cpu(&dh->dqdh_entries, -1);
- if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
- if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 ||
- (ret = put_free_dqblk(sb, type, buf, blk)) < 0) {
- printk(KERN_ERR "VFS: Can't move quota data block (%u) "
- "to free list.\n", blk);
- goto out_buf;
- }
- }
- else {
- memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
- sizeof(struct v2_disk_dqblk));
- if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
- /* Insert will write block itself */
- if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
- printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
- goto out_buf;
- }
- }
- else
- if ((ret = write_blk(sb, type, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't write quota data "
- "block %u\n", blk);
- goto out_buf;
- }
- }
- dquot->dq_off = 0; /* Quota is now unattached */
-out_buf:
- freedqbuf(buf);
- return ret;
-}
-
-/* Remove reference to dquot from tree */
-static int remove_tree(struct dquot *dquot, uint *blk, int depth)
-{
- struct super_block *sb = dquot->dq_sb;
- int type = dquot->dq_type;
- dqbuf_t buf = getdqbuf();
- int ret = 0;
- uint newblk;
- __le32 *ref = (__le32 *)buf;
-
- if (!buf)
- return -ENOMEM;
- if ((ret = read_blk(sb, type, *blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
- goto out_buf;
- }
- newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
- if (depth == V2_DQTREEDEPTH-1) {
- ret = free_dqentry(dquot, newblk);
- newblk = 0;
- }
- else
- ret = remove_tree(dquot, &newblk, depth+1);
- if (ret >= 0 && !newblk) {
- int i;
- ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
- for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */
- /* Don't put the root block into the free block list */
- if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) {
- put_free_dqblk(sb, type, buf, *blk);
- *blk = 0;
- }
- else
- if ((ret = write_blk(sb, type, *blk, buf)) < 0)
- printk(KERN_ERR "VFS: Can't write quota tree "
- "block %u.\n", *blk);
- }
-out_buf:
- freedqbuf(buf);
- return ret;
-}
-
-/* Delete dquot from tree */
-static int v2_delete_dquot(struct dquot *dquot)
-{
- uint tmp = V2_DQTREEOFF;
-
- if (!dquot->dq_off) /* Even not allocated? */
- return 0;
- return remove_tree(dquot, &tmp, 0);
-}
-
-/* Find entry in block */
-static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
-{
- dqbuf_t buf = getdqbuf();
- loff_t ret = 0;
- int i;
- struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
-
- if (!buf)
- return -ENOMEM;
- if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
- goto out_buf;
- }
- if (dquot->dq_id)
- for (i = 0; i < V2_DQSTRINBLK &&
- le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
- else { /* ID 0 as a bit more complicated searching... */
- struct v2_disk_dqblk fakedquot;
-
- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
- for (i = 0; i < V2_DQSTRINBLK; i++)
- if (!le32_to_cpu(ddquot[i].dqb_id) &&
- memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
- break;
- }
- if (i == V2_DQSTRINBLK) {
- printk(KERN_ERR "VFS: Quota for id %u referenced "
- "but not present.\n", dquot->dq_id);
- ret = -EIO;
- goto out_buf;
- }
- else
- ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
- v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
-out_buf:
- freedqbuf(buf);
- return ret;
-}
-
-/* Find entry for given id in the tree */
-static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth)
-{
- dqbuf_t buf = getdqbuf();
- loff_t ret = 0;
- __le32 *ref = (__le32 *)buf;
-
- if (!buf)
- return -ENOMEM;
- if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
- goto out_buf;
- }
- ret = 0;
- blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
- if (!blk) /* No reference? */
- goto out_buf;
- if (depth < V2_DQTREEDEPTH-1)
- ret = find_tree_dqentry(dquot, blk, depth+1);
- else
- ret = find_block_dqentry(dquot, blk);
-out_buf:
- freedqbuf(buf);
- return ret;
-}
-
-/* Find entry for given id in the tree - wrapper function */
-static inline loff_t find_dqentry(struct dquot *dquot)
-{
- return find_tree_dqentry(dquot, V2_DQTREEOFF, 0);
-}
-
-static int v2_read_dquot(struct dquot *dquot)
+static int v2_release_dquot(struct dquot *dquot)
{
- int type = dquot->dq_type;
- loff_t offset;
- struct v2_disk_dqblk ddquot, empty;
- int ret = 0;
-
-#ifdef __QUOTA_V2_PARANOIA
- /* Invalidated quota? */
- if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) {
- printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
- return -EIO;
- }
-#endif
- offset = find_dqentry(dquot);
- if (offset <= 0) { /* Entry not present? */
- if (offset < 0)
- printk(KERN_ERR "VFS: Can't read quota "
- "structure for id %u.\n", dquot->dq_id);
- dquot->dq_off = 0;
- set_bit(DQ_FAKE_B, &dquot->dq_flags);
- memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
- ret = offset;
- }
- else {
- dquot->dq_off = offset;
- if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
- (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
- != sizeof(struct v2_disk_dqblk)) {
- if (ret >= 0)
- ret = -EIO;
- printk(KERN_ERR "VFS: Error while reading quota "
- "structure for id %u.\n", dquot->dq_id);
- memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
- }
- else {
- ret = 0;
- /* We need to escape back all-zero structure */
- memset(&empty, 0, sizeof(struct v2_disk_dqblk));
- empty.dqb_itime = cpu_to_le64(1);
- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
- ddquot.dqb_itime = 0;
- }
- disk2memdqb(&dquot->dq_dqb, &ddquot);
- if (!dquot->dq_dqb.dqb_bhardlimit &&
- !dquot->dq_dqb.dqb_bsoftlimit &&
- !dquot->dq_dqb.dqb_ihardlimit &&
- !dquot->dq_dqb.dqb_isoftlimit)
- set_bit(DQ_FAKE_B, &dquot->dq_flags);
- }
- dqstats.reads++;
-
- return ret;
+ return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
}
-/* Check whether dquot should not be deleted. We know we are
- * the only one operating on dquot (thanks to dq_lock) */
-static int v2_release_dquot(struct dquot *dquot)
+static int v2_free_file_info(struct super_block *sb, int type)
{
- if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
- return v2_delete_dquot(dquot);
+ kfree(sb_dqinfo(sb, type)->dqi_priv);
return 0;
}
@@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = {
.check_quota_file = v2_check_quota_file,
.read_file_info = v2_read_file_info,
.write_file_info = v2_write_file_info,
- .free_file_info = NULL,
+ .free_file_info = v2_free_file_info,
.read_dqblk = v2_read_dquot,
.commit_dqblk = v2_write_dquot,
.release_dqblk = v2_release_dquot,
diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h
new file mode 100644
index 00000000000..746654b5de7
--- /dev/null
+++ b/fs/quotaio_v1.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_QUOTAIO_V1_H
+#define _LINUX_QUOTAIO_V1_H
+
+#include <linux/types.h>
+
+/*
+ * The following constants define the amount of time given a user
+ * before the soft limits are treated as hard limits (usually resulting
+ * in an allocation failure). The timer is started when the user crosses
+ * their soft limit, it is reset when they go below their soft limit.
+ */
+#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
+#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
+
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is an array of these structures
+ * indexed by user or group number.
+ */
+struct v1_disk_dqblk {
+ __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */
+ __u32 dqb_bsoftlimit; /* preferred limit on disk blks */
+ __u32 dqb_curblocks; /* current block count */
+ __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */
+ __u32 dqb_isoftlimit; /* preferred inode limit */
+ __u32 dqb_curinodes; /* current # allocated inodes */
+ time_t dqb_btime; /* time limit for excessive disk use */
+ time_t dqb_itime; /* time limit for excessive inode use */
+};
+
+#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk)))
+
+#endif /* _LINUX_QUOTAIO_V1_H */
diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h
new file mode 100644
index 00000000000..530fe580685
--- /dev/null
+++ b/fs/quotaio_v2.h
@@ -0,0 +1,60 @@
+/*
+ * Definitions of structures for vfsv0 quota format
+ */
+
+#ifndef _LINUX_QUOTAIO_V2_H
+#define _LINUX_QUOTAIO_V2_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/*
+ * Definitions of magics and versions of current quota files
+ */
+#define V2_INITQMAGICS {\
+ 0xd9c01f11, /* USRQUOTA */\
+ 0xd9c01927 /* GRPQUOTA */\
+}
+
+#define V2_INITQVERSIONS {\
+ 0, /* USRQUOTA */\
+ 0 /* GRPQUOTA */\
+}
+
+/* First generic header */
+struct v2_disk_dqheader {
+ __le32 dqh_magic; /* Magic number identifying file */
+ __le32 dqh_version; /* File version */
+};
+
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is a radix tree whose leaves point
+ * to blocks of these structures.
+ */
+struct v2_disk_dqblk {
+ __le32 dqb_id; /* id this quota applies to */
+ __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */
+ __le32 dqb_isoftlimit; /* preferred inode limit */
+ __le32 dqb_curinodes; /* current # allocated inodes */
+ __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
+ __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
+ __le64 dqb_curspace; /* current space occupied (in bytes) */
+ __le64 dqb_btime; /* time limit for excessive disk use */
+ __le64 dqb_itime; /* time limit for excessive inode use */
+};
+
+/* Header with type and version specific information */
+struct v2_disk_dqinfo {
+ __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */
+ __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */
+ __le32 dqi_flags; /* Flags for quotafile (DQF_*) */
+ __le32 dqi_blocks; /* Number of blocks in file */
+ __le32 dqi_free_blk; /* Number of first free block in the list */
+ __le32 dqi_free_entry; /* Number of block with at least one free entry */
+};
+
+#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */
+#define V2_DQBLKSIZE_BITS 10 /* Size of leaf block in tree */
+
+#endif /* _LINUX_QUOTAIO_V2_H */
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 76acdbc3461..b9b567a2837 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -262,11 +262,11 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
ret = -ENOMEM;
pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
if (!pages)
- goto out;
+ goto out_free;
nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
if (nr != lpages)
- goto out; /* leave if some pages were missing */
+ goto out_free_pages; /* leave if some pages were missing */
/* check the pages for physical adjacency */
ptr = pages;
@@ -274,19 +274,18 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
page++;
for (loop = lpages; loop > 1; loop--)
if (*ptr++ != page++)
- goto out;
+ goto out_free_pages;
/* okay - all conditions fulfilled */
ret = (unsigned long) page_address(pages[0]);
- out:
- if (pages) {
- ptr = pages;
- for (loop = lpages; loop > 0; loop--)
- put_page(*ptr++);
- kfree(pages);
- }
-
+out_free_pages:
+ ptr = pages;
+ for (loop = nr; loop > 0; loop--)
+ put_page(*ptr++);
+out_free:
+ kfree(pages);
+out:
return ret;
}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a83a3518ae3..b7e6ac706b8 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_blocks = 0;
inode->i_mapping->a_ops = &ramfs_aops;
inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
diff --git a/fs/read_write.c b/fs/read_write.c
index 969a6d9c020..5cc6924eb15 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
offset += inode->i_size;
break;
case SEEK_CUR:
+ /*
+ * Here we special-case the lseek(fd, 0, SEEK_CUR)
+ * position-querying operation. Avoid rewriting the "same"
+ * f_pos value back to the file because a concurrent read(),
+ * write() or lseek() might have altered it
+ */
+ if (offset == 0)
+ return file->f_pos;
offset += file->f_pos;
break;
}
@@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
offset += i_size_read(file->f_path.dentry->d_inode);
break;
case SEEK_CUR:
+ if (offset == 0) {
+ retval = file->f_pos;
+ goto out;
+ }
offset += file->f_pos;
}
retval = -EINVAL;
@@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
}
retval = offset;
}
+out:
unlock_kernel();
return retval;
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 145c2d3e5e0..55fce92cdf1 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1782,6 +1782,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
goto out_bad_inode;
}
args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
+ if (old_format_only(sb))
+ make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
+ TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
+ else
+ make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
+ TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
if (insert_inode_locked4(inode, args.objectid,
@@ -1834,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
reiserfs_init_acl_default(inode);
reiserfs_init_xattr_rwsem(inode);
- if (old_format_only(sb))
- make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
- TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
- else
- make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
- TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
-
/* key to search for correct place for new stat data */
_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
@@ -2561,7 +2560,7 @@ static int reiserfs_write_begin(struct file *file,
}
index = pos >> PAGE_CACHE_SHIFT;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 663a91f5dce..c55651f1407 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -649,6 +649,8 @@ static struct dquot_operations reiserfs_quota_operations = {
.release_dquot = reiserfs_release_dquot,
.mark_dirty = reiserfs_mark_dquot_dirty,
.write_info = reiserfs_write_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
};
static struct quotactl_ops reiserfs_qctl_operations = {
@@ -994,8 +996,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
if (c == 'u' || c == 'g') {
int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
- if ((sb_any_quota_enabled(s) ||
- sb_any_quota_suspended(s)) &&
+ if (sb_any_quota_loaded(s) &&
(!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
reiserfs_warning(s,
"reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1041,8 +1042,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
"reiserfs_parse_options: unknown quota format specified.");
return 0;
}
- if ((sb_any_quota_enabled(s) ||
- sb_any_quota_suspended(s)) &&
+ if (sb_any_quota_loaded(s) &&
*qfmt != REISERFS_SB(s)->s_jquota_fmt) {
reiserfs_warning(s,
"reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1067,7 +1067,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
}
/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
if (!(*mount_options & (1 << REISERFS_QUOTA))
- && sb_any_quota_enabled(s)) {
+ && sb_any_quota_loaded(s)) {
reiserfs_warning(s,
"reiserfs_parse_options: quota options must be present when quota is turned on.");
return 0;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 60d2f822e87..98a232f7196 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] =
static struct inode *
romfs_iget(struct super_block *sb, unsigned long ino)
{
- int nextfh;
+ int nextfh, ret;
struct romfs_inode ri;
struct inode *i;
@@ -524,14 +524,13 @@ romfs_iget(struct super_block *sb, unsigned long ino)
i->i_size = be32_to_cpu(ri.size);
i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
- i->i_uid = i->i_gid = 0;
/* Precalculate the data offset */
- ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN);
- if (ino >= 0)
- ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK);
- else
- ino = 0;
+ ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
+ if (ret >= 0)
+ ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
+ else
+ ino = 0;
ROMFS_I(i)->i_metasize = ino;
ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
diff --git a/fs/select.c b/fs/select.c
index 87df51eadcf..08b91beed80 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
void poll_initwait(struct poll_wqueues *pwq)
{
init_poll_funcptr(&pwq->pt, __pollwait);
+ pwq->polling_task = current;
pwq->error = 0;
pwq->table = NULL;
pwq->inline_index = 0;
}
-
EXPORT_SYMBOL(poll_initwait);
static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq)
free_page((unsigned long) old);
}
}
-
EXPORT_SYMBOL(poll_freewait);
-static struct poll_table_entry *poll_get_entry(poll_table *_p)
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
- struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
struct poll_table_page *table = p->table;
if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
if (!new_table) {
p->error = -ENOMEM;
- __set_current_state(TASK_RUNNING);
return NULL;
}
new_table->entry = new_table->entries;
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
return table->entry++;
}
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct poll_wqueues *pwq = wait->private;
+ DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+ /*
+ * Although this function is called under waitqueue lock, LOCK
+ * doesn't imply write barrier and the users expect write
+ * barrier semantics on wakeup functions. The following
+ * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+ * and is paired with set_mb() in poll_schedule_timeout.
+ */
+ smp_wmb();
+ pwq->triggered = 1;
+
+ /*
+ * Perform the default wake up operation using a dummy
+ * waitqueue.
+ *
+ * TODO: This is hacky but there currently is no interface to
+ * pass in @sync. @sync is scheduled to be removed and once
+ * that happens, wake_up_process() can be used directly.
+ */
+ return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p)
{
- struct poll_table_entry *entry = poll_get_entry(p);
+ struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+ struct poll_table_entry *entry = poll_get_entry(pwq);
if (!entry)
return;
get_file(filp);
entry->filp = filp;
entry->wait_address = wait_address;
- init_waitqueue_entry(&entry->wait, current);
+ init_waitqueue_func_entry(&entry->wait, pollwake);
+ entry->wait.private = pwq;
add_wait_queue(wait_address, &entry->wait);
}
+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+ ktime_t *expires, unsigned long slack)
+{
+ int rc = -EINTR;
+
+ set_current_state(state);
+ if (!pwq->triggered)
+ rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * Prepare for the next iteration.
+ *
+ * The following set_mb() serves two purposes. First, it's
+ * the counterpart rmb of the wmb in pollwake() such that data
+ * written before wake up is always visible after wake up.
+ * Second, the full barrier guarantees that triggered clearing
+ * doesn't pass event check of the next iteration. Note that
+ * this problem doesn't exist for the first iteration as
+ * add_wait_queue() has full barrier semantics.
+ */
+ set_mb(pwq->triggered, 0);
+
+ return rc;
+}
+EXPORT_SYMBOL(poll_schedule_timeout);
+
/**
* poll_select_set_timeout - helper function to setup the timeout value
* @to: pointer to timespec variable for the final timeout
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
- set_current_state(TASK_INTERRUPTIBLE);
-
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
to = &expire;
}
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+ to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);
poll_freewait(&table);
@@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
for (;;) {
struct poll_list *walk;
- set_current_state(TASK_INTERRUPTIBLE);
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
@@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
to = &expire;
}
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
- __set_current_state(TASK_RUNNING);
return count;
}
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index e4f8d51a555..92d5e8ffb63 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -297,7 +297,7 @@ static int smb_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- *pagep = __grab_cache_page(mapping, index);
+ *pagep = grab_cache_page_write_begin(mapping, index, flags);
if (!*pagep)
return -ENOMEM;
return 0;
diff --git a/fs/splice.c b/fs/splice.c
index 1abab5cee4b..a54b3e3f10a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,6 +21,7 @@
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
+#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
new file mode 100644
index 00000000000..8258cf9a031
--- /dev/null
+++ b/fs/squashfs/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the linux squashfs routines.
+#
+
+obj-$(CONFIG_SQUASHFS) += squashfs.o
+squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
+squashfs-y += namei.o super.o symlink.o
+#squashfs-y += squashfs2_0.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
new file mode 100644
index 00000000000..c837dfc2b3c
--- /dev/null
+++ b/fs/squashfs/block.c
@@ -0,0 +1,274 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * block.c
+ */
+
+/*
+ * This file implements the low-level routines to read and decompress
+ * datablocks and metadata blocks.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Read the metadata block length, this is stored in the first two
+ * bytes of the metadata block.
+ */
+static struct buffer_head *get_block_length(struct super_block *sb,
+ u64 *cur_index, int *offset, int *length)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ struct buffer_head *bh;
+
+ bh = sb_bread(sb, *cur_index);
+ if (bh == NULL)
+ return NULL;
+
+ if (msblk->devblksize - *offset == 1) {
+ *length = (unsigned char) bh->b_data[*offset];
+ put_bh(bh);
+ bh = sb_bread(sb, ++(*cur_index));
+ if (bh == NULL)
+ return NULL;
+ *length |= (unsigned char) bh->b_data[0] << 8;
+ *offset = 1;
+ } else {
+ *length = (unsigned char) bh->b_data[*offset] |
+ (unsigned char) bh->b_data[*offset + 1] << 8;
+ *offset += 2;
+ }
+
+ return bh;
+}
+
+
+/*
+ * Read and decompress a metadata block or datablock. Length is non-zero
+ * if a datablock is being read (the size is stored elsewhere in the
+ * filesystem), otherwise the length is obtained from the first two bytes of
+ * the metadata block. A bit in the length field indicates if the block
+ * is stored uncompressed in the filesystem (usually because compression
+ * generated a larger block - this does occasionally happen with zlib).
+ */
+int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
+ int length, u64 *next_index, int srclength)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ struct buffer_head **bh;
+ int offset = index & ((1 << msblk->devblksize_log2) - 1);
+ u64 cur_index = index >> msblk->devblksize_log2;
+ int bytes, compressed, b = 0, k = 0, page = 0, avail;
+
+
+ bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
+ sizeof(*bh), GFP_KERNEL);
+ if (bh == NULL)
+ return -ENOMEM;
+
+ if (length) {
+ /*
+ * Datablock.
+ */
+ bytes = -offset;
+ compressed = SQUASHFS_COMPRESSED_BLOCK(length);
+ length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
+ if (next_index)
+ *next_index = index + length;
+
+ TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
+ index, compressed ? "" : "un", length, srclength);
+
+ if (length < 0 || length > srclength ||
+ (index + length) > msblk->bytes_used)
+ goto read_failure;
+
+ for (b = 0; bytes < length; b++, cur_index++) {
+ bh[b] = sb_getblk(sb, cur_index);
+ if (bh[b] == NULL)
+ goto block_release;
+ bytes += msblk->devblksize;
+ }
+ ll_rw_block(READ, b, bh);
+ } else {
+ /*
+ * Metadata block.
+ */
+ if ((index + 2) > msblk->bytes_used)
+ goto read_failure;
+
+ bh[0] = get_block_length(sb, &cur_index, &offset, &length);
+ if (bh[0] == NULL)
+ goto read_failure;
+ b = 1;
+
+ bytes = msblk->devblksize - offset;
+ compressed = SQUASHFS_COMPRESSED(length);
+ length = SQUASHFS_COMPRESSED_SIZE(length);
+ if (next_index)
+ *next_index = index + length + 2;
+
+ TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
+ compressed ? "" : "un", length);
+
+ if (length < 0 || length > srclength ||
+ (index + length) > msblk->bytes_used)
+ goto block_release;
+
+ for (; bytes < length; b++) {
+ bh[b] = sb_getblk(sb, ++cur_index);
+ if (bh[b] == NULL)
+ goto block_release;
+ bytes += msblk->devblksize;
+ }
+ ll_rw_block(READ, b - 1, bh + 1);
+ }
+
+ if (compressed) {
+ int zlib_err = 0, zlib_init = 0;
+
+ /*
+ * Uncompress block.
+ */
+
+ mutex_lock(&msblk->read_data_mutex);
+
+ msblk->stream.avail_out = 0;
+ msblk->stream.avail_in = 0;
+
+ bytes = length;
+ do {
+ if (msblk->stream.avail_in == 0 && k < b) {
+ avail = min(bytes, msblk->devblksize - offset);
+ bytes -= avail;
+ wait_on_buffer(bh[k]);
+ if (!buffer_uptodate(bh[k]))
+ goto release_mutex;
+
+ if (avail == 0) {
+ offset = 0;
+ put_bh(bh[k++]);
+ continue;
+ }
+
+ msblk->stream.next_in = bh[k]->b_data + offset;
+ msblk->stream.avail_in = avail;
+ offset = 0;
+ }
+
+ if (msblk->stream.avail_out == 0) {
+ msblk->stream.next_out = buffer[page++];
+ msblk->stream.avail_out = PAGE_CACHE_SIZE;
+ }
+
+ if (!zlib_init) {
+ zlib_err = zlib_inflateInit(&msblk->stream);
+ if (zlib_err != Z_OK) {
+ ERROR("zlib_inflateInit returned"
+ " unexpected result 0x%x,"
+ " srclength %d\n", zlib_err,
+ srclength);
+ goto release_mutex;
+ }
+ zlib_init = 1;
+ }
+
+ zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH);
+
+ if (msblk->stream.avail_in == 0 && k < b)
+ put_bh(bh[k++]);
+ } while (zlib_err == Z_OK);
+
+ if (zlib_err != Z_STREAM_END) {
+ ERROR("zlib_inflate returned unexpected result"
+ " 0x%x, srclength %d, avail_in %d,"
+ " avail_out %d\n", zlib_err, srclength,
+ msblk->stream.avail_in,
+ msblk->stream.avail_out);
+ goto release_mutex;
+ }
+
+ zlib_err = zlib_inflateEnd(&msblk->stream);
+ if (zlib_err != Z_OK) {
+ ERROR("zlib_inflateEnd returned unexpected result 0x%x,"
+ " srclength %d\n", zlib_err, srclength);
+ goto release_mutex;
+ }
+ length = msblk->stream.total_out;
+ mutex_unlock(&msblk->read_data_mutex);
+ } else {
+ /*
+ * Block is uncompressed.
+ */
+ int i, in, pg_offset = 0;
+
+ for (i = 0; i < b; i++) {
+ wait_on_buffer(bh[i]);
+ if (!buffer_uptodate(bh[i]))
+ goto block_release;
+ }
+
+ for (bytes = length; k < b; k++) {
+ in = min(bytes, msblk->devblksize - offset);
+ bytes -= in;
+ while (in) {
+ if (pg_offset == PAGE_CACHE_SIZE) {
+ page++;
+ pg_offset = 0;
+ }
+ avail = min_t(int, in, PAGE_CACHE_SIZE -
+ pg_offset);
+ memcpy(buffer[page] + pg_offset,
+ bh[k]->b_data + offset, avail);
+ in -= avail;
+ pg_offset += avail;
+ offset += avail;
+ }
+ offset = 0;
+ put_bh(bh[k]);
+ }
+ }
+
+ kfree(bh);
+ return length;
+
+release_mutex:
+ mutex_unlock(&msblk->read_data_mutex);
+
+block_release:
+ for (; k < b; k++)
+ put_bh(bh[k]);
+
+read_failure:
+ ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
+ kfree(bh);
+ return -EIO;
+}
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
new file mode 100644
index 00000000000..f29eda16d25
--- /dev/null
+++ b/fs/squashfs/cache.c
@@ -0,0 +1,412 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * cache.c
+ */
+
+/*
+ * Blocks in Squashfs are compressed. To avoid repeatedly decompressing
+ * recently accessed data Squashfs uses two small metadata and fragment caches.
+ *
+ * This file implements a generic cache implementation used for both caches,
+ * plus functions layered ontop of the generic cache implementation to
+ * access the metadata and fragment caches.
+ *
+ * To avoid out of memory and fragmentation isssues with vmalloc the cache
+ * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
+ *
+ * It should be noted that the cache is not used for file datablocks, these
+ * are decompressed and cached in the page-cache in the normal way. The
+ * cache is only used to temporarily cache fragment and metadata blocks
+ * which have been read as as a result of a metadata (i.e. inode or
+ * directory) or fragment access. Because metadata and fragments are packed
+ * together into blocks (to gain greater compression) the read of a particular
+ * piece of metadata or fragment will retrieve other metadata/fragments which
+ * have been packed with it, these because of locality-of-reference may be read
+ * in the near future. Temporarily caching them ensures they are available for
+ * near future access without requiring an additional read and decompress.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/zlib.h>
+#include <linux/pagemap.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Look-up block in cache, and increment usage count. If not in cache, read
+ * and decompress it from disk.
+ */
+struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
+ struct squashfs_cache *cache, u64 block, int length)
+{
+ int i, n;
+ struct squashfs_cache_entry *entry;
+
+ spin_lock(&cache->lock);
+
+ while (1) {
+ for (i = 0; i < cache->entries; i++)
+ if (cache->entry[i].block == block)
+ break;
+
+ if (i == cache->entries) {
+ /*
+ * Block not in cache, if all cache entries are used
+ * go to sleep waiting for one to become available.
+ */
+ if (cache->unused == 0) {
+ cache->num_waiters++;
+ spin_unlock(&cache->lock);
+ wait_event(cache->wait_queue, cache->unused);
+ spin_lock(&cache->lock);
+ cache->num_waiters--;
+ continue;
+ }
+
+ /*
+ * At least one unused cache entry. A simple
+ * round-robin strategy is used to choose the entry to
+ * be evicted from the cache.
+ */
+ i = cache->next_blk;
+ for (n = 0; n < cache->entries; n++) {
+ if (cache->entry[i].refcount == 0)
+ break;
+ i = (i + 1) % cache->entries;
+ }
+
+ cache->next_blk = (i + 1) % cache->entries;
+ entry = &cache->entry[i];
+
+ /*
+ * Initialise choosen cache entry, and fill it in from
+ * disk.
+ */
+ cache->unused--;
+ entry->block = block;
+ entry->refcount = 1;
+ entry->pending = 1;
+ entry->num_waiters = 0;
+ entry->error = 0;
+ spin_unlock(&cache->lock);
+
+ entry->length = squashfs_read_data(sb, entry->data,
+ block, length, &entry->next_index,
+ cache->block_size);
+
+ spin_lock(&cache->lock);
+
+ if (entry->length < 0)
+ entry->error = entry->length;
+
+ entry->pending = 0;
+
+ /*
+ * While filling this entry one or more other processes
+ * have looked it up in the cache, and have slept
+ * waiting for it to become available.
+ */
+ if (entry->num_waiters) {
+ spin_unlock(&cache->lock);
+ wake_up_all(&entry->wait_queue);
+ } else
+ spin_unlock(&cache->lock);
+
+ goto out;
+ }
+
+ /*
+ * Block already in cache. Increment refcount so it doesn't
+ * get reused until we're finished with it, if it was
+ * previously unused there's one less cache entry available
+ * for reuse.
+ */
+ entry = &cache->entry[i];
+ if (entry->refcount == 0)
+ cache->unused--;
+ entry->refcount++;
+
+ /*
+ * If the entry is currently being filled in by another process
+ * go to sleep waiting for it to become available.
+ */
+ if (entry->pending) {
+ entry->num_waiters++;
+ spin_unlock(&cache->lock);
+ wait_event(entry->wait_queue, !entry->pending);
+ } else
+ spin_unlock(&cache->lock);
+
+ goto out;
+ }
+
+out:
+ TRACE("Got %s %d, start block %lld, refcount %d, error %d\n",
+ cache->name, i, entry->block, entry->refcount, entry->error);
+
+ if (entry->error)
+ ERROR("Unable to read %s cache entry [%llx]\n", cache->name,
+ block);
+ return entry;
+}
+
+
+/*
+ * Release cache entry, once usage count is zero it can be reused.
+ */
+void squashfs_cache_put(struct squashfs_cache_entry *entry)
+{
+ struct squashfs_cache *cache = entry->cache;
+
+ spin_lock(&cache->lock);
+ entry->refcount--;
+ if (entry->refcount == 0) {
+ cache->unused++;
+ /*
+ * If there's any processes waiting for a block to become
+ * available, wake one up.
+ */
+ if (cache->num_waiters) {
+ spin_unlock(&cache->lock);
+ wake_up(&cache->wait_queue);
+ return;
+ }
+ }
+ spin_unlock(&cache->lock);
+}
+
+/*
+ * Delete cache reclaiming all kmalloced buffers.
+ */
+void squashfs_cache_delete(struct squashfs_cache *cache)
+{
+ int i, j;
+
+ if (cache == NULL)
+ return;
+
+ for (i = 0; i < cache->entries; i++) {
+ if (cache->entry[i].data) {
+ for (j = 0; j < cache->pages; j++)
+ kfree(cache->entry[i].data[j]);
+ kfree(cache->entry[i].data);
+ }
+ }
+
+ kfree(cache->entry);
+ kfree(cache);
+}
+
+
+/*
+ * Initialise cache allocating the specified number of entries, each of
+ * size block_size. To avoid vmalloc fragmentation issues each entry
+ * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers.
+ */
+struct squashfs_cache *squashfs_cache_init(char *name, int entries,
+ int block_size)
+{
+ int i, j;
+ struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+
+ if (cache == NULL) {
+ ERROR("Failed to allocate %s cache\n", name);
+ return NULL;
+ }
+
+ cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL);
+ if (cache->entry == NULL) {
+ ERROR("Failed to allocate %s cache\n", name);
+ goto cleanup;
+ }
+
+ cache->next_blk = 0;
+ cache->unused = entries;
+ cache->entries = entries;
+ cache->block_size = block_size;
+ cache->pages = block_size >> PAGE_CACHE_SHIFT;
+ cache->name = name;
+ cache->num_waiters = 0;
+ spin_lock_init(&cache->lock);
+ init_waitqueue_head(&cache->wait_queue);
+
+ for (i = 0; i < entries; i++) {
+ struct squashfs_cache_entry *entry = &cache->entry[i];
+
+ init_waitqueue_head(&cache->entry[i].wait_queue);
+ entry->cache = cache;
+ entry->block = SQUASHFS_INVALID_BLK;
+ entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL);
+ if (entry->data == NULL) {
+ ERROR("Failed to allocate %s cache entry\n", name);
+ goto cleanup;
+ }
+
+ for (j = 0; j < cache->pages; j++) {
+ entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ if (entry->data[j] == NULL) {
+ ERROR("Failed to allocate %s buffer\n", name);
+ goto cleanup;
+ }
+ }
+ }
+
+ return cache;
+
+cleanup:
+ squashfs_cache_delete(cache);
+ return NULL;
+}
+
+
+/*
+ * Copy upto length bytes from cache entry to buffer starting at offset bytes
+ * into the cache entry. If there's not length bytes then copy the number of
+ * bytes available. In all cases return the number of bytes copied.
+ */
+int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
+ int offset, int length)
+{
+ int remaining = length;
+
+ if (length == 0)
+ return 0;
+ else if (buffer == NULL)
+ return min(length, entry->length - offset);
+
+ while (offset < entry->length) {
+ void *buff = entry->data[offset / PAGE_CACHE_SIZE]
+ + (offset % PAGE_CACHE_SIZE);
+ int bytes = min_t(int, entry->length - offset,
+ PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE));
+
+ if (bytes >= remaining) {
+ memcpy(buffer, buff, remaining);
+ remaining = 0;
+ break;
+ }
+
+ memcpy(buffer, buff, bytes);
+ buffer += bytes;
+ remaining -= bytes;
+ offset += bytes;
+ }
+
+ return length - remaining;
+}
+
+
+/*
+ * Read length bytes from metadata position <block, offset> (block is the
+ * start of the compressed block on disk, and offset is the offset into
+ * the block once decompressed). Data is packed into consecutive blocks,
+ * and length bytes may require reading more than one block.
+ */
+int squashfs_read_metadata(struct super_block *sb, void *buffer,
+ u64 *block, int *offset, int length)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ int bytes, copied = length;
+ struct squashfs_cache_entry *entry;
+
+ TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
+
+ while (length) {
+ entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
+ if (entry->error)
+ return entry->error;
+ else if (*offset >= entry->length)
+ return -EIO;
+
+ bytes = squashfs_copy_data(buffer, entry, *offset, length);
+ if (buffer)
+ buffer += bytes;
+ length -= bytes;
+ *offset += bytes;
+
+ if (*offset == entry->length) {
+ *block = entry->next_index;
+ *offset = 0;
+ }
+
+ squashfs_cache_put(entry);
+ }
+
+ return copied;
+}
+
+
+/*
+ * Look-up in the fragmment cache the fragment located at <start_block> in the
+ * filesystem. If necessary read and decompress it from disk.
+ */
+struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb,
+ u64 start_block, int length)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+
+ return squashfs_cache_get(sb, msblk->fragment_cache, start_block,
+ length);
+}
+
+
+/*
+ * Read and decompress the datablock located at <start_block> in the
+ * filesystem. The cache is used here to avoid duplicating locking and
+ * read/decompress code.
+ */
+struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
+ u64 start_block, int length)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+
+ return squashfs_cache_get(sb, msblk->read_page, start_block, length);
+}
+
+
+/*
+ * Read a filesystem table (uncompressed sequence of bytes) from disk
+ */
+int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
+ int length)
+{
+ int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int i, res;
+ void **data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
+ if (data == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
+ data[i] = buffer;
+ res = squashfs_read_data(sb, data, block, length |
+ SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length);
+ kfree(data);
+ return res;
+}
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
new file mode 100644
index 00000000000..566b0eaed86
--- /dev/null
+++ b/fs/squashfs/dir.c
@@ -0,0 +1,235 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * dir.c
+ */
+
+/*
+ * This file implements code to read directories from disk.
+ *
+ * See namei.c for a description of directory organisation on disk.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+static const unsigned char squashfs_filetype_table[] = {
+ DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK
+};
+
+/*
+ * Lookup offset (f_pos) in the directory index, returning the
+ * metadata block containing it.
+ *
+ * If we get an error reading the index then return the part of the index
+ * (if any) we have managed to read - the index isn't essential, just
+ * quicker.
+ */
+static int get_dir_index_using_offset(struct super_block *sb,
+ u64 *next_block, int *next_offset, u64 index_start, int index_offset,
+ int i_count, u64 f_pos)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ int err, i, index, length = 0;
+ struct squashfs_dir_index dir_index;
+
+ TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n",
+ i_count, f_pos);
+
+ /*
+ * Translate from external f_pos to the internal f_pos. This
+ * is offset by 3 because we invent "." and ".." entries which are
+ * not actually stored in the directory.
+ */
+ if (f_pos < 3)
+ return f_pos;
+ f_pos -= 3;
+
+ for (i = 0; i < i_count; i++) {
+ err = squashfs_read_metadata(sb, &dir_index, &index_start,
+ &index_offset, sizeof(dir_index));
+ if (err < 0)
+ break;
+
+ index = le32_to_cpu(dir_index.index);
+ if (index > f_pos)
+ /*
+ * Found the index we're looking for.
+ */
+ break;
+
+ err = squashfs_read_metadata(sb, NULL, &index_start,
+ &index_offset, le32_to_cpu(dir_index.size) + 1);
+ if (err < 0)
+ break;
+
+ length = index;
+ *next_block = le32_to_cpu(dir_index.start_block) +
+ msblk->directory_table;
+ }
+
+ *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
+
+ /*
+ * Translate back from internal f_pos to external f_pos.
+ */
+ return length + 3;
+}
+
+
+static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ u64 block = squashfs_i(inode)->start + msblk->directory_table;
+ int offset = squashfs_i(inode)->offset, length = 0, dir_count, size,
+ type, err;
+ unsigned int inode_number;
+ struct squashfs_dir_header dirh;
+ struct squashfs_dir_entry *dire;
+
+ TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset);
+
+ dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
+ if (dire == NULL) {
+ ERROR("Failed to allocate squashfs_dir_entry\n");
+ goto finish;
+ }
+
+ /*
+ * Return "." and ".." entries as the first two filenames in the
+ * directory. To maximise compression these two entries are not
+ * stored in the directory, and so we invent them here.
+ *
+ * It also means that the external f_pos is offset by 3 from the
+ * on-disk directory f_pos.
+ */
+ while (file->f_pos < 3) {
+ char *name;
+ int i_ino;
+
+ if (file->f_pos == 0) {
+ name = ".";
+ size = 1;
+ i_ino = inode->i_ino;
+ } else {
+ name = "..";
+ size = 2;
+ i_ino = squashfs_i(inode)->parent;
+ }
+
+ TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
+ dirent, name, size, file->f_pos, i_ino,
+ squashfs_filetype_table[1]);
+
+ if (filldir(dirent, name, size, file->f_pos, i_ino,
+ squashfs_filetype_table[1]) < 0) {
+ TRACE("Filldir returned less than 0\n");
+ goto finish;
+ }
+
+ file->f_pos += size;
+ }
+
+ length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
+ squashfs_i(inode)->dir_idx_start,
+ squashfs_i(inode)->dir_idx_offset,
+ squashfs_i(inode)->dir_idx_cnt,
+ file->f_pos);
+
+ while (length < i_size_read(inode)) {
+ /*
+ * Read directory header
+ */
+ err = squashfs_read_metadata(inode->i_sb, &dirh, &block,
+ &offset, sizeof(dirh));
+ if (err < 0)
+ goto failed_read;
+
+ length += sizeof(dirh);
+
+ dir_count = le32_to_cpu(dirh.count) + 1;
+ while (dir_count--) {
+ /*
+ * Read directory entry.
+ */
+ err = squashfs_read_metadata(inode->i_sb, dire, &block,
+ &offset, sizeof(*dire));
+ if (err < 0)
+ goto failed_read;
+
+ size = le16_to_cpu(dire->size) + 1;
+
+ err = squashfs_read_metadata(inode->i_sb, dire->name,
+ &block, &offset, size);
+ if (err < 0)
+ goto failed_read;
+
+ length += sizeof(*dire) + size;
+
+ if (file->f_pos >= length)
+ continue;
+
+ dire->name[size] = '\0';
+ inode_number = le32_to_cpu(dirh.inode_number) +
+ ((short) le16_to_cpu(dire->inode_number));
+ type = le16_to_cpu(dire->type);
+
+ TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
+ "\n", dirent, dire->name, size,
+ file->f_pos,
+ le32_to_cpu(dirh.start_block),
+ le16_to_cpu(dire->offset),
+ inode_number,
+ squashfs_filetype_table[type]);
+
+ if (filldir(dirent, dire->name, size, file->f_pos,
+ inode_number,
+ squashfs_filetype_table[type]) < 0) {
+ TRACE("Filldir returned less than 0\n");
+ goto finish;
+ }
+
+ file->f_pos = length;
+ }
+ }
+
+finish:
+ kfree(dire);
+ return 0;
+
+failed_read:
+ ERROR("Unable to read directory block [%llx:%x]\n", block, offset);
+ kfree(dire);
+ return 0;
+}
+
+
+const struct file_operations squashfs_dir_ops = {
+ .read = generic_read_dir,
+ .readdir = squashfs_readdir
+};
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
new file mode 100644
index 00000000000..69e971d5ddc
--- /dev/null
+++ b/fs/squashfs/export.c
@@ -0,0 +1,155 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * export.c
+ */
+
+/*
+ * This file implements code to make Squashfs filesystems exportable (NFS etc.)
+ *
+ * The export code uses an inode lookup table to map inode numbers passed in
+ * filehandles to an inode location on disk. This table is stored compressed
+ * into metadata blocks. A second index table is used to locate these. This
+ * second index table for speed of access (and because it is small) is read at
+ * mount time and cached in memory.
+ *
+ * The inode lookup table is used only by the export code, inode disk
+ * locations are directly encoded in directories, enabling direct access
+ * without an intermediate lookup for all operations except the export ops.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/dcache.h>
+#include <linux/exportfs.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Look-up inode number (ino) in table, returning the inode location.
+ */
+static long long squashfs_inode_lookup(struct super_block *sb, int ino_num)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1);
+ int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1);
+ u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]);
+ __le64 ino;
+ int err;
+
+ TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num);
+
+ err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino));
+ if (err < 0)
+ return err;
+
+ TRACE("squashfs_inode_lookup, inode = 0x%llx\n",
+ (u64) le64_to_cpu(ino));
+
+ return le64_to_cpu(ino);
+}
+
+
+static struct dentry *squashfs_export_iget(struct super_block *sb,
+ unsigned int ino_num)
+{
+ long long ino;
+ struct dentry *dentry = ERR_PTR(-ENOENT);
+
+ TRACE("Entered squashfs_export_iget\n");
+
+ ino = squashfs_inode_lookup(sb, ino_num);
+ if (ino >= 0)
+ dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num));
+
+ return dentry;
+}
+
+
+static struct dentry *squashfs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT)
+ || fh_len < 2)
+ return NULL;
+
+ return squashfs_export_iget(sb, fid->i32.ino);
+}
+
+
+static struct dentry *squashfs_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4)
+ return NULL;
+
+ return squashfs_export_iget(sb, fid->i32.parent_ino);
+}
+
+
+static struct dentry *squashfs_get_parent(struct dentry *child)
+{
+ struct inode *inode = child->d_inode;
+ unsigned int parent_ino = squashfs_i(inode)->parent;
+
+ return squashfs_export_iget(inode->i_sb, parent_ino);
+}
+
+
+/*
+ * Read uncompressed inode lookup table indexes off disk into memory
+ */
+__le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
+ u64 lookup_table_start, unsigned int inodes)
+{
+ unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
+ __le64 *inode_lookup_table;
+ int err;
+
+ TRACE("In read_inode_lookup_table, length %d\n", length);
+
+ /* Allocate inode lookup table indexes */
+ inode_lookup_table = kmalloc(length, GFP_KERNEL);
+ if (inode_lookup_table == NULL) {
+ ERROR("Failed to allocate inode lookup table\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ err = squashfs_read_table(sb, inode_lookup_table, lookup_table_start,
+ length);
+ if (err < 0) {
+ ERROR("unable to read inode lookup table\n");
+ kfree(inode_lookup_table);
+ return ERR_PTR(err);
+ }
+
+ return inode_lookup_table;
+}
+
+
+const struct export_operations squashfs_export_ops = {
+ .fh_to_dentry = squashfs_fh_to_dentry,
+ .fh_to_parent = squashfs_fh_to_parent,
+ .get_parent = squashfs_get_parent
+};
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
new file mode 100644
index 00000000000..717767d831d
--- /dev/null
+++ b/fs/squashfs/file.c
@@ -0,0 +1,502 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * file.c
+ */
+
+/*
+ * This file contains code for handling regular files. A regular file
+ * consists of a sequence of contiguous compressed blocks, and/or a
+ * compressed fragment block (tail-end packed block). The compressed size
+ * of each datablock is stored in a block list contained within the
+ * file inode (itself stored in one or more compressed metadata blocks).
+ *
+ * To speed up access to datablocks when reading 'large' files (256 Mbytes or
+ * larger), the code implements an index cache that caches the mapping from
+ * block index to datablock location on disk.
+ *
+ * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while
+ * retaining a simple and space-efficient block list on disk. The cache
+ * is split into slots, caching up to eight 224 GiB files (128 KiB blocks).
+ * Larger files use multiple slots, with 1.75 TiB files using all 8 slots.
+ * The index cache is designed to be memory efficient, and by default uses
+ * 16 KiB.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Locate cache slot in range [offset, index] for specified inode. If
+ * there's more than one return the slot closest to index.
+ */
+static struct meta_index *locate_meta_index(struct inode *inode, int offset,
+ int index)
+{
+ struct meta_index *meta = NULL;
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ int i;
+
+ mutex_lock(&msblk->meta_index_mutex);
+
+ TRACE("locate_meta_index: index %d, offset %d\n", index, offset);
+
+ if (msblk->meta_index == NULL)
+ goto not_allocated;
+
+ for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
+ if (msblk->meta_index[i].inode_number == inode->i_ino &&
+ msblk->meta_index[i].offset >= offset &&
+ msblk->meta_index[i].offset <= index &&
+ msblk->meta_index[i].locked == 0) {
+ TRACE("locate_meta_index: entry %d, offset %d\n", i,
+ msblk->meta_index[i].offset);
+ meta = &msblk->meta_index[i];
+ offset = meta->offset;
+ }
+ }
+
+ if (meta)
+ meta->locked = 1;
+
+not_allocated:
+ mutex_unlock(&msblk->meta_index_mutex);
+
+ return meta;
+}
+
+
+/*
+ * Find and initialise an empty cache slot for index offset.
+ */
+static struct meta_index *empty_meta_index(struct inode *inode, int offset,
+ int skip)
+{
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ struct meta_index *meta = NULL;
+ int i;
+
+ mutex_lock(&msblk->meta_index_mutex);
+
+ TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip);
+
+ if (msblk->meta_index == NULL) {
+ /*
+ * First time cache index has been used, allocate and
+ * initialise. The cache index could be allocated at
+ * mount time but doing it here means it is allocated only
+ * if a 'large' file is read.
+ */
+ msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS,
+ sizeof(*(msblk->meta_index)), GFP_KERNEL);
+ if (msblk->meta_index == NULL) {
+ ERROR("Failed to allocate meta_index\n");
+ goto failed;
+ }
+ for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
+ msblk->meta_index[i].inode_number = 0;
+ msblk->meta_index[i].locked = 0;
+ }
+ msblk->next_meta_index = 0;
+ }
+
+ for (i = SQUASHFS_META_SLOTS; i &&
+ msblk->meta_index[msblk->next_meta_index].locked; i--)
+ msblk->next_meta_index = (msblk->next_meta_index + 1) %
+ SQUASHFS_META_SLOTS;
+
+ if (i == 0) {
+ TRACE("empty_meta_index: failed!\n");
+ goto failed;
+ }
+
+ TRACE("empty_meta_index: returned meta entry %d, %p\n",
+ msblk->next_meta_index,
+ &msblk->meta_index[msblk->next_meta_index]);
+
+ meta = &msblk->meta_index[msblk->next_meta_index];
+ msblk->next_meta_index = (msblk->next_meta_index + 1) %
+ SQUASHFS_META_SLOTS;
+
+ meta->inode_number = inode->i_ino;
+ meta->offset = offset;
+ meta->skip = skip;
+ meta->entries = 0;
+ meta->locked = 1;
+
+failed:
+ mutex_unlock(&msblk->meta_index_mutex);
+ return meta;
+}
+
+
+static void release_meta_index(struct inode *inode, struct meta_index *meta)
+{
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ mutex_lock(&msblk->meta_index_mutex);
+ meta->locked = 0;
+ mutex_unlock(&msblk->meta_index_mutex);
+}
+
+
+/*
+ * Read the next n blocks from the block list, starting from
+ * metadata block <start_block, offset>.
+ */
+static long long read_indexes(struct super_block *sb, int n,
+ u64 *start_block, int *offset)
+{
+ int err, i;
+ long long block = 0;
+ __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+
+ if (blist == NULL) {
+ ERROR("read_indexes: Failed to allocate block_list\n");
+ return -ENOMEM;
+ }
+
+ while (n) {
+ int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2);
+
+ err = squashfs_read_metadata(sb, blist, start_block,
+ offset, blocks << 2);
+ if (err < 0) {
+ ERROR("read_indexes: reading block [%llx:%x]\n",
+ *start_block, *offset);
+ goto failure;
+ }
+
+ for (i = 0; i < blocks; i++) {
+ int size = le32_to_cpu(blist[i]);
+ block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size);
+ }
+ n -= blocks;
+ }
+
+ kfree(blist);
+ return block;
+
+failure:
+ kfree(blist);
+ return err;
+}
+
+
+/*
+ * Each cache index slot has SQUASHFS_META_ENTRIES, each of which
+ * can cache one index -> datablock/blocklist-block mapping. We wish
+ * to distribute these over the length of the file, entry[0] maps index x,
+ * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on.
+ * The larger the file, the greater the skip factor. The skip factor is
+ * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure
+ * the number of metadata blocks that need to be read fits into the cache.
+ * If the skip factor is limited in this way then the file will use multiple
+ * slots.
+ */
+static inline int calculate_skip(int blocks)
+{
+ int skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
+ * SQUASHFS_META_INDEXES);
+ return min(SQUASHFS_CACHED_BLKS - 1, skip + 1);
+}
+
+
+/*
+ * Search and grow the index cache for the specified inode, returning the
+ * on-disk locations of the datablock and block list metadata block
+ * <index_block, index_offset> for index (scaled to nearest cache index).
+ */
+static int fill_meta_index(struct inode *inode, int index,
+ u64 *index_block, int *index_offset, u64 *data_block)
+{
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ int skip = calculate_skip(i_size_read(inode) >> msblk->block_log);
+ int offset = 0;
+ struct meta_index *meta;
+ struct meta_entry *meta_entry;
+ u64 cur_index_block = squashfs_i(inode)->block_list_start;
+ int cur_offset = squashfs_i(inode)->offset;
+ u64 cur_data_block = squashfs_i(inode)->start;
+ int err, i;
+
+ /*
+ * Scale index to cache index (cache slot entry)
+ */
+ index /= SQUASHFS_META_INDEXES * skip;
+
+ while (offset < index) {
+ meta = locate_meta_index(inode, offset + 1, index);
+
+ if (meta == NULL) {
+ meta = empty_meta_index(inode, offset + 1, skip);
+ if (meta == NULL)
+ goto all_done;
+ } else {
+ offset = index < meta->offset + meta->entries ? index :
+ meta->offset + meta->entries - 1;
+ meta_entry = &meta->meta_entry[offset - meta->offset];
+ cur_index_block = meta_entry->index_block +
+ msblk->inode_table;
+ cur_offset = meta_entry->offset;
+ cur_data_block = meta_entry->data_block;
+ TRACE("get_meta_index: offset %d, meta->offset %d, "
+ "meta->entries %d\n", offset, meta->offset,
+ meta->entries);
+ TRACE("get_meta_index: index_block 0x%llx, offset 0x%x"
+ " data_block 0x%llx\n", cur_index_block,
+ cur_offset, cur_data_block);
+ }
+
+ /*
+ * If necessary grow cache slot by reading block list. Cache
+ * slot is extended up to index or to the end of the slot, in
+ * which case further slots will be used.
+ */
+ for (i = meta->offset + meta->entries; i <= index &&
+ i < meta->offset + SQUASHFS_META_ENTRIES; i++) {
+ int blocks = skip * SQUASHFS_META_INDEXES;
+ long long res = read_indexes(inode->i_sb, blocks,
+ &cur_index_block, &cur_offset);
+
+ if (res < 0) {
+ if (meta->entries == 0)
+ /*
+ * Don't leave an empty slot on read
+ * error allocated to this inode...
+ */
+ meta->inode_number = 0;
+ err = res;
+ goto failed;
+ }
+
+ cur_data_block += res;
+ meta_entry = &meta->meta_entry[i - meta->offset];
+ meta_entry->index_block = cur_index_block -
+ msblk->inode_table;
+ meta_entry->offset = cur_offset;
+ meta_entry->data_block = cur_data_block;
+ meta->entries++;
+ offset++;
+ }
+
+ TRACE("get_meta_index: meta->offset %d, meta->entries %d\n",
+ meta->offset, meta->entries);
+
+ release_meta_index(inode, meta);
+ }
+
+all_done:
+ *index_block = cur_index_block;
+ *index_offset = cur_offset;
+ *data_block = cur_data_block;
+
+ /*
+ * Scale cache index (cache slot entry) to index
+ */
+ return offset * SQUASHFS_META_INDEXES * skip;
+
+failed:
+ release_meta_index(inode, meta);
+ return err;
+}
+
+
+/*
+ * Get the on-disk location and compressed size of the datablock
+ * specified by index. Fill_meta_index() does most of the work.
+ */
+static int read_blocklist(struct inode *inode, int index, u64 *block)
+{
+ u64 start;
+ long long blks;
+ int offset;
+ __le32 size;
+ int res = fill_meta_index(inode, index, &start, &offset, block);
+
+ TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset"
+ " 0x%x, block 0x%llx\n", res, index, start, offset,
+ *block);
+
+ if (res < 0)
+ return res;
+
+ /*
+ * res contains the index of the mapping returned by fill_meta_index(),
+ * this will likely be less than the desired index (because the
+ * meta_index cache works at a higher granularity). Read any
+ * extra block indexes needed.
+ */
+ if (res < index) {
+ blks = read_indexes(inode->i_sb, index - res, &start, &offset);
+ if (blks < 0)
+ return (int) blks;
+ *block += blks;
+ }
+
+ /*
+ * Read length of block specified by index.
+ */
+ res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset,
+ sizeof(size));
+ if (res < 0)
+ return res;
+ return le32_to_cpu(size);
+}
+
+
+static int squashfs_readpage(struct file *file, struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ int bytes, i, offset = 0, sparse = 0;
+ struct squashfs_cache_entry *buffer = NULL;
+ void *pageaddr;
+
+ int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+ int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
+ int start_index = page->index & ~mask;
+ int end_index = start_index | mask;
+ int file_end = i_size_read(inode) >> msblk->block_log;
+
+ TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
+ page->index, squashfs_i(inode)->start);
+
+ if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT))
+ goto out;
+
+ if (index < file_end || squashfs_i(inode)->fragment_block ==
+ SQUASHFS_INVALID_BLK) {
+ /*
+ * Reading a datablock from disk. Need to read block list
+ * to get location and block size.
+ */
+ u64 block = 0;
+ int bsize = read_blocklist(inode, index, &block);
+ if (bsize < 0)
+ goto error_out;
+
+ if (bsize == 0) { /* hole */
+ bytes = index == file_end ?
+ (i_size_read(inode) & (msblk->block_size - 1)) :
+ msblk->block_size;
+ sparse = 1;
+ } else {
+ /*
+ * Read and decompress datablock.
+ */
+ buffer = squashfs_get_datablock(inode->i_sb,
+ block, bsize);
+ if (buffer->error) {
+ ERROR("Unable to read page, block %llx, size %x"
+ "\n", block, bsize);
+ squashfs_cache_put(buffer);
+ goto error_out;
+ }
+ bytes = buffer->length;
+ }
+ } else {
+ /*
+ * Datablock is stored inside a fragment (tail-end packed
+ * block).
+ */
+ buffer = squashfs_get_fragment(inode->i_sb,
+ squashfs_i(inode)->fragment_block,
+ squashfs_i(inode)->fragment_size);
+
+ if (buffer->error) {
+ ERROR("Unable to read page, block %llx, size %x\n",
+ squashfs_i(inode)->fragment_block,
+ squashfs_i(inode)->fragment_size);
+ squashfs_cache_put(buffer);
+ goto error_out;
+ }
+ bytes = i_size_read(inode) & (msblk->block_size - 1);
+ offset = squashfs_i(inode)->fragment_offset;
+ }
+
+ /*
+ * Loop copying datablock into pages. As the datablock likely covers
+ * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
+ * grab the pages from the page cache, except for the page that we've
+ * been called to fill.
+ */
+ for (i = start_index; i <= end_index && bytes > 0; i++,
+ bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
+ struct page *push_page;
+ int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);
+
+ TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
+
+ push_page = (i == page->index) ? page :
+ grab_cache_page_nowait(page->mapping, i);
+
+ if (!push_page)
+ continue;
+
+ if (PageUptodate(push_page))
+ goto skip_page;
+
+ pageaddr = kmap_atomic(push_page, KM_USER0);
+ squashfs_copy_data(pageaddr, buffer, offset, avail);
+ memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+ kunmap_atomic(pageaddr, KM_USER0);
+ flush_dcache_page(push_page);
+ SetPageUptodate(push_page);
+skip_page:
+ unlock_page(push_page);
+ if (i != page->index)
+ page_cache_release(push_page);
+ }
+
+ if (!sparse)
+ squashfs_cache_put(buffer);
+
+ return 0;
+
+error_out:
+ SetPageError(page);
+out:
+ pageaddr = kmap_atomic(page, KM_USER0);
+ memset(pageaddr, 0, PAGE_CACHE_SIZE);
+ kunmap_atomic(pageaddr, KM_USER0);
+ flush_dcache_page(page);
+ if (!PageError(page))
+ SetPageUptodate(page);
+ unlock_page(page);
+
+ return 0;
+}
+
+
+const struct address_space_operations squashfs_aops = {
+ .readpage = squashfs_readpage
+};
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
new file mode 100644
index 00000000000..b5a2c15bbbc
--- /dev/null
+++ b/fs/squashfs/fragment.c
@@ -0,0 +1,98 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * fragment.c
+ */
+
+/*
+ * This file implements code to handle compressed fragments (tail-end packed
+ * datablocks).
+ *
+ * Regular files contain a fragment index which is mapped to a fragment
+ * location on disk and compressed size using a fragment lookup table.
+ * Like everything in Squashfs this fragment lookup table is itself stored
+ * compressed into metadata blocks. A second index table is used to locate
+ * these. This second index table for speed of access (and because it
+ * is small) is read at mount time and cached in memory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Look-up fragment using the fragment index table. Return the on disk
+ * location of the fragment and its compressed size
+ */
+int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment,
+ u64 *fragment_block)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ int block = SQUASHFS_FRAGMENT_INDEX(fragment);
+ int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment);
+ u64 start_block = le64_to_cpu(msblk->fragment_index[block]);
+ struct squashfs_fragment_entry fragment_entry;
+ int size;
+
+ size = squashfs_read_metadata(sb, &fragment_entry, &start_block,
+ &offset, sizeof(fragment_entry));
+ if (size < 0)
+ return size;
+
+ *fragment_block = le64_to_cpu(fragment_entry.start_block);
+ size = le32_to_cpu(fragment_entry.size);
+
+ return size;
+}
+
+
+/*
+ * Read the uncompressed fragment lookup table indexes off disk into memory
+ */
+__le64 *squashfs_read_fragment_index_table(struct super_block *sb,
+ u64 fragment_table_start, unsigned int fragments)
+{
+ unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments);
+ __le64 *fragment_index;
+ int err;
+
+ /* Allocate fragment lookup table indexes */
+ fragment_index = kmalloc(length, GFP_KERNEL);
+ if (fragment_index == NULL) {
+ ERROR("Failed to allocate fragment index table\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ err = squashfs_read_table(sb, fragment_index, fragment_table_start,
+ length);
+ if (err < 0) {
+ ERROR("unable to read fragment index table\n");
+ kfree(fragment_index);
+ return ERR_PTR(err);
+ }
+
+ return fragment_index;
+}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
new file mode 100644
index 00000000000..3795b837ba2
--- /dev/null
+++ b/fs/squashfs/id.c
@@ -0,0 +1,94 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * id.c
+ */
+
+/*
+ * This file implements code to handle uids and gids.
+ *
+ * For space efficiency regular files store uid and gid indexes, which are
+ * converted to 32-bit uids/gids using an id look up table. This table is
+ * stored compressed into metadata blocks. A second index table is used to
+ * locate these. This second index table for speed of access (and because it
+ * is small) is read at mount time and cached in memory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Map uid/gid index into real 32-bit uid/gid using the id look up table
+ */
+int squashfs_get_id(struct super_block *sb, unsigned int index,
+ unsigned int *id)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ int block = SQUASHFS_ID_BLOCK(index);
+ int offset = SQUASHFS_ID_BLOCK_OFFSET(index);
+ u64 start_block = le64_to_cpu(msblk->id_table[block]);
+ __le32 disk_id;
+ int err;
+
+ err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset,
+ sizeof(disk_id));
+ if (err < 0)
+ return err;
+
+ *id = le32_to_cpu(disk_id);
+ return 0;
+}
+
+
+/*
+ * Read uncompressed id lookup table indexes from disk into memory
+ */
+__le64 *squashfs_read_id_index_table(struct super_block *sb,
+ u64 id_table_start, unsigned short no_ids)
+{
+ unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
+ __le64 *id_table;
+ int err;
+
+ TRACE("In read_id_index_table, length %d\n", length);
+
+ /* Allocate id lookup table indexes */
+ id_table = kmalloc(length, GFP_KERNEL);
+ if (id_table == NULL) {
+ ERROR("Failed to allocate id index table\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ err = squashfs_read_table(sb, id_table, id_table_start, length);
+ if (err < 0) {
+ ERROR("unable to read id index table\n");
+ kfree(id_table);
+ return ERR_PTR(err);
+ }
+
+ return id_table;
+}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
new file mode 100644
index 00000000000..7a63398bb85
--- /dev/null
+++ b/fs/squashfs/inode.c
@@ -0,0 +1,346 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * inode.c
+ */
+
+/*
+ * This file implements code to create and read inodes from disk.
+ *
+ * Inodes in Squashfs are identified by a 48-bit inode which encodes the
+ * location of the compressed metadata block containing the inode, and the byte
+ * offset into that block where the inode is placed (<block, offset>).
+ *
+ * To maximise compression there are different inodes for each file type
+ * (regular file, directory, device, etc.), the inode contents and length
+ * varying with the type.
+ *
+ * To further maximise compression, two types of regular file inode and
+ * directory inode are defined: inodes optimised for frequently occurring
+ * regular files and directories, and extended types where extra
+ * information has to be stored.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Initialise VFS inode with the base inode information common to all
+ * Squashfs inode types. Sqsh_ino contains the unswapped base inode
+ * off disk.
+ */
+static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
+ struct squashfs_base_inode *sqsh_ino)
+{
+ int err;
+
+ err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid);
+ if (err)
+ return err;
+
+ err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid);
+ if (err)
+ return err;
+
+ inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
+ inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
+ inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
+ inode->i_ctime.tv_sec = inode->i_mtime.tv_sec;
+ inode->i_mode = le16_to_cpu(sqsh_ino->mode);
+ inode->i_size = 0;
+
+ return err;
+}
+
+
+struct inode *squashfs_iget(struct super_block *sb, long long ino,
+ unsigned int ino_number)
+{
+ struct inode *inode = iget_locked(sb, ino_number);
+ int err;
+
+ TRACE("Entered squashfs_iget\n");
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ err = squashfs_read_inode(inode, ino);
+ if (err) {
+ iget_failed(inode);
+ return ERR_PTR(err);
+ }
+
+ unlock_new_inode(inode);
+ return inode;
+}
+
+
+/*
+ * Initialise VFS inode by reading inode from inode table (compressed
+ * metadata). The format and amount of data read depends on type.
+ */
+int squashfs_read_inode(struct inode *inode, long long ino)
+{
+ struct super_block *sb = inode->i_sb;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
+ int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
+ union squashfs_inode squashfs_ino;
+ struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
+
+ TRACE("Entered squashfs_read_inode\n");
+
+ /*
+ * Read inode base common to all inode types.
+ */
+ err = squashfs_read_metadata(sb, sqshb_ino, &block,
+ &offset, sizeof(*sqshb_ino));
+ if (err < 0)
+ goto failed_read;
+
+ err = squashfs_new_inode(sb, inode, sqshb_ino);
+ if (err)
+ goto failed_read;
+
+ block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
+ offset = SQUASHFS_INODE_OFFSET(ino);
+
+ type = le16_to_cpu(sqshb_ino->inode_type);
+ switch (type) {
+ case SQUASHFS_REG_TYPE: {
+ unsigned int frag_offset, frag_size, frag;
+ u64 frag_blk;
+ struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
+
+ err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+ sizeof(*sqsh_ino));
+ if (err < 0)
+ goto failed_read;
+
+ frag = le32_to_cpu(sqsh_ino->fragment);
+ if (frag != SQUASHFS_INVALID_FRAG) {
+ frag_offset = le32_to_cpu(sqsh_ino->offset);
+ frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
+ if (frag_size < 0) {
+ err = frag_size;
+ goto failed_read;
+ }
+ } else {
+ frag_blk = SQUASHFS_INVALID_BLK;
+ frag_size = 0;
+ frag_offset = 0;
+ }
+
+ inode->i_nlink = 1;
+ inode->i_size = le32_to_cpu(sqsh_ino->file_size);
+ inode->i_fop = &generic_ro_fops;
+ inode->i_mode |= S_IFREG;
+ inode->i_blocks = ((inode->i_size - 1) >> 9) + 1;
+ squashfs_i(inode)->fragment_block = frag_blk;
+ squashfs_i(inode)->fragment_size = frag_size;
+ squashfs_i(inode)->fragment_offset = frag_offset;
+ squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+ squashfs_i(inode)->block_list_start = block;
+ squashfs_i(inode)->offset = offset;
+ inode->i_data.a_ops = &squashfs_aops;
+
+ TRACE("File inode %x:%x, start_block %llx, block_list_start "
+ "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
+ offset, squashfs_i(inode)->start, block, offset);
+ break;
+ }
+ case SQUASHFS_LREG_TYPE: {
+ unsigned int frag_offset, frag_size, frag;
+ u64 frag_blk;
+ struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
+
+ err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+ sizeof(*sqsh_ino));
+ if (err < 0)
+ goto failed_read;
+
+ frag = le32_to_cpu(sqsh_ino->fragment);
+ if (frag != SQUASHFS_INVALID_FRAG) {
+ frag_offset = le32_to_cpu(sqsh_ino->offset);
+ frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
+ if (frag_size < 0) {
+ err = frag_size;
+ goto failed_read;
+ }
+ } else {
+ frag_blk = SQUASHFS_INVALID_BLK;
+ frag_size = 0;
+ frag_offset = 0;
+ }
+
+ inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ inode->i_size = le64_to_cpu(sqsh_ino->file_size);
+ inode->i_fop = &generic_ro_fops;
+ inode->i_mode |= S_IFREG;
+ inode->i_blocks = ((inode->i_size -
+ le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1;
+
+ squashfs_i(inode)->fragment_block = frag_blk;
+ squashfs_i(inode)->fragment_size = frag_size;
+ squashfs_i(inode)->fragment_offset = frag_offset;
+ squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block);
+ squashfs_i(inode)->block_list_start = block;
+ squashfs_i(inode)->offset = offset;
+ inode->i_data.a_ops = &squashfs_aops;
+
+ TRACE("File inode %x:%x, start_block %llx, block_list_start "
+ "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
+ offset, squashfs_i(inode)->start, block, offset);
+ break;
+ }
+ case SQUASHFS_DIR_TYPE: {
+ struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir;
+
+ err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+ sizeof(*sqsh_ino));
+ if (err < 0)
+ goto failed_read;
+
+ inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ inode->i_size = le16_to_cpu(sqsh_ino->file_size);
+ inode->i_op = &squashfs_dir_inode_ops;
+ inode->i_fop = &squashfs_dir_ops;
+ inode->i_mode |= S_IFDIR;
+ squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+ squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
+ squashfs_i(inode)->dir_idx_cnt = 0;
+ squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
+
+ TRACE("Directory inode %x:%x, start_block %llx, offset %x\n",
+ SQUASHFS_INODE_BLK(ino), offset,
+ squashfs_i(inode)->start,
+ le16_to_cpu(sqsh_ino->offset));
+ break;
+ }
+ case SQUASHFS_LDIR_TYPE: {
+ struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir;
+
+ err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+ sizeof(*sqsh_ino));
+ if (err < 0)
+ goto failed_read;
+
+ inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ inode->i_size = le32_to_cpu(sqsh_ino->file_size);
+ inode->i_op = &squashfs_dir_inode_ops;
+ inode->i_fop = &squashfs_dir_ops;
+ inode->i_mode |= S_IFDIR;
+ squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+ squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
+ squashfs_i(inode)->dir_idx_start = block;
+ squashfs_i(inode)->dir_idx_offset = offset;
+ squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count);
+ squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
+
+ TRACE("Long directory inode %x:%x, start_block %llx, offset "
+ "%x\n", SQUASHFS_INODE_BLK(ino), offset,
+ squashfs_i(inode)->start,
+ le16_to_cpu(sqsh_ino->offset));
+ break;
+ }
+ case SQUASHFS_SYMLINK_TYPE:
+ case SQUASHFS_LSYMLINK_TYPE: {
+ struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink;
+
+ err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+ sizeof(*sqsh_ino));
+ if (err < 0)
+ goto failed_read;
+
+ inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
+ inode->i_op = &page_symlink_inode_operations;
+ inode->i_data.a_ops = &squashfs_symlink_aops;
+ inode->i_mode |= S_IFLNK;
+ squashfs_i(inode)->start = block;
+ squashfs_i(inode)->offset = offset;
+
+ TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
+ "%x\n", SQUASHFS_INODE_BLK(ino), offset,
+ block, offset);
+ break;
+ }
+ case SQUASHFS_BLKDEV_TYPE:
+ case SQUASHFS_CHRDEV_TYPE:
+ case SQUASHFS_LBLKDEV_TYPE:
+ case SQUASHFS_LCHRDEV_TYPE: {
+ struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
+ unsigned int rdev;
+
+ err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+ sizeof(*sqsh_ino));
+ if (err < 0)
+ goto failed_read;
+
+ if (type == SQUASHFS_CHRDEV_TYPE)
+ inode->i_mode |= S_IFCHR;
+ else
+ inode->i_mode |= S_IFBLK;
+ inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ rdev = le32_to_cpu(sqsh_ino->rdev);
+ init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+
+ TRACE("Device inode %x:%x, rdev %x\n",
+ SQUASHFS_INODE_BLK(ino), offset, rdev);
+ break;
+ }
+ case SQUASHFS_FIFO_TYPE:
+ case SQUASHFS_SOCKET_TYPE:
+ case SQUASHFS_LFIFO_TYPE:
+ case SQUASHFS_LSOCKET_TYPE: {
+ struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
+
+ err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+ sizeof(*sqsh_ino));
+ if (err < 0)
+ goto failed_read;
+
+ if (type == SQUASHFS_FIFO_TYPE)
+ inode->i_mode |= S_IFIFO;
+ else
+ inode->i_mode |= S_IFSOCK;
+ inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ init_special_inode(inode, inode->i_mode, 0);
+ break;
+ }
+ default:
+ ERROR("Unknown inode type %d in squashfs_iget!\n", type);
+ return -EINVAL;
+ }
+
+ return 0;
+
+failed_read:
+ ERROR("Unable to read inode 0x%llx\n", ino);
+ return err;
+}
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
new file mode 100644
index 00000000000..9e398653b22
--- /dev/null
+++ b/fs/squashfs/namei.c
@@ -0,0 +1,242 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * namei.c
+ */
+
+/*
+ * This file implements code to do filename lookup in directories.
+ *
+ * Like inodes, directories are packed into compressed metadata blocks, stored
+ * in a directory table. Directories are accessed using the start address of
+ * the metablock containing the directory and the offset into the
+ * decompressed block (<block, offset>).
+ *
+ * Directories are organised in a slightly complex way, and are not simply
+ * a list of file names. The organisation takes advantage of the
+ * fact that (in most cases) the inodes of the files will be in the same
+ * compressed metadata block, and therefore, can share the start block.
+ * Directories are therefore organised in a two level list, a directory
+ * header containing the shared start block value, and a sequence of directory
+ * entries, each of which share the shared start block. A new directory header
+ * is written once/if the inode start block changes. The directory
+ * header/directory entry list is repeated as many times as necessary.
+ *
+ * Directories are sorted, and can contain a directory index to speed up
+ * file lookup. Directory indexes store one entry per metablock, each entry
+ * storing the index/filename mapping to the first directory header
+ * in each metadata block. Directories are sorted in alphabetical order,
+ * and at lookup the index is scanned linearly looking for the first filename
+ * alphabetically larger than the filename being looked up. At this point the
+ * location of the metadata block the filename is in has been found.
+ * The general idea of the index is ensure only one metadata block needs to be
+ * decompressed to do a lookup irrespective of the length of the directory.
+ * This scheme has the advantage that it doesn't require extra memory overhead
+ * and doesn't require much extra storage on disk.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/dcache.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Lookup name in the directory index, returning the location of the metadata
+ * block containing it, and the directory index this represents.
+ *
+ * If we get an error reading the index then return the part of the index
+ * (if any) we have managed to read - the index isn't essential, just
+ * quicker.
+ */
+static int get_dir_index_using_name(struct super_block *sb,
+ u64 *next_block, int *next_offset, u64 index_start,
+ int index_offset, int i_count, const char *name,
+ int len)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ int i, size, length = 0, err;
+ struct squashfs_dir_index *index;
+ char *str;
+
+ TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count);
+
+ index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL);
+ if (index == NULL) {
+ ERROR("Failed to allocate squashfs_dir_index\n");
+ goto out;
+ }
+
+ str = &index->name[SQUASHFS_NAME_LEN + 1];
+ strncpy(str, name, len);
+ str[len] = '\0';
+
+ for (i = 0; i < i_count; i++) {
+ err = squashfs_read_metadata(sb, index, &index_start,
+ &index_offset, sizeof(*index));
+ if (err < 0)
+ break;
+
+
+ size = le32_to_cpu(index->size) + 1;
+
+ err = squashfs_read_metadata(sb, index->name, &index_start,
+ &index_offset, size);
+ if (err < 0)
+ break;
+
+ index->name[size] = '\0';
+
+ if (strcmp(index->name, str) > 0)
+ break;
+
+ length = le32_to_cpu(index->index);
+ *next_block = le32_to_cpu(index->start_block) +
+ msblk->directory_table;
+ }
+
+ *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
+ kfree(index);
+
+out:
+ /*
+ * Return index (f_pos) of the looked up metadata block. Translate
+ * from internal f_pos to external f_pos which is offset by 3 because
+ * we invent "." and ".." entries which are not actually stored in the
+ * directory.
+ */
+ return length + 3;
+}
+
+
+static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ const unsigned char *name = dentry->d_name.name;
+ int len = dentry->d_name.len;
+ struct inode *inode = NULL;
+ struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info;
+ struct squashfs_dir_header dirh;
+ struct squashfs_dir_entry *dire;
+ u64 block = squashfs_i(dir)->start + msblk->directory_table;
+ int offset = squashfs_i(dir)->offset;
+ int err, length = 0, dir_count, size;
+
+ TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset);
+
+ dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
+ if (dire == NULL) {
+ ERROR("Failed to allocate squashfs_dir_entry\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (len > SQUASHFS_NAME_LEN) {
+ err = -ENAMETOOLONG;
+ goto failed;
+ }
+
+ length = get_dir_index_using_name(dir->i_sb, &block, &offset,
+ squashfs_i(dir)->dir_idx_start,
+ squashfs_i(dir)->dir_idx_offset,
+ squashfs_i(dir)->dir_idx_cnt, name, len);
+
+ while (length < i_size_read(dir)) {
+ /*
+ * Read directory header.
+ */
+ err = squashfs_read_metadata(dir->i_sb, &dirh, &block,
+ &offset, sizeof(dirh));
+ if (err < 0)
+ goto read_failure;
+
+ length += sizeof(dirh);
+
+ dir_count = le32_to_cpu(dirh.count) + 1;
+ while (dir_count--) {
+ /*
+ * Read directory entry.
+ */
+ err = squashfs_read_metadata(dir->i_sb, dire, &block,
+ &offset, sizeof(*dire));
+ if (err < 0)
+ goto read_failure;
+
+ size = le16_to_cpu(dire->size) + 1;
+
+ err = squashfs_read_metadata(dir->i_sb, dire->name,
+ &block, &offset, size);
+ if (err < 0)
+ goto read_failure;
+
+ length += sizeof(*dire) + size;
+
+ if (name[0] < dire->name[0])
+ goto exit_lookup;
+
+ if (len == size && !strncmp(name, dire->name, len)) {
+ unsigned int blk, off, ino_num;
+ long long ino;
+ blk = le32_to_cpu(dirh.start_block);
+ off = le16_to_cpu(dire->offset);
+ ino_num = le32_to_cpu(dirh.inode_number) +
+ (short) le16_to_cpu(dire->inode_number);
+ ino = SQUASHFS_MKINODE(blk, off);
+
+ TRACE("calling squashfs_iget for directory "
+ "entry %s, inode %x:%x, %d\n", name,
+ blk, off, ino_num);
+
+ inode = squashfs_iget(dir->i_sb, ino, ino_num);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto failed;
+ }
+
+ goto exit_lookup;
+ }
+ }
+ }
+
+exit_lookup:
+ kfree(dire);
+ if (inode)
+ return d_splice_alias(inode, dentry);
+ d_add(dentry, inode);
+ return ERR_PTR(0);
+
+read_failure:
+ ERROR("Unable to read directory block [%llx:%x]\n",
+ squashfs_i(dir)->start + msblk->directory_table,
+ squashfs_i(dir)->offset);
+failed:
+ kfree(dire);
+ return ERR_PTR(err);
+}
+
+
+const struct inode_operations squashfs_dir_inode_ops = {
+ .lookup = squashfs_lookup
+};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
new file mode 100644
index 00000000000..6b2515d027d
--- /dev/null
+++ b/fs/squashfs/squashfs.h
@@ -0,0 +1,90 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs.h
+ */
+
+#define TRACE(s, args...) pr_debug("SQUASHFS: "s, ## args)
+
+#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args)
+
+#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
+
+static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
+{
+ return list_entry(inode, struct squashfs_inode_info, vfs_inode);
+}
+
+/* block.c */
+extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
+ int);
+
+/* cache.c */
+extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
+extern void squashfs_cache_delete(struct squashfs_cache *);
+extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *,
+ struct squashfs_cache *, u64, int);
+extern void squashfs_cache_put(struct squashfs_cache_entry *);
+extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int);
+extern int squashfs_read_metadata(struct super_block *, void *, u64 *,
+ int *, int);
+extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *,
+ u64, int);
+extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
+ u64, int);
+extern int squashfs_read_table(struct super_block *, void *, u64, int);
+
+/* export.c */
+extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
+ unsigned int);
+
+/* fragment.c */
+extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
+extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
+ u64, unsigned int);
+
+/* id.c */
+extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
+extern __le64 *squashfs_read_id_index_table(struct super_block *, u64,
+ unsigned short);
+
+/* inode.c */
+extern struct inode *squashfs_iget(struct super_block *, long long,
+ unsigned int);
+extern int squashfs_read_inode(struct inode *, long long);
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations squashfs_dir_ops;
+
+/* export.c */
+extern const struct export_operations squashfs_export_ops;
+
+/* file.c */
+extern const struct address_space_operations squashfs_aops;
+
+/* namei.c */
+extern const struct inode_operations squashfs_dir_inode_ops;
+
+/* symlink.c */
+extern const struct address_space_operations squashfs_symlink_aops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
new file mode 100644
index 00000000000..6840da1bf21
--- /dev/null
+++ b/fs/squashfs/squashfs_fs.h
@@ -0,0 +1,381 @@
+#ifndef SQUASHFS_FS
+#define SQUASHFS_FS
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs.h
+ */
+
+#define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE
+#define SQUASHFS_MAJOR 4
+#define SQUASHFS_MINOR 0
+#define SQUASHFS_MAGIC 0x73717368
+#define SQUASHFS_START 0
+
+/* size of metadata (inode and directory) blocks */
+#define SQUASHFS_METADATA_SIZE 8192
+#define SQUASHFS_METADATA_LOG 13
+
+/* default size of data blocks */
+#define SQUASHFS_FILE_SIZE 131072
+#define SQUASHFS_FILE_LOG 17
+
+#define SQUASHFS_FILE_MAX_SIZE 1048576
+#define SQUASHFS_FILE_MAX_LOG 20
+
+/* Max number of uids and gids */
+#define SQUASHFS_IDS 65536
+
+/* Max length of filename (not 255) */
+#define SQUASHFS_NAME_LEN 256
+
+#define SQUASHFS_INVALID_FRAG (0xffffffffU)
+#define SQUASHFS_INVALID_BLK (-1LL)
+
+/* Filesystem flags */
+#define SQUASHFS_NOI 0
+#define SQUASHFS_NOD 1
+#define SQUASHFS_NOF 3
+#define SQUASHFS_NO_FRAG 4
+#define SQUASHFS_ALWAYS_FRAG 5
+#define SQUASHFS_DUPLICATE 6
+#define SQUASHFS_EXPORT 7
+
+#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1)
+
+#define SQUASHFS_UNCOMPRESSED_INODES(flags) SQUASHFS_BIT(flags, \
+ SQUASHFS_NOI)
+
+#define SQUASHFS_UNCOMPRESSED_DATA(flags) SQUASHFS_BIT(flags, \
+ SQUASHFS_NOD)
+
+#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
+ SQUASHFS_NOF)
+
+#define SQUASHFS_NO_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
+ SQUASHFS_NO_FRAG)
+
+#define SQUASHFS_ALWAYS_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
+ SQUASHFS_ALWAYS_FRAG)
+
+#define SQUASHFS_DUPLICATES(flags) SQUASHFS_BIT(flags, \
+ SQUASHFS_DUPLICATE)
+
+#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \
+ SQUASHFS_EXPORT)
+
+/* Max number of types and file types */
+#define SQUASHFS_DIR_TYPE 1
+#define SQUASHFS_REG_TYPE 2
+#define SQUASHFS_SYMLINK_TYPE 3
+#define SQUASHFS_BLKDEV_TYPE 4
+#define SQUASHFS_CHRDEV_TYPE 5
+#define SQUASHFS_FIFO_TYPE 6
+#define SQUASHFS_SOCKET_TYPE 7
+#define SQUASHFS_LDIR_TYPE 8
+#define SQUASHFS_LREG_TYPE 9
+#define SQUASHFS_LSYMLINK_TYPE 10
+#define SQUASHFS_LBLKDEV_TYPE 11
+#define SQUASHFS_LCHRDEV_TYPE 12
+#define SQUASHFS_LFIFO_TYPE 13
+#define SQUASHFS_LSOCKET_TYPE 14
+
+/* Flag whether block is compressed or uncompressed, bit is set if block is
+ * uncompressed */
+#define SQUASHFS_COMPRESSED_BIT (1 << 15)
+
+#define SQUASHFS_COMPRESSED_SIZE(B) (((B) & ~SQUASHFS_COMPRESSED_BIT) ? \
+ (B) & ~SQUASHFS_COMPRESSED_BIT : SQUASHFS_COMPRESSED_BIT)
+
+#define SQUASHFS_COMPRESSED(B) (!((B) & SQUASHFS_COMPRESSED_BIT))
+
+#define SQUASHFS_COMPRESSED_BIT_BLOCK (1 << 24)
+
+#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B) ((B) & \
+ ~SQUASHFS_COMPRESSED_BIT_BLOCK)
+
+#define SQUASHFS_COMPRESSED_BLOCK(B) (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK))
+
+/*
+ * Inode number ops. Inodes consist of a compressed block number, and an
+ * uncompressed offset within that block
+ */
+#define SQUASHFS_INODE_BLK(A) ((unsigned int) ((A) >> 16))
+
+#define SQUASHFS_INODE_OFFSET(A) ((unsigned int) ((A) & 0xffff))
+
+#define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\
+ << 16) + (B)))
+
+/* Translate between VFS mode and squashfs mode */
+#define SQUASHFS_MODE(A) ((A) & 0xfff)
+
+/* fragment and fragment table defines */
+#define SQUASHFS_FRAGMENT_BYTES(A) \
+ ((A) * sizeof(struct squashfs_fragment_entry))
+
+#define SQUASHFS_FRAGMENT_INDEX(A) (SQUASHFS_FRAGMENT_BYTES(A) / \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A) (SQUASHFS_FRAGMENT_BYTES(A) % \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_FRAGMENT_INDEXES(A) ((SQUASHFS_FRAGMENT_BYTES(A) + \
+ SQUASHFS_METADATA_SIZE - 1) / \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_FRAGMENT_INDEX_BYTES(A) (SQUASHFS_FRAGMENT_INDEXES(A) *\
+ sizeof(u64))
+
+/* inode lookup table defines */
+#define SQUASHFS_LOOKUP_BYTES(A) ((A) * sizeof(u64))
+
+#define SQUASHFS_LOOKUP_BLOCK(A) (SQUASHFS_LOOKUP_BYTES(A) / \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A) (SQUASHFS_LOOKUP_BYTES(A) % \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_LOOKUP_BLOCKS(A) ((SQUASHFS_LOOKUP_BYTES(A) + \
+ SQUASHFS_METADATA_SIZE - 1) / \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_LOOKUP_BLOCK_BYTES(A) (SQUASHFS_LOOKUP_BLOCKS(A) *\
+ sizeof(u64))
+
+/* uid/gid lookup table defines */
+#define SQUASHFS_ID_BYTES(A) ((A) * sizeof(unsigned int))
+
+#define SQUASHFS_ID_BLOCK(A) (SQUASHFS_ID_BYTES(A) / \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_ID_BLOCK_OFFSET(A) (SQUASHFS_ID_BYTES(A) % \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_ID_BLOCKS(A) ((SQUASHFS_ID_BYTES(A) + \
+ SQUASHFS_METADATA_SIZE - 1) / \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
+ sizeof(u64))
+
+/* cached data constants for filesystem */
+#define SQUASHFS_CACHED_BLKS 8
+
+#define SQUASHFS_MAX_FILE_SIZE_LOG 64
+
+#define SQUASHFS_MAX_FILE_SIZE (1LL << \
+ (SQUASHFS_MAX_FILE_SIZE_LOG - 2))
+
+#define SQUASHFS_MARKER_BYTE 0xff
+
+/* meta index cache */
+#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
+#define SQUASHFS_META_ENTRIES 127
+#define SQUASHFS_META_SLOTS 8
+
+struct meta_entry {
+ u64 data_block;
+ unsigned int index_block;
+ unsigned short offset;
+ unsigned short pad;
+};
+
+struct meta_index {
+ unsigned int inode_number;
+ unsigned int offset;
+ unsigned short entries;
+ unsigned short skip;
+ unsigned short locked;
+ unsigned short pad;
+ struct meta_entry meta_entry[SQUASHFS_META_ENTRIES];
+};
+
+
+/*
+ * definitions for structures on disk
+ */
+#define ZLIB_COMPRESSION 1
+
+struct squashfs_super_block {
+ __le32 s_magic;
+ __le32 inodes;
+ __le32 mkfs_time;
+ __le32 block_size;
+ __le32 fragments;
+ __le16 compression;
+ __le16 block_log;
+ __le16 flags;
+ __le16 no_ids;
+ __le16 s_major;
+ __le16 s_minor;
+ __le64 root_inode;
+ __le64 bytes_used;
+ __le64 id_table_start;
+ __le64 xattr_table_start;
+ __le64 inode_table_start;
+ __le64 directory_table_start;
+ __le64 fragment_table_start;
+ __le64 lookup_table_start;
+};
+
+struct squashfs_dir_index {
+ __le32 index;
+ __le32 start_block;
+ __le32 size;
+ unsigned char name[0];
+};
+
+struct squashfs_base_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+};
+
+struct squashfs_ipc_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+ __le32 nlink;
+};
+
+struct squashfs_dev_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+ __le32 nlink;
+ __le32 rdev;
+};
+
+struct squashfs_symlink_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+ __le32 nlink;
+ __le32 symlink_size;
+ char symlink[0];
+};
+
+struct squashfs_reg_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+ __le32 start_block;
+ __le32 fragment;
+ __le32 offset;
+ __le32 file_size;
+ __le16 block_list[0];
+};
+
+struct squashfs_lreg_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+ __le64 start_block;
+ __le64 file_size;
+ __le64 sparse;
+ __le32 nlink;
+ __le32 fragment;
+ __le32 offset;
+ __le32 xattr;
+ __le16 block_list[0];
+};
+
+struct squashfs_dir_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+ __le32 start_block;
+ __le32 nlink;
+ __le16 file_size;
+ __le16 offset;
+ __le32 parent_inode;
+};
+
+struct squashfs_ldir_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+ __le32 nlink;
+ __le32 file_size;
+ __le32 start_block;
+ __le32 parent_inode;
+ __le16 i_count;
+ __le16 offset;
+ __le32 xattr;
+ struct squashfs_dir_index index[0];
+};
+
+union squashfs_inode {
+ struct squashfs_base_inode base;
+ struct squashfs_dev_inode dev;
+ struct squashfs_symlink_inode symlink;
+ struct squashfs_reg_inode reg;
+ struct squashfs_lreg_inode lreg;
+ struct squashfs_dir_inode dir;
+ struct squashfs_ldir_inode ldir;
+ struct squashfs_ipc_inode ipc;
+};
+
+struct squashfs_dir_entry {
+ __le16 offset;
+ __le16 inode_number;
+ __le16 type;
+ __le16 size;
+ char name[0];
+};
+
+struct squashfs_dir_header {
+ __le32 count;
+ __le32 start_block;
+ __le32 inode_number;
+};
+
+struct squashfs_fragment_entry {
+ __le64 start_block;
+ __le32 size;
+ unsigned int unused;
+};
+
+#endif
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
new file mode 100644
index 00000000000..fbfca30c0c6
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -0,0 +1,45 @@
+#ifndef SQUASHFS_FS_I
+#define SQUASHFS_FS_I
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs_i.h
+ */
+
+struct squashfs_inode_info {
+ u64 start;
+ int offset;
+ union {
+ struct {
+ u64 fragment_block;
+ int fragment_size;
+ int fragment_offset;
+ u64 block_list_start;
+ };
+ struct {
+ u64 dir_idx_start;
+ int dir_idx_offset;
+ int dir_idx_cnt;
+ int parent;
+ };
+ };
+ struct inode vfs_inode;
+};
+#endif
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
new file mode 100644
index 00000000000..c8c65614dd1
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -0,0 +1,76 @@
+#ifndef SQUASHFS_FS_SB
+#define SQUASHFS_FS_SB
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs_sb.h
+ */
+
+#include "squashfs_fs.h"
+
+struct squashfs_cache {
+ char *name;
+ int entries;
+ int next_blk;
+ int num_waiters;
+ int unused;
+ int block_size;
+ int pages;
+ spinlock_t lock;
+ wait_queue_head_t wait_queue;
+ struct squashfs_cache_entry *entry;
+};
+
+struct squashfs_cache_entry {
+ u64 block;
+ int length;
+ int refcount;
+ u64 next_index;
+ int pending;
+ int error;
+ int num_waiters;
+ wait_queue_head_t wait_queue;
+ struct squashfs_cache *cache;
+ void **data;
+};
+
+struct squashfs_sb_info {
+ int devblksize;
+ int devblksize_log2;
+ struct squashfs_cache *block_cache;
+ struct squashfs_cache *fragment_cache;
+ struct squashfs_cache *read_page;
+ int next_meta_index;
+ __le64 *id_table;
+ __le64 *fragment_index;
+ unsigned int *fragment_index_2;
+ struct mutex read_data_mutex;
+ struct mutex meta_index_mutex;
+ struct meta_index *meta_index;
+ z_stream stream;
+ __le64 *inode_lookup_table;
+ u64 inode_table;
+ u64 directory_table;
+ unsigned int block_size;
+ unsigned short block_log;
+ long long bytes_used;
+ unsigned int inodes;
+};
+#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
new file mode 100644
index 00000000000..a0466d7467b
--- /dev/null
+++ b/fs/squashfs/super.c
@@ -0,0 +1,440 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * super.c
+ */
+
+/*
+ * This file implements code to read the superblock, read and initialise
+ * in-memory structures at mount time, and all the VFS glue code to register
+ * the filesystem.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+static struct file_system_type squashfs_fs_type;
+static struct super_operations squashfs_super_ops;
+
+static int supported_squashfs_filesystem(short major, short minor, short comp)
+{
+ if (major < SQUASHFS_MAJOR) {
+ ERROR("Major/Minor mismatch, older Squashfs %d.%d "
+ "filesystems are unsupported\n", major, minor);
+ return -EINVAL;
+ } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
+ ERROR("Major/Minor mismatch, trying to mount newer "
+ "%d.%d filesystem\n", major, minor);
+ ERROR("Please update your kernel\n");
+ return -EINVAL;
+ }
+
+ if (comp != ZLIB_COMPRESSION)
+ return -EINVAL;
+
+ return 0;
+}
+
+
+static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct squashfs_sb_info *msblk;
+ struct squashfs_super_block *sblk = NULL;
+ char b[BDEVNAME_SIZE];
+ struct inode *root;
+ long long root_inode;
+ unsigned short flags;
+ unsigned int fragments;
+ u64 lookup_table_start;
+ int err;
+
+ TRACE("Entered squashfs_fill_superblock\n");
+
+ sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
+ if (sb->s_fs_info == NULL) {
+ ERROR("Failed to allocate squashfs_sb_info\n");
+ return -ENOMEM;
+ }
+ msblk = sb->s_fs_info;
+
+ msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
+ GFP_KERNEL);
+ if (msblk->stream.workspace == NULL) {
+ ERROR("Failed to allocate zlib workspace\n");
+ goto failure;
+ }
+
+ sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
+ if (sblk == NULL) {
+ ERROR("Failed to allocate squashfs_super_block\n");
+ goto failure;
+ }
+
+ msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE);
+ msblk->devblksize_log2 = ffz(~msblk->devblksize);
+
+ mutex_init(&msblk->read_data_mutex);
+ mutex_init(&msblk->meta_index_mutex);
+
+ /*
+ * msblk->bytes_used is checked in squashfs_read_table to ensure reads
+ * are not beyond filesystem end. But as we're using
+ * squashfs_read_table here to read the superblock (including the value
+ * of bytes_used) we need to set it to an initial sensible dummy value
+ */
+ msblk->bytes_used = sizeof(*sblk);
+ err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk));
+
+ if (err < 0) {
+ ERROR("unable to read squashfs_super_block\n");
+ goto failed_mount;
+ }
+
+ /* Check it is a SQUASHFS superblock */
+ sb->s_magic = le32_to_cpu(sblk->s_magic);
+ if (sb->s_magic != SQUASHFS_MAGIC) {
+ if (!silent)
+ ERROR("Can't find a SQUASHFS superblock on %s\n",
+ bdevname(sb->s_bdev, b));
+ err = -EINVAL;
+ goto failed_mount;
+ }
+
+ /* Check the MAJOR & MINOR versions and compression type */
+ err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
+ le16_to_cpu(sblk->s_minor),
+ le16_to_cpu(sblk->compression));
+ if (err < 0)
+ goto failed_mount;
+
+ err = -EINVAL;
+
+ /*
+ * Check if there's xattrs in the filesystem. These are not
+ * supported in this version, so warn that they will be ignored.
+ */
+ if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
+ ERROR("Xattrs in filesystem, these will be ignored\n");
+
+ /* Check the filesystem does not extend beyond the end of the
+ block device */
+ msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
+ if (msblk->bytes_used < 0 || msblk->bytes_used >
+ i_size_read(sb->s_bdev->bd_inode))
+ goto failed_mount;
+
+ /* Check block size for sanity */
+ msblk->block_size = le32_to_cpu(sblk->block_size);
+ if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
+ goto failed_mount;
+
+ msblk->block_log = le16_to_cpu(sblk->block_log);
+ if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
+ goto failed_mount;
+
+ /* Check the root inode for sanity */
+ root_inode = le64_to_cpu(sblk->root_inode);
+ if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE)
+ goto failed_mount;
+
+ msblk->inode_table = le64_to_cpu(sblk->inode_table_start);
+ msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
+ msblk->inodes = le32_to_cpu(sblk->inodes);
+ flags = le16_to_cpu(sblk->flags);
+
+ TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
+ TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
+ ? "un" : "");
+ TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
+ ? "un" : "");
+ TRACE("Filesystem size %lld bytes\n", msblk->bytes_used);
+ TRACE("Block size %d\n", msblk->block_size);
+ TRACE("Number of inodes %d\n", msblk->inodes);
+ TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments));
+ TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids));
+ TRACE("sblk->inode_table_start %llx\n", msblk->inode_table);
+ TRACE("sblk->directory_table_start %llx\n", msblk->directory_table);
+ TRACE("sblk->fragment_table_start %llx\n",
+ (u64) le64_to_cpu(sblk->fragment_table_start));
+ TRACE("sblk->id_table_start %llx\n",
+ (u64) le64_to_cpu(sblk->id_table_start));
+
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_flags |= MS_RDONLY;
+ sb->s_op = &squashfs_super_ops;
+
+ err = -ENOMEM;
+
+ msblk->block_cache = squashfs_cache_init("metadata",
+ SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
+ if (msblk->block_cache == NULL)
+ goto failed_mount;
+
+ /* Allocate read_page block */
+ msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size);
+ if (msblk->read_page == NULL) {
+ ERROR("Failed to allocate read_page block\n");
+ goto failed_mount;
+ }
+
+ /* Allocate and read id index table */
+ msblk->id_table = squashfs_read_id_index_table(sb,
+ le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
+ if (IS_ERR(msblk->id_table)) {
+ err = PTR_ERR(msblk->id_table);
+ msblk->id_table = NULL;
+ goto failed_mount;
+ }
+
+ fragments = le32_to_cpu(sblk->fragments);
+ if (fragments == 0)
+ goto allocate_lookup_table;
+
+ msblk->fragment_cache = squashfs_cache_init("fragment",
+ SQUASHFS_CACHED_FRAGMENTS, msblk->block_size);
+ if (msblk->fragment_cache == NULL) {
+ err = -ENOMEM;
+ goto failed_mount;
+ }
+
+ /* Allocate and read fragment index table */
+ msblk->fragment_index = squashfs_read_fragment_index_table(sb,
+ le64_to_cpu(sblk->fragment_table_start), fragments);
+ if (IS_ERR(msblk->fragment_index)) {
+ err = PTR_ERR(msblk->fragment_index);
+ msblk->fragment_index = NULL;
+ goto failed_mount;
+ }
+
+allocate_lookup_table:
+ lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
+ if (lookup_table_start == SQUASHFS_INVALID_BLK)
+ goto allocate_root;
+
+ /* Allocate and read inode lookup table */
+ msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
+ lookup_table_start, msblk->inodes);
+ if (IS_ERR(msblk->inode_lookup_table)) {
+ err = PTR_ERR(msblk->inode_lookup_table);
+ msblk->inode_lookup_table = NULL;
+ goto failed_mount;
+ }
+
+ sb->s_export_op = &squashfs_export_ops;
+
+allocate_root:
+ root = new_inode(sb);
+ if (!root) {
+ err = -ENOMEM;
+ goto failed_mount;
+ }
+
+ err = squashfs_read_inode(root, root_inode);
+ if (err) {
+ iget_failed(root);
+ goto failed_mount;
+ }
+ insert_inode_hash(root);
+
+ sb->s_root = d_alloc_root(root);
+ if (sb->s_root == NULL) {
+ ERROR("Root inode create failed\n");
+ err = -ENOMEM;
+ iput(root);
+ goto failed_mount;
+ }
+
+ TRACE("Leaving squashfs_fill_super\n");
+ kfree(sblk);
+ return 0;
+
+failed_mount:
+ squashfs_cache_delete(msblk->block_cache);
+ squashfs_cache_delete(msblk->fragment_cache);
+ squashfs_cache_delete(msblk->read_page);
+ kfree(msblk->inode_lookup_table);
+ kfree(msblk->fragment_index);
+ kfree(msblk->id_table);
+ kfree(msblk->stream.workspace);
+ kfree(sb->s_fs_info);
+ sb->s_fs_info = NULL;
+ kfree(sblk);
+ return err;
+
+failure:
+ kfree(msblk->stream.workspace);
+ kfree(sb->s_fs_info);
+ sb->s_fs_info = NULL;
+ return -ENOMEM;
+}
+
+
+static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
+
+ TRACE("Entered squashfs_statfs\n");
+
+ buf->f_type = SQUASHFS_MAGIC;
+ buf->f_bsize = msblk->block_size;
+ buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1;
+ buf->f_bfree = buf->f_bavail = 0;
+ buf->f_files = msblk->inodes;
+ buf->f_ffree = 0;
+ buf->f_namelen = SQUASHFS_NAME_LEN;
+
+ return 0;
+}
+
+
+static int squashfs_remount(struct super_block *sb, int *flags, char *data)
+{
+ *flags |= MS_RDONLY;
+ return 0;
+}
+
+
+static void squashfs_put_super(struct super_block *sb)
+{
+ if (sb->s_fs_info) {
+ struct squashfs_sb_info *sbi = sb->s_fs_info;
+ squashfs_cache_delete(sbi->block_cache);
+ squashfs_cache_delete(sbi->fragment_cache);
+ squashfs_cache_delete(sbi->read_page);
+ kfree(sbi->id_table);
+ kfree(sbi->fragment_index);
+ kfree(sbi->meta_index);
+ kfree(sbi->stream.workspace);
+ kfree(sb->s_fs_info);
+ sb->s_fs_info = NULL;
+ }
+}
+
+
+static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data,
+ struct vfsmount *mnt)
+{
+ return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super,
+ mnt);
+}
+
+
+static struct kmem_cache *squashfs_inode_cachep;
+
+
+static void init_once(void *foo)
+{
+ struct squashfs_inode_info *ei = foo;
+
+ inode_init_once(&ei->vfs_inode);
+}
+
+
+static int __init init_inodecache(void)
+{
+ squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
+ sizeof(struct squashfs_inode_info), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
+
+ return squashfs_inode_cachep ? 0 : -ENOMEM;
+}
+
+
+static void destroy_inodecache(void)
+{
+ kmem_cache_destroy(squashfs_inode_cachep);
+}
+
+
+static int __init init_squashfs_fs(void)
+{
+ int err = init_inodecache();
+
+ if (err)
+ return err;
+
+ err = register_filesystem(&squashfs_fs_type);
+ if (err) {
+ destroy_inodecache();
+ return err;
+ }
+
+ printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) "
+ "Phillip Lougher\n");
+
+ return 0;
+}
+
+
+static void __exit exit_squashfs_fs(void)
+{
+ unregister_filesystem(&squashfs_fs_type);
+ destroy_inodecache();
+}
+
+
+static struct inode *squashfs_alloc_inode(struct super_block *sb)
+{
+ struct squashfs_inode_info *ei =
+ kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL);
+
+ return ei ? &ei->vfs_inode : NULL;
+}
+
+
+static void squashfs_destroy_inode(struct inode *inode)
+{
+ kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
+}
+
+
+static struct file_system_type squashfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "squashfs",
+ .get_sb = squashfs_get_sb,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV
+};
+
+static struct super_operations squashfs_super_ops = {
+ .alloc_inode = squashfs_alloc_inode,
+ .destroy_inode = squashfs_destroy_inode,
+ .statfs = squashfs_statfs,
+ .put_super = squashfs_put_super,
+ .remount_fs = squashfs_remount
+};
+
+module_init(init_squashfs_fs);
+module_exit(exit_squashfs_fs);
+MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem");
+MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>");
+MODULE_LICENSE("GPL");
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
new file mode 100644
index 00000000000..83d87880aac
--- /dev/null
+++ b/fs/squashfs/symlink.c
@@ -0,0 +1,118 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * symlink.c
+ */
+
+/*
+ * This file implements code to handle symbolic links.
+ *
+ * The data contents of symbolic links are stored inside the symbolic
+ * link inode within the inode table. This allows the normally small symbolic
+ * link to be compressed as part of the inode table, achieving much greater
+ * compression than if the symbolic link was compressed individually.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+static int squashfs_symlink_readpage(struct file *file, struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ int index = page->index << PAGE_CACHE_SHIFT;
+ u64 block = squashfs_i(inode)->start;
+ int offset = squashfs_i(inode)->offset;
+ int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE);
+ int bytes, copied;
+ void *pageaddr;
+ struct squashfs_cache_entry *entry;
+
+ TRACE("Entered squashfs_symlink_readpage, page index %ld, start block "
+ "%llx, offset %x\n", page->index, block, offset);
+
+ /*
+ * Skip index bytes into symlink metadata.
+ */
+ if (index) {
+ bytes = squashfs_read_metadata(sb, NULL, &block, &offset,
+ index);
+ if (bytes < 0) {
+ ERROR("Unable to read symlink [%llx:%x]\n",
+ squashfs_i(inode)->start,
+ squashfs_i(inode)->offset);
+ goto error_out;
+ }
+ }
+
+ /*
+ * Read length bytes from symlink metadata. Squashfs_read_metadata
+ * is not used here because it can sleep and we want to use
+ * kmap_atomic to map the page. Instead call the underlying
+ * squashfs_cache_get routine. As length bytes may overlap metadata
+ * blocks, we may need to call squashfs_cache_get multiple times.
+ */
+ for (bytes = 0; bytes < length; offset = 0, bytes += copied) {
+ entry = squashfs_cache_get(sb, msblk->block_cache, block, 0);
+ if (entry->error) {
+ ERROR("Unable to read symlink [%llx:%x]\n",
+ squashfs_i(inode)->start,
+ squashfs_i(inode)->offset);
+ squashfs_cache_put(entry);
+ goto error_out;
+ }
+
+ pageaddr = kmap_atomic(page, KM_USER0);
+ copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
+ length - bytes);
+ if (copied == length - bytes)
+ memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
+ else
+ block = entry->next_index;
+ kunmap_atomic(pageaddr, KM_USER0);
+ squashfs_cache_put(entry);
+ }
+
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+
+error_out:
+ SetPageError(page);
+ unlock_page(page);
+ return 0;
+}
+
+
+const struct address_space_operations squashfs_symlink_aops = {
+ .readpage = squashfs_symlink_readpage
+};
diff --git a/fs/stat.c b/fs/stat.c
index 7c46fbeb8b7..7e12a6f8279 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -305,7 +305,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
struct inode *inode = path.dentry->d_inode;
error = -EINVAL;
- if (inode->i_op && inode->i_op->readlink) {
+ if (inode->i_op->readlink) {
error = security_inode_readlink(path.dentry);
if (!error) {
touch_atime(path.mnt, path.dentry);
diff --git a/fs/super.c b/fs/super.c
index ddba069d7a9..ed080c41716 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -38,6 +38,7 @@
#include <linux/kobject.h>
#include <linux/mutex.h>
#include <linux/file.h>
+#include <linux/async.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
INIT_HLIST_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
INIT_LIST_HEAD(&s->s_dentry_lru);
+ INIT_LIST_HEAD(&s->s_async_list);
init_rwsem(&s->s_umount);
mutex_init(&s->s_lock);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb)
{
const struct super_operations *sop = sb->s_op;
+
if (sb->s_root) {
shrink_dcache_for_umount(sb);
fsync_super(sb);
lock_super(sb);
sb->s_flags &= ~MS_ACTIVE;
+
+ /*
+ * wait for asynchronous fs operations to finish before going further
+ */
+ async_synchronize_full_special(&sb->s_async_list);
+
/* bad name - it should be evict_inodes() */
invalidate_inodes(sb);
lock_kernel();
@@ -461,6 +470,7 @@ restart:
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
+ async_synchronize_full_special(&sb->s_async_list);
if (sb->s_root && (wait || sb->s_dirt))
sb->s_op->sync_fs(sb, wait);
up_read(&sb->s_umount);
@@ -800,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
}
s->s_flags |= MS_ACTIVE;
+ bdev->bd_super = s;
}
return simple_set_mnt(mnt, s);
@@ -819,6 +830,7 @@ void kill_block_super(struct super_block *sb)
struct block_device *bdev = sb->s_bdev;
fmode_t mode = sb->s_mode;
+ bdev->bd_super = 0;
generic_shutdown_super(sb);
sync_blockdev(bdev);
close_bdev_exclusive(bdev, mode);
diff --git a/fs/sync.c b/fs/sync.c
index 2967562d416..ac02b56548b 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
return ret;
}
-long do_fsync(struct file *file, int datasync)
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file: file to sync
+ * @dentry: dentry of @file
+ * @data: only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk. If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set. This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
- int ret;
- int err;
- struct address_space *mapping = file->f_mapping;
+ const struct file_operations *fop;
+ struct address_space *mapping;
+ int err, ret;
+
+ /*
+ * Get mapping and operations from the file in case we have
+ * as file, or get the default values for them in case we
+ * don't have a struct file available. Damn nfsd..
+ */
+ if (file) {
+ mapping = file->f_mapping;
+ fop = file->f_op;
+ } else {
+ mapping = dentry->d_inode->i_mapping;
+ fop = dentry->d_inode->i_fop;
+ }
- if (!file->f_op || !file->f_op->fsync) {
- /* Why? We can still call filemap_fdatawrite */
+ if (!fop || !fop->fsync) {
ret = -EINVAL;
goto out;
}
@@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync)
* livelocks in fsync_buffers_list().
*/
mutex_lock(&mapping->host->i_mutex);
- err = file->f_op->fsync(file, file->f_path.dentry, datasync);
+ err = fop->fsync(file, dentry, datasync);
if (!ret)
ret = err;
mutex_unlock(&mapping->host->i_mutex);
@@ -104,15 +129,16 @@ long do_fsync(struct file *file, int datasync)
out:
return ret;
}
+EXPORT_SYMBOL(vfs_fsync);
-static long __do_fsync(unsigned int fd, int datasync)
+static int do_fsync(unsigned int fd, int datasync)
{
struct file *file;
int ret = -EBADF;
file = fget(fd);
if (file) {
- ret = do_fsync(file, datasync);
+ ret = vfs_fsync(file, file->f_path.dentry, datasync);
fput(file);
}
return ret;
@@ -120,12 +146,12 @@ static long __do_fsync(unsigned int fd, int datasync)
asmlinkage long sys_fsync(unsigned int fd)
{
- return __do_fsync(fd, 0);
+ return do_fsync(fd, 0);
}
asmlinkage long sys_fdatasync(unsigned int fd)
{
- return __do_fsync(fd, 1);
+ return do_fsync(fd, 1);
}
/*
@@ -269,7 +295,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
if (flags & SYNC_FILE_RANGE_WRITE) {
ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
- WB_SYNC_NONE);
+ WB_SYNC_ALL);
if (ret < 0)
goto out;
}
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index eb53c632f85..dfa3d94cfc7 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
{
inode->i_mode = mode;
- inode->i_uid = 0;
- inode->i_gid = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
}
@@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
{
struct bin_attribute *bin_attr;
- inode->i_blocks = 0;
inode->i_mapping->a_ops = &sysfs_aops;
inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
inode->i_op = &sysfs_inode_operations;
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 91ceeda7e5b..e35b54d5059 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB
depends on UBIFS_FS
default y
help
- Zlib copresses better then LZO but it is slower. Say 'Y' if unsure.
+ Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
# Debugging-related stuff
config UBIFS_FS_DEBUG
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 0e5e54d8292..175f9c590b7 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -142,7 +142,7 @@ static long long get_liability(struct ubifs_info *c)
*
* This function is called when an operation cannot be budgeted because there
* is supposedly no free space. But in most cases there is some free space:
- * o budgeting is pessimistic, so it always budgets more then it is actually
+ * o budgeting is pessimistic, so it always budgets more than it is actually
* needed, so shrinking the liability is one way to make free space - the
* cached data will take less space then it was budgeted for;
* o GC may turn some dark space into free space (budgeting treats dark space
@@ -606,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
* @c: UBIFS file-system description object
*
* This function converts budget which was allocated for a new page of data to
- * the budget of changing an existing page of data. The latter is smaller then
+ * the budget of changing an existing page of data. The latter is smaller than
* the former, so this function only does simple re-calculation and does not
* involve any write-back.
*/
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index fe82d2464d4..bf37374567f 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -219,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c)
}
static int write_begin_slow(struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep)
+ loff_t pos, unsigned len, struct page **pagep,
+ unsigned flags)
{
struct inode *inode = mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -247,7 +248,7 @@ static int write_begin_slow(struct address_space *mapping,
if (unlikely(err))
return err;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (unlikely(!page)) {
ubifs_release_budget(c, &req);
return -ENOMEM;
@@ -438,7 +439,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
return -EROFS;
/* Try out the fast-path part first */
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (unlikely(!page))
return -ENOMEM;
@@ -483,7 +484,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
unlock_page(page);
page_cache_release(page);
- return write_begin_slow(mapping, pos, len, pagep);
+ return write_begin_slow(mapping, pos, len, pagep, flags);
}
/*
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 0bef6501d58..9832f9abe28 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -45,7 +45,7 @@
#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
/*
- * GC may need to move more then one LEB to make progress. The below constants
+ * GC may need to move more than one LEB to make progress. The below constants
* define "soft" and "hard" limits on the number of LEBs the garbage collector
* may move.
*/
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 10ae25b7d1d..9b7c54e0cd2 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -191,7 +191,7 @@ again:
if (wbuf->lnum != -1 && avail >= len) {
/*
* Someone else has switched the journal head and we have
- * enough space now. This happens when more then one process is
+ * enough space now. This happens when more than one process is
* trying to write to the same journal head at the same time.
*/
dbg_jnl("return LEB %d back, already have LEB %d:%d",
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index f248533841a..e7bab52a141 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
* @contention: if any contention, this is set to %1
*
* This function walks the list of mounted UBIFS file-systems and frees clean
- * znodes which are older then @age, until at least @nr znodes are freed.
+ * znodes which are older than @age, until at least @nr znodes are freed.
* Returns the number of freed znodes.
*/
static int shrink_tnc_trees(int nr, int age, int *contention)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 0d7564b95f8..89556ee7251 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -432,12 +432,19 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
int i, err;
struct ubifs_info *c = sb->s_fs_info;
struct writeback_control wbc = {
- .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+ .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
.range_start = 0,
.range_end = LLONG_MAX,
.nr_to_write = LONG_MAX,
};
+ /*
+ * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an
+ * advisory thing to help the file system shove lots of data into the
+ * queues. If some gets missed then it'll be picked up on the second
+ * '->sync_fs()' call, with non-zero @wait.
+ */
+
if (sb->s_flags & MS_RDONLY)
return 0;
diff --git a/fs/xattr.c b/fs/xattr.c
index 468377e6653..237804cd6b5 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size)
if (error)
return error;
error = -EOPNOTSUPP;
- if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
+ if (d->d_inode->i_op->listxattr) {
error = d->d_inode->i_op->listxattr(d, list, size);
} else {
error = security_inode_listsecurity(d->d_inode, list, size);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 36f6cc703ef..be846d606ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1348,7 +1348,7 @@ xfs_finish_flags(
{
int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
- /* Fail a mount where the logbuf is smaller then the log stripe */
+ /* Fail a mount where the logbuf is smaller than the log stripe */
if (xfs_sb_version_haslogv2(&mp->m_sb)) {
if (mp->m_logbsize <= 0 &&
mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {