diff options
Diffstat (limited to 'fs')
191 files changed, 5480 insertions, 3722 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig new file mode 100644 index 00000000000..74e0723e90b --- /dev/null +++ b/fs/9p/Kconfig @@ -0,0 +1,10 @@ +config 9P_FS + tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)" + depends on INET && NET_9P && EXPERIMENTAL + help + If you say Y here, you will get experimental support for + Plan 9 resource sharing via the 9P2000 protocol. + + See <http://v9fs.sf.net> for more information. + + If unsure, say N. diff --git a/fs/Kconfig b/fs/Kconfig index 51307b0fdf0..93945dd0b1a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -27,141 +27,8 @@ config FS_MBCACHE default y if EXT4_FS=y && EXT4_FS_XATTR default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR -config REISERFS_FS - tristate "Reiserfs support" - help - Stores not just filenames but the files themselves in a balanced - tree. Uses journalling. - - Balanced trees are more efficient than traditional file system - architectural foundations. - - In general, ReiserFS is as fast as ext2, but is very efficient with - large directories and small files. Additional patches are needed - for NFS and quotas, please see <http://www.namesys.com/> for links. - - It is more easily extended to have features currently found in - database and keyword search systems than block allocation based file - systems are. The next version will be so extended, and will support - plugins consistent with our motto ``It takes more than a license to - make source code open.'' - - Read <http://www.namesys.com/> to learn more about reiserfs. - - Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com. - - If you like it, you can pay us to add new features to it that you - need, buy a support contract, or pay us to port it to another OS. - -config REISERFS_CHECK - bool "Enable reiserfs debug mode" - depends on REISERFS_FS - help - If you set this to Y, then ReiserFS will perform every check it can - possibly imagine of its internal consistency throughout its - operation. It will also go substantially slower. More than once we - have forgotten that this was on, and then gone despondent over the - latest benchmarks.:-) Use of this option allows our team to go all - out in checking for consistency when debugging without fear of its - effect on end users. If you are on the verge of sending in a bug - report, say Y and you might get a useful error message. Almost - everyone should say N. - -config REISERFS_PROC_INFO - bool "Stats in /proc/fs/reiserfs" - depends on REISERFS_FS && PROC_FS - help - Create under /proc/fs/reiserfs a hierarchy of files, displaying - various ReiserFS statistics and internal data at the expense of - making your kernel or module slightly larger (+8 KB). This also - increases the amount of kernel memory required for each mount. - Almost everyone but ReiserFS developers and people fine-tuning - reiserfs or tracing problems should say N. - -config REISERFS_FS_XATTR - bool "ReiserFS extended attributes" - depends on REISERFS_FS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - -config REISERFS_FS_POSIX_ACL - bool "ReiserFS POSIX Access Control Lists" - depends on REISERFS_FS_XATTR - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config REISERFS_FS_SECURITY - bool "ReiserFS Security Labels" - depends on REISERFS_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the ReiserFS filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config JFS_FS - tristate "JFS filesystem support" - select NLS - help - This is a port of IBM's Journaled Filesystem . More information is - available in the file <file:Documentation/filesystems/jfs.txt>. - - If you do not intend to use the JFS filesystem, say N. - -config JFS_POSIX_ACL - bool "JFS POSIX Access Control Lists" - depends on JFS_FS - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config JFS_SECURITY - bool "JFS Security Labels" - depends on JFS_FS - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the jfs filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config JFS_DEBUG - bool "JFS debugging" - depends on JFS_FS - help - If you are experiencing any problems with the JFS filesystem, say - Y here. This will result in additional debugging messages to be - written to the system log. Under normal circumstances, this - results in very little overhead. - -config JFS_STATISTICS - bool "JFS statistics" - depends on JFS_FS - help - Enabling this option will cause statistics from the JFS file system - to be made available to the user in the /proc/fs/jfs/ directory. +source "fs/reiserfs/Kconfig" +source "fs/jfs/Kconfig" config FS_POSIX_ACL # Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4) @@ -182,111 +49,8 @@ config FILE_LOCKING source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" - -config OCFS2_FS - tristate "OCFS2 file system support" - depends on NET && SYSFS - select CONFIGFS_FS - select JBD2 - select CRC32 - select QUOTA - select QUOTA_TREE - help - OCFS2 is a general purpose extent based shared disk cluster file - system with many similarities to ext3. It supports 64 bit inode - numbers, and has automatically extending metadata groups which may - also make it attractive for non-clustered use. - - You'll want to install the ocfs2-tools package in order to at least - get "mount.ocfs2". - - Project web page: http://oss.oracle.com/projects/ocfs2 - Tools web page: http://oss.oracle.com/projects/ocfs2-tools - OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ - - For more information on OCFS2, see the file - <file:Documentation/filesystems/ocfs2.txt>. - -config OCFS2_FS_O2CB - tristate "O2CB Kernelspace Clustering" - depends on OCFS2_FS - default y - help - OCFS2 includes a simple kernelspace clustering package, the OCFS2 - Cluster Base. It only requires a very small userspace component - to configure it. This comes with the standard ocfs2-tools package. - O2CB is limited to maintaining a cluster for OCFS2 file systems. - It cannot manage any other cluster applications. - - It is always safe to say Y here, as the clustering method is - run-time selectable. - -config OCFS2_FS_USERSPACE_CLUSTER - tristate "OCFS2 Userspace Clustering" - depends on OCFS2_FS && DLM - default y - help - This option will allow OCFS2 to use userspace clustering services - in conjunction with the DLM in fs/dlm. If you are using a - userspace cluster manager, say Y here. - - It is safe to say Y, as the clustering method is run-time - selectable. - -config OCFS2_FS_STATS - bool "OCFS2 statistics" - depends on OCFS2_FS - default y - help - This option allows some fs statistics to be captured. Enabling - this option may increase the memory consumption. - -config OCFS2_DEBUG_MASKLOG - bool "OCFS2 logging support" - depends on OCFS2_FS - default y - help - The ocfs2 filesystem has an extensive logging system. The system - allows selection of events to log via files in /sys/o2cb/logmask/. - This option will enlarge your kernel, but it allows debugging of - ocfs2 filesystem issues. - -config OCFS2_DEBUG_FS - bool "OCFS2 expensive checks" - depends on OCFS2_FS - default n - help - This option will enable expensive consistency checks. Enable - this option for debugging only as it is likely to decrease - performance of the filesystem. - -config OCFS2_FS_POSIX_ACL - bool "OCFS2 POSIX Access Control Lists" - depends on OCFS2_FS - select FS_POSIX_ACL - default n - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - -config BTRFS_FS - tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" - depends on EXPERIMENTAL - select LIBCRC32C - select ZLIB_INFLATE - select ZLIB_DEFLATE - help - Btrfs is a new filesystem with extents, writable snapshotting, - support for multiple devices and many more features. - - Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET - FINALIZED. You should say N here unless you are interested in - testing Btrfs with non-critical data. - - To compile this file system support as a module, choose M here. The - module will be called btrfs. - - If unsure, say N. +source "fs/ocfs2/Kconfig" +source "fs/btrfs/Kconfig" endif # BLOCK @@ -348,64 +112,9 @@ config QUOTACTL depends on XFS_QUOTA || QUOTA default y -config AUTOFS_FS - tristate "Kernel automounter support" - help - The automounter is a tool to automatically mount remote file systems - on demand. This implementation is partially kernel-based to reduce - overhead in the already-mounted case; this is unlike the BSD - automounter (amd), which is a pure user space daemon. - - To use the automounter you need the user-space tools from the autofs - package; you can find the location in <file:Documentation/Changes>. - You also want to answer Y to "NFS file system support", below. - - If you want to use the newer version of the automounter with more - features, say N here and say Y to "Kernel automounter v4 support", - below. - - To compile this support as a module, choose M here: the module will be - called autofs. - - If you are not a part of a fairly large, distributed network, you - probably do not need an automounter, and can say N here. - -config AUTOFS4_FS - tristate "Kernel automounter version 4 support (also supports v3)" - help - The automounter is a tool to automatically mount remote file systems - on demand. This implementation is partially kernel-based to reduce - overhead in the already-mounted case; this is unlike the BSD - automounter (amd), which is a pure user space daemon. - - To use the automounter you need the user-space tools from - <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also - want to answer Y to "NFS file system support", below. - - To compile this support as a module, choose M here: the module will be - called autofs4. You will need to add "alias autofs autofs4" to your - modules configuration file. - - If you are not a part of a fairly large, distributed network or - don't have a laptop which needs to dynamically reconfigure to the - local network, you probably do not need an automounter, and can say - N here. - -config FUSE_FS - tristate "FUSE (Filesystem in Userspace) support" - help - With FUSE it is possible to implement a fully functional filesystem - in a userspace program. - - There's also companion library: libfuse. This library along with - utilities is available from the FUSE homepage: - <http://fuse.sourceforge.net/> - - See <file:Documentation/filesystems/fuse.txt> for more information. - See <file:Documentation/Changes> for needed library/utility version. - - If you want to develop a userspace FS, or if you want to use - a filesystem based on FUSE, answer Y or M. +source "fs/autofs/Kconfig" +source "fs/autofs4/Kconfig" +source "fs/fuse/Kconfig" config GENERIC_ACL bool @@ -414,64 +123,8 @@ config GENERIC_ACL if BLOCK menu "CD-ROM/DVD Filesystems" -config ISO9660_FS - tristate "ISO 9660 CDROM file system support" - help - This is the standard file system used on CD-ROMs. It was previously - known as "High Sierra File System" and is called "hsfs" on other - Unix systems. The so-called Rock-Ridge extensions which allow for - long Unix filenames and symbolic links are also supported by this - driver. If you have a CD-ROM drive and want to do more with it than - just listen to audio CDs and watch its LEDs, say Y (and read - <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO, - available from <http://www.tldp.org/docs.html#howto>), thereby - enlarging your kernel by about 27 KB; otherwise say N. - - To compile this file system support as a module, choose M here: the - module will be called isofs. - -config JOLIET - bool "Microsoft Joliet CDROM extensions" - depends on ISO9660_FS - select NLS - help - Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system - which allows for long filenames in unicode format (unicode is the - new 16 bit character code, successor to ASCII, which encodes the - characters of almost all languages of the world; see - <http://www.unicode.org/> for more information). Say Y here if you - want to be able to read Joliet CD-ROMs under Linux. - -config ZISOFS - bool "Transparent decompression extension" - depends on ISO9660_FS - select ZLIB_INFLATE - help - This is a Linux-specific extension to RockRidge which lets you store - data in compressed form on a CD-ROM and have it transparently - decompressed when the CD-ROM is accessed. See - <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools - necessary to create such a filesystem. Say Y here if you want to be - able to read such compressed CD-ROMs. - -config UDF_FS - tristate "UDF file system support" - select CRC_ITU_T - help - This is the new file system used on some CD-ROMs and DVDs. Say Y if - you intend to mount DVD discs or CDRW's written in packet mode, or - if written to by other UDF utilities, such as DirectCD. - Please read <file:Documentation/filesystems/udf.txt>. - - To compile this file system support as a module, choose M here: the - module will be called udf. - - If unsure, say N. - -config UDF_NLS - bool - default y - depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y) +source "fs/isofs/Kconfig" +source "fs/udf/Kconfig" endmenu endif # BLOCK @@ -479,182 +132,8 @@ endif # BLOCK if BLOCK menu "DOS/FAT/NT Filesystems" -config FAT_FS - tristate - select NLS - help - If you want to use one of the FAT-based file systems (the MS-DOS and - VFAT (Windows 95) file systems), then you must say Y or M here - to include FAT support. You will then be able to mount partitions or - diskettes with FAT-based file systems and transparently access the - files on them, i.e. MSDOS files will look and behave just like all - other Unix files. - - This FAT support is not a file system in itself, it only provides - the foundation for the other file systems. You will have to say Y or - M to at least one of "MSDOS fs support" or "VFAT fs support" in - order to make use of it. - - Another way to read and write MSDOS floppies and hard drive - partitions from within Linux (but not transparently) is with the - mtools ("man mtools") program suite. You don't need to say Y here in - order to do that. - - If you need to move large files on floppies between a DOS and a - Linux box, say Y here, mount the floppy under Linux with an MSDOS - file system and use GNU tar's M option. GNU tar is a program - available for Unix and DOS ("man tar" or "info tar"). - - The FAT support will enlarge your kernel by about 37 KB. If unsure, - say Y. - - To compile this as a module, choose M here: the module will be called - fat. Note that if you compile the FAT support as a module, you - cannot compile any of the FAT-based file systems into the kernel - -- they will have to be modules as well. - -config MSDOS_FS - tristate "MSDOS fs support" - select FAT_FS - help - This allows you to mount MSDOS partitions of your hard drive (unless - they are compressed; to access compressed MSDOS partitions under - Linux, you can either use the DOS emulator DOSEMU, described in the - DOSEMU-HOWTO, available from - <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in - <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you - intend to use dosemu with a non-compressed MSDOS partition, say Y - here) and MSDOS floppies. This means that file access becomes - transparent, i.e. the MSDOS files look and behave just like all - other Unix files. - - If you have Windows 95 or Windows NT installed on your MSDOS - partitions, you should use the VFAT file system (say Y to "VFAT fs - support" below), or you will not be able to see the long filenames - generated by Windows 95 / Windows NT. - - This option will enlarge your kernel by about 7 KB. If unsure, - answer Y. This will only work if you said Y to "DOS FAT fs support" - as well. To compile this as a module, choose M here: the module will - be called msdos. - -config VFAT_FS - tristate "VFAT (Windows-95) fs support" - select FAT_FS - help - This option provides support for normal Windows file systems with - long filenames. That includes non-compressed FAT-based file systems - used by Windows 95, Windows 98, Windows NT 4.0, and the Unix - programs from the mtools package. - - The VFAT support enlarges your kernel by about 10 KB and it only - works if you said Y to the "DOS FAT fs support" above. Please read - the file <file:Documentation/filesystems/vfat.txt> for details. If - unsure, say Y. - - To compile this as a module, choose M here: the module will be called - vfat. - -config FAT_DEFAULT_CODEPAGE - int "Default codepage for FAT" - depends on MSDOS_FS || VFAT_FS - default 437 - help - This option should be set to the codepage of your FAT filesystems. - It can be overridden with the "codepage" mount option. - See <file:Documentation/filesystems/vfat.txt> for more information. - -config FAT_DEFAULT_IOCHARSET - string "Default iocharset for FAT" - depends on VFAT_FS - default "iso8859-1" - help - Set this to the default input/output character set you'd - like FAT to use. It should probably match the character set - that most of your FAT filesystems use, and can be overridden - with the "iocharset" mount option for FAT filesystems. - Note that "utf8" is not recommended for FAT filesystems. - If unsure, you shouldn't set "utf8" here. - See <file:Documentation/filesystems/vfat.txt> for more information. - -config NTFS_FS - tristate "NTFS file system support" - select NLS - help - NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. - - Saying Y or M here enables read support. There is partial, but - safe, write support available. For write support you must also - say Y to "NTFS write support" below. - - There are also a number of user-space tools available, called - ntfsprogs. These include ntfsundelete and ntfsresize, that work - without NTFS support enabled in the kernel. - - This is a rewrite from scratch of Linux NTFS support and replaced - the old NTFS code starting with Linux 2.5.11. A backport to - the Linux 2.4 kernel series is separately available as a patch - from the project web site. - - For more information see <file:Documentation/filesystems/ntfs.txt> - and <http://www.linux-ntfs.org/>. - - To compile this file system support as a module, choose M here: the - module will be called ntfs. - - If you are not using Windows NT, 2000, XP or 2003 in addition to - Linux on your computer it is safe to say N. - -config NTFS_DEBUG - bool "NTFS debugging support" - depends on NTFS_FS - help - If you are experiencing any problems with the NTFS file system, say - Y here. This will result in additional consistency checks to be - performed by the driver as well as additional debugging messages to - be written to the system log. Note that debugging messages are - disabled by default. To enable them, supply the option debug_msgs=1 - at the kernel command line when booting the kernel or as an option - to insmod when loading the ntfs module. Once the driver is active, - you can enable debugging messages by doing (as root): - echo 1 > /proc/sys/fs/ntfs-debug - Replacing the "1" with "0" would disable debug messages. - - If you leave debugging messages disabled, this results in little - overhead, but enabling debug messages results in very significant - slowdown of the system. - - When reporting bugs, please try to have available a full dump of - debugging messages while the misbehaviour was occurring. - -config NTFS_RW - bool "NTFS write support" - depends on NTFS_FS - help - This enables the partial, but safe, write support in the NTFS driver. - - The only supported operation is overwriting existing files, without - changing the file length. No file or directory creation, deletion or - renaming is possible. Note only non-resident files can be written to - so you may find that some very small files (<500 bytes or so) cannot - be written to. - - While we cannot guarantee that it will not damage any data, we have - so far not received a single report where the driver would have - damaged someones data so we assume it is perfectly safe to use. - - Note: While write support is safe in this version (a rewrite from - scratch of the NTFS support), it should be noted that the old NTFS - write support, included in Linux 2.5.10 and before (since 1997), - is not safe. - - This is currently useful with TopologiLinux. TopologiLinux is run - on top of any DOS/Microsoft Windows system without partitioning your - hard disk. Unlike other Linux distributions TopologiLinux does not - need its own partition. For more information see - <http://topologi-linux.sourceforge.net/> - - It is perfectly safe to say N here. +source "fs/fat/Kconfig" +source "fs/ntfs/Kconfig" endmenu endif # BLOCK @@ -662,30 +141,7 @@ endif # BLOCK menu "Pseudo filesystems" source "fs/proc/Kconfig" - -config SYSFS - bool "sysfs file system support" if EMBEDDED - default y - help - The sysfs filesystem is a virtual filesystem that the kernel uses to - export internal kernel objects, their attributes, and their - relationships to one another. - - Users can use sysfs to ascertain useful information about the running - kernel, such as the devices the kernel has discovered on each bus and - which driver each is bound to. sysfs can also be used to tune devices - and other kernel subsystems. - - Some system agents rely on the information in sysfs to operate. - /sbin/hotplug uses device and object attributes in sysfs to assist in - delegating policy decisions, like persistently naming devices. - - sysfs is currently used by the block subsystem to mount the root - partition. If sysfs is disabled you must specify the boot device on - the kernel boot command line via its major and minor numbers. For - example, "root=03:01" for /dev/hda1. - - Designers of embedded systems may wish to say N here to conserve space. +source "fs/sysfs/Kconfig" config TMPFS bool "Virtual memory file system support (former shm fs)" @@ -726,17 +182,7 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS -config CONFIGFS_FS - tristate "Userspace-driven configuration filesystem" - depends on SYSFS - help - configfs is a ram-based filesystem that provides the converse - of sysfs's functionality. Where sysfs is a filesystem-based - view of kernel objects, configfs is a filesystem-based manager - of kernel objects, or config_items. - - Both sysfs and configfs can and should exist together on the - same system. One is not a replacement for the other. +source "fs/configfs/Kconfig" endmenu @@ -755,425 +201,27 @@ menuconfig MISC_FILESYSTEMS if MISC_FILESYSTEMS -config ADFS_FS - tristate "ADFS file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL - help - The Acorn Disc Filing System is the standard file system of the - RiscOS operating system which runs on Acorn's ARM-based Risc PC - systems and the Acorn Archimedes range of machines. If you say Y - here, Linux will be able to read from ADFS partitions on hard drives - and from ADFS-formatted floppy discs. If you also want to be able to - write to those devices, say Y to "ADFS write support" below. - - The ADFS partition should be the first partition (i.e., - /dev/[hs]d?1) on each of your drives. Please read the file - <file:Documentation/filesystems/adfs.txt> for further details. - - To compile this code as a module, choose M here: the module will be - called adfs. - - If unsure, say N. - -config ADFS_FS_RW - bool "ADFS write support (DANGEROUS)" - depends on ADFS_FS - help - If you say Y here, you will be able to write to ADFS partitions on - hard drives and ADFS-formatted floppy disks. This is experimental - codes, so if you're unsure, say N. - -config AFFS_FS - tristate "Amiga FFS file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL - help - The Fast File System (FFS) is the common file system used on hard - disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y - if you want to be able to read and write files from and to an Amiga - FFS partition on your hard drive. Amiga floppies however cannot be - read with this driver due to an incompatibility of the floppy - controller used in an Amiga and the standard floppy controller in - PCs and workstations. Read <file:Documentation/filesystems/affs.txt> - and <file:fs/affs/Changes>. - - With this driver you can also mount disk files used by Bernd - Schmidt's Un*X Amiga Emulator - (<http://www.freiburg.linux.de/~uae/>). - If you want to do this, you will also need to say Y or M to "Loop - device support", above. - - To compile this file system support as a module, choose M here: the - module will be called affs. If unsure, say N. - -config ECRYPT_FS - tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && CRYPTO && NET - help - Encrypted filesystem that operates on the VFS layer. See - <file:Documentation/filesystems/ecryptfs.txt> to learn more about - eCryptfs. Userspace components are required and can be - obtained from <http://ecryptfs.sf.net>. - - To compile this file system support as a module, choose M here: the - module will be called ecryptfs. - -config HFS_FS - tristate "Apple Macintosh file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL - select NLS - help - If you say Y here, you will be able to mount Macintosh-formatted - floppy disks and hard drive partitions with full read-write access. - Please read <file:Documentation/filesystems/hfs.txt> to learn about - the available mount options. - - To compile this file system support as a module, choose M here: the - module will be called hfs. - -config HFSPLUS_FS - tristate "Apple Extended HFS file system support" - depends on BLOCK - select NLS - select NLS_UTF8 - help - If you say Y here, you will be able to mount extended format - Macintosh-formatted hard drive partitions with full read-write access. - - This file system is often called HFS+ and was introduced with - MacOS 8. It includes all Mac specific filesystem data such as - data forks and creator codes, but it also has several UNIX - style features such as file ownership and permissions. - -config BEFS_FS - tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL - select NLS - help - The BeOS File System (BeFS) is the native file system of Be, Inc's - BeOS. Notable features include support for arbitrary attributes - on files and directories, and database-like indices on selected - attributes. (Also note that this driver doesn't make those features - available at this time). It is a 64 bit filesystem, so it supports - extremely large volumes and files. - - If you use this filesystem, you should also say Y to at least one - of the NLS (native language support) options below. - - If you don't know what this is about, say N. - - To compile this as a module, choose M here: the module will be - called befs. - -config BEFS_DEBUG - bool "Debug BeFS" - depends on BEFS_FS - help - If you say Y here, you can use the 'debug' mount option to enable - debugging output from the driver. - -config BFS_FS - tristate "BFS file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL - help - Boot File System (BFS) is a file system used under SCO UnixWare to - allow the bootloader access to the kernel image and other important - files during the boot process. It is usually mounted under /stand - and corresponds to the slice marked as "STAND" in the UnixWare - partition. You should say Y if you want to read or write the files - on your /stand slice from within Linux. You then also need to say Y - to "UnixWare slices support", below. More information about the BFS - file system is contained in the file - <file:Documentation/filesystems/bfs.txt>. - - If you don't know what this is about, say N. - - To compile this as a module, choose M here: the module will be called - bfs. Note that the file system of your root partition (the one - containing the directory /) cannot be compiled as a module. - - - -config EFS_FS - tristate "EFS file system support (read only) (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL - help - EFS is an older file system used for non-ISO9660 CD-ROMs and hard - disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer - uses the XFS file system for hard disk partitions however). - - This implementation only offers read-only access. If you don't know - what all this is about, it's safe to say N. For more information - about EFS see its home page at <http://aeschi.ch.eu.org/efs/>. - - To compile the EFS file system support as a module, choose M here: the - module will be called efs. - +source "fs/adfs/Kconfig" +source "fs/affs/Kconfig" +source "fs/ecryptfs/Kconfig" +source "fs/hfs/Kconfig" +source "fs/hfsplus/Kconfig" +source "fs/befs/Kconfig" +source "fs/bfs/Kconfig" +source "fs/efs/Kconfig" source "fs/jffs2/Kconfig" # UBIFS File system configuration source "fs/ubifs/Kconfig" - -config CRAMFS - tristate "Compressed ROM file system support (cramfs)" - depends on BLOCK - select ZLIB_INFLATE - help - Saying Y here includes support for CramFs (Compressed ROM File - System). CramFs is designed to be a simple, small, and compressed - file system for ROM based embedded systems. CramFs is read-only, - limited to 256MB file systems (with 16MB files), and doesn't support - 16/32 bits uid/gid, hard links and timestamps. - - See <file:Documentation/filesystems/cramfs.txt> and - <file:fs/cramfs/README> for further information. - - To compile this as a module, choose M here: the module will be called - cramfs. Note that the root file system (the one containing the - directory /) cannot be compiled as a module. - - If unsure, say N. - -config SQUASHFS - tristate "SquashFS 4.0 - Squashed file system support" - depends on BLOCK - select ZLIB_INFLATE - help - Saying Y here includes support for SquashFS 4.0 (a Compressed - Read-Only File System). Squashfs is a highly compressed read-only - filesystem for Linux. It uses zlib compression to compress both - files, inodes and directories. Inodes in the system are very small - and all blocks are packed to minimise data overhead. Block sizes - greater than 4K are supported up to a maximum of 1 Mbytes (default - block size 128K). SquashFS 4.0 supports 64 bit filesystems and files - (larger than 4GB), full uid/gid information, hard links and - timestamps. - - Squashfs is intended for general read-only filesystem use, for - archival use (i.e. in cases where a .tar.gz file may be used), and in - embedded systems where low overhead is needed. Further information - and tools are available from http://squashfs.sourceforge.net. - - If you want to compile this as a module ( = code which can be - inserted in and removed from the running kernel whenever you want), - say M here and read <file:Documentation/modules.txt>. The module - will be called squashfs. Note that the root file system (the one - containing the directory /) cannot be compiled as a module. - - If unsure, say N. - -config SQUASHFS_EMBEDDED - - bool "Additional option for memory-constrained systems" - depends on SQUASHFS - default n - help - Saying Y here allows you to specify cache size. - - If unsure, say N. - -config SQUASHFS_FRAGMENT_CACHE_SIZE - int "Number of fragments cached" if SQUASHFS_EMBEDDED - depends on SQUASHFS - default "3" - help - By default SquashFS caches the last 3 fragments read from - the filesystem. Increasing this amount may mean SquashFS - has to re-read fragments less often from disk, at the expense - of extra system memory. Decreasing this amount will mean - SquashFS uses less memory at the expense of extra reads from disk. - - Note there must be at least one cached fragment. Anything - much more than three will probably not make much difference. - -config VXFS_FS - tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" - depends on BLOCK - help - FreeVxFS is a file system driver that support the VERITAS VxFS(TM) - file system format. VERITAS VxFS(TM) is the standard file system - of SCO UnixWare (and possibly others) and optionally available - for Sunsoft Solaris, HP-UX and many other operating systems. - Currently only readonly access is supported. - - NOTE: the file system type as used by mount(1), mount(2) and - fstab(5) is 'vxfs' as it describes the file system format, not - the actual driver. - - To compile this as a module, choose M here: the module will be - called freevxfs. If unsure, say N. - -config MINIX_FS - tristate "Minix file system support" - depends on BLOCK - help - Minix is a simple operating system used in many classes about OS's. - The minix file system (method to organize files on a hard disk - partition or a floppy disk) was the original file system for Linux, - but has been superseded by the second extended file system ext2fs. - You don't want to use the minix file system on your hard disk - because of certain built-in restrictions, but it is sometimes found - on older Linux floppy disks. This option will enlarge your kernel - by about 28 KB. If unsure, say N. - - To compile this file system support as a module, choose M here: the - module will be called minix. Note that the file system of your root - partition (the one containing the directory /) cannot be compiled as - a module. - -config OMFS_FS - tristate "SonicBlue Optimized MPEG File System support" - depends on BLOCK - select CRC_ITU_T - help - This is the proprietary file system used by the Rio Karma music - player and ReplayTV DVR. Despite the name, this filesystem is not - more efficient than a standard FS for MPEG files, in fact likely - the opposite is true. Say Y if you have either of these devices - and wish to mount its disk. - - To compile this file system support as a module, choose M here: the - module will be called omfs. If unsure, say N. - -config HPFS_FS - tristate "OS/2 HPFS file system support" - depends on BLOCK - help - OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS - is the file system used for organizing files on OS/2 hard disk - partitions. Say Y if you want to be able to read files from and - write files to an OS/2 HPFS partition on your hard drive. OS/2 - floppies however are in regular MSDOS format, so you don't need this - option in order to be able to read them. Read - <file:Documentation/filesystems/hpfs.txt>. - - To compile this file system support as a module, choose M here: the - module will be called hpfs. If unsure, say N. - - -config QNX4FS_FS - tristate "QNX4 file system support (read only)" - depends on BLOCK - help - This is the file system used by the real-time operating systems - QNX 4 and QNX 6 (the latter is also called QNX RTP). - Further information is available at <http://www.qnx.com/>. - Say Y if you intend to mount QNX hard disks or floppies. - Unless you say Y to "QNX4FS read-write support" below, you will - only be able to read these file systems. - - To compile this file system support as a module, choose M here: the - module will be called qnx4. - - If you don't know whether you need it, then you don't need it: - answer N. - -config QNX4FS_RW - bool "QNX4FS write support (DANGEROUS)" - depends on QNX4FS_FS && EXPERIMENTAL && BROKEN - help - Say Y if you want to test write support for QNX4 file systems. - - It's currently broken, so for now: - answer N. - -config ROMFS_FS - tristate "ROM file system support" - depends on BLOCK - ---help--- - This is a very small read-only file system mainly intended for - initial ram disks of installation disks, but it could be used for - other read-only media as well. Read - <file:Documentation/filesystems/romfs.txt> for details. - - To compile this file system support as a module, choose M here: the - module will be called romfs. Note that the file system of your - root partition (the one containing the directory /) cannot be a - module. - - If you don't know whether you need it, then you don't need it: - answer N. - - -config SYSV_FS - tristate "System V/Xenix/V7/Coherent file system support" - depends on BLOCK - help - SCO, Xenix and Coherent are commercial Unix systems for Intel - machines, and Version 7 was used on the DEC PDP-11. Saying Y - here would allow you to read from their floppies and hard disk - partitions. - - If you have floppies or hard disk partitions like that, it is likely - that they contain binaries from those other Unix systems; in order - to run these binaries, you will want to install linux-abi which is - a set of kernel modules that lets you run SCO, Xenix, Wyse, - UnixWare, Dell Unix and System V programs under Linux. It is - available via FTP (user: ftp) from - <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>). - NOTE: that will work only for binaries from Intel-based systems; - PDP ones will have to wait until somebody ports Linux to -11 ;-) - - If you only intend to mount files from some other Unix over the - network using NFS, you don't need the System V file system support - (but you need NFS file system support obviously). - - Note that this option is generally not needed for floppies, since a - good portable way to transport files and directories between unixes - (and even other operating systems) is given by the tar program ("man - tar" or preferably "info tar"). Note also that this option has - nothing whatsoever to do with the option "System V IPC". Read about - the System V file system in - <file:Documentation/filesystems/sysv-fs.txt>. - Saying Y here will enlarge your kernel by about 27 KB. - - To compile this as a module, choose M here: the module will be called - sysv. - - If you haven't heard about all of this before, it's safe to say N. - - -config UFS_FS - tristate "UFS file system support (read only)" - depends on BLOCK - help - BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, - OpenBSD and NeXTstep) use a file system called UFS. Some System V - Unixes can create and mount hard disk partitions and diskettes using - this file system as well. Saying Y here will allow you to read from - these partitions; if you also want to write to them, say Y to the - experimental "UFS file system write support", below. Please read the - file <file:Documentation/filesystems/ufs.txt> for more information. - - The recently released UFS2 variant (used in FreeBSD 5.x) is - READ-ONLY supported. - - Note that this option is generally not needed for floppies, since a - good portable way to transport files and directories between unixes - (and even other operating systems) is given by the tar program ("man - tar" or preferably "info tar"). - - When accessing NeXTstep files, you may need to convert them from the - NeXT character set to the Latin1 character set; use the program - recode ("info recode") for this purpose. - - To compile the UFS file system support as a module, choose M here: the - module will be called ufs. - - If you haven't heard about all of this before, it's safe to say N. - -config UFS_FS_WRITE - bool "UFS file system write support (DANGEROUS)" - depends on UFS_FS && EXPERIMENTAL - help - Say Y here if you want to try writing to UFS partitions. This is - experimental, so you should back up your UFS partitions beforehand. - -config UFS_DEBUG - bool "UFS debugging" - depends on UFS_FS - help - If you are experiencing any problems with the UFS filesystem, say - Y here. This will result in _many_ additional debugging messages to be - written to the system log. +source "fs/cramfs/Kconfig" +source "fs/squashfs/Kconfig" +source "fs/freevxfs/Kconfig" +source "fs/minix/Kconfig" +source "fs/omfs/Kconfig" +source "fs/hpfs/Kconfig" +source "fs/qnx4/Kconfig" +source "fs/romfs/Kconfig" +source "fs/sysv/Kconfig" +source "fs/ufs/Kconfig" endif # MISC_FILESYSTEMS @@ -1193,173 +241,8 @@ menuconfig NETWORK_FILESYSTEMS if NETWORK_FILESYSTEMS -config NFS_FS - tristate "NFS client support" - depends on INET - select LOCKD - select SUNRPC - select NFS_ACL_SUPPORT if NFS_V3_ACL - help - Choose Y here if you want to access files residing on other - computers using Sun's Network File System protocol. To compile - this file system support as a module, choose M here: the module - will be called nfs. - - To mount file systems exported by NFS servers, you also need to - install the user space mount.nfs command which can be found in - the Linux nfs-utils package, available from http://linux-nfs.org/. - Information about using the mount command is available in the - mount(8) man page. More detail about the Linux NFS client - implementation is available via the nfs(5) man page. - - Below you can choose which versions of the NFS protocol are - available in the kernel to mount NFS servers. Support for NFS - version 2 (RFC 1094) is always available when NFS_FS is selected. - - To configure a system which mounts its root file system via NFS - at boot time, say Y here, select "Kernel level IP - autoconfiguration" in the NETWORK menu, and select "Root file - system on NFS" below. You cannot compile this file system as a - module in this case. - - If unsure, say N. - -config NFS_V3 - bool "NFS client support for NFS version 3" - depends on NFS_FS - help - This option enables support for version 3 of the NFS protocol - (RFC 1813) in the kernel's NFS client. - - If unsure, say Y. - -config NFS_V3_ACL - bool "NFS client support for the NFSv3 ACL protocol extension" - depends on NFS_V3 - help - Some NFS servers support an auxiliary NFSv3 ACL protocol that - Sun added to Solaris but never became an official part of the - NFS version 3 protocol. This protocol extension allows - applications on NFS clients to manipulate POSIX Access Control - Lists on files residing on NFS servers. NFS servers enforce - ACLs on local files whether this protocol is available or not. - - Choose Y here if your NFS server supports the Solaris NFSv3 ACL - protocol extension and you want your NFS client to allow - applications to access and modify ACLs on files on the server. - - Most NFS servers don't support the Solaris NFSv3 ACL protocol - extension. You can choose N here or specify the "noacl" mount - option to prevent your NFS client from trying to use the NFSv3 - ACL protocol. - - If unsure, say N. - -config NFS_V4 - bool "NFS client support for NFS version 4 (EXPERIMENTAL)" - depends on NFS_FS && EXPERIMENTAL - select RPCSEC_GSS_KRB5 - help - This option enables support for version 4 of the NFS protocol - (RFC 3530) in the kernel's NFS client. - - To mount NFS servers using NFSv4, you also need to install user - space programs which can be found in the Linux nfs-utils package, - available from http://linux-nfs.org/. - - If unsure, say N. - -config ROOT_NFS - bool "Root file system on NFS" - depends on NFS_FS=y && IP_PNP - help - If you want your system to mount its root file system via NFS, - choose Y here. This is common practice for managing systems - without local permanent storage. For details, read - <file:Documentation/filesystems/nfsroot.txt>. - - Most people say N here. - -config NFSD - tristate "NFS server support" - depends on INET - select LOCKD - select SUNRPC - select EXPORTFS - select NFS_ACL_SUPPORT if NFSD_V2_ACL - help - Choose Y here if you want to allow other computers to access - files residing on this system using Sun's Network File System - protocol. To compile the NFS server support as a module, - choose M here: the module will be called nfsd. - - You may choose to use a user-space NFS server instead, in which - case you can choose N here. - - To export local file systems using NFS, you also need to install - user space programs which can be found in the Linux nfs-utils - package, available from http://linux-nfs.org/. More detail about - the Linux NFS server implementation is available via the - exports(5) man page. - - Below you can choose which versions of the NFS protocol are - available to clients mounting the NFS server on this system. - Support for NFS version 2 (RFC 1094) is always available when - CONFIG_NFSD is selected. - - If unsure, say N. - -config NFSD_V2_ACL - bool - depends on NFSD - -config NFSD_V3 - bool "NFS server support for NFS version 3" - depends on NFSD - help - This option enables support in your system's NFS server for - version 3 of the NFS protocol (RFC 1813). - - If unsure, say Y. - -config NFSD_V3_ACL - bool "NFS server support for the NFSv3 ACL protocol extension" - depends on NFSD_V3 - select NFSD_V2_ACL - help - Solaris NFS servers support an auxiliary NFSv3 ACL protocol that - never became an official part of the NFS version 3 protocol. - This protocol extension allows applications on NFS clients to - manipulate POSIX Access Control Lists on files residing on NFS - servers. NFS servers enforce POSIX ACLs on local files whether - this protocol is available or not. - - This option enables support in your system's NFS server for the - NFSv3 ACL protocol extension allowing NFS clients to manipulate - POSIX ACLs on files exported by your system's NFS server. NFS - clients which support the Solaris NFSv3 ACL protocol can then - access and modify ACLs on your NFS server. - - To store ACLs on your NFS server, you also need to enable ACL- - related CONFIG options for your local file systems of choice. - - If unsure, say N. - -config NFSD_V4 - bool "NFS server support for NFS version 4 (EXPERIMENTAL)" - depends on NFSD && PROC_FS && EXPERIMENTAL - select NFSD_V3 - select FS_POSIX_ACL - select RPCSEC_GSS_KRB5 - help - This option enables support in your system's NFS server for - version 4 of the NFS protocol (RFC 3530). - - To export files using NFSv4, you need to install additional user - space programs which can be found in the Linux nfs-utils package, - available from http://linux-nfs.org/. - - If unsure, say N. +source "fs/nfs/Kconfig" +source "fs/nfsd/Kconfig" config LOCKD tristate @@ -1381,221 +264,13 @@ config NFS_COMMON depends on NFSD || NFS_FS default y -config SUNRPC - tristate - -config SUNRPC_GSS - tristate - -config SUNRPC_XPRT_RDMA - tristate - depends on SUNRPC && INFINIBAND && EXPERIMENTAL - default SUNRPC && INFINIBAND - help - This option enables an RPC client transport capability that - allows the NFS client to mount servers via an RDMA-enabled - transport. - - To compile RPC client RDMA transport support as a module, - choose M here: the module will be called xprtrdma. - - If unsure, say N. - -config SUNRPC_REGISTER_V4 - bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)" - depends on SUNRPC && EXPERIMENTAL - default n - help - Sun added support for registering RPC services at an IPv6 - address by creating two new versions of the rpcbind protocol - (RFC 1833). - - This option enables support in the kernel RPC server for - registering kernel RPC services via version 4 of the rpcbind - protocol. If you enable this option, you must run a portmapper - daemon that supports rpcbind protocol version 4. - - Serving NFS over IPv6 from knfsd (the kernel's NFS server) - requires that you enable this option and use a portmapper that - supports rpcbind version 4. - - If unsure, say N to get traditional behavior (register kernel - RPC services using only rpcbind version 2). Distributions - using the legacy Linux portmapper daemon must say N here. - -config RPCSEC_GSS_KRB5 - tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" - depends on SUNRPC && EXPERIMENTAL - select SUNRPC_GSS - select CRYPTO - select CRYPTO_MD5 - select CRYPTO_DES - select CRYPTO_CBC - help - Choose Y here to enable Secure RPC using the Kerberos version 5 - GSS-API mechanism (RFC 1964). - - Secure RPC calls with Kerberos require an auxiliary user-space - daemon which may be found in the Linux nfs-utils package - available from http://linux-nfs.org/. In addition, user-space - Kerberos support should be installed. - - If unsure, say N. - -config RPCSEC_GSS_SPKM3 - tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)" - depends on SUNRPC && EXPERIMENTAL - select SUNRPC_GSS - select CRYPTO - select CRYPTO_MD5 - select CRYPTO_DES - select CRYPTO_CAST5 - select CRYPTO_CBC - help - Choose Y here to enable Secure RPC using the SPKM3 public key - GSS-API mechansim (RFC 2025). - - Secure RPC calls with SPKM3 require an auxiliary userspace - daemon which may be found in the Linux nfs-utils package - available from http://linux-nfs.org/. - - If unsure, say N. - -config SMB_FS - tristate "SMB file system support (OBSOLETE, please use CIFS)" - depends on INET - select NLS - help - SMB (Server Message Block) is the protocol Windows for Workgroups - (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share - files and printers over local networks. Saying Y here allows you to - mount their file systems (often called "shares" in this context) and - access them just like any other Unix directory. Currently, this - works only if the Windows machines use TCP/IP as the underlying - transport protocol, and not NetBEUI. For details, read - <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO, - available from <http://www.tldp.org/docs.html#howto>. - - Note: if you just want your box to act as an SMB *server* and make - files and printing services available to Windows clients (which need - to have a TCP/IP stack), you don't need to say Y here; you can use - the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>) - for that. - - General information about how to connect Linux, Windows machines and - Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. - - To compile the SMB support as a module, choose M here: - the module will be called smbfs. Most people say N, however. - -config SMB_NLS_DEFAULT - bool "Use a default NLS" - depends on SMB_FS - help - Enabling this will make smbfs use nls translations by default. You - need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls - settings and you need to give the default nls for the SMB server as - CONFIG_SMB_NLS_REMOTE. - - The nls settings can be changed at mount time, if your smbmount - supports that, using the codepage and iocharset parameters. - - smbmount from samba 2.2.0 or later supports this. - -config SMB_NLS_REMOTE - string "Default Remote NLS Option" - depends on SMB_NLS_DEFAULT - default "cp437" - help - This setting allows you to specify a default value for which - codepage the server uses. If this field is left blank no - translations will be done by default. The local codepage/charset - default to CONFIG_NLS_DEFAULT. - - The nls settings can be changed at mount time, if your smbmount - supports that, using the codepage and iocharset parameters. - - smbmount from samba 2.2.0 or later supports this. - +source "net/sunrpc/Kconfig" +source "fs/smbfs/Kconfig" source "fs/cifs/Kconfig" - -config NCP_FS - tristate "NCP file system support (to mount NetWare volumes)" - depends on IPX!=n || INET - help - NCP (NetWare Core Protocol) is a protocol that runs over IPX and is - used by Novell NetWare clients to talk to file servers. It is to - IPX what NFS is to TCP/IP, if that helps. Saying Y here allows you - to mount NetWare file server volumes and to access them just like - any other Unix directory. For details, please read the file - <file:Documentation/filesystems/ncpfs.txt> in the kernel source and - the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>. - - You do not have to say Y here if you want your Linux box to act as a - file *server* for Novell NetWare clients. - - General information about how to connect Linux, Windows machines and - Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. - - To compile this as a module, choose M here: the module will be called - ncpfs. Say N unless you are connected to a Novell network. - source "fs/ncpfs/Kconfig" - -config CODA_FS - tristate "Coda file system support (advanced network fs)" - depends on INET - help - Coda is an advanced network file system, similar to NFS in that it - enables you to mount file systems of a remote server and access them - with regular Unix commands as if they were sitting on your hard - disk. Coda has several advantages over NFS: support for - disconnected operation (e.g. for laptops), read/write server - replication, security model for authentication and encryption, - persistent client caches and write back caching. - - If you say Y here, your Linux box will be able to act as a Coda - *client*. You will need user level code as well, both for the - client and server. Servers are currently user level, i.e. they need - no kernel support. Please read - <file:Documentation/filesystems/coda.txt> and check out the Coda - home page <http://www.coda.cs.cmu.edu/>. - - To compile the coda client support as a module, choose M here: the - module will be called coda. - -config AFS_FS - tristate "Andrew File System support (AFS) (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL - select AF_RXRPC - help - If you say Y here, you will get an experimental Andrew File System - driver. It currently only supports unsecured read-only AFS access. - - See <file:Documentation/filesystems/afs.txt> for more information. - - If unsure, say N. - -config AFS_DEBUG - bool "AFS dynamic debugging" - depends on AFS_FS - help - Say Y here to make runtime controllable debugging messages appear. - - See <file:Documentation/filesystems/afs.txt> for more information. - - If unsure, say N. - -config 9P_FS - tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)" - depends on INET && NET_9P && EXPERIMENTAL - help - If you say Y here, you will get experimental support for - Plan 9 resource sharing via the 9P2000 protocol. - - See <http://v9fs.sf.net> for more information. - - If unsure, say N. +source "fs/coda/Kconfig" +source "fs/afs/Kconfig" +source "fs/9p/Kconfig" endif # NETWORK_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 38bc735c67a..dc20db34867 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -69,10 +69,12 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 +obj-$(CONFIG_EXT2_FS) += ext2/ +# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 +# unless explicitly requested by rootfstype +obj-$(CONFIG_EXT4_FS) += ext4/ obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ -obj-$(CONFIG_EXT2_FS) += ext2/ obj-$(CONFIG_CRAMFS) += cramfs/ obj-$(CONFIG_SQUASHFS) += squashfs/ obj-y += ramfs/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig new file mode 100644 index 00000000000..e55182a7460 --- /dev/null +++ b/fs/adfs/Kconfig @@ -0,0 +1,27 @@ +config ADFS_FS + tristate "ADFS file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + The Acorn Disc Filing System is the standard file system of the + RiscOS operating system which runs on Acorn's ARM-based Risc PC + systems and the Acorn Archimedes range of machines. If you say Y + here, Linux will be able to read from ADFS partitions on hard drives + and from ADFS-formatted floppy discs. If you also want to be able to + write to those devices, say Y to "ADFS write support" below. + + The ADFS partition should be the first partition (i.e., + /dev/[hs]d?1) on each of your drives. Please read the file + <file:Documentation/filesystems/adfs.txt> for further details. + + To compile this code as a module, choose M here: the module will be + called adfs. + + If unsure, say N. + +config ADFS_FS_RW + bool "ADFS write support (DANGEROUS)" + depends on ADFS_FS + help + If you say Y here, you will be able to write to ADFS partitions on + hard drives and ADFS-formatted floppy disks. This is experimental + codes, so if you're unsure, say N. diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig new file mode 100644 index 00000000000..cfad9afb476 --- /dev/null +++ b/fs/affs/Kconfig @@ -0,0 +1,21 @@ +config AFFS_FS + tristate "Amiga FFS file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + The Fast File System (FFS) is the common file system used on hard + disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y + if you want to be able to read and write files from and to an Amiga + FFS partition on your hard drive. Amiga floppies however cannot be + read with this driver due to an incompatibility of the floppy + controller used in an Amiga and the standard floppy controller in + PCs and workstations. Read <file:Documentation/filesystems/affs.txt> + and <file:fs/affs/Changes>. + + With this driver you can also mount disk files used by Bernd + Schmidt's Un*X Amiga Emulator + (<http://www.freiburg.linux.de/~uae/>). + If you want to do this, you will also need to say Y or M to "Loop + device support", above. + + To compile this file system support as a module, choose M here: the + module will be called affs. If unsure, say N. diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig new file mode 100644 index 00000000000..e7b522fe15e --- /dev/null +++ b/fs/afs/Kconfig @@ -0,0 +1,21 @@ +config AFS_FS + tristate "Andrew File System support (AFS) (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + select AF_RXRPC + help + If you say Y here, you will get an experimental Andrew File System + driver. It currently only supports unsecured read-only AFS access. + + See <file:Documentation/filesystems/afs.txt> for more information. + + If unsure, say N. + +config AFS_DEBUG + bool "AFS dynamic debugging" + depends on AFS_FS + help + Say Y here to make runtime controllable debugging messages appear. + + See <file:Documentation/filesystems/afs.txt> for more information. + + If unsure, say N. diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig new file mode 100644 index 00000000000..5f3bea90911 --- /dev/null +++ b/fs/autofs/Kconfig @@ -0,0 +1,21 @@ +config AUTOFS_FS + tristate "Kernel automounter support" + help + The automounter is a tool to automatically mount remote file systems + on demand. This implementation is partially kernel-based to reduce + overhead in the already-mounted case; this is unlike the BSD + automounter (amd), which is a pure user space daemon. + + To use the automounter you need the user-space tools from the autofs + package; you can find the location in <file:Documentation/Changes>. + You also want to answer Y to "NFS file system support", below. + + If you want to use the newer version of the automounter with more + features, say N here and say Y to "Kernel automounter v4 support", + below. + + To compile this support as a module, choose M here: the module will be + called autofs. + + If you are not a part of a fairly large, distributed network, you + probably do not need an automounter, and can say N here. diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig new file mode 100644 index 00000000000..1204d6384d3 --- /dev/null +++ b/fs/autofs4/Kconfig @@ -0,0 +1,20 @@ +config AUTOFS4_FS + tristate "Kernel automounter version 4 support (also supports v3)" + help + The automounter is a tool to automatically mount remote file systems + on demand. This implementation is partially kernel-based to reduce + overhead in the already-mounted case; this is unlike the BSD + automounter (amd), which is a pure user space daemon. + + To use the automounter you need the user-space tools from + <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also + want to answer Y to "NFS file system support", below. + + To compile this support as a module, choose M here: the module will be + called autofs4. You will need to add "alias autofs autofs4" to your + modules configuration file. + + If you are not a part of a fairly large, distributed network or + don't have a laptop which needs to dynamically reconfigure to the + local network, you probably do not need an automounter, and can say + N here. diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig new file mode 100644 index 00000000000..7835d30f211 --- /dev/null +++ b/fs/befs/Kconfig @@ -0,0 +1,26 @@ +config BEFS_FS + tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select NLS + help + The BeOS File System (BeFS) is the native file system of Be, Inc's + BeOS. Notable features include support for arbitrary attributes + on files and directories, and database-like indices on selected + attributes. (Also note that this driver doesn't make those features + available at this time). It is a 64 bit filesystem, so it supports + extremely large volumes and files. + + If you use this filesystem, you should also say Y to at least one + of the NLS (native language support) options below. + + If you don't know what this is about, say N. + + To compile this as a module, choose M here: the module will be + called befs. + +config BEFS_DEBUG + bool "Debug BeFS" + depends on BEFS_FS + help + If you say Y here, you can use the 'debug' mount option to enable + debugging output from the driver. diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig new file mode 100644 index 00000000000..c2336c62024 --- /dev/null +++ b/fs/bfs/Kconfig @@ -0,0 +1,19 @@ +config BFS_FS + tristate "BFS file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + Boot File System (BFS) is a file system used under SCO UnixWare to + allow the bootloader access to the kernel image and other important + files during the boot process. It is usually mounted under /stand + and corresponds to the slice marked as "STAND" in the UnixWare + partition. You should say Y if you want to read or write the files + on your /stand slice from within Linux. You then also need to say Y + to "UnixWare slices support", below. More information about the BFS + file system is contained in the file + <file:Documentation/filesystems/bfs.txt>. + + If you don't know what this is about, say N. + + To compile this as a module, choose M here: the module will be called + bfs. Note that the file system of your root partition (the one + containing the directory /) cannot be compiled as a module. diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e3ff2b9e602..33b7235f853 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1208,9 +1208,11 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, * check for an ELF header. If we find one, dump the first page to * aid in determining what was mapped here. */ - if (FILTER(ELF_HEADERS) && vma->vm_file != NULL && vma->vm_pgoff == 0) { + if (FILTER(ELF_HEADERS) && + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { u32 __user *header = (u32 __user *) vma->vm_start; u32 word; + mm_segment_t fs = get_fs(); /* * Doing it this way gets the constant folded by GCC. */ @@ -1223,7 +1225,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, magic.elfmag[EI_MAG1] = ELFMAG1; magic.elfmag[EI_MAG2] = ELFMAG2; magic.elfmag[EI_MAG3] = ELFMAG3; - if (get_user(word, header) == 0 && word == magic.cmp) + /* + * Switch to the user "segment" for get_user(), + * then put back what elf_core_dump() had in place. + */ + set_fs(USER_DS); + if (unlikely(get_user(word, header))) + word = 0; + set_fs(fs); + if (word == magic.cmp) return PAGE_SIZE; } diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 77ebc3c263d..549b0144da1 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -140,7 +140,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip_vec_idx(bip, bip->bip_vcnt); BUG_ON(iv == NULL); - BUG_ON(iv->bv_page != NULL); iv->bv_page = page; iv->bv_len = len; @@ -465,7 +464,7 @@ static int bio_integrity_verify(struct bio *bio) if (ret) { kunmap_atomic(kaddr, KM_USER0); - break; + return ret; } sectors = bv->bv_len / bi->sector_size; @@ -493,18 +492,13 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - int error = bip->bip_error; + int error; - if (bio_integrity_verify(bio)) { - clear_bit(BIO_UPTODATE, &bio->bi_flags); - error = -EIO; - } + error = bio_integrity_verify(bio); /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - - if (bio->bi_end_io) - bio->bi_end_io(bio, error); + bio_endio(bio, error); } /** @@ -525,7 +519,17 @@ void bio_integrity_endio(struct bio *bio, int error) BUG_ON(bip->bip_bio != bio); - bip->bip_error = error; + /* In case of an I/O error there is no point in verifying the + * integrity metadata. Restore original bio end_io handler + * and run it. + */ + if (error) { + bio->bi_end_io = bip->bip_end_io; + bio_endio(bio, error); + + return; + } + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bip->bip_work); } @@ -302,9 +302,10 @@ void bio_init(struct bio *bio) struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { struct bio *bio = NULL; + void *uninitialized_var(p); if (bs) { - void *p = mempool_alloc(bs->bio_pool, gfp_mask); + p = mempool_alloc(bs->bio_pool, gfp_mask); if (p) bio = p + bs->front_pad; @@ -329,7 +330,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) } if (unlikely(!bvl)) { if (bs) - mempool_free(bio, bs->bio_pool); + mempool_free(p, bs->bio_pool); else kfree(bio); bio = NULL; diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig new file mode 100644 index 00000000000..7bb3c020e57 --- /dev/null +++ b/fs/btrfs/Kconfig @@ -0,0 +1,31 @@ +config BTRFS_FS + tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" + depends on EXPERIMENTAL + select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE + help + Btrfs is a new filesystem with extents, writable snapshotting, + support for multiple devices and many more features. + + Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET + FINALIZED. You should say N here unless you are interested in + testing Btrfs with non-critical data. + + To compile this file system support as a module, choose M here. The + module will be called btrfs. + + If unsure, say N. + +config BTRFS_FS_POSIX_ACL + bool "Btrfs POSIX Access Control Lists" + depends on BTRFS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 8e2fec05dbe..c84ca1f5259 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -16,11 +16,11 @@ * Boston, MA 021110-1307, USA. */ -#include <linux/version.h> #include <linux/kthread.h> #include <linux/list.h> #include <linux/spinlock.h> -# include <linux/freezer.h> +#include <linux/freezer.h> +#include <linux/ftrace.h> #include "async-thread.h" #define WORK_QUEUED_BIT 0 @@ -143,6 +143,7 @@ static int worker_loop(void *arg) struct btrfs_work *work; do { spin_lock_irq(&worker->lock); +again_locked: while (!list_empty(&worker->pending)) { cur = worker->pending.next; work = list_entry(cur, struct btrfs_work, list); @@ -165,14 +166,50 @@ static int worker_loop(void *arg) check_idle_worker(worker); } - worker->working = 0; if (freezing(current)) { + worker->working = 0; + spin_unlock_irq(&worker->lock); refrigerator(); } else { - set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&worker->lock); - if (!kthread_should_stop()) + if (!kthread_should_stop()) { + cpu_relax(); + /* + * we've dropped the lock, did someone else + * jump_in? + */ + smp_mb(); + if (!list_empty(&worker->pending)) + continue; + + /* + * this short schedule allows more work to + * come in without the queue functions + * needing to go through wake_up_process() + * + * worker->working is still 1, so nobody + * is going to try and wake us up + */ + schedule_timeout(1); + smp_mb(); + if (!list_empty(&worker->pending)) + continue; + + /* still no more work?, sleep for real */ + spin_lock_irq(&worker->lock); + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&worker->pending)) + goto again_locked; + + /* + * this makes sure we get a wakeup when someone + * adds something new to the queue + */ + worker->working = 0; + spin_unlock_irq(&worker->lock); + schedule(); + } __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); @@ -350,13 +387,14 @@ int btrfs_requeue_work(struct btrfs_work *work) { struct btrfs_worker_thread *worker = work->worker; unsigned long flags; + int wake = 0; if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) goto out; spin_lock_irqsave(&worker->lock, flags); - atomic_inc(&worker->num_pending); list_add_tail(&work->list, &worker->pending); + atomic_inc(&worker->num_pending); /* by definition we're busy, take ourselves off the idle * list @@ -368,10 +406,16 @@ int btrfs_requeue_work(struct btrfs_work *work) &worker->workers->worker_list); spin_unlock_irqrestore(&worker->workers->lock, flags); } + if (!worker->working) { + wake = 1; + worker->working = 1; + } spin_unlock_irqrestore(&worker->lock, flags); - + if (wake) + wake_up_process(worker->task); out: + return 0; } @@ -398,9 +442,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) } spin_lock_irqsave(&worker->lock, flags); + + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); check_busy_worker(worker); - list_add_tail(&work->list, &worker->pending); /* * avoid calling into wake_up_process if this thread has already diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a8c9693b75a..72677ce2b74 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -66,6 +66,9 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; + /* the space_info for where this inode's data allocations are done */ + struct btrfs_space_info *space_info; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ @@ -94,6 +97,11 @@ struct btrfs_inode { */ u64 delalloc_bytes; + /* total number of bytes that may be used for this inode for + * delalloc + */ + u64 reserved_bytes; + /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ee848d8585d..ab07627084f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -32,7 +32,6 @@ #include <linux/swap.h> #include <linux/writeback.h> #include <linux/bit_spinlock.h> -#include <linux/version.h> #include <linux/pagevec.h> #include "compat.h" #include "ctree.h" diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9e46c077681..42491d728e9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,22 +38,64 @@ static int balance_node_right(struct btrfs_trans_handle *trans, static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -inline void btrfs_init_path(struct btrfs_path *p) -{ - memset(p, 0, sizeof(*p)); -} - struct btrfs_path *btrfs_alloc_path(void) { struct btrfs_path *path; - path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); - if (path) { - btrfs_init_path(path); + path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + if (path) path->reada = 1; - } return path; } +/* + * set all locked nodes in the path to blocking locks. This should + * be done before scheduling + */ +noinline void btrfs_set_path_blocking(struct btrfs_path *p) +{ + int i; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (p->nodes[i] && p->locks[i]) + btrfs_set_lock_blocking(p->nodes[i]); + } +} + +/* + * reset all the locked nodes in the patch to spinning locks. + * + * held is used to keep lockdep happy, when lockdep is enabled + * we set held to a blocking lock before we go around and + * retake all the spinlocks in the path. You can safely use NULL + * for held + */ +noinline void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held) +{ + int i; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* lockdep really cares that we take all of these spinlocks + * in the right order. If any of the locks in the path are not + * currently blocking, it is going to complain. So, make really + * really sure by forcing the path to blocking before we clear + * the path blocking. + */ + if (held) + btrfs_set_lock_blocking(held); + btrfs_set_path_blocking(p); +#endif + + for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { + if (p->nodes[i] && p->locks[i]) + btrfs_clear_lock_blocking(p->nodes[i]); + } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (held) + btrfs_clear_lock_blocking(held); +#endif +} + /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { @@ -261,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, level, &ins); BUG_ON(ret); cow = btrfs_init_new_buffer(trans, root, prealloc_dest, - buf->len); + buf->len, level); } else { cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, @@ -272,6 +314,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (IS_ERR(cow)) return PTR_ERR(cow); + /* cow is set to blocking by btrfs_init_new_buffer */ + copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); @@ -388,17 +432,20 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(1); } - spin_lock(&root->fs_info->hash_lock); if (btrfs_header_generation(buf) == trans->transid && btrfs_header_owner(buf) == root->root_key.objectid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *cow_ret = buf; - spin_unlock(&root->fs_info->hash_lock); WARN_ON(prealloc_dest); return 0; } - spin_unlock(&root->fs_info->hash_lock); + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + + if (parent) + btrfs_set_lock_blocking(parent); + btrfs_set_lock_blocking(buf); + ret = __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0, prealloc_dest); @@ -504,6 +551,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (parent_nritems == 1) return 0; + btrfs_set_lock_blocking(parent); + for (i = start_slot; i < end_slot; i++) { int close = 1; @@ -564,6 +613,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, search_start = last_block; btrfs_tree_lock(cur); + btrfs_set_lock_blocking(cur); err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, @@ -862,6 +912,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; mid = path->nodes[level]; + WARN_ON(!path->locks[level]); WARN_ON(btrfs_header_generation(mid) != trans->transid); @@ -883,8 +934,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - btrfs_tree_lock(child); BUG_ON(!child); + btrfs_tree_lock(child); + btrfs_set_lock_blocking(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); BUG_ON(ret); @@ -900,6 +952,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); btrfs_tree_unlock(child); + path->locks[level] = 0; path->nodes[level] = NULL; clean_tree_block(trans, root, mid); @@ -924,6 +977,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, left = read_node_slot(root, parent, pslot - 1); if (left) { btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, 0); if (wret) { @@ -934,6 +988,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = read_node_slot(root, parent, pslot + 1); if (right) { btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, 0); if (wret) { @@ -1109,6 +1164,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 left_nr; btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; @@ -1155,7 +1212,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, */ if (right) { u32 right_nr; + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; @@ -1210,8 +1270,7 @@ static noinline void reada_for_search(struct btrfs_root *root, struct btrfs_disk_key disk_key; u32 nritems; u64 search; - u64 lowest_read; - u64 highest_read; + u64 target; u64 nread = 0; int direction = path->reada; struct extent_buffer *eb; @@ -1235,8 +1294,7 @@ static noinline void reada_for_search(struct btrfs_root *root, return; } - highest_read = search; - lowest_read = search; + target = search; nritems = btrfs_header_nritems(node); nr = slot; @@ -1256,27 +1314,80 @@ static noinline void reada_for_search(struct btrfs_root *root, break; } search = btrfs_node_blockptr(node, nr); - if ((search >= lowest_read && search <= highest_read) || - (search < lowest_read && lowest_read - search <= 16384) || - (search > highest_read && search - highest_read <= 16384)) { + if ((search <= target && target - search <= 65536) || + (search > target && search - target <= 65536)) { readahead_tree_block(root, search, blocksize, btrfs_node_ptr_generation(node, nr)); nread += blocksize; } nscan++; - if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) + if ((nread > 65536 || nscan > 32)) break; + } +} - if (nread > (256 * 1024) || nscan > 128) - break; +/* + * returns -EAGAIN if it had to drop the path, or zero if everything was in + * cache + */ +static noinline int reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + int slot; + int nritems; + struct extent_buffer *parent; + struct extent_buffer *eb; + u64 gen; + u64 block1 = 0; + u64 block2 = 0; + int ret = 0; + int blocksize; + + parent = path->nodes[level - 1]; + if (!parent) + return 0; - if (search < lowest_read) - lowest_read = search; - if (search > highest_read) - highest_read = search; + nritems = btrfs_header_nritems(parent); + slot = path->slots[level]; + blocksize = btrfs_level_size(root, level); + + if (slot > 0) { + block1 = btrfs_node_blockptr(parent, slot - 1); + gen = btrfs_node_ptr_generation(parent, slot - 1); + eb = btrfs_find_tree_block(root, block1, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block1 = 0; + free_extent_buffer(eb); + } + if (slot < nritems) { + block2 = btrfs_node_blockptr(parent, slot + 1); + gen = btrfs_node_ptr_generation(parent, slot + 1); + eb = btrfs_find_tree_block(root, block2, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block2 = 0; + free_extent_buffer(eb); } + if (block1 || block2) { + ret = -EAGAIN; + btrfs_release_path(root, path); + if (block1) + readahead_tree_block(root, block1, blocksize, 0); + if (block2) + readahead_tree_block(root, block2, blocksize, 0); + + if (block1) { + eb = read_tree_block(root, block1, blocksize, 0); + free_extent_buffer(eb); + } + if (block1) { + eb = read_tree_block(root, block2, blocksize, 0); + free_extent_buffer(eb); + } + } + return ret; } + /* * when we walk down the tree, it is usually safe to unlock the higher layers * in the tree. The exceptions are when our path goes through slot 0, because @@ -1328,6 +1439,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } /* + * This releases any locks held in the path starting at level and + * going all the way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few + * corner cases, such as COW of the block at slot zero in the node. This + * ignores those rules, and it should only be called when there are no + * more updates to be done higher up in the tree. + */ +noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) +{ + int i; + + if (path->keep_locks || path->lowest_level) + return; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + continue; + if (!path->locks[i]) + continue; + btrfs_tree_unlock(path->nodes[i]); + path->locks[i] = 0; + } +} + +/* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf * level of the path (level 0) @@ -1387,32 +1524,30 @@ again: int wret; /* is a cow on this block not required */ - spin_lock(&root->fs_info->hash_lock); if (btrfs_header_generation(b) == trans->transid && btrfs_header_owner(b) == root->root_key.objectid && !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { - spin_unlock(&root->fs_info->hash_lock); goto cow_done; } - spin_unlock(&root->fs_info->hash_lock); /* ok, we have to cow, is our old prealloc the right * size? */ if (prealloc_block.objectid && prealloc_block.offset != b->len) { + btrfs_release_path(root, p); btrfs_free_reserved_extent(root, prealloc_block.objectid, prealloc_block.offset); prealloc_block.objectid = 0; + goto again; } /* * for higher level blocks, try not to allocate blocks * with the block and the parent locks held. */ - if (level > 1 && !prealloc_block.objectid && - btrfs_path_lock_waiting(p, level)) { + if (level > 0 && !prealloc_block.objectid) { u32 size = b->len; u64 hint = b->start; @@ -1425,6 +1560,8 @@ again: goto again; } + btrfs_set_path_blocking(p); + wret = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], @@ -1446,6 +1583,22 @@ cow_done: if (!p->skip_locking) p->locks[level] = 1; + btrfs_clear_path_blocking(p, NULL); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + * + * If cow is true, then we might be changing slot zero, + * which may require changing the parent. So, we can't + * drop the lock until after we know which slot we're + * operating on. + */ + if (!cow) + btrfs_unlock_up_safe(p, level + 1); + ret = check_block(root, p, level); if (ret) { ret = -1; @@ -1453,6 +1606,7 @@ cow_done: } ret = bin_search(b, key, level, &slot); + if (level != 0) { if (ret && slot > 0) slot -= 1; @@ -1460,7 +1614,16 @@ cow_done: if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { - int sret = split_node(trans, root, p, level); + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + BUG_ON(sret > 0); if (sret) { ret = sret; @@ -1468,9 +1631,19 @@ cow_done: } b = p->nodes[level]; slot = p->slots[level]; - } else if (ins_len < 0) { - int sret = balance_level(trans, root, p, - level); + } else if (ins_len < 0 && + btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + if (sret) { ret = sret; goto done; @@ -1504,7 +1677,7 @@ cow_done: * of the btree by dropping locks before * we read. */ - if (level > 1) { + if (level > 0) { btrfs_release_path(NULL, p); if (tmp) free_extent_buffer(tmp); @@ -1519,6 +1692,7 @@ cow_done: free_extent_buffer(tmp); goto again; } else { + btrfs_set_path_blocking(p); if (tmp) free_extent_buffer(tmp); if (should_reada) @@ -1528,14 +1702,29 @@ cow_done: b = read_node_slot(root, b, slot); } } - if (!p->skip_locking) - btrfs_tree_lock(b); + if (!p->skip_locking) { + int lret; + + btrfs_clear_path_blocking(p, NULL); + lret = btrfs_try_spin_lock(b); + + if (!lret) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p, b); + } + } } else { p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { - int sret = split_leaf(trans, root, key, + int sret; + + btrfs_set_path_blocking(p); + sret = split_leaf(trans, root, key, p, ins_len, ret == 0); + btrfs_clear_path_blocking(p, NULL); + BUG_ON(sret > 0); if (sret) { ret = sret; @@ -1549,12 +1738,16 @@ cow_done: } ret = 1; done: + /* + * we don't really know what they plan on doing with the path + * from here on, so for now just mark it as blocking + */ + btrfs_set_path_blocking(p); if (prealloc_block.objectid) { btrfs_free_reserved_extent(root, prealloc_block.objectid, prealloc_block.offset); } - return ret; } @@ -1578,6 +1771,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); BUG_ON(ret); + btrfs_set_lock_blocking(eb); + parent = eb; while (1) { level = btrfs_header_level(parent); @@ -1602,6 +1797,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, eb = read_tree_block(root, bytenr, blocksize, generation); btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); } /* @@ -1626,6 +1822,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, eb = read_tree_block(root, bytenr, blocksize, generation); btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); } ret = btrfs_cow_block(trans, root, eb, parent, slot, @@ -2172,6 +2369,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root right = read_node_slot(root, upper, slot + 1); btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + free_space = btrfs_leaf_free_space(root, right); if (free_space < data_size) goto out_unlock; @@ -2367,6 +2566,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root left = read_node_slot(root, path->nodes[1], slot - 1); btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + free_space = btrfs_leaf_free_space(root, left); if (free_space < data_size) { ret = 1; @@ -2825,6 +3026,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, path->keep_locks = 0; BUG_ON(ret); + /* + * make sure any changes to the path from split_leaf leave it + * in a blocking state + */ + btrfs_set_path_blocking(path); + leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); @@ -3354,6 +3561,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, BUG(); } out: + btrfs_unlock_up_safe(path, 1); return ret; } @@ -3441,15 +3649,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, { int ret; u64 root_gen = btrfs_header_generation(path->nodes[1]); + u64 parent_start = path->nodes[1]->start; + u64 parent_owner = btrfs_header_owner(path->nodes[1]); ret = del_ptr(trans, root, path, 1, path->slots[1]); if (ret) return ret; + /* + * btrfs_free_extent is expensive, we want to make sure we + * aren't holding any locks when we call it + */ + btrfs_unlock_up_safe(path, 0); + ret = btrfs_free_extent(trans, root, bytenr, btrfs_level_size(root, 0), - path->nodes[1]->start, - btrfs_header_owner(path->nodes[1]), + parent_start, parent_owner, root_gen, 0, 1); return ret; } @@ -3721,6 +3936,7 @@ find_next_key: */ if (slot >= nritems) { path->slots[level] = slot; + btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, cache_only, min_trans); if (sret == 0) { @@ -3738,16 +3954,20 @@ find_next_key: unlock_up(path, level, 1); goto out; } + btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); btrfs_tree_lock(cur); + path->locks[level - 1] = 1; path->nodes[level - 1] = cur; unlock_up(path, level, 1); + btrfs_clear_path_blocking(path, NULL); } out: if (ret == 0) memcpy(min_key, &found_key, sizeof(found_key)); + btrfs_set_path_blocking(path); return ret; } @@ -3843,6 +4063,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) if (ret < 0) return ret; + btrfs_set_path_blocking(path); nritems = btrfs_header_nritems(path->nodes[0]); /* * by releasing the path above we dropped all our locks. A balance @@ -3873,6 +4094,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) free_extent_buffer(next); } + /* the path was set to blocking above */ if (level == 1 && (path->locks[1] || path->skip_locking) && path->reada) reada_for_search(root, path, level, slot, 0); @@ -3881,6 +4103,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) if (!path->skip_locking) { WARN_ON(!btrfs_tree_locked(c)); btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); } break; } @@ -3897,12 +4120,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) path->locks[level] = 1; if (!level) break; + + btrfs_set_path_blocking(path); if (level == 1 && path->locks[1] && path->reada) reada_for_search(root, path, level, slot, 0); next = read_node_slot(root, next, 0); if (!path->skip_locking) { WARN_ON(!btrfs_tree_locked(path->nodes[level])); btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); } } done: @@ -3927,6 +4153,7 @@ int btrfs_previous_item(struct btrfs_root *root, while (1) { if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); ret = btrfs_prev_leaf(root, path); if (ret != 0) return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index eee060f8811..82491ba8fa4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -43,11 +43,7 @@ struct btrfs_ordered_sum; #define BTRFS_ACL_NOT_CACHED ((void *)-1) -#ifdef CONFIG_LOCKDEP -# define BTRFS_MAX_LEVEL 7 -#else -# define BTRFS_MAX_LEVEL 8 -#endif +#define BTRFS_MAX_LEVEL 8 /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -454,17 +450,11 @@ struct btrfs_timespec { __le32 nsec; } __attribute__ ((__packed__)); -typedef enum { +enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, BTRFS_COMPRESS_LAST = 2, -} btrfs_compression_type; - -/* we don't understand any encryption methods right now */ -typedef enum { - BTRFS_ENCRYPTION_NONE = 0, - BTRFS_ENCRYPTION_LAST = 1, -} btrfs_encryption_type; +}; struct btrfs_inode_item { /* nfs style generation number */ @@ -606,13 +596,27 @@ struct btrfs_block_group_item { struct btrfs_space_info { u64 flags; - u64 total_bytes; - u64 bytes_used; - u64 bytes_pinned; - u64 bytes_reserved; - u64 bytes_readonly; - int full; - int force_alloc; + + u64 total_bytes; /* total bytes in the space */ + u64 bytes_used; /* total bytes used on disk */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + /* delalloc accounting */ + u64 bytes_delalloc; /* number of bytes reserved for allocation, + this space is not necessarily reserved yet + by the allocator */ + u64 bytes_may_use; /* number of bytes that may be used for + delalloc */ + + int full; /* indicates that we cannot allocate any more + chunks for this space */ + int force_alloc; /* set if we need to force a chunk alloc for + this space */ + struct list_head list; /* for block groups in our same type */ @@ -701,9 +705,7 @@ struct btrfs_fs_info { struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; - wait_queue_head_t async_submit_wait; - wait_queue_head_t tree_log_wait; struct btrfs_super_block super_copy; struct btrfs_super_block super_for_commit; @@ -711,7 +713,6 @@ struct btrfs_fs_info { struct super_block *sb; struct inode *btree_inode; struct backing_dev_info bdi; - spinlock_t hash_lock; struct mutex trans_mutex; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; @@ -730,10 +731,6 @@ struct btrfs_fs_info { atomic_t async_submit_draining; atomic_t nr_async_bios; atomic_t async_delalloc_pages; - atomic_t tree_log_writers; - atomic_t tree_log_commit; - unsigned long tree_log_batch; - u64 tree_log_transid; /* * this is used by the balancing code to wait for all the pending @@ -833,7 +830,14 @@ struct btrfs_root { struct kobject root_kobj; struct completion kobj_unregister; struct mutex objectid_mutex; + struct mutex log_mutex; + wait_queue_head_t log_writer_wait; + wait_queue_head_t log_commit_wait[2]; + atomic_t log_writers; + atomic_t log_commit[2]; + unsigned long log_transid; + unsigned long log_batch; u64 objectid; u64 last_trans; @@ -1721,7 +1725,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 empty_size); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr, u32 blocksize, + int level); int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 parent, u64 min_bytes, @@ -1791,6 +1796,16 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root); int btrfs_cleanup_reloc_trees(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); +int btrfs_check_metadata_free_space(struct btrfs_root *root); +int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +void btrfs_free_reserved_data_space(struct btrfs_root *root, + struct inode *inode, u64 bytes); +void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); /* ctree.c */ int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, @@ -1840,7 +1855,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); -void btrfs_init_path(struct btrfs_path *p); +void btrfs_set_path_blocking(struct btrfs_path *p); +void btrfs_unlock_up_safe(struct btrfs_path *p, int level); + int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); int btrfs_del_leaf(struct btrfs_trans_handle *trans, @@ -2034,8 +2051,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, - int for_del); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 81a313874ae..adda739a021 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -16,7 +16,6 @@ * Boston, MA 021110-1307, USA. */ -#include <linux/version.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/scatterlist.h> @@ -76,6 +75,40 @@ struct async_submit_bio { struct btrfs_work work; }; +/* These are used to set the lockdep class on the extent buffer locks. + * The class is set by the readpage_end_io_hook after the buffer has + * passed csum validation but before the pages are unlocked. + * + * The lockdep class is also set by btrfs_init_new_buffer on freshly + * allocated blocks. + * + * The class is based on the level in the tree block, which allows lockdep + * to know that lower nodes nest inside the locks of higher nodes. + * + * We also add a check to make sure the highest level of the tree is + * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this + * code needs update as well. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# if BTRFS_MAX_LEVEL != 8 +# error +# endif +static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; +static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { + /* leaf */ + "btrfs-extent-00", + "btrfs-extent-01", + "btrfs-extent-02", + "btrfs-extent-03", + "btrfs-extent-04", + "btrfs-extent-05", + "btrfs-extent-06", + "btrfs-extent-07", + /* highest possible level */ + "btrfs-extent-08", +}; +#endif + /* * extents on the btree inode are pretty simple, there's one extent * that covers the entire device @@ -348,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root, return ret; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) +{ + lockdep_set_class_and_name(&eb->lock, + &btrfs_eb_class[level], + btrfs_eb_name[level]); +} +#endif + static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { @@ -393,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, } found_level = btrfs_header_level(eb); + btrfs_set_buffer_lockdep_class(eb, found_level); + ret = csum_tree_block(root, eb, 1); if (ret) ret = -EIO; @@ -800,7 +844,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret == 0) - buf->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); else WARN_ON(1); return buf; @@ -814,6 +858,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { WARN_ON(!btrfs_tree_locked(buf)); + + /* ugh, clear_extent_buffer_dirty can be expensive */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } @@ -850,6 +898,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(&root->list_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); + init_waitqueue_head(&root->log_writer_wait); + init_waitqueue_head(&root->log_commit_wait[0]); + init_waitqueue_head(&root->log_commit_wait[1]); + atomic_set(&root->log_commit[0], 0); + atomic_set(&root->log_commit[1], 0); + atomic_set(&root->log_writers, 0); + root->log_batch = 0; + root->log_transid = 0; extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -934,15 +990,16 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, return 0; } -int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { struct btrfs_root *root; struct btrfs_root *tree_root = fs_info->tree_root; + struct extent_buffer *leaf; root = kzalloc(sizeof(*root), GFP_NOFS); if (!root) - return -ENOMEM; + return ERR_PTR(-ENOMEM); __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, @@ -951,12 +1008,23 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + /* + * log trees do not get reference counted because they go away + * before a real commit is actually done. They do store pointers + * to file data extents, and those reference counts still get + * updated (along with back refs to the log tree). + */ root->ref_cows = 0; - root->node = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, BTRFS_TREE_LOG_OBJECTID, - trans->transid, 0, 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, BTRFS_TREE_LOG_OBJECTID, + trans->transid, 0, 0, 0); + if (IS_ERR(leaf)) { + kfree(root); + return ERR_CAST(leaf); + } + root->node = leaf; btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); btrfs_set_header_bytenr(root->node, root->node->start); @@ -968,7 +1036,48 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); - fs_info->log_root_tree = root; + return root; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *log_root; + + log_root = alloc_log_tree(trans, fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + WARN_ON(fs_info->log_root_tree); + fs_info->log_root_tree = log_root; + return 0; +} + +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *log_root; + struct btrfs_inode_item *inode_item; + + log_root = alloc_log_tree(trans, root->fs_info); + if (IS_ERR(log_root)) + return PTR_ERR(log_root); + + log_root->last_trans = trans->transid; + log_root->root_key.offset = root->root_key.objectid; + + inode_item = &log_root->root_item.inode; + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start); + btrfs_set_root_generation(&log_root->root_item, trans->transid); + + WARN_ON(root->log_root); + root->log_root = log_root; + root->log_transid = 0; return 0; } @@ -1136,7 +1245,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) { struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; int ret = 0; - struct list_head *cur; struct btrfs_device *device; struct backing_dev_info *bdi; #if 0 @@ -1144,8 +1252,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) btrfs_congested_async(info, 0)) return 1; #endif - list_for_each(cur, &info->fs_devices->devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; bdi = blk_get_backing_dev_info(device->bdev); @@ -1163,13 +1270,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) */ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { - struct list_head *cur; struct btrfs_device *device; struct btrfs_fs_info *info; info = (struct btrfs_fs_info *)bdi->unplug_io_data; - list_for_each(cur, &info->fs_devices->devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; @@ -1447,7 +1552,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); - spin_lock_init(&fs_info->hash_lock); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); @@ -1535,10 +1639,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->async_submit_wait); - init_waitqueue_head(&fs_info->tree_log_wait); - atomic_set(&fs_info->tree_log_commit, 0); - atomic_set(&fs_info->tree_log_writers, 0); - fs_info->tree_log_transid = 0; __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -1627,6 +1727,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, * low idle thresh */ fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_meta_workers.idle_thresh = 4; + fs_info->endio_write_workers.idle_thresh = 64; fs_info->endio_meta_write_workers.idle_thresh = 64; @@ -1720,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); dev_root->track_dirty = 1; - if (ret) goto fail_extent_root; @@ -1740,13 +1841,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); - if (!fs_info->cleaner_kthread) + if (IS_ERR(fs_info->cleaner_kthread)) goto fail_csum_root; fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, "btrfs-transaction"); - if (!fs_info->transaction_kthread) + if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; if (btrfs_super_log_root(disk_super) != 0) { @@ -1828,13 +1929,14 @@ fail_sb_buffer: fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); -fail: + btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); + bdi_destroy(&fs_info->bdi); +fail: kfree(extent_root); kfree(tree_root); - bdi_destroy(&fs_info->bdi); kfree(fs_info); kfree(chunk_root); kfree(dev_root); @@ -1995,7 +2097,6 @@ static int write_dev_supers(struct btrfs_device *device, int write_all_supers(struct btrfs_root *root, int max_mirrors) { - struct list_head *cur; struct list_head *head = &root->fs_info->fs_devices->devices; struct btrfs_device *dev; struct btrfs_super_block *sb; @@ -2011,8 +2112,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) sb = &root->fs_info->super_for_commit; dev_item = &sb->dev_item; - list_for_each(cur, head) { - dev = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; continue; @@ -2045,8 +2145,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) } total_errors = 0; - list_for_each(cur, head) { - dev = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) continue; if (!dev->in_fs_metadata || !dev->writeable) @@ -2260,6 +2359,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; + btrfs_set_lock_blocking(buf); + WARN_ON(!btrfs_tree_locked(buf)); if (transid != root->fs_info->generation) { printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " @@ -2302,14 +2403,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) int ret; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret == 0) - buf->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); return ret; } int btree_lock_page_hook(struct page *page) { struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_buffer *eb; unsigned long len; @@ -2324,9 +2424,7 @@ int btree_lock_page_hook(struct page *page) goto out; btrfs_tree_lock(eb); - spin_lock(&root->fs_info->hash_lock); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - spin_unlock(&root->fs_info->hash_lock); btrfs_tree_unlock(eb); free_extent_buffer(eb); out: diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c0ff404c31b..95029db227b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -98,5 +98,17 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btree_lock_page_hook(struct page *page); + + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +#else +static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, + int level) +{ +} +#endif #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 293da650873..6b5966aacf4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -19,7 +19,7 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> -#include <linux/version.h> +#include <linux/sort.h> #include "compat.h" #include "hash.h" #include "crc32c.h" @@ -30,7 +30,6 @@ #include "volumes.h" #include "locking.h" #include "ref-cache.h" -#include "compat.h" #define PENDING_EXTENT_INSERT 0 #define PENDING_EXTENT_DELETE 1 @@ -61,6 +60,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int alloc, int mark_free); +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force); + static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) { return (cache->flags & bits) == bits; @@ -326,10 +329,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, u64 flags) { struct list_head *head = &info->space_info; - struct list_head *cur; struct btrfs_space_info *found; - list_for_each(cur, head) { - found = list_entry(cur, struct btrfs_space_info, list); + list_for_each_entry(found, head, list) { if (found->flags == flags) return found; } @@ -1326,8 +1327,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, int btrfs_extent_post_op(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - finish_current_insert(trans, root->fs_info->extent_root, 1); - del_pending_extents(trans, root->fs_info->extent_root, 1); + u64 start; + u64 end; + int ret; + + while(1) { + finish_current_insert(trans, root->fs_info->extent_root, 1); + del_pending_extents(trans, root->fs_info->extent_root, 1); + + /* is there more work to do? */ + ret = find_first_extent_bit(&root->fs_info->pending_del, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + ret = find_first_extent_bit(&root->fs_info->extent_ins, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + break; + } return 0; } @@ -1525,15 +1543,55 @@ out: return ret; } -int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *orig_buf, struct extent_buffer *buf, - u32 *nr_extents) +/* when a block goes through cow, we update the reference counts of + * everything that block points to. The internal pointers of the block + * can be in just about any order, and it is likely to have clusters of + * things that are close together and clusters of things that are not. + * + * To help reduce the seeks that come with updating all of these reference + * counts, sort them by byte number before actual updates are done. + * + * struct refsort is used to match byte number to slot in the btree block. + * we sort based on the byte number and then use the slot to actually + * find the item. + * + * struct refsort is smaller than strcut btrfs_item and smaller than + * struct btrfs_key_ptr. Since we're currently limited to the page size + * for a btree block, there's no way for a kmalloc of refsorts for a + * single node to be bigger than a page. + */ +struct refsort { + u64 bytenr; + u32 slot; +}; + +/* + * for passing into sort() + */ +static int refsort_cmp(const void *a_void, const void *b_void) +{ + const struct refsort *a = a_void; + const struct refsort *b = b_void; + + if (a->bytenr < b->bytenr) + return -1; + if (a->bytenr > b->bytenr) + return 1; + return 0; +} + + +noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *orig_buf, + struct extent_buffer *buf, u32 *nr_extents) { u64 bytenr; u64 ref_root; u64 orig_root; u64 ref_generation; u64 orig_generation; + struct refsort *sorted; u32 nritems; u32 nr_file_extents = 0; struct btrfs_key key; @@ -1542,6 +1600,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int level; int ret = 0; int faili = 0; + int refi = 0; + int slot; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, u64, u64, u64, u64, u64, u64, u64, u64); @@ -1553,6 +1613,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); + sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); + BUG_ON(!sorted); + if (root->ref_cows) { process_func = __btrfs_inc_extent_ref; } else { @@ -1565,6 +1628,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, process_func = __btrfs_update_extent_ref; } + /* + * we make two passes through the items. In the first pass we + * only record the byte number and slot. Then we sort based on + * byte number and do the actual work based on the sorted results + */ for (i = 0; i < nritems; i++) { cond_resched(); if (level == 0) { @@ -1581,6 +1649,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, continue; nr_file_extents++; + sorted[refi].bytenr = bytenr; + sorted[refi].slot = i; + refi++; + } else { + bytenr = btrfs_node_blockptr(buf, i); + sorted[refi].bytenr = bytenr; + sorted[refi].slot = i; + refi++; + } + } + /* + * if refi == 0, we didn't actually put anything into the sorted + * array and we're done + */ + if (refi == 0) + goto out; + + sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + + for (i = 0; i < refi; i++) { + cond_resched(); + slot = sorted[i].slot; + bytenr = sorted[i].bytenr; + + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, slot); ret = process_func(trans, root, bytenr, orig_buf->start, buf->start, @@ -1589,25 +1683,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, key.objectid); if (ret) { - faili = i; + faili = slot; WARN_ON(1); goto fail; } } else { - bytenr = btrfs_node_blockptr(buf, i); ret = process_func(trans, root, bytenr, orig_buf->start, buf->start, orig_root, ref_root, orig_generation, ref_generation, level - 1); if (ret) { - faili = i; + faili = slot; WARN_ON(1); goto fail; } } } out: + kfree(sorted); if (nr_extents) { if (level == 0) *nr_extents = nr_file_extents; @@ -1616,6 +1710,7 @@ out: } return 0; fail: + kfree(sorted); WARN_ON(1); return ret; } @@ -1818,6 +1913,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_pinned = 0; found->bytes_reserved = 0; found->bytes_readonly = 0; + found->bytes_delalloc = 0; found->full = 0; found->force_alloc = 0; *space_info = found; @@ -1881,6 +1977,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return flags; } +static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) +{ + struct btrfs_fs_info *info = root->fs_info; + u64 alloc_profile; + + if (data) { + alloc_profile = info->avail_data_alloc_bits & + info->data_alloc_profile; + data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; + } else if (root == root->fs_info->chunk_root) { + alloc_profile = info->avail_system_alloc_bits & + info->system_alloc_profile; + data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; + } else { + alloc_profile = info->avail_metadata_alloc_bits & + info->metadata_alloc_profile; + data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; + } + + return btrfs_reduce_alloc_profile(root, data); +} + +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) +{ + u64 alloc_target; + + alloc_target = btrfs_get_alloc_profile(root, 1); + BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, + alloc_target); +} + +/* + * for now this just makes sure we have at least 5% of our metadata space free + * for use. + */ +int btrfs_check_metadata_free_space(struct btrfs_root *root) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 alloc_target, thresh; + int committed = 0, ret; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + +again: + spin_lock(&meta_sinfo->lock); + if (!meta_sinfo->full) + thresh = meta_sinfo->total_bytes * 80; + else + thresh = meta_sinfo->total_bytes * 95; + + do_div(thresh, 100); + + if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { + struct btrfs_trans_handle *trans; + if (!meta_sinfo->full) { + meta_sinfo->force_alloc = 1; + spin_unlock(&meta_sinfo->lock); + + trans = btrfs_start_transaction(root, 1); + if (!trans) + return -ENOMEM; + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, alloc_target, 0); + btrfs_end_transaction(trans, root); + goto again; + } + spin_unlock(&meta_sinfo->lock); + + if (!committed) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) + return -ENOMEM; + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } + return -ENOSPC; + } + spin_unlock(&meta_sinfo->lock); + + return 0; +} + +/* + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. + */ +int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + int ret = 0, committed = 0; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; +again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + if (data_sinfo->total_bytes - data_sinfo->bytes_used - + data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - + data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - + data_sinfo->bytes_may_use < bytes) { + struct btrfs_trans_handle *trans; + + /* + * if we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full) { + u64 alloc_target; + + data_sinfo->force_alloc = 1; + spin_unlock(&data_sinfo->lock); + + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_start_transaction(root, 1); + if (!trans) + return -ENOMEM; + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + bytes + 2 * 1024 * 1024, + alloc_target, 0); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + goto again; + } + spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ + if (!committed) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) + return -ENOMEM; + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } + + printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" + ", %llu bytes_used, %llu bytes_reserved, " + "%llu bytes_pinned, %llu bytes_readonly, %llu may use" + "%llu total\n", bytes, data_sinfo->bytes_delalloc, + data_sinfo->bytes_used, data_sinfo->bytes_reserved, + data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, + data_sinfo->bytes_may_use, data_sinfo->total_bytes); + return -ENOSPC; + } + data_sinfo->bytes_may_use += bytes; + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); + + return btrfs_check_metadata_free_space(root); +} + +/* + * if there was an error for whatever reason after calling + * btrfs_check_data_free_space, call this so we can cleanup the counters. + */ +void btrfs_free_reserved_data_space(struct btrfs_root *root, + struct inode *inode, u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + spin_unlock(&data_sinfo->lock); +} + +/* called when we are adding a delalloc extent to the inode's io_tree */ +void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + + /* get the space info for where this inode will be storing its data */ + data_sinfo = BTRFS_I(inode)->space_info; + + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_delalloc += bytes; + + /* + * we are adding a delalloc extent without calling + * btrfs_check_data_free_space first. This happens on a weird + * writepage condition, but shouldn't hurt our accounting + */ + if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { + data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; + BTRFS_I(inode)->reserved_bytes = 0; + } else { + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + } + + spin_unlock(&data_sinfo->lock); +} + +/* called when we are clearing an delalloc extent from the inode's io_tree */ +void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *info; + + info = BTRFS_I(inode)->space_info; + + spin_lock(&info->lock); + info->bytes_delalloc -= bytes; + spin_unlock(&info->lock); +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) @@ -2137,13 +2460,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, u64 end; u64 priv; u64 search = 0; - u64 skipped = 0; struct btrfs_fs_info *info = extent_root->fs_info; struct btrfs_path *path; struct pending_extent_op *extent_op, *tmp; struct list_head insert_list, update_list; int ret; - int num_inserts = 0, max_inserts; + int num_inserts = 0, max_inserts, restart = 0; path = btrfs_alloc_path(); INIT_LIST_HEAD(&insert_list); @@ -2159,18 +2481,19 @@ again: ret = find_first_extent_bit(&info->extent_ins, search, &start, &end, EXTENT_WRITEBACK); if (ret) { - if (skipped && all && !num_inserts) { - skipped = 0; + if (restart && !num_inserts && + list_empty(&update_list)) { + restart = 0; search = 0; continue; } - mutex_unlock(&info->extent_ins_mutex); break; } ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); if (!ret) { - skipped = 1; + if (all) + restart = 1; search = end + 1; if (need_resched()) { mutex_unlock(&info->extent_ins_mutex); @@ -2189,7 +2512,7 @@ again: list_add_tail(&extent_op->list, &insert_list); search = end + 1; if (num_inserts == max_inserts) { - mutex_unlock(&info->extent_ins_mutex); + restart = 1; break; } } else if (extent_op->type == PENDING_BACKREF_UPDATE) { @@ -2205,7 +2528,6 @@ again: * somebody marked this thing for deletion then just unlock it and be * done, the free_extents will handle it */ - mutex_lock(&info->extent_ins_mutex); list_for_each_entry_safe(extent_op, tmp, &update_list, list) { clear_extent_bits(&info->extent_ins, extent_op->bytenr, extent_op->bytenr + extent_op->num_bytes - 1, @@ -2227,6 +2549,10 @@ again: if (!list_empty(&update_list)) { ret = update_backrefs(trans, extent_root, path, &update_list); BUG_ON(ret); + + /* we may have COW'ed new blocks, so lets start over */ + if (all) + restart = 1; } /* @@ -2234,9 +2560,9 @@ again: * need to make sure everything is cleaned then reset everything and * go back to the beginning */ - if (!num_inserts && all && skipped) { + if (!num_inserts && restart) { search = 0; - skipped = 0; + restart = 0; INIT_LIST_HEAD(&update_list); INIT_LIST_HEAD(&insert_list); goto again; @@ -2293,27 +2619,19 @@ again: BUG_ON(ret); /* - * if we broke out of the loop in order to insert stuff because we hit - * the maximum number of inserts at a time we can handle, then loop - * back and pick up where we left off - */ - if (num_inserts == max_inserts) { - INIT_LIST_HEAD(&insert_list); - INIT_LIST_HEAD(&update_list); - num_inserts = 0; - goto again; - } - - /* - * again, if we need to make absolutely sure there are no more pending - * extent operations left and we know that we skipped some, go back to - * the beginning and do it all again + * if restart is set for whatever reason we need to go back and start + * searching through the pending list again. + * + * We just inserted some extents, which could have resulted in new + * blocks being allocated, which would result in new blocks needing + * updates, so if all is set we _must_ restart to get the updated + * blocks. */ - if (all && skipped) { + if (restart || all) { INIT_LIST_HEAD(&insert_list); INIT_LIST_HEAD(&update_list); search = 0; - skipped = 0; + restart = 0; num_inserts = 0; goto again; } @@ -2547,6 +2865,7 @@ again: if (ret) { if (all && skipped && !nr) { search = 0; + skipped = 0; continue; } mutex_unlock(&info->extent_ins_mutex); @@ -2633,6 +2952,8 @@ again: goto again; } + if (!err) + finish_current_insert(trans, extent_root, 0); return err; } @@ -2700,13 +3021,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, /* if metadata always pin */ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_block_group_cache *cache; - - /* btrfs_free_reserved_extent */ - cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); - btrfs_add_free_space(cache, bytenr, num_bytes); - put_block_group(cache); + mutex_lock(&root->fs_info->pinned_mutex); + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + mutex_unlock(&root->fs_info->pinned_mutex); update_reserved_extents(root, bytenr, num_bytes, 0); return 0; } @@ -2787,7 +3104,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (data & BTRFS_BLOCK_GROUP_METADATA) { last_ptr = &root->fs_info->last_alloc; - empty_cluster = 64 * 1024; + if (!btrfs_test_opt(root, SSD)) + empty_cluster = 64 * 1024; } if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) @@ -3014,16 +3332,18 @@ loop_check: static void dump_space_info(struct btrfs_space_info *info, u64 bytes) { struct btrfs_block_group_cache *cache; - struct list_head *l; printk(KERN_INFO "space_info has %llu free, is %sfull\n", (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved), (info->full) ? "" : "not "); + printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," + " may_use=%llu, used=%llu\n", info->total_bytes, + info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, + info->bytes_used); down_read(&info->groups_sem); - list_for_each(l, &info->block_groups) { - cache = list_entry(l, struct btrfs_block_group_cache, list); + list_for_each_entry(cache, &info->block_groups, list) { spin_lock(&cache->lock); printk(KERN_INFO "block group %llu has %llu bytes, %llu used " "%llu pinned %llu reserved\n", @@ -3047,24 +3367,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, { int ret; u64 search_start = 0; - u64 alloc_profile; struct btrfs_fs_info *info = root->fs_info; - if (data) { - alloc_profile = info->avail_data_alloc_bits & - info->data_alloc_profile; - data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; - } else if (root == root->fs_info->chunk_root) { - alloc_profile = info->avail_system_alloc_bits & - info->system_alloc_profile; - data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; - } else { - alloc_profile = info->avail_metadata_alloc_bits & - info->metadata_alloc_profile; - data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; - } + data = btrfs_get_alloc_profile(root, data); again: - data = btrfs_reduce_alloc_profile(root, data); /* * the only place that sets empty_size is btrfs_realloc_node, which * is not called recursively on allocations @@ -3332,7 +3638,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr, u32 blocksize, + int level) { struct extent_buffer *buf; @@ -3340,9 +3647,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); + btrfs_set_buffer_lockdep_class(buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); + + btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { set_extent_dirty(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); @@ -3351,6 +3662,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, buf->start + buf->len - 1, GFP_NOFS); } trans->blocks_used++; + /* this returns a buffer locked for blocking */ return buf; } @@ -3379,7 +3691,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, + blocksize, level); return buf; } @@ -3388,36 +3701,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, { u64 leaf_owner; u64 leaf_generation; + struct refsort *sorted; struct btrfs_key key; struct btrfs_file_extent_item *fi; int i; int nritems; int ret; + int refi = 0; + int slot; BUG_ON(!btrfs_is_leaf(leaf)); nritems = btrfs_header_nritems(leaf); leaf_owner = btrfs_header_owner(leaf); leaf_generation = btrfs_header_generation(leaf); + sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); + /* we do this loop twice. The first time we build a list + * of the extents we have a reference on, then we sort the list + * by bytenr. The second time around we actually do the + * extent freeing. + */ for (i = 0; i < nritems; i++) { u64 disk_bytenr; cond_resched(); btrfs_item_key_to_cpu(leaf, &key, i); + + /* only extents have references, skip everything else */ if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + + /* inline extents live in the btree, they don't have refs */ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; - /* - * FIXME make sure to insert a trans record that - * repeats the snapshot del on crash - */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + + /* holes don't have refs */ if (disk_bytenr == 0) continue; + sorted[refi].bytenr = disk_bytenr; + sorted[refi].slot = i; + refi++; + } + + if (refi == 0) + goto out; + + sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + + for (i = 0; i < refi; i++) { + u64 disk_bytenr; + + disk_bytenr = sorted[i].bytenr; + slot = sorted[i].slot; + + cond_resched(); + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + ret = __btrfs_free_extent(trans, root, disk_bytenr, btrfs_file_extent_disk_num_bytes(leaf, fi), leaf->start, leaf_owner, leaf_generation, @@ -3428,6 +3778,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, wake_up(&root->fs_info->transaction_throttle); cond_resched(); } +out: + kfree(sorted); return 0; } @@ -3437,9 +3789,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, { int i; int ret; - struct btrfs_extent_info *info = ref->extents; + struct btrfs_extent_info *info; + struct refsort *sorted; + + if (ref->nritems == 0) + return 0; + sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS); for (i = 0; i < ref->nritems; i++) { + sorted[i].bytenr = ref->extents[i].bytenr; + sorted[i].slot = i; + } + sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL); + + /* + * the items in the ref were sorted when the ref was inserted + * into the ref cache, so this is already in order + */ + for (i = 0; i < ref->nritems; i++) { + info = ref->extents + sorted[i].slot; ret = __btrfs_free_extent(trans, root, info->bytenr, info->num_bytes, ref->bytenr, ref->owner, ref->generation, @@ -3453,6 +3821,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, info++; } + kfree(sorted); return 0; } @@ -3497,6 +3866,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, } /* + * this is used while deleting old snapshots, and it drops the refs + * on a whole subtree starting from a level 1 node. + * + * The idea is to sort all the leaf pointers, and then drop the + * ref on all the leaves in order. Most of the time the leaves + * will have ref cache entries, so no leaf IOs will be required to + * find the extents they have references on. + * + * For each leaf, any references it has are also dropped in order + * + * This ends up dropping the references in something close to optimal + * order for reading and modifying the extent allocation tree. + */ +static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + u64 bytenr; + u64 root_owner; + u64 root_gen; + struct extent_buffer *eb = path->nodes[1]; + struct extent_buffer *leaf; + struct btrfs_leaf_ref *ref; + struct refsort *sorted = NULL; + int nritems = btrfs_header_nritems(eb); + int ret; + int i; + int refi = 0; + int slot = path->slots[1]; + u32 blocksize = btrfs_level_size(root, 0); + u32 refs; + + if (nritems == 0) + goto out; + + root_owner = btrfs_header_owner(eb); + root_gen = btrfs_header_generation(eb); + sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); + + /* + * step one, sort all the leaf pointers so we don't scribble + * randomly into the extent allocation tree + */ + for (i = slot; i < nritems; i++) { + sorted[refi].bytenr = btrfs_node_blockptr(eb, i); + sorted[refi].slot = i; + refi++; + } + + /* + * nritems won't be zero, but if we're picking up drop_snapshot + * after a crash, slot might be > 0, so double check things + * just in case. + */ + if (refi == 0) + goto out; + + sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + + /* + * the first loop frees everything the leaves point to + */ + for (i = 0; i < refi; i++) { + u64 ptr_gen; + + bytenr = sorted[i].bytenr; + + /* + * check the reference count on this leaf. If it is > 1 + * we just decrement it below and don't update any + * of the refs the leaf points to. + */ + ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + BUG_ON(ret); + if (refs != 1) + continue; + + ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); + + /* + * the leaf only had one reference, which means the + * only thing pointing to this leaf is the snapshot + * we're deleting. It isn't possible for the reference + * count to increase again later + * + * The reference cache is checked for the leaf, + * and if found we'll be able to drop any refs held by + * the leaf without needing to read it in. + */ + ref = btrfs_lookup_leaf_ref(root, bytenr); + if (ref && ref->generation != ptr_gen) { + btrfs_free_leaf_ref(root, ref); + ref = NULL; + } + if (ref) { + ret = cache_drop_leaf_ref(trans, root, ref); + BUG_ON(ret); + btrfs_remove_leaf_ref(root, ref); + btrfs_free_leaf_ref(root, ref); + } else { + /* + * the leaf wasn't in the reference cache, so + * we have to read it. + */ + leaf = read_tree_block(root, bytenr, blocksize, + ptr_gen); + ret = btrfs_drop_leaf_ref(trans, root, leaf); + BUG_ON(ret); + free_extent_buffer(leaf); + } + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + } + + /* + * run through the loop again to free the refs on the leaves. + * This is faster than doing it in the loop above because + * the leaves are likely to be clustered together. We end up + * working in nice chunks on the extent allocation tree. + */ + for (i = 0; i < refi; i++) { + bytenr = sorted[i].bytenr; + ret = __btrfs_free_extent(trans, root, bytenr, + blocksize, eb->start, + root_owner, root_gen, 0, 1); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + } +out: + kfree(sorted); + + /* + * update the path to show we've processed the entire level 1 + * node. This will get saved into the root's drop_snapshot_progress + * field so these drops are not repeated again if this transaction + * commits. + */ + path->slots[1] = nritems; + return 0; +} + +/* * helper function for drop_snapshot, this walks down the tree dropping ref * counts as it goes. */ @@ -3511,7 +4026,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct extent_buffer *next; struct extent_buffer *cur; struct extent_buffer *parent; - struct btrfs_leaf_ref *ref; u32 blocksize; int ret; u32 refs; @@ -3538,17 +4052,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, if (path->slots[*level] >= btrfs_header_nritems(cur)) break; + + /* the new code goes down to level 1 and does all the + * leaves pointed to that node in bulk. So, this check + * for level 0 will always be false. + * + * But, the disk format allows the drop_snapshot_progress + * field in the root to leave things in a state where + * a leaf will need cleaning up here. If someone crashes + * with the old code and then boots with the new code, + * we might find a leaf here. + */ if (*level == 0) { ret = btrfs_drop_leaf_ref(trans, root, cur); BUG_ON(ret); break; } + + /* + * once we get to level one, process the whole node + * at once, including everything below it. + */ + if (*level == 1) { + ret = drop_level_one_refs(trans, root, path); + BUG_ON(ret); + break; + } + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); blocksize = btrfs_level_size(root, *level - 1); ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); BUG_ON(ret); + + /* + * if there is more than one reference, we don't need + * to read that node to drop any references it has. We + * just drop the ref we hold on that node and move on to the + * next slot in this level. + */ if (refs != 1) { parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); @@ -3567,46 +4110,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, continue; } + /* - * at this point, we have a single ref, and since the - * only place referencing this extent is a dead root - * the reference count should never go higher. - * So, we don't need to check it again + * we need to keep freeing things in the next level down. + * read the block and loop around to process it */ - if (*level == 1) { - ref = btrfs_lookup_leaf_ref(root, bytenr); - if (ref && ref->generation != ptr_gen) { - btrfs_free_leaf_ref(root, ref); - ref = NULL; - } - if (ref) { - ret = cache_drop_leaf_ref(trans, root, ref); - BUG_ON(ret); - btrfs_remove_leaf_ref(root, ref); - btrfs_free_leaf_ref(root, ref); - *level = 0; - break; - } - } - next = btrfs_find_tree_block(root, bytenr, blocksize); - if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) { - free_extent_buffer(next); - - next = read_tree_block(root, bytenr, blocksize, - ptr_gen); - cond_resched(); -#if 0 - /* - * this is a debugging check and can go away - * the ref should never go all the way down to 1 - * at this point - */ - ret = lookup_extent_ref(NULL, root, bytenr, blocksize, - &refs); - BUG_ON(ret); - WARN_ON(refs != 1); -#endif - } + next = read_tree_block(root, bytenr, blocksize, ptr_gen); WARN_ON(*level <= 0); if (path->nodes[*level-1]) free_extent_buffer(path->nodes[*level-1]); @@ -3631,11 +4140,16 @@ out: root_owner = btrfs_header_owner(parent); root_gen = btrfs_header_generation(parent); + /* + * cleanup and free the reference on the last node + * we processed + */ ret = __btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, root_owner, root_gen, *level, 1); free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; + *level += 1; BUG_ON(ret); @@ -3687,6 +4201,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, next = read_tree_block(root, bytenr, blocksize, ptr_gen); btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, &refs); @@ -3754,6 +4269,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { struct extent_buffer *node; struct btrfs_disk_key disk_key; + + /* + * there is more work to do in this level. + * Update the drop_progress marker to reflect + * the work we've done so far, and then bump + * the slot number + */ node = path->nodes[i]; path->slots[i]++; *level = i; @@ -3765,6 +4287,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, return 0; } else { struct extent_buffer *parent; + + /* + * this whole node is done, free our reference + * on it and go up one level + */ if (path->nodes[*level] == root->node) parent = path->nodes[*level]; else @@ -4444,7 +4971,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans, u64 lock_end = 0; u64 num_bytes; u64 ext_offset; - u64 first_pos; + u64 search_end = (u64)-1; u32 nritems; int nr_scaned = 0; int extent_locked = 0; @@ -4452,7 +4979,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans, int ret; memcpy(&key, leaf_key, sizeof(key)); - first_pos = INT_LIMIT(loff_t) - extent_key->offset; if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { if (key.objectid < ref_path->owner_objectid || (key.objectid == ref_path->owner_objectid && @@ -4501,7 +5027,7 @@ next: if ((key.objectid > ref_path->owner_objectid) || (key.objectid == ref_path->owner_objectid && key.type > BTRFS_EXTENT_DATA_KEY) || - (key.offset >= first_pos + extent_key->offset)) + key.offset >= search_end) break; } @@ -4534,8 +5060,10 @@ next: num_bytes = btrfs_file_extent_num_bytes(leaf, fi); ext_offset = btrfs_file_extent_offset(leaf, fi); - if (first_pos > key.offset - ext_offset) - first_pos = key.offset - ext_offset; + if (search_end == (u64)-1) { + search_end = key.offset - ext_offset + + btrfs_file_extent_ram_bytes(leaf, fi); + } if (!extent_locked) { lock_start = key.offset; @@ -4724,7 +5252,7 @@ next: } skip: if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && - key.offset >= first_pos + extent_key->offset) + key.offset >= search_end) break; cond_resched(); @@ -4778,6 +5306,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, ref->bytenr = buf->start; ref->owner = btrfs_header_owner(buf); ref->generation = btrfs_header_generation(buf); + ret = btrfs_add_leaf_ref(root, ref, 0); WARN_ON(ret); btrfs_free_leaf_ref(root, ref); @@ -5351,7 +5880,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, prev_block = block_start; } + mutex_lock(&extent_root->fs_info->trans_mutex); btrfs_record_root_in_trans(found_root); + mutex_unlock(&extent_root->fs_info->trans_mutex); if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { /* * try to update data extent references while @@ -5957,9 +6488,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - btrfs_remove_free_space_cache(block_group); + spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_remove_free_space_cache(block_group); down_write(&block_group->space_info->groups_sem); list_del(&block_group->list); up_write(&block_group->space_info->groups_sem); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e086d407f1f..ebe6b29e606 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -9,7 +9,6 @@ #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> -#include <linux/version.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include "extent_io.h" @@ -31,7 +30,7 @@ static LIST_HEAD(buffers); static LIST_HEAD(states); #define LEAK_DEBUG 0 -#ifdef LEAK_DEBUG +#if LEAK_DEBUG static DEFINE_SPINLOCK(leak_lock); #endif @@ -120,7 +119,7 @@ void extent_io_tree_init(struct extent_io_tree *tree, static struct extent_state *alloc_extent_state(gfp_t mask) { struct extent_state *state; -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; #endif @@ -130,7 +129,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->state = 0; state->private = 0; state->tree = NULL; -#ifdef LEAK_DEBUG +#if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); list_add(&state->leak_list, &states); spin_unlock_irqrestore(&leak_lock, flags); @@ -145,11 +144,11 @@ static void free_extent_state(struct extent_state *state) if (!state) return; if (atomic_dec_and_test(&state->refs)) { -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; #endif WARN_ON(state->tree); -#ifdef LEAK_DEBUG +#if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); list_del(&state->leak_list); spin_unlock_irqrestore(&leak_lock, flags); @@ -416,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); free_extent_state(prealloc); return -EEXIST; } @@ -2378,11 +2375,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, int scanned = 0; int range_whole = 0; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - pagevec_init(&pvec, 0); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ @@ -2855,6 +2847,98 @@ out: return sector; } +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent) +{ + int ret; + u64 off = start; + u64 max = start + len; + u32 flags = 0; + u64 disko = 0; + struct extent_map *em = NULL; + int end = 0; + u64 em_start = 0, em_len = 0; + unsigned long emflags; + ret = 0; + + if (len == 0) + return -EINVAL; + + lock_extent(&BTRFS_I(inode)->io_tree, start, start + len, + GFP_NOFS); + em = get_extent(inode, NULL, 0, off, max - off, 0); + if (!em) + goto out; + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + while (!end) { + off = em->start + em->len; + if (off >= max) + end = 1; + + em_start = em->start; + em_len = em->len; + + disko = 0; + flags = 0; + + switch (em->block_start) { + case EXTENT_MAP_LAST_BYTE: + end = 1; + flags |= FIEMAP_EXTENT_LAST; + break; + case EXTENT_MAP_HOLE: + flags |= FIEMAP_EXTENT_UNWRITTEN; + break; + case EXTENT_MAP_INLINE: + flags |= (FIEMAP_EXTENT_DATA_INLINE | + FIEMAP_EXTENT_NOT_ALIGNED); + break; + case EXTENT_MAP_DELALLOC: + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + break; + default: + disko = em->block_start; + break; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + flags |= FIEMAP_EXTENT_ENCODED; + + emflags = em->flags; + free_extent_map(em); + em = NULL; + + if (!end) { + em = get_extent(inode, NULL, 0, off, max - off, 0); + if (!em) + goto out; + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + emflags = em->flags; + } + if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; + } +out_free: + free_extent_map(em); +out: + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len, + GFP_NOFS); + return ret; +} + static inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { @@ -2892,15 +2976,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, gfp_t mask) { struct extent_buffer *eb = NULL; -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; #endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); eb->start = start; eb->len = len; - mutex_init(&eb->mutex); -#ifdef LEAK_DEBUG + spin_lock_init(&eb->lock); + init_waitqueue_head(&eb->lock_wq); + +#if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); list_add(&eb->leak_list, &buffers); spin_unlock_irqrestore(&leak_lock, flags); @@ -2912,7 +2998,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, static void __free_extent_buffer(struct extent_buffer *eb) { -#ifdef LEAK_DEBUG +#if LEAK_DEBUG unsigned long flags; spin_lock_irqsave(&leak_lock, flags); list_del(&eb->leak_list); @@ -2980,8 +3066,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, unlock_page(p); } if (uptodate) - eb->flags |= EXTENT_UPTODATE; - eb->flags |= EXTENT_BUFFER_FILLED; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); spin_lock(&tree->buffer_lock); exists = buffer_tree_insert(tree, start, &eb->rb_node); @@ -3135,7 +3220,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, unsigned long num_pages; num_pages = num_extent_pages(eb->start, eb->len); - eb->flags &= ~EXTENT_UPTODATE; + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); @@ -3206,7 +3291,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, struct page *page; int pg_uptodate = 1; - if (eb->flags & EXTENT_UPTODATE) + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 1; ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, @@ -3242,7 +3327,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, struct bio *bio = NULL; unsigned long bio_flags = 0; - if (eb->flags & EXTENT_UPTODATE) + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, @@ -3273,7 +3358,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (all_uptodate) { if (start_i == 0) - eb->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); goto unlock_exit; } @@ -3309,7 +3394,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (!ret) - eb->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: @@ -3406,7 +3491,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start, unmap_extent_buffer(eb, eb->map_token, km); eb->map_token = NULL; save = 1; - WARN_ON(!mutex_is_locked(&eb->mutex)); } err = map_private_extent_buffer(eb, start, min_len, token, map, map_start, map_len, km); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c5b483a7913..1f9df88afbf 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -22,6 +22,10 @@ /* flags for bio submission */ #define EXTENT_BIO_COMPRESSED 1 +/* these are bit numbers for test/set bit */ +#define EXTENT_BUFFER_UPTODATE 0 +#define EXTENT_BUFFER_BLOCKING 1 + /* * page->private values. Every page that is controlled by the extent * map has page->private set to one. @@ -95,11 +99,19 @@ struct extent_buffer { unsigned long map_start; unsigned long map_len; struct page *first_page; + unsigned long bflags; atomic_t refs; - int flags; struct list_head leak_list; struct rb_node rb_node; - struct mutex mutex; + + /* the spinlock is used to protect most operations */ + spinlock_t lock; + + /* + * when we keep the lock held while blocking, waiters go onto + * the wq + */ + wait_queue_head_t lock_wq; }; struct extent_map_tree; @@ -193,6 +205,8 @@ int extent_commit_write(struct extent_io_tree *tree, unsigned from, unsigned to); sector_t extent_bmap(struct address_space *mapping, sector_t iblock, get_extent_t *get_extent); +int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, get_extent_t *get_extent); int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4a83e33ada3..50da69da20c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,7 +3,6 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/spinlock.h> -#include <linux/version.h> #include <linux/hardirq.h> #include "extent_map.h" diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 90268334145..dc78954861b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -29,7 +29,6 @@ #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> -#include <linux/version.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1092,19 +1091,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); - ret = btrfs_check_free_space(root, write_bytes, 0); + ret = btrfs_check_data_free_space(root, inode, write_bytes); if (ret) goto out; ret = prepare_pages(root, file, pages, num_pages, pos, first_index, last_index, write_bytes); - if (ret) + if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); goto out; + } ret = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, buf); if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); btrfs_drop_pages(pages, num_pages); goto out; } @@ -1112,8 +1116,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, ret = dirty_and_release_pages(NULL, root, file, pages, num_pages, pos, write_bytes); btrfs_drop_pages(pages, num_pages); - if (ret) + if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); goto out; + } if (will_write) { btrfs_fdatawrite_range(inode->i_mapping, pos, @@ -1137,6 +1144,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } out: mutex_unlock(&inode->i_mutex); + if (ret) + err = ret; out_nolock: kfree(pages); @@ -1215,15 +1224,15 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) } mutex_unlock(&root->fs_info->trans_mutex); - root->fs_info->tree_log_batch++; + root->log_batch++; filemap_fdatawrite(inode->i_mapping); btrfs_wait_ordered_range(inode, 0, (u64)-1); - root->fs_info->tree_log_batch++; + root->log_batch++; /* * ok we haven't committed the transaction yet, lets do a commit */ - if (file->private_data) + if (file && file->private_data) btrfs_ioctl_trans_end(file); trans = btrfs_start_transaction(root, 1); @@ -1232,7 +1241,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) goto out; } - ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + ret = btrfs_log_dentry_safe(trans, root, dentry); if (ret < 0) goto out; @@ -1246,7 +1255,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); if (ret > 0) { ret = btrfs_commit_transaction(trans, root); @@ -1254,7 +1263,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) btrfs_sync_log(trans, root); ret = btrfs_end_transaction(trans, root); } - mutex_lock(&file->f_dentry->d_inode->i_mutex); + mutex_lock(&dentry->d_inode->i_mutex); out: return ret > 0 ? EIO : ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2aa79873eb4..cc7334d833c 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, search_key.type = 0; search_key.offset = 0; - btrfs_init_path(path); start_found = 0; ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); if (ret < 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8adfe059ab4..7d4f948bc22 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -34,7 +34,6 @@ #include <linux/statfs.h> #include <linux/compat.h> #include <linux/bit_spinlock.h> -#include <linux/version.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/falloc.h> @@ -51,6 +50,7 @@ #include "tree-log.h" #include "ref-cache.h" #include "compression.h" +#include "locking.h" struct btrfs_iget_args { u64 ino; @@ -91,32 +91,14 @@ static noinline int cow_file_range(struct inode *inode, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); -/* - * a very lame attempt at stopping writes when the FS is 85% full. There - * are countless ways this is incorrect, but it is better than nothing. - */ -int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, - int for_del) +static int btrfs_init_inode_security(struct inode *inode, struct inode *dir) { - u64 total; - u64 used; - u64 thresh; - int ret = 0; - - spin_lock(&root->fs_info->delalloc_lock); - total = btrfs_super_total_bytes(&root->fs_info->super_copy); - used = btrfs_super_bytes_used(&root->fs_info->super_copy); - if (for_del) - thresh = total * 90; - else - thresh = total * 85; - - do_div(thresh, 100); + int err; - if (used + root->fs_info->delalloc_bytes + num_required > thresh) - ret = -ENOSPC; - spin_unlock(&root->fs_info->delalloc_lock); - return ret; + err = btrfs_init_acl(inode, dir); + if (!err) + err = btrfs_xattr_security_init(inode, dir); + return err; } /* @@ -350,6 +332,19 @@ again: nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + /* + * we don't want to send crud past the end of i_size through + * compression, that's just a waste of CPU time. So, if the + * end of the file is before the start of our current + * requested range of bytes, we bail out to the uncompressed + * cleanup code that can deal with all of this. + * + * It isn't really the fastest way to fix things, but this is a + * very uncommon corner. + */ + if (actual_end <= start) + goto cleanup_and_bail_uncompressed; + total_compressed = actual_end - start; /* we want to make sure that amount of ram required to uncompress @@ -494,6 +489,7 @@ again: goto again; } } else { +cleanup_and_bail_uncompressed: /* * No compression, but we still need to write the pages in * the file we've been given so far. redirty the locked @@ -1166,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, */ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += end - start + 1; root->fs_info->delalloc_bytes += end - start + 1; @@ -1199,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, (unsigned long long)end - start + 1, (unsigned long long) root->fs_info->delalloc_bytes); + btrfs_delalloc_free_space(root, inode, (u64)-1); root->fs_info->delalloc_bytes = 0; BTRFS_I(inode)->delalloc_bytes = 0; } else { + btrfs_delalloc_free_space(root, inode, + end - start + 1); root->fs_info->delalloc_bytes -= end - start + 1; BTRFS_I(inode)->delalloc_bytes -= end - start + 1; } @@ -1324,12 +1324,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_offset, struct list_head *list) { - struct list_head *cur; struct btrfs_ordered_sum *sum; btrfs_set_trans_block_group(trans, inode); - list_for_each(cur, list) { - sum = list_entry(cur, struct btrfs_ordered_sum, list); + + list_for_each_entry(sum, list, list) { btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); } @@ -2013,6 +2012,7 @@ void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); @@ -2039,6 +2039,7 @@ void btrfs_read_locked_inode(struct inode *inode) inode->i_mapping->backing_dev_info = &root->fs_info->bdi; break; default: + inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); break; } @@ -2108,6 +2109,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, goto failed; } + btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -2219,10 +2221,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) root = BTRFS_I(dir)->root; - ret = btrfs_check_free_space(root, 1, 1); - if (ret) - goto fail; - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); @@ -2235,7 +2233,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -fail: btrfs_btree_balance_dirty(root, nr); return ret; } @@ -2258,10 +2255,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return -ENOTEMPTY; } - ret = btrfs_check_free_space(root, 1, 1); - if (ret) - goto fail; - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); @@ -2278,7 +2271,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) fail_trans: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); -fail: btrfs_btree_balance_dirty(root, nr); if (ret && !err) @@ -2429,6 +2421,8 @@ next_node: ref->generation = leaf_gen; ref->nritems = 0; + btrfs_sort_leaf_ref(ref); + ret = btrfs_add_leaf_ref(root, ref, 0); WARN_ON(ret); btrfs_free_leaf_ref(root, ref); @@ -2476,7 +2470,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - u32 found_type; + u32 found_type = (u8)-1; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; u64 extent_start = 0; @@ -2503,8 +2497,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.offset = (u64)-1; key.type = (u8)-1; - btrfs_init_path(path); - search_again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -2663,6 +2655,8 @@ next: if (pending_del_nr) goto del_pending; btrfs_release_path(root, path); + if (found_type == BTRFS_INODE_ITEM_KEY) + break; goto search_again; } @@ -2679,6 +2673,8 @@ del_pending: BUG_ON(ret); pending_del_nr = 0; btrfs_release_path(root, path); + if (found_type == BTRFS_INODE_ITEM_KEY) + break; goto search_again; } } @@ -2788,7 +2784,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) if (size <= hole_start) return 0; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) return err; @@ -2984,6 +2980,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->last_trans = 0; bi->logged_trans = 0; bi->delalloc_bytes = 0; + bi->reserved_bytes = 0; bi->disk_i_size = 0; bi->flags = 0; bi->index_cnt = (u64)-1; @@ -3005,6 +3002,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) inode->i_ino = args->ino; init_btrfs_i(inode); BTRFS_I(inode)->root = args->root; + btrfs_set_inode_space_info(args->root, inode); return 0; } @@ -3265,7 +3263,7 @@ skip: /* Reached end of directory/root. Bump pos past the last item. */ if (key_type == BTRFS_DIR_INDEX_KEY) - filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); + filp->f_pos = INT_LIMIT(off_t); else filp->f_pos++; nopos: @@ -3425,6 +3423,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; + btrfs_set_inode_space_info(root, inode); if (mode & S_IFDIR) owner = 0; @@ -3458,7 +3457,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, root->highest_inode = objectid; inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + + if (dir && (dir->i_mode & S_ISGID)) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; inode->i_ino = objectid; inode_set_bytes(inode, 0); @@ -3565,7 +3571,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; @@ -3586,7 +3592,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -3628,7 +3634,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, u64 objectid; u64 index = 0; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; trans = btrfs_start_transaction(root, 1); @@ -3649,7 +3655,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -3696,7 +3702,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, return -ENOENT; btrfs_inc_nlink(inode); - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; err = btrfs_set_inode_index(dir, &index); @@ -3742,7 +3748,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) u64 index = 0; unsigned long nr = 1; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto out_unlock; @@ -3772,7 +3778,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) drop_on_err = 1; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) goto out_fail; @@ -4158,9 +4164,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, return -EINVAL; } -static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) +static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) { - return extent_bmap(mapping, iblock, btrfs_get_extent); + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); } int btrfs_readpage(struct file *file, struct page *page) @@ -4223,7 +4230,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) return 0; - return __btrfs_releasepage(page, gfp_flags); + return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); } static void btrfs_invalidatepage(struct page *page, unsigned long offset) @@ -4298,7 +4305,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) u64 page_start; u64 page_end; - ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); if (ret) goto out; @@ -4311,6 +4318,7 @@ again: if ((page->mapping != inode->i_mapping) || (page_start >= size)) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); /* page got truncated out from underneath us */ goto out_unlock; } @@ -4593,7 +4601,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -EXDEV; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto out_unlock; @@ -4711,7 +4719,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto out_fail; @@ -4733,7 +4741,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_inode_security(inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -4987,13 +4995,24 @@ static struct extent_io_ops btrfs_extent_io_ops = { .clear_bit_hook = btrfs_clear_bit_hook, }; +/* + * btrfs doesn't support the bmap operation because swapfiles + * use bmap to make a mapping of extents in the file. They assume + * these extents won't change over the life of the file and they + * use the bmap result to do IO directly to the drive. + * + * the btrfs bmap call would return logical addresses that aren't + * suitable for IO and they also will change frequently as COW + * operations happen. So, swapfile + btrfs == corruption. + * + * For now we're avoiding this by dropping bmap. + */ static struct address_space_operations btrfs_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readpages = btrfs_readpages, .sync_page = block_sync_page, - .bmap = btrfs_bmap, .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, @@ -5017,6 +5036,7 @@ static struct inode_operations btrfs_file_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .fallocate = btrfs_fallocate, + .fiemap = btrfs_fiemap, }; static struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, @@ -5032,4 +5052,8 @@ static struct inode_operations btrfs_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c2aa33e3feb..bca729fc80c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -38,7 +38,6 @@ #include <linux/compat.h> #include <linux/bit_spinlock.h> #include <linux/security.h> -#include <linux/version.h> #include <linux/xattr.h> #include <linux/vmalloc.h> #include "compat.h" @@ -71,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root, u64 index = 0; unsigned long nr = 1; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto fail_commit; @@ -204,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!root->ref_cows) return -EINVAL; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto fail_unlock; @@ -375,7 +374,7 @@ static int btrfs_defrag_file(struct file *file) unsigned long i; int ret; - ret = btrfs_check_free_space(root, inode->i_size, 0); + ret = btrfs_check_data_free_space(root, inode, inode->i_size); if (ret) return -ENOSPC; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 78049ea208d..b320b103fa1 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -22,13 +22,20 @@ #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_VOL_NAME_MAX 255 -#define BTRFS_PATH_NAME_MAX 3072 +#define BTRFS_PATH_NAME_MAX 4087 +/* this should be 4k */ struct btrfs_ioctl_vol_args { __s64 fd; char name[BTRFS_PATH_NAME_MAX + 1]; }; +struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -52,11 +59,6 @@ struct btrfs_ioctl_vol_args { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ struct btrfs_ioctl_vol_args) -struct btrfs_ioctl_clone_range_args { - __s64 src_fd; - __u64 src_offset, src_length; - __u64 dest_offset; -}; #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ struct btrfs_ioctl_clone_range_args) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 39bae7761db..85506c4a3af 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -25,64 +25,203 @@ #include "extent_io.h" #include "locking.h" +static inline void spin_nested(struct extent_buffer *eb) +{ + spin_lock(&eb->lock); +} + /* - * locks the per buffer mutex in an extent buffer. This uses adaptive locks - * and the spin is not tuned very extensively. The spinning does make a big - * difference in almost every workload, but spinning for the right amount of - * time needs some help. - * - * In general, we want to spin as long as the lock holder is doing btree - * searches, and we should give up if they are in more expensive code. + * Setting a lock to blocking will drop the spinlock and set the + * flag that forces other procs who want the lock to wait. After + * this you can safely schedule with the lock held. */ +void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + spin_unlock(&eb->lock); + } + /* exit with the spin lock released and the bit set */ +} -int btrfs_tree_lock(struct extent_buffer *eb) +/* + * clearing the blocking flag will take the spinlock again. + * After this you can't safely schedule + */ +void btrfs_clear_lock_blocking(struct extent_buffer *eb) { - int i; + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + spin_nested(eb); + clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + smp_mb__after_clear_bit(); + } + /* exit with the spin lock held */ +} - if (mutex_trylock(&eb->mutex)) - return 0; +/* + * unfortunately, many of the places that currently set a lock to blocking + * don't end up blocking for every long, and often they don't block + * at all. For a dbench 50 run, if we don't spin one the blocking bit + * at all, the context switch rate can jump up to 400,000/sec or more. + * + * So, we're still stuck with this crummy spin on the blocking bit, + * at least until the most common causes of the short blocks + * can be dealt with. + */ +static int btrfs_spin_on_block(struct extent_buffer *eb) +{ + int i; for (i = 0; i < 512; i++) { cpu_relax(); - if (mutex_trylock(&eb->mutex)) - return 0; + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + if (need_resched()) + break; } - cpu_relax(); - mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); return 0; } -int btrfs_try_tree_lock(struct extent_buffer *eb) +/* + * This is somewhat different from trylock. It will take the + * spinlock but if it finds the lock is set to blocking, it will + * return without the lock held. + * + * returns 1 if it was able to take the lock and zero otherwise + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_try_spin_lock(struct extent_buffer *eb) { - return mutex_trylock(&eb->mutex); + int i; + + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + + /* spin for a bit on the BLOCKING flag */ + for (i = 0; i < 2; i++) { + if (!btrfs_spin_on_block(eb)) + break; + + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } + return 0; } -int btrfs_tree_unlock(struct extent_buffer *eb) +/* + * the autoremove wake function will return 0 if it tried to wake up + * a process that was already awake, which means that process won't + * count as an exclusive wakeup. The waitq code will continue waking + * procs until it finds one that was actually sleeping. + * + * For btrfs, this isn't quite what we want. We want a single proc + * to be notified that the lock is ready for taking. If that proc + * already happen to be awake, great, it will loop around and try for + * the lock. + * + * So, btrfs_wake_function always returns 1, even when the proc that we + * tried to wake up was already awake. + */ +static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, + int sync, void *key) { - mutex_unlock(&eb->mutex); - return 0; + autoremove_wake_function(wait, mode, sync, key); + return 1; } -int btrfs_tree_locked(struct extent_buffer *eb) +/* + * returns with the extent buffer spinlocked. + * + * This will spin and/or wait as required to take the lock, and then + * return with the spinlock held. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_tree_lock(struct extent_buffer *eb) { - return mutex_is_locked(&eb->mutex); + DEFINE_WAIT(wait); + wait.func = btrfs_wake_function; + + while(1) { + spin_nested(eb); + + /* nobody is blocking, exit with the spinlock held */ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 0; + + /* + * we have the spinlock, but the real owner is blocking. + * wait for them + */ + spin_unlock(&eb->lock); + + /* + * spin for a bit, and if the blocking flag goes away, + * loop around + */ + if (btrfs_spin_on_block(eb)) + continue; + + prepare_to_wait_exclusive(&eb->lock_wq, &wait, + TASK_UNINTERRUPTIBLE); + + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + schedule(); + + finish_wait(&eb->lock_wq, &wait); + } + return 0; } /* - * btrfs_search_slot uses this to decide if it should drop its locks - * before doing something expensive like allocating free blocks for cow. + * Very quick trylock, this does not spin or schedule. It returns + * 1 with the spinlock held if it was able to take the lock, or it + * returns zero if it was unable to take the lock. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() */ -int btrfs_path_lock_waiting(struct btrfs_path *path, int level) +int btrfs_try_tree_lock(struct extent_buffer *eb) { - int i; - struct extent_buffer *eb; - for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { - eb = path->nodes[i]; - if (!eb) - break; - smp_mb(); - if (!list_empty(&eb->mutex.wait_list)) - return 1; + if (spin_trylock(&eb->lock)) { + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + /* + * we've got the spinlock, but the real owner is + * blocking. Drop the spinlock and return failure + */ + spin_unlock(&eb->lock); + return 0; + } + return 1; } + /* someone else has the spinlock giveup */ return 0; } +int btrfs_tree_unlock(struct extent_buffer *eb) +{ + /* + * if we were a blocking owner, we don't have the spinlock held + * just clear the bit and look for waiters + */ + if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + smp_mb__after_clear_bit(); + else + spin_unlock(&eb->lock); + + if (waitqueue_active(&eb->lock_wq)) + wake_up(&eb->lock_wq); + return 0; +} + +int btrfs_tree_locked(struct extent_buffer *eb) +{ + return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || + spin_is_locked(&eb->lock); +} diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index bc1faef1251..6bb0afbff92 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -22,6 +22,10 @@ int btrfs_tree_lock(struct extent_buffer *eb); int btrfs_tree_unlock(struct extent_buffer *eb); int btrfs_tree_locked(struct extent_buffer *eb); + int btrfs_try_tree_lock(struct extent_buffer *eb); -int btrfs_path_lock_waiting(struct btrfs_path *path, int level); +int btrfs_try_spin_lock(struct extent_buffer *eb); + +void btrfs_set_lock_blocking(struct extent_buffer *eb); +void btrfs_clear_lock_blocking(struct extent_buffer *eb); #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a2094017027..77c2411a5f0 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -613,7 +613,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, struct btrfs_sector_sum *sector_sums; struct btrfs_ordered_extent *ordered; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct list_head *cur; unsigned long num_sectors; unsigned long i; u32 sectorsize = BTRFS_I(inode)->root->sectorsize; @@ -624,8 +623,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, return 1; mutex_lock(&tree->mutex); - list_for_each_prev(cur, &ordered->list) { - ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr) { num_sectors = ordered_sum->len / sectorsize; sector_sums = ordered_sum->sums; diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c index 6f0acc4c9ea..d0cc62bccb9 100644 --- a/fs/btrfs/ref-cache.c +++ b/fs/btrfs/ref-cache.c @@ -17,6 +17,7 @@ */ #include <linux/sched.h> +#include <linux/sort.h> #include "ctree.h" #include "ref-cache.h" #include "transaction.h" diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h index 16f3183d7c5..bc283ad2db7 100644 --- a/fs/btrfs/ref-cache.h +++ b/fs/btrfs/ref-cache.h @@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, int shared); int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); - #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7256cf242eb..19a4daf03cc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -37,7 +37,6 @@ #include <linux/ctype.h> #include <linux/namei.h> #include <linux/miscdevice.h> -#include <linux/version.h> #include <linux/magic.h> #include "compat.h" #include "ctree.h" @@ -380,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_start_delalloc_inodes(root); btrfs_wait_ordered_extents(root, 0); - btrfs_clean_old_snapshots(root); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); sb->s_dirt = 0; @@ -512,6 +510,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) struct btrfs_root *root = btrfs_sb(sb); int ret; + ret = btrfs_parse_options(root, data); + if (ret) + return -EINVAL; + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; @@ -582,18 +584,20 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, { struct btrfs_ioctl_vol_args *vol; struct btrfs_fs_devices *fs_devices; - int ret = 0; - int len; + int ret = -ENOTTY; if (!capable(CAP_SYS_ADMIN)) return -EPERM; vol = kmalloc(sizeof(*vol), GFP_KERNEL); + if (!vol) + return -ENOMEM; + if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { ret = -EFAULT; goto out; } - len = strnlen(vol->name, BTRFS_PATH_NAME_MAX); + switch (cmd) { case BTRFS_IOC_SCAN_DEV: ret = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8a08f944334..4112d53d4f4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, num_bytes -= btrfs_root_used(&dirty->root->root_item); bytes_used = btrfs_root_used(&root->root_item); if (num_bytes) { + mutex_lock(&root->fs_info->trans_mutex); btrfs_record_root_in_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); btrfs_set_root_used(&root->root_item, bytes_used - num_bytes); } @@ -852,11 +854,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, { struct btrfs_pending_snapshot *pending; struct list_head *head = &trans->transaction->pending_snapshots; - struct list_head *cur; int ret; - list_for_each(cur, head) { - pending = list_entry(cur, struct btrfs_pending_snapshot, list); + list_for_each_entry(pending, head, list) { ret = create_pending_snapshot(trans, fs_info, pending); BUG_ON(ret); } diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3e8358c3616..98d25fa4570 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, u32 nritems; root_node = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(root_node); nritems = btrfs_header_nritems(root_node); root->defrag_max.objectid = 0; /* from above we know this is not a leaf */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d81cda2e077..9c462fbd60f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -78,104 +78,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans, */ /* - * btrfs_add_log_tree adds a new per-subvolume log tree into the - * tree of log tree roots. This must be called with a tree log transaction - * running (see start_log_trans). - */ -static int btrfs_add_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_key key; - struct btrfs_root_item root_item; - struct btrfs_inode_item *inode_item; - struct extent_buffer *leaf; - struct btrfs_root *new_root = root; - int ret; - u64 objectid = root->root_key.objectid; - - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, - trans->transid, 0, 0, 0); - if (IS_ERR(leaf)) { - ret = PTR_ERR(leaf); - return ret; - } - - btrfs_set_header_nritems(leaf, 0); - btrfs_set_header_level(leaf, 0); - btrfs_set_header_bytenr(leaf, leaf->start); - btrfs_set_header_generation(leaf, trans->transid); - btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); - - write_extent_buffer(leaf, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(leaf), - BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(leaf); - - inode_item = &root_item.inode; - memset(inode_item, 0, sizeof(*inode_item)); - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nbytes = cpu_to_le64(root->leafsize); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); - - btrfs_set_root_bytenr(&root_item, leaf->start); - btrfs_set_root_generation(&root_item, trans->transid); - btrfs_set_root_level(&root_item, 0); - btrfs_set_root_refs(&root_item, 0); - btrfs_set_root_used(&root_item, 0); - - memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); - root_item.drop_level = 0; - - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - leaf = NULL; - - btrfs_set_root_dirid(&root_item, 0); - - key.objectid = BTRFS_TREE_LOG_OBJECTID; - key.offset = objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, - &root_item); - if (ret) - goto fail; - - new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, - &key); - BUG_ON(!new_root); - - WARN_ON(root->log_root); - root->log_root = new_root; - - /* - * log trees do not get reference counted because they go away - * before a real commit is actually done. They do store pointers - * to file data extents, and those reference counts still get - * updated (along with back refs to the log tree). - */ - new_root->ref_cows = 0; - new_root->last_trans = trans->transid; - - /* - * we need to make sure the root block for this new tree - * is marked as dirty in the dirty_log_pages tree. This - * is how it gets flushed down to disk at tree log commit time. - * - * the tree logging mutex keeps others from coming in and changing - * the new_root->node, so we can safely access it here - */ - set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start, - new_root->node->start + new_root->node->len - 1, - GFP_NOFS); - -fail: - return ret; -} - -/* * start a sub transaction and setup the log tree * this increments the log tree writer count to make the people * syncing the tree wait for us to finish @@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; + + mutex_lock(&root->log_mutex); + if (root->log_root) { + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); + return 0; + } mutex_lock(&root->fs_info->tree_log_mutex); if (!root->fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, root->fs_info); @@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, ret = btrfs_add_log_tree(trans, root); BUG_ON(ret); } - atomic_inc(&root->fs_info->tree_log_writers); - root->fs_info->tree_log_batch++; mutex_unlock(&root->fs_info->tree_log_mutex); + root->log_batch++; + atomic_inc(&root->log_writers); + mutex_unlock(&root->log_mutex); return 0; } @@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root) if (!root->log_root) return -ENOENT; - mutex_lock(&root->fs_info->tree_log_mutex); + mutex_lock(&root->log_mutex); if (root->log_root) { ret = 0; - atomic_inc(&root->fs_info->tree_log_writers); - root->fs_info->tree_log_batch++; + atomic_inc(&root->log_writers); } - mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->log_mutex); return ret; } @@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root) */ static int end_log_trans(struct btrfs_root *root) { - atomic_dec(&root->fs_info->tree_log_writers); - smp_mb(); - if (waitqueue_active(&root->fs_info->tree_log_wait)) - wake_up(&root->fs_info->tree_log_wait); + if (atomic_dec_and_test(&root->log_writers)) { + smp_mb(); + if (waitqueue_active(&root->log_writer_wait)) + wake_up(&root->log_writer_wait); + } return 0; } @@ -1704,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1750,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1807,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1879,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, log, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1902,26 +1817,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, } } btrfs_free_path(path); - if (wc->free) - free_extent_buffer(log->node); return ret; } -static int wait_log_commit(struct btrfs_root *log) +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + int ret; + + if (log->log_transid == 1) { + /* insert root item on the first sync */ + ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } else { + ret = btrfs_update_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + } + return ret; +} + +static int wait_log_commit(struct btrfs_root *root, unsigned long transid) { DEFINE_WAIT(wait); - u64 transid = log->fs_info->tree_log_transid; + int index = transid % 2; + /* + * we only allow two pending log transactions at a time, + * so we know that if ours is more than 2 older than the + * current transaction, we're done + */ do { - prepare_to_wait(&log->fs_info->tree_log_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&log->fs_info->tree_log_mutex); - if (atomic_read(&log->fs_info->tree_log_commit)) + prepare_to_wait(&root->log_commit_wait[index], + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + if (root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])) schedule(); - finish_wait(&log->fs_info->tree_log_wait, &wait); - mutex_lock(&log->fs_info->tree_log_mutex); - } while (transid == log->fs_info->tree_log_transid && - atomic_read(&log->fs_info->tree_log_commit)); + finish_wait(&root->log_commit_wait[index], &wait); + mutex_lock(&root->log_mutex); + } while (root->log_transid < transid + 2 && + atomic_read(&root->log_commit[index])); + return 0; +} + +static int wait_for_writer(struct btrfs_root *root) +{ + DEFINE_WAIT(wait); + while (atomic_read(&root->log_writers)) { + prepare_to_wait(&root->log_writer_wait, + &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&root->log_mutex); + if (atomic_read(&root->log_writers)) + schedule(); + mutex_lock(&root->log_mutex); + finish_wait(&root->log_writer_wait, &wait); + } return 0; } @@ -1933,57 +1887,114 @@ static int wait_log_commit(struct btrfs_root *log) int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + int index1; + int index2; int ret; - unsigned long batch; struct btrfs_root *log = root->log_root; + struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - mutex_lock(&log->fs_info->tree_log_mutex); - if (atomic_read(&log->fs_info->tree_log_commit)) { - wait_log_commit(log); - goto out; + mutex_lock(&root->log_mutex); + index1 = root->log_transid % 2; + if (atomic_read(&root->log_commit[index1])) { + wait_log_commit(root, root->log_transid); + mutex_unlock(&root->log_mutex); + return 0; } - atomic_set(&log->fs_info->tree_log_commit, 1); + atomic_set(&root->log_commit[index1], 1); + + /* wait for previous tree log sync to complete */ + if (atomic_read(&root->log_commit[(index1 + 1) % 2])) + wait_log_commit(root, root->log_transid - 1); while (1) { - batch = log->fs_info->tree_log_batch; - mutex_unlock(&log->fs_info->tree_log_mutex); + unsigned long batch = root->log_batch; + mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); - mutex_lock(&log->fs_info->tree_log_mutex); - - while (atomic_read(&log->fs_info->tree_log_writers)) { - DEFINE_WAIT(wait); - prepare_to_wait(&log->fs_info->tree_log_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&log->fs_info->tree_log_mutex); - if (atomic_read(&log->fs_info->tree_log_writers)) - schedule(); - mutex_lock(&log->fs_info->tree_log_mutex); - finish_wait(&log->fs_info->tree_log_wait, &wait); - } - if (batch == log->fs_info->tree_log_batch) + mutex_lock(&root->log_mutex); + wait_for_writer(root); + if (batch == root->log_batch) break; } ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); - ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, - &root->fs_info->log_root_tree->dirty_log_pages); + + btrfs_set_root_bytenr(&log->root_item, log->node->start); + btrfs_set_root_generation(&log->root_item, trans->transid); + btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); + + root->log_batch = 0; + root->log_transid++; + log->log_transid = root->log_transid; + smp_mb(); + /* + * log tree has been flushed to disk, new modifications of + * the log will be written to new positions. so it's safe to + * allow log writers to go in. + */ + mutex_unlock(&root->log_mutex); + + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_batch++; + atomic_inc(&log_root_tree->log_writers); + mutex_unlock(&log_root_tree->log_mutex); + + ret = update_log_root(trans, log); + BUG_ON(ret); + + mutex_lock(&log_root_tree->log_mutex); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { + smp_mb(); + if (waitqueue_active(&log_root_tree->log_writer_wait)) + wake_up(&log_root_tree->log_writer_wait); + } + + index2 = log_root_tree->log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { + wait_log_commit(log_root_tree, log_root_tree->log_transid); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + atomic_set(&log_root_tree->log_commit[index2], 1); + + if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) + wait_log_commit(log_root_tree, log_root_tree->log_transid - 1); + + wait_for_writer(log_root_tree); + + ret = btrfs_write_and_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages); BUG_ON(ret); btrfs_set_super_log_root(&root->fs_info->super_for_commit, - log->fs_info->log_root_tree->node->start); + log_root_tree->node->start); btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, - btrfs_header_level(log->fs_info->log_root_tree->node)); + btrfs_header_level(log_root_tree->node)); + + log_root_tree->log_batch = 0; + log_root_tree->log_transid++; + smp_mb(); + + mutex_unlock(&log_root_tree->log_mutex); + + /* + * nobody else is going to jump in and write the the ctree + * super here because the log_commit atomic below is protecting + * us. We must be called with a transaction handle pinning + * the running transaction open, so a full commit can't hop + * in and cause problems either. + */ + write_ctree_super(trans, root->fs_info->tree_root, 2); - write_ctree_super(trans, log->fs_info->tree_root, 2); - log->fs_info->tree_log_transid++; - log->fs_info->tree_log_batch = 0; - atomic_set(&log->fs_info->tree_log_commit, 0); + atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); - if (waitqueue_active(&log->fs_info->tree_log_wait)) - wake_up(&log->fs_info->tree_log_wait); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) + wake_up(&log_root_tree->log_commit_wait[index2]); out: - mutex_unlock(&log->fs_info->tree_log_mutex); + atomic_set(&root->log_commit[index1], 0); + smp_mb(); + if (waitqueue_active(&root->log_commit_wait[index1])) + wake_up(&root->log_commit_wait[index1]); return 0; } @@ -2019,38 +2030,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) start, end, GFP_NOFS); } - log = root->log_root; - ret = btrfs_del_root(trans, root->fs_info->log_root_tree, - &log->root_key); - BUG_ON(ret); + if (log->log_transid > 0) { + ret = btrfs_del_root(trans, root->fs_info->log_root_tree, + &log->root_key); + BUG_ON(ret); + } root->log_root = NULL; - kfree(root->log_root); + free_extent_buffer(log->node); + kfree(log); return 0; } /* - * helper function to update the item for a given subvolumes log root - * in the tree of log roots - */ -static int update_log_root(struct btrfs_trans_handle *trans, - struct btrfs_root *log) -{ - u64 bytenr = btrfs_root_bytenr(&log->root_item); - int ret; - - if (log->node->start == bytenr) - return 0; - - btrfs_set_root_bytenr(&log->root_item, log->node->start); - btrfs_set_root_generation(&log->root_item, trans->transid); - btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); - ret = btrfs_update_root(trans, log->fs_info->log_root_tree, - &log->root_key, &log->root_item); - BUG_ON(ret); - return ret; -} - -/* * If both a file and directory are logged, and unlinks or renames are * mixed in, we have a few interesting corners: * @@ -2711,11 +2702,6 @@ next_slot: btrfs_free_path(path); btrfs_free_path(dst_path); - - mutex_lock(&root->fs_info->tree_log_mutex); - ret = update_log_root(trans, log); - BUG_ON(ret); - mutex_unlock(&root->fs_info->tree_log_mutex); out: return 0; } @@ -2846,7 +2832,9 @@ again: BUG_ON(!wc.replay_dest); wc.replay_dest->log_root = log; + mutex_lock(&fs_info->trans_mutex); btrfs_record_root_in_trans(wc.replay_dest); + mutex_unlock(&fs_info->trans_mutex); ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b187b537888..1316139bf9e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,7 +20,6 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/random.h> -#include <linux/version.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -104,10 +103,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head, u64 devid, u8 *uuid) { struct btrfs_device *dev; - struct list_head *cur; - list_for_each(cur, head) { - dev = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(dev, head, dev_list) { if (dev->devid == devid && (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { return dev; @@ -118,11 +115,9 @@ static noinline struct btrfs_device *__find_device(struct list_head *head, static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) { - struct list_head *cur; struct btrfs_fs_devices *fs_devices; - list_for_each(cur, &fs_uuids) { - fs_devices = list_entry(cur, struct btrfs_fs_devices, list); + list_for_each_entry(fs_devices, &fs_uuids, list) { if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) return fs_devices; } @@ -159,6 +154,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) loop: spin_lock(&device->io_lock); +loop_lock: /* take all the bios off the list at once and process them * later on (without the lock held). But, remember the * tail and other pointers so the bios can be properly reinserted @@ -208,7 +204,7 @@ loop: * is now congested. Back off and let other work structs * run instead */ - if (pending && bdi_write_congested(bdi) && + if (pending && bdi_write_congested(bdi) && num_run > 16 && fs_info->fs_devices->open_devices > 1) { struct bio *old_head; @@ -221,6 +217,8 @@ loop: else device->pending_bio_tail = tail; + device->running_pending = 1; + spin_unlock(&device->io_lock); btrfs_requeue_work(&device->work); goto done; @@ -228,6 +226,11 @@ loop: } if (again) goto loop; + + spin_lock(&device->io_lock); + if (device->pending_bios) + goto loop_lock; + spin_unlock(&device->io_lock); done: return 0; } @@ -344,14 +347,11 @@ error: int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) { - struct list_head *tmp; - struct list_head *cur; - struct btrfs_device *device; + struct btrfs_device *device, *next; mutex_lock(&uuid_mutex); again: - list_for_each_safe(cur, tmp, &fs_devices->devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) continue; @@ -382,14 +382,12 @@ again: static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct list_head *cur; struct btrfs_device *device; if (--fs_devices->opened > 0) return 0; - list_for_each(cur, &fs_devices->devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev) { close_bdev_exclusive(device->bdev, device->mode); fs_devices->open_devices--; @@ -438,7 +436,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, { struct block_device *bdev; struct list_head *head = &fs_devices->devices; - struct list_head *cur; struct btrfs_device *device; struct block_device *latest_bdev = NULL; struct buffer_head *bh; @@ -449,8 +446,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int seeding = 1; int ret = 0; - list_for_each(cur, head) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, head, dev_list) { if (device->bdev) continue; if (!device->name) @@ -577,7 +573,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, *(unsigned long long *)disk_super->fsid, *(unsigned long long *)(disk_super->fsid + 8)); } - printk(KERN_INFO "devid %llu transid %llu %s\n", + printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); @@ -1016,14 +1012,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } if (strcmp(device_path, "missing") == 0) { - struct list_head *cur; struct list_head *devices; struct btrfs_device *tmp; device = NULL; devices = &root->fs_info->fs_devices->devices; - list_for_each(cur, devices) { - tmp = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(tmp, devices, dev_list) { if (tmp->in_fs_metadata && !tmp->bdev) { device = tmp; break; @@ -1279,7 +1273,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; - struct list_head *cur; struct list_head *devices; struct super_block *sb = root->fs_info->sb; u64 total_bytes; @@ -1303,8 +1296,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; - list_for_each(cur, devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; goto error; @@ -1703,7 +1695,6 @@ static u64 div_factor(u64 num, int factor) int btrfs_balance(struct btrfs_root *dev_root) { int ret; - struct list_head *cur; struct list_head *devices = &dev_root->fs_info->fs_devices->devices; struct btrfs_device *device; u64 old_size; @@ -1722,8 +1713,7 @@ int btrfs_balance(struct btrfs_root *dev_root) dev_root = dev_root->fs_info->dev_root; /* step one make some room on all the devices */ - list_for_each(cur, devices) { - device = list_entry(cur, struct btrfs_device, dev_list); + list_for_each_entry(device, devices, dev_list) { old_size = device->total_bytes; size_to_free = div_factor(old_size, 1); size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); @@ -2904,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); } - map = kzalloc(sizeof(*map), GFP_NOFS); - if (!map) - return -ENOMEM; - em = alloc_extent_map(GFP_NOFS); if (!em) return -ENOMEM; @@ -3116,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); + btrfs_set_buffer_lockdep_class(sb, 0); + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 7f332e27089..a9d3bf4d268 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/rwsem.h> #include <linux/xattr.h> +#include <linux/security.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -45,9 +46,12 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name, /* lookup the xattr by name */ di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, strlen(name), 0); - if (!di || IS_ERR(di)) { + if (!di) { ret = -ENODATA; goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; } leaf = path->nodes[0]; @@ -62,6 +66,14 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name, ret = -ERANGE; goto out; } + + /* + * The way things are packed into the leaf is like this + * |struct btrfs_dir_item|name|data| + * where name is the xattr name, so security.foo, and data is the + * content of the xattr. data_ptr points to the location in memory + * where the data starts in the in memory leaf + */ data_ptr = (unsigned long)((char *)(di + 1) + btrfs_dir_name_len(leaf, di)); read_extent_buffer(leaf, buffer, data_ptr, @@ -86,7 +98,7 @@ int __btrfs_setxattr(struct inode *inode, const char *name, if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); /* first lets see if we already have this xattr */ @@ -176,7 +188,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto err; - ret = 0; advance = 0; while (1) { leaf = path->nodes[0]; @@ -320,3 +331,34 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) return -EOPNOTSUPP; return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); } + +int btrfs_xattr_security_init(struct inode *inode, struct inode *dir) +{ + int err; + size_t len; + void *value; + char *suffix; + char *name; + + err = security_inode_init_security(inode, dir, &suffix, &value, &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, + GFP_NOFS); + if (!name) { + err = -ENOMEM; + } else { + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); + err = __btrfs_setxattr(inode, name, value, len, 0); + kfree(name); + } + + kfree(suffix); + kfree(value); + return err; +} diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 5b1d08f8e68..c71e9c3cf3f 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -36,4 +36,6 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int btrfs_removexattr(struct dentry *dentry, const char *name); +extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir); + #endif /* __XATTR__ */ diff --git a/fs/buffer.c b/fs/buffer.c index b58208f1640..9f697419ed8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -777,6 +777,7 @@ static int __set_page_dirty(struct page *page, __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, @@ -2688,7 +2689,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping, struct buffer_head *bh; BUG_ON(fsdata != NULL && page_has_buffers(page)); - if (unlikely(copied < len) && !page_has_buffers(page)) + if (unlikely(copied < len) && head) attach_nobh_buffers(page, head); if (page_has_buffers(page)) return generic_write_end(file, mapping, pos, len, @@ -3108,7 +3109,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 080703a15f4..851388fafc7 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,13 @@ +Version 1.57 +------------ +Improve support for multiple security contexts to the same server. We +used to use the same "vcnumber" for all connections which could cause +the server to treat subsequent connections, especially those that +are authenticated as guest, as reconnections, invalidating the earlier +user's smb session. This fix allows cifs to mount multiple times to the +same server with different userids without risking invalidating earlier +established security contexts. + Version 1.56 ------------ Add "forcemandatorylock" mount option to allow user to use mandatory @@ -5,7 +15,12 @@ rather than posix (advisory) byte range locks, even though server would support posix byte range locks. Fix query of root inode when prefixpath specified and user does not have access to query information about the top of the share. Fix problem in 2.6.28 resolving DFS paths to -Samba servers (worked to Windows). +Samba servers (worked to Windows). Fix rmdir so that pending search +(readdir) requests do not get invalid results which include the now +removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable +when using DFS. Add better file create support to servers which support +the CIFS POSIX protocol extensions (this adds support for new flags +on create, and improves semantics for write of locked ranges). Version 1.55 ------------ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index d4839cf0cb2..7c9809523f4 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -48,11 +48,11 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) return -EINVAL; - MD5Init(&context); - MD5Update(&context, (char *)&key->data, key->len); - MD5Update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); + cifs_MD5_init(&context); + cifs_MD5_update(&context, (char *)&key->data, key->len); + cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); - MD5Final(signature, &context); + cifs_MD5_final(signature, &context); return 0; } @@ -96,8 +96,8 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if ((iov == NULL) || (signature == NULL) || (key == NULL)) return -EINVAL; - MD5Init(&context); - MD5Update(&context, (char *)&key->data, key->len); + cifs_MD5_init(&context); + cifs_MD5_update(&context, (char *)&key->data, key->len); for (i = 0; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; @@ -110,13 +110,13 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ - MD5Update(&context, iov[0].iov_base+4, + cifs_MD5_update(&context, iov[0].iov_base+4, iov[0].iov_len-4); } else - MD5Update(&context, iov[i].iov_base, iov[i].iov_len); + cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len); } - MD5Final(signature, &context); + cifs_MD5_final(signature, &context); return 0; } diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7ac481841f8..2b1d28a9ee2 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.56" +#define CIFS_VERSION "1.57" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 94c1ca0ec95..e004f6db5fc 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -164,9 +164,12 @@ struct TCP_Server_Info { /* multiplexed reads or writes */ unsigned int maxBuf; /* maxBuf specifies the maximum */ /* message size the server can send or receive for non-raw SMBs */ - unsigned int maxRw; /* maxRw specifies the maximum */ + unsigned int max_rw; /* maxRw specifies the maximum */ /* message size the server can send or receive for */ /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ + unsigned int max_vcs; /* maximum number of smb sessions, at least + those that can be specified uniquely with + vcnumbers */ char sessid[4]; /* unique token id for this session */ /* (returned on Negotiate */ int capabilities; /* allow selective disabling of caps by smb sess */ @@ -210,6 +213,7 @@ struct cifsSesInfo { unsigned overrideSecFlg; /* if non-zero override global sec flags */ __u16 ipc_tid; /* special tid for connection to IPC share */ __u16 flags; + __u16 vcnum; char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 06f6779988b..083dfc57c7a 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -35,13 +35,14 @@ extern struct smb_hdr *cifs_buf_get(void); extern void cifs_buf_release(void *); extern struct smb_hdr *cifs_small_buf_get(void); extern void cifs_small_buf_release(void *); -extern int smb_send(struct socket *, struct smb_hdr *, - unsigned int /* length */ , struct sockaddr *, bool); +extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, + unsigned int /* length */); extern unsigned int _GetXid(void); extern void _FreeXid(unsigned int); #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid())); #define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));} extern char *build_path_from_dentry(struct dentry *); +extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); /* extern void renew_parental_timestamps(struct dentry *direntry);*/ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, @@ -91,6 +92,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); +extern void posix_fill_in_inode(struct inode *tmp_inode, + FILE_UNIX_BASIC_INFO *pData, int isNewInode); +extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum); extern int cifs_get_inode_info(struct inode **pinode, const unsigned char *search_path, FILE_ALL_INFO *pfile_info, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 552642a507c..939e2f76b95 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -528,14 +528,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) server->maxReq = le16_to_cpu(rsp->MaxMpxCount); server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey); /* even though we do not use raw we might as well set this accurately, in case we ever find a need for it */ if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { - server->maxRw = 0xFF00; + server->max_rw = 0xFF00; server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; } else { - server->maxRw = 0;/* we do not need to use raw anyway */ + server->max_rw = 0;/* do not need to use raw anyway */ server->capabilities = CAP_MPX_MODE; } tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); @@ -638,7 +639,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) /* probably no need to store and check maxvcs */ server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); - server->maxRw = le32_to_cpu(pSMBr->MaxRawSize); + server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf)); GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); server->capabilities = le32_to_cpu(pSMBr->Capabilities); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e9ea394ee07..da0f4ffa061 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -23,7 +23,6 @@ #include <linux/string.h> #include <linux/list.h> #include <linux/wait.h> -#include <linux/ipv6.h> #include <linux/pagemap.h> #include <linux/ctype.h> #include <linux/utsname.h> @@ -35,6 +34,7 @@ #include <linux/freezer.h> #include <asm/uaccess.h> #include <asm/processor.h> +#include <net/ipv6.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -1354,7 +1354,7 @@ cifs_parse_mount_options(char *options, const char *devname, } static struct TCP_Server_Info * -cifs_find_tcp_session(struct sockaddr *addr) +cifs_find_tcp_session(struct sockaddr_storage *addr) { struct list_head *tmp; struct TCP_Server_Info *server; @@ -1374,13 +1374,13 @@ cifs_find_tcp_session(struct sockaddr *addr) if (server->tcpStatus == CifsNew) continue; - if (addr->sa_family == AF_INET && + if (addr->ss_family == AF_INET && (addr4->sin_addr.s_addr != server->addr.sockAddr.sin_addr.s_addr)) continue; - else if (addr->sa_family == AF_INET6 && - memcmp(&server->addr.sockAddr6.sin6_addr, - &addr6->sin6_addr, sizeof(addr6->sin6_addr))) + else if (addr->ss_family == AF_INET6 && + !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr, + &addr6->sin6_addr)) continue; ++server->srv_count; @@ -1419,12 +1419,12 @@ static struct TCP_Server_Info * cifs_get_tcp_session(struct smb_vol *volume_info) { struct TCP_Server_Info *tcp_ses = NULL; - struct sockaddr addr; + struct sockaddr_storage addr; struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr; struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr; int rc; - memset(&addr, 0, sizeof(struct sockaddr)); + memset(&addr, 0, sizeof(struct sockaddr_storage)); if (volume_info->UNCip && volume_info->UNC) { rc = cifs_inet_pton(AF_INET, volume_info->UNCip, @@ -1435,9 +1435,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info) rc = cifs_inet_pton(AF_INET6, volume_info->UNCip, &sin_server6->sin6_addr.in6_u); if (rc > 0) - addr.sa_family = AF_INET6; + addr.ss_family = AF_INET6; } else { - addr.sa_family = AF_INET; + addr.ss_family = AF_INET; } if (rc <= 0) { @@ -1502,7 +1502,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->tcpStatus = CifsNew; ++tcp_ses->srv_count; - if (addr.sa_family == AF_INET6) { + if (addr.ss_family == AF_INET6) { cFYI(1, ("attempting ipv6 connect")); /* BB should we allow ipv6 on port 139? */ /* other OS never observed in Wild doing 139 with v6 */ @@ -1802,7 +1802,7 @@ ipv4_connect(struct TCP_Server_Info *server) * user space buffer */ socket->sk->sk_rcvtimeo = 7 * HZ; - socket->sk->sk_sndtimeo = 3 * HZ; + socket->sk->sk_sndtimeo = 5 * HZ; /* make the bufsizes depend on wsize/rsize and max requests */ if (server->noautotune) { @@ -1860,9 +1860,7 @@ ipv4_connect(struct TCP_Server_Info *server) smb_buf = (struct smb_hdr *)ses_init_buf; /* sizeof RFC1002_SESSION_REQUEST with no scope */ smb_buf->smb_buf_length = 0x81000044; - rc = smb_send(socket, smb_buf, 0x44, - (struct sockaddr *) &server->addr.sockAddr, - server->noblocksnd); + rc = smb_send(server, smb_buf, 0x44); kfree(ses_init_buf); msleep(1); /* RFC1001 layer in at least one server requires very short break before negprot @@ -1955,7 +1953,7 @@ ipv6_connect(struct TCP_Server_Info *server) * user space buffer */ socket->sk->sk_rcvtimeo = 7 * HZ; - socket->sk->sk_sndtimeo = 3 * HZ; + socket->sk->sk_sndtimeo = 5 * HZ; server->ssocket = socket; return rc; @@ -2182,6 +2180,33 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info, "mount option supported")); } +static int +is_path_accessible(int xid, struct cifsTconInfo *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path) +{ + int rc; + __u64 inode_num; + FILE_ALL_INFO *pfile_info; + + rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != -EOPNOTSUPP) + return rc; + + pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (pfile_info == NULL) + return -ENOMEM; + + rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info, + 0 /* not legacy */, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + kfree(pfile_info); + return rc; +} + int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, char *mount_data, const char *devname) @@ -2192,6 +2217,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, struct cifsSesInfo *pSesInfo = NULL; struct cifsTconInfo *tcon = NULL; struct TCP_Server_Info *srvTcp = NULL; + char *full_path; xid = GetXid(); @@ -2428,6 +2454,23 @@ mount_fail_check: cifs_sb->rsize = min(cifs_sb->rsize, (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); + if (!rc && cifs_sb->prepathlen) { + /* build_path_to_root works only when we have a valid tcon */ + full_path = cifs_build_path_to_root(cifs_sb); + if (full_path == NULL) { + rc = -ENOMEM; + goto mount_fail_check; + } + rc = is_path_accessible(xid, tcon, cifs_sb, full_path); + if (rc) { + cERROR(1, ("Path %s in not accessible: %d", + full_path, rc)); + kfree(full_path); + goto mount_fail_check; + } + kfree(full_path); + } + /* volume_info->password is freed above when existing session found (in which case it is not needed anymore) but when new sesion is created the password ptr is put in the new session structure (in which case the diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 838d9c720a5..89fb7283265 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -3,7 +3,7 @@ * * vfs operations that deal with dentries * - * Copyright (C) International Business Machines Corp., 2002,2008 + * Copyright (C) International Business Machines Corp., 2002,2009 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -129,6 +129,89 @@ cifs_bp_rename_retry: return full_path; } +static int cifs_posix_open(char *full_path, struct inode **pinode, + struct super_block *sb, int mode, int oflags, + int *poplock, __u16 *pnetfid, int xid) +{ + int rc; + __u32 oplock; + FILE_UNIX_BASIC_INFO *presp_data; + __u32 posix_flags = 0; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + cFYI(1, ("posix open %s", full_path)); + + presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); + if (presp_data == NULL) + return -ENOMEM; + +/* So far cifs posix extensions can only map the following flags. + There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but + so far we do not seem to need them, and we can treat them as local only */ + if ((oflags & (FMODE_READ | FMODE_WRITE)) == + (FMODE_READ | FMODE_WRITE)) + posix_flags = SMB_O_RDWR; + else if (oflags & FMODE_READ) + posix_flags = SMB_O_RDONLY; + else if (oflags & FMODE_WRITE) + posix_flags = SMB_O_WRONLY; + if (oflags & O_CREAT) + posix_flags |= SMB_O_CREAT; + if (oflags & O_EXCL) + posix_flags |= SMB_O_EXCL; + if (oflags & O_TRUNC) + posix_flags |= SMB_O_TRUNC; + if (oflags & O_APPEND) + posix_flags |= SMB_O_APPEND; + if (oflags & O_SYNC) + posix_flags |= SMB_O_SYNC; + if (oflags & O_DIRECTORY) + posix_flags |= SMB_O_DIRECTORY; + if (oflags & O_NOFOLLOW) + posix_flags |= SMB_O_NOFOLLOW; + if (oflags & O_DIRECT) + posix_flags |= SMB_O_DIRECT; + + + rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode, + pnetfid, presp_data, &oplock, full_path, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + goto posix_open_ret; + + if (presp_data->Type == cpu_to_le32(-1)) + goto posix_open_ret; /* open ok, caller does qpathinfo */ + + /* get new inode and set it up */ + if (!pinode) + goto posix_open_ret; /* caller does not need info */ + + *pinode = cifs_new_inode(sb, &presp_data->UniqueId); + + /* We do not need to close the file if new_inode fails since + the caller will retry qpathinfo as long as inode is null */ + if (*pinode == NULL) + goto posix_open_ret; + + posix_fill_in_inode(*pinode, presp_data, 1); + +posix_open_ret: + kfree(presp_data); + return rc; +} + +static void setup_cifs_dentry(struct cifsTconInfo *tcon, + struct dentry *direntry, + struct inode *newinode) +{ + if (tcon->nocase) + direntry->d_op = &cifs_ci_dentry_ops; + else + direntry->d_op = &cifs_dentry_ops; + d_instantiate(direntry, newinode); +} + /* Inode operations in similar order to how they appear in Linux file fs.h */ int @@ -139,14 +222,21 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, int xid; int create_options = CREATE_NOT_DIR; int oplock = 0; + int oflags; + /* + * BB below access is probably too much for mknod to request + * but we have to do query and setpathinfo so requesting + * less could fail (unless we want to request getatr and setatr + * permissions (only). At least for POSIX we do not have to + * request so much. + */ int desiredAccess = GENERIC_READ | GENERIC_WRITE; __u16 fileHandle; struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; + struct cifsTconInfo *tcon; char *full_path = NULL; FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; - struct cifsFileInfo *pCifsFile = NULL; struct cifsInodeInfo *pCifsInode; int disposition = FILE_OVERWRITE_IF; bool write_only = false; @@ -154,7 +244,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, xid = GetXid(); cifs_sb = CIFS_SB(inode->i_sb); - pTcon = cifs_sb->tcon; + tcon = cifs_sb->tcon; full_path = build_path_from_dentry(direntry); if (full_path == NULL) { @@ -162,12 +252,44 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, return -ENOMEM; } - if (nd && (nd->flags & LOOKUP_OPEN)) { - int oflags = nd->intent.open.flags; + mode &= ~current->fs->umask; + if (oplockEnabled) + oplock = REQ_OPLOCK; + + if (nd && (nd->flags & LOOKUP_OPEN)) + oflags = nd->intent.open.flags; + else + oflags = FMODE_READ; + + if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = cifs_posix_open(full_path, &newinode, inode->i_sb, + mode, oflags, &oplock, &fileHandle, xid); + /* EIO could indicate that (posix open) operation is not + supported, despite what server claimed in capability + negotation. EREMOTE indicates DFS junction, which is not + handled in posix open */ + + if ((rc == 0) && (newinode == NULL)) + goto cifs_create_get_file_info; /* query inode info */ + else if (rc == 0) /* success, no need to query */ + goto cifs_create_set_dentry; + else if ((rc != -EIO) && (rc != -EREMOTE) && + (rc != -EOPNOTSUPP)) /* path not found or net err */ + goto cifs_create_out; + /* else fallthrough to retry, using older open call, this is + case where server does not support this SMB level, and + falsely claims capability (also get here for DFS case + which should be rare for path not covered on files) */ + } + if (nd && (nd->flags & LOOKUP_OPEN)) { + /* if the file is going to stay open, then we + need to set the desired access properly */ desiredAccess = 0; if (oflags & FMODE_READ) - desiredAccess |= GENERIC_READ; + desiredAccess |= GENERIC_READ; /* is this too little? */ if (oflags & FMODE_WRITE) { desiredAccess |= GENERIC_WRITE; if (!(oflags & FMODE_READ)) @@ -186,8 +308,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, /* BB add processing to set equivalent of mode - e.g. via CreateX with ACLs */ - if (oplockEnabled) - oplock = REQ_OPLOCK; buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { @@ -196,17 +316,15 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, return -ENOMEM; } - mode &= ~current->fs->umask; - /* * if we're not using unix extensions, see if we need to set * ATTR_READONLY on the create call */ - if (!pTcon->unix_ext && (mode & S_IWUGO) == 0) + if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) - rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, + rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, create_options, &fileHandle, &oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -215,128 +333,119 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, if (rc == -EIO) { /* old server, retry the open legacy style */ - rc = SMBLegacyOpen(xid, pTcon, full_path, disposition, + rc = SMBLegacyOpen(xid, tcon, full_path, disposition, desiredAccess, create_options, &fileHandle, &oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); } if (rc) { cFYI(1, ("cifs_create returned 0x%x", rc)); - } else { - /* If Open reported that we actually created a file - then we now have to set the mode if possible */ - if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { - struct cifs_unix_set_info_args args = { + goto cifs_create_out; + } + + /* If Open reported that we actually created a file + then we now have to set the mode if possible */ + if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { + struct cifs_unix_set_info_args args = { .mode = mode, .ctime = NO_CHANGE_64, .atime = NO_CHANGE_64, .mtime = NO_CHANGE_64, .device = 0, - }; + }; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = (__u64) current_fsuid(); - if (inode->i_mode & S_ISGID) - args.gid = (__u64) inode->i_gid; - else - args.gid = (__u64) current_fsgid(); - } else { - args.uid = NO_CHANGE_64; - args.gid = NO_CHANGE_64; - } - CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = (__u64) current_fsuid(); + if (inode->i_mode & S_ISGID) + args.gid = (__u64) inode->i_gid; + else + args.gid = (__u64) current_fsgid(); } else { - /* BB implement mode setting via Windows security - descriptors e.g. */ - /* CIFSSMBWinSetPerms(xid,pTcon,path,mode,-1,-1,nls);*/ - - /* Could set r/o dos attribute if mode & 0222 == 0 */ + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; } + CIFSSMBUnixSetInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else { + /* BB implement mode setting via Windows security + descriptors e.g. */ + /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/ - /* server might mask mode so we have to query for it */ - if (pTcon->unix_ext) - rc = cifs_get_inode_info_unix(&newinode, full_path, - inode->i_sb, xid); - else { - rc = cifs_get_inode_info(&newinode, full_path, - buf, inode->i_sb, xid, - &fileHandle); - if (newinode) { - if (cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_DYNPERM) - newinode->i_mode = mode; - if ((oplock & CIFS_CREATE_ACTION) && - (cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_SET_UID)) { - newinode->i_uid = current_fsuid(); - if (inode->i_mode & S_ISGID) - newinode->i_gid = - inode->i_gid; - else - newinode->i_gid = - current_fsgid(); - } + /* Could set r/o dos attribute if mode & 0222 == 0 */ + } + +cifs_create_get_file_info: + /* server might mask mode so we have to query for it */ + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + else { + rc = cifs_get_inode_info(&newinode, full_path, buf, + inode->i_sb, xid, &fileHandle); + if (newinode) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + newinode->i_mode = mode; + if ((oplock & CIFS_CREATE_ACTION) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { + newinode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + newinode->i_gid = inode->i_gid; + else + newinode->i_gid = current_fsgid(); } } + } - if (rc != 0) { - cFYI(1, - ("Create worked but get_inode_info failed rc = %d", - rc)); - } else { - if (pTcon->nocase) - direntry->d_op = &cifs_ci_dentry_ops; - else - direntry->d_op = &cifs_dentry_ops; - d_instantiate(direntry, newinode); - } - if ((nd == NULL /* nfsd case - nfs srv does not set nd */) || - (!(nd->flags & LOOKUP_OPEN))) { - /* mknod case - do not leave file open */ - CIFSSMBClose(xid, pTcon, fileHandle); - } else if (newinode) { - pCifsFile = - kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); - - if (pCifsFile == NULL) - goto cifs_create_out; - pCifsFile->netfid = fileHandle; - pCifsFile->pid = current->tgid; - pCifsFile->pInode = newinode; - pCifsFile->invalidHandle = false; - pCifsFile->closePend = false; - init_MUTEX(&pCifsFile->fh_sem); - mutex_init(&pCifsFile->lock_mutex); - INIT_LIST_HEAD(&pCifsFile->llist); - atomic_set(&pCifsFile->wrtPending, 0); - - /* set the following in open now +cifs_create_set_dentry: + if (rc == 0) + setup_cifs_dentry(tcon, direntry, newinode); + else + cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc)); + + /* nfsd case - nfs srv does not set nd */ + if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { + /* mknod case - do not leave file open */ + CIFSSMBClose(xid, tcon, fileHandle); + } else if (newinode) { + struct cifsFileInfo *pCifsFile = + kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + + if (pCifsFile == NULL) + goto cifs_create_out; + pCifsFile->netfid = fileHandle; + pCifsFile->pid = current->tgid; + pCifsFile->pInode = newinode; + pCifsFile->invalidHandle = false; + pCifsFile->closePend = false; + init_MUTEX(&pCifsFile->fh_sem); + mutex_init(&pCifsFile->lock_mutex); + INIT_LIST_HEAD(&pCifsFile->llist); + atomic_set(&pCifsFile->wrtPending, 0); + + /* set the following in open now pCifsFile->pfile = file; */ - write_lock(&GlobalSMBSeslock); - list_add(&pCifsFile->tlist, &pTcon->openFileList); - pCifsInode = CIFS_I(newinode); - if (pCifsInode) { - /* if readable file instance put first in list*/ - if (write_only) { - list_add_tail(&pCifsFile->flist, - &pCifsInode->openFileList); - } else { - list_add(&pCifsFile->flist, - &pCifsInode->openFileList); - } - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, ("Exclusive Oplock inode %p", - newinode)); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + write_lock(&GlobalSMBSeslock); + list_add(&pCifsFile->tlist, &tcon->openFileList); + pCifsInode = CIFS_I(newinode); + if (pCifsInode) { + /* if readable file instance put first in list*/ + if (write_only) { + list_add_tail(&pCifsFile->flist, + &pCifsInode->openFileList); + } else { + list_add(&pCifsFile->flist, + &pCifsInode->openFileList); } - write_unlock(&GlobalSMBSeslock); + if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { + pCifsInode->clientCanCacheAll = true; + pCifsInode->clientCanCacheRead = true; + cFYI(1, ("Exclusive Oplock inode %p", + newinode)); + } else if ((oplock & 0xF) == OPLOCK_READ) + pCifsInode->clientCanCacheRead = true; } + write_unlock(&GlobalSMBSeslock); } cifs_create_out: kfree(buf); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 5ab9896fdcb..4690a360c85 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -199,6 +199,49 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat, pfnd_dat->Gid = cpu_to_le64(pinode->i_gid); } +/** + * cifs_new inode - create new inode, initialize, and hash it + * @sb - pointer to superblock + * @inum - if valid pointer and serverino is enabled, replace i_ino with val + * + * Create a new inode, initialize it for CIFS and hash it. Returns the new + * inode or NULL if one couldn't be allocated. + * + * If the share isn't mounted with "serverino" or inum is a NULL pointer then + * we'll just use the inode number assigned by new_inode(). Note that this can + * mean i_ino collisions since the i_ino assigned by new_inode is not + * guaranteed to be unique. + */ +struct inode * +cifs_new_inode(struct super_block *sb, __u64 *inum) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode == NULL) + return NULL; + + /* + * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we + * stop passing inum as ptr. Are there sanity checks we can use to + * ensure that the server is really filling in that field? Also, + * if serverino is disabled, perhaps we should be using iunique()? + */ + if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) + inode->i_ino = (unsigned long) *inum; + + /* + * must set this here instead of cifs_alloc_inode since VFS will + * clobber i_flags + */ + if (sb->s_flags & MS_NOATIME) + inode->i_flags |= S_NOATIME | S_NOCMTIME; + + insert_inode_hash(inode); + + return inode; +} + int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *full_path, struct super_block *sb, int xid) { @@ -233,22 +276,11 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { - *pinode = new_inode(sb); + *pinode = cifs_new_inode(sb, &find_data.UniqueId); if (*pinode == NULL) { rc = -ENOMEM; goto cgiiu_exit; } - /* Is an i_ino of zero legal? */ - /* note ino incremented to unique num in new_inode */ - /* Are there sanity checks we can use to ensure that - the server is really filling in that field? */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - (*pinode)->i_ino = (unsigned long)find_data.UniqueId; - - if (sb->s_flags & MS_NOATIME) - (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; - - insert_inode_hash(*pinode); } inode = *pinode; @@ -465,11 +497,9 @@ int cifs_get_inode_info(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { - *pinode = new_inode(sb); - if (*pinode == NULL) { - rc = -ENOMEM; - goto cgii_exit; - } + __u64 inode_num; + __u64 *pinum = &inode_num; + /* Is an i_ino of zero legal? Can we use that to check if the server supports returning inode numbers? Are there other sanity checks we can use to ensure that @@ -486,22 +516,26 @@ int cifs_get_inode_info(struct inode **pinode, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { int rc1 = 0; - __u64 inode_num; rc1 = CIFSGetSrvInodeNumber(xid, pTcon, - full_path, &inode_num, + full_path, pinum, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc1) { cFYI(1, ("GetSrvInodeNum rc %d", rc1)); + pinum = NULL; /* BB EOPNOSUPP disable SERVER_INUM? */ - } else /* do we need cast or hash to ino? */ - (*pinode)->i_ino = inode_num; - } /* else ino incremented to unique num in new_inode*/ - if (sb->s_flags & MS_NOATIME) - (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; - insert_inode_hash(*pinode); + } + } else { + pinum = NULL; + } + + *pinode = cifs_new_inode(sb, pinum); + if (*pinode == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } } inode = *pinode; cifsInfo = CIFS_I(inode); @@ -621,7 +655,7 @@ static const struct inode_operations cifs_ipc_inode_ops = { .lookup = cifs_lookup, }; -static char *build_path_to_root(struct cifs_sb_info *cifs_sb) +char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) { int pplen = cifs_sb->prepathlen; int dfsplen; @@ -678,7 +712,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino) return inode; cifs_sb = CIFS_SB(inode->i_sb); - full_path = build_path_to_root(cifs_sb); + full_path = cifs_build_path_to_root(cifs_sb); if (full_path == NULL) return ERR_PTR(-ENOMEM); @@ -1017,7 +1051,7 @@ out_reval: return rc; } -static void posix_fill_in_inode(struct inode *tmp_inode, +void posix_fill_in_inode(struct inode *tmp_inode, FILE_UNIX_BASIC_INFO *pData, int isNewInode) { struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode); @@ -1114,24 +1148,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) else direntry->d_op = &cifs_dentry_ops; - newinode = new_inode(inode->i_sb); + newinode = cifs_new_inode(inode->i_sb, + &pInfo->UniqueId); if (newinode == NULL) { kfree(pInfo); goto mkdir_get_info; } - /* Is an i_ino of zero legal? */ - /* Are there sanity checks we can use to ensure that - the server is really filling in that field? */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - newinode->i_ino = - (unsigned long)pInfo->UniqueId; - } /* note ino incremented to unique num in new_inode */ - if (inode->i_sb->s_flags & MS_NOATIME) - newinode->i_flags |= S_NOATIME | S_NOCMTIME; newinode->i_nlink = 2; - - insert_inode_hash(newinode); d_instantiate(direntry, newinode); /* we already checked in POSIXCreate whether @@ -1285,6 +1309,11 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifsInode = CIFS_I(direntry->d_inode); cifsInode->time = 0; /* force revalidate to go get info when needed */ + + cifsInode = CIFS_I(inode); + cifsInode->time = 0; /* force revalidate to get parent dir info + since cached search results now invalid */ + direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c index 462bbfefd4b..98b66a54c31 100644 --- a/fs/cifs/md5.c +++ b/fs/cifs/md5.c @@ -10,8 +10,8 @@ * with every copy. * * To compute the message digest of a chunk of bytes, declare an - * MD5Context structure, pass it to MD5Init, call MD5Update as - * needed on buffers full of bytes, and then call MD5Final, which + * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as + * needed on buffers full of bytes, and then call cifs_MD5_final, which * will fill a supplied 16-byte array with the digest. */ @@ -45,7 +45,7 @@ byteReverse(unsigned char *buf, unsigned longs) * initialization constants. */ void -MD5Init(struct MD5Context *ctx) +cifs_MD5_init(struct MD5Context *ctx) { ctx->buf[0] = 0x67452301; ctx->buf[1] = 0xefcdab89; @@ -61,7 +61,7 @@ MD5Init(struct MD5Context *ctx) * of bytes. */ void -MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len) +cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len) { register __u32 t; @@ -110,7 +110,7 @@ MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len) * 1 0* (64-bit count of bits processed, MSB-first) */ void -MD5Final(unsigned char digest[16], struct MD5Context *ctx) +cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx) { unsigned int count; unsigned char *p; @@ -165,7 +165,7 @@ MD5Final(unsigned char digest[16], struct MD5Context *ctx) /* * The core of the MD5 algorithm, this alters an existing MD5 hash to - * reflect the addition of 16 longwords of new data. MD5Update blocks + * reflect the addition of 16 longwords of new data. cifs_MD5_update blocks * the data and converts bytes into longwords for this routine. */ static void @@ -267,9 +267,9 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len, unsigned char tk[16]; struct MD5Context tctx; - MD5Init(&tctx); - MD5Update(&tctx, key, key_len); - MD5Final(tk, &tctx); + cifs_MD5_init(&tctx); + cifs_MD5_update(&tctx, key, key_len); + cifs_MD5_final(tk, &tctx); key = tk; key_len = 16; @@ -287,8 +287,8 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len, ctx->k_opad[i] ^= 0x5c; } - MD5Init(&ctx->ctx); - MD5Update(&ctx->ctx, ctx->k_ipad, 64); + cifs_MD5_init(&ctx->ctx); + cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64); } #endif @@ -317,8 +317,8 @@ hmac_md5_init_limK_to_64(const unsigned char *key, int key_len, ctx->k_opad[i] ^= 0x5c; } - MD5Init(&ctx->ctx); - MD5Update(&ctx->ctx, ctx->k_ipad, 64); + cifs_MD5_init(&ctx->ctx); + cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64); } /*********************************************************************** @@ -328,7 +328,7 @@ void hmac_md5_update(const unsigned char *text, int text_len, struct HMACMD5Context *ctx) { - MD5Update(&ctx->ctx, text, text_len); /* then text of datagram */ + cifs_MD5_update(&ctx->ctx, text, text_len); /* then text of datagram */ } /*********************************************************************** @@ -339,12 +339,12 @@ hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx) { struct MD5Context ctx_o; - MD5Final(digest, &ctx->ctx); + cifs_MD5_final(digest, &ctx->ctx); - MD5Init(&ctx_o); - MD5Update(&ctx_o, ctx->k_opad, 64); - MD5Update(&ctx_o, digest, 16); - MD5Final(digest, &ctx_o); + cifs_MD5_init(&ctx_o); + cifs_MD5_update(&ctx_o, ctx->k_opad, 64); + cifs_MD5_update(&ctx_o, digest, 16); + cifs_MD5_final(digest, &ctx_o); } /*********************************************************** diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h index f7d4f4197ba..6fba8cb402f 100644 --- a/fs/cifs/md5.h +++ b/fs/cifs/md5.h @@ -20,10 +20,10 @@ struct HMACMD5Context { }; #endif /* _HMAC_MD5_H */ -void MD5Init(struct MD5Context *context); -void MD5Update(struct MD5Context *context, unsigned char const *buf, +void cifs_MD5_init(struct MD5Context *context); +void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf, unsigned len); -void MD5Final(unsigned char digest[16], struct MD5Context *context); +void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context); /* The following definitions come from lib/hmacmd5.c */ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 9f51f9bf029..c2c01ff4c32 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -56,35 +56,34 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) } #endif /* DEBUG2 */ -/* Returns one if new inode created (which therefore needs to be hashed) */ +/* Returns 1 if new inode created, 2 if both dentry and inode were */ /* Might check in the future if inode number changed so we can rehash inode */ -static int construct_dentry(struct qstr *qstring, struct file *file, - struct inode **ptmp_inode, struct dentry **pnew_dentry) +static int +construct_dentry(struct qstr *qstring, struct file *file, + struct inode **ptmp_inode, struct dentry **pnew_dentry, + __u64 *inum) { - struct dentry *tmp_dentry; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; + struct dentry *tmp_dentry = NULL; + struct super_block *sb = file->f_path.dentry->d_sb; int rc = 0; cFYI(1, ("For %s", qstring->name)); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - pTcon = cifs_sb->tcon; qstring->hash = full_name_hash(qstring->name, qstring->len); tmp_dentry = d_lookup(file->f_path.dentry, qstring); if (tmp_dentry) { + /* BB: overwrite old name? i.e. tmp_dentry->d_name and + * tmp_dentry->d_name.len?? + */ cFYI(0, ("existing dentry with inode 0x%p", tmp_dentry->d_inode)); *ptmp_inode = tmp_dentry->d_inode; -/* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/ if (*ptmp_inode == NULL) { - *ptmp_inode = new_inode(file->f_path.dentry->d_sb); + *ptmp_inode = cifs_new_inode(sb, inum); if (*ptmp_inode == NULL) return rc; rc = 1; } - if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME) - (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME; } else { tmp_dentry = d_alloc(file->f_path.dentry, qstring); if (tmp_dentry == NULL) { @@ -93,15 +92,14 @@ static int construct_dentry(struct qstr *qstring, struct file *file, return rc; } - *ptmp_inode = new_inode(file->f_path.dentry->d_sb); - if (pTcon->nocase) + if (CIFS_SB(sb)->tcon->nocase) tmp_dentry->d_op = &cifs_ci_dentry_ops; else tmp_dentry->d_op = &cifs_dentry_ops; + + *ptmp_inode = cifs_new_inode(sb, inum); if (*ptmp_inode == NULL) return rc; - if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME) - (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME; rc = 2; } @@ -822,7 +820,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, /* inode num, inode type and filename returned */ static int cifs_get_name_from_search_buf(struct qstr *pqst, char *current_entry, __u16 level, unsigned int unicode, - struct cifs_sb_info *cifs_sb, int max_len, ino_t *pinum) + struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum) { int rc = 0; unsigned int len = 0; @@ -842,9 +840,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, len = strnlen(filename, PATH_MAX); } - /* BB fixme - hash low and high 32 bits if not 64 bit arch BB */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - *pinum = pFindData->UniqueId; + *pinum = pFindData->UniqueId; } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { FILE_DIRECTORY_INFO *pFindData = (FILE_DIRECTORY_INFO *)current_entry; @@ -907,7 +903,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, struct qstr qstring; struct cifsFileInfo *pCifsF; unsigned int obj_type; - ino_t inum; + __u64 inum; struct cifs_sb_info *cifs_sb; struct inode *tmp_inode; struct dentry *tmp_dentry; @@ -940,20 +936,18 @@ static int cifs_filldir(char *pfindEntry, struct file *file, if (rc) return rc; - rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry); + /* only these two infolevels return valid inode numbers */ + if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX || + pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO) + rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry, + &inum); + else + rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry, + NULL); + if ((tmp_inode == NULL) || (tmp_dentry == NULL)) return -ENOMEM; - if (rc) { - /* inode created, we need to hash it with right inode number */ - if (inum != 0) { - /* BB fixme - hash the 2 32 quantities bits together if - * necessary BB */ - tmp_inode->i_ino = inum; - } - insert_inode_hash(tmp_inode); - } - /* we pass in rc below, indicating whether it is a new inode, so we can figure out whether to invalidate the inode cached data if the file has changed */ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 5f22de7b79a..5c68b4282be 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -34,15 +34,99 @@ extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); +/* Checks if this is the first smb session to be reconnected after + the socket has been reestablished (so we know whether to use vc 0). + Called while holding the cifs_tcp_ses_lock, so do not block */ +static bool is_first_ses_reconnect(struct cifsSesInfo *ses) +{ + struct list_head *tmp; + struct cifsSesInfo *tmp_ses; + + list_for_each(tmp, &ses->server->smb_ses_list) { + tmp_ses = list_entry(tmp, struct cifsSesInfo, + smb_ses_list); + if (tmp_ses->need_reconnect == false) + return false; + } + /* could not find a session that was already connected, + this must be the first one we are reconnecting */ + return true; +} + +/* + * vc number 0 is treated specially by some servers, and should be the + * first one we request. After that we can use vcnumbers up to maxvcs, + * one for each smb session (some Windows versions set maxvcs incorrectly + * so maxvc=1 can be ignored). If we have too many vcs, we can reuse + * any vc but zero (some servers reset the connection on vcnum zero) + * + */ +static __le16 get_next_vcnum(struct cifsSesInfo *ses) +{ + __u16 vcnum = 0; + struct list_head *tmp; + struct cifsSesInfo *tmp_ses; + __u16 max_vcs = ses->server->max_vcs; + __u16 i; + int free_vc_found = 0; + + /* Quoting the MS-SMB specification: "Windows-based SMB servers set this + field to one but do not enforce this limit, which allows an SMB client + to establish more virtual circuits than allowed by this value ... but + other server implementations can enforce this limit." */ + if (max_vcs < 2) + max_vcs = 0xFFFF; + + write_lock(&cifs_tcp_ses_lock); + if ((ses->need_reconnect) && is_first_ses_reconnect(ses)) + goto get_vc_num_exit; /* vcnum will be zero */ + for (i = ses->server->srv_count - 1; i < max_vcs; i++) { + if (i == 0) /* this is the only connection, use vc 0 */ + break; + + free_vc_found = 1; + + list_for_each(tmp, &ses->server->smb_ses_list) { + tmp_ses = list_entry(tmp, struct cifsSesInfo, + smb_ses_list); + if (tmp_ses->vcnum == i) { + free_vc_found = 0; + break; /* found duplicate, try next vcnum */ + } + } + if (free_vc_found) + break; /* we found a vcnumber that will work - use it */ + } + + if (i == 0) + vcnum = 0; /* for most common case, ie if one smb session, use + vc zero. Also for case when no free vcnum, zero + is safest to send (some clients only send zero) */ + else if (free_vc_found == 0) + vcnum = 1; /* we can not reuse vc=0 safely, since some servers + reset all uids on that, but 1 is ok. */ + else + vcnum = i; + ses->vcnum = vcnum; +get_vc_num_exit: + write_unlock(&cifs_tcp_ses_lock); + + return le16_to_cpu(vcnum); +} + static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) { __u32 capabilities = 0; /* init fields common to all four types of SessSetup */ - /* note that header is initialized to zero in header_assemble */ + /* Note that offsets for first seven fields in req struct are same */ + /* in CIFS Specs so does not matter which of 3 forms of struct */ + /* that we use in next few lines */ + /* Note that header is initialized to zero in header_assemble */ pSMB->req.AndXCommand = 0xFF; pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); + pSMB->req.VcNumber = get_next_vcnum(ses); /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ @@ -71,7 +155,6 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) if (ses->capabilities & CAP_UNIX) capabilities |= CAP_UNIX; - /* BB check whether to init vcnum BB */ return capabilities; } @@ -228,7 +311,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft, kfree(ses->serverOS); /* UTF-8 string will not grow more than four times as big as UCS-16 */ - ses->serverOS = kzalloc(4 * len, GFP_KERNEL); + ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); if (ses->serverOS != NULL) cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp); data += 2 * (len + 1); @@ -241,7 +324,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft, return rc; kfree(ses->serverNOS); - ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */ + ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); if (ses->serverNOS != NULL) { cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len, nls_cp); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 7ebe6599ed3..0ad3e2d116a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -154,81 +154,8 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon) spin_unlock(&GlobalMid_Lock); } -int -smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer, - unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd) -{ - int rc = 0; - int i = 0; - struct msghdr smb_msg; - struct kvec iov; - unsigned len = smb_buf_length + 4; - - if (ssocket == NULL) - return -ENOTSOCK; /* BB eventually add reconnect code here */ - iov.iov_base = smb_buffer; - iov.iov_len = len; - - smb_msg.msg_name = sin; - smb_msg.msg_namelen = sizeof(struct sockaddr); - smb_msg.msg_control = NULL; - smb_msg.msg_controllen = 0; - if (noblocksnd) - smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; - else - smb_msg.msg_flags = MSG_NOSIGNAL; - - /* smb header is converted in header_assemble. bcc and rest of SMB word - area, and byte area if necessary, is converted to littleendian in - cifssmb.c and RFC1001 len is converted to bigendian in smb_send - Flags2 is converted in SendReceive */ - - smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length); - cFYI(1, ("Sending smb of length %d", smb_buf_length)); - dump_smb(smb_buffer, len); - - while (len > 0) { - rc = kernel_sendmsg(ssocket, &smb_msg, &iov, 1, len); - if ((rc == -ENOSPC) || (rc == -EAGAIN)) { - i++; - /* smaller timeout here than send2 since smaller size */ - /* Although it may not be required, this also is smaller - oplock break time */ - if (i > 12) { - cERROR(1, - ("sends on sock %p stuck for 7 seconds", - ssocket)); - rc = -EAGAIN; - break; - } - msleep(1 << i); - continue; - } - if (rc < 0) - break; - else - i = 0; /* reset i after each successful send */ - iov.iov_base += rc; - iov.iov_len -= rc; - len -= rc; - } - - if (rc < 0) { - cERROR(1, ("Error %d sending data on socket to server", rc)); - } else { - rc = 0; - } - - /* Don't want to modify the buffer as a - side effect of this call. */ - smb_buffer->smb_buf_length = smb_buf_length; - - return rc; -} - static int -smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec, - struct sockaddr *sin, bool noblocksnd) +smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) { int rc = 0; int i = 0; @@ -243,11 +170,11 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec, if (ssocket == NULL) return -ENOTSOCK; /* BB eventually add reconnect code here */ - smb_msg.msg_name = sin; + smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr; smb_msg.msg_namelen = sizeof(struct sockaddr); smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; - if (noblocksnd) + if (server->noblocksnd) smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; else smb_msg.msg_flags = MSG_NOSIGNAL; @@ -272,7 +199,25 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec, n_vec - first_vec, total_len); if ((rc == -ENOSPC) || (rc == -EAGAIN)) { i++; - if (i >= 14) { + /* if blocking send we try 3 times, since each can block + for 5 seconds. For nonblocking we have to try more + but wait increasing amounts of time allowing time for + socket to clear. The overall time we wait in either + case to send on the socket is about 15 seconds. + Similarly we wait for 15 seconds for + a response from the server in SendReceive[2] + for the server to send a response back for + most types of requests (except SMB Write + past end of file which can be slow, and + blocking lock operations). NFS waits slightly longer + than CIFS, but this can make it take longer for + nonresponsive servers to be detected and 15 seconds + is more than enough time for modern networks to + send a packet. In most cases if we fail to send + after the retries we will kill the socket and + reconnect which may clear the network problem. + */ + if ((i >= 14) || (!server->noblocksnd && (i > 2))) { cERROR(1, ("sends on sock %p stuck for 15 seconds", ssocket)); @@ -339,6 +284,18 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec, return rc; } +int +smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, + unsigned int smb_buf_length) +{ + struct kvec iov; + + iov.iov_base = smb_buffer; + iov.iov_len = smb_buf_length + 4; + + return smb_sendv(server, &iov, 1); +} + static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) { if (long_op == CIFS_ASYNC_OP) { @@ -540,9 +497,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, #ifdef CONFIG_CIFS_STATS2 atomic_inc(&ses->server->inSend); #endif - rc = smb_send2(ses->server, iov, n_vec, - (struct sockaddr *) &(ses->server->addr.sockAddr), - ses->server->noblocksnd); + rc = smb_sendv(ses->server, iov, n_vec); #ifdef CONFIG_CIFS_STATS2 atomic_dec(&ses->server->inSend); midQ->when_sent = jiffies; @@ -736,9 +691,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, #ifdef CONFIG_CIFS_STATS2 atomic_inc(&ses->server->inSend); #endif - rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, - (struct sockaddr *) &(ses->server->addr.sockAddr), - ses->server->noblocksnd); + rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length); #ifdef CONFIG_CIFS_STATS2 atomic_dec(&ses->server->inSend); midQ->when_sent = jiffies; @@ -879,9 +832,7 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf, mutex_unlock(&ses->server->srv_mutex); return rc; } - rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, - (struct sockaddr *) &(ses->server->addr.sockAddr), - ses->server->noblocksnd); + rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length); mutex_unlock(&ses->server->srv_mutex); return rc; } @@ -973,9 +924,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, #ifdef CONFIG_CIFS_STATS2 atomic_inc(&ses->server->inSend); #endif - rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, - (struct sockaddr *) &(ses->server->addr.sockAddr), - ses->server->noblocksnd); + rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length); #ifdef CONFIG_CIFS_STATS2 atomic_dec(&ses->server->inSend); midQ->when_sent = jiffies; diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig new file mode 100644 index 00000000000..c0e5a7fad06 --- /dev/null +++ b/fs/coda/Kconfig @@ -0,0 +1,21 @@ +config CODA_FS + tristate "Coda file system support (advanced network fs)" + depends on INET + help + Coda is an advanced network file system, similar to NFS in that it + enables you to mount file systems of a remote server and access them + with regular Unix commands as if they were sitting on your hard + disk. Coda has several advantages over NFS: support for + disconnected operation (e.g. for laptops), read/write server + replication, security model for authentication and encryption, + persistent client caches and write back caching. + + If you say Y here, your Linux box will be able to act as a Coda + *client*. You will need user level code as well, both for the + client and server. Servers are currently user level, i.e. they need + no kernel support. Please read + <file:Documentation/filesystems/coda.txt> and check out the Coda + home page <http://www.coda.cs.cmu.edu/>. + + To compile the coda client support as a module, choose M here: the + module will be called coda. diff --git a/fs/compat.c b/fs/compat.c index 65a070e705a..d0145ca2757 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1407,7 +1407,7 @@ int compat_do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + check_unsafe_exec(bprm, current->files); file = open_exec(filename); retval = PTR_ERR(file); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 5235c67e759..45e59d3c7f1 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -538,6 +538,7 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg) * cannot be fixed without breaking all existing apps. */ case TUNSETIFF: + case TUNGETIFF: case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: @@ -784,7 +785,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) if (copy_in_user(&sgio->status, &sgio32->status, (4 * sizeof(unsigned char)) + - (2 * sizeof(unsigned (short))) + + (2 * sizeof(unsigned short)) + (3 * sizeof(int)))) return -EFAULT; @@ -1912,6 +1913,9 @@ COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ /* 0x00 */ COMPATIBLE_IOCTL(FIBMAP) COMPATIBLE_IOCTL(FIGETBSZ) +/* 'X' - originally XFS but some now in the VFS */ +COMPATIBLE_IOCTL(FIFREEZE) +COMPATIBLE_IOCTL(FITHAW) /* RAID */ COMPATIBLE_IOCTL(RAID_VERSION) COMPATIBLE_IOCTL(GET_ARRAY_INFO) @@ -1937,6 +1941,8 @@ ULONG_IOCTL(SET_BITMAP_FILE) /* Big K */ COMPATIBLE_IOCTL(PIO_FONT) COMPATIBLE_IOCTL(GIO_FONT) +COMPATIBLE_IOCTL(PIO_CMAP) +COMPATIBLE_IOCTL(GIO_CMAP) ULONG_IOCTL(KDSIGACCEPT) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) @@ -1982,6 +1988,11 @@ COMPATIBLE_IOCTL(TUNSETNOCSUM) COMPATIBLE_IOCTL(TUNSETDEBUG) COMPATIBLE_IOCTL(TUNSETPERSIST) COMPATIBLE_IOCTL(TUNSETOWNER) +COMPATIBLE_IOCTL(TUNSETLINK) +COMPATIBLE_IOCTL(TUNSETGROUP) +COMPATIBLE_IOCTL(TUNGETFEATURES) +COMPATIBLE_IOCTL(TUNSETOFFLOAD) +COMPATIBLE_IOCTL(TUNSETTXFILTER) /* Big V */ COMPATIBLE_IOCTL(VT_SETMODE) COMPATIBLE_IOCTL(VT_GETMODE) @@ -2573,6 +2584,7 @@ HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc) HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc) HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc) HANDLE_IOCTL(TUNSETIFF, dev_ifsioc) +HANDLE_IOCTL(TUNGETIFF, dev_ifsioc) HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl) HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl) HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl) diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig new file mode 100644 index 00000000000..13587cc97a0 --- /dev/null +++ b/fs/configfs/Kconfig @@ -0,0 +1,11 @@ +config CONFIGFS_FS + tristate "Userspace-driven configuration filesystem" + depends on SYSFS + help + configfs is a ram-based filesystem that provides the converse + of sysfs's functionality. Where sysfs is a filesystem-based + view of kernel objects, configfs is a filesystem-based manager + of kernel objects, or config_items. + + Both sysfs and configfs can and should exist together on the + same system. One is not a replacement for the other. diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig new file mode 100644 index 00000000000..cd06466f365 --- /dev/null +++ b/fs/cramfs/Kconfig @@ -0,0 +1,19 @@ +config CRAMFS + tristate "Compressed ROM file system support (cramfs)" + depends on BLOCK + select ZLIB_INFLATE + help + Saying Y here includes support for CramFs (Compressed ROM File + System). CramFs is designed to be a simple, small, and compressed + file system for ROM based embedded systems. CramFs is read-only, + limited to 256MB file systems (with 16MB files), and doesn't support + 16/32 bits uid/gid, hard links and timestamps. + + See <file:Documentation/filesystems/cramfs.txt> and + <file:fs/cramfs/README> for further information. + + To compile this as a module, choose M here: the module will be called + cramfs. Note that the root file system (the one containing the + directory /) cannot be compiled as a module. + + If unsure, say N. diff --git a/fs/dcache.c b/fs/dcache.c index 937df0fb0da..07e2d4a44bd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1180,7 +1180,7 @@ struct dentry *d_obtain_alias(struct inode *inode) iput(inode); return res; } -EXPORT_SYMBOL_GPL(d_obtain_alias); +EXPORT_SYMBOL(d_obtain_alias); /** * d_splice_alias - splice a disconnected dentry into the tree if one exists diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index eba87ff3177..894a32d438d 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -168,7 +168,7 @@ static int dlm_plock_callback(struct plock_op *op) notify = xop->callback; if (op->info.rv) { - notify(flc, NULL, op->info.rv); + notify(fl, NULL, op->info.rv); goto out; } @@ -187,7 +187,7 @@ static int dlm_plock_callback(struct plock_op *op) (unsigned long long)op->info.number, file, fl); } - rv = notify(flc, NULL, 0); + rv = notify(fl, NULL, 0); if (rv) { /* XXX: We need to cancel the fs lock here: */ log_print("dlm_plock_callback: lock granted after lock request " @@ -304,7 +304,9 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (rv == -ENOENT) rv = 0; else if (rv > 0) { + locks_init_lock(fl); fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; + fl->fl_flags = FL_POSIX; fl->fl_pid = op->info.pid; fl->fl_start = op->info.start; fl->fl_end = op->info.end; diff --git a/fs/dquot.c b/fs/dquot.c index 48c0571f831..bca3cac4bee 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -87,14 +87,17 @@ #define __DQUOT_PARANOIA /* - * There are two quota SMP locks. dq_list_lock protects all lists with quotas - * and quota formats and also dqstats structure containing statistics about the - * lists. dq_data_lock protects data from dq_dqb and also mem_dqinfo structures - * and also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. + * There are three quota SMP locks. dq_list_lock protects all lists with quotas + * and quota formats, dqstats structure containing statistics about the lists + * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and + * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly - * in inode_add_bytes() and inode_sub_bytes(). + * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects + * modifications of quota state (on quotaon and quotaoff) and readers who care + * about latest values take it as well. * - * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock + * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock, + * dq_list_lock > dq_state_lock * * Note that some things (eg. sb pointer, type, id) doesn't change during * the life of the dquot structure and so needn't to be protected by a lock @@ -103,12 +106,7 @@ * operation is just reading pointers from inode (or not using them at all) the * read lock is enough. If pointers are altered function must hold write lock * (these locking rules also apply for S_NOQUOTA flag in the inode - note that - * for altering the flag i_mutex is also needed). If operation is holding - * reference to dquot in other way (e.g. quotactl ops) it must be guarded by - * dqonoff_mutex. - * This locking assures that: - * a) update/access to dquot pointers in inode is serialized - * b) everyone is guarded against invalidate_dquots() + * for altering the flag i_mutex is also needed). * * Each dquot has its dq_lock mutex. Locked dquots might not be referenced * from inodes (dquot_alloc_space() and such don't check the dq_lock). @@ -122,10 +120,17 @@ * Lock ordering (including related VFS locks) is the following: * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > * dqio_mutex + * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem > + * dqptr_sem. But filesystem has to count with the fact that functions such as + * dquot_alloc_space() acquire dqptr_sem and they usually have to be called + * from inside a transaction to keep filesystem consistency after a crash. Also + * filesystems usually want to do some IO on dquot from ->mark_dirty which is + * called with dqptr_sem held. * i_mutex on quota files is special (it's below dqio_mutex) */ static DEFINE_SPINLOCK(dq_list_lock); +static DEFINE_SPINLOCK(dq_state_lock); DEFINE_SPINLOCK(dq_data_lock); static char *quotatypes[] = INITQFNAMES; @@ -428,7 +433,7 @@ static inline void do_destroy_dquot(struct dquot *dquot) * quota is disabled and pointers from inodes removed so there cannot be new * quota users. There can still be some users of quotas due to inodes being * just deleted or pruned by prune_icache() (those are not attached to any - * list). We have to wait for such users. + * list) or parallel quotactl call. We have to wait for such users. */ static void invalidate_dquots(struct super_block *sb, int type) { @@ -600,7 +605,6 @@ static struct shrinker dqcache_shrinker = { /* * Put reference to dquot * NOTE: If you change this function please check whether dqput_blocks() works right... - * MUST be called with either dqptr_sem or dqonoff_mutex held */ void dqput(struct dquot *dquot) { @@ -697,36 +701,30 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) } /* - * Check whether dquot is in memory. - * MUST be called with either dqptr_sem or dqonoff_mutex held - */ -int dquot_is_cached(struct super_block *sb, unsigned int id, int type) -{ - unsigned int hashent = hashfn(sb, id, type); - int ret = 0; - - if (!sb_has_quota_active(sb, type)) - return 0; - spin_lock(&dq_list_lock); - if (find_dquot(hashent, sb, id, type) != NODQUOT) - ret = 1; - spin_unlock(&dq_list_lock); - return ret; -} - -/* * Get reference to dquot - * MUST be called with either dqptr_sem or dqonoff_mutex held + * + * Locking is slightly tricky here. We are guarded from parallel quotaoff() + * destroying our dquot by: + * a) checking for quota flags under dq_list_lock and + * b) getting a reference to dquot before we release dq_list_lock */ struct dquot *dqget(struct super_block *sb, unsigned int id, int type) { unsigned int hashent = hashfn(sb, id, type); - struct dquot *dquot, *empty = NODQUOT; + struct dquot *dquot = NODQUOT, *empty = NODQUOT; if (!sb_has_quota_active(sb, type)) return NODQUOT; we_slept: spin_lock(&dq_list_lock); + spin_lock(&dq_state_lock); + if (!sb_has_quota_active(sb, type)) { + spin_unlock(&dq_state_lock); + spin_unlock(&dq_list_lock); + goto out; + } + spin_unlock(&dq_state_lock); + if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) { if (empty == NODQUOT) { spin_unlock(&dq_list_lock); @@ -735,6 +733,7 @@ we_slept: goto we_slept; } dquot = empty; + empty = NODQUOT; dquot->dq_id = id; /* all dquots go on the inuse_list */ put_inuse(dquot); @@ -749,8 +748,6 @@ we_slept: dqstats.cache_hits++; dqstats.lookups++; spin_unlock(&dq_list_lock); - if (empty) - do_destroy_dquot(empty); } /* Wait for dq_lock - after this we know that either dquot_release() is already * finished or it will be canceled due to dq_count > 1 test */ @@ -758,11 +755,15 @@ we_slept: /* Read the dquot and instantiate it (everything done only if needed) */ if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) { dqput(dquot); - return NODQUOT; + dquot = NODQUOT; + goto out; } #ifdef __DQUOT_PARANOIA BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ #endif +out: + if (empty) + do_destroy_dquot(empty); return dquot; } @@ -1198,63 +1199,76 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) } /* * Initialize quota pointers in inode - * Transaction must be started at entry + * We do things in a bit complicated way but by that we avoid calling + * dqget() and thus filesystem callbacks under dqptr_sem. */ int dquot_initialize(struct inode *inode, int type) { unsigned int id = 0; int cnt, ret = 0; + struct dquot *got[MAXQUOTAS] = { NODQUOT, NODQUOT }; + struct super_block *sb = inode->i_sb; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return 0; - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + + /* First get references to structures we might need. */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + switch (cnt) { + case USRQUOTA: + id = inode->i_uid; + break; + case GRPQUOTA: + id = inode->i_gid; + break; + } + got[cnt] = dqget(sb, id, cnt); + } + + down_write(&sb_dqopt(sb)->dqptr_sem); /* Having dqptr_sem we know NOQUOTA flags can't be altered... */ if (IS_NOQUOTA(inode)) goto out_err; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; + /* Avoid races with quotaoff() */ + if (!sb_has_quota_active(sb, cnt)) + continue; if (inode->i_dquot[cnt] == NODQUOT) { - switch (cnt) { - case USRQUOTA: - id = inode->i_uid; - break; - case GRPQUOTA: - id = inode->i_gid; - break; - } - inode->i_dquot[cnt] = dqget(inode->i_sb, id, cnt); + inode->i_dquot[cnt] = got[cnt]; + got[cnt] = NODQUOT; } } out_err: - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + up_write(&sb_dqopt(sb)->dqptr_sem); + /* Drop unused references */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + dqput(got[cnt]); return ret; } /* * Release all quotas referenced by inode - * Transaction must be started at an entry */ -int dquot_drop_locked(struct inode *inode) +int dquot_drop(struct inode *inode) { int cnt; + struct dquot *put[MAXQUOTAS]; + down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt] != NODQUOT) { - dqput(inode->i_dquot[cnt]); - inode->i_dquot[cnt] = NODQUOT; - } + put[cnt] = inode->i_dquot[cnt]; + inode->i_dquot[cnt] = NODQUOT; } - return 0; -} - -int dquot_drop(struct inode *inode) -{ - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - dquot_drop_locked(inode); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + dqput(put[cnt]); return 0; } @@ -1470,8 +1484,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) qsize_t space; struct dquot *transfer_from[MAXQUOTAS]; struct dquot *transfer_to[MAXQUOTAS]; - int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid, - chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid; + int cnt, ret = QUOTA_OK; + int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid, + chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid; char warntype_to[MAXQUOTAS]; char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; @@ -1479,21 +1494,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return QUOTA_OK; - /* Clear the arrays */ + /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - transfer_to[cnt] = transfer_from[cnt] = NODQUOT; + transfer_from[cnt] = NODQUOT; + transfer_to[cnt] = NODQUOT; warntype_to[cnt] = QUOTA_NL_NOWARN; - } - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ - if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; - } - /* First build the transfer_to list - here we can block on - * reading/instantiating of dquots. We know that the transaction for - * us was already started so we don't violate lock ranking here */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { switch (cnt) { case USRQUOTA: if (!chuid) @@ -1507,6 +1512,13 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) break; } } + + down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Now recheck reliably when holding dqptr_sem */ + if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + goto put_all; + } spin_lock(&dq_data_lock); space = inode_get_bytes(inode); /* Build the transfer_from list and check the limits */ @@ -1517,7 +1529,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt) == NO_QUOTA) - goto warn_put_all; + goto over_quota; } /* @@ -1545,28 +1557,37 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) inode->i_dquot[cnt] = transfer_to[cnt]; } - ret = QUOTA_OK; -warn_put_all: spin_unlock(&dq_data_lock); + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Dirtify all the dquots - this can block when journalling */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (transfer_from[cnt]) mark_dquot_dirty(transfer_from[cnt]); - if (transfer_to[cnt]) + if (transfer_to[cnt]) { mark_dquot_dirty(transfer_to[cnt]); + /* The reference we got is transferred to the inode */ + transfer_to[cnt] = NODQUOT; + } } +warn_put_all: flush_warnings(transfer_to, warntype_to); flush_warnings(transfer_from, warntype_from_inodes); flush_warnings(transfer_from, warntype_from_space); - +put_all: for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT) - dqput(transfer_from[cnt]); - if (ret == NO_QUOTA && transfer_to[cnt] != NODQUOT) - dqput(transfer_to[cnt]); + dqput(transfer_from[cnt]); + dqput(transfer_to[cnt]); } - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); return ret; +over_quota: + spin_unlock(&dq_data_lock); + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Clear dquot pointers we don't want to dqput() */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + transfer_from[cnt] = NODQUOT; + ret = NO_QUOTA; + goto warn_put_all; } /* Wrapper for transferring ownership of an inode */ @@ -1651,19 +1672,24 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) continue; if (flags & DQUOT_SUSPENDED) { + spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_SUSPENDED, cnt); + spin_unlock(&dq_state_lock); } else { + spin_lock(&dq_state_lock); dqopt->flags &= ~dquot_state_flag(flags, cnt); /* Turning off suspended quotas? */ if (!sb_has_quota_loaded(sb, cnt) && sb_has_quota_suspended(sb, cnt)) { dqopt->flags &= ~dquot_state_flag( DQUOT_SUSPENDED, cnt); + spin_unlock(&dq_state_lock); iput(dqopt->files[cnt]); dqopt->files[cnt] = NULL; continue; } + spin_unlock(&dq_state_lock); } /* We still have to keep quota loaded? */ @@ -1830,7 +1856,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, } mutex_unlock(&dqopt->dqio_mutex); mutex_unlock(&inode->i_mutex); + spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); + spin_unlock(&dq_state_lock); add_dquot_ref(sb, type); mutex_unlock(&dqopt->dqonoff_mutex); @@ -1872,9 +1900,11 @@ static int vfs_quota_on_remount(struct super_block *sb, int type) } inode = dqopt->files[type]; dqopt->files[type] = NULL; + spin_lock(&dq_state_lock); flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, type); dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type); + spin_unlock(&dq_state_lock); mutex_unlock(&dqopt->dqonoff_mutex); flags = dquot_generic_flag(flags, type); @@ -1952,7 +1982,9 @@ int vfs_quota_enable(struct inode *inode, int type, int format_id, ret = -EBUSY; goto out_lock; } + spin_lock(&dq_state_lock); sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); + spin_unlock(&dq_state_lock); out_lock: mutex_unlock(&dqopt->dqonoff_mutex); return ret; @@ -2039,14 +2071,12 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d { struct dquot *dquot; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!(dquot = dqget(sb, id, type))) { - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + dquot = dqget(sb, id, type); + if (dquot == NODQUOT) return -ESRCH; - } do_get_dqblk(dquot, di); dqput(dquot); - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return 0; } @@ -2130,7 +2160,6 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d struct dquot *dquot; int rc; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); dquot = dqget(sb, id, type); if (!dquot) { rc = -ESRCH; @@ -2139,7 +2168,6 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d rc = do_set_dqblk(dquot, di); dqput(dquot); out: - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return rc; } @@ -2370,11 +2398,9 @@ EXPORT_SYMBOL(dquot_release); EXPORT_SYMBOL(dquot_mark_dquot_dirty); EXPORT_SYMBOL(dquot_initialize); EXPORT_SYMBOL(dquot_drop); -EXPORT_SYMBOL(dquot_drop_locked); EXPORT_SYMBOL(vfs_dq_drop); EXPORT_SYMBOL(dqget); EXPORT_SYMBOL(dqput); -EXPORT_SYMBOL(dquot_is_cached); EXPORT_SYMBOL(dquot_alloc_space); EXPORT_SYMBOL(dquot_alloc_inode); EXPORT_SYMBOL(dquot_free_space); diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig new file mode 100644 index 00000000000..0c754e64232 --- /dev/null +++ b/fs/ecryptfs/Kconfig @@ -0,0 +1,11 @@ +config ECRYPT_FS + tristate "eCrypt filesystem layer support (EXPERIMENTAL)" + depends on EXPERIMENTAL && KEYS && CRYPTO && NET + help + Encrypted filesystem that operates on the VFS layer. See + <file:Documentation/filesystems/ecryptfs.txt> to learn more about + eCryptfs. Userspace components are required and can be + obtained from <http://ecryptfs.sf.net>. + + To compile this file system support as a module, choose M here: the + module will be called ecryptfs. diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index c01e043670e..f6caeb1d110 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1716,7 +1716,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size, { int rc = 0; - (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL); + (*copied_name) = kmalloc((name_size + 1), GFP_KERNEL); if (!(*copied_name)) { rc = -ENOMEM; goto out; @@ -1726,7 +1726,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size, * in printing out the * string in debug * messages */ - (*copied_name_size) = (name_size + 1); + (*copied_name_size) = name_size; out: return rc; } diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig new file mode 100644 index 00000000000..6ebfc1c207a --- /dev/null +++ b/fs/efs/Kconfig @@ -0,0 +1,14 @@ +config EFS_FS + tristate "EFS file system support (read only) (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + help + EFS is an older file system used for non-ISO9660 CD-ROMs and hard + disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer + uses the XFS file system for hard disk partitions however). + + This implementation only offers read-only access. If you don't know + what all this is about, it's safe to say N. For more information + about EFS see its home page at <http://aeschi.ch.eu.org/efs/>. + + To compile the EFS file system support as a module, choose M here: the + module will be called efs. diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ba2f9ec7119..011b9b8c90c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -234,8 +234,6 @@ struct ep_pqueue { /* * Configuration options available inside /proc/sys/fs/epoll/ */ -/* Maximum number of epoll devices, per user */ -static int max_user_instances __read_mostly; /* Maximum number of epoll watched descriptors, per user */ static int max_user_watches __read_mostly; @@ -261,14 +259,6 @@ static int zero; ctl_table epoll_table[] = { { - .procname = "max_user_instances", - .data = &max_user_instances, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &zero, - }, - { .procname = "max_user_watches", .data = &max_user_watches, .maxlen = sizeof(int), @@ -491,7 +481,6 @@ static void ep_free(struct eventpoll *ep) mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); - atomic_dec(&ep->user->epoll_devs); free_uid(ep->user); kfree(ep); } @@ -581,10 +570,6 @@ static int ep_alloc(struct eventpoll **pep) struct eventpoll *ep; user = get_current_user(); - error = -EMFILE; - if (unlikely(atomic_read(&user->epoll_devs) >= - max_user_instances)) - goto free_uid; error = -ENOMEM; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) @@ -1141,7 +1126,6 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) flags & O_CLOEXEC); if (fd < 0) ep_free(ep); - atomic_inc(&ep->user->epoll_devs); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", @@ -1366,8 +1350,10 @@ static int __init eventpoll_init(void) struct sysinfo si; si_meminfo(&si); - max_user_instances = 128; - max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / + /* + * Allows top 4% of lomem to be allocated for epoll watches (per user). + */ + max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; /* Initialize the structure used to perform safe poll wait head wake ups */ diff --git a/fs/exec.c b/fs/exec.c index 0dd60a01f1b..929b58004b7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1049,16 +1049,32 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold current->cred_exec_mutex to protect against * PTRACE_ATTACH */ -void check_unsafe_exec(struct linux_binprm *bprm) +void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files) { - struct task_struct *p = current; + struct task_struct *p = current, *t; + unsigned long flags; + unsigned n_fs, n_files, n_sighand; bprm->unsafe = tracehook_unsafe_exec(p); - if (atomic_read(&p->fs->count) > 1 || - atomic_read(&p->files->count) > 1 || - atomic_read(&p->sighand->count) > 1) + n_fs = 1; + n_files = 1; + n_sighand = 1; + lock_task_sighand(p, &flags); + for (t = next_thread(p); t != p; t = next_thread(t)) { + if (t->fs == p->fs) + n_fs++; + if (t->files == files) + n_files++; + n_sighand++; + } + + if (atomic_read(&p->fs->count) > n_fs || + atomic_read(&p->files->count) > n_files || + atomic_read(&p->sighand->count) > n_sighand) bprm->unsafe |= LSM_UNSAFE_SHARE; + + unlock_task_sighand(p, &flags); } /* @@ -1273,7 +1289,7 @@ int do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + check_unsafe_exec(bprm, displaced); file = open_exec(filename); retval = PTR_ERR(file); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index da8bdeaa2e6..7c6e3606f0e 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1185,9 +1185,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) es = sbi->s_es; if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != (old_mount_opt & EXT2_MOUNT_XIP)) && - invalidate_inodes(sb)) - ext2_warning(sb, __func__, "busy inodes while remounting "\ - "xip remain in cache (no functional problem)"); + invalidate_inodes(sb)) { + ext2_warning(sb, __func__, "refusing change of xip flag " + "with busy inodes while remounting"); + sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; + sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; + } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; if (*flags & MS_RDONLY) { diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 69a3d19ca9f..4db4ffa1eda 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1358,7 +1358,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct fake_dirent *fde; blocksize = dir->i_sb->s_blocksize; - dxtrace(printk("Creating index\n")); + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); retval = ext3_journal_get_write_access(handle, bh); if (retval) { ext3_std_error(dir->i_sb, retval); @@ -1367,6 +1367,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, } root = (struct dx_root *) bh->b_data; + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)fde + + ext3_rec_len_from_disk(fde->rec_len)); + if ((char *) de >= (((char *) root) + blocksize)) { + ext3_error(dir->i_sb, __func__, + "invalid rec_len for '..' in inode %lu", + dir->i_ino); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + bh2 = ext3_append (handle, dir, &block, &retval); if (!(bh2)) { brelse(bh); @@ -1375,11 +1388,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; data1 = bh2->b_data; - /* The 0th block becomes the root, move the dirents out */ - fde = &root->dotdot; - de = (struct ext3_dir_entry_2 *)((char *)fde + - ext3_rec_len_from_disk(fde->rec_len)); - len = ((char *) root) + blocksize - (char *) de; memcpy (data1, de, len); de = (struct ext3_dir_entry_2 *) data1; top = data1 + len; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b70d90e08a3..4a970411a45 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2428,12 +2428,13 @@ static void ext3_write_super (struct super_block * sb) static int ext3_sync_fs(struct super_block *sb, int wait) { - sb->s_dirt = 0; - if (wait) - ext3_force_commit(sb); - else - journal_start_commit(EXT3_SB(sb)->s_journal, NULL); + tid_t target; + sb->s_dirt = 0; + if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { + if (wait) + log_wait_commit(EXT3_SB(sb)->s_journal, target); + } return 0; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6bba06b09dd..de9459b4cb9 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -609,7 +609,9 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi, */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3) + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || + (*retries)++ > 3 || + !EXT4_SB(sb)->s_journal) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -684,15 +686,15 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + desc_count += ext4_free_blks_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) continue; x = ext4_count_free(bitmap_bh, sb->s_blocksize); - printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n", - i, le16_to_cpu(gdp->bg_free_blocks_count), x); + printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", + i, ext4_free_blks_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c668e4377d7..b0c87dce66a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -868,7 +868,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen) { unsigned len = le16_to_cpu(dlen); - if (len == EXT4_MAX_REC_LEN) + if (len == EXT4_MAX_REC_LEN || len == 0) return 1 << 16; return len; } @@ -1206,8 +1206,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, static inline loff_t ext4_isize(struct ext4_inode *raw_inode) { - return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | - le32_to_cpu(raw_inode->i_size_lo); + if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + else + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 54bf0623a9a..e2eab196875 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3048,7 +3048,7 @@ retry: WARN_ON(ret <= 0); printk(KERN_ERR "%s: ext4_ext_get_blocks " "returned error inode#%lu, block=%u, " - "max_blocks=%lu", __func__, + "max_blocks=%u", __func__, inode->i_ino, block, max_blocks); #endif ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4fb86a0061d..f18a919be70 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -715,6 +715,13 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (sbi->s_log_groups_per_flex) { ret2 = find_group_flex(sb, dir, &group); + if (ret2 == -1) { + ret2 = find_group_other(sb, dir, &group); + if (ret2 == 0 && printk_ratelimit()) + printk(KERN_NOTICE "ext4: find_group_flex " + "failed, fallback succeeded dir %lu\n", + dir->i_ino); + } goto got_group; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a6444cee0c7..c7fed5b1874 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,8 +47,10 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); @@ -360,9 +362,9 @@ static int ext4_block_to_path(struct inode *inode, final = ptrs; } else { ext4_warning(inode->i_sb, "ext4_block_to_path", - "block %lu > max", + "block %lu > max in inode %lu", i_block + direct_blocks + - indirect_blocks + double_blocks); + indirect_blocks + double_blocks, inode->i_ino); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); @@ -1366,6 +1368,10 @@ retry: goto out; } + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { ext4_journal_stop(handle); @@ -1375,7 +1381,7 @@ retry: *pagep = page; ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_get_block); + ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = walk_page_buffers(handle, page_buffers(page), @@ -2437,6 +2443,7 @@ static int ext4_da_writepages(struct address_space *mapping, int no_nrwrite_index_update; int pages_written = 0; long pages_skipped; + int range_cyclic, cycled = 1, io_done = 0; int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); @@ -2488,9 +2495,15 @@ static int ext4_da_writepages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - if (wbc->range_cyclic) + range_cyclic = wbc->range_cyclic; + if (wbc->range_cyclic) { index = mapping->writeback_index; - else + if (index) + cycled = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = LLONG_MAX; + wbc->range_cyclic = 0; + } else index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; @@ -2504,6 +2517,7 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->no_nrwrite_index_update = 1; pages_skipped = wbc->pages_skipped; +retry: while (!ret && wbc->nr_to_write > 0) { /* @@ -2530,7 +2544,7 @@ static int ext4_da_writepages(struct address_space *mapping, ext4_journal_stop(handle); - if (mpd.retval == -ENOSPC) { + if ((mpd.retval == -ENOSPC) && sbi->s_journal) { /* commit the transaction which would * free blocks released in the transaction * and try again @@ -2546,6 +2560,7 @@ static int ext4_da_writepages(struct address_space *mapping, pages_written += mpd.pages_written; wbc->pages_skipped = pages_skipped; ret = 0; + io_done = 1; } else if (wbc->nr_to_write) /* * There is no more writeout needed @@ -2554,6 +2569,13 @@ static int ext4_da_writepages(struct address_space *mapping, */ break; } + if (!io_done && !cycled) { + cycled = 1; + index = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = mapping->writeback_index - 1; + goto retry; + } if (pages_skipped != wbc->pages_skipped) printk(KERN_EMERG "This should not happen leaving %s " "with nr_to_write = %ld ret = %d\n", @@ -2561,6 +2583,7 @@ static int ext4_da_writepages(struct address_space *mapping, /* Update index */ index += pages_written; + wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * set the writeback_index so that range_cyclic @@ -2648,6 +2671,9 @@ retry: ret = PTR_ERR(handle); goto out; } + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { @@ -2821,9 +2847,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); } - BUG_ON(!EXT4_JOURNAL(inode) && - EXT4_I(inode)->i_state & EXT4_STATE_JDATA); - if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* * This is a REALLY heavyweight approach, but the use of @@ -3622,7 +3645,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, * block pointed to itself, it would have been detached when * the block was cleared. Check for this instead of OOPSing. */ - if (bh2jh(this_bh)) + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else ext4_error(inode->i_sb, __func__, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 918aec0c8a1..4415beeb0b6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3025,7 +3025,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, goto out_err; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, - gdp->bg_free_blocks_count); + ext4_free_blks_count(sb, gdp)); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) @@ -3693,6 +3693,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_linear = 0; @@ -3755,6 +3757,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_linear = 1; @@ -4476,23 +4479,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); - /* - * We want to add the pa to the right bucket. - * Remove it from the list and while adding - * make sure the list to which we are adding - * doesn't grow big. - */ - if (likely(pa->pa_free)) { - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); - ext4_mb_add_n_trim(ac); - } } - ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->alloc_semp) up_read(ac->alloc_semp); + if (pa) { + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. We need to release + * alloc_semp before calling ext4_mb_add_n_trim() + */ + if (pa->pa_linear && likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } + ext4_mb_put_pa(ac, ac->ac_sb, pa); + } if (ac->ac_bitmap_page) page_cache_release(ac->ac_bitmap_page); if (ac->ac_buddy_page) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 734abca25e3..fe64d9f7985 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); - goto err_out; + return retval; } tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, @@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode) if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); - tmp_inode = NULL; - goto err_out; + return retval; } i_size_write(tmp_inode, i_size_read(inode)); /* @@ -618,8 +617,7 @@ err_out: ext4_journal_stop(handle); - if (tmp_inode) - iput(tmp_inode); + iput(tmp_inode); return retval; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index fec0b4c2f5f..ba702bd7910 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1368,7 +1368,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct fake_dirent *fde; blocksize = dir->i_sb->s_blocksize; - dxtrace(printk(KERN_DEBUG "Creating index\n")); + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); retval = ext4_journal_get_write_access(handle, bh); if (retval) { ext4_std_error(dir->i_sb, retval); @@ -1377,6 +1377,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, } root = (struct dx_root *) bh->b_data; + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext4_dir_entry_2 *)((char *)fde + + ext4_rec_len_from_disk(fde->rec_len)); + if ((char *) de >= (((char *) root) + blocksize)) { + ext4_error(dir->i_sb, __func__, + "invalid rec_len for '..' in inode %lu", + dir->i_ino); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + + /* Allocate new block for the 0th block's dirents */ bh2 = ext4_append(handle, dir, &block, &retval); if (!(bh2)) { brelse(bh); @@ -1385,11 +1399,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; data1 = bh2->b_data; - /* The 0th block becomes the root, move the dirents out */ - fde = &root->dotdot; - de = (struct ext4_dir_entry_2 *)((char *)fde + - ext4_rec_len_from_disk(fde->rec_len)); - len = ((char *) root) + blocksize - (char *) de; memcpy (data1, de, len); de = (struct ext4_dir_entry_2 *) data1; top = data1 + len; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c328be5d688..c06886abd65 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -861,12 +861,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) gdp = (struct ext4_group_desc *)((char *)primary->b_data + gdb_off * EXT4_DESC_SIZE(sb)); + memset(gdp, 0, EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ ext4_free_blks_set(sb, gdp, input->free_blocks_count); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); - gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5f06a5f045..39d1993cfa1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3046,14 +3046,17 @@ static void ext4_write_super(struct super_block *sb) static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; + tid_t target; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; if (EXT4_SB(sb)->s_journal) { - if (wait) - ret = ext4_force_commit(sb); - else - jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, + &target)) { + if (wait) + jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, + target); + } } else { ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); } @@ -3088,7 +3091,6 @@ static int ext4_freeze(struct super_block *sb) /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); if (error) goto out; diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig new file mode 100644 index 00000000000..d0a69ff2537 --- /dev/null +++ b/fs/fat/Kconfig @@ -0,0 +1,97 @@ +config FAT_FS + tristate + select NLS + help + If you want to use one of the FAT-based file systems (the MS-DOS and + VFAT (Windows 95) file systems), then you must say Y or M here + to include FAT support. You will then be able to mount partitions or + diskettes with FAT-based file systems and transparently access the + files on them, i.e. MSDOS files will look and behave just like all + other Unix files. + + This FAT support is not a file system in itself, it only provides + the foundation for the other file systems. You will have to say Y or + M to at least one of "MSDOS fs support" or "VFAT fs support" in + order to make use of it. + + Another way to read and write MSDOS floppies and hard drive + partitions from within Linux (but not transparently) is with the + mtools ("man mtools") program suite. You don't need to say Y here in + order to do that. + + If you need to move large files on floppies between a DOS and a + Linux box, say Y here, mount the floppy under Linux with an MSDOS + file system and use GNU tar's M option. GNU tar is a program + available for Unix and DOS ("man tar" or "info tar"). + + The FAT support will enlarge your kernel by about 37 KB. If unsure, + say Y. + + To compile this as a module, choose M here: the module will be called + fat. Note that if you compile the FAT support as a module, you + cannot compile any of the FAT-based file systems into the kernel + -- they will have to be modules as well. + +config MSDOS_FS + tristate "MSDOS fs support" + select FAT_FS + help + This allows you to mount MSDOS partitions of your hard drive (unless + they are compressed; to access compressed MSDOS partitions under + Linux, you can either use the DOS emulator DOSEMU, described in the + DOSEMU-HOWTO, available from + <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in + <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you + intend to use dosemu with a non-compressed MSDOS partition, say Y + here) and MSDOS floppies. This means that file access becomes + transparent, i.e. the MSDOS files look and behave just like all + other Unix files. + + If you have Windows 95 or Windows NT installed on your MSDOS + partitions, you should use the VFAT file system (say Y to "VFAT fs + support" below), or you will not be able to see the long filenames + generated by Windows 95 / Windows NT. + + This option will enlarge your kernel by about 7 KB. If unsure, + answer Y. This will only work if you said Y to "DOS FAT fs support" + as well. To compile this as a module, choose M here: the module will + be called msdos. + +config VFAT_FS + tristate "VFAT (Windows-95) fs support" + select FAT_FS + help + This option provides support for normal Windows file systems with + long filenames. That includes non-compressed FAT-based file systems + used by Windows 95, Windows 98, Windows NT 4.0, and the Unix + programs from the mtools package. + + The VFAT support enlarges your kernel by about 10 KB and it only + works if you said Y to the "DOS FAT fs support" above. Please read + the file <file:Documentation/filesystems/vfat.txt> for details. If + unsure, say Y. + + To compile this as a module, choose M here: the module will be called + vfat. + +config FAT_DEFAULT_CODEPAGE + int "Default codepage for FAT" + depends on MSDOS_FS || VFAT_FS + default 437 + help + This option should be set to the codepage of your FAT filesystems. + It can be overridden with the "codepage" mount option. + See <file:Documentation/filesystems/vfat.txt> for more information. + +config FAT_DEFAULT_IOCHARSET + string "Default iocharset for FAT" + depends on VFAT_FS + default "iso8859-1" + help + Set this to the default input/output character set you'd + like FAT to use. It should probably match the character set + that most of your FAT filesystems use, and can be overridden + with the "iocharset" mount option for FAT filesystems. + Note that "utf8" is not recommended for FAT filesystems. + If unsure, you shouldn't set "utf8" here. + See <file:Documentation/filesystems/vfat.txt> for more information. diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig new file mode 100644 index 00000000000..8dc1cd5c1ef --- /dev/null +++ b/fs/freevxfs/Kconfig @@ -0,0 +1,16 @@ +config VXFS_FS + tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" + depends on BLOCK + help + FreeVxFS is a file system driver that support the VERITAS VxFS(TM) + file system format. VERITAS VxFS(TM) is the standard file system + of SCO UnixWare (and possibly others) and optionally available + for Sunsoft Solaris, HP-UX and many other operating systems. + Currently only readonly access is supported. + + NOTE: the file system type as used by mount(1), mount(2) and + fstab(5) is 'vxfs' as it describes the file system format, not + the actual driver. + + To compile this as a module, choose M here: the module will be + called freevxfs. If unsure, say N. diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig new file mode 100644 index 00000000000..0cf160a94ed --- /dev/null +++ b/fs/fuse/Kconfig @@ -0,0 +1,15 @@ +config FUSE_FS + tristate "FUSE (Filesystem in Userspace) support" + help + With FUSE it is possible to implement a fully functional filesystem + in a userspace program. + + There's also companion library: libfuse. This library along with + utilities is available from the FUSE homepage: + <http://fuse.sourceforge.net/> + + See <file:Documentation/filesystems/fuse.txt> for more information. + See <file:Documentation/Changes> for needed library/utility version. + + If you want to develop a userspace FS, or if you want to use + a filesystem based on FUSE, answer Y or M. diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e0c7ada08a1..ba76b68c52f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -281,7 +281,8 @@ __releases(&fc->lock) fc->blocked = 0; wake_up_all(&fc->blocked_waitq); } - if (fc->num_background == FUSE_CONGESTION_THRESHOLD) { + if (fc->num_background == FUSE_CONGESTION_THRESHOLD && + fc->connected) { clear_bdi_congested(&fc->bdi, READ); clear_bdi_congested(&fc->bdi, WRITE); } @@ -825,16 +826,21 @@ static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_poll_wakeup_out outarg; - int err; + int err = -EINVAL; if (size != sizeof(outarg)) - return -EINVAL; + goto err; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - return err; + goto err; + fuse_copy_finish(cs); return fuse_notify_poll_wakeup(fc, &outarg); + +err: + fuse_copy_finish(cs); + return err; } static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, @@ -845,6 +851,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, return fuse_notify_poll(fc, size, cs); default: + fuse_copy_finish(cs); return -EINVAL; } } @@ -923,7 +930,6 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, */ if (!oh.unique) { err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); - fuse_copy_finish(&cs); return err ? err : nbytes; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e8162646a9b..d9fdb7cec53 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -54,7 +54,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) ff->reserved_req = fuse_request_alloc(); if (!ff->reserved_req) { kfree(ff); - ff = NULL; + return NULL; } else { INIT_LIST_HEAD(&ff->write_entry); atomic_set(&ff->count, 0); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 47c96fdca1a..459b73dd45e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -292,6 +292,7 @@ static void fuse_put_super(struct super_block *sb) list_del(&fc->entry); fuse_ctl_remove_conn(fc); mutex_unlock(&fuse_mutex); + bdi_destroy(&fc->bdi); fuse_conn_put(fc); } @@ -532,7 +533,6 @@ void fuse_conn_put(struct fuse_conn *fc) if (fc->destroy_req) fuse_request_free(fc->destroy_req); mutex_destroy(&fc->inst_mutex); - bdi_destroy(&fc->bdi); fc->release(fc); } } @@ -805,16 +805,18 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) int err; int is_bdev = sb->s_bdev != NULL; + err = -EINVAL; if (sb->s_flags & MS_MANDLOCK) - return -EINVAL; + goto err; if (!parse_fuse_opt((char *) data, &d, is_bdev)) - return -EINVAL; + goto err; if (is_bdev) { #ifdef CONFIG_BLOCK + err = -EINVAL; if (!sb_set_blocksize(sb, d.blksize)) - return -EINVAL; + goto err; #endif } else { sb->s_blocksize = PAGE_CACHE_SIZE; @@ -826,20 +828,22 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_export_op = &fuse_export_operations; file = fget(d.fd); + err = -EINVAL; if (!file) - return -EINVAL; + goto err; if (file->f_op != &fuse_dev_operations) - return -EINVAL; + goto err_fput; fc = kmalloc(sizeof(*fc), GFP_KERNEL); + err = -ENOMEM; if (!fc) - return -ENOMEM; + goto err_fput; err = fuse_conn_init(fc, sb); if (err) { kfree(fc); - return err; + goto err_fput; } fc->release = fuse_free_conn; @@ -854,12 +858,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err = -ENOMEM; root = fuse_get_root_inode(sb, d.rootmode); if (!root) - goto err; + goto err_put_conn; root_dentry = d_alloc_root(root); if (!root_dentry) { iput(root); - goto err; + goto err_put_conn; } init_req = fuse_request_alloc(); @@ -903,9 +907,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fuse_request_free(init_req); err_put_root: dput(root_dentry); - err: - fput(file); + err_put_conn: fuse_conn_put(fc); + err_fput: + fput(file); + err: return err; } diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig new file mode 100644 index 00000000000..b77c5bc20f8 --- /dev/null +++ b/fs/hfs/Kconfig @@ -0,0 +1,12 @@ +config HFS_FS + tristate "Apple Macintosh file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select NLS + help + If you say Y here, you will be able to mount Macintosh-formatted + floppy disks and hard drive partitions with full read-write access. + Please read <file:Documentation/filesystems/hfs.txt> to learn about + the available mount options. + + To compile this file system support as a module, choose M here: the + module will be called hfs. diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig new file mode 100644 index 00000000000..a63371815aa --- /dev/null +++ b/fs/hfsplus/Kconfig @@ -0,0 +1,13 @@ +config HFSPLUS_FS + tristate "Apple Extended HFS file system support" + depends on BLOCK + select NLS + select NLS_UTF8 + help + If you say Y here, you will be able to mount extended format + Macintosh-formatted hard drive partitions with full read-write access. + + This file system is often called HFS+ and was introduced with + MacOS 8. It includes all Mac specific filesystem data such as + data forks and creator codes, but it also has several UNIX + style features such as file ownership and permissions. diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig new file mode 100644 index 00000000000..56bd15c5bf6 --- /dev/null +++ b/fs/hpfs/Kconfig @@ -0,0 +1,14 @@ +config HPFS_FS + tristate "OS/2 HPFS file system support" + depends on BLOCK + help + OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS + is the file system used for organizing files on OS/2 hard disk + partitions. Say Y if you want to be able to read files from and + write files to an OS/2 HPFS partition on your hard drive. OS/2 + floppies however are in regular MSDOS format, so you don't need this + option in order to be able to read them. Read + <file:Documentation/filesystems/hpfs.txt>. + + To compile this file system support as a module, choose M here: the + module will be called hpfs. If unsure, say N. diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6903d37af03..9b800d97a68 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), - len >> huge_page_shift(h), vma)) + len >> huge_page_shift(h), vma, + vma->vm_flags)) goto out; ret = 0; @@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void) can_do_mlock()); } -struct file *hugetlb_file_setup(const char *name, size_t size) +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) { int error = -ENOMEM; struct file *file; @@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size) error = -ENOMEM; if (hugetlb_reserve_pages(inode, 0, - size >> huge_page_shift(hstate_inode(inode)), NULL)) + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) goto out_inode; d_instantiate(dentry, inode); diff --git a/fs/internal.h b/fs/internal.h index 53af885f173..0d8ac497b3d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -43,7 +43,7 @@ extern void __init chrdev_init(void); /* * exec.c */ -extern void check_unsafe_exec(struct linux_binprm *); +extern void check_unsafe_exec(struct linux_binprm *, struct files_struct *); /* * namespace.c diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig new file mode 100644 index 00000000000..8ab9878e367 --- /dev/null +++ b/fs/isofs/Kconfig @@ -0,0 +1,39 @@ +config ISO9660_FS + tristate "ISO 9660 CDROM file system support" + help + This is the standard file system used on CD-ROMs. It was previously + known as "High Sierra File System" and is called "hsfs" on other + Unix systems. The so-called Rock-Ridge extensions which allow for + long Unix filenames and symbolic links are also supported by this + driver. If you have a CD-ROM drive and want to do more with it than + just listen to audio CDs and watch its LEDs, say Y (and read + <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO, + available from <http://www.tldp.org/docs.html#howto>), thereby + enlarging your kernel by about 27 KB; otherwise say N. + + To compile this file system support as a module, choose M here: the + module will be called isofs. + +config JOLIET + bool "Microsoft Joliet CDROM extensions" + depends on ISO9660_FS + select NLS + help + Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system + which allows for long filenames in unicode format (unicode is the + new 16 bit character code, successor to ASCII, which encodes the + characters of almost all languages of the world; see + <http://www.unicode.org/> for more information). Say Y here if you + want to be able to read Joliet CD-ROMs under Linux. + +config ZISOFS + bool "Transparent decompression extension" + depends on ISO9660_FS + select ZLIB_INFLATE + help + This is a Linux-specific extension to RockRidge which lets you store + data in compressed form on a CD-ROM and have it transparently + decompressed when the CD-ROM is accessed. See + <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools + necessary to create such a filesystem. Say Y here if you want to be + able to read such compressed CD-ROMs. diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 9e4fa52d7dc..e79c07812af 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -427,7 +427,7 @@ int __log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction was started. + * Called under j_state_lock. Returns true if a transaction commit was started. */ int __log_start_commit(journal_t *journal, tid_t target) { @@ -495,7 +495,8 @@ int journal_force_commit_nested(journal_t *journal) /* * Start a commit of the current running transaction (if any). Returns true - * if a transaction was started, and fills its tid in at *ptid + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid */ int journal_start_commit(journal_t *journal, tid_t *ptid) { @@ -505,15 +506,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid) if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; - ret = __log_start_commit(journal, tid); - if (ret && ptid) + __log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) *ptid = tid; - } else if (journal->j_committing_transaction && ptid) { + ret = 1; + } else if (journal->j_committing_transaction) { /* * If ext3_write_super() recently started a commit, then we * have to wait for completion of that transaction */ - *ptid = journal->j_committing_transaction->t_tid; + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; ret = 1; } spin_unlock(&journal->j_state_lock); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 56675306ed8..58144102bf2 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -37,10 +37,10 @@ #include <linux/proc_fs.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/math64.h> #include <asm/uaccess.h> #include <asm/page.h> -#include <asm/div64.h> EXPORT_SYMBOL(jbd2_journal_start); EXPORT_SYMBOL(jbd2_journal_restart); @@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction was started. + * Called under j_state_lock. Returns true if a transaction commit was started. */ int __jbd2_log_start_commit(journal_t *journal, tid_t target) { @@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal) /* * Start a commit of the current running transaction (if any). Returns true - * if a transaction was started, and fills its tid in at *ptid + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid */ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) { @@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; - ret = __jbd2_log_start_commit(journal, tid); - if (ret && ptid) + __jbd2_log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) *ptid = tid; - } else if (journal->j_committing_transaction && ptid) { + ret = 1; + } else if (journal->j_committing_transaction) { /* * If ext3_write_super() recently started a commit, then we * have to wait for completion of that transaction */ - *ptid = journal->j_committing_transaction->t_tid; + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; ret = 1; } spin_unlock(&journal->j_state_lock); @@ -846,8 +851,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); seq_printf(seq, " %ums logging transaction\n", jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); - seq_printf(seq, " %luus average transaction commit time\n", - do_div(s->journal->j_average_commit_time, 1000)); + seq_printf(seq, " %lluus average transaction commit time\n", + div_u64(s->journal->j_average_commit_time, 1000)); seq_printf(seq, " %lu handles per transaction\n", s->stats->u.run.rs_handle_count / s->stats->ts_tid); seq_printf(seq, " %lu blocks per transaction\n", diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 46b4e347ed7..28ce21d8598 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2129,26 +2129,46 @@ done: } /* - * This function must be called when inode is journaled in ordered mode - * before truncation happens. It starts writeout of truncated part in - * case it is in the committing transaction so that we stand to ordered - * mode consistency guarantees. + * File truncate and transaction commit interact with each other in a + * non-trivial way. If a transaction writing data block A is + * committing, we cannot discard the data by truncate until we have + * written them. Otherwise if we crashed after the transaction with + * write has committed but before the transaction with truncate has + * committed, we could see stale data in block A. This function is a + * helper to solve this problem. It starts writeout of the truncated + * part in case it is in the committing transaction. + * + * Filesystem code must call this function when inode is journaled in + * ordered mode before truncation happens and after the inode has been + * placed on orphan list with the new inode size. The second condition + * avoids the race that someone writes new data and we start + * committing the transaction after this function has been called but + * before a transaction for truncate is started (and furthermore it + * allows us to optimize the case where the addition to orphan list + * happens in the same transaction as write --- we don't have to write + * any data in such case). */ -int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, +int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *jinode, loff_t new_size) { - journal_t *journal; - transaction_t *commit_trans; + transaction_t *inode_trans, *commit_trans; int ret = 0; - if (!inode->i_transaction && !inode->i_next_transaction) + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) goto out; - journal = inode->i_transaction->t_journal; + /* Locks are here just to force reading of recent values, it is + * enough that the transaction was not committing before we started + * a transaction adding the inode to orphan list */ spin_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; spin_unlock(&journal->j_state_lock); - if (inode->i_transaction == commit_trans) { - ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + spin_lock(&journal->j_list_lock); + inode_trans = jinode->i_transaction; + spin_unlock(&journal->j_list_lock); + if (inode_trans == commit_trans) { + ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, new_size, LLONG_MAX); if (ret) jbd2_journal_abort(journal, ret); diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 3cceef4ad2b..e9580104b6b 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -95,13 +95,17 @@ static int jffs2_garbage_collect_thread(void *_c) spin_unlock(&c->erase_completion_lock); - /* This thread is purely an optimisation. But if it runs when - other things could be running, it actually makes things a - lot worse. Use yield() and put it at the back of the runqueue - every time. Especially during boot, pulling an inode in - with read_inode() is much preferable to having the GC thread - get there first. */ - yield(); + /* Problem - immediately after bootup, the GCD spends a lot + * of time in places like jffs2_kill_fragtree(); so much so + * that userspace processes (like gdm and X) are starved + * despite plenty of cond_resched()s and renicing. Yield() + * doesn't help, either (presumably because userspace and GCD + * are generally competing for a higher latency resource - + * disk). + * This forces the GCD to slow the hell down. Pulling an + * inode in with read_inode() is much preferable to having + * the GC thread get there first. */ + schedule_timeout_interruptible(msecs_to_jiffies(50)); /* Put_super will send a SIGKILL and then wait on the sem. */ diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 6ca08ad887c..1fc1e92356e 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -220,7 +220,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) { uint32_t fn_end = tn->fn->ofs + tn->fn->size; - struct jffs2_tmp_dnode_info *this; + struct jffs2_tmp_dnode_info *this, *ptn; dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw)); @@ -251,11 +251,18 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, if (this) { /* If the node is coincident with another at a lower address, back up until the other node is found. It may be relevant */ - while (this->overlapped) - this = tn_prev(this); - - /* First node should never be marked overlapped */ - BUG_ON(!this); + while (this->overlapped) { + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + this->overlapped = 0; + break; + } + this = ptn; + } dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole"); } @@ -360,7 +367,17 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, } if (!this->overlapped) break; - this = tn_prev(this); + + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + this->overlapped = 0; + break; + } + this = ptn; } } @@ -456,8 +473,15 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c, eat_last(&rii->tn_root, &last->rb); ver_insert(&ver_root, last); - if (unlikely(last->overlapped)) - continue; + if (unlikely(last->overlapped)) { + if (pen) + continue; + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + last->overlapped = 0; + } /* Now we have a bunch of nodes in reverse version order, in the tree at ver_root. Most of the time, diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig new file mode 100644 index 00000000000..9ff619a6f9c --- /dev/null +++ b/fs/jfs/Kconfig @@ -0,0 +1,49 @@ +config JFS_FS + tristate "JFS filesystem support" + select NLS + help + This is a port of IBM's Journaled Filesystem . More information is + available in the file <file:Documentation/filesystems/jfs.txt>. + + If you do not intend to use the JFS filesystem, say N. + +config JFS_POSIX_ACL + bool "JFS POSIX Access Control Lists" + depends on JFS_FS + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config JFS_SECURITY + bool "JFS Security Labels" + depends on JFS_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jfs filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFS_DEBUG + bool "JFS debugging" + depends on JFS_FS + help + If you are experiencing any problems with the JFS filesystem, say + Y here. This will result in additional debugging messages to be + written to the system log. Under normal circumstances, this + results in very little overhead. + +config JFS_STATISTICS + bool "JFS statistics" + depends on JFS_FS + help + Enabling this option will cause statistics from the JFS file system + to be made available to the user in the /proc/fs/jfs/ directory. diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 6063a8e4b9f..763b78a6e9d 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -427,7 +427,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; case -EAGAIN: ret = nlm_lck_denied; - goto out; + break; case FILE_LOCK_DEFERRED: if (wait) break; @@ -443,6 +443,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + ret = nlm_lck_denied; + if (!wait) + goto out; + ret = nlm_lck_blocked; /* Append to list of blocked */ diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig new file mode 100644 index 00000000000..0fd7ca99426 --- /dev/null +++ b/fs/minix/Kconfig @@ -0,0 +1,17 @@ +config MINIX_FS + tristate "Minix file system support" + depends on BLOCK + help + Minix is a simple operating system used in many classes about OS's. + The minix file system (method to organize files on a hard disk + partition or a floppy disk) was the original file system for Linux, + but has been superseded by the second extended file system ext2fs. + You don't want to use the minix file system on your hard disk + because of certain built-in restrictions, but it is sometimes found + on older Linux floppy disks. This option will enlarge your kernel + by about 28 KB. If unsure, say N. + + To compile this file system support as a module, choose M here: the + module will be called minix. Note that the file system of your root + partition (the one containing the directory /) cannot be compiled as + a module. diff --git a/fs/namespace.c b/fs/namespace.c index 228d8c4bfd1..06f8e63f6cb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt) */ for_each_possible_cpu(cpu) { struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); - if (cpu_writer->mnt != mnt) - continue; spin_lock(&cpu_writer->lock); + if (cpu_writer->mnt != mnt) { + spin_unlock(&cpu_writer->lock); + continue; + } atomic_add(cpu_writer->count, &mnt->__mnt_writers); cpu_writer->count = 0; /* diff --git a/fs/ncpfs/Kconfig b/fs/ncpfs/Kconfig index 142808427b2..c931cf22a1f 100644 --- a/fs/ncpfs/Kconfig +++ b/fs/ncpfs/Kconfig @@ -1,6 +1,27 @@ # # NCP Filesystem configuration # +config NCP_FS + tristate "NCP file system support (to mount NetWare volumes)" + depends on IPX!=n || INET + help + NCP (NetWare Core Protocol) is a protocol that runs over IPX and is + used by Novell NetWare clients to talk to file servers. It is to + IPX what NFS is to TCP/IP, if that helps. Saying Y here allows you + to mount NetWare file server volumes and to access them just like + any other Unix directory. For details, please read the file + <file:Documentation/filesystems/ncpfs.txt> in the kernel source and + the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>. + + You do not have to say Y here if you want your Linux box to act as a + file *server* for Novell NetWare clients. + + General information about how to connect Linux, Windows machines and + Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. + + To compile this as a module, choose M here: the module will be called + ncpfs. Say N unless you are connected to a Novell network. + config NCPFS_PACKET_SIGNING bool "Packet signatures" depends on NCP_FS diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig new file mode 100644 index 00000000000..36fe20d6eba --- /dev/null +++ b/fs/nfs/Kconfig @@ -0,0 +1,86 @@ +config NFS_FS + tristate "NFS client support" + depends on INET + select LOCKD + select SUNRPC + select NFS_ACL_SUPPORT if NFS_V3_ACL + help + Choose Y here if you want to access files residing on other + computers using Sun's Network File System protocol. To compile + this file system support as a module, choose M here: the module + will be called nfs. + + To mount file systems exported by NFS servers, you also need to + install the user space mount.nfs command which can be found in + the Linux nfs-utils package, available from http://linux-nfs.org/. + Information about using the mount command is available in the + mount(8) man page. More detail about the Linux NFS client + implementation is available via the nfs(5) man page. + + Below you can choose which versions of the NFS protocol are + available in the kernel to mount NFS servers. Support for NFS + version 2 (RFC 1094) is always available when NFS_FS is selected. + + To configure a system which mounts its root file system via NFS + at boot time, say Y here, select "Kernel level IP + autoconfiguration" in the NETWORK menu, and select "Root file + system on NFS" below. You cannot compile this file system as a + module in this case. + + If unsure, say N. + +config NFS_V3 + bool "NFS client support for NFS version 3" + depends on NFS_FS + help + This option enables support for version 3 of the NFS protocol + (RFC 1813) in the kernel's NFS client. + + If unsure, say Y. + +config NFS_V3_ACL + bool "NFS client support for the NFSv3 ACL protocol extension" + depends on NFS_V3 + help + Some NFS servers support an auxiliary NFSv3 ACL protocol that + Sun added to Solaris but never became an official part of the + NFS version 3 protocol. This protocol extension allows + applications on NFS clients to manipulate POSIX Access Control + Lists on files residing on NFS servers. NFS servers enforce + ACLs on local files whether this protocol is available or not. + + Choose Y here if your NFS server supports the Solaris NFSv3 ACL + protocol extension and you want your NFS client to allow + applications to access and modify ACLs on files on the server. + + Most NFS servers don't support the Solaris NFSv3 ACL protocol + extension. You can choose N here or specify the "noacl" mount + option to prevent your NFS client from trying to use the NFSv3 + ACL protocol. + + If unsure, say N. + +config NFS_V4 + bool "NFS client support for NFS version 4 (EXPERIMENTAL)" + depends on NFS_FS && EXPERIMENTAL + select RPCSEC_GSS_KRB5 + help + This option enables support for version 4 of the NFS protocol + (RFC 3530) in the kernel's NFS client. + + To mount NFS servers using NFSv4, you also need to install user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say N. + +config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP + help + If you want your system to mount its root file system via NFS, + choose Y here. This is common practice for managing systems + without local permanent storage. For details, read + <file:Documentation/filesystems/nfsroot.txt>. + + Most people say N here. diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig new file mode 100644 index 00000000000..44d7d04dab9 --- /dev/null +++ b/fs/nfsd/Kconfig @@ -0,0 +1,80 @@ +config NFSD + tristate "NFS server support" + depends on INET + select LOCKD + select SUNRPC + select EXPORTFS + select NFS_ACL_SUPPORT if NFSD_V2_ACL + help + Choose Y here if you want to allow other computers to access + files residing on this system using Sun's Network File System + protocol. To compile the NFS server support as a module, + choose M here: the module will be called nfsd. + + You may choose to use a user-space NFS server instead, in which + case you can choose N here. + + To export local file systems using NFS, you also need to install + user space programs which can be found in the Linux nfs-utils + package, available from http://linux-nfs.org/. More detail about + the Linux NFS server implementation is available via the + exports(5) man page. + + Below you can choose which versions of the NFS protocol are + available to clients mounting the NFS server on this system. + Support for NFS version 2 (RFC 1094) is always available when + CONFIG_NFSD is selected. + + If unsure, say N. + +config NFSD_V2_ACL + bool + depends on NFSD + +config NFSD_V3 + bool "NFS server support for NFS version 3" + depends on NFSD + help + This option enables support in your system's NFS server for + version 3 of the NFS protocol (RFC 1813). + + If unsure, say Y. + +config NFSD_V3_ACL + bool "NFS server support for the NFSv3 ACL protocol extension" + depends on NFSD_V3 + select NFSD_V2_ACL + help + Solaris NFS servers support an auxiliary NFSv3 ACL protocol that + never became an official part of the NFS version 3 protocol. + This protocol extension allows applications on NFS clients to + manipulate POSIX Access Control Lists on files residing on NFS + servers. NFS servers enforce POSIX ACLs on local files whether + this protocol is available or not. + + This option enables support in your system's NFS server for the + NFSv3 ACL protocol extension allowing NFS clients to manipulate + POSIX ACLs on files exported by your system's NFS server. NFS + clients which support the Solaris NFSv3 ACL protocol can then + access and modify ACLs on your NFS server. + + To store ACLs on your NFS server, you also need to enable ACL- + related CONFIG options for your local file systems of choice. + + If unsure, say N. + +config NFSD_V4 + bool "NFS server support for NFS version 4 (EXPERIMENTAL)" + depends on NFSD && PROC_FS && EXPERIMENTAL + select NFSD_V3 + select FS_POSIX_ACL + select RPCSEC_GSS_KRB5 + help + This option enables support in your system's NFS server for + version 4 of the NFS protocol (RFC 3530). + + To export files using NFSv4, you need to install additional user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say N. diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index c903e04aa21..5573508f707 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -49,6 +49,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) new->fsuid = exp->ex_anon_uid; new->fsgid = exp->ex_anon_gid; gi = groups_alloc(0); + if (!gi) + goto oom; } else if (flags & NFSEXP_ROOTSQUASH) { if (!new->fsuid) new->fsuid = exp->ex_anon_uid; @@ -85,6 +87,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); put_cred(override_creds(new)); + put_cred(new); return 0; oom: diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 88db7d3ec12..b6f60f48e94 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2871,7 +2871,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; - file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lockt->lt_offset; file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index dae3f28f30d..331f2e88e28 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c @@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih, int ret; do { - if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL))) + if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS))) return -ENOSPC; ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd); } while (ret == -EAGAIN); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index d53a1838d6e..bed766e435b 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -427,10 +427,61 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) return ret; } +/* + * Get an inotify_kernel_event if one exists and is small + * enough to fit in "count". Return an error pointer if + * not large enough. + * + * Called with the device ev_mutex held. + */ +static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, + size_t count) +{ + size_t event_size = sizeof(struct inotify_event); + struct inotify_kernel_event *kevent; + + if (list_empty(&dev->events)) + return NULL; + + kevent = inotify_dev_get_event(dev); + if (kevent->name) + event_size += kevent->event.len; + + if (event_size > count) + return ERR_PTR(-EINVAL); + + remove_kevent(dev, kevent); + return kevent; +} + +/* + * Copy an event to user space, returning how much we copied. + * + * We already checked that the event size is smaller than the + * buffer we had in "get_one_event()" above. + */ +static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent, + char __user *buf) +{ + size_t event_size = sizeof(struct inotify_event); + + if (copy_to_user(buf, &kevent->event, event_size)) + return -EFAULT; + + if (kevent->name) { + buf += event_size; + + if (copy_to_user(buf, kevent->name, kevent->event.len)) + return -EFAULT; + + event_size += kevent->event.len; + } + return event_size; +} + static ssize_t inotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { - size_t event_size = sizeof (struct inotify_event); struct inotify_device *dev; char __user *start; int ret; @@ -440,81 +491,43 @@ static ssize_t inotify_read(struct file *file, char __user *buf, dev = file->private_data; while (1) { + struct inotify_kernel_event *kevent; prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); mutex_lock(&dev->ev_mutex); - if (!list_empty(&dev->events)) { - ret = 0; - break; - } + kevent = get_one_event(dev, count); mutex_unlock(&dev->ev_mutex); - if (file->f_flags & O_NONBLOCK) { - ret = -EAGAIN; - break; - } - - if (signal_pending(current)) { - ret = -EINTR; - break; + if (kevent) { + ret = PTR_ERR(kevent); + if (IS_ERR(kevent)) + break; + ret = copy_event_to_user(kevent, buf); + free_kevent(kevent); + if (ret < 0) + break; + buf += ret; + count -= ret; + continue; } - schedule(); - } - - finish_wait(&dev->wq, &wait); - if (ret) - return ret; - - while (1) { - struct inotify_kernel_event *kevent; - - ret = buf - start; - if (list_empty(&dev->events)) + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) break; - - kevent = inotify_dev_get_event(dev); - if (event_size + kevent->event.len > count) { - if (ret == 0 && count > 0) { - /* - * could not get a single event because we - * didn't have enough buffer space. - */ - ret = -EINVAL; - } + ret = -EINTR; + if (signal_pending(current)) break; - } - remove_kevent(dev, kevent); - /* - * Must perform the copy_to_user outside the mutex in order - * to avoid a lock order reversal with mmap_sem. - */ - mutex_unlock(&dev->ev_mutex); - - if (copy_to_user(buf, &kevent->event, event_size)) { - ret = -EFAULT; + if (start != buf) break; - } - buf += event_size; - count -= event_size; - - if (kevent->name) { - if (copy_to_user(buf, kevent->name, kevent->event.len)){ - ret = -EFAULT; - break; - } - buf += kevent->event.len; - count -= kevent->event.len; - } - - free_kevent(kevent); - mutex_lock(&dev->ev_mutex); + schedule(); } - mutex_unlock(&dev->ev_mutex); + finish_wait(&dev->wq, &wait); + if (start != buf && ret != -EFAULT) + ret = buf - start; return ret; } diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig new file mode 100644 index 00000000000..f5a868cc915 --- /dev/null +++ b/fs/ntfs/Kconfig @@ -0,0 +1,78 @@ +config NTFS_FS + tristate "NTFS file system support" + select NLS + help + NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. + + Saying Y or M here enables read support. There is partial, but + safe, write support available. For write support you must also + say Y to "NTFS write support" below. + + There are also a number of user-space tools available, called + ntfsprogs. These include ntfsundelete and ntfsresize, that work + without NTFS support enabled in the kernel. + + This is a rewrite from scratch of Linux NTFS support and replaced + the old NTFS code starting with Linux 2.5.11. A backport to + the Linux 2.4 kernel series is separately available as a patch + from the project web site. + + For more information see <file:Documentation/filesystems/ntfs.txt> + and <http://www.linux-ntfs.org/>. + + To compile this file system support as a module, choose M here: the + module will be called ntfs. + + If you are not using Windows NT, 2000, XP or 2003 in addition to + Linux on your computer it is safe to say N. + +config NTFS_DEBUG + bool "NTFS debugging support" + depends on NTFS_FS + help + If you are experiencing any problems with the NTFS file system, say + Y here. This will result in additional consistency checks to be + performed by the driver as well as additional debugging messages to + be written to the system log. Note that debugging messages are + disabled by default. To enable them, supply the option debug_msgs=1 + at the kernel command line when booting the kernel or as an option + to insmod when loading the ntfs module. Once the driver is active, + you can enable debugging messages by doing (as root): + echo 1 > /proc/sys/fs/ntfs-debug + Replacing the "1" with "0" would disable debug messages. + + If you leave debugging messages disabled, this results in little + overhead, but enabling debug messages results in very significant + slowdown of the system. + + When reporting bugs, please try to have available a full dump of + debugging messages while the misbehaviour was occurring. + +config NTFS_RW + bool "NTFS write support" + depends on NTFS_FS + help + This enables the partial, but safe, write support in the NTFS driver. + + The only supported operation is overwriting existing files, without + changing the file length. No file or directory creation, deletion or + renaming is possible. Note only non-resident files can be written to + so you may find that some very small files (<500 bytes or so) cannot + be written to. + + While we cannot guarantee that it will not damage any data, we have + so far not received a single report where the driver would have + damaged someones data so we assume it is perfectly safe to use. + + Note: While write support is safe in this version (a rewrite from + scratch of the NTFS support), it should be noted that the old NTFS + write support, included in Linux 2.5.10 and before (since 1997), + is not safe. + + This is currently useful with TopologiLinux. TopologiLinux is run + on top of any DOS/Microsoft Windows system without partitioning your + hard disk. Unlike other Linux distributions TopologiLinux does not + need its own partition. For more information see + <http://topologi-linux.sourceforge.net/> + + It is perfectly safe to say N here. diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig new file mode 100644 index 00000000000..701b7a3a872 --- /dev/null +++ b/fs/ocfs2/Kconfig @@ -0,0 +1,85 @@ +config OCFS2_FS + tristate "OCFS2 file system support" + depends on NET && SYSFS + select CONFIGFS_FS + select JBD2 + select CRC32 + select QUOTA + select QUOTA_TREE + help + OCFS2 is a general purpose extent based shared disk cluster file + system with many similarities to ext3. It supports 64 bit inode + numbers, and has automatically extending metadata groups which may + also make it attractive for non-clustered use. + + You'll want to install the ocfs2-tools package in order to at least + get "mount.ocfs2". + + Project web page: http://oss.oracle.com/projects/ocfs2 + Tools web page: http://oss.oracle.com/projects/ocfs2-tools + OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ + + For more information on OCFS2, see the file + <file:Documentation/filesystems/ocfs2.txt>. + +config OCFS2_FS_O2CB + tristate "O2CB Kernelspace Clustering" + depends on OCFS2_FS + default y + help + OCFS2 includes a simple kernelspace clustering package, the OCFS2 + Cluster Base. It only requires a very small userspace component + to configure it. This comes with the standard ocfs2-tools package. + O2CB is limited to maintaining a cluster for OCFS2 file systems. + It cannot manage any other cluster applications. + + It is always safe to say Y here, as the clustering method is + run-time selectable. + +config OCFS2_FS_USERSPACE_CLUSTER + tristate "OCFS2 Userspace Clustering" + depends on OCFS2_FS && DLM + default y + help + This option will allow OCFS2 to use userspace clustering services + in conjunction with the DLM in fs/dlm. If you are using a + userspace cluster manager, say Y here. + + It is safe to say Y, as the clustering method is run-time + selectable. + +config OCFS2_FS_STATS + bool "OCFS2 statistics" + depends on OCFS2_FS + default y + help + This option allows some fs statistics to be captured. Enabling + this option may increase the memory consumption. + +config OCFS2_DEBUG_MASKLOG + bool "OCFS2 logging support" + depends on OCFS2_FS + default y + help + The ocfs2 filesystem has an extensive logging system. The system + allows selection of events to log via files in /sys/o2cb/logmask/. + This option will enlarge your kernel, but it allows debugging of + ocfs2 filesystem issues. + +config OCFS2_DEBUG_FS + bool "OCFS2 expensive checks" + depends on OCFS2_FS + default n + help + This option will enable expensive consistency checks. Enable + this option for debugging only as it is likely to decrease + performance of the filesystem. + +config OCFS2_FS_POSIX_ACL + bool "OCFS2 POSIX Access Control Lists" + depends on OCFS2_FS + select FS_POSIX_ACL + default n + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d861096c9d8..3a9e5deed74 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4796,6 +4796,29 @@ out: return ret; } +static int ocfs2_replace_extent_rec(struct inode *inode, + handle_t *handle, + struct ocfs2_path *path, + struct ocfs2_extent_list *el, + int split_index, + struct ocfs2_extent_rec *split_rec) +{ + int ret; + + ret = ocfs2_path_bh_journal_access(handle, inode, path, + path_num_items(path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + el->l_recs[split_index] = *split_rec; + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); +out: + return ret; +} + /* * Mark part or all of the extent record at split_index in the leaf * pointed to by path as written. This removes the unwritten @@ -4885,7 +4908,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode, if (ctxt.c_contig_type == CONTIG_NONE) { if (ctxt.c_split_covers_rec) - el->l_recs[split_index] = *split_rec; + ret = ocfs2_replace_extent_rec(inode, handle, + path, el, + split_index, split_rec); else ret = ocfs2_split_and_insert(inode, handle, path, et, &last_eb_bh, split_index, @@ -5390,6 +5415,9 @@ int ocfs2_remove_btree_range(struct inode *inode, goto out; } + vfs_dq_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(inode->i_sb, len)); + ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, dealloc); if (ret) { diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index b1cc7c381e8..e9d7c2038c0 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -38,6 +38,7 @@ #include "dlmglue.h" #include "file.h" #include "inode.h" +#include "super.h" static int ocfs2_dentry_revalidate(struct dentry *dentry, @@ -294,6 +295,34 @@ out_attach: return ret; } +static DEFINE_SPINLOCK(dentry_list_lock); + +/* We limit the number of dentry locks to drop in one go. We have + * this limit so that we don't starve other users of ocfs2_wq. */ +#define DL_INODE_DROP_COUNT 64 + +/* Drop inode references from dentry locks */ +void ocfs2_drop_dl_inodes(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dentry_lock_work); + struct ocfs2_dentry_lock *dl; + int drop_count = DL_INODE_DROP_COUNT; + + spin_lock(&dentry_list_lock); + while (osb->dentry_lock_list && drop_count--) { + dl = osb->dentry_lock_list; + osb->dentry_lock_list = dl->dl_next; + spin_unlock(&dentry_list_lock); + iput(dl->dl_inode); + kfree(dl); + spin_lock(&dentry_list_lock); + } + if (osb->dentry_lock_list) + queue_work(ocfs2_wq, &osb->dentry_lock_work); + spin_unlock(&dentry_list_lock); +} + /* * ocfs2_dentry_iput() and friends. * @@ -318,16 +347,23 @@ out_attach: static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { - iput(dl->dl_inode); ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); ocfs2_lock_res_free(&dl->dl_lockres); - kfree(dl); + + /* We leave dropping of inode reference to ocfs2_wq as that can + * possibly lead to inode deletion which gets tricky */ + spin_lock(&dentry_list_lock); + if (!osb->dentry_lock_list) + queue_work(ocfs2_wq, &osb->dentry_lock_work); + dl->dl_next = osb->dentry_lock_list; + osb->dentry_lock_list = dl; + spin_unlock(&dentry_list_lock); } void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { - int unlock = 0; + int unlock; BUG_ON(dl->dl_count == 0); diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index c091c34d988..d06e16c0664 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -29,8 +29,13 @@ extern struct dentry_operations ocfs2_dentry_ops; struct ocfs2_dentry_lock { + /* Use count of dentry lock */ unsigned int dl_count; - u64 dl_parent_blkno; + union { + /* Linked list of dentry locks to release */ + struct ocfs2_dentry_lock *dl_next; + u64 dl_parent_blkno; + }; /* * The ocfs2_dentry_lock keeps an inode reference until @@ -47,6 +52,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); +void ocfs2_drop_dl_inodes(struct work_struct *work); + struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 54e182a27ca..0a281394785 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1849,12 +1849,12 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, if (!mle) { if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && res->owner != assert->node_idx) { - mlog(ML_ERROR, "assert_master from " - "%u, but current owner is " - "%u! (%.*s)\n", - assert->node_idx, res->owner, - namelen, name); - goto kill; + mlog(ML_ERROR, "DIE! Mastery assert from %u, " + "but current owner is %u! (%.*s)\n", + assert->node_idx, res->owner, namelen, + name); + __dlm_print_one_lock_resource(res); + BUG(); } } else if (mle->type != DLM_MLE_MIGRATION) { if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index d1295203029..4060bb328bc 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -181,8 +181,7 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); /* This ensures that clear refmap is sent after the set */ - __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG | - DLM_LOCK_RES_MIGRATING)); + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); spin_unlock(&res->spinlock); /* clear our bit from the master's refmap, ignore errors */ diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 86ca085ef32..fcf879ed693 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -117,11 +117,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, else BUG_ON(res->owner == dlm->node_num); - spin_lock(&dlm->spinlock); + spin_lock(&dlm->ast_lock); /* We want to be sure that we're not freeing a lock * that still has AST's pending... */ in_use = !list_empty(&lock->ast_list); - spin_unlock(&dlm->spinlock); + spin_unlock(&dlm->ast_lock); if (in_use) { mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " "while waiting for an ast!", res->lockname.len, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index b0c4cadd4c4..7219a86d34c 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -320,9 +320,14 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, int convert); -#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ - mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ - _err, _func, _lockres->l_name); \ +#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ + if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY) \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ + _err, _func, _lockres->l_name); \ + else \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n", \ + _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name, \ + (unsigned int)ocfs2_get_dentry_lock_ino(_lockres)); \ } while (0) static int ocfs2_downconvert_thread(void *arg); static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, @@ -2860,6 +2865,10 @@ static void ocfs2_unlock_ast(void *opaque, int error) case OCFS2_UNLOCK_CANCEL_CONVERT: mlog(0, "Cancel convert success for %s\n", lockres->l_name); lockres->l_action = OCFS2_AST_INVALID; + /* Downconvert thread may have requeued this lock, we + * need to wake it. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres)); break; case OCFS2_UNLOCK_DROP_LOCK: lockres->l_level = DLM_LOCK_IV; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3c3532e1307..172850a9a12 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) static inline int ocfs2_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + OCFS2_SB(inode->i_sb)->journal->j_journal, + &OCFS2_I(inode)->ip_jinode, + new_size); } #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index ad5c24a29ed..946d3c34b90 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -210,6 +210,7 @@ struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_quota_recovery; +struct ocfs2_dentry_lock; struct ocfs2_super { struct task_struct *commit_task; @@ -325,6 +326,11 @@ struct ocfs2_super struct list_head blocked_lock_list; unsigned long blocked_lock_count; + /* List of dentry locks to release. Anyone can add locks to + * the list, ocfs2_wq processes the list */ + struct ocfs2_dentry_lock *dentry_lock_list; + struct work_struct dentry_lock_work; + wait_queue_head_t osb_mount_event; /* Truncate log info */ @@ -335,6 +341,9 @@ struct ocfs2_super struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; wait_queue_head_t osb_wipe_event; + + /* used to protect metaecc calculation check of xattr. */ + spinlock_t osb_xattr_lock; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 6aff8f2d3e4..1ed0f7c8686 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -754,7 +754,9 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) if (dquot->dq_flags & mask) sync = 1; spin_unlock(&dq_data_lock); - if (!sync) { + /* This is a slight hack but we can't afford getting global quota + * lock if we already have a transaction started. */ + if (!sync || journal_current_handle()) { status = ocfs2_write_dquot(dquot); goto out; } @@ -810,171 +812,6 @@ out: return status; } -/* This is difficult. We have to lock quota inode and start transaction - * in this function but we don't want to take the penalty of exlusive - * quota file lock when we are just going to use cached structures. So - * we just take read lock check whether we have dquot cached and if so, - * we don't have to take the write lock... */ -static int ocfs2_dquot_initialize(struct inode *inode, int type) -{ - handle_t *handle = NULL; - int status = 0; - struct super_block *sb = inode->i_sb; - struct ocfs2_mem_dqinfo *oinfo; - int exclusive = 0; - int cnt; - qid_t id; - - mlog_entry_void(); - - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (type != -1 && cnt != type) - continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - oinfo = sb_dqinfo(sb, cnt)->dqi_priv; - status = ocfs2_lock_global_qf(oinfo, 0); - if (status < 0) - goto out; - /* This is just a performance optimization not a reliable test. - * Since we hold an inode lock, noone can actually release - * the structure until we are finished with initialization. */ - if (inode->i_dquot[cnt] != NODQUOT) { - ocfs2_unlock_global_qf(oinfo, 0); - continue; - } - /* When we have inode lock, we know that no dquot_release() can - * run and thus we can safely check whether we need to - * read+modify global file to get quota information or whether - * our node already has it. */ - if (cnt == USRQUOTA) - id = inode->i_uid; - else if (cnt == GRPQUOTA) - id = inode->i_gid; - else - BUG(); - /* Obtain exclusion from quota off... */ - down_write(&sb_dqopt(sb)->dqptr_sem); - exclusive = !dquot_is_cached(sb, id, cnt); - up_write(&sb_dqopt(sb)->dqptr_sem); - if (exclusive) { - status = ocfs2_lock_global_qf(oinfo, 1); - if (status < 0) { - exclusive = 0; - mlog_errno(status); - goto out_ilock; - } - handle = ocfs2_start_trans(OCFS2_SB(sb), - ocfs2_calc_qinit_credits(sb, cnt)); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_ilock; - } - } - dquot_initialize(inode, cnt); - if (exclusive) { - ocfs2_commit_trans(OCFS2_SB(sb), handle); - ocfs2_unlock_global_qf(oinfo, 1); - } - ocfs2_unlock_global_qf(oinfo, 0); - } - mlog_exit(0); - return 0; -out_ilock: - if (exclusive) - ocfs2_unlock_global_qf(oinfo, 1); - ocfs2_unlock_global_qf(oinfo, 0); -out: - mlog_exit(status); - return status; -} - -static int ocfs2_dquot_drop_slow(struct inode *inode) -{ - int status = 0; - int cnt; - int got_lock[MAXQUOTAS] = {0, 0}; - handle_t *handle; - struct super_block *sb = inode->i_sb; - struct ocfs2_mem_dqinfo *oinfo; - - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!sb_has_quota_active(sb, cnt)) - continue; - oinfo = sb_dqinfo(sb, cnt)->dqi_priv; - status = ocfs2_lock_global_qf(oinfo, 1); - if (status < 0) - goto out; - got_lock[cnt] = 1; - } - handle = ocfs2_start_trans(OCFS2_SB(sb), - ocfs2_calc_qinit_credits(sb, USRQUOTA) + - ocfs2_calc_qinit_credits(sb, GRPQUOTA)); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out; - } - dquot_drop(inode); - ocfs2_commit_trans(OCFS2_SB(sb), handle); -out: - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (got_lock[cnt]) { - oinfo = sb_dqinfo(sb, cnt)->dqi_priv; - ocfs2_unlock_global_qf(oinfo, 1); - } - return status; -} - -/* See the comment before ocfs2_dquot_initialize. */ -static int ocfs2_dquot_drop(struct inode *inode) -{ - int status = 0; - struct super_block *sb = inode->i_sb; - struct ocfs2_mem_dqinfo *oinfo; - int exclusive = 0; - int cnt; - int got_lock[MAXQUOTAS] = {0, 0}; - - mlog_entry_void(); - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!sb_has_quota_active(sb, cnt)) - continue; - oinfo = sb_dqinfo(sb, cnt)->dqi_priv; - status = ocfs2_lock_global_qf(oinfo, 0); - if (status < 0) - goto out; - got_lock[cnt] = 1; - } - /* Lock against anyone releasing references so that when when we check - * we know we are not going to be last ones to release dquot */ - down_write(&sb_dqopt(sb)->dqptr_sem); - /* Urgh, this is a terrible hack :( */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt] != NODQUOT && - atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) { - exclusive = 1; - break; - } - } - if (!exclusive) - dquot_drop_locked(inode); - up_write(&sb_dqopt(sb)->dqptr_sem); -out: - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (got_lock[cnt]) { - oinfo = sb_dqinfo(sb, cnt)->dqi_priv; - ocfs2_unlock_global_qf(oinfo, 0); - } - /* In case we bailed out because we had to do expensive locking - * do it now... */ - if (exclusive) - status = ocfs2_dquot_drop_slow(inode); - mlog_exit(status); - return status; -} - static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type) { struct ocfs2_dquot *dquot = @@ -991,8 +828,8 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) } struct dquot_operations ocfs2_quota_operations = { - .initialize = ocfs2_dquot_initialize, - .drop = ocfs2_dquot_drop, + .initialize = dquot_initialize, + .drop = dquot_drop, .alloc_space = dquot_alloc_space, .alloc_inode = dquot_alloc_inode, .free_space = dquot_free_space, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 43ed11345b5..7ac83a81ee5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1537,6 +1537,13 @@ static int ocfs2_get_sector(struct super_block *sb, unlock_buffer(*bh); ll_rw_block(READ, 1, bh); wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + mlog_errno(-EIO); + brelse(*bh); + *bh = NULL; + return -EIO; + } + return 0; } @@ -1747,6 +1754,7 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_LIST_HEAD(&osb->blocked_lock_list); osb->blocked_lock_count = 0; spin_lock_init(&osb->osb_lock); + spin_lock_init(&osb->osb_xattr_lock); ocfs2_init_inode_steal_slot(osb); atomic_set(&osb->alloc_stats.moves, 0); @@ -1887,6 +1895,9 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; + INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); + osb->dentry_lock_list = NULL; + /* get some pseudo constants for clustersize bits */ osb->s_clustersize_bits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e1d638af6ac..4ddd788add6 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -82,13 +82,14 @@ struct ocfs2_xattr_set_ctxt { #define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) #define OCFS2_XATTR_INLINE_SIZE 80 +#define OCFS2_XATTR_HEADER_GAP 4 #define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \ - sizeof(struct ocfs2_xattr_header) \ - - sizeof(__u32)) + - OCFS2_XATTR_HEADER_GAP) #define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \ - sizeof(struct ocfs2_xattr_block) \ - sizeof(struct ocfs2_xattr_header) \ - - sizeof(__u32)) + - OCFS2_XATTR_HEADER_GAP) static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), @@ -274,10 +275,12 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket, bucket->bu_blocks, bucket->bu_bhs, 0, NULL); if (!rc) { + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb, bucket->bu_bhs, bucket->bu_blocks, &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); if (rc) mlog_errno(rc); } @@ -310,9 +313,11 @@ static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle, { int i; + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb, bucket->bu_bhs, bucket->bu_blocks, &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); for (i = 0; i < bucket->bu_blocks; i++) ocfs2_journal_dirty(handle, bucket->bu_bhs[i]); @@ -1507,7 +1512,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, last += 1; } - free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; if (free < 0) return -EIO; @@ -2190,7 +2195,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode, last += 1; } - free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; if (free < 0) return 0; @@ -2592,8 +2597,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, if (!ret) { /* Update inode ctime. */ - ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(ctxt->handle, inode, + xis->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -4729,13 +4735,6 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, vb.vb_xv = (struct ocfs2_xattr_value_root *) (vb.vb_bh->b_data + offset % blocksize); - ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - /* * From here on out we have to dirty the bucket. The generic * value calls only modify one of the bucket's bhs, but we need @@ -4748,12 +4747,18 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); if (ret) { mlog_errno(ret); - goto out_dirty; + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; } xe->xe_value_size = cpu_to_le64(len); -out_dirty: ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket); out: @@ -5061,8 +5066,8 @@ try_again: xh_free_start = le16_to_cpu(xh->xh_free_start); header_size = sizeof(struct ocfs2_xattr_header) + count * sizeof(struct ocfs2_xattr_entry); - max_free = OCFS2_XATTR_BUCKET_SIZE - - le16_to_cpu(xh->xh_name_value_len) - header_size; + max_free = OCFS2_XATTR_BUCKET_SIZE - header_size - + le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP; mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " "of %u which exceed block size\n", @@ -5095,7 +5100,7 @@ try_again: need = 0; } - free = xh_free_start - header_size; + free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP; /* * We need to make sure the new name/value pair * can exist in the same block. @@ -5128,7 +5133,8 @@ try_again: } xh_free_start = le16_to_cpu(xh->xh_free_start); - free = xh_free_start - header_size; + free = xh_free_start - header_size + - OCFS2_XATTR_HEADER_GAP; if (xh_free_start % blocksize < need) free -= xh_free_start % blocksize; diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig new file mode 100644 index 00000000000..b1b9a0aba6f --- /dev/null +++ b/fs/omfs/Kconfig @@ -0,0 +1,13 @@ +config OMFS_FS + tristate "SonicBlue Optimized MPEG File System support" + depends on BLOCK + select CRC_ITU_T + help + This is the proprietary file system used by the Rio Karma music + player and ReplayTV DVR. Despite the name, this filesystem is not + more efficient than a standard FS for MPEG files, in fact likely + the opposite is true. Say Y if you have either of these devices + and wish to mount its disk. + + To compile this file system support as a module, choose M here: the + module will be called omfs. If unsure, say N. diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 3e76bb9b3ad..d8bb5c671f4 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -485,8 +485,10 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, } } unlock_new_inode(inode); - } else + } else { module_put(de->owner); + de_put(de); + } return inode; out_ino: diff --git a/fs/proc/page.c b/fs/proc/page.c index 767d95a6d1b..2d1345112a4 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -107,7 +107,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, else kflags = ppage->flags; - uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) | + uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) | kpf_copy_bit(kflags, KPF_ERROR, PG_error) | kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig new file mode 100644 index 00000000000..be8e0e1445b --- /dev/null +++ b/fs/qnx4/Kconfig @@ -0,0 +1,25 @@ +config QNX4FS_FS + tristate "QNX4 file system support (read only)" + depends on BLOCK + help + This is the file system used by the real-time operating systems + QNX 4 and QNX 6 (the latter is also called QNX RTP). + Further information is available at <http://www.qnx.com/>. + Say Y if you intend to mount QNX hard disks or floppies. + Unless you say Y to "QNX4FS read-write support" below, you will + only be able to read these file systems. + + To compile this file system support as a module, choose M here: the + module will be called qnx4. + + If you don't know whether you need it, then you don't need it: + answer N. + +config QNX4FS_RW + bool "QNX4FS write support (DANGEROUS)" + depends on QNX4FS_FS && EXPERIMENTAL && BROKEN + help + Say Y if you want to test write support for QNX4 file systems. + + It's currently broken, so for now: + answer N. diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig new file mode 100644 index 00000000000..949b8c6addc --- /dev/null +++ b/fs/reiserfs/Kconfig @@ -0,0 +1,85 @@ +config REISERFS_FS + tristate "Reiserfs support" + help + Stores not just filenames but the files themselves in a balanced + tree. Uses journalling. + + Balanced trees are more efficient than traditional file system + architectural foundations. + + In general, ReiserFS is as fast as ext2, but is very efficient with + large directories and small files. Additional patches are needed + for NFS and quotas, please see <http://www.namesys.com/> for links. + + It is more easily extended to have features currently found in + database and keyword search systems than block allocation based file + systems are. The next version will be so extended, and will support + plugins consistent with our motto ``It takes more than a license to + make source code open.'' + + Read <http://www.namesys.com/> to learn more about reiserfs. + + Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com. + + If you like it, you can pay us to add new features to it that you + need, buy a support contract, or pay us to port it to another OS. + +config REISERFS_CHECK + bool "Enable reiserfs debug mode" + depends on REISERFS_FS + help + If you set this to Y, then ReiserFS will perform every check it can + possibly imagine of its internal consistency throughout its + operation. It will also go substantially slower. More than once we + have forgotten that this was on, and then gone despondent over the + latest benchmarks.:-) Use of this option allows our team to go all + out in checking for consistency when debugging without fear of its + effect on end users. If you are on the verge of sending in a bug + report, say Y and you might get a useful error message. Almost + everyone should say N. + +config REISERFS_PROC_INFO + bool "Stats in /proc/fs/reiserfs" + depends on REISERFS_FS && PROC_FS + help + Create under /proc/fs/reiserfs a hierarchy of files, displaying + various ReiserFS statistics and internal data at the expense of + making your kernel or module slightly larger (+8 KB). This also + increases the amount of kernel memory required for each mount. + Almost everyone but ReiserFS developers and people fine-tuning + reiserfs or tracing problems should say N. + +config REISERFS_FS_XATTR + bool "ReiserFS extended attributes" + depends on REISERFS_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config REISERFS_FS_POSIX_ACL + bool "ReiserFS POSIX Access Control Lists" + depends on REISERFS_FS_XATTR + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config REISERFS_FS_SECURITY + bool "ReiserFS Security Labels" + depends on REISERFS_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ReiserFS filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig new file mode 100644 index 00000000000..1a17020f9fa --- /dev/null +++ b/fs/romfs/Kconfig @@ -0,0 +1,16 @@ +config ROMFS_FS + tristate "ROM file system support" + depends on BLOCK + ---help--- + This is a very small read-only file system mainly intended for + initial ram disks of installation disks, but it could be used for + other read-only media as well. Read + <file:Documentation/filesystems/romfs.txt> for details. + + To compile this file system support as a module, choose M here: the + module will be called romfs. Note that the file system of your + root partition (the one containing the directory /) cannot be a + module. + + If you don't know whether you need it, then you don't need it: + answer N. diff --git a/fs/seq_file.c b/fs/seq_file.c index b569ff1c4dc..a1a4cfe1921 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -48,12 +48,78 @@ int seq_open(struct file *file, const struct seq_operations *op) */ file->f_version = 0; - /* SEQ files support lseek, but not pread/pwrite */ - file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE); + /* + * seq_files support lseek() and pread(). They do not implement + * write() at all, but we clear FMODE_PWRITE here for historical + * reasons. + * + * If a client of seq_files a) implements file.write() and b) wishes to + * support pwrite() then that client will need to implement its own + * file.open() which calls seq_open() and then sets FMODE_PWRITE. + */ + file->f_mode &= ~FMODE_PWRITE; return 0; } EXPORT_SYMBOL(seq_open); +static int traverse(struct seq_file *m, loff_t offset) +{ + loff_t pos = 0, index; + int error = 0; + void *p; + + m->version = 0; + index = 0; + m->count = m->from = 0; + if (!offset) { + m->index = index; + return 0; + } + if (!m->buf) { + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + if (!m->buf) + return -ENOMEM; + } + p = m->op->start(m, &index); + while (p) { + error = PTR_ERR(p); + if (IS_ERR(p)) + break; + error = m->op->show(m, p); + if (error < 0) + break; + if (unlikely(error)) { + error = 0; + m->count = 0; + } + if (m->count == m->size) + goto Eoverflow; + if (pos + m->count > offset) { + m->from = offset - pos; + m->count -= m->from; + m->index = index; + break; + } + pos += m->count; + m->count = 0; + if (pos == offset) { + index++; + m->index = index; + break; + } + p = m->op->next(m, p, &index); + } + m->op->stop(m, p); + m->index = index; + return error; + +Eoverflow: + m->op->stop(m, p); + kfree(m->buf); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + return !m->buf ? -ENOMEM : -EAGAIN; +} + /** * seq_read - ->read() method for sequential files. * @file: the file to read from @@ -73,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) int err = 0; mutex_lock(&m->lock); + + /* Don't assume *ppos is where we left it */ + if (unlikely(*ppos != m->read_pos)) { + m->read_pos = *ppos; + while ((err = traverse(m, *ppos)) == -EAGAIN) + ; + if (err) { + /* With prejudice... */ + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + goto Done; + } + } + /* * seq_file->op->..m_start/m_stop/m_next may do special actions * or optimisations based on the file->f_version, so we want to @@ -172,8 +254,10 @@ Fill: Done: if (!copied) copied = err; - else + else { *ppos += copied; + m->read_pos += copied; + } file->f_version = m->version; mutex_unlock(&m->lock); return copied; @@ -186,63 +270,6 @@ Efault: } EXPORT_SYMBOL(seq_read); -static int traverse(struct seq_file *m, loff_t offset) -{ - loff_t pos = 0, index; - int error = 0; - void *p; - - m->version = 0; - index = 0; - m->count = m->from = 0; - if (!offset) { - m->index = index; - return 0; - } - if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); - if (!m->buf) - return -ENOMEM; - } - p = m->op->start(m, &index); - while (p) { - error = PTR_ERR(p); - if (IS_ERR(p)) - break; - error = m->op->show(m, p); - if (error < 0) - break; - if (unlikely(error)) { - error = 0; - m->count = 0; - } - if (m->count == m->size) - goto Eoverflow; - if (pos + m->count > offset) { - m->from = offset - pos; - m->count -= m->from; - m->index = index; - break; - } - pos += m->count; - m->count = 0; - if (pos == offset) { - index++; - m->index = index; - break; - } - p = m->op->next(m, p, &index); - } - m->op->stop(m, p); - return error; - -Eoverflow: - m->op->stop(m, p); - kfree(m->buf); - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); - return !m->buf ? -ENOMEM : -EAGAIN; -} - /** * seq_lseek - ->llseek() method for sequential files. * @file: the file in question @@ -265,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin) if (offset < 0) break; retval = offset; - if (offset != file->f_pos) { + if (offset != m->read_pos) { while ((retval=traverse(m, offset)) == -EAGAIN) ; if (retval) { /* with extreme prejudice... */ file->f_pos = 0; + m->read_pos = 0; m->version = 0; m->index = 0; m->count = 0; } else { + m->read_pos = offset; retval = file->f_pos = offset; } } diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig new file mode 100644 index 00000000000..e668127c8b2 --- /dev/null +++ b/fs/smbfs/Kconfig @@ -0,0 +1,55 @@ +config SMB_FS + tristate "SMB file system support (OBSOLETE, please use CIFS)" + depends on INET + select NLS + help + SMB (Server Message Block) is the protocol Windows for Workgroups + (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share + files and printers over local networks. Saying Y here allows you to + mount their file systems (often called "shares" in this context) and + access them just like any other Unix directory. Currently, this + works only if the Windows machines use TCP/IP as the underlying + transport protocol, and not NetBEUI. For details, read + <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO, + available from <http://www.tldp.org/docs.html#howto>. + + Note: if you just want your box to act as an SMB *server* and make + files and printing services available to Windows clients (which need + to have a TCP/IP stack), you don't need to say Y here; you can use + the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>) + for that. + + General information about how to connect Linux, Windows machines and + Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. + + To compile the SMB support as a module, choose M here: + the module will be called smbfs. Most people say N, however. + +config SMB_NLS_DEFAULT + bool "Use a default NLS" + depends on SMB_FS + help + Enabling this will make smbfs use nls translations by default. You + need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls + settings and you need to give the default nls for the SMB server as + CONFIG_SMB_NLS_REMOTE. + + The nls settings can be changed at mount time, if your smbmount + supports that, using the codepage and iocharset parameters. + + smbmount from samba 2.2.0 or later supports this. + +config SMB_NLS_REMOTE + string "Default Remote NLS Option" + depends on SMB_NLS_DEFAULT + default "cp437" + help + This setting allows you to specify a default value for which + codepage the server uses. If this field is left blank no + translations will be done by default. The local codepage/charset + default to CONFIG_NLS_DEFAULT. + + The nls settings can be changed at mount time, if your smbmount + supports that, using the codepage and iocharset parameters. + + smbmount from samba 2.2.0 or later supports this. diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig new file mode 100644 index 00000000000..25a00d19d68 --- /dev/null +++ b/fs/squashfs/Kconfig @@ -0,0 +1,51 @@ +config SQUASHFS + tristate "SquashFS 4.0 - Squashed file system support" + depends on BLOCK + select ZLIB_INFLATE + help + Saying Y here includes support for SquashFS 4.0 (a Compressed + Read-Only File System). Squashfs is a highly compressed read-only + filesystem for Linux. It uses zlib compression to compress both + files, inodes and directories. Inodes in the system are very small + and all blocks are packed to minimise data overhead. Block sizes + greater than 4K are supported up to a maximum of 1 Mbytes (default + block size 128K). SquashFS 4.0 supports 64 bit filesystems and files + (larger than 4GB), full uid/gid information, hard links and + timestamps. + + Squashfs is intended for general read-only filesystem use, for + archival use (i.e. in cases where a .tar.gz file may be used), and in + embedded systems where low overhead is needed. Further information + and tools are available from http://squashfs.sourceforge.net. + + If you want to compile this as a module ( = code which can be + inserted in and removed from the running kernel whenever you want), + say M here and read <file:Documentation/modules.txt>. The module + will be called squashfs. Note that the root file system (the one + containing the directory /) cannot be compiled as a module. + + If unsure, say N. + +config SQUASHFS_EMBEDDED + + bool "Additional option for memory-constrained systems" + depends on SQUASHFS + default n + help + Saying Y here allows you to specify cache size. + + If unsure, say N. + +config SQUASHFS_FRAGMENT_CACHE_SIZE + int "Number of fragments cached" if SQUASHFS_EMBEDDED + depends on SQUASHFS + default "3" + help + By default SquashFS caches the last 3 fragments read from + the filesystem. Increasing this amount may mean SquashFS + has to re-read fragments less often from disk, at the expense + of extra system memory. Decreasing this amount will mean + SquashFS uses less memory at the expense of extra reads from disk. + + Note there must be at least one cached fragment. Anything + much more than three will probably not make much difference. diff --git a/fs/super.c b/fs/super.c index 645e5403f2a..8349ed6b141 100644 --- a/fs/super.c +++ b/fs/super.c @@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type) * lock ordering than usbfs: */ lockdep_set_class(&s->s_lock, &type->s_lock_key); - down_write(&s->s_umount); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. Since these are clearly distrinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); s->s_count = S_BIAS; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); @@ -301,7 +316,7 @@ void generic_shutdown_super(struct super_block *sb) /* * wait for asynchronous fs operations to finish before going further */ - async_synchronize_full_special(&sb->s_async_list); + async_synchronize_full_domain(&sb->s_async_list); /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); @@ -470,7 +485,7 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - async_synchronize_full_special(&sb->s_async_list); + async_synchronize_full_domain(&sb->s_async_list); if (sb->s_root && (wait || sb->s_dirt)) sb->s_op->sync_fs(sb, wait); up_read(&sb->s_umount); diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig new file mode 100644 index 00000000000..f4b67588b9d --- /dev/null +++ b/fs/sysfs/Kconfig @@ -0,0 +1,23 @@ +config SYSFS + bool "sysfs file system support" if EMBEDDED + default y + help + The sysfs filesystem is a virtual filesystem that the kernel uses to + export internal kernel objects, their attributes, and their + relationships to one another. + + Users can use sysfs to ascertain useful information about the running + kernel, such as the devices the kernel has discovered on each bus and + which driver each is bound to. sysfs can also be used to tune devices + and other kernel subsystems. + + Some system agents rely on the information in sysfs to operate. + /sbin/hotplug uses device and object attributes in sysfs to assist in + delegating policy decisions, like persistently naming devices. + + sysfs is currently used by the block subsystem to mount the root + partition. If sysfs is disabled you must specify the boot device on + the kernel boot command line via its major and minor numbers. For + example, "root=03:01" for /dev/hda1. + + Designers of embedded systems may wish to say N here to conserve space. diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 66f6e58a7e4..f2c478c3424 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -63,6 +63,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) int count = min_t(size_t, bytes, PAGE_SIZE); char *temp; + if (!bytes) + return 0; + if (size) { if (offs > size) return 0; @@ -131,6 +134,9 @@ static ssize_t write(struct file *file, const char __user *userbuf, int count = min_t(size_t, bytes, PAGE_SIZE); char *temp; + if (!bytes) + return 0; + if (size) { if (offs > size) return 0; diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig new file mode 100644 index 00000000000..33aeb4b75db --- /dev/null +++ b/fs/sysv/Kconfig @@ -0,0 +1,36 @@ +config SYSV_FS + tristate "System V/Xenix/V7/Coherent file system support" + depends on BLOCK + help + SCO, Xenix and Coherent are commercial Unix systems for Intel + machines, and Version 7 was used on the DEC PDP-11. Saying Y + here would allow you to read from their floppies and hard disk + partitions. + + If you have floppies or hard disk partitions like that, it is likely + that they contain binaries from those other Unix systems; in order + to run these binaries, you will want to install linux-abi which is + a set of kernel modules that lets you run SCO, Xenix, Wyse, + UnixWare, Dell Unix and System V programs under Linux. It is + available via FTP (user: ftp) from + <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>). + NOTE: that will work only for binaries from Intel-based systems; + PDP ones will have to wait until somebody ports Linux to -11 ;-) + + If you only intend to mount files from some other Unix over the + network using NFS, you don't need the System V file system support + (but you need NFS file system support obviously). + + Note that this option is generally not needed for floppies, since a + good portable way to transport files and directories between unixes + (and even other operating systems) is given by the tar program ("man + tar" or preferably "info tar"). Note also that this option has + nothing whatsoever to do with the option "System V IPC". Read about + the System V file system in + <file:Documentation/filesystems/sysv-fs.txt>. + Saying Y here will enlarge your kernel by about 27 KB. + + To compile this as a module, choose M here: the module will be called + sysv. + + If you haven't heard about all of this before, it's safe to say N. diff --git a/fs/timerfd.c b/fs/timerfd.c index 6a123b8ff3f..b042bd7034b 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); - if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) - return -EINVAL; - if (clockid != CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME) + if ((flags & ~TFD_CREATE_FLAGS) || + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME)) return -EINVAL; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + flags & TFD_SHARED_FCNTL_FLAGS); if (ufd < 0) kfree(ctx); @@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) return -EFAULT; - if (!timespec_valid(&ktmr.it_value) || + if ((flags & ~TFD_SETTIME_FLAGS) || + !timespec_valid(&ktmr.it_value) || !timespec_valid(&ktmr.it_interval)) return -EINVAL; diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 175f9c590b7..f393620890e 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -689,7 +689,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free) } /** - * ubifs_get_free_space - return amount of free space. + * ubifs_get_free_space_nolock - return amount of free space. * @c: UBIFS file-system description object * * This function calculates amount of free space to report to user-space. @@ -704,16 +704,14 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free) * traditional file-systems, because they have way less overhead than UBIFS. * So, to keep users happy, UBIFS tries to take the overhead into account. */ -long long ubifs_get_free_space(struct ubifs_info *c) +long long ubifs_get_free_space_nolock(struct ubifs_info *c) { - int min_idx_lebs, rsvd_idx_lebs, lebs; + int rsvd_idx_lebs, lebs; long long available, outstanding, free; - spin_lock(&c->space_lock); - min_idx_lebs = c->min_idx_lebs; - ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); outstanding = c->budg_data_growth + c->budg_dd_growth; - available = ubifs_calc_available(c, min_idx_lebs); + available = ubifs_calc_available(c, c->min_idx_lebs); /* * When reporting free space to user-space, UBIFS guarantees that it is @@ -726,15 +724,14 @@ long long ubifs_get_free_space(struct ubifs_info *c) * Note, the calculations below are similar to what we have in * 'do_budget_space()', so refer there for comments. */ - if (min_idx_lebs > c->lst.idx_lebs) - rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + if (c->min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; else rsvd_idx_lebs = 0; lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - c->lst.taken_empty_lebs; lebs -= rsvd_idx_lebs; available += lebs * (c->dark_wm - c->leb_overhead); - spin_unlock(&c->space_lock); if (available > outstanding) free = ubifs_reported_space(c, available - outstanding); @@ -742,3 +739,21 @@ long long ubifs_get_free_space(struct ubifs_info *c) free = 0; return free; } + +/** + * ubifs_get_free_space - return amount of free space. + * @c: UBIFS file-system description object + * + * This function calculates and retuns amount of free space to report to + * user-space. + */ +long long ubifs_get_free_space(struct ubifs_info *c) +{ + long long free; + + spin_lock(&c->space_lock); + free = ubifs_get_free_space_nolock(c); + spin_unlock(&c->space_lock); + + return free; +} diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 792c5a16c18..e975bd82f38 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -620,9 +620,11 @@ void dbg_dump_budg(struct ubifs_info *c) c->dark_wm, c->dead_wm, c->max_idx_node_sz); printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", c->gc_lnum, c->ihead_lnum); - for (i = 0; i < c->jhead_cnt; i++) - printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", - c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); + /* If we are in R/O mode, journal heads do not exist */ + if (c->jheads) + for (i = 0; i < c->jhead_cnt; i++) + printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", + c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { bud = rb_entry(rb, struct ubifs_bud, rb); printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); @@ -637,10 +639,7 @@ void dbg_dump_budg(struct ubifs_info *c) /* Print budgeting predictions */ available = ubifs_calc_available(c, c->min_idx_lebs); outstanding = c->budg_data_growth + c->budg_dd_growth; - if (available > outstanding) - free = ubifs_reported_space(c, available - outstanding); - else - free = 0; + free = ubifs_get_free_space_nolock(c); printk(KERN_DEBUG "Budgeting predictions:\n"); printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", available, outstanding, free); @@ -861,6 +860,65 @@ void dbg_dump_index(struct ubifs_info *c) } /** + * dbg_save_space_info - save information about flash space. + * @c: UBIFS file-system description object + * + * This function saves information about UBIFS free space, dirty space, etc, in + * order to check it later. + */ +void dbg_save_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + + ubifs_get_lp_stats(c, &d->saved_lst); + + spin_lock(&c->space_lock); + d->saved_free = ubifs_get_free_space_nolock(c); + spin_unlock(&c->space_lock); +} + +/** + * dbg_check_space_info - check flash space information. + * @c: UBIFS file-system description object + * + * This function compares current flash space information with the information + * which was saved when the 'dbg_save_space_info()' function was called. + * Returns zero if the information has not changed, and %-EINVAL it it has + * changed. + */ +int dbg_check_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + struct ubifs_lp_stats lst; + long long avail, free; + + spin_lock(&c->space_lock); + avail = ubifs_calc_available(c, c->min_idx_lebs); + spin_unlock(&c->space_lock); + free = ubifs_get_free_space(c); + + if (free != d->saved_free) { + ubifs_err("free space changed from %lld to %lld", + d->saved_free, free); + goto out; + } + + return 0; + +out: + ubifs_msg("saved lprops statistics dump"); + dbg_dump_lstats(&d->saved_lst); + ubifs_get_lp_stats(c, &lst); + ubifs_msg("current lprops statistics dump"); + dbg_dump_lstats(&d->saved_lst); + spin_lock(&c->space_lock); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + dump_stack(); + return -EINVAL; +} + +/** * dbg_check_synced_i_size - check synchronized inode size. * @inode: inode to check * @@ -1349,7 +1407,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra) * @c: UBIFS file-system description object * @leaf_cb: called for each leaf node * @znode_cb: called for each indexing node - * @priv: private date which is passed to callbacks + * @priv: private data which is passed to callbacks * * This function walks the UBIFS index and calls the @leaf_cb for each leaf * node and @znode_cb for each indexing node. Returns zero in case of success @@ -2409,7 +2467,7 @@ void ubifs_debugging_exit(struct ubifs_info *c) * Root directory for UBIFS stuff in debugfs. Contains sub-directories which * contain the stuff specific to particular file-system mounts. */ -static struct dentry *debugfs_rootdir; +static struct dentry *dfs_rootdir; /** * dbg_debugfs_init - initialize debugfs file-system. @@ -2421,9 +2479,9 @@ static struct dentry *debugfs_rootdir; */ int dbg_debugfs_init(void) { - debugfs_rootdir = debugfs_create_dir("ubifs", NULL); - if (IS_ERR(debugfs_rootdir)) { - int err = PTR_ERR(debugfs_rootdir); + dfs_rootdir = debugfs_create_dir("ubifs", NULL); + if (IS_ERR(dfs_rootdir)) { + int err = PTR_ERR(dfs_rootdir); ubifs_err("cannot create \"ubifs\" debugfs directory, " "error %d\n", err); return err; @@ -2437,7 +2495,7 @@ int dbg_debugfs_init(void) */ void dbg_debugfs_exit(void) { - debugfs_remove(debugfs_rootdir); + debugfs_remove(dfs_rootdir); } static int open_debugfs_file(struct inode *inode, struct file *file) @@ -2452,13 +2510,13 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf, struct ubifs_info *c = file->private_data; struct ubifs_debug_info *d = c->dbg; - if (file->f_path.dentry == d->dump_lprops) + if (file->f_path.dentry == d->dfs_dump_lprops) dbg_dump_lprops(c); - else if (file->f_path.dentry == d->dump_budg) { + else if (file->f_path.dentry == d->dfs_dump_budg) { spin_lock(&c->space_lock); dbg_dump_budg(c); spin_unlock(&c->space_lock); - } else if (file->f_path.dentry == d->dump_tnc) { + } else if (file->f_path.dentry == d->dfs_dump_tnc) { mutex_lock(&c->tnc_mutex); dbg_dump_tnc(c); mutex_unlock(&c->tnc_mutex); @@ -2469,7 +2527,7 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf, return count; } -static const struct file_operations debugfs_fops = { +static const struct file_operations dfs_fops = { .open = open_debugfs_file, .write = write_debugfs_file, .owner = THIS_MODULE, @@ -2494,36 +2552,32 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) struct dentry *dent; struct ubifs_debug_info *d = c->dbg; - sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); - d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name, - debugfs_rootdir); - if (IS_ERR(d->debugfs_dir)) { - err = PTR_ERR(d->debugfs_dir); + sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); + d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir); + if (IS_ERR(d->dfs_dir)) { + err = PTR_ERR(d->dfs_dir); ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", - d->debugfs_dir_name, err); + d->dfs_dir_name, err); goto out; } fname = "dump_lprops"; - dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, - &debugfs_fops); + dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; - d->dump_lprops = dent; + d->dfs_dump_lprops = dent; fname = "dump_budg"; - dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, - &debugfs_fops); + dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; - d->dump_budg = dent; + d->dfs_dump_budg = dent; fname = "dump_tnc"; - dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, - &debugfs_fops); + dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; - d->dump_tnc = dent; + d->dfs_dump_tnc = dent; return 0; @@ -2531,7 +2585,7 @@ out_remove: err = PTR_ERR(dent); ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", fname, err); - debugfs_remove_recursive(d->debugfs_dir); + debugfs_remove_recursive(d->dfs_dir); out: return err; } @@ -2542,7 +2596,7 @@ out: */ void dbg_debugfs_exit_fs(struct ubifs_info *c) { - debugfs_remove_recursive(c->dbg->debugfs_dir); + debugfs_remove_recursive(c->dbg->dfs_dir); } #endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 9820d6999f7..c1cd73b2e06 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -41,15 +41,17 @@ * @chk_lpt_wastage: used by LPT tree size checker * @chk_lpt_lebs: used by LPT tree size checker * @new_nhead_offs: used by LPT tree size checker - * @new_ihead_lnum: used by debugging to check ihead_lnum - * @new_ihead_offs: used by debugging to check ihead_offs + * @new_ihead_lnum: used by debugging to check @c->ihead_lnum + * @new_ihead_offs: used by debugging to check @c->ihead_offs * - * debugfs_dir_name: name of debugfs directory containing this file-system's - * files - * debugfs_dir: direntry object of the file-system debugfs directory - * dump_lprops: "dump lprops" debugfs knob - * dump_budg: "dump budgeting information" debugfs knob - * dump_tnc: "dump TNC" debugfs knob + * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') + * @saved_free: saved free space (used by 'dbg_save_space_info()') + * + * dfs_dir_name: name of debugfs directory containing this file-system's files + * dfs_dir: direntry object of the file-system debugfs directory + * dfs_dump_lprops: "dump lprops" debugfs knob + * dfs_dump_budg: "dump budgeting information" debugfs knob + * dfs_dump_tnc: "dump TNC" debugfs knob */ struct ubifs_debug_info { void *buf; @@ -69,11 +71,14 @@ struct ubifs_debug_info { int new_ihead_lnum; int new_ihead_offs; - char debugfs_dir_name[100]; - struct dentry *debugfs_dir; - struct dentry *dump_lprops; - struct dentry *dump_budg; - struct dentry *dump_tnc; + struct ubifs_lp_stats saved_lst; + long long saved_free; + + char dfs_dir_name[100]; + struct dentry *dfs_dir; + struct dentry *dfs_dump_lprops; + struct dentry *dfs_dump_budg; + struct dentry *dfs_dump_tnc; }; #define ubifs_assert(expr) do { \ @@ -297,7 +302,8 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, dbg_znode_callback znode_cb, void *priv); /* Checking functions */ - +void dbg_save_space_info(struct ubifs_info *c); +int dbg_check_space_info(struct ubifs_info *c); int dbg_check_lprops(struct ubifs_info *c); int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); @@ -439,6 +445,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); #define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 #define dbg_old_index_check_init(c, zroot) 0 +#define dbg_save_space_info(c) ({}) +#define dbg_check_space_info(c) 0 #define dbg_check_old_index(c, zroot) 0 #define dbg_check_cats(c) 0 #define dbg_check_ltab(c) 0 diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index f448ab1f9c3..f55d523c52b 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -482,30 +482,29 @@ static int ubifs_dir_release(struct inode *dir, struct file *file) } /** - * lock_2_inodes - lock two UBIFS inodes. + * lock_2_inodes - a wrapper for locking two UBIFS inodes. * @inode1: first inode * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. */ static void lock_2_inodes(struct inode *inode1, struct inode *inode2) { - if (inode1->i_ino < inode2->i_ino) { - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2); - mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3); - } else { - mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3); - } + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); } /** - * unlock_2_inodes - unlock two UBIFS inodes inodes. + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. * @inode1: first inode * @inode2: second inode */ static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) { - mutex_unlock(&ubifs_inode(inode1)->ui_mutex); mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); } static int ubifs_link(struct dentry *old_dentry, struct inode *dir, @@ -527,6 +526,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", dentry->d_name.len, dentry->d_name.name, inode->i_ino, inode->i_nlink, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); err = dbg_check_synced_i_size(inode); if (err) return err; @@ -580,6 +581,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", dentry->d_name.len, dentry->d_name.name, inode->i_ino, inode->i_nlink, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); err = dbg_check_synced_i_size(inode); if (err) return err; @@ -667,7 +670,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, dentry->d_name.name, inode->i_ino, dir->i_ino); - + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); err = check_dir_empty(c, dentry->d_inode); if (err) return err; @@ -922,59 +926,30 @@ out_budg: } /** - * lock_3_inodes - lock three UBIFS inodes for rename. + * lock_3_inodes - a wrapper for locking three UBIFS inodes. * @inode1: first inode * @inode2: second inode * @inode3: third inode * - * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may - * be null. + * This function is used for 'ubifs_rename()' and @inode1 may be the same as + * @inode2 whereas @inode3 may be %NULL. + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. */ static void lock_3_inodes(struct inode *inode1, struct inode *inode2, struct inode *inode3) { - struct inode *i1, *i2, *i3; - - if (!inode3) { - if (inode1 != inode2) { - lock_2_inodes(inode1, inode2); - return; - } - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); - return; - } - - if (inode1 == inode2) { - lock_2_inodes(inode1, inode3); - return; - } - - /* 3 different inodes */ - if (inode1 < inode2) { - i3 = inode2; - if (inode1 < inode3) { - i1 = inode1; - i2 = inode3; - } else { - i1 = inode3; - i2 = inode1; - } - } else { - i3 = inode1; - if (inode2 < inode3) { - i1 = inode2; - i2 = inode3; - } else { - i1 = inode3; - i2 = inode2; - } - } - mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1); - lock_2_inodes(i2, i3); + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + if (inode2 != inode1) + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); + if (inode3) + mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3); } /** - * unlock_3_inodes - unlock three UBIFS inodes for rename. + * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename. * @inode1: first inode * @inode2: second inode * @inode3: third inode @@ -982,11 +957,11 @@ static void lock_3_inodes(struct inode *inode1, struct inode *inode2, static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, struct inode *inode3) { - mutex_unlock(&ubifs_inode(inode1)->ui_mutex); - if (inode1 != inode2) - mutex_unlock(&ubifs_inode(inode2)->ui_mutex); if (inode3) mutex_unlock(&ubifs_inode(inode3)->ui_mutex); + if (inode1 != inode2) + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); } static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -1020,6 +995,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, new_dentry->d_name.name, new_dir->i_ino); + ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); + ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + if (unlink) + ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + if (unlink && is_dir) { err = check_dir_empty(c, new_inode); @@ -1199,7 +1179,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -struct inode_operations ubifs_dir_inode_operations = { +const struct inode_operations ubifs_dir_inode_operations = { .lookup = ubifs_lookup, .create = ubifs_create, .link = ubifs_link, @@ -1219,7 +1199,7 @@ struct inode_operations ubifs_dir_inode_operations = { #endif }; -struct file_operations ubifs_dir_operations = { +const struct file_operations ubifs_dir_operations = { .llseek = ubifs_dir_llseek, .release = ubifs_dir_release, .read = generic_read_dir, diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index bf37374567f..93b6de51f26 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -432,7 +432,6 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, int uninitialized_var(err), appending = !!(pos + len > inode->i_size); struct page *page; - ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); if (unlikely(c->ro_media)) @@ -1541,7 +1540,7 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -struct address_space_operations ubifs_file_address_operations = { +const struct address_space_operations ubifs_file_address_operations = { .readpage = ubifs_readpage, .writepage = ubifs_writepage, .write_begin = ubifs_write_begin, @@ -1551,7 +1550,7 @@ struct address_space_operations ubifs_file_address_operations = { .releasepage = ubifs_releasepage, }; -struct inode_operations ubifs_file_inode_operations = { +const struct inode_operations ubifs_file_inode_operations = { .setattr = ubifs_setattr, .getattr = ubifs_getattr, #ifdef CONFIG_UBIFS_FS_XATTR @@ -1562,14 +1561,14 @@ struct inode_operations ubifs_file_inode_operations = { #endif }; -struct inode_operations ubifs_symlink_inode_operations = { +const struct inode_operations ubifs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ubifs_follow_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, }; -struct file_operations ubifs_file_operations = { +const struct file_operations ubifs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 9832f9abe28..a711d33b3d3 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -31,6 +31,26 @@ * to be reused. Garbage collection will cause the number of dirty index nodes * to grow, however sufficient space is reserved for the index to ensure the * commit will never run out of space. + * + * Notes about dead watermark. At current UBIFS implementation we assume that + * LEBs which have less than @c->dead_wm bytes of free + dirty space are full + * and not worth garbage-collecting. The dead watermark is one min. I/O unit + * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS + * Garbage Collector has to synchronize the GC head's write buffer before + * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can + * actually reclaim even very small pieces of dirty space by garbage collecting + * enough dirty LEBs, but we do not bother doing this at this implementation. + * + * Notes about dark watermark. The results of GC work depends on how big are + * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed, + * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would + * have to waste large pieces of free space at the end of LEB B, because nodes + * from LEB A would not fit. And the worst situation is when all nodes are of + * maximum size. So dark watermark is the amount of free + dirty space in LEB + * which are guaranteed to be reclaimable. If LEB has less space, the GC migh + * be unable to reclaim it. So, LEBs with free + dirty greater than dark + * watermark are "good" LEBs from GC's point of few. The other LEBs are not so + * good, and GC takes extra care when moving them. */ #include <linux/pagemap.h> @@ -381,7 +401,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) /* * Don't release the LEB until after the next commit, because - * it may contain date which is needed for recovery. So + * it may contain data which is needed for recovery. So * although we freed this LEB, it will become usable only after * the commit. */ @@ -810,8 +830,9 @@ out: * ubifs_destroy_idx_gc - destroy idx_gc list. * @c: UBIFS file-system description object * - * This function destroys the idx_gc list. It is called when unmounting or - * remounting read-only so locks are not needed. + * This function destroys the @c->idx_gc list. It is called when unmounting + * so locks are not needed. Returns zero in case of success and a negative + * error code in case of failure. */ void ubifs_destroy_idx_gc(struct ubifs_info *c) { @@ -824,7 +845,6 @@ void ubifs_destroy_idx_gc(struct ubifs_info *c) list_del(&idx_gc->list); kfree(idx_gc); } - } /** diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 01682713af6..e8e632a1dcd 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -29,7 +29,7 @@ * would have been wasted for padding to the nearest minimal I/O unit boundary. * Instead, data first goes to the write-buffer and is flushed when the * buffer is full or when it is not used for some time (by timer). This is - * similarto the mechanism is used by JFFS2. + * similar to the mechanism is used by JFFS2. * * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by * mutexes defined inside these objects. Since sometimes upper-level code @@ -75,7 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock * @quiet: print no messages - * @chk_crc: indicates whether to always check the CRC + * @must_chk_crc: indicates whether to always check the CRC * * This function checks node magic number and CRC checksum. This function also * validates node length to prevent UBIFS from becoming crazy when an attacker @@ -83,11 +83,17 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * node length in the common header could cause UBIFS to read memory outside of * allocated buffer when checking the CRC checksum. * - * This function returns zero in case of success %-EUCLEAN in case of bad CRC - * or magic. + * This function may skip data nodes CRC checking if @c->no_chk_data_crc is + * true, which is controlled by corresponding UBIFS mount option. However, if + * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is + * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is + * ignored and CRC is checked. + * + * This function returns zero in case of success and %-EUCLEAN in case of bad + * CRC or magic. */ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet, int chk_crc) + int offs, int quiet, int must_chk_crc) { int err = -EINVAL, type, node_len; uint32_t crc, node_crc, magic; @@ -123,9 +129,9 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, node_len > c->ranges[type].max_len) goto out_len; - if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc) - if (c->no_chk_data_crc) - return 0; + if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc && + c->no_chk_data_crc) + return 0; crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); node_crc = le32_to_cpu(ch->crc); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 9b7c54e0cd2..a11ca0958a2 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -208,7 +208,7 @@ again: offs = 0; out: - err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM); + err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); if (err) goto out_unlock; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index dfd2bcece27..4cdd284dea5 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -635,10 +635,10 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, * @c: UBIFS file-system description object * @st: return statistics */ -void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st) +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst) { spin_lock(&c->space_lock); - memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats)); + memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats)); spin_unlock(&c->space_lock); } @@ -678,6 +678,9 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, out: ubifs_release_lprops(c); + if (err) + ubifs_err("cannot change properties of LEB %d, error %d", + lnum, err); return err; } @@ -714,6 +717,9 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, out: ubifs_release_lprops(c); + if (err) + ubifs_err("cannot update properties of LEB %d, error %d", + lnum, err); return err; } @@ -737,6 +743,8 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp) lpp = ubifs_lpt_lookup(c, lnum); if (IS_ERR(lpp)) { err = PTR_ERR(lpp); + ubifs_err("cannot read properties of LEB %d, error %d", + lnum, err); goto out; } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 96ca9570717..3216a1f277f 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -556,23 +556,23 @@ no_space: } /** - * next_pnode - find next pnode. + * next_pnode_to_dirty - find next pnode to dirty. * @c: UBIFS file-system description object * @pnode: pnode * - * This function returns the next pnode or %NULL if there are no more pnodes. + * This function returns the next pnode to dirty or %NULL if there are no more + * pnodes. Note that pnodes that have never been written (lnum == 0) are + * skipped. */ -static struct ubifs_pnode *next_pnode(struct ubifs_info *c, - struct ubifs_pnode *pnode) +static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, + struct ubifs_pnode *pnode) { struct ubifs_nnode *nnode; int iip; /* Try to go right */ nnode = pnode->parent; - iip = pnode->iip + 1; - if (iip < UBIFS_LPT_FANOUT) { - /* We assume here that LEB zero is never an LPT LEB */ + for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { if (nnode->nbranch[iip].lnum) return ubifs_get_pnode(c, nnode, iip); } @@ -583,8 +583,11 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c, nnode = nnode->parent; if (!nnode) return NULL; - /* We assume here that LEB zero is never an LPT LEB */ - } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum); + for (; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + break; + } + } while (iip >= UBIFS_LPT_FANOUT); /* Go right */ nnode = ubifs_get_nnode(c, nnode, iip); @@ -593,12 +596,29 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c, /* Go down to level 1 */ while (nnode->level > 1) { - nnode = ubifs_get_nnode(c, nnode, 0); + for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + break; + } + if (iip >= UBIFS_LPT_FANOUT) { + /* + * Should not happen, but we need to keep going + * if it does. + */ + iip = 0; + } + nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) return (void *)nnode; } - return ubifs_get_pnode(c, nnode, 0); + for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) + if (nnode->nbranch[iip].lnum) + break; + if (iip >= UBIFS_LPT_FANOUT) + /* Should not happen, but we need to keep going if it does */ + iip = 0; + return ubifs_get_pnode(c, nnode, iip); } /** @@ -688,7 +708,7 @@ static int make_tree_dirty(struct ubifs_info *c) pnode = pnode_lookup(c, 0); while (pnode) { do_make_pnode_dirty(c, pnode); - pnode = next_pnode(c, pnode); + pnode = next_pnode_to_dirty(c, pnode); if (IS_ERR(pnode)) return PTR_ERR(pnode); } diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 71d5493bf56..a88f33801b9 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -354,7 +354,7 @@ int ubifs_write_master(struct ubifs_info *c) int err, lnum, offs, len; if (c->ro_media) - return -EINVAL; + return -EROFS; lnum = UBIFS_MST_LNUM; offs = c->mst_offs + c->mst_node_alsz; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 9e6f403f170..152a7b34a14 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -46,7 +46,7 @@ * Orphans are accumulated in a rb-tree. When an inode's link count drops to * zero, the inode number is added to the rb-tree. It is removed from the tree * when the inode is deleted. Any new orphans that are in the orphan tree when - * the commit is run, are written to the orphan area in 1 or more orph nodes. + * the commit is run, are written to the orphan area in 1 or more orphan nodes. * If the orphan area is full, it is consolidated to make space. There is * always enough space because validation prevents the user from creating more * than the maximum number of orphans allowed. @@ -231,7 +231,7 @@ static int tot_avail_orphs(struct ubifs_info *c) } /** - * do_write_orph_node - write a node + * do_write_orph_node - write a node to the orphan head. * @c: UBIFS file-system description object * @len: length of node * @atomic: write atomically @@ -264,11 +264,11 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) } /** - * write_orph_node - write an orph node + * write_orph_node - write an orphan node. * @c: UBIFS file-system description object * @atomic: write atomically * - * This function builds an orph node from the cnext list and writes it to the + * This function builds an orphan node from the cnext list and writes it to the * orphan head. On success, %0 is returned, otherwise a negative error code * is returned. */ @@ -326,11 +326,11 @@ static int write_orph_node(struct ubifs_info *c, int atomic) } /** - * write_orph_nodes - write orph nodes until there are no more to commit + * write_orph_nodes - write orphan nodes until there are no more to commit. * @c: UBIFS file-system description object * @atomic: write atomically * - * This function writes orph nodes for all the orphans to commit. On success, + * This function writes orphan nodes for all the orphans to commit. On success, * %0 is returned, otherwise a negative error code is returned. */ static int write_orph_nodes(struct ubifs_info *c, int atomic) @@ -478,14 +478,14 @@ int ubifs_orphan_end_commit(struct ubifs_info *c) } /** - * clear_orphans - erase all LEBs used for orphans. + * ubifs_clear_orphans - erase all LEBs used for orphans. * @c: UBIFS file-system description object * * If recovery is not required, then the orphans from the previous session * are not needed. This function locates the LEBs used to record * orphans, and un-maps them. */ -static int clear_orphans(struct ubifs_info *c) +int ubifs_clear_orphans(struct ubifs_info *c) { int lnum, err; @@ -547,9 +547,9 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) * do_kill_orphans - remove orphan inodes from the index. * @c: UBIFS file-system description object * @sleb: scanned LEB - * @last_cmt_no: cmt_no of last orph node read is passed and returned here + * @last_cmt_no: cmt_no of last orphan node read is passed and returned here * @outofdate: whether the LEB is out of date is returned here - * @last_flagged: whether the end orph node is encountered + * @last_flagged: whether the end orphan node is encountered * * This function is a helper to the 'kill_orphans()' function. It goes through * every orphan node in a LEB and for every inode number recorded, removes @@ -580,8 +580,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, /* * The commit number on the master node may be less, because * of a failed commit. If there are several failed commits in a - * row, the commit number written on orph nodes will continue to - * increase (because the commit number is adjusted here) even + * row, the commit number written on orphan nodes will continue + * to increase (because the commit number is adjusted here) even * though the commit number on the master node stays the same * because the master node has not been re-written. */ @@ -589,9 +589,9 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, c->cmt_no = cmt_no; if (cmt_no < *last_cmt_no && *last_flagged) { /* - * The last orph node had a higher commit number and was - * flagged as the last written for that commit number. - * That makes this orph node, out of date. + * The last orphan node had a higher commit number and + * was flagged as the last written for that commit + * number. That makes this orphan node, out of date. */ if (!first) { ubifs_err("out of order commit number %llu in " @@ -658,10 +658,10 @@ static int kill_orphans(struct ubifs_info *c) /* * Orph nodes always start at c->orph_first and are written to each * successive LEB in turn. Generally unused LEBs will have been unmapped - * but may contain out of date orph nodes if the unmap didn't go - * through. In addition, the last orph node written for each commit is + * but may contain out of date orphan nodes if the unmap didn't go + * through. In addition, the last orphan node written for each commit is * marked (top bit of orph->cmt_no is set to 1). It is possible that - * there are orph nodes from the next commit (i.e. the commit did not + * there are orphan nodes from the next commit (i.e. the commit did not * complete successfully). In that case, no orphans will have been lost * due to the way that orphans are written, and any orphans added will * be valid orphans anyway and so can be deleted. @@ -718,7 +718,7 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only) if (unclean) err = kill_orphans(c); else if (!read_only) - err = clear_orphans(c); + err = ubifs_clear_orphans(c); return err; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 89556ee7251..1182b66a549 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -397,6 +397,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = UBIFS_MAX_NLEN; buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); + ubifs_assert(buf->f_bfree <= c->block_cnt); return 0; } @@ -432,33 +433,24 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) int i, err; struct ubifs_info *c = sb->s_fs_info; struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .sync_mode = WB_SYNC_ALL, .range_start = 0, .range_end = LLONG_MAX, .nr_to_write = LONG_MAX, }; /* - * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an - * advisory thing to help the file system shove lots of data into the - * queues. If some gets missed then it'll be picked up on the second + * Zero @wait is just an advisory thing to help the file system shove + * lots of data into the queues, and there will be the second * '->sync_fs()' call, with non-zero @wait. */ + if (!wait) + return 0; if (sb->s_flags & MS_RDONLY) return 0; /* - * Synchronize write buffers, because 'ubifs_run_commit()' does not - * do this if it waits for an already running commit. - */ - for (i = 0; i < c->jhead_cnt; i++) { - err = ubifs_wbuf_sync(&c->jheads[i].wbuf); - if (err) - return err; - } - - /* * VFS calls '->sync_fs()' before synchronizing all dirty inodes and * pages, so synchronize them first, then commit the journal. Strictly * speaking, it is not necessary to commit the journal here, @@ -469,6 +461,16 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) */ generic_sync_sb_inodes(sb, &wbc); + /* + * Synchronize write buffers, because 'ubifs_run_commit()' does not + * do this if it waits for an already running commit. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + err = ubifs_run_commit(c); if (err) return err; @@ -572,15 +574,8 @@ static int init_constants_early(struct ubifs_info *c) c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; /* - * Initialize dead and dark LEB space watermarks. - * - * Dead space is the space which cannot be used. Its watermark is - * equivalent to min. I/O unit or minimum node size if it is greater - * then min. I/O unit. - * - * Dark space is the space which might be used, or might not, depending - * on which node should be written to the LEB. Its watermark is - * equivalent to maximum UBIFS node size. + * Initialize dead and dark LEB space watermarks. See gc.c for comments + * about these values. */ c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); @@ -741,12 +736,12 @@ static void init_constants_master(struct ubifs_info *c) * take_gc_lnum - reserve GC LEB. * @c: UBIFS file-system description object * - * This function ensures that the LEB reserved for garbage collection is - * unmapped and is marked as "taken" in lprops. We also have to set free space - * to LEB size and dirty space to zero, because lprops may contain out-of-date - * information if the file-system was un-mounted before it has been committed. - * This function returns zero in case of success and a negative error code in - * case of failure. + * This function ensures that the LEB reserved for garbage collection is marked + * as "taken" in lprops. We also have to set free space to LEB size and dirty + * space to zero, because lprops may contain out-of-date information if the + * file-system was un-mounted before it has been committed. This function + * returns zero in case of success and a negative error code in case of + * failure. */ static int take_gc_lnum(struct ubifs_info *c) { @@ -757,10 +752,6 @@ static int take_gc_lnum(struct ubifs_info *c) return -EINVAL; } - err = ubifs_leb_unmap(c, c->gc_lnum); - if (err) - return err; - /* And we have to tell lprops that this LEB is taken */ err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, LPROPS_TAKEN, 0, 0); @@ -966,13 +957,16 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, token = match_token(p, tokens, args); switch (token) { + /* + * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. + * We accepte them in order to be backware-compatible. But this + * should be removed at some point. + */ case Opt_fast_unmount: c->mount_opts.unmount_mode = 2; - c->fast_unmount = 1; break; case Opt_norm_unmount: c->mount_opts.unmount_mode = 1; - c->fast_unmount = 0; break; case Opt_bulk_read: c->mount_opts.bulk_read = 2; @@ -1094,12 +1088,7 @@ static int check_free_space(struct ubifs_info *c) ubifs_err("insufficient free space to mount in read/write mode"); dbg_dump_budg(c); dbg_dump_lprops(c); - /* - * We return %-EINVAL instead of %-ENOSPC because it seems to - * be the closest error code mentioned in the mount function - * documentation. - */ - return -EINVAL; + return -ENOSPC; } return 0; } @@ -1286,10 +1275,19 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_orphans; err = ubifs_rcvry_gc_commit(c); - } else + } else { err = take_gc_lnum(c); - if (err) - goto out_orphans; + if (err) + goto out_orphans; + + /* + * GC LEB may contain garbage if there was an unclean + * reboot, and it should be un-mapped. + */ + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + return err; + } err = dbg_check_lprops(c); if (err) @@ -1298,6 +1296,16 @@ static int mount_ubifs(struct ubifs_info *c) err = ubifs_recover_size(c); if (err) goto out_orphans; + } else { + /* + * Even if we mount read-only, we have to set space in GC LEB + * to proper value because this affects UBIFS free space + * reporting. We do not want to have a situation when + * re-mounting from R/O to R/W changes amount of free space. + */ + err = take_gc_lnum(c); + if (err) + goto out_orphans; } spin_lock(&ubifs_infos_lock); @@ -1310,14 +1318,17 @@ static int mount_ubifs(struct ubifs_info *c) else { c->need_recovery = 0; ubifs_msg("recovery completed"); + /* GC LEB has to be empty and taken at this point */ + ubifs_assert(c->lst.taken_empty_lebs == 1); } - } + } else + ubifs_assert(c->lst.taken_empty_lebs == 1); - err = dbg_debugfs_init_fs(c); + err = dbg_check_filesystem(c); if (err) goto out_infos; - err = dbg_check_filesystem(c); + err = dbg_debugfs_init_fs(c); if (err) goto out_infos; @@ -1351,7 +1362,6 @@ static int mount_ubifs(struct ubifs_info *c) c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); - dbg_msg("fast unmount: %d", c->fast_unmount); dbg_msg("big_lpt %d", c->big_lpt); dbg_msg("log LEBs: %d (%d - %d)", c->log_lebs, UBIFS_LOG_LNUM, c->log_last); @@ -1475,10 +1485,8 @@ static int ubifs_remount_rw(struct ubifs_info *c) { int err, lnum; - if (c->ro_media) - return -EINVAL; - mutex_lock(&c->umount_mutex); + dbg_save_space_info(c); c->remounting_rw = 1; c->always_chk_crc = 1; @@ -1514,6 +1522,12 @@ static int ubifs_remount_rw(struct ubifs_info *c) err = ubifs_recover_inl_heads(c, c->sbuf); if (err) goto out; + } else { + /* A readonly mount is not allowed to have orphans */ + ubifs_assert(c->tot_orphans == 0); + err = ubifs_clear_orphans(c); + if (err) + goto out; } if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { @@ -1569,7 +1583,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (c->need_recovery) err = ubifs_rcvry_gc_commit(c); else - err = take_gc_lnum(c); + err = ubifs_leb_unmap(c, c->gc_lnum); if (err) goto out; @@ -1582,8 +1596,9 @@ static int ubifs_remount_rw(struct ubifs_info *c) c->vfs_sb->s_flags &= ~MS_RDONLY; c->remounting_rw = 0; c->always_chk_crc = 0; + err = dbg_check_space_info(c); mutex_unlock(&c->umount_mutex); - return 0; + return err; out: vfree(c->orph_buf); @@ -1603,43 +1618,18 @@ out: } /** - * commit_on_unmount - commit the journal when un-mounting. - * @c: UBIFS file-system description object - * - * This function is called during un-mounting and re-mounting, and it commits - * the journal unless the "fast unmount" mode is enabled. - */ -static void commit_on_unmount(struct ubifs_info *c) -{ - struct super_block *sb = c->vfs_sb; - long long bud_bytes; - - /* - * This function is called before the background thread is stopped, so - * we may race with ongoing commit, which means we have to take - * @c->bud_lock to access @c->bud_bytes. - */ - spin_lock(&c->buds_lock); - bud_bytes = c->bud_bytes; - spin_unlock(&c->buds_lock); - - if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes) - ubifs_run_commit(c); -} - -/** * ubifs_remount_ro - re-mount in read-only mode. * @c: UBIFS file-system description object * - * We rely on VFS to have stopped writing. Possibly the background thread could - * be running a commit, however kthread_stop will wait in that case. + * We assume VFS has stopped writing. Possibly the background thread could be + * running a commit, however kthread_stop will wait in that case. */ static void ubifs_remount_ro(struct ubifs_info *c) { int i, err; ubifs_assert(!c->need_recovery); - commit_on_unmount(c); + ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); mutex_lock(&c->umount_mutex); if (c->bgt) { @@ -1647,27 +1637,29 @@ static void ubifs_remount_ro(struct ubifs_info *c) c->bgt = NULL; } + dbg_save_space_info(c); + for (i = 0; i < c->jhead_cnt; i++) { ubifs_wbuf_sync(&c->jheads[i].wbuf); del_timer_sync(&c->jheads[i].wbuf.timer); } - if (!c->ro_media) { - c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); - c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); - c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); - err = ubifs_write_master(c); - if (err) - ubifs_ro_mode(c, err); - } + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + ubifs_ro_mode(c, err); - ubifs_destroy_idx_gc(c); free_wbufs(c); vfree(c->orph_buf); c->orph_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); + err = dbg_check_space_info(c); + if (err) + ubifs_ro_mode(c, err); mutex_unlock(&c->umount_mutex); } @@ -1760,11 +1752,20 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) } if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + if (c->ro_media) { + ubifs_msg("cannot re-mount due to prior errors"); + return -EROFS; + } err = ubifs_remount_rw(c); if (err) return err; - } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { + if (c->ro_media) { + ubifs_msg("cannot re-mount due to prior errors"); + return -EROFS; + } ubifs_remount_ro(c); + } if (c->bulk_read == 1) bu_init(c); @@ -1774,10 +1775,11 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) c->bu.buf = NULL; } + ubifs_assert(c->lst.taken_empty_lebs == 1); return 0; } -struct super_operations ubifs_super_operations = { +const struct super_operations ubifs_super_operations = { .alloc_inode = ubifs_alloc_inode, .destroy_inode = ubifs_destroy_inode, .put_super = ubifs_put_super, @@ -2044,15 +2046,6 @@ out_close: static void ubifs_kill_sb(struct super_block *sb) { - struct ubifs_info *c = sb->s_fs_info; - - /* - * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()' - * in order to be outside BKL. - */ - if (sb->s_root) - commit_on_unmount(c); - /* The un-mount routine is actually done in put_super() */ generic_shutdown_super(sb); } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index f7e36f54552..fa28a84c6a1 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -443,6 +443,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, * This function performs that same function as ubifs_read_node except that * it does not require that there is actually a node present and instead * the return code indicates if a node was read. + * + * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc + * is true (it is controlled by corresponding mount option). However, if + * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always + * checked. */ static int try_read_node(const struct ubifs_info *c, void *buf, int type, int len, int lnum, int offs) @@ -470,9 +475,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, if (node_len != len) return 0; - if (type == UBIFS_DATA_NODE && !c->always_chk_crc) - if (c->no_chk_data_crc) - return 0; + if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc) + return 1; crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); node_crc = le32_to_cpu(ch->crc); @@ -1506,7 +1510,7 @@ out: * * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares - * maxumum possible amount of nodes for bulk-read. + * maximum possible amount of nodes for bulk-read. */ int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) { diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index fc2a4cc66d0..039a68bee29 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -426,9 +426,9 @@ struct ubifs_unclean_leb { * LEB properties flags. * * LPROPS_UNCAT: not categorized - * LPROPS_DIRTY: dirty > 0, not index + * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index - * LPROPS_FREE: free > 0, not empty, not index + * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs * LPROPS_EMPTY: LEB is empty, not taken * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken @@ -961,7 +961,6 @@ struct ubifs_debug_info; * @cs_lock: commit state lock * @cmt_wq: wait queue to sleep on if the log is full and a commit is running * - * @fast_unmount: do not run journal commit before un-mounting * @big_lpt: flag that LPT is too big to write whole during commit * @no_chk_data_crc: do not check CRCs when reading data nodes (except during * recovery) @@ -1202,7 +1201,6 @@ struct ubifs_info { spinlock_t cs_lock; wait_queue_head_t cmt_wq; - unsigned int fast_unmount:1; unsigned int big_lpt:1; unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; @@ -1405,13 +1403,13 @@ extern struct list_head ubifs_infos; extern spinlock_t ubifs_infos_lock; extern atomic_long_t ubifs_clean_zn_cnt; extern struct kmem_cache *ubifs_inode_slab; -extern struct super_operations ubifs_super_operations; -extern struct address_space_operations ubifs_file_address_operations; -extern struct file_operations ubifs_file_operations; -extern struct inode_operations ubifs_file_inode_operations; -extern struct file_operations ubifs_dir_operations; -extern struct inode_operations ubifs_dir_inode_operations; -extern struct inode_operations ubifs_symlink_inode_operations; +extern const struct super_operations ubifs_super_operations; +extern const struct address_space_operations ubifs_file_address_operations; +extern const struct file_operations ubifs_file_operations; +extern const struct inode_operations ubifs_file_inode_operations; +extern const struct file_operations ubifs_dir_operations; +extern const struct inode_operations ubifs_dir_inode_operations; +extern const struct inode_operations ubifs_symlink_inode_operations; extern struct backing_dev_info ubifs_backing_dev_info; extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; @@ -1428,7 +1426,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, int offs, int dtype); int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet, int chk_crc); + int offs, int quiet, int must_chk_crc); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); int ubifs_io_init(struct ubifs_info *c); @@ -1495,6 +1493,7 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, struct ubifs_budget_req *req); long long ubifs_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space_nolock(struct ubifs_info *c); int ubifs_calc_min_idx_lebs(struct ubifs_info *c); void ubifs_convert_page_budget(struct ubifs_info *c); long long ubifs_reported_space(const struct ubifs_info *c, long long free); @@ -1603,6 +1602,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum); int ubifs_orphan_start_commit(struct ubifs_info *c); int ubifs_orphan_end_commit(struct ubifs_info *c); int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); +int ubifs_clear_orphans(struct ubifs_info *c); /* lpt.c */ int ubifs_calc_lpt_geom(struct ubifs_info *c); @@ -1646,7 +1646,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, const struct ubifs_lprops *lp, int free, int dirty, int flags, int idx_gc_cnt); -void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst); void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, int cat); void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig new file mode 100644 index 00000000000..0e0e99bd6bc --- /dev/null +++ b/fs/udf/Kconfig @@ -0,0 +1,18 @@ +config UDF_FS + tristate "UDF file system support" + select CRC_ITU_T + help + This is the new file system used on some CD-ROMs and DVDs. Say Y if + you intend to mount DVD discs or CDRW's written in packet mode, or + if written to by other UDF utilities, such as DirectCD. + Please read <file:Documentation/filesystems/udf.txt>. + + To compile this file system support as a module, choose M here: the + module will be called udf. + + If unsure, say N. + +config UDF_NLS + bool + default y + depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y) diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig new file mode 100644 index 00000000000..e4f10a40768 --- /dev/null +++ b/fs/ufs/Kconfig @@ -0,0 +1,43 @@ +config UFS_FS + tristate "UFS file system support (read only)" + depends on BLOCK + help + BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, + OpenBSD and NeXTstep) use a file system called UFS. Some System V + Unixes can create and mount hard disk partitions and diskettes using + this file system as well. Saying Y here will allow you to read from + these partitions; if you also want to write to them, say Y to the + experimental "UFS file system write support", below. Please read the + file <file:Documentation/filesystems/ufs.txt> for more information. + + The recently released UFS2 variant (used in FreeBSD 5.x) is + READ-ONLY supported. + + Note that this option is generally not needed for floppies, since a + good portable way to transport files and directories between unixes + (and even other operating systems) is given by the tar program ("man + tar" or preferably "info tar"). + + When accessing NeXTstep files, you may need to convert them from the + NeXT character set to the Latin1 character set; use the program + recode ("info recode") for this purpose. + + To compile the UFS file system support as a module, choose M here: the + module will be called ufs. + + If you haven't heard about all of this before, it's safe to say N. + +config UFS_FS_WRITE + bool "UFS file system write support (DANGEROUS)" + depends on UFS_FS && EXPERIMENTAL + help + Say Y here if you want to try writing to UFS partitions. This is + experimental, so you should back up your UFS partitions beforehand. + +config UFS_DEBUG + bool "UFS debugging" + depends on UFS_FS + help + If you are experiencing any problems with the UFS filesystem, say + Y here. This will result in _many_ additional debugging messages to be + written to the system log. diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 3f53dd101f9..29228f5899c 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -1,6 +1,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK + select EXPORTFS help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index d71dc44e21e..cb329edc925 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -166,6 +166,75 @@ test_page_region( } /* + * Mapping of multi-page buffers into contiguous virtual space + */ + +typedef struct a_list { + void *vm_addr; + struct a_list *next; +} a_list_t; + +static a_list_t *as_free_head; +static int as_list_len; +static DEFINE_SPINLOCK(as_lock); + +/* + * Try to batch vunmaps because they are costly. + */ +STATIC void +free_address( + void *addr) +{ + a_list_t *aentry; + +#ifdef CONFIG_XEN + /* + * Xen needs to be able to make sure it can get an exclusive + * RO mapping of pages it wants to turn into a pagetable. If + * a newly allocated page is also still being vmap()ed by xfs, + * it will cause pagetable construction to fail. This is a + * quick workaround to always eagerly unmap pages so that Xen + * is happy. + */ + vunmap(addr); + return; +#endif + + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); + if (likely(aentry)) { + spin_lock(&as_lock); + aentry->next = as_free_head; + aentry->vm_addr = addr; + as_free_head = aentry; + as_list_len++; + spin_unlock(&as_lock); + } else { + vunmap(addr); + } +} + +STATIC void +purge_addresses(void) +{ + a_list_t *aentry, *old; + + if (as_free_head == NULL) + return; + + spin_lock(&as_lock); + aentry = as_free_head; + as_free_head = NULL; + as_list_len = 0; + spin_unlock(&as_lock); + + while ((old = aentry) != NULL) { + vunmap(aentry->vm_addr); + aentry = aentry->next; + kfree(old); + } +} + +/* * Internal xfs_buf_t object manipulation */ @@ -264,7 +333,7 @@ xfs_buf_free( uint i; if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) - vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count); + free_address(bp->b_addr - bp->b_offset); for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; @@ -386,8 +455,10 @@ _xfs_buf_map_pages( bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + if (as_list_len > 64) + purge_addresses(); + bp->b_addr = vmap(bp->b_pages, bp->b_page_count, + VM_MAP, PAGE_KERNEL); if (unlikely(bp->b_addr == NULL)) return -ENOMEM; bp->b_addr += bp->b_offset; @@ -1672,6 +1743,8 @@ xfsbufd( count++; } + if (as_list_len > 0) + purge_addresses(); if (count) blk_run_address_space(target->bt_mapping); diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index e5be1e0be80..4bd112313f3 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -50,12 +50,14 @@ #include "xfs_vnodeops.h" #include "xfs_quota.h" #include "xfs_inode_item.h" +#include "xfs_export.h" #include <linux/capability.h> #include <linux/dcache.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/pagemap.h> +#include <linux/exportfs.h> /* * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to @@ -164,97 +166,69 @@ xfs_find_handle( return 0; } - /* - * Convert userspace handle data into inode. - * - * We use the fact that all the fsop_handlereq ioctl calls have a data - * structure argument whose first component is always a xfs_fsop_handlereq_t, - * so we can pass that sub structure into this handy, shared routine. - * - * If no error, caller must always iput the returned inode. + * No need to do permission checks on the various pathname components + * as the handle operations are privileged. */ STATIC int -xfs_vget_fsop_handlereq( - xfs_mount_t *mp, - struct inode *parinode, /* parent inode pointer */ - xfs_fsop_handlereq_t *hreq, - struct inode **inode) +xfs_handle_acceptable( + void *context, + struct dentry *dentry) +{ + return 1; +} + +/* + * Convert userspace handle data into a dentry. + */ +struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen) { - void __user *hanp; - size_t hlen; - xfs_fid_t *xfid; - xfs_handle_t *handlep; xfs_handle_t handle; - xfs_inode_t *ip; - xfs_ino_t ino; - __u32 igen; - int error; + struct xfs_fid64 fid; /* * Only allow handle opens under a directory. */ - if (!S_ISDIR(parinode->i_mode)) - return XFS_ERROR(ENOTDIR); - - hanp = hreq->ihandle; - hlen = hreq->ihandlen; - handlep = &handle; - - if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) - return XFS_ERROR(EINVAL); - if (copy_from_user(handlep, hanp, hlen)) - return XFS_ERROR(EFAULT); - if (hlen < sizeof(*handlep)) - memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); - if (hlen > sizeof(handlep->ha_fsid)) { - if (handlep->ha_fid.fid_len != - (hlen - sizeof(handlep->ha_fsid) - - sizeof(handlep->ha_fid.fid_len)) || - handlep->ha_fid.fid_pad) - return XFS_ERROR(EINVAL); - } - - /* - * Crack the handle, obtain the inode # & generation # - */ - xfid = (struct xfs_fid *)&handlep->ha_fid; - if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) { - ino = xfid->fid_ino; - igen = xfid->fid_gen; - } else { - return XFS_ERROR(EINVAL); - } - - /* - * Get the XFS inode, building a Linux inode to go with it. - */ - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); - if (error) - return error; - if (ip == NULL) - return XFS_ERROR(EIO); - if (ip->i_d.di_gen != igen) { - xfs_iput_new(ip, XFS_ILOCK_SHARED); - return XFS_ERROR(ENOENT); - } - - xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode)) + return ERR_PTR(-ENOTDIR); + + if (hlen != sizeof(xfs_handle_t)) + return ERR_PTR(-EINVAL); + if (copy_from_user(&handle, uhandle, hlen)) + return ERR_PTR(-EFAULT); + if (handle.ha_fid.fid_len != + sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len)) + return ERR_PTR(-EINVAL); + + memset(&fid, 0, sizeof(struct fid)); + fid.ino = handle.ha_fid.fid_ino; + fid.gen = handle.ha_fid.fid_gen; + + return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3, + FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG, + xfs_handle_acceptable, NULL); +} - *inode = VFS_I(ip); - return 0; +STATIC struct dentry * +xfs_handlereq_to_dentry( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen); } int xfs_open_by_handle( - xfs_mount_t *mp, - xfs_fsop_handlereq_t *hreq, struct file *parfilp, - struct inode *parinode) + xfs_fsop_handlereq_t *hreq) { const struct cred *cred = current_cred(); int error; - int new_fd; + int fd; int permflag; struct file *filp; struct inode *inode; @@ -263,19 +237,21 @@ xfs_open_by_handle( if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); - error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode); - if (error) - return -error; + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + inode = dentry->d_inode; /* Restrict xfs_open_by_handle to directories & regular files. */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - iput(inode); - return -XFS_ERROR(EINVAL); + error = -XFS_ERROR(EPERM); + goto out_dput; } #if BITS_PER_LONG != 32 hreq->oflags |= O_LARGEFILE; #endif + /* Put open permission in namei format. */ permflag = hreq->oflags; if ((permflag+1) & O_ACCMODE) @@ -285,50 +261,45 @@ xfs_open_by_handle( if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && (permflag & FMODE_WRITE) && IS_APPEND(inode)) { - iput(inode); - return -XFS_ERROR(EPERM); + error = -XFS_ERROR(EPERM); + goto out_dput; } if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - iput(inode); - return -XFS_ERROR(EACCES); + error = -XFS_ERROR(EACCES); + goto out_dput; } /* Can't write directories. */ - if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { - iput(inode); - return -XFS_ERROR(EISDIR); + if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { + error = -XFS_ERROR(EISDIR); + goto out_dput; } - if ((new_fd = get_unused_fd()) < 0) { - iput(inode); - return new_fd; + fd = get_unused_fd(); + if (fd < 0) { + error = fd; + goto out_dput; } - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - put_unused_fd(new_fd); - return PTR_ERR(dentry); - } - - /* Ensure umount returns EBUSY on umounts while this file is open. */ - mntget(parfilp->f_path.mnt); - - /* Create file pointer. */ - filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred); + filp = dentry_open(dentry, mntget(parfilp->f_path.mnt), + hreq->oflags, cred); if (IS_ERR(filp)) { - put_unused_fd(new_fd); - return -XFS_ERROR(-PTR_ERR(filp)); + put_unused_fd(fd); + return PTR_ERR(filp); } if (inode->i_mode & S_IFREG) { - /* invisible operation should not change atime */ filp->f_flags |= O_NOATIME; filp->f_mode |= FMODE_NOCMTIME; } - fd_install(new_fd, filp); - return new_fd; + fd_install(fd, filp); + return fd; + + out_dput: + dput(dentry); + return error; } /* @@ -359,11 +330,10 @@ do_readlink( int xfs_readlink_by_handle( - xfs_mount_t *mp, - xfs_fsop_handlereq_t *hreq, - struct inode *parinode) + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) { - struct inode *inode; + struct dentry *dentry; __u32 olen; void *link; int error; @@ -371,26 +341,28 @@ xfs_readlink_by_handle( if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); - error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode); - if (error) - return -error; + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); /* Restrict this handle operation to symlinks only. */ - if (!S_ISLNK(inode->i_mode)) { + if (!S_ISLNK(dentry->d_inode->i_mode)) { error = -XFS_ERROR(EINVAL); - goto out_iput; + goto out_dput; } if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { error = -XFS_ERROR(EFAULT); - goto out_iput; + goto out_dput; } link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); - if (!link) - goto out_iput; + if (!link) { + error = -XFS_ERROR(ENOMEM); + goto out_dput; + } - error = -xfs_readlink(XFS_I(inode), link); + error = -xfs_readlink(XFS_I(dentry->d_inode), link); if (error) goto out_kfree; error = do_readlink(hreq->ohandle, olen, link); @@ -399,32 +371,31 @@ xfs_readlink_by_handle( out_kfree: kfree(link); - out_iput: - iput(inode); + out_dput: + dput(dentry); return error; } STATIC int xfs_fssetdm_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct inode *parinode) + struct file *parfilp, + void __user *arg) { int error; struct fsdmidata fsd; xfs_fsop_setdm_handlereq_t dmhreq; - struct inode *inode; + struct dentry *dentry; if (!capable(CAP_MKNOD)) return -XFS_ERROR(EPERM); if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &inode); - if (error) - return -error; + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { error = -XFS_ERROR(EPERM); goto out; } @@ -434,24 +405,23 @@ xfs_fssetdm_by_handle( goto out; } - error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask, + error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: - iput(inode); + dput(dentry); return error; } STATIC int xfs_attrlist_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct inode *parinode) + struct file *parfilp, + void __user *arg) { - int error; + int error = -ENOMEM; attrlist_cursor_kern_t *cursor; xfs_fsop_attrlist_handlereq_t al_hreq; - struct inode *inode; + struct dentry *dentry; char *kbuf; if (!capable(CAP_SYS_ADMIN)) @@ -467,16 +437,16 @@ xfs_attrlist_by_handle( if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) return -XFS_ERROR(EINVAL); - error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode); - if (error) - goto out; + dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); if (!kbuf) - goto out_vn_rele; + goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen, + error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -486,10 +456,9 @@ xfs_attrlist_by_handle( out_kfree: kfree(kbuf); - out_vn_rele: - iput(inode); - out: - return -error; + out_dput: + dput(dentry); + return error; } int @@ -564,15 +533,13 @@ xfs_attrmulti_attr_remove( STATIC int xfs_attrmulti_by_handle( - xfs_mount_t *mp, - void __user *arg, struct file *parfilp, - struct inode *parinode) + void __user *arg) { int error; xfs_attr_multiop_t *ops; xfs_fsop_attrmulti_handlereq_t am_hreq; - struct inode *inode; + struct dentry *dentry; unsigned int i, size; char *attr_name; @@ -581,19 +548,19 @@ xfs_attrmulti_by_handle( if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &inode); - if (error) - goto out; + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); error = E2BIG; size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); if (!size || size > 16 * PAGE_SIZE) - goto out_vn_rele; + goto out_dput; error = ENOMEM; ops = kmalloc(size, GFP_KERNEL); if (!ops) - goto out_vn_rele; + goto out_dput; error = EFAULT; if (copy_from_user(ops, am_hreq.ops, size)) @@ -615,25 +582,28 @@ xfs_attrmulti_by_handle( switch (ops[i].am_opcode) { case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get(inode, - attr_name, ops[i].am_attrvalue, - &ops[i].am_length, ops[i].am_flags); + ops[i].am_error = xfs_attrmulti_attr_get( + dentry->d_inode, attr_name, + ops[i].am_attrvalue, &ops[i].am_length, + ops[i].am_flags); break; case ATTR_OP_SET: ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); if (ops[i].am_error) break; - ops[i].am_error = xfs_attrmulti_attr_set(inode, - attr_name, ops[i].am_attrvalue, - ops[i].am_length, ops[i].am_flags); + ops[i].am_error = xfs_attrmulti_attr_set( + dentry->d_inode, attr_name, + ops[i].am_attrvalue, ops[i].am_length, + ops[i].am_flags); mnt_drop_write(parfilp->f_path.mnt); break; case ATTR_OP_REMOVE: ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); if (ops[i].am_error) break; - ops[i].am_error = xfs_attrmulti_attr_remove(inode, - attr_name, ops[i].am_flags); + ops[i].am_error = xfs_attrmulti_attr_remove( + dentry->d_inode, attr_name, + ops[i].am_flags); mnt_drop_write(parfilp->f_path.mnt); break; default: @@ -647,9 +617,8 @@ xfs_attrmulti_by_handle( kfree(attr_name); out_kfree_ops: kfree(ops); - out_vn_rele: - iput(inode); - out: + out_dput: + dput(dentry); return -error; } @@ -1440,23 +1409,23 @@ xfs_file_ioctl( if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) return -XFS_ERROR(EFAULT); - return xfs_open_by_handle(mp, &hreq, filp, inode); + return xfs_open_by_handle(filp, &hreq); } case XFS_IOC_FSSETDM_BY_HANDLE: - return xfs_fssetdm_by_handle(mp, arg, inode); + return xfs_fssetdm_by_handle(filp, arg); case XFS_IOC_READLINK_BY_HANDLE: { xfs_fsop_handlereq_t hreq; if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) return -XFS_ERROR(EFAULT); - return xfs_readlink_by_handle(mp, &hreq, inode); + return xfs_readlink_by_handle(filp, &hreq); } case XFS_IOC_ATTRLIST_BY_HANDLE: - return xfs_attrlist_by_handle(mp, arg, inode); + return xfs_attrlist_by_handle(filp, arg); case XFS_IOC_ATTRMULTI_BY_HANDLE: - return xfs_attrmulti_by_handle(mp, arg, filp, inode); + return xfs_attrmulti_by_handle(filp, arg); case XFS_IOC_SWAPEXT: { struct xfs_swapext sxp; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h index 8c16bf2d7e0..7bd7c6afc1e 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.h +++ b/fs/xfs/linux-2.6/xfs_ioctl.h @@ -34,16 +34,13 @@ xfs_find_handle( extern int xfs_open_by_handle( - xfs_mount_t *mp, - xfs_fsop_handlereq_t *hreq, struct file *parfilp, - struct inode *parinode); + xfs_fsop_handlereq_t *hreq); extern int xfs_readlink_by_handle( - xfs_mount_t *mp, - xfs_fsop_handlereq_t *hreq, - struct inode *parinode); + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); extern int xfs_attrmulti_attr_get( @@ -67,6 +64,12 @@ xfs_attrmulti_attr_remove( char *name, __uint32_t flags); +extern struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen); + extern long xfs_file_ioctl( struct file *filp, diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 50903ad3182..c70c4e3db79 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -17,6 +17,7 @@ */ #include <linux/compat.h> #include <linux/ioctl.h> +#include <linux/mount.h> #include <asm/uaccess.h> #include "xfs.h" #include "xfs_fs.h" @@ -340,96 +341,24 @@ xfs_compat_handlereq_copyin( return 0; } -/* - * Convert userspace handle data into inode. - * - * We use the fact that all the fsop_handlereq ioctl calls have a data - * structure argument whose first component is always a xfs_fsop_handlereq_t, - * so we can pass that sub structure into this handy, shared routine. - * - * If no error, caller must always iput the returned inode. - */ -STATIC int -xfs_vget_fsop_handlereq_compat( - xfs_mount_t *mp, - struct inode *parinode, /* parent inode pointer */ - compat_xfs_fsop_handlereq_t *hreq, - struct inode **inode) +STATIC struct dentry * +xfs_compat_handlereq_to_dentry( + struct file *parfilp, + compat_xfs_fsop_handlereq_t *hreq) { - void __user *hanp; - size_t hlen; - xfs_fid_t *xfid; - xfs_handle_t *handlep; - xfs_handle_t handle; - xfs_inode_t *ip; - xfs_ino_t ino; - __u32 igen; - int error; - - /* - * Only allow handle opens under a directory. - */ - if (!S_ISDIR(parinode->i_mode)) - return XFS_ERROR(ENOTDIR); - - hanp = compat_ptr(hreq->ihandle); - hlen = hreq->ihandlen; - handlep = &handle; - - if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) - return XFS_ERROR(EINVAL); - if (copy_from_user(handlep, hanp, hlen)) - return XFS_ERROR(EFAULT); - if (hlen < sizeof(*handlep)) - memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); - if (hlen > sizeof(handlep->ha_fsid)) { - if (handlep->ha_fid.fid_len != - (hlen - sizeof(handlep->ha_fsid) - - sizeof(handlep->ha_fid.fid_len)) || - handlep->ha_fid.fid_pad) - return XFS_ERROR(EINVAL); - } - - /* - * Crack the handle, obtain the inode # & generation # - */ - xfid = (struct xfs_fid *)&handlep->ha_fid; - if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) { - ino = xfid->fid_ino; - igen = xfid->fid_gen; - } else { - return XFS_ERROR(EINVAL); - } - - /* - * Get the XFS inode, building a Linux inode to go with it. - */ - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); - if (error) - return error; - if (ip == NULL) - return XFS_ERROR(EIO); - if (ip->i_d.di_gen != igen) { - xfs_iput_new(ip, XFS_ILOCK_SHARED); - return XFS_ERROR(ENOENT); - } - - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - *inode = VFS_I(ip); - return 0; + return xfs_handle_to_dentry(parfilp, + compat_ptr(hreq->ihandle), hreq->ihandlen); } STATIC int xfs_compat_attrlist_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct inode *parinode) + struct file *parfilp, + void __user *arg) { int error; attrlist_cursor_kern_t *cursor; compat_xfs_fsop_attrlist_handlereq_t al_hreq; - struct inode *inode; + struct dentry *dentry; char *kbuf; if (!capable(CAP_SYS_ADMIN)) @@ -446,17 +375,17 @@ xfs_compat_attrlist_by_handle( if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) return -XFS_ERROR(EINVAL); - error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq, - &inode); - if (error) - goto out; + dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + error = -ENOMEM; kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); if (!kbuf) - goto out_vn_rele; + goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen, + error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -466,22 +395,20 @@ xfs_compat_attrlist_by_handle( out_kfree: kfree(kbuf); - out_vn_rele: - iput(inode); - out: - return -error; + out_dput: + dput(dentry); + return error; } STATIC int xfs_compat_attrmulti_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct inode *parinode) + struct file *parfilp, + void __user *arg) { int error; compat_xfs_attr_multiop_t *ops; compat_xfs_fsop_attrmulti_handlereq_t am_hreq; - struct inode *inode; + struct dentry *dentry; unsigned int i, size; char *attr_name; @@ -491,20 +418,19 @@ xfs_compat_attrmulti_by_handle( sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq, - &inode); - if (error) - goto out; + dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); error = E2BIG; size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); if (!size || size > 16 * PAGE_SIZE) - goto out_vn_rele; + goto out_dput; error = ENOMEM; ops = kmalloc(size, GFP_KERNEL); if (!ops) - goto out_vn_rele; + goto out_dput; error = EFAULT; if (copy_from_user(ops, compat_ptr(am_hreq.ops), size)) @@ -527,20 +453,29 @@ xfs_compat_attrmulti_by_handle( switch (ops[i].am_opcode) { case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get(inode, - attr_name, + ops[i].am_error = xfs_attrmulti_attr_get( + dentry->d_inode, attr_name, compat_ptr(ops[i].am_attrvalue), &ops[i].am_length, ops[i].am_flags); break; case ATTR_OP_SET: - ops[i].am_error = xfs_attrmulti_attr_set(inode, - attr_name, + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set( + dentry->d_inode, attr_name, compat_ptr(ops[i].am_attrvalue), ops[i].am_length, ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); break; case ATTR_OP_REMOVE: - ops[i].am_error = xfs_attrmulti_attr_remove(inode, - attr_name, ops[i].am_flags); + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove( + dentry->d_inode, attr_name, + ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); break; default: ops[i].am_error = EINVAL; @@ -553,22 +488,20 @@ xfs_compat_attrmulti_by_handle( kfree(attr_name); out_kfree_ops: kfree(ops); - out_vn_rele: - iput(inode); - out: + out_dput: + dput(dentry); return -error; } STATIC int xfs_compat_fssetdm_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct inode *parinode) + struct file *parfilp, + void __user *arg) { int error; struct fsdmidata fsd; compat_xfs_fsop_setdm_handlereq_t dmhreq; - struct inode *inode; + struct dentry *dentry; if (!capable(CAP_MKNOD)) return -XFS_ERROR(EPERM); @@ -576,12 +509,11 @@ xfs_compat_fssetdm_by_handle( sizeof(compat_xfs_fsop_setdm_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq, - &inode); - if (error) - return -error; + dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { error = -XFS_ERROR(EPERM); goto out; } @@ -591,11 +523,11 @@ xfs_compat_fssetdm_by_handle( goto out; } - error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask, + error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: - iput(inode); + dput(dentry); return error; } @@ -722,21 +654,21 @@ xfs_file_compat_ioctl( if (xfs_compat_handlereq_copyin(&hreq, arg)) return -XFS_ERROR(EFAULT); - return xfs_open_by_handle(mp, &hreq, filp, inode); + return xfs_open_by_handle(filp, &hreq); } case XFS_IOC_READLINK_BY_HANDLE_32: { struct xfs_fsop_handlereq hreq; if (xfs_compat_handlereq_copyin(&hreq, arg)) return -XFS_ERROR(EFAULT); - return xfs_readlink_by_handle(mp, &hreq, inode); + return xfs_readlink_by_handle(filp, &hreq); } case XFS_IOC_ATTRLIST_BY_HANDLE_32: - return xfs_compat_attrlist_by_handle(mp, arg, inode); + return xfs_compat_attrlist_by_handle(filp, arg); case XFS_IOC_ATTRMULTI_BY_HANDLE_32: - return xfs_compat_attrmulti_by_handle(mp, arg, inode); + return xfs_compat_attrmulti_by_handle(filp, arg); case XFS_IOC_FSSETDM_BY_HANDLE_32: - return xfs_compat_fssetdm_by_handle(mp, arg, inode); + return xfs_compat_fssetdm_by_handle(filp, arg); default: return -XFS_ERROR(ENOIOCTLCMD); } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 95a97108036..c71e226da7f 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1197,6 +1197,7 @@ xfs_fs_remount( struct xfs_mount *mp = XFS_M(sb); substring_t args[MAX_OPT_ARGS]; char *p; + int error; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1247,11 +1248,25 @@ xfs_fs_remount( } } - /* rw/ro -> rw */ + /* ro -> rw */ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { mp->m_flags &= ~XFS_MOUNT_RDONLY; if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_mountfs_check_barriers(mp); + + /* + * If this is the first remount to writeable state we + * might have some superblock changes to update. + */ + if (mp->m_update_flags) { + error = xfs_mount_log_sb(mp, mp->m_update_flags); + if (error) { + cmn_err(CE_WARN, + "XFS: failed to write sb changes"); + return error; + } + mp->m_update_flags = 0; + } } /* rw -> ro */ diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 2ed035354c2..a608e72fa40 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -371,7 +371,11 @@ xfs_quiesce_attr( /* flush inodes and push all remaining buffers out to disk */ xfs_quiesce_fs(mp); - ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. + */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); /* Push the superblock and write an unmount record */ error = xfs_log_sbcount(mp, 1); diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 591ca6602bf..6543c0b2975 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -73,6 +73,8 @@ int xfs_dqreq_num; int xfs_dqerror_mod = 33; #endif +static struct lock_class_key xfs_dquot_other_class; + /* * Allocate and initialize a dquot. We don't always allocate fresh memory; * we try to reclaim a free dquot if the number of incore dquots are above @@ -139,7 +141,15 @@ xfs_qm_dqinit( ASSERT(dqp->q_trace); xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT"); #endif - } + } + + /* + * In either case we need to make sure group quotas have a different + * lock class than user quotas, to make sure lockdep knows we can + * locks of one of each at the same time. + */ + if (!(type & XFS_DQ_USER)) + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); /* * log item gets initialized later @@ -421,7 +431,7 @@ xfs_qm_dqalloc( /* * Initialize the bmap freelist prior to calling bmapi code. */ - XFS_BMAP_INIT(&flist, &firstblock); + xfs_bmap_init(&flist, &firstblock); xfs_ilock(quotip, XFS_ILOCK_EXCL); /* * Return if this type of quotas is turned off while we didn't @@ -1383,6 +1393,12 @@ xfs_dqunlock_nonotify( mutex_unlock(&(dqp->q_qlock)); } +/* + * Lock two xfs_dquot structures. + * + * To avoid deadlocks we always lock the quota structure with + * the lowerd id first. + */ void xfs_dqlock2( xfs_dquot_t *d1, @@ -1392,18 +1408,16 @@ xfs_dqlock2( ASSERT(d1 != d2); if (be32_to_cpu(d1->q_core.d_id) > be32_to_cpu(d2->q_core.d_id)) { - xfs_dqlock(d2); - xfs_dqlock(d1); + mutex_lock(&d2->q_qlock); + mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); } else { - xfs_dqlock(d1); - xfs_dqlock(d2); - } - } else { - if (d1) { - xfs_dqlock(d1); - } else if (d2) { - xfs_dqlock(d2); + mutex_lock(&d1->q_qlock); + mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED); } + } else if (d1) { + mutex_lock(&d1->q_qlock); + } else if (d2) { + mutex_lock(&d2->q_qlock); } } diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index 7e455337e2b..d443e93b433 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -97,6 +97,16 @@ typedef struct xfs_dquot { #define dq_hashlist q_lists.dqm_hashlist #define dq_flags q_lists.dqm_flags +/* + * Lock hierachy for q_qlock: + * XFS_QLOCK_NORMAL is the implicit default, + * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 + */ +enum { + XFS_QLOCK_NORMAL = 0, + XFS_QLOCK_NESTED, +}; + #define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) #ifdef DEBUG diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 6b13960cf31..7a2beb64314 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -1070,6 +1070,13 @@ xfs_qm_sync( return 0; } +/* + * The hash chains and the mplist use the same xfs_dqhash structure as + * their list head, but we can take the mplist qh_lock and one of the + * hash qh_locks at the same time without any problem as they aren't + * related. + */ +static struct lock_class_key xfs_quota_mplist_class; /* * This initializes all the quota information that's kept in the @@ -1105,6 +1112,8 @@ xfs_qm_init_quotainfo( } xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); + lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); + qinf->qi_dqreclaims = 0; /* mutex used to serialize quotaoffs */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index d3b3cf74299..143d63ecb20 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -244,8 +244,8 @@ typedef struct xfs_perag #define XFS_AG_CHECK_DADDR(mp,d,len) \ ((len) == 1 ? \ ASSERT((d) == XFS_SB_DADDR || \ - XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \ - ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \ - XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1))) + xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ + ASSERT(xfs_daddr_to_agno(mp, d) == \ + xfs_daddr_to_agno(mp, (d) + (len) - 1))) #endif /* __XFS_AG_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 733cb75a8c5..c10c3a292d3 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -115,7 +115,7 @@ xfs_allocbt_free_block( xfs_agblock_t bno; int error; - bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp)); + bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index f7cdc28aff4..5fde1654b43 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -374,7 +374,7 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, * It won't fit in the shortform, transform to a leaf block. * GROT: another possible req'mt for a double-split btree op. */ - XFS_BMAP_INIT(args.flist, args.firstblock); + xfs_bmap_init(args.flist, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); if (!error) { error = xfs_bmap_finish(&args.trans, args.flist, @@ -956,7 +956,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit that transaction so that the node_addname() call * can manage its own transactions. */ - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1057,7 +1057,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { @@ -1135,7 +1135,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { @@ -1290,7 +1290,7 @@ restart: * have been a b-tree. */ xfs_da_state_free(state); - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1331,7 +1331,7 @@ restart: * in the index/blkno/rmtblkno/rmtblkcnt fields and * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. */ - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_da_split(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1443,7 +1443,7 @@ restart: * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_da_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1579,7 +1579,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_da_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1630,7 +1630,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) == XFS_ATTR_LEAF_MAGIC); if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { @@ -2069,7 +2069,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) /* * Allocate a single extent, up to the size of the value. */ - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, @@ -2123,7 +2123,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) /* * Try to remember where we decided to put the value. */ - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, args->rmtblkcnt, @@ -2188,7 +2188,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) /* * Try to remember where we decided to put the value. */ - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, args->rmtblkcnt, @@ -2229,7 +2229,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) blkcnt = args->rmtblkcnt; done = 0; while (!done) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 1, args->firstblock, args->flist, diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 138308e70d1..c852cd65aae 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -595,9 +595,9 @@ xfs_bmap_add_extent( xfs_iext_insert(ifp, 0, 1, new); ASSERT(cur == NULL); ifp->if_lastex = 0; - if (!ISNULLSTARTBLOCK(new->br_startblock)) { + if (!isnullstartblock(new->br_startblock)) { XFS_IFORK_NEXT_SET(ip, whichfork, 1); - logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else logflags = 0; /* DELTA: single new extent */ @@ -613,7 +613,7 @@ xfs_bmap_add_extent( /* * Any kind of new delayed allocation goes here. */ - else if (ISNULLSTARTBLOCK(new->br_startblock)) { + else if (isnullstartblock(new->br_startblock)) { if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); @@ -644,11 +644,11 @@ xfs_bmap_add_extent( * in a delayed or unwritten allocation with a real one, or * converting real back to unwritten. */ - if (!ISNULLSTARTBLOCK(new->br_startblock) && + if (!isnullstartblock(new->br_startblock) && new->br_startoff + new->br_blockcount > prev.br_startoff) { if (prev.br_state != XFS_EXT_UNWRITTEN && - ISNULLSTARTBLOCK(prev.br_startblock)) { - da_old = STARTBLOCKVAL(prev.br_startblock); + isnullstartblock(prev.br_startblock)) { + da_old = startblockval(prev.br_startblock); if (cur) ASSERT(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL); @@ -803,7 +803,7 @@ xfs_bmap_add_extent_delay_real( */ if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); + STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock)); } STATE_SET(LEFT_CONTIG, STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && @@ -820,7 +820,7 @@ xfs_bmap_add_extent_delay_real( idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); + STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock)); } STATE_SET(RIGHT_CONTIG, STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && @@ -1019,8 +1019,8 @@ xfs_bmap_add_extent_delay_real( goto done; } temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + startblockval(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: The boundary between two in-core extents moved. */ @@ -1067,10 +1067,10 @@ xfs_bmap_add_extent_delay_real( goto done; } temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock) - + startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, idx + 1); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK); *dnew = temp; /* DELTA: One in-core extent is split in two. */ @@ -1110,8 +1110,8 @@ xfs_bmap_add_extent_delay_real( goto done; } temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + startblockval(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: The boundary between two in-core extents moved. */ @@ -1157,10 +1157,10 @@ xfs_bmap_add_extent_delay_real( goto done; } temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock) - + startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: One in-core extent is split in two. */ @@ -1213,7 +1213,7 @@ xfs_bmap_add_extent_delay_real( } temp = xfs_bmap_worst_indlen(ip, temp); temp2 = xfs_bmap_worst_indlen(ip, temp2); - diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) - + diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); if (diff > 0 && xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { @@ -1241,11 +1241,11 @@ xfs_bmap_add_extent_delay_real( } } ep = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK); XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), - NULLSTARTBLOCK((int)temp2)); + nullstartblock((int)temp2)); XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); *dnew = temp + temp2; /* DELTA: One in-core extent is split in three. */ @@ -1365,7 +1365,7 @@ xfs_bmap_add_extent_unwritten_real( */ if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); + STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock)); } STATE_SET(LEFT_CONTIG, STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && @@ -1382,7 +1382,7 @@ xfs_bmap_add_extent_unwritten_real( idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); + STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock)); } STATE_SET(RIGHT_CONTIG, STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && @@ -1889,13 +1889,13 @@ xfs_bmap_add_extent_hole_delay( ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, idx); state = 0; - ASSERT(ISNULLSTARTBLOCK(new->br_startblock)); + ASSERT(isnullstartblock(new->br_startblock)); /* * Check and set flags if this segment has a left neighbor */ if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); + STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock)); } /* * Check and set flags if the current (right) segment exists. @@ -1905,7 +1905,7 @@ xfs_bmap_add_extent_hole_delay( idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { xfs_bmbt_get_all(ep, &right); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); + STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock)); } /* * Set contiguity flags on the left and right neighbors. @@ -1938,12 +1938,12 @@ xfs_bmap_add_extent_hole_delay( XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); - oldlen = STARTBLOCKVAL(left.br_startblock) + - STARTBLOCKVAL(new->br_startblock) + - STARTBLOCKVAL(right.br_startblock); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), - NULLSTARTBLOCK((int)newlen)); + nullstartblock((int)newlen)); XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, XFS_DATA_FORK); XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK); @@ -1964,11 +1964,11 @@ xfs_bmap_add_extent_hole_delay( XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); - oldlen = STARTBLOCKVAL(left.br_startblock) + - STARTBLOCKVAL(new->br_startblock); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), - NULLSTARTBLOCK((int)newlen)); + nullstartblock((int)newlen)); XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx - 1; @@ -1985,11 +1985,11 @@ xfs_bmap_add_extent_hole_delay( */ XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK); temp = new->br_blockcount + right.br_blockcount; - oldlen = STARTBLOCKVAL(new->br_startblock) + - STARTBLOCKVAL(right.br_startblock); + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_allf(ep, new->br_startoff, - NULLSTARTBLOCK((int)newlen), temp, right.br_state); + nullstartblock((int)newlen), temp, right.br_state); XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; /* DELTA: One in-core extent grew into a hole. */ @@ -2085,7 +2085,7 @@ xfs_bmap_add_extent_hole_real( */ if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); + STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock)); } /* * Check and set flags if this segment has a current value. @@ -2095,7 +2095,7 @@ xfs_bmap_add_extent_hole_real( idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { xfs_bmbt_get_all(ep, &right); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); + STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock)); } /* * We're inserting a real allocation between "left" and "right". @@ -2143,7 +2143,7 @@ xfs_bmap_add_extent_hole_real( XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) { - rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; if ((error = xfs_bmbt_lookup_eq(cur, @@ -2185,7 +2185,7 @@ xfs_bmap_add_extent_hole_real( XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork); ifp->if_lastex = idx - 1; if (cur == NULL) { - rval = XFS_ILOG_FEXT(whichfork); + rval = xfs_ilog_fext(whichfork); } else { rval = 0; if ((error = xfs_bmbt_lookup_eq(cur, @@ -2220,7 +2220,7 @@ xfs_bmap_add_extent_hole_real( XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork); ifp->if_lastex = idx; if (cur == NULL) { - rval = XFS_ILOG_FEXT(whichfork); + rval = xfs_ilog_fext(whichfork); } else { rval = 0; if ((error = xfs_bmbt_lookup_eq(cur, @@ -2254,7 +2254,7 @@ xfs_bmap_add_extent_hole_real( XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) { - rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; if ((error = xfs_bmbt_lookup_eq(cur, @@ -2482,7 +2482,7 @@ xfs_bmap_adjacent( * try to use it's last block as our starting point. */ if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && - !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && + !isnullstartblock(ap->prevp->br_startblock) && ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, ap->prevp->br_startblock)) { ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; @@ -2511,7 +2511,7 @@ xfs_bmap_adjacent( * start block based on it. */ if (ap->prevp->br_startoff != NULLFILEOFF && - !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && + !isnullstartblock(ap->prevp->br_startblock) && (prevbno = ap->prevp->br_startblock + ap->prevp->br_blockcount) && ISVALID(prevbno, ap->prevp->br_startblock)) { @@ -2552,7 +2552,7 @@ xfs_bmap_adjacent( * If there's a following (right) block, select a requested * start block based on it. */ - if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) { + if (!isnullstartblock(ap->gotp->br_startblock)) { /* * Calculate gap to start of next block. */ @@ -3082,7 +3082,7 @@ xfs_bmap_btree_to_extents( ASSERT(ifp->if_broot == NULL); ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -3136,8 +3136,8 @@ xfs_bmap_del_extent( del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got.br_startoff + got.br_blockcount; ASSERT(got_endoff >= del_endoff); - delay = ISNULLSTARTBLOCK(got.br_startblock); - ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); flags = 0; qfield = 0; error = 0; @@ -3189,7 +3189,7 @@ xfs_bmap_del_extent( } da_old = da_new = 0; } else { - da_old = STARTBLOCKVAL(got.br_startblock); + da_old = startblockval(got.br_startblock); da_new = 0; nblks = 0; do_fx = 0; @@ -3213,7 +3213,7 @@ xfs_bmap_del_extent( XFS_IFORK_NEXTENTS(ip, whichfork) - 1); flags |= XFS_ILOG_CORE; if (!cur) { - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_btree_delete(cur, &i))) @@ -3233,7 +3233,7 @@ xfs_bmap_del_extent( if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); da_new = temp; @@ -3242,7 +3242,7 @@ xfs_bmap_del_extent( xfs_bmbt_set_startblock(ep, del_endblock); XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); if (!cur) { - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, @@ -3262,7 +3262,7 @@ xfs_bmap_del_extent( if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); da_new = temp; @@ -3270,7 +3270,7 @@ xfs_bmap_del_extent( } XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); if (!cur) { - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_bmbt_update(cur, got.br_startoff, @@ -3345,22 +3345,22 @@ xfs_bmap_del_extent( } XFS_WANT_CORRUPTED_GOTO(i == 1, done); } else - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); } else { ASSERT(whichfork == XFS_DATA_FORK); temp = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); temp2 = xfs_bmap_worst_indlen(ip, temp2); - new.br_startblock = NULLSTARTBLOCK((int)temp2); + new.br_startblock = nullstartblock((int)temp2); da_new = temp + temp2; while (da_new > da_old) { if (temp) { temp--; da_new--; xfs_bmbt_set_startblock(ep, - NULLSTARTBLOCK((int)temp)); + nullstartblock((int)temp)); } if (da_new == da_old) break; @@ -3368,7 +3368,7 @@ xfs_bmap_del_extent( temp2--; da_new--; new.br_startblock = - NULLSTARTBLOCK((int)temp2); + nullstartblock((int)temp2); } } } @@ -3545,7 +3545,7 @@ xfs_bmap_extents_to_btree( nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); for (cnt = i = 0; i < nextents; i++) { ep = xfs_iext_get_ext(ifp, i); - if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) { + if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { arp->l0 = cpu_to_be64(ep->l0); arp->l1 = cpu_to_be64(ep->l1); arp++; cnt++; @@ -3572,7 +3572,7 @@ xfs_bmap_extents_to_btree( xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); ASSERT(*curp == NULL); *curp = cur; - *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); return 0; } @@ -3676,7 +3676,7 @@ xfs_bmap_local_to_extents( ip->i_d.di_nblocks = 1; XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); } else { ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); @@ -4082,7 +4082,7 @@ xfs_bmap_add_attrfork( XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; - XFS_BMAP_INIT(&flist, &firstblock); + xfs_bmap_init(&flist, &firstblock); switch (ip->i_d.di_format) { case XFS_DINODE_FMT_LOCAL: error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, @@ -4162,7 +4162,7 @@ xfs_bmap_add_free( ASSERT(bno != NULLFSBLOCK); ASSERT(len > 0); ASSERT(len <= MAXEXTLEN); - ASSERT(!ISNULLSTARTBLOCK(bno)); + ASSERT(!isnullstartblock(bno)); agno = XFS_FSB_TO_AGNO(mp, bno); agbno = XFS_FSB_TO_AGBNO(mp, bno); ASSERT(agno < mp->m_sb.sb_agcount); @@ -4909,7 +4909,7 @@ xfs_bmapi( got.br_startoff = end; inhole = eof || got.br_startoff > bno; wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && - ISNULLSTARTBLOCK(got.br_startblock); + isnullstartblock(got.br_startblock); /* * First, deal with the hole before the allocated space * that we found, if any. @@ -5028,7 +5028,7 @@ xfs_bmapi( } ip->i_delayed_blks += alen; - abno = NULLSTARTBLOCK(indlen); + abno = nullstartblock(indlen); } else { /* * If first time, allocate and fill in @@ -5144,8 +5144,8 @@ xfs_bmapi( aoff + alen); #ifdef DEBUG if (flags & XFS_BMAPI_DELAY) { - ASSERT(ISNULLSTARTBLOCK(got.br_startblock)); - ASSERT(STARTBLOCKVAL(got.br_startblock) > 0); + ASSERT(isnullstartblock(got.br_startblock)); + ASSERT(startblockval(got.br_startblock) > 0); } ASSERT(got.br_state == XFS_EXT_NORM || got.br_state == XFS_EXT_UNWRITTEN); @@ -5179,7 +5179,7 @@ xfs_bmapi( ASSERT((bno >= obno) || (n == 0)); ASSERT(bno < end); mval->br_startoff = bno; - if (ISNULLSTARTBLOCK(got.br_startblock)) { + if (isnullstartblock(got.br_startblock)) { ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); mval->br_startblock = DELAYSTARTBLOCK; } else @@ -5201,7 +5201,7 @@ xfs_bmapi( ASSERT(mval->br_blockcount <= len); } else { *mval = got; - if (ISNULLSTARTBLOCK(mval->br_startblock)) { + if (isnullstartblock(mval->br_startblock)) { ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); mval->br_startblock = DELAYSTARTBLOCK; } @@ -5329,12 +5329,12 @@ error0: * Log everything. Do this after conversion, there's no point in * logging the extent records if we've converted to btree format. */ - if ((logflags & XFS_ILOG_FEXT(whichfork)) && + if ((logflags & xfs_ilog_fext(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - logflags &= ~XFS_ILOG_FEXT(whichfork); - else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && + logflags &= ~xfs_ilog_fext(whichfork); + else if ((logflags & xfs_ilog_fbroot(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - logflags &= ~XFS_ILOG_FBROOT(whichfork); + logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log whatever the flags say, even if error. Otherwise we might miss * detecting a case where the data is changed, there's an error, @@ -5411,7 +5411,7 @@ xfs_bmapi_single( *fsb = NULLFSBLOCK; return 0; } - ASSERT(!ISNULLSTARTBLOCK(got.br_startblock)); + ASSERT(!isnullstartblock(got.br_startblock)); ASSERT(bno < got.br_startoff + got.br_blockcount); *fsb = got.br_startblock + (bno - got.br_startoff); ifp->if_lastex = lastx; @@ -5543,7 +5543,7 @@ xfs_bunmapi( */ ASSERT(ep != NULL); del = got; - wasdel = ISNULLSTARTBLOCK(del.br_startblock); + wasdel = isnullstartblock(del.br_startblock); if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; @@ -5638,7 +5638,7 @@ xfs_bunmapi( xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), &prev); ASSERT(prev.br_state == XFS_EXT_NORM); - ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock)); + ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == prev.br_startblock + prev.br_blockcount); if (prev.br_startoff < start) { @@ -5666,7 +5666,7 @@ xfs_bunmapi( } } if (wasdel) { - ASSERT(STARTBLOCKVAL(del.br_startblock) > 0); + ASSERT(startblockval(del.br_startblock) > 0); /* Update realtime/data freespace, unreserve quota */ if (isrt) { xfs_filblks_t rtexts; @@ -5782,12 +5782,12 @@ error0: * Log everything. Do this after conversion, there's no point in * logging the extent records if we've converted to btree format. */ - if ((logflags & XFS_ILOG_FEXT(whichfork)) && + if ((logflags & xfs_ilog_fext(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - logflags &= ~XFS_ILOG_FEXT(whichfork); - else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && + logflags &= ~xfs_ilog_fext(whichfork); + else if ((logflags & xfs_ilog_fbroot(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - logflags &= ~XFS_ILOG_FBROOT(whichfork); + logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log inode even in the error case, if the transaction * is dirty we'll need to shut down the filesystem. @@ -5838,7 +5838,7 @@ xfs_getbmapx_fix_eof_hole( if (startblock == DELAYSTARTBLOCK) out->bmv_block = -2; else - out->bmv_block = XFS_FSB_TO_DB(ip, startblock); + out->bmv_block = xfs_fsb_to_db(ip, startblock); fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && @@ -5979,7 +5979,7 @@ xfs_getbmap( if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; - bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | + bmapi_flags = xfs_bmapi_aflag(whichfork) | ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE); /* @@ -6098,7 +6098,7 @@ xfs_bmap_isaeof( */ *aeof = (off >= s.br_startoff && off < s.br_startoff + s.br_blockcount && - ISNULLSTARTBLOCK(s.br_startblock)) || + isnullstartblock(s.br_startblock)) || off >= s.br_startoff + s.br_blockcount; return 0; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 284571c05ed..be2979d88d3 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -95,7 +95,6 @@ typedef struct xfs_bmap_free /* need write cache flushing and no */ /* additional allocation alignments */ -#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w) static inline int xfs_bmapi_aflag(int w) { return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); @@ -107,7 +106,6 @@ static inline int xfs_bmapi_aflag(int w) #define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) #define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) -#define XFS_BMAP_INIT(flp,fbp) xfs_bmap_init(flp,fbp) static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) { ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index ba6b08c2fb0..0760d352586 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -121,7 +121,7 @@ __xfs_bmbt_get_all( b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) | (((xfs_dfsbno_t)l1) >> 21); - ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); + ASSERT((b >> 32) == 0 || isnulldstartblock(b)); s->br_startblock = (xfs_fsblock_t)b; } #else /* !DEBUG */ @@ -172,7 +172,7 @@ xfs_bmbt_get_startblock( b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) | (((xfs_dfsbno_t)r->l1) >> 21); - ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); + ASSERT((b >> 32) == 0 || isnulldstartblock(b)); return (xfs_fsblock_t)b; #else /* !DEBUG */ return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21); @@ -261,7 +261,7 @@ xfs_bmbt_set_allf( ((xfs_bmbt_rec_base_t)blockcount & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); #else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(startblock)) { + if (isnullstartblock(startblock)) { r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | ((xfs_bmbt_rec_base_t)startoff << 9) | (xfs_bmbt_rec_base_t)xfs_mask64lo(9); @@ -321,7 +321,7 @@ xfs_bmbt_disk_set_allf( ((xfs_bmbt_rec_base_t)blockcount & (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); #else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(startblock)) { + if (isnullstartblock(startblock)) { r->l0 = cpu_to_be64( ((xfs_bmbt_rec_base_t)extent_flag << 63) | ((xfs_bmbt_rec_base_t)startoff << 9) | @@ -382,7 +382,7 @@ xfs_bmbt_set_startblock( r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | (xfs_bmbt_rec_base_t)(v << 21); #else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(v)) { + if (isnullstartblock(v)) { r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9); r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) | ((xfs_bmbt_rec_base_t)v << 21) | diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index a4555abb662..0e8df007615 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -76,26 +76,22 @@ typedef struct xfs_bmbt_rec_host { #define DSTARTBLOCKMASK \ (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) -#define ISNULLSTARTBLOCK(x) isnullstartblock(x) static inline int isnullstartblock(xfs_fsblock_t x) { return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; } -#define ISNULLDSTARTBLOCK(x) isnulldstartblock(x) static inline int isnulldstartblock(xfs_dfsbno_t x) { return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; } -#define NULLSTARTBLOCK(k) nullstartblock(k) static inline xfs_fsblock_t nullstartblock(int k) { ASSERT(k < (1 << STARTBLOCKVALBITS)); return STARTBLOCKMASK | (k); } -#define STARTBLOCKVAL(x) startblockval(x) static inline xfs_filblks_t startblockval(xfs_fsblock_t x) { return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 2c3ef20f884..e73c332eb23 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -843,7 +843,7 @@ xfs_btree_ptr_is_null( union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return be64_to_cpu(ptr->l) == NULLFSBLOCK; + return be64_to_cpu(ptr->l) == NULLDFSBNO; else return be32_to_cpu(ptr->s) == NULLAGBLOCK; } @@ -854,7 +854,7 @@ xfs_btree_set_ptr_null( union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - ptr->l = cpu_to_be64(NULLFSBLOCK); + ptr->l = cpu_to_be64(NULLDFSBNO); else ptr->s = cpu_to_be32(NULLAGBLOCK); } @@ -918,8 +918,8 @@ xfs_btree_init_block( new->bb_numrecs = cpu_to_be16(numrecs); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); - new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); + new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); } else { new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); @@ -960,7 +960,7 @@ xfs_btree_buf_to_ptr( ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp))); else { - ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp, + ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp))); } } @@ -971,7 +971,7 @@ xfs_btree_ptr_to_daddr( union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK); + ASSERT(be64_to_cpu(ptr->l) != NULLDFSBNO); return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); } else { @@ -2454,7 +2454,7 @@ xfs_btree_new_iroot( xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); *logflags |= - XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); *stat = 1; XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; @@ -3048,7 +3048,7 @@ xfs_btree_kill_iroot( cur->bc_bufs[level - 1] = NULL; be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, - XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); cur->bc_nlevels--; out0: XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index a11a8390bf6..c45f74ff1a5 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1597,7 +1597,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) nmap = 1; ASSERT(args->firstblock != NULL); if ((error = xfs_bmapi(tp, dp, bno, count, - XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| + xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, args->flist, NULL))) { @@ -1618,7 +1618,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) nmap = MIN(XFS_BMAP_MAX_NMAP, count); c = (int)(bno + count - b); if ((error = xfs_bmapi(tp, dp, b, c, - XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE| + xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| XFS_BMAPI_METADATA, args->firstblock, args->total, &mapp[mapi], &nmap, args->flist, @@ -1882,7 +1882,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, * the last block to the place we want to kill. */ if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, - XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, NULL, &done)) == ENOSPC) { if (w != XFS_DATA_FORK) @@ -1987,7 +1987,7 @@ xfs_da_do_buf( if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, nfsb, XFS_BMAPI_METADATA | - XFS_BMAPI_AFLAG(whichfork), + xfs_bmapi_aflag(whichfork), NULL, 0, mapp, &nmap, NULL, NULL))) goto exit0; } diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index b4c1ee71349..f8278cfcc1d 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -55,17 +55,11 @@ xfs_swapext( struct file *file, *target_file; int error = 0; - sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL); - if (!sxp) { - error = XFS_ERROR(ENOMEM); - goto out; - } - /* Pull information for the target fd */ file = fget((int)sxp->sx_fdtarget); if (!file) { error = XFS_ERROR(EINVAL); - goto out_free_sxp; + goto out; } if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) { @@ -109,8 +103,6 @@ xfs_swapext( fput(target_file); out_put_file: fput(file); - out_free_sxp: - kmem_free(sxp); out: return error; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index e6ebbaeb4dc..ab016e5ae7b 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -357,7 +357,7 @@ xfs_ialloc_ag_alloc( int ioffset = i << args.mp->m_sb.sb_inodelog; uint isize = sizeof(struct xfs_dinode); - free = XFS_MAKE_IPTR(args.mp, fbuf, i); + free = xfs_make_iptr(args.mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); free->di_version = version; free->di_gen = cpu_to_be32(gen); @@ -937,7 +937,7 @@ nextag: } } } - offset = XFS_IALLOC_FIND_FREE(&rec.ir_free); + offset = xfs_ialloc_find_free(&rec.ir_free); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1279,7 +1279,7 @@ xfs_imap( offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno); + cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno); offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock; imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 50f558a4e0a..aeee8278f92 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -39,7 +39,6 @@ struct xfs_trans; /* * Make an inode pointer out of the buffer/offset. */ -#define XFS_MAKE_IPTR(mp,b,o) xfs_make_iptr(mp,b,o) static inline struct xfs_dinode * xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) { @@ -50,7 +49,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) /* * Find a free (set) bit in the inode bitmask. */ -#define XFS_IALLOC_FIND_FREE(fp) xfs_ialloc_find_free(fp) static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) { return xfs_lowbit64(*fp); diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 37e5dd01a57..5580e255ff0 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -36,7 +36,6 @@ typedef __uint64_t xfs_inofree_t; #define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) #define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) -#define XFS_INOBT_MASKN(i,n) xfs_inobt_maskn(i,n) static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { return (((n) >= XFS_INODES_PER_CHUNK ? \ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5a5e035e5d3..e7ae08d1df4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -424,6 +424,19 @@ xfs_iformat( case XFS_DINODE_FMT_LOCAL: atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); size = be16_to_cpu(atp->hdr.totsize); + + if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu " + "(bad attr fork size %Ld).", + (unsigned long long) ip->i_ino, + (long long) size); + XFS_CORRUPTION_ERROR("xfs_iformat(8)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); break; case XFS_DINODE_FMT_EXTENTS: @@ -1601,10 +1614,10 @@ xfs_itruncate_finish( * in this file with garbage in them once recovery * runs. */ - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); error = xfs_bunmapi(ntp, ip, first_unmap_block, unmap_len, - XFS_BMAPI_AFLAG(fork) | + xfs_bmapi_aflag(fork) | (sync ? 0 : XFS_BMAPI_ASYNC), XFS_ITRUNC_MAX_EXTENTS, &first_block, &free_list, @@ -2557,7 +2570,7 @@ xfs_iextents_copy( for (i = 0; i < nrecs; i++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); start_block = xfs_bmbt_get_startblock(ep); - if (ISNULLSTARTBLOCK(start_block)) { + if (isnullstartblock(start_block)) { /* * It's a delayed allocation extent, so skip it. */ diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 1ff04cc323a..9957d0602d5 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -111,20 +111,16 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) - -#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w) static inline int xfs_ilog_fbroot(int w) { return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); } -#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w) static inline int xfs_ilog_fext(int w) { return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); } -#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w) static inline int xfs_ilog_fdata(int w) { return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 911062cf73a..08ce72316bf 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -155,7 +155,7 @@ xfs_imap_to_bmap( iomapp->iomap_bn = IOMAP_DADDR_NULL; iomapp->iomap_flags |= IOMAP_DELAY; } else { - iomapp->iomap_bn = XFS_FSB_TO_DB(ip, start_block); + iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block); if (ISUNWRITTEN(imap)) iomapp->iomap_flags |= IOMAP_UNWRITTEN; } @@ -261,7 +261,7 @@ xfs_iomap( xfs_iunlock(ip, lockmode); lockmode = 0; - if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) { + if (nimaps && !isnullstartblock(imap.br_startblock)) { xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, offset, count, iomapp, &imap, flags); break; @@ -491,7 +491,7 @@ xfs_iomap_write_direct( /* * Issue the xfs_bmapi() call to allocate the blocks */ - XFS_BMAP_INIT(&free_list, &firstfsb); + xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, &firstfsb, 0, &imap, &nimaps, &free_list, NULL); @@ -751,7 +751,7 @@ xfs_iomap_write_allocate( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); /* * it is possible that the extents have changed since @@ -911,7 +911,7 @@ xfs_iomap_write_unwritten( /* * Modify the unwritten extent state of the buffer. */ - XFS_BMAP_INIT(&free_list, &firstfsb); + xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index e19d0a8d561..cf98a805ec9 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -453,7 +453,7 @@ xfs_bulkstat( (chunkidx = agino - gino + 1) < XFS_INODES_PER_CHUNK && /* there are some left allocated */ - XFS_INOBT_MASKN(chunkidx, + xfs_inobt_maskn(chunkidx, XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { /* * Grab the chunk record. Mark all the @@ -464,7 +464,7 @@ xfs_bulkstat( if (XFS_INOBT_MASK(i) & ~gfree) gcnt++; } - gfree |= XFS_INOBT_MASKN(0, chunkidx); + gfree |= xfs_inobt_maskn(0, chunkidx); irbp->ir_startino = gino; irbp->ir_freecount = gcnt; irbp->ir_free = gfree; @@ -535,7 +535,7 @@ xfs_bulkstat( chunkidx < XFS_INODES_PER_CHUNK; chunkidx += nicluster, agbno += nbcluster) { - if (XFS_INOBT_MASKN(chunkidx, + if (xfs_inobt_maskn(chunkidx, nicluster) & ~gfree) xfs_btree_reada_bufs(mp, agno, agbno, nbcluster); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 35cca98bd94..b1047de2fff 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -70,16 +70,21 @@ STATIC void xlog_recover_check_summary(xlog_t *); xfs_buf_t * xlog_get_bp( xlog_t *log, - int num_bblks) + int nbblks) { - ASSERT(num_bblks > 0); + if (nbblks <= 0 || nbblks > log->l_logBBsize) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); + XFS_ERROR_REPORT("xlog_get_bp(1)", + XFS_ERRLEVEL_HIGH, log->l_mp); + return NULL; + } if (log->l_sectbb_log) { - if (num_bblks > 1) - num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); - num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks); + if (nbblks > 1) + nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); + nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); } - return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp); + return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); } void @@ -102,6 +107,13 @@ xlog_bread( { int error; + if (nbblks <= 0 || nbblks > log->l_logBBsize) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); + XFS_ERROR_REPORT("xlog_bread(1)", + XFS_ERRLEVEL_HIGH, log->l_mp); + return EFSCORRUPTED; + } + if (log->l_sectbb_log) { blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); @@ -139,6 +151,13 @@ xlog_bwrite( { int error; + if (nbblks <= 0 || nbblks > log->l_logBBsize) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); + XFS_ERROR_REPORT("xlog_bwrite(1)", + XFS_ERRLEVEL_HIGH, log->l_mp); + return EFSCORRUPTED; + } + if (log->l_sectbb_log) { blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 3c97c6463a4..35300250e86 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -45,7 +45,6 @@ #include "xfs_fsops.h" #include "xfs_utils.h" -STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t); STATIC int xfs_uuid_mount(xfs_mount_t *); STATIC void xfs_unmountfs_wait(xfs_mount_t *); @@ -682,7 +681,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) * Update alignment values based on mount options and sb values */ STATIC int -xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags) +xfs_update_alignment(xfs_mount_t *mp) { xfs_sb_t *sbp = &(mp->m_sb); @@ -736,11 +735,11 @@ xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags) if (xfs_sb_version_hasdalign(sbp)) { if (sbp->sb_unit != mp->m_dalign) { sbp->sb_unit = mp->m_dalign; - *update_flags |= XFS_SB_UNIT; + mp->m_update_flags |= XFS_SB_UNIT; } if (sbp->sb_width != mp->m_swidth) { sbp->sb_width = mp->m_swidth; - *update_flags |= XFS_SB_WIDTH; + mp->m_update_flags |= XFS_SB_WIDTH; } } } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && @@ -905,7 +904,6 @@ xfs_mountfs( xfs_sb_t *sbp = &(mp->m_sb); xfs_inode_t *rip; __uint64_t resblks; - __int64_t update_flags = 0LL; uint quotamount, quotaflags; int uuid_mounted = 0; int error = 0; @@ -933,7 +931,7 @@ xfs_mountfs( "XFS: correcting sb_features alignment problem"); sbp->sb_features2 |= sbp->sb_bad_features2; sbp->sb_bad_features2 = sbp->sb_features2; - update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; + mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; /* * Re-check for ATTR2 in case it was found in bad_features2 @@ -947,11 +945,11 @@ xfs_mountfs( if (xfs_sb_version_hasattr2(&mp->m_sb) && (mp->m_flags & XFS_MOUNT_NOATTR2)) { xfs_sb_version_removeattr2(&mp->m_sb); - update_flags |= XFS_SB_FEATURES2; + mp->m_update_flags |= XFS_SB_FEATURES2; /* update sb_versionnum for the clearing of the morebits */ if (!sbp->sb_features2) - update_flags |= XFS_SB_VERSIONNUM; + mp->m_update_flags |= XFS_SB_VERSIONNUM; } /* @@ -960,7 +958,7 @@ xfs_mountfs( * allocator alignment is within an ag, therefore ag has * to be aligned at stripe boundary. */ - error = xfs_update_alignment(mp, &update_flags); + error = xfs_update_alignment(mp); if (error) goto error1; @@ -1137,10 +1135,12 @@ xfs_mountfs( } /* - * If fs is not mounted readonly, then update the superblock changes. + * If this is a read-only mount defer the superblock updates until + * the next remount into writeable mode. Otherwise we would never + * perform the update e.g. for the root filesystem. */ - if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { - error = xfs_mount_log_sb(mp, update_flags); + if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + error = xfs_mount_log_sb(mp, mp->m_update_flags); if (error) { cmn_err(CE_WARN, "XFS: failed to write sb changes"); goto error4; @@ -1820,7 +1820,7 @@ xfs_uuid_mount( * be altered by the mount options, as well as any potential sb_features2 * fixup. Only the first superblock is updated. */ -STATIC int +int xfs_mount_log_sb( xfs_mount_t *mp, __int64_t fields) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index c1e02846732..f5e9937f9bd 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -44,9 +44,9 @@ typedef struct xfs_trans_reservations { #ifndef __KERNEL__ -#define XFS_DADDR_TO_AGNO(mp,d) \ +#define xfs_daddr_to_agno(mp,d) \ ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) -#define XFS_DADDR_TO_AGBNO(mp,d) \ +#define xfs_daddr_to_agbno(mp,d) \ ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) #else /* __KERNEL__ */ @@ -327,6 +327,8 @@ typedef struct xfs_mount { spinlock_t m_sync_lock; /* work item list lock */ int m_sync_seq; /* sync thread generation no. */ wait_queue_head_t m_wait_single_sync_task; + __int64_t m_update_flags; /* sb flags we need to update + on the next remount,rw */ } xfs_mount_t; /* @@ -439,7 +441,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, */ #define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ -#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) static inline xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) { @@ -448,7 +449,6 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agnumber_t) ld; } -#define XFS_DADDR_TO_AGBNO(mp,d) xfs_daddr_to_agbno(mp,d) static inline xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) { @@ -514,6 +514,7 @@ extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); +extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 86471bb40fd..58f85e9cd11 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -147,7 +147,7 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, inodes, &num_inodes); - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index edf12c7b834..c5bb86f3ec0 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -120,7 +120,7 @@ xfs_growfs_rt_alloc( if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip))) goto error_cancel; - XFS_BMAP_INIT(&flist, &firstblock); + xfs_bmap_init(&flist, &firstblock); /* * Allocate blocks to the bitmap file. */ diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index f87db5344ce..f76c003ec55 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -28,7 +28,6 @@ struct xfs_mount; * file is a real time file or not, because the bmap code * does. */ -#define XFS_FSB_TO_DB(ip,fsb) xfs_fsb_to_db(ip,fsb) static inline xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) { diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 1ed71916e4c..1b017c65749 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -505,7 +505,7 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ - XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d)) + xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) #define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index f07bf8768c3..0e55c5d7db5 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -862,7 +862,7 @@ xfs_inactive_symlink_rmt( * Find the block(s) so we can inval and unmap them. */ done = 0; - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); nmaps = ARRAY_SIZE(mval); if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, @@ -1288,7 +1288,7 @@ xfs_inactive( /* * Free the inode. */ - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); error = xfs_ifree(tp, ip, &free_list); if (error) { /* @@ -1461,7 +1461,7 @@ xfs_create( xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = B_TRUE; - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); ASSERT(ip == NULL); @@ -1879,7 +1879,7 @@ xfs_remove( } } - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); error = xfs_dir_removename(tp, dp, name, ip->i_ino, &first_block, &free_list, resblks); if (error) { @@ -2059,7 +2059,7 @@ xfs_link( if (error) goto error_return; - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, &first_block, &free_list, resblks); @@ -2231,7 +2231,7 @@ xfs_mkdir( xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino, &first_block, &free_list, resblks ? @@ -2438,7 +2438,7 @@ xfs_symlink( * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. */ - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); /* * Allocate an inode for the symlink. @@ -2860,7 +2860,7 @@ retry: /* * Issue the xfs_bmapi() call to allocate the blocks */ - XFS_BMAP_INIT(&free_list, &firstfsb); + xfs_bmap_init(&free_list, &firstfsb); error = xfs_bmapi(tp, ip, startoffset_fsb, allocatesize_fsb, bmapi_flag, &firstfsb, 0, imapp, &nimaps, @@ -2980,7 +2980,7 @@ xfs_zero_remaining_bytes( XFS_BUF_UNDONE(bp); XFS_BUF_UNWRITE(bp); XFS_BUF_READ(bp); - XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); + XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); xfsbdstrat(mp, bp); error = xfs_iowait(bp); if (error) { @@ -3186,7 +3186,7 @@ xfs_free_file_space( /* * issue the bunmapi() call to free the blocks */ - XFS_BMAP_INIT(&free_list, &firstfsb); + xfs_bmap_init(&free_list, &firstfsb); error = xfs_bunmapi(tp, ip, startoffset_fsb, endoffset_fsb - startoffset_fsb, 0, 2, &firstfsb, &free_list, NULL, &done); |