78 files changed, 17513 insertions(+), 3017 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS index f56c7e172ce..5519d257b55 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2246,6 +2246,14 @@ L: linux-mtd@lists.infradead.org T: git git://git.infradead.org/mtd-2.6.git S: Maintained +UNSORTED BLOCK IMAGES (UBI) +P: Artem Bityutskiy +M: dedekind@infradead.org +W: http://www.linux-mtd.infradead.org/ +L: linux-mtd@lists.infradead.org +T: git git://git.infradead.org/ubi-2.6.git +S: Maintained + MICROTEK X6 SCANNER P: Oliver Neukum M: oliver@neukum.name @@ -2972,8 +2980,10 @@ P: Stephen Smalley M: sds@tycho.nsa.gov P: James Morris M: jmorris@namei.org +P: Eric Paris +M: eparis@parisplace.org L: linux-kernel@vger.kernel.org (kernel issues) -L: selinux@tycho.nsa.gov (general discussion) +L: selinux@tycho.nsa.gov (subscribers-only, general discussion) W: http://www.nsa.gov/selinux S: Supported diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index 26f75c29944..6d1b91bf7ad 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig @@ -292,5 +292,7 @@ source "drivers/mtd/nand/Kconfig" source "drivers/mtd/onenand/Kconfig" +source "drivers/mtd/ubi/Kconfig" + endmenu diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile index c130e6261ad..92055405cb3 100644 --- a/drivers/mtd/Makefile +++ b/drivers/mtd/Makefile @@ -28,3 +28,5 @@ nftl-objs := nftlcore.o nftlmount.o inftl-objs := inftlcore.o inftlmount.o obj-y += chips/ maps/ devices/ nand/ onenand/ + +obj-$(CONFIG_MTD_UBI) += ubi/ diff --git a/drivers/mtd/ubi/Kconfig b/drivers/mtd/ubi/Kconfig new file mode 100644 index 00000000000..b9daf159a4a --- /dev/null +++ b/drivers/mtd/ubi/Kconfig @@ -0,0 +1,58 @@ +# drivers/mtd/ubi/Kconfig + +menu "UBI - Unsorted block images" + depends on MTD + +config MTD_UBI + tristate "Enable UBI" + depends on MTD + select CRC32 + help + UBI is a software layer above MTD layer which admits of LVM-like + logical volumes on top of MTD devices, hides some complexities of + flash chips like wear and bad blocks and provides some other useful + capabilities. Please, consult the MTD web site for more details + (www.linux-mtd.infradead.org). + +config MTD_UBI_WL_THRESHOLD + int "UBI wear-leveling threshold" + default 4096 + range 2 65536 + depends on MTD_UBI + help + This parameter defines the maximum difference between the highest + erase counter value and the lowest erase counter value of eraseblocks + of UBI devices. When this threshold is exceeded, UBI starts performing + wear leveling by means of moving data from eraseblock with low erase + counter to eraseblocks with high erase counter. Leave the default + value if unsure. + +config MTD_UBI_BEB_RESERVE + int "Percentage of reserved eraseblocks for bad eraseblocks handling" + default 1 + range 0 25 + depends on MTD_UBI + help + If the MTD device admits of bad eraseblocks (e.g. NAND flash), UBI + reserves some amount of physical eraseblocks to handle new bad + eraseblocks. For example, if a flash physical eraseblock becomes bad, + UBI uses these reserved physical eraseblocks to relocate the bad one. + This option specifies how many physical eraseblocks will be reserved + for bad eraseblock handling (percents of total number of good flash + eraseblocks). If the underlying flash does not admit of bad + eraseblocks (e.g. NOR flash), this value is ignored and nothing is + reserved. Leave the default value if unsure. 
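[Editor's note] To give the percentage above a concrete feel, here is a small stand-alone sketch of the reservation arithmetic the MTD_UBI_BEB_RESERVE help text describes. The flash geometry (a 256 MiB NAND device with 128 KiB physical eraseblocks) is made up purely for illustration and does not come from the patch.

	/* Illustrative only: how many PEBs a 1% bad-eraseblock reserve means
	 * for a hypothetical 256 MiB NAND chip with 128 KiB eraseblocks. */
	#include <stdio.h>

	int main(void)
	{
		long long flash_size = 256LL * 1024 * 1024;    /* 256 MiB (hypothetical) */
		int peb_size = 128 * 1024;                     /* 128 KiB per PEB (hypothetical) */
		int beb_reserve = 1;                           /* CONFIG_MTD_UBI_BEB_RESERVE, in percent */
		int peb_count = flash_size / peb_size;         /* 2048 PEBs in total */
		int reserved = peb_count * beb_reserve / 100;  /* 20 PEBs set aside */

		printf("%d of %d PEBs reserved for bad eraseblock handling\n",
		       reserved, peb_count);
		return 0;
	}

On such a device the default 1% would set aside roughly 20 physical eraseblocks; as the help text notes, the reserve is ignored entirely on flashes without bad eraseblocks (e.g. NOR).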
+ +config MTD_UBI_GLUEBI + bool "Emulate MTD devices" + default n + depends on MTD_UBI + help + This option enables MTD devices emulation on top of UBI volumes: for + each UBI volumes an MTD device is created, and all I/O to this MTD + device is redirected to the UBI volume. This is handy to make + MTD-oriented software (like JFFS2) work on top of UBI. Do not enable + this if no legacy software will be used. + +source "drivers/mtd/ubi/Kconfig.debug" +endmenu diff --git a/drivers/mtd/ubi/Kconfig.debug b/drivers/mtd/ubi/Kconfig.debug new file mode 100644 index 00000000000..1e2ee22edef --- /dev/null +++ b/drivers/mtd/ubi/Kconfig.debug @@ -0,0 +1,104 @@ +comment "UBI debugging options" + depends on MTD_UBI + +config MTD_UBI_DEBUG + bool "UBI debugging" + depends on SYSFS + depends on MTD_UBI + select DEBUG_FS + select KALLSYMS_ALL + help + This option enables UBI debugging. + +config MTD_UBI_DEBUG_MSG + bool "UBI debugging messages" + depends on MTD_UBI_DEBUG + default n + help + This option enables UBI debugging messages. + +config MTD_UBI_DEBUG_PARANOID + bool "Extra self-checks" + default n + depends on MTD_UBI_DEBUG + help + This option enables extra checks in UBI code. Note this slows UBI down + significantly. + +config MTD_UBI_DEBUG_DISABLE_BGT + bool "Do not enable the UBI background thread" + depends on MTD_UBI_DEBUG + default n + help + This option switches the background thread off by default. The thread + may be also be enabled/disabled via UBI sysfs. + +config MTD_UBI_DEBUG_USERSPACE_IO + bool "Direct user-space write/erase support" + default n + depends on MTD_UBI_DEBUG + help + By default, users cannot directly write and erase individual + eraseblocks of dynamic volumes, and have to use update operation + instead. This option enables this capability - it is very useful for + debugging and testing. + +config MTD_UBI_DEBUG_EMULATE_BITFLIPS + bool "Emulate flash bit-flips" + depends on MTD_UBI_DEBUG + default n + help + This option emulates bit-flips with probability 1/50, which in turn + causes scrubbing. Useful for debugging and stressing UBI. + +config MTD_UBI_DEBUG_EMULATE_WRITE_FAILURES + bool "Emulate flash write failures" + depends on MTD_UBI_DEBUG + default n + help + This option emulates write failures with probability 1/100. Useful for + debugging and testing how UBI handlines errors. + +config MTD_UBI_DEBUG_EMULATE_ERASE_FAILURES + bool "Emulate flash erase failures" + depends on MTD_UBI_DEBUG + default n + help + This option emulates erase failures with probability 1/100. Useful for + debugging and testing how UBI handlines errors. + +menu "Additional UBI debugging messages" + depends on MTD_UBI_DEBUG + +config MTD_UBI_DEBUG_MSG_BLD + bool "Additional UBI initialization and build messages" + default n + depends on MTD_UBI_DEBUG + help + This option enables detailed UBI initialization and device build + debugging messages. + +config MTD_UBI_DEBUG_MSG_EBA + bool "Eraseblock association unit messages" + default n + depends on MTD_UBI_DEBUG + help + This option enables debugging messages from the UBI eraseblock + association unit. + +config MTD_UBI_DEBUG_MSG_WL + bool "Wear-leveling unit messages" + default n + depends on MTD_UBI_DEBUG + help + This option enables debugging messages from the UBI wear-leveling + unit. + +config MTD_UBI_DEBUG_MSG_IO + bool "Input/output unit messages" + default n + depends on MTD_UBI_DEBUG + help + This option enables debugging messages from the UBI input/output unit. 
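[Editor's note] For reference, an illustrative configuration fragment enabling UBI together with the debugging options defined above might look as follows. The option names are taken from these Kconfig files, but the particular selection is only an example of a debugging/test build, not a recommendation made by the patch.

	CONFIG_MTD_UBI=y
	CONFIG_MTD_UBI_WL_THRESHOLD=4096
	CONFIG_MTD_UBI_BEB_RESERVE=1
	CONFIG_MTD_UBI_GLUEBI=y
	CONFIG_MTD_UBI_DEBUG=y
	CONFIG_MTD_UBI_DEBUG_MSG=y
	CONFIG_MTD_UBI_DEBUG_PARANOID=y
	CONFIG_MTD_UBI_DEBUG_MSG_BLD=y
	CONFIG_MTD_UBI_DEBUG_MSG_EBA=y
	CONFIG_MTD_UBI_DEBUG_MSG_WL=y
	CONFIG_MTD_UBI_DEBUG_MSG_IO=y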
+ +endmenu # UBI debugging messages diff --git a/drivers/mtd/ubi/Makefile b/drivers/mtd/ubi/Makefile new file mode 100644 index 00000000000..dd834e04151 --- /dev/null +++ b/drivers/mtd/ubi/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_MTD_UBI) += ubi.o + +ubi-y += vtbl.o vmt.o upd.o build.o cdev.o kapi.o eba.o io.o wl.o scan.o +ubi-y += misc.o + +ubi-$(CONFIG_MTD_UBI_DEBUG) += debug.o +ubi-$(CONFIG_MTD_UBI_GLUEBI) += gluebi.o diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c new file mode 100644 index 00000000000..555d594d181 --- /dev/null +++ b/drivers/mtd/ubi/build.c @@ -0,0 +1,848 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * Copyright (c) Nokia Corporation, 2007 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём), + * Frank Haverkamp + */ + +/* + * This file includes UBI initialization and building of UBI devices. At the + * moment UBI devices may only be added while UBI is initialized, but dynamic + * device add/remove functionality is planned. Also, at the moment we only + * attach UBI devices by scanning, which will become a bottleneck when flashes + * reach certain large size. Then one may improve UBI and add other methods. + */ + +#include <linux/err.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/stringify.h> +#include <linux/stat.h> +#include "ubi.h" + +/* Maximum length of the 'mtd=' parameter */ +#define MTD_PARAM_LEN_MAX 64 + +/** + * struct mtd_dev_param - MTD device parameter description data structure. 
+ * @name: MTD device name or number string + * @vid_hdr_offs: VID header offset + * @data_offs: data offset + */ +struct mtd_dev_param +{ + char name[MTD_PARAM_LEN_MAX]; + int vid_hdr_offs; + int data_offs; +}; + +/* Numbers of elements set in the @mtd_dev_param array */ +static int mtd_devs = 0; + +/* MTD devices specification parameters */ +static struct mtd_dev_param mtd_dev_param[UBI_MAX_DEVICES]; + +/* Number of UBI devices in system */ +int ubi_devices_cnt; + +/* All UBI devices in system */ +struct ubi_device *ubi_devices[UBI_MAX_DEVICES]; + +/* Root UBI "class" object (corresponds to '/<sysfs>/class/ubi/') */ +struct class *ubi_class; + +/* "Show" method for files in '/<sysfs>/class/ubi/' */ +static ssize_t ubi_version_show(struct class *class, char *buf) +{ + return sprintf(buf, "%d\n", UBI_VERSION); +} + +/* UBI version attribute ('/<sysfs>/class/ubi/version') */ +static struct class_attribute ubi_version = + __ATTR(version, S_IRUGO, ubi_version_show, NULL); + +static ssize_t dev_attribute_show(struct device *dev, + struct device_attribute *attr, char *buf); + +/* UBI device attributes (correspond to files in '/<sysfs>/class/ubi/ubiX') */ +static struct device_attribute dev_eraseblock_size = + __ATTR(eraseblock_size, S_IRUGO, dev_attribute_show, NULL); +static struct device_attribute dev_avail_eraseblocks = + __ATTR(avail_eraseblocks, S_IRUGO, dev_attribute_show, NULL); +static struct device_attribute dev_total_eraseblocks = + __ATTR(total_eraseblocks, S_IRUGO, dev_attribute_show, NULL); +static struct device_attribute dev_volumes_count = + __ATTR(volumes_count, S_IRUGO, dev_attribute_show, NULL); +static struct device_attribute dev_max_ec = + __ATTR(max_ec, S_IRUGO, dev_attribute_show, NULL); +static struct device_attribute dev_reserved_for_bad = + __ATTR(reserved_for_bad, S_IRUGO, dev_attribute_show, NULL); +static struct device_attribute dev_bad_peb_count = + __ATTR(bad_peb_count, S_IRUGO, dev_attribute_show, NULL); +static struct device_attribute dev_max_vol_count = + __ATTR(max_vol_count, S_IRUGO, dev_attribute_show, NULL); +static struct device_attribute dev_min_io_size = + __ATTR(min_io_size, S_IRUGO, dev_attribute_show, NULL); +static struct device_attribute dev_bgt_enabled = + __ATTR(bgt_enabled, S_IRUGO, dev_attribute_show, NULL); + +/* "Show" method for files in '/<sysfs>/class/ubi/ubiX/' */ +static ssize_t dev_attribute_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct ubi_device *ubi; + + ubi = container_of(dev, struct ubi_device, dev); + if (attr == &dev_eraseblock_size) + return sprintf(buf, "%d\n", ubi->leb_size); + else if (attr == &dev_avail_eraseblocks) + return sprintf(buf, "%d\n", ubi->avail_pebs); + else if (attr == &dev_total_eraseblocks) + return sprintf(buf, "%d\n", ubi->good_peb_count); + else if (attr == &dev_volumes_count) + return sprintf(buf, "%d\n", ubi->vol_count); + else if (attr == &dev_max_ec) + return sprintf(buf, "%d\n", ubi->max_ec); + else if (attr == &dev_reserved_for_bad) + return sprintf(buf, "%d\n", ubi->beb_rsvd_pebs); + else if (attr == &dev_bad_peb_count) + return sprintf(buf, "%d\n", ubi->bad_peb_count); + else if (attr == &dev_max_vol_count) + return sprintf(buf, "%d\n", ubi->vtbl_slots); + else if (attr == &dev_min_io_size) + return sprintf(buf, "%d\n", ubi->min_io_size); + else if (attr == &dev_bgt_enabled) + return sprintf(buf, "%d\n", ubi->thread_enabled); + else + BUG(); + + return 0; +} + +/* Fake "release" method for UBI devices */ +static void dev_release(struct device *dev) { } + +/** + 
* ubi_sysfs_init - initialize sysfs for an UBI device. + * @ubi: UBI device description object + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int ubi_sysfs_init(struct ubi_device *ubi) +{ + int err; + + ubi->dev.release = dev_release; + ubi->dev.devt = MKDEV(ubi->major, 0); + ubi->dev.class = ubi_class; + sprintf(&ubi->dev.bus_id[0], UBI_NAME_STR"%d", ubi->ubi_num); + err = device_register(&ubi->dev); + if (err) + goto out; + + err = device_create_file(&ubi->dev, &dev_eraseblock_size); + if (err) + goto out_unregister; + err = device_create_file(&ubi->dev, &dev_avail_eraseblocks); + if (err) + goto out_eraseblock_size; + err = device_create_file(&ubi->dev, &dev_total_eraseblocks); + if (err) + goto out_avail_eraseblocks; + err = device_create_file(&ubi->dev, &dev_volumes_count); + if (err) + goto out_total_eraseblocks; + err = device_create_file(&ubi->dev, &dev_max_ec); + if (err) + goto out_volumes_count; + err = device_create_file(&ubi->dev, &dev_reserved_for_bad); + if (err) + goto out_volumes_max_ec; + err = device_create_file(&ubi->dev, &dev_bad_peb_count); + if (err) + goto out_reserved_for_bad; + err = device_create_file(&ubi->dev, &dev_max_vol_count); + if (err) + goto out_bad_peb_count; + err = device_create_file(&ubi->dev, &dev_min_io_size); + if (err) + goto out_max_vol_count; + err = device_create_file(&ubi->dev, &dev_bgt_enabled); + if (err) + goto out_min_io_size; + + return 0; + +out_min_io_size: + device_remove_file(&ubi->dev, &dev_min_io_size); +out_max_vol_count: + device_remove_file(&ubi->dev, &dev_max_vol_count); +out_bad_peb_count: + device_remove_file(&ubi->dev, &dev_bad_peb_count); +out_reserved_for_bad: + device_remove_file(&ubi->dev, &dev_reserved_for_bad); +out_volumes_max_ec: + device_remove_file(&ubi->dev, &dev_max_ec); +out_volumes_count: + device_remove_file(&ubi->dev, &dev_volumes_count); +out_total_eraseblocks: + device_remove_file(&ubi->dev, &dev_total_eraseblocks); +out_avail_eraseblocks: + device_remove_file(&ubi->dev, &dev_avail_eraseblocks); +out_eraseblock_size: + device_remove_file(&ubi->dev, &dev_eraseblock_size); +out_unregister: + device_unregister(&ubi->dev); +out: + ubi_err("failed to initialize sysfs for %s", ubi->ubi_name); + return err; +} + +/** + * ubi_sysfs_close - close sysfs for an UBI device. + * @ubi: UBI device description object + */ +static void ubi_sysfs_close(struct ubi_device *ubi) +{ + device_remove_file(&ubi->dev, &dev_bgt_enabled); + device_remove_file(&ubi->dev, &dev_min_io_size); + device_remove_file(&ubi->dev, &dev_max_vol_count); + device_remove_file(&ubi->dev, &dev_bad_peb_count); + device_remove_file(&ubi->dev, &dev_reserved_for_bad); + device_remove_file(&ubi->dev, &dev_max_ec); + device_remove_file(&ubi->dev, &dev_volumes_count); + device_remove_file(&ubi->dev, &dev_total_eraseblocks); + device_remove_file(&ubi->dev, &dev_avail_eraseblocks); + device_remove_file(&ubi->dev, &dev_eraseblock_size); + device_unregister(&ubi->dev); +} + +/** + * kill_volumes - destroy all volumes. + * @ubi: UBI device description object + */ +static void kill_volumes(struct ubi_device *ubi) +{ + int i; + + for (i = 0; i < ubi->vtbl_slots; i++) + if (ubi->volumes[i]) + ubi_free_volume(ubi, i); +} + +/** + * uif_init - initialize user interfaces for an UBI device. + * @ubi: UBI device description object + * + * This function returns zero in case of success and a negative error code in + * case of failure. 
+ */ +static int uif_init(struct ubi_device *ubi) +{ + int i, err; + dev_t dev; + + mutex_init(&ubi->vtbl_mutex); + spin_lock_init(&ubi->volumes_lock); + + sprintf(ubi->ubi_name, UBI_NAME_STR "%d", ubi->ubi_num); + + /* + * Major numbers for the UBI character devices are allocated + * dynamically. Major numbers of volume character devices are + * equivalent to ones of the corresponding UBI character device. Minor + * numbers of UBI character devices are 0, while minor numbers of + * volume character devices start from 1. Thus, we allocate one major + * number and ubi->vtbl_slots + 1 minor numbers. + */ + err = alloc_chrdev_region(&dev, 0, ubi->vtbl_slots + 1, ubi->ubi_name); + if (err) { + ubi_err("cannot register UBI character devices"); + return err; + } + + cdev_init(&ubi->cdev, &ubi_cdev_operations); + ubi->major = MAJOR(dev); + dbg_msg("%s major is %u", ubi->ubi_name, ubi->major); + ubi->cdev.owner = THIS_MODULE; + + dev = MKDEV(ubi->major, 0); + err = cdev_add(&ubi->cdev, dev, 1); + if (err) { + ubi_err("cannot add character device %s", ubi->ubi_name); + goto out_unreg; + } + + err = ubi_sysfs_init(ubi); + if (err) + goto out_cdev; + + for (i = 0; i < ubi->vtbl_slots; i++) + if (ubi->volumes[i]) { + err = ubi_add_volume(ubi, i); + if (err) + goto out_volumes; + } + + return 0; + +out_volumes: + kill_volumes(ubi); + ubi_sysfs_close(ubi); +out_cdev: + cdev_del(&ubi->cdev); +out_unreg: + unregister_chrdev_region(MKDEV(ubi->major, 0), + ubi->vtbl_slots + 1); + return err; +} + +/** + * uif_close - close user interfaces for an UBI device. + * @ubi: UBI device description object + */ +static void uif_close(struct ubi_device *ubi) +{ + kill_volumes(ubi); + ubi_sysfs_close(ubi); + cdev_del(&ubi->cdev); + unregister_chrdev_region(MKDEV(ubi->major, 0), ubi->vtbl_slots + 1); +} + +/** + * attach_by_scanning - attach an MTD device using scanning method. + * @ubi: UBI device descriptor + * + * This function returns zero in case of success and a negative error code in + * case of failure. + * + * Note, currently this is the only method to attach UBI devices. Hopefully in + * the future we'll have more scalable attaching methods and avoid full media + * scanning. But even in this case scanning will be needed as a fall-back + * attaching method if there are some on-flash table corruptions. + */ +static int attach_by_scanning(struct ubi_device *ubi) +{ + int err; + struct ubi_scan_info *si; + + si = ubi_scan(ubi); + if (IS_ERR(si)) + return PTR_ERR(si); + + ubi->bad_peb_count = si->bad_peb_count; + ubi->good_peb_count = ubi->peb_count - ubi->bad_peb_count; + ubi->max_ec = si->max_ec; + ubi->mean_ec = si->mean_ec; + + err = ubi_read_volume_table(ubi, si); + if (err) + goto out_si; + + err = ubi_wl_init_scan(ubi, si); + if (err) + goto out_vtbl; + + err = ubi_eba_init_scan(ubi, si); + if (err) + goto out_wl; + + ubi_scan_destroy_si(si); + return 0; + +out_wl: + ubi_wl_close(ubi); +out_vtbl: + kfree(ubi->vtbl); +out_si: + ubi_scan_destroy_si(si); + return err; +} + +/** + * io_init - initialize I/O unit for a given UBI device. 
+ * @ubi: UBI device description object + * + * If @ubi->vid_hdr_offset or @ubi->leb_start is zero, default offsets are + * assumed: + * o EC header is always at offset zero - this cannot be changed; + * o VID header starts just after the EC header at the closest address + * aligned to @io->@hdrs_min_io_size; + * o data starts just after the VID header at the closest address aligned to + * @io->@min_io_size + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int io_init(struct ubi_device *ubi) +{ + if (ubi->mtd->numeraseregions != 0) { + /* + * Some flashes have several erase regions. Different regions + * may have different eraseblock size and other + * characteristics. It looks like mostly multi-region flashes + * have one "main" region and one or more small regions to + * store boot loader code or boot parameters or whatever. I + * guess we should just pick the largest region. But this is + * not implemented. + */ + ubi_err("multiple regions, not implemented"); + return -EINVAL; + } + + /* + * Note, in this implementation we support MTD devices with 0x7FFFFFFF + * physical eraseblocks maximum. + */ + + ubi->peb_size = ubi->mtd->erasesize; + ubi->peb_count = ubi->mtd->size / ubi->mtd->erasesize; + ubi->flash_size = ubi->mtd->size; + + if (ubi->mtd->block_isbad && ubi->mtd->block_markbad) + ubi->bad_allowed = 1; + + ubi->min_io_size = ubi->mtd->writesize; + ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft; + + /* Make sure minimal I/O unit is power of 2 */ + if (ubi->min_io_size == 0 || + (ubi->min_io_size & (ubi->min_io_size - 1))) { + ubi_err("bad min. I/O unit"); + return -EINVAL; + } + + ubi_assert(ubi->hdrs_min_io_size > 0); + ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size); + ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0); + + /* Calculate default aligned sizes of EC and VID headers */ + ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size); + ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size); + + dbg_msg("min_io_size %d", ubi->min_io_size); + dbg_msg("hdrs_min_io_size %d", ubi->hdrs_min_io_size); + dbg_msg("ec_hdr_alsize %d", ubi->ec_hdr_alsize); + dbg_msg("vid_hdr_alsize %d", ubi->vid_hdr_alsize); + + if (ubi->vid_hdr_offset == 0) + /* Default offset */ + ubi->vid_hdr_offset = ubi->vid_hdr_aloffset = + ubi->ec_hdr_alsize; + else { + ubi->vid_hdr_aloffset = ubi->vid_hdr_offset & + ~(ubi->hdrs_min_io_size - 1); + ubi->vid_hdr_shift = ubi->vid_hdr_offset - + ubi->vid_hdr_aloffset; + } + + /* Similar for the data offset */ + if (ubi->leb_start == 0) { + ubi->leb_start = ubi->vid_hdr_offset + ubi->vid_hdr_alsize; + ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size); + } + + dbg_msg("vid_hdr_offset %d", ubi->vid_hdr_offset); + dbg_msg("vid_hdr_aloffset %d", ubi->vid_hdr_aloffset); + dbg_msg("vid_hdr_shift %d", ubi->vid_hdr_shift); + dbg_msg("leb_start %d", ubi->leb_start); + + /* The shift must be aligned to 32-bit boundary */ + if (ubi->vid_hdr_shift % 4) { + ubi_err("unaligned VID header shift %d", + ubi->vid_hdr_shift); + return -EINVAL; + } + + /* Check sanity */ + if (ubi->vid_hdr_offset < UBI_EC_HDR_SIZE || + ubi->leb_start < ubi->vid_hdr_offset + UBI_VID_HDR_SIZE || + ubi->leb_start > ubi->peb_size - UBI_VID_HDR_SIZE || + ubi->leb_start % ubi->min_io_size) { + ubi_err("bad VID header (%d) or data offsets (%d)", + ubi->vid_hdr_offset, ubi->leb_start); + return -EINVAL; + } + + /* + * It may happen that EC and VID headers are situated in one 
minimal + * I/O unit. In this case we can only accept this UBI image in + * read-only mode. + */ + if (ubi->vid_hdr_offset + UBI_VID_HDR_SIZE <= ubi->hdrs_min_io_size) { + ubi_warn("EC and VID headers are in the same minimal I/O unit, " + "switch to read-only mode"); + ubi->ro_mode = 1; + } + + ubi->leb_size = ubi->peb_size - ubi->leb_start; + + if (!(ubi->mtd->flags & MTD_WRITEABLE)) { + ubi_msg("MTD device %d is write-protected, attach in " + "read-only mode", ubi->mtd->index); + ubi->ro_mode = 1; + } + + dbg_msg("leb_size %d", ubi->leb_size); + dbg_msg("ro_mode %d", ubi->ro_mode); + + /* + * Note, ideally, we have to initialize ubi->bad_peb_count here. But + * unfortunately, MTD does not provide this information. We should loop + * over all physical eraseblocks and invoke mtd->block_is_bad() for + * each physical eraseblock. So, we skip ubi->bad_peb_count + * uninitialized and initialize it after scanning. + */ + + return 0; +} + +/** + * attach_mtd_dev - attach an MTD device. + * @mtd_dev: MTD device name or number string + * @vid_hdr_offset: VID header offset + * @data_offset: data offset + * + * This function attaches an MTD device to UBI. It first treats @mtd_dev as the + * MTD device name, and tries to open it by this name. If it is unable to open, + * it tries to convert @mtd_dev to an integer and open the MTD device by its + * number. Returns zero in case of success and a negative error code in case of + * failure. + */ +static int attach_mtd_dev(const char *mtd_dev, int vid_hdr_offset, + int data_offset) +{ + struct ubi_device *ubi; + struct mtd_info *mtd; + int i, err; + + mtd = get_mtd_device_nm(mtd_dev); + if (IS_ERR(mtd)) { + int mtd_num; + char *endp; + + if (PTR_ERR(mtd) != -ENODEV) + return PTR_ERR(mtd); + + /* + * Probably this is not MTD device name but MTD device number - + * check this out. 
+ */ + mtd_num = simple_strtoul(mtd_dev, &endp, 0); + if (*endp != '\0' || mtd_dev == endp) { + ubi_err("incorrect MTD device: \"%s\"", mtd_dev); + return -ENODEV; + } + + mtd = get_mtd_device(NULL, mtd_num); + if (IS_ERR(mtd)) + return PTR_ERR(mtd); + } + + /* Check if we already have the same MTD device attached */ + for (i = 0; i < ubi_devices_cnt; i++) + if (ubi_devices[i]->mtd->index == mtd->index) { + ubi_err("mtd%d is already attached to ubi%d", + mtd->index, i); + err = -EINVAL; + goto out_mtd; + } + + ubi = ubi_devices[ubi_devices_cnt] = kzalloc(sizeof(struct ubi_device), + GFP_KERNEL); + if (!ubi) { + err = -ENOMEM; + goto out_mtd; + } + + ubi->ubi_num = ubi_devices_cnt; + ubi->mtd = mtd; + + dbg_msg("attaching mtd%d to ubi%d: VID header offset %d data offset %d", + ubi->mtd->index, ubi_devices_cnt, vid_hdr_offset, data_offset); + + ubi->vid_hdr_offset = vid_hdr_offset; + ubi->leb_start = data_offset; + err = io_init(ubi); + if (err) + goto out_free; + + err = attach_by_scanning(ubi); + if (err) { + dbg_err("failed to attach by scanning, error %d", err); + goto out_free; + } + + err = uif_init(ubi); + if (err) + goto out_detach; + + ubi_devices_cnt += 1; + + ubi_msg("attached mtd%d to ubi%d", ubi->mtd->index, ubi_devices_cnt); + ubi_msg("MTD device name: \"%s\"", ubi->mtd->name); + ubi_msg("MTD device size: %llu MiB", ubi->flash_size >> 20); + ubi_msg("physical eraseblock size: %d bytes (%d KiB)", + ubi->peb_size, ubi->peb_size >> 10); + ubi_msg("logical eraseblock size: %d bytes", ubi->leb_size); + ubi_msg("number of good PEBs: %d", ubi->good_peb_count); + ubi_msg("number of bad PEBs: %d", ubi->bad_peb_count); + ubi_msg("smallest flash I/O unit: %d", ubi->min_io_size); + ubi_msg("VID header offset: %d (aligned %d)", + ubi->vid_hdr_offset, ubi->vid_hdr_aloffset); + ubi_msg("data offset: %d", ubi->leb_start); + ubi_msg("max. allowed volumes: %d", ubi->vtbl_slots); + ubi_msg("wear-leveling threshold: %d", CONFIG_MTD_UBI_WL_THRESHOLD); + ubi_msg("number of internal volumes: %d", UBI_INT_VOL_COUNT); + ubi_msg("number of user volumes: %d", + ubi->vol_count - UBI_INT_VOL_COUNT); + ubi_msg("available PEBs: %d", ubi->avail_pebs); + ubi_msg("total number of reserved PEBs: %d", ubi->rsvd_pebs); + ubi_msg("number of PEBs reserved for bad PEB handling: %d", + ubi->beb_rsvd_pebs); + ubi_msg("max/mean erase counter: %d/%d", ubi->max_ec, ubi->mean_ec); + + /* Enable the background thread */ + if (!DBG_DISABLE_BGT) { + ubi->thread_enabled = 1; + wake_up_process(ubi->bgt_thread); + } + + return 0; + +out_detach: + ubi_eba_close(ubi); + ubi_wl_close(ubi); + kfree(ubi->vtbl); +out_free: + kfree(ubi); +out_mtd: + put_mtd_device(mtd); + ubi_devices[ubi_devices_cnt] = NULL; + return err; +} + +/** + * detach_mtd_dev - detach an MTD device. 
+ * @ubi: UBI device description object + */ +static void detach_mtd_dev(struct ubi_device *ubi) +{ + int ubi_num = ubi->ubi_num, mtd_num = ubi->mtd->index; + + dbg_msg("detaching mtd%d from ubi%d", ubi->mtd->index, ubi_num); + uif_close(ubi); + ubi_eba_close(ubi); + ubi_wl_close(ubi); + kfree(ubi->vtbl); + put_mtd_device(ubi->mtd); + kfree(ubi_devices[ubi_num]); + ubi_devices[ubi_num] = NULL; + ubi_devices_cnt -= 1; + ubi_assert(ubi_devices_cnt >= 0); + ubi_msg("mtd%d is detached from ubi%d", mtd_num, ubi_num); +} + +static int __init ubi_init(void) +{ + int err, i, k; + + /* Ensure that EC and VID headers have correct size */ + BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64); + BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64); + + if (mtd_devs > UBI_MAX_DEVICES) { + printk("UBI error: too many MTD devices, maximum is %d\n", + UBI_MAX_DEVICES); + return -EINVAL; + } + + ubi_class = class_create(THIS_MODULE, UBI_NAME_STR); + if (IS_ERR(ubi_class)) + return PTR_ERR(ubi_class); + + err = class_create_file(ubi_class, &ubi_version); + if (err) + goto out_class; + + /* Attach MTD devices */ + for (i = 0; i < mtd_devs; i++) { + struct mtd_dev_param *p = &mtd_dev_param[i]; + + cond_resched(); + + if (!p->name) { + dbg_err("empty name"); + err = -EINVAL; + goto out_detach; + } + + err = attach_mtd_dev(p->name, p->vid_hdr_offs, p->data_offs); + if (err) + goto out_detach; + } + + return 0; + +out_detach: + for (k = 0; k < i; k++) + detach_mtd_dev(ubi_devices[k]); + class_remove_file(ubi_class, &ubi_version); +out_class: + class_destroy(ubi_class); + return err; +} +module_init(ubi_init); + +static void __exit ubi_exit(void) +{ + int i, n = ubi_devices_cnt; + + for (i = 0; i < n; i++) + detach_mtd_dev(ubi_devices[i]); + class_remove_file(ubi_class, &ubi_version); + class_destroy(ubi_class); +} +module_exit(ubi_exit); + +/** + * bytes_str_to_int - convert a string representing number of bytes to an + * integer. + * @str: the string to convert + * + * This function returns positive resulting integer in case of success and a + * negative error code in case of failure. + */ +static int __init bytes_str_to_int(const char *str) +{ + char *endp; + unsigned long result; + + result = simple_strtoul(str, &endp, 0); + if (str == endp || result < 0) { + printk("UBI error: incorrect bytes count: \"%s\"\n", str); + return -EINVAL; + } + + switch (*endp) { + case 'G': + result *= 1024; + case 'M': + result *= 1024; + case 'K': + case 'k': + result *= 1024; + if (endp[1] == 'i' && (endp[2] == '\0' || + endp[2] == 'B' || endp[2] == 'b')) + endp += 2; + case '\0': + break; + default: + printk("UBI error: incorrect bytes count: \"%s\"\n", str); + return -EINVAL; + } + + return result; +} + +/** + * ubi_mtd_param_parse - parse the 'mtd=' UBI parameter. + * @val: the parameter value to parse + * @kp: not used + * + * This function returns zero in case of success and a negative error code in + * case of error. + */ +static int __init ubi_mtd_param_parse(const char *val, struct kernel_param *kp) +{ + int i, len; + struct mtd_dev_param *p; + char buf[MTD_PARAM_LEN_MAX]; + char *pbuf = &buf[0]; + char *tokens[3] = {NULL, NULL, NULL}; + + if (mtd_devs == UBI_MAX_DEVICES) { + printk("UBI error: too many parameters, max. is %d\n", + UBI_MAX_DEVICES); + return -EINVAL; + } + + len = strnlen(val, MTD_PARAM_LEN_MAX); + if (len == MTD_PARAM_LEN_MAX) { + printk("UBI error: parameter \"%s\" is too long, max. 
is %d\n", + val, MTD_PARAM_LEN_MAX); + return -EINVAL; + } + + if (len == 0) { + printk("UBI warning: empty 'mtd=' parameter - ignored\n"); + return 0; + } + + strcpy(buf, val); + + /* Get rid of the final newline */ + if (buf[len - 1] == '\n') + buf[len - 1] = 0; + + for (i = 0; i < 3; i++) + tokens[i] = strsep(&pbuf, ","); + + if (pbuf) { + printk("UBI error: too many arguments at \"%s\"\n", val); + return -EINVAL; + } + + if (tokens[0] == '\0') + return -EINVAL; + + p = &mtd_dev_param[mtd_devs]; + strcpy(&p->name[0], tokens[0]); + + if (tokens[1]) + p->vid_hdr_offs = bytes_str_to_int(tokens[1]); + if (tokens[2]) + p->data_offs = bytes_str_to_int(tokens[2]); + + if (p->vid_hdr_offs < 0) + return p->vid_hdr_offs; + if (p->data_offs < 0) + return p->data_offs; + + mtd_devs += 1; + return 0; +} + +module_param_call(mtd, ubi_mtd_param_parse, NULL, NULL, 000); +MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: " + "mtd=<name|num>[,<vid_hdr_offs>,<data_offs>]. " + "Multiple \"mtd\" parameters may be specified.\n" + "MTD devices may be specified by their number or name. " + "Optional \"vid_hdr_offs\" and \"data_offs\" parameters " + "specify UBI VID header position and data starting " + "position to be used by UBI.\n" + "Example: mtd=content,1984,2048 mtd=4 - attach MTD device" + "with name content using VID header offset 1984 and data " + "start 2048, and MTD device number 4 using default " + "offsets"); + +MODULE_VERSION(__stringify(UBI_VERSION)); +MODULE_DESCRIPTION("UBI - Unsorted Block Images"); +MODULE_AUTHOR("Artem Bityutskiy"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c new file mode 100644 index 00000000000..6612eb79bf1 --- /dev/null +++ b/drivers/mtd/ubi/cdev.c @@ -0,0 +1,722 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file includes implementation of UBI character device operations. + * + * There are two kinds of character devices in UBI: UBI character devices and + * UBI volume character devices. UBI character devices allow users to + * manipulate whole volumes: create, remove, and re-size them. Volume character + * devices provide volume I/O capabilities. + * + * Major and minor numbers are assigned dynamically to both UBI and volume + * character devices. + */ + +#include <linux/module.h> +#include <linux/stat.h> +#include <linux/ioctl.h> +#include <linux/capability.h> +#include <mtd/ubi-user.h> +#include <asm/uaccess.h> +#include <asm/div64.h> +#include "ubi.h" + +/* + * Maximum sequence numbers of UBI and volume character device IOCTLs (direct + * logical eraseblock erase is a debug-only feature). 
+ */ +#define UBI_CDEV_IOC_MAX_SEQ 2 +#ifndef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO +#define VOL_CDEV_IOC_MAX_SEQ 1 +#else +#define VOL_CDEV_IOC_MAX_SEQ 2 +#endif + +/** + * major_to_device - get UBI device object by character device major number. + * @major: major number + * + * This function returns a pointer to the UBI device object. + */ +static struct ubi_device *major_to_device(int major) +{ + int i; + + for (i = 0; i < ubi_devices_cnt; i++) + if (ubi_devices[i] && ubi_devices[i]->major == major) + return ubi_devices[i]; + BUG(); +} + +/** + * get_exclusive - get exclusive access to an UBI volume. + * @desc: volume descriptor + * + * This function changes UBI volume open mode to "exclusive". Returns previous + * mode value (positive integer) in case of success and a negative error code + * in case of failure. + */ +static int get_exclusive(struct ubi_volume_desc *desc) +{ + int users, err; + struct ubi_volume *vol = desc->vol; + + spin_lock(&vol->ubi->volumes_lock); + users = vol->readers + vol->writers + vol->exclusive; + ubi_assert(users > 0); + if (users > 1) { + dbg_err("%d users for volume %d", users, vol->vol_id); + err = -EBUSY; + } else { + vol->readers = vol->writers = 0; + vol->exclusive = 1; + err = desc->mode; + desc->mode = UBI_EXCLUSIVE; + } + spin_unlock(&vol->ubi->volumes_lock); + + return err; +} + +/** + * revoke_exclusive - revoke exclusive mode. + * @desc: volume descriptor + * @mode: new mode to switch to + */ +static void revoke_exclusive(struct ubi_volume_desc *desc, int mode) +{ + struct ubi_volume *vol = desc->vol; + + spin_lock(&vol->ubi->volumes_lock); + ubi_assert(vol->readers == 0 && vol->writers == 0); + ubi_assert(vol->exclusive == 1 && desc->mode == UBI_EXCLUSIVE); + vol->exclusive = 0; + if (mode == UBI_READONLY) + vol->readers = 1; + else if (mode == UBI_READWRITE) + vol->writers = 1; + else + vol->exclusive = 1; + spin_unlock(&vol->ubi->volumes_lock); + + desc->mode = mode; +} + +static int vol_cdev_open(struct inode *inode, struct file *file) +{ + struct ubi_volume_desc *desc; + const struct ubi_device *ubi = major_to_device(imajor(inode)); + int vol_id = iminor(inode) - 1; + int mode; + + if (file->f_mode & FMODE_WRITE) + mode = UBI_READWRITE; + else + mode = UBI_READONLY; + + dbg_msg("open volume %d, mode %d", vol_id, mode); + + desc = ubi_open_volume(ubi->ubi_num, vol_id, mode); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + file->private_data = desc; + return 0; +} + +static int vol_cdev_release(struct inode *inode, struct file *file) +{ + struct ubi_volume_desc *desc = file->private_data; + struct ubi_volume *vol = desc->vol; + + dbg_msg("release volume %d, mode %d", vol->vol_id, desc->mode); + + if (vol->updating) { + ubi_warn("update of volume %d not finished, volume is damaged", + vol->vol_id); + vol->updating = 0; + kfree(vol->upd_buf); + } + + ubi_close_volume(desc); + return 0; +} + +static loff_t vol_cdev_llseek(struct file *file, loff_t offset, int origin) +{ + struct ubi_volume_desc *desc = file->private_data; + struct ubi_volume *vol = desc->vol; + loff_t new_offset; + + if (vol->updating) { + /* Update is in progress, seeking is prohibited */ + dbg_err("updating"); + return -EBUSY; + } + + switch (origin) { + case 0: /* SEEK_SET */ + new_offset = offset; + break; + case 1: /* SEEK_CUR */ + new_offset = file->f_pos + offset; + break; + case 2: /* SEEK_END */ + new_offset = vol->used_bytes + offset; + break; + default: + return -EINVAL; + } + + if (new_offset < 0 || new_offset > vol->used_bytes) { + dbg_err("bad seek %lld", new_offset); 
+ return -EINVAL; + } + + dbg_msg("seek volume %d, offset %lld, origin %d, new offset %lld", + vol->vol_id, offset, origin, new_offset); + + file->f_pos = new_offset; + return new_offset; +} + +static ssize_t vol_cdev_read(struct file *file, __user char *buf, size_t count, + loff_t *offp) +{ + struct ubi_volume_desc *desc = file->private_data; + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + int err, lnum, off, len, vol_id = desc->vol->vol_id, tbuf_size; + size_t count_save = count; + void *tbuf; + uint64_t tmp; + + dbg_msg("read %zd bytes from offset %lld of volume %d", + count, *offp, vol_id); + + if (vol->updating) { + dbg_err("updating"); + return -EBUSY; + } + if (vol->upd_marker) { + dbg_err("damaged volume, update marker is set"); + return -EBADF; + } + if (*offp == vol->used_bytes || count == 0) + return 0; + + if (vol->corrupted) + dbg_msg("read from corrupted volume %d", vol_id); + + if (*offp + count > vol->used_bytes) + count_save = count = vol->used_bytes - *offp; + + tbuf_size = vol->usable_leb_size; + if (count < tbuf_size) + tbuf_size = ALIGN(count, ubi->min_io_size); + tbuf = kmalloc(tbuf_size, GFP_KERNEL); + if (!tbuf) + return -ENOMEM; + + len = count > tbuf_size ? tbuf_size : count; + + tmp = *offp; + off = do_div(tmp, vol->usable_leb_size); + lnum = tmp; + + do { + cond_resched(); + + if (off + len >= vol->usable_leb_size) + len = vol->usable_leb_size - off; + + err = ubi_eba_read_leb(ubi, vol_id, lnum, tbuf, off, len, 0); + if (err) + break; + + off += len; + if (off == vol->usable_leb_size) { + lnum += 1; + off -= vol->usable_leb_size; + } + + count -= len; + *offp += len; + + err = copy_to_user(buf, tbuf, len); + if (err) { + err = -EFAULT; + break; + } + + buf += len; + len = count > tbuf_size ? tbuf_size : count; + } while (count); + + kfree(tbuf); + return err ? err : count_save - count; +} + +#ifdef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO + +/* + * This function allows to directly write to dynamic UBI volumes, without + * issuing the volume update operation. Available only as a debugging feature. + * Very useful for testing UBI. + */ +static ssize_t vol_cdev_direct_write(struct file *file, const char __user *buf, + size_t count, loff_t *offp) +{ + struct ubi_volume_desc *desc = file->private_data; + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + int lnum, off, len, tbuf_size, vol_id = vol->vol_id, err = 0; + size_t count_save = count; + char *tbuf; + uint64_t tmp; + + dbg_msg("requested: write %zd bytes to offset %lld of volume %u", + count, *offp, desc->vol->vol_id); + + if (vol->vol_type == UBI_STATIC_VOLUME) + return -EROFS; + + tmp = *offp; + off = do_div(tmp, vol->usable_leb_size); + lnum = tmp; + + if (off % ubi->min_io_size) { + dbg_err("unaligned position"); + return -EINVAL; + } + + if (*offp + count > vol->used_bytes) + count_save = count = vol->used_bytes - *offp; + + /* We can write only in fractions of the minimum I/O unit */ + if (count % ubi->min_io_size) { + dbg_err("unaligned write length"); + return -EINVAL; + } + + tbuf_size = vol->usable_leb_size; + if (count < tbuf_size) + tbuf_size = ALIGN(count, ubi->min_io_size); + tbuf = kmalloc(tbuf_size, GFP_KERNEL); + if (!tbuf) + return -ENOMEM; + + len = count > tbuf_size ? 
tbuf_size : count; + + while (count) { + cond_resched(); + + if (off + len >= vol->usable_leb_size) + len = vol->usable_leb_size - off; + + err = copy_from_user(tbuf, buf, len); + if (err) { + err = -EFAULT; + break; + } + + err = ubi_eba_write_leb(ubi, vol_id, lnum, tbuf, off, len, + UBI_UNKNOWN); + if (err) + break; + + off += len; + if (off == vol->usable_leb_size) { + lnum += 1; + off -= vol->usable_leb_size; + } + + count -= len; + *offp += len; + buf += len; + len = count > tbuf_size ? tbuf_size : count; + } + + kfree(tbuf); + return err ? err : count_save - count; +} + +#else +#define vol_cdev_direct_write(file, buf, count, offp) -EPERM +#endif /* CONFIG_MTD_UBI_DEBUG_USERSPACE_IO */ + +static ssize_t vol_cdev_write(struct file *file, const char __user *buf, + size_t count, loff_t *offp) +{ + int err = 0; + struct ubi_volume_desc *desc = file->private_data; + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + + if (!vol->updating) + return vol_cdev_direct_write(file, buf, count, offp); + + err = ubi_more_update_data(ubi, vol->vol_id, buf, count); + if (err < 0) { + ubi_err("cannot write %zd bytes of update data", count); + return err; + } + + if (err) { + /* + * Update is finished, @err contains number of actually written + * bytes now. + */ + count = err; + + err = ubi_check_volume(ubi, vol->vol_id); + if (err < 0) + return err; + + if (err) { + ubi_warn("volume %d on UBI device %d is corrupted", + vol->vol_id, ubi->ubi_num); + vol->corrupted = 1; + } + vol->checked = 1; + revoke_exclusive(desc, UBI_READWRITE); + } + + *offp += count; + return count; +} + +static int vol_cdev_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int err = 0; + struct ubi_volume_desc *desc = file->private_data; + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + void __user *argp = (void __user *)arg; + + if (_IOC_NR(cmd) > VOL_CDEV_IOC_MAX_SEQ || + _IOC_TYPE(cmd) != UBI_VOL_IOC_MAGIC) + return -ENOTTY; + + if (_IOC_DIR(cmd) && _IOC_READ) + err = !access_ok(VERIFY_WRITE, argp, _IOC_SIZE(cmd)); + else if (_IOC_DIR(cmd) && _IOC_WRITE) + err = !access_ok(VERIFY_READ, argp, _IOC_SIZE(cmd)); + if (err) + return -EFAULT; + + switch (cmd) { + + /* Volume update command */ + case UBI_IOCVOLUP: + { + int64_t bytes, rsvd_bytes; + + if (!capable(CAP_SYS_RESOURCE)) { + err = -EPERM; + break; + } + + err = copy_from_user(&bytes, argp, sizeof(int64_t)); + if (err) { + err = -EFAULT; + break; + } + + if (desc->mode == UBI_READONLY) { + err = -EROFS; + break; + } + + rsvd_bytes = vol->reserved_pebs * (ubi->leb_size-vol->data_pad); + if (bytes < 0 || bytes > rsvd_bytes) { + err = -EINVAL; + break; + } + + err = get_exclusive(desc); + if (err < 0) + break; + + err = ubi_start_update(ubi, vol->vol_id, bytes); + if (bytes == 0) + revoke_exclusive(desc, UBI_READWRITE); + + file->f_pos = 0; + break; + } + +#ifdef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO + /* Logical eraseblock erasure command */ + case UBI_IOCEBER: + { + int32_t lnum; + + err = __get_user(lnum, (__user int32_t *)argp); + if (err) { + err = -EFAULT; + break; + } + + if (desc->mode == UBI_READONLY) { + err = -EROFS; + break; + } + + if (lnum < 0 || lnum >= vol->reserved_pebs) { + err = -EINVAL; + break; + } + + if (vol->vol_type != UBI_DYNAMIC_VOLUME) { + err = -EROFS; + break; + } + + dbg_msg("erase LEB %d:%d", vol->vol_id, lnum); + err = ubi_eba_unmap_leb(ubi, vol->vol_id, lnum); + if (err) + break; + + err = ubi_wl_flush(ubi); + break; + } +#endif + + default: + err = -ENOTTY; 
+ break; + } + + return err; +} + +/** + * verify_mkvol_req - verify volume creation request. + * @ubi: UBI device description object + * @req: the request to check + * + * This function zero if the request is correct, and %-EINVAL if not. + */ +static int verify_mkvol_req(const struct ubi_device *ubi, + const struct ubi_mkvol_req *req) +{ + int n, err = -EINVAL; + + if (req->bytes < 0 || req->alignment < 0 || req->vol_type < 0 || + req->name_len < 0) + goto bad; + + if ((req->vol_id < 0 || req->vol_id >= ubi->vtbl_slots) && + req->vol_id != UBI_VOL_NUM_AUTO) + goto bad; + + if (req->alignment == 0) + goto bad; + + if (req->bytes == 0) + goto bad; + + if (req->vol_type != UBI_DYNAMIC_VOLUME && + req->vol_type != UBI_STATIC_VOLUME) + goto bad; + + if (req->alignment > ubi->leb_size) + goto bad; + + n = req->alignment % ubi->min_io_size; + if (req->alignment != 1 && n) + goto bad; + + if (req->name_len > UBI_VOL_NAME_MAX) { + err = -ENAMETOOLONG; + goto bad; + } + + return 0; + +bad: + dbg_err("bad volume creation request"); + ubi_dbg_dump_mkvol_req(req); + return err; +} + +/** + * verify_rsvol_req - verify volume re-size request. + * @ubi: UBI device description object + * @req: the request to check + * + * This function returns zero if the request is correct, and %-EINVAL if not. + */ +static int verify_rsvol_req(const struct ubi_device *ubi, + const struct ubi_rsvol_req *req) +{ + if (req->bytes <= 0) + return -EINVAL; + + if (req->vol_id < 0 || req->vol_id >= ubi->vtbl_slots) + return -EINVAL; + + return 0; +} + +static int ubi_cdev_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int err = 0; + struct ubi_device *ubi; + struct ubi_volume_desc *desc; + void __user *argp = (void __user *)arg; + + if (_IOC_NR(cmd) > UBI_CDEV_IOC_MAX_SEQ || + _IOC_TYPE(cmd) != UBI_IOC_MAGIC) + return -ENOTTY; + + if (_IOC_DIR(cmd) && _IOC_READ) + err = !access_ok(VERIFY_WRITE, argp, _IOC_SIZE(cmd)); + else if (_IOC_DIR(cmd) && _IOC_WRITE) + err = !access_ok(VERIFY_READ, argp, _IOC_SIZE(cmd)); + if (err) + return -EFAULT; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + ubi = major_to_device(imajor(inode)); + if (IS_ERR(ubi)) + return PTR_ERR(ubi); + + switch (cmd) { + /* Create volume command */ + case UBI_IOCMKVOL: + { + struct ubi_mkvol_req req; + + dbg_msg("create volume"); + err = __copy_from_user(&req, argp, + sizeof(struct ubi_mkvol_req)); + if (err) { + err = -EFAULT; + break; + } + + err = verify_mkvol_req(ubi, &req); + if (err) + break; + + req.name[req.name_len] = '\0'; + + err = ubi_create_volume(ubi, &req); + if (err) + break; + + err = __put_user(req.vol_id, (__user int32_t *)argp); + if (err) + err = -EFAULT; + + break; + } + + /* Remove volume command */ + case UBI_IOCRMVOL: + { + int vol_id; + + dbg_msg("remove volume"); + err = __get_user(vol_id, (__user int32_t *)argp); + if (err) { + err = -EFAULT; + break; + } + + desc = ubi_open_volume(ubi->ubi_num, vol_id, UBI_EXCLUSIVE); + if (IS_ERR(desc)) { + err = PTR_ERR(desc); + break; + } + + err = ubi_remove_volume(desc); + if (err) + ubi_close_volume(desc); + + break; + } + + /* Re-size volume command */ + case UBI_IOCRSVOL: + { + int pebs; + uint64_t tmp; + struct ubi_rsvol_req req; + + dbg_msg("re-size volume"); + err = __copy_from_user(&req, argp, + sizeof(struct ubi_rsvol_req)); + if (err) { + err = -EFAULT; + break; + } + + err = verify_rsvol_req(ubi, &req); + if (err) + break; + + desc = ubi_open_volume(ubi->ubi_num, req.vol_id, UBI_EXCLUSIVE); + if (IS_ERR(desc)) { + err = PTR_ERR(desc); 
+ break; + } + + tmp = req.bytes; + pebs = !!do_div(tmp, desc->vol->usable_leb_size); + pebs += tmp; + + err = ubi_resize_volume(desc, pebs); + ubi_close_volume(desc); + break; + } + + default: + err = -ENOTTY; + break; + } + + return err; +} + +/* UBI character device operations */ +struct file_operations ubi_cdev_operations = { + .owner = THIS_MODULE, + .ioctl = ubi_cdev_ioctl, + .llseek = no_llseek +}; + +/* UBI volume character device operations */ +struct file_operations ubi_vol_cdev_operations = { + .owner = THIS_MODULE, + .open = vol_cdev_open, + .release = vol_cdev_release, + .llseek = vol_cdev_llseek, + .read = vol_cdev_read, + .write = vol_cdev_write, + .ioctl = vol_cdev_ioctl +}; diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c new file mode 100644 index 00000000000..86364221faf --- /dev/null +++ b/drivers/mtd/ubi/debug.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * Here we keep all the UBI debugging stuff which should normally be disabled + * and compiled-out, but it is extremely helpful when hunting bugs or doing big + * changes. + */ + +#ifdef CONFIG_MTD_UBI_DEBUG_MSG + +#include "ubi.h" + +/** + * ubi_dbg_dump_ec_hdr - dump an erase counter header. + * @ec_hdr: the erase counter header to dump + */ +void ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr) +{ + dbg_msg("erase counter header dump:"); + dbg_msg("magic %#08x", ubi32_to_cpu(ec_hdr->magic)); + dbg_msg("version %d", (int)ec_hdr->version); + dbg_msg("ec %llu", (long long)ubi64_to_cpu(ec_hdr->ec)); + dbg_msg("vid_hdr_offset %d", ubi32_to_cpu(ec_hdr->vid_hdr_offset)); + dbg_msg("data_offset %d", ubi32_to_cpu(ec_hdr->data_offset)); + dbg_msg("hdr_crc %#08x", ubi32_to_cpu(ec_hdr->hdr_crc)); + dbg_msg("erase counter header hexdump:"); + ubi_dbg_hexdump(ec_hdr, UBI_EC_HDR_SIZE); +} + +/** + * ubi_dbg_dump_vid_hdr - dump a volume identifier header. 
+ * @vid_hdr: the volume identifier header to dump + */ +void ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr) +{ + dbg_msg("volume identifier header dump:"); + dbg_msg("magic %08x", ubi32_to_cpu(vid_hdr->magic)); + dbg_msg("version %d", (int)vid_hdr->version); + dbg_msg("vol_type %d", (int)vid_hdr->vol_type); + dbg_msg("copy_flag %d", (int)vid_hdr->copy_flag); + dbg_msg("compat %d", (int)vid_hdr->compat); + dbg_msg("vol_id %d", ubi32_to_cpu(vid_hdr->vol_id)); + dbg_msg("lnum %d", ubi32_to_cpu(vid_hdr->lnum)); + dbg_msg("leb_ver %u", ubi32_to_cpu(vid_hdr->leb_ver)); + dbg_msg("data_size %d", ubi32_to_cpu(vid_hdr->data_size)); + dbg_msg("used_ebs %d", ubi32_to_cpu(vid_hdr->used_ebs)); + dbg_msg("data_pad %d", ubi32_to_cpu(vid_hdr->data_pad)); + dbg_msg("sqnum %llu", + (unsigned long long)ubi64_to_cpu(vid_hdr->sqnum)); + dbg_msg("hdr_crc %08x", ubi32_to_cpu(vid_hdr->hdr_crc)); + dbg_msg("volume identifier header hexdump:"); +} + +/** + * ubi_dbg_dump_vol_info- dump volume information. + * @vol: UBI volume description object + */ +void ubi_dbg_dump_vol_info(const struct ubi_volume *vol) +{ + dbg_msg("volume information dump:"); + dbg_msg("vol_id %d", vol->vol_id); + dbg_msg("reserved_pebs %d", vol->reserved_pebs); + dbg_msg("alignment %d", vol->alignment); + dbg_msg("data_pad %d", vol->data_pad); + dbg_msg("vol_type %d", vol->vol_type); + dbg_msg("name_len %d", vol->name_len); + dbg_msg("usable_leb_size %d", vol->usable_leb_size); + dbg_msg("used_ebs %d", vol->used_ebs); + dbg_msg("used_bytes %lld", vol->used_bytes); + dbg_msg("last_eb_bytes %d", vol->last_eb_bytes); + dbg_msg("corrupted %d", vol->corrupted); + dbg_msg("upd_marker %d", vol->upd_marker); + + if (vol->name_len <= UBI_VOL_NAME_MAX && + strnlen(vol->name, vol->name_len + 1) == vol->name_len) { + dbg_msg("name %s", vol->name); + } else { + dbg_msg("the 1st 5 characters of the name: %c%c%c%c%c", + vol->name[0], vol->name[1], vol->name[2], + vol->name[3], vol->name[4]); + } +} + +/** + * ubi_dbg_dump_vtbl_record - dump a &struct ubi_vtbl_record object. + * @r: the object to dump + * @idx: volume table index + */ +void ubi_dbg_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx) +{ + int name_len = ubi16_to_cpu(r->name_len); + + dbg_msg("volume table record %d dump:", idx); + dbg_msg("reserved_pebs %d", ubi32_to_cpu(r->reserved_pebs)); + dbg_msg("alignment %d", ubi32_to_cpu(r->alignment)); + dbg_msg("data_pad %d", ubi32_to_cpu(r->data_pad)); + dbg_msg("vol_type %d", (int)r->vol_type); + dbg_msg("upd_marker %d", (int)r->upd_marker); + dbg_msg("name_len %d", name_len); + + if (r->name[0] == '\0') { + dbg_msg("name NULL"); + return; + } + + if (name_len <= UBI_VOL_NAME_MAX && + strnlen(&r->name[0], name_len + 1) == name_len) { + dbg_msg("name %s", &r->name[0]); + } else { + dbg_msg("1st 5 characters of the name: %c%c%c%c%c", + r->name[0], r->name[1], r->name[2], r->name[3], + r->name[4]); + } + dbg_msg("crc %#08x", ubi32_to_cpu(r->crc)); +} + +/** + * ubi_dbg_dump_sv - dump a &struct ubi_scan_volume object. 
+ * @sv: the object to dump + */ +void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv) +{ + dbg_msg("volume scanning information dump:"); + dbg_msg("vol_id %d", sv->vol_id); + dbg_msg("highest_lnum %d", sv->highest_lnum); + dbg_msg("leb_count %d", sv->leb_count); + dbg_msg("compat %d", sv->compat); + dbg_msg("vol_type %d", sv->vol_type); + dbg_msg("used_ebs %d", sv->used_ebs); + dbg_msg("last_data_size %d", sv->last_data_size); + dbg_msg("data_pad %d", sv->data_pad); +} + +/** + * ubi_dbg_dump_seb - dump a &struct ubi_scan_leb object. + * @seb: the object to dump + * @type: object type: 0 - not corrupted, 1 - corrupted + */ +void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type) +{ + dbg_msg("eraseblock scanning information dump:"); + dbg_msg("ec %d", seb->ec); + dbg_msg("pnum %d", seb->pnum); + if (type == 0) { + dbg_msg("lnum %d", seb->lnum); + dbg_msg("scrub %d", seb->scrub); + dbg_msg("sqnum %llu", seb->sqnum); + dbg_msg("leb_ver %u", seb->leb_ver); + } +} + +/** + * ubi_dbg_dump_mkvol_req - dump a &struct ubi_mkvol_req object. + * @req: the object to dump + */ +void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req) +{ + char nm[17]; + + dbg_msg("volume creation request dump:"); + dbg_msg("vol_id %d", req->vol_id); + dbg_msg("alignment %d", req->alignment); + dbg_msg("bytes %lld", (long long)req->bytes); + dbg_msg("vol_type %d", req->vol_type); + dbg_msg("name_len %d", req->name_len); + + memcpy(nm, req->name, 16); + nm[16] = 0; + dbg_msg("the 1st 16 characters of the name: %s", nm); +} + +#define BYTES_PER_LINE 32 + +/** + * ubi_dbg_hexdump - dump a buffer. + * @ptr: the buffer to dump + * @size: buffer size which must be multiple of 4 bytes + */ +void ubi_dbg_hexdump(const void *ptr, int size) +{ + int i, k = 0, rows, columns; + const uint8_t *p = ptr; + + size = ALIGN(size, 4); + rows = size/BYTES_PER_LINE + size % BYTES_PER_LINE; + for (i = 0; i < rows; i++) { + int j; + + cond_resched(); + columns = min(size - k, BYTES_PER_LINE) / 4; + if (columns == 0) + break; + printk(KERN_DEBUG "%5d: ", i * BYTES_PER_LINE); + for (j = 0; j < columns; j++) { + int n, N; + + N = size - k > 4 ? 4 : size - k; + for (n = 0; n < N; n++) + printk("%02x", p[k++]); + printk(" "); + } + printk("\n"); + } +} + +#endif /* CONFIG_MTD_UBI_DEBUG_MSG */ diff --git a/drivers/mtd/ubi/debug.h b/drivers/mtd/ubi/debug.h new file mode 100644 index 00000000000..f816ad9a36c --- /dev/null +++ b/drivers/mtd/ubi/debug.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +#ifndef __UBI_DEBUG_H__ +#define __UBI_DEBUG_H__ + +#ifdef CONFIG_MTD_UBI_DEBUG +#include <linux/random.h> + +#define ubi_assert(expr) BUG_ON(!(expr)) +#define dbg_err(fmt, ...) 
ubi_err(fmt, ##__VA_ARGS__) +#else +#define ubi_assert(expr) ({}) +#define dbg_err(fmt, ...) ({}) +#endif + +#ifdef CONFIG_MTD_UBI_DEBUG_DISABLE_BGT +#define DBG_DISABLE_BGT 1 +#else +#define DBG_DISABLE_BGT 0 +#endif + +#ifdef CONFIG_MTD_UBI_DEBUG_MSG +/* Generic debugging message */ +#define dbg_msg(fmt, ...) \ + printk(KERN_DEBUG "UBI DBG: %s: " fmt "\n", __FUNCTION__, ##__VA_ARGS__) + +#define ubi_dbg_dump_stack() dump_stack() + +struct ubi_ec_hdr; +struct ubi_vid_hdr; +struct ubi_volume; +struct ubi_vtbl_record; +struct ubi_scan_volume; +struct ubi_scan_leb; +struct ubi_mkvol_req; + +void ubi_dbg_print(int type, const char *func, const char *fmt, ...); +void ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr); +void ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr); +void ubi_dbg_dump_vol_info(const struct ubi_volume *vol); +void ubi_dbg_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx); +void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv); +void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type); +void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req); +void ubi_dbg_hexdump(const void *buf, int size); + +#else + +#define dbg_msg(fmt, ...) ({}) +#define ubi_dbg_dump_stack() ({}) +#define ubi_dbg_print(func, fmt, ...) ({}) +#define ubi_dbg_dump_ec_hdr(ec_hdr) ({}) +#define ubi_dbg_dump_vid_hdr(vid_hdr) ({}) +#define ubi_dbg_dump_vol_info(vol) ({}) +#define ubi_dbg_dump_vtbl_record(r, idx) ({}) +#define ubi_dbg_dump_sv(sv) ({}) +#define ubi_dbg_dump_seb(seb, type) ({}) +#define ubi_dbg_dump_mkvol_req(req) ({}) +#define ubi_dbg_hexdump(buf, size) ({}) + +#endif /* CONFIG_MTD_UBI_DEBUG_MSG */ + +#ifdef CONFIG_MTD_UBI_DEBUG_MSG_EBA +/* Messages from the eraseblock association unit */ +#define dbg_eba(fmt, ...) \ + printk(KERN_DEBUG "UBI DBG eba: %s: " fmt "\n", __FUNCTION__, \ + ##__VA_ARGS__) +#else +#define dbg_eba(fmt, ...) ({}) +#endif + +#ifdef CONFIG_MTD_UBI_DEBUG_MSG_WL +/* Messages from the wear-leveling unit */ +#define dbg_wl(fmt, ...) \ + printk(KERN_DEBUG "UBI DBG wl: %s: " fmt "\n", __FUNCTION__, \ + ##__VA_ARGS__) +#else +#define dbg_wl(fmt, ...) ({}) +#endif + +#ifdef CONFIG_MTD_UBI_DEBUG_MSG_IO +/* Messages from the input/output unit */ +#define dbg_io(fmt, ...) \ + printk(KERN_DEBUG "UBI DBG io: %s: " fmt "\n", __FUNCTION__, \ + ##__VA_ARGS__) +#else +#define dbg_io(fmt, ...) ({}) +#endif + +#ifdef CONFIG_MTD_UBI_DEBUG_MSG_BLD +/* Initialization and build messages */ +#define dbg_bld(fmt, ...) \ + printk(KERN_DEBUG "UBI DBG bld: %s: " fmt "\n", __FUNCTION__, \ + ##__VA_ARGS__) +#else +#define dbg_bld(fmt, ...) ({}) +#endif + +#ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_BITFLIPS +/** + * ubi_dbg_is_bitflip - if it is time to emulate a bit-flip. + * + * Returns non-zero if a bit-flip should be emulated, otherwise returns zero. + */ +static inline int ubi_dbg_is_bitflip(void) +{ + return !(random32() % 200); +} +#else +#define ubi_dbg_is_bitflip() 0 +#endif + +#ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_WRITE_FAILURES +/** + * ubi_dbg_is_write_failure - if it is time to emulate a write failure. + * + * Returns non-zero if a write failure should be emulated, otherwise returns + * zero. + */ +static inline int ubi_dbg_is_write_failure(void) +{ + return !(random32() % 500); +} +#else +#define ubi_dbg_is_write_failure() 0 +#endif + +#ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_ERASE_FAILURES +/** + * ubi_dbg_is_erase_failure - if its time to emulate an erase failure. + * + * Returns non-zero if an erase failure should be emulated, otherwise returns + * zero. 
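+ *
+ * For illustration, a call site is expected to look roughly like this (a
+ * simplified sketch; the real check sits at the end of the synchronous
+ * erase path in io.c):
+ *
+ *    if (ubi_dbg_is_erase_failure()) {
+ *        dbg_err("cannot erase PEB %d (emulated)", pnum);
+ *        return -EIO;
+ *    }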
+ */ +static inline int ubi_dbg_is_erase_failure(void) +{ + return !(random32() % 400); +} +#else +#define ubi_dbg_is_erase_failure() 0 +#endif + +#endif /* !__UBI_DEBUG_H__ */ diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c new file mode 100644 index 00000000000..d847ee1da3d --- /dev/null +++ b/drivers/mtd/ubi/eba.c @@ -0,0 +1,1241 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * The UBI Eraseblock Association (EBA) unit. + * + * This unit is responsible for I/O to/from logical eraseblock. + * + * Although in this implementation the EBA table is fully kept and managed in + * RAM, which assumes poor scalability, it might be (partially) maintained on + * flash in future implementations. + * + * The EBA unit implements per-logical eraseblock locking. Before accessing a + * logical eraseblock it is locked for reading or writing. The per-logical + * eraseblock locking is implemented by means of the lock tree. The lock tree + * is an RB-tree which refers all the currently locked logical eraseblocks. The + * lock tree elements are &struct ltree_entry objects. They are indexed by + * (@vol_id, @lnum) pairs. + * + * EBA also maintains the global sequence counter which is incremented each + * time a logical eraseblock is mapped to a physical eraseblock and it is + * stored in the volume identifier header. This means that each VID header has + * a unique sequence number. The sequence number is only increased an we assume + * 64 bits is enough to never overflow. + */ + +#include <linux/slab.h> +#include <linux/crc32.h> +#include <linux/err.h> +#include "ubi.h" + +/** + * struct ltree_entry - an entry in the lock tree. + * @rb: links RB-tree nodes + * @vol_id: volume ID of the locked logical eraseblock + * @lnum: locked logical eraseblock number + * @users: how many tasks are using this logical eraseblock or wait for it + * @mutex: read/write mutex to implement read/write access serialization to + * the (@vol_id, @lnum) logical eraseblock + * + * When a logical eraseblock is being locked - corresponding &struct ltree_entry + * object is inserted to the lock tree (@ubi->ltree). + */ +struct ltree_entry { + struct rb_node rb; + int vol_id; + int lnum; + int users; + struct rw_semaphore mutex; +}; + +/* Slab cache for lock-tree entries */ +static struct kmem_cache *ltree_slab; + +/** + * next_sqnum - get next sequence number. + * @ubi: UBI device description object + * + * This function returns next sequence number to use, which is just the current + * global sequence counter value. It also increases the global sequence + * counter. 
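+ *
+ * For illustration (an assumption about how the value is used later): because
+ * every new mapping gets a fresh, strictly increasing number, two physical
+ * eraseblocks carrying the same logical eraseblock (e.g. after an interrupted
+ * copy) can be told apart: the copy whose VID header has the larger sequence
+ * number is treated as the more recent one.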
+ */ +static unsigned long long next_sqnum(struct ubi_device *ubi) +{ + unsigned long long sqnum; + + spin_lock(&ubi->ltree_lock); + sqnum = ubi->global_sqnum++; + spin_unlock(&ubi->ltree_lock); + + return sqnum; +} + +/** + * ubi_get_compat - get compatibility flags of a volume. + * @ubi: UBI device description object + * @vol_id: volume ID + * + * This function returns compatibility flags for an internal volume. User + * volumes have no compatibility flags, so %0 is returned. + */ +static int ubi_get_compat(const struct ubi_device *ubi, int vol_id) +{ + if (vol_id == UBI_LAYOUT_VOL_ID) + return UBI_LAYOUT_VOLUME_COMPAT; + return 0; +} + +/** + * ltree_lookup - look up the lock tree. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + * + * This function returns a pointer to the corresponding &struct ltree_entry + * object if the logical eraseblock is locked and %NULL if it is not. + * @ubi->ltree_lock has to be locked. + */ +static struct ltree_entry *ltree_lookup(struct ubi_device *ubi, int vol_id, + int lnum) +{ + struct rb_node *p; + + p = ubi->ltree.rb_node; + while (p) { + struct ltree_entry *le; + + le = rb_entry(p, struct ltree_entry, rb); + + if (vol_id < le->vol_id) + p = p->rb_left; + else if (vol_id > le->vol_id) + p = p->rb_right; + else { + if (lnum < le->lnum) + p = p->rb_left; + else if (lnum > le->lnum) + p = p->rb_right; + else + return le; + } + } + + return NULL; +} + +/** + * ltree_add_entry - add new entry to the lock tree. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + * + * This function adds new entry for logical eraseblock (@vol_id, @lnum) to the + * lock tree. If such entry is already there, its usage counter is increased. + * Returns pointer to the lock tree entry or %-ENOMEM if memory allocation + * failed. + */ +static struct ltree_entry *ltree_add_entry(struct ubi_device *ubi, int vol_id, + int lnum) +{ + struct ltree_entry *le, *le1, *le_free; + + le = kmem_cache_alloc(ltree_slab, GFP_KERNEL); + if (!le) + return ERR_PTR(-ENOMEM); + + le->vol_id = vol_id; + le->lnum = lnum; + + spin_lock(&ubi->ltree_lock); + le1 = ltree_lookup(ubi, vol_id, lnum); + + if (le1) { + /* + * This logical eraseblock is already locked. The newly + * allocated lock entry is not needed. + */ + le_free = le; + le = le1; + } else { + struct rb_node **p, *parent = NULL; + + /* + * No lock entry, add the newly allocated one to the + * @ubi->ltree RB-tree. + */ + le_free = NULL; + + p = &ubi->ltree.rb_node; + while (*p) { + parent = *p; + le1 = rb_entry(parent, struct ltree_entry, rb); + + if (vol_id < le1->vol_id) + p = &(*p)->rb_left; + else if (vol_id > le1->vol_id) + p = &(*p)->rb_right; + else { + ubi_assert(lnum != le1->lnum); + if (lnum < le1->lnum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + } + + rb_link_node(&le->rb, parent, p); + rb_insert_color(&le->rb, &ubi->ltree); + } + le->users += 1; + spin_unlock(&ubi->ltree_lock); + + if (le_free) + kmem_cache_free(ltree_slab, le_free); + + return le; +} + +/** + * leb_read_lock - lock logical eraseblock for reading. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + * + * This function locks a logical eraseblock for reading. Returns zero in case + * of success and a negative error code in case of failure. 
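+ *
+ * For illustration, a reader brackets the I/O roughly like this (a simplified
+ * sketch of what ubi_eba_read_leb() below does):
+ *
+ *    err = leb_read_lock(ubi, vol_id, lnum);
+ *    if (err)
+ *        return err;
+ *    ... read from the physical eraseblock the LEB is mapped to ...
+ *    leb_read_unlock(ubi, vol_id, lnum);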
+ */ +static int leb_read_lock(struct ubi_device *ubi, int vol_id, int lnum) +{ + struct ltree_entry *le; + + le = ltree_add_entry(ubi, vol_id, lnum); + if (IS_ERR(le)) + return PTR_ERR(le); + down_read(&le->mutex); + return 0; +} + +/** + * leb_read_unlock - unlock logical eraseblock. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + */ +static void leb_read_unlock(struct ubi_device *ubi, int vol_id, int lnum) +{ + int free = 0; + struct ltree_entry *le; + + spin_lock(&ubi->ltree_lock); + le = ltree_lookup(ubi, vol_id, lnum); + le->users -= 1; + ubi_assert(le->users >= 0); + if (le->users == 0) { + rb_erase(&le->rb, &ubi->ltree); + free = 1; + } + spin_unlock(&ubi->ltree_lock); + + up_read(&le->mutex); + if (free) + kmem_cache_free(ltree_slab, le); +} + +/** + * leb_write_lock - lock logical eraseblock for writing. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + * + * This function locks a logical eraseblock for writing. Returns zero in case + * of success and a negative error code in case of failure. + */ +static int leb_write_lock(struct ubi_device *ubi, int vol_id, int lnum) +{ + struct ltree_entry *le; + + le = ltree_add_entry(ubi, vol_id, lnum); + if (IS_ERR(le)) + return PTR_ERR(le); + down_write(&le->mutex); + return 0; +} + +/** + * leb_write_unlock - unlock logical eraseblock. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + */ +static void leb_write_unlock(struct ubi_device *ubi, int vol_id, int lnum) +{ + int free; + struct ltree_entry *le; + + spin_lock(&ubi->ltree_lock); + le = ltree_lookup(ubi, vol_id, lnum); + le->users -= 1; + ubi_assert(le->users >= 0); + if (le->users == 0) { + rb_erase(&le->rb, &ubi->ltree); + free = 1; + } else + free = 0; + spin_unlock(&ubi->ltree_lock); + + up_write(&le->mutex); + if (free) + kmem_cache_free(ltree_slab, le); +} + +/** + * ubi_eba_unmap_leb - un-map logical eraseblock. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + * + * This function un-maps logical eraseblock @lnum and schedules corresponding + * physical eraseblock for erasure. Returns zero in case of success and a + * negative error code in case of failure. + */ +int ubi_eba_unmap_leb(struct ubi_device *ubi, int vol_id, int lnum) +{ + int idx = vol_id2idx(ubi, vol_id), err, pnum; + struct ubi_volume *vol = ubi->volumes[idx]; + + if (ubi->ro_mode) + return -EROFS; + + err = leb_write_lock(ubi, vol_id, lnum); + if (err) + return err; + + pnum = vol->eba_tbl[lnum]; + if (pnum < 0) + /* This logical eraseblock is already unmapped */ + goto out_unlock; + + dbg_eba("erase LEB %d:%d, PEB %d", vol_id, lnum, pnum); + + vol->eba_tbl[lnum] = UBI_LEB_UNMAPPED; + err = ubi_wl_put_peb(ubi, pnum, 0); + +out_unlock: + leb_write_unlock(ubi, vol_id, lnum); + return err; +} + +/** + * ubi_eba_read_leb - read data. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + * @buf: buffer to store the read data + * @offset: offset from where to read + * @len: how many bytes to read + * @check: data CRC check flag + * + * If the logical eraseblock @lnum is unmapped, @buf is filled with 0xFF + * bytes. The @check flag only makes sense for static volumes and forces + * eraseblock data CRC checking. + * + * In case of success this function returns zero. In case of a static volume, + * if data CRC mismatches - %-EBADMSG is returned. 
%-EBADMSG may also be + * returned for any volume type if an ECC error was detected by the MTD device + * driver. Other negative error cored may be returned in case of other errors. + */ +int ubi_eba_read_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf, + int offset, int len, int check) +{ + int err, pnum, scrub = 0, idx = vol_id2idx(ubi, vol_id); + struct ubi_vid_hdr *vid_hdr; + struct ubi_volume *vol = ubi->volumes[idx]; + uint32_t crc, crc1; + + err = leb_read_lock(ubi, vol_id, lnum); + if (err) + return err; + + pnum = vol->eba_tbl[lnum]; + if (pnum < 0) { + /* + * The logical eraseblock is not mapped, fill the whole buffer + * with 0xFF bytes. The exception is static volumes for which + * it is an error to read unmapped logical eraseblocks. + */ + dbg_eba("read %d bytes from offset %d of LEB %d:%d (unmapped)", + len, offset, vol_id, lnum); + leb_read_unlock(ubi, vol_id, lnum); + ubi_assert(vol->vol_type != UBI_STATIC_VOLUME); + memset(buf, 0xFF, len); + return 0; + } + + dbg_eba("read %d bytes from offset %d of LEB %d:%d, PEB %d", + len, offset, vol_id, lnum, pnum); + + if (vol->vol_type == UBI_DYNAMIC_VOLUME) + check = 0; + +retry: + if (check) { + vid_hdr = ubi_zalloc_vid_hdr(ubi); + if (!vid_hdr) { + err = -ENOMEM; + goto out_unlock; + } + + err = ubi_io_read_vid_hdr(ubi, pnum, vid_hdr, 1); + if (err && err != UBI_IO_BITFLIPS) { + if (err > 0) { + /* + * The header is either absent or corrupted. + * The former case means there is a bug - + * switch to read-only mode just in case. + * The latter case means a real corruption - we + * may try to recover data. FIXME: but this is + * not implemented. + */ + if (err == UBI_IO_BAD_VID_HDR) { + ubi_warn("bad VID header at PEB %d, LEB" + "%d:%d", pnum, vol_id, lnum); + err = -EBADMSG; + } else + ubi_ro_mode(ubi); + } + goto out_free; + } else if (err == UBI_IO_BITFLIPS) + scrub = 1; + + ubi_assert(lnum < ubi32_to_cpu(vid_hdr->used_ebs)); + ubi_assert(len == ubi32_to_cpu(vid_hdr->data_size)); + + crc = ubi32_to_cpu(vid_hdr->data_crc); + ubi_free_vid_hdr(ubi, vid_hdr); + } + + err = ubi_io_read_data(ubi, buf, pnum, offset, len); + if (err) { + if (err == UBI_IO_BITFLIPS) { + scrub = 1; + err = 0; + } else if (err == -EBADMSG) { + if (vol->vol_type == UBI_DYNAMIC_VOLUME) + goto out_unlock; + scrub = 1; + if (!check) { + ubi_msg("force data checking"); + check = 1; + goto retry; + } + } else + goto out_unlock; + } + + if (check) { + crc1 = crc32(UBI_CRC32_INIT, buf, len); + if (crc1 != crc) { + ubi_warn("CRC error: calculated %#08x, must be %#08x", + crc1, crc); + err = -EBADMSG; + goto out_unlock; + } + } + + if (scrub) + err = ubi_wl_scrub_peb(ubi, pnum); + + leb_read_unlock(ubi, vol_id, lnum); + return err; + +out_free: + ubi_free_vid_hdr(ubi, vid_hdr); +out_unlock: + leb_read_unlock(ubi, vol_id, lnum); + return err; +} + +/** + * recover_peb - recover from write failure. + * @ubi: UBI device description object + * @pnum: the physical eraseblock to recover + * @vol_id: volume ID + * @lnum: logical eraseblock number + * @buf: data which was not written because of the write failure + * @offset: offset of the failed write + * @len: how many bytes should have been written + * + * This function is called in case of a write failure and moves all good data + * from the potentially bad physical eraseblock to a good physical eraseblock. + * This function also writes the data which was not written due to the failure. + * Returns new physical eraseblock number in case of success, and a negative + * error code in case of failure. 
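+ *
+ * For example (illustrative numbers only): if a write of @len = 512 bytes at
+ * @offset = 1024 failed, the function picks a new physical eraseblock, writes
+ * a VID header with a fresh sequence number to it, reads the first 1024 bytes
+ * back from the old physical eraseblock, appends the 512 bytes from @buf, and
+ * writes the resulting 1536 bytes of data to the new physical eraseblock.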
+ */ +static int recover_peb(struct ubi_device *ubi, int pnum, int vol_id, int lnum, + const void *buf, int offset, int len) +{ + int err, idx = vol_id2idx(ubi, vol_id), new_pnum, data_size, tries = 0; + struct ubi_volume *vol = ubi->volumes[idx]; + struct ubi_vid_hdr *vid_hdr; + unsigned char *new_buf; + + vid_hdr = ubi_zalloc_vid_hdr(ubi); + if (!vid_hdr) { + return -ENOMEM; + } + +retry: + new_pnum = ubi_wl_get_peb(ubi, UBI_UNKNOWN); + if (new_pnum < 0) { + ubi_free_vid_hdr(ubi, vid_hdr); + return new_pnum; + } + + ubi_msg("recover PEB %d, move data to PEB %d", pnum, new_pnum); + + err = ubi_io_read_vid_hdr(ubi, pnum, vid_hdr, 1); + if (err && err != UBI_IO_BITFLIPS) { + if (err > 0) + err = -EIO; + goto out_put; + } + + vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); + err = ubi_io_write_vid_hdr(ubi, new_pnum, vid_hdr); + if (err) + goto write_error; + + data_size = offset + len; + new_buf = kmalloc(data_size, GFP_KERNEL); + if (!new_buf) { + err = -ENOMEM; + goto out_put; + } + memset(new_buf + offset, 0xFF, len); + + /* Read everything before the area where the write failure happened */ + if (offset > 0) { + err = ubi_io_read_data(ubi, new_buf, pnum, 0, offset); + if (err && err != UBI_IO_BITFLIPS) { + kfree(new_buf); + goto out_put; + } + } + + memcpy(new_buf + offset, buf, len); + + err = ubi_io_write_data(ubi, new_buf, new_pnum, 0, data_size); + if (err) { + kfree(new_buf); + goto write_error; + } + + kfree(new_buf); + ubi_free_vid_hdr(ubi, vid_hdr); + + vol->eba_tbl[lnum] = new_pnum; + ubi_wl_put_peb(ubi, pnum, 1); + + ubi_msg("data was successfully recovered"); + return 0; + +out_put: + ubi_wl_put_peb(ubi, new_pnum, 1); + ubi_free_vid_hdr(ubi, vid_hdr); + return err; + +write_error: + /* + * Bad luck? This physical eraseblock is bad too? Crud. Let's try to + * get another one. + */ + ubi_warn("failed to write to PEB %d", new_pnum); + ubi_wl_put_peb(ubi, new_pnum, 1); + if (++tries > UBI_IO_RETRIES) { + ubi_free_vid_hdr(ubi, vid_hdr); + return err; + } + ubi_msg("try again"); + goto retry; +} + +/** + * ubi_eba_write_leb - write data to dynamic volume. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + * @buf: the data to write + * @offset: offset within the logical eraseblock where to write + * @len: how many bytes to write + * @dtype: data type + * + * This function writes data to logical eraseblock @lnum of a dynamic volume + * @vol_id. Returns zero in case of success and a negative error code in case + * of failure. In case of error, it is possible that something was still + * written to the flash media, but may be some garbage. + */ +int ubi_eba_write_leb(struct ubi_device *ubi, int vol_id, int lnum, + const void *buf, int offset, int len, int dtype) +{ + int idx = vol_id2idx(ubi, vol_id), err, pnum, tries = 0; + struct ubi_volume *vol = ubi->volumes[idx]; + struct ubi_vid_hdr *vid_hdr; + + if (ubi->ro_mode) + return -EROFS; + + err = leb_write_lock(ubi, vol_id, lnum); + if (err) + return err; + + pnum = vol->eba_tbl[lnum]; + if (pnum >= 0) { + dbg_eba("write %d bytes at offset %d of LEB %d:%d, PEB %d", + len, offset, vol_id, lnum, pnum); + + err = ubi_io_write_data(ubi, buf, pnum, offset, len); + if (err) { + ubi_warn("failed to write data to PEB %d", pnum); + if (err == -EIO && ubi->bad_allowed) + err = recover_peb(ubi, pnum, vol_id, lnum, buf, offset, len); + if (err) + ubi_ro_mode(ubi); + } + leb_write_unlock(ubi, vol_id, lnum); + return err; + } + + /* + * The logical eraseblock is not mapped. 
We have to get a free physical + * eraseblock and write the volume identifier header there first. + */ + vid_hdr = ubi_zalloc_vid_hdr(ubi); + if (!vid_hdr) { + leb_write_unlock(ubi, vol_id, lnum); + return -ENOMEM; + } + + vid_hdr->vol_type = UBI_VID_DYNAMIC; + vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); + vid_hdr->vol_id = cpu_to_ubi32(vol_id); + vid_hdr->lnum = cpu_to_ubi32(lnum); + vid_hdr->compat = ubi_get_compat(ubi, vol_id); + vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad); + +retry: + pnum = ubi_wl_get_peb(ubi, dtype); + if (pnum < 0) { + ubi_free_vid_hdr(ubi, vid_hdr); + leb_write_unlock(ubi, vol_id, lnum); + return pnum; + } + + dbg_eba("write VID hdr and %d bytes at offset %d of LEB %d:%d, PEB %d", + len, offset, vol_id, lnum, pnum); + + err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr); + if (err) { + ubi_warn("failed to write VID header to LEB %d:%d, PEB %d", + vol_id, lnum, pnum); + goto write_error; + } + + err = ubi_io_write_data(ubi, buf, pnum, offset, len); + if (err) { + ubi_warn("failed to write %d bytes at offset %d of LEB %d:%d, " + "PEB %d", len, offset, vol_id, lnum, pnum); + goto write_error; + } + + vol->eba_tbl[lnum] = pnum; + + leb_write_unlock(ubi, vol_id, lnum); + ubi_free_vid_hdr(ubi, vid_hdr); + return 0; + +write_error: + if (err != -EIO || !ubi->bad_allowed) { + ubi_ro_mode(ubi); + leb_write_unlock(ubi, vol_id, lnum); + ubi_free_vid_hdr(ubi, vid_hdr); + return err; + } + + /* + * Fortunately, this is the first write operation to this physical + * eraseblock, so just put it and request a new one. We assume that if + * this physical eraseblock went bad, the erase code will handle that. + */ + err = ubi_wl_put_peb(ubi, pnum, 1); + if (err || ++tries > UBI_IO_RETRIES) { + ubi_ro_mode(ubi); + leb_write_unlock(ubi, vol_id, lnum); + ubi_free_vid_hdr(ubi, vid_hdr); + return err; + } + + vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); + ubi_msg("try another PEB"); + goto retry; +} + +/** + * ubi_eba_write_leb_st - write data to static volume. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + * @buf: data to write + * @len: how many bytes to write + * @dtype: data type + * @used_ebs: how many logical eraseblocks will this volume contain + * + * This function writes data to logical eraseblock @lnum of static volume + * @vol_id. The @used_ebs argument should contain total number of logical + * eraseblock in this static volume. + * + * When writing to the last logical eraseblock, the @len argument doesn't have + * to be aligned to the minimal I/O unit size. Instead, it has to be equivalent + * to the real data size, although the @buf buffer has to contain the + * alignment. In all other cases, @len has to be aligned. + * + * It is prohibited to write more then once to logical eraseblocks of static + * volumes. This function returns zero in case of success and a negative error + * code in case of failure. 
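+ *
+ * For example (illustrative numbers only): with a 512 byte minimal I/O unit,
+ * writing the last logical eraseblock with @len = 1000 stores data_size = 1000
+ * in the VID header and calculates the data CRC over exactly those 1000 bytes,
+ * while the flash write itself covers ALIGN(1000, 512) = 1024 bytes, so @buf
+ * has to contain the 24 bytes of padding as well.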
+ */ +int ubi_eba_write_leb_st(struct ubi_device *ubi, int vol_id, int lnum, + const void *buf, int len, int dtype, int used_ebs) +{ + int err, pnum, tries = 0, data_size = len; + int idx = vol_id2idx(ubi, vol_id); + struct ubi_volume *vol = ubi->volumes[idx]; + struct ubi_vid_hdr *vid_hdr; + uint32_t crc; + + if (ubi->ro_mode) + return -EROFS; + + if (lnum == used_ebs - 1) + /* If this is the last LEB @len may be unaligned */ + len = ALIGN(data_size, ubi->min_io_size); + else + ubi_assert(len % ubi->min_io_size == 0); + + vid_hdr = ubi_zalloc_vid_hdr(ubi); + if (!vid_hdr) + return -ENOMEM; + + err = leb_write_lock(ubi, vol_id, lnum); + if (err) { + ubi_free_vid_hdr(ubi, vid_hdr); + return err; + } + + vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); + vid_hdr->vol_id = cpu_to_ubi32(vol_id); + vid_hdr->lnum = cpu_to_ubi32(lnum); + vid_hdr->compat = ubi_get_compat(ubi, vol_id); + vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad); + + crc = crc32(UBI_CRC32_INIT, buf, data_size); + vid_hdr->vol_type = UBI_VID_STATIC; + vid_hdr->data_size = cpu_to_ubi32(data_size); + vid_hdr->used_ebs = cpu_to_ubi32(used_ebs); + vid_hdr->data_crc = cpu_to_ubi32(crc); + +retry: + pnum = ubi_wl_get_peb(ubi, dtype); + if (pnum < 0) { + ubi_free_vid_hdr(ubi, vid_hdr); + leb_write_unlock(ubi, vol_id, lnum); + return pnum; + } + + dbg_eba("write VID hdr and %d bytes at LEB %d:%d, PEB %d, used_ebs %d", + len, vol_id, lnum, pnum, used_ebs); + + err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr); + if (err) { + ubi_warn("failed to write VID header to LEB %d:%d, PEB %d", + vol_id, lnum, pnum); + goto write_error; + } + + err = ubi_io_write_data(ubi, buf, pnum, 0, len); + if (err) { + ubi_warn("failed to write %d bytes of data to PEB %d", + len, pnum); + goto write_error; + } + + ubi_assert(vol->eba_tbl[lnum] < 0); + vol->eba_tbl[lnum] = pnum; + + leb_write_unlock(ubi, vol_id, lnum); + ubi_free_vid_hdr(ubi, vid_hdr); + return 0; + +write_error: + if (err != -EIO || !ubi->bad_allowed) { + /* + * This flash device does not admit of bad eraseblocks or + * something nasty and unexpected happened. Switch to read-only + * mode just in case. + */ + ubi_ro_mode(ubi); + leb_write_unlock(ubi, vol_id, lnum); + ubi_free_vid_hdr(ubi, vid_hdr); + return err; + } + + err = ubi_wl_put_peb(ubi, pnum, 1); + if (err || ++tries > UBI_IO_RETRIES) { + ubi_ro_mode(ubi); + leb_write_unlock(ubi, vol_id, lnum); + ubi_free_vid_hdr(ubi, vid_hdr); + return err; + } + + vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); + ubi_msg("try another PEB"); + goto retry; +} + +/* + * ubi_eba_atomic_leb_change - change logical eraseblock atomically. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + * @buf: data to write + * @len: how many bytes to write + * @dtype: data type + * + * This function changes the contents of a logical eraseblock atomically. @buf + * has to contain new logical eraseblock data, and @len - the length of the + * data, which has to be aligned. This function guarantees that in case of an + * unclean reboot the old contents is preserved. Returns zero in case of + * success and a negative error code in case of failure. 
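+ *
+ * For illustration, the change is atomic because the new contents goes to a
+ * different physical eraseblock first, and the old one is given back to the
+ * wear-leveling unit only after the new data has been fully written; roughly
+ * (simplified sketch of the code below):
+ *
+ *    pnum = ubi_wl_get_peb(ubi, dtype);
+ *    ... write the VID header and the new data to @pnum ...
+ *    ubi_wl_put_peb(ubi, vol->eba_tbl[lnum], 1);
+ *    vol->eba_tbl[lnum] = pnum;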
+ */ +int ubi_eba_atomic_leb_change(struct ubi_device *ubi, int vol_id, int lnum, + const void *buf, int len, int dtype) +{ + int err, pnum, tries = 0, idx = vol_id2idx(ubi, vol_id); + struct ubi_volume *vol = ubi->volumes[idx]; + struct ubi_vid_hdr *vid_hdr; + uint32_t crc; + + if (ubi->ro_mode) + return -EROFS; + + vid_hdr = ubi_zalloc_vid_hdr(ubi); + if (!vid_hdr) + return -ENOMEM; + + err = leb_write_lock(ubi, vol_id, lnum); + if (err) { + ubi_free_vid_hdr(ubi, vid_hdr); + return err; + } + + vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); + vid_hdr->vol_id = cpu_to_ubi32(vol_id); + vid_hdr->lnum = cpu_to_ubi32(lnum); + vid_hdr->compat = ubi_get_compat(ubi, vol_id); + vid_hdr->data_pad = cpu_to_ubi32(vol->data_pad); + + crc = crc32(UBI_CRC32_INIT, buf, len); + vid_hdr->vol_type = UBI_VID_STATIC; + vid_hdr->data_size = cpu_to_ubi32(len); + vid_hdr->copy_flag = 1; + vid_hdr->data_crc = cpu_to_ubi32(crc); + +retry: + pnum = ubi_wl_get_peb(ubi, dtype); + if (pnum < 0) { + ubi_free_vid_hdr(ubi, vid_hdr); + leb_write_unlock(ubi, vol_id, lnum); + return pnum; + } + + dbg_eba("change LEB %d:%d, PEB %d, write VID hdr to PEB %d", + vol_id, lnum, vol->eba_tbl[lnum], pnum); + + err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr); + if (err) { + ubi_warn("failed to write VID header to LEB %d:%d, PEB %d", + vol_id, lnum, pnum); + goto write_error; + } + + err = ubi_io_write_data(ubi, buf, pnum, 0, len); + if (err) { + ubi_warn("failed to write %d bytes of data to PEB %d", + len, pnum); + goto write_error; + } + + err = ubi_wl_put_peb(ubi, vol->eba_tbl[lnum], 1); + if (err) { + ubi_free_vid_hdr(ubi, vid_hdr); + leb_write_unlock(ubi, vol_id, lnum); + return err; + } + + vol->eba_tbl[lnum] = pnum; + leb_write_unlock(ubi, vol_id, lnum); + ubi_free_vid_hdr(ubi, vid_hdr); + return 0; + +write_error: + if (err != -EIO || !ubi->bad_allowed) { + /* + * This flash device does not admit of bad eraseblocks or + * something nasty and unexpected happened. Switch to read-only + * mode just in case. + */ + ubi_ro_mode(ubi); + leb_write_unlock(ubi, vol_id, lnum); + ubi_free_vid_hdr(ubi, vid_hdr); + return err; + } + + err = ubi_wl_put_peb(ubi, pnum, 1); + if (err || ++tries > UBI_IO_RETRIES) { + ubi_ro_mode(ubi); + leb_write_unlock(ubi, vol_id, lnum); + ubi_free_vid_hdr(ubi, vid_hdr); + return err; + } + + vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); + ubi_msg("try another PEB"); + goto retry; +} + +/** + * ltree_entry_ctor - lock tree entries slab cache constructor. + * @obj: the lock-tree entry to construct + * @cache: the lock tree entry slab cache + * @flags: constructor flags + */ +static void ltree_entry_ctor(void *obj, struct kmem_cache *cache, + unsigned long flags) +{ + struct ltree_entry *le = obj; + + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) != + SLAB_CTOR_CONSTRUCTOR) + return; + + le->users = 0; + init_rwsem(&le->mutex); +} + +/** + * ubi_eba_copy_leb - copy logical eraseblock. + * @ubi: UBI device description object + * @from: physical eraseblock number from where to copy + * @to: physical eraseblock number where to copy + * @vid_hdr: VID header of the @from physical eraseblock + * + * This function copies logical eraseblock from physical eraseblock @from to + * physical eraseblock @to. The @vid_hdr buffer may be changed by this + * function. Returns zero in case of success, %UBI_IO_BITFLIPS if the operation + * was canceled because bit-flips were detected at the target PEB, and a + * negative error code in case of failure. 
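+ *
+ * For illustration, the copy is canceled (zero is returned and nothing is
+ * moved) if the logical eraseblock turns out to be stale once it has been
+ * locked, roughly:
+ *
+ *    spin_lock(&ubi->volumes_lock);
+ *    vol = ubi->volumes[vol_id2idx(ubi, vol_id)];
+ *    if (!vol || vol->eba_tbl[lnum] != from)
+ *        ... the volume was removed or the LEB was re-mapped: cancel ...
+ *    spin_unlock(&ubi->volumes_lock);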
+ */ +int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to, + struct ubi_vid_hdr *vid_hdr) +{ + int err, vol_id, lnum, data_size, aldata_size, pnum, idx; + struct ubi_volume *vol; + uint32_t crc; + void *buf, *buf1 = NULL; + + vol_id = ubi32_to_cpu(vid_hdr->vol_id); + lnum = ubi32_to_cpu(vid_hdr->lnum); + + dbg_eba("copy LEB %d:%d, PEB %d to PEB %d", vol_id, lnum, from, to); + + if (vid_hdr->vol_type == UBI_VID_STATIC) { + data_size = ubi32_to_cpu(vid_hdr->data_size); + aldata_size = ALIGN(data_size, ubi->min_io_size); + } else + data_size = aldata_size = + ubi->leb_size - ubi32_to_cpu(vid_hdr->data_pad); + + buf = kmalloc(aldata_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * We do not want anybody to write to this logical eraseblock while we + * are moving it, so we lock it. + */ + err = leb_write_lock(ubi, vol_id, lnum); + if (err) { + kfree(buf); + return err; + } + + /* + * But the logical eraseblock might have been put by this time. + * Cancel if it is true. + */ + idx = vol_id2idx(ubi, vol_id); + + /* + * We may race with volume deletion/re-size, so we have to hold + * @ubi->volumes_lock. + */ + spin_lock(&ubi->volumes_lock); + vol = ubi->volumes[idx]; + if (!vol) { + dbg_eba("volume %d was removed meanwhile", vol_id); + spin_unlock(&ubi->volumes_lock); + goto out_unlock; + } + + pnum = vol->eba_tbl[lnum]; + if (pnum != from) { + dbg_eba("LEB %d:%d is no longer mapped to PEB %d, mapped to " + "PEB %d, cancel", vol_id, lnum, from, pnum); + spin_unlock(&ubi->volumes_lock); + goto out_unlock; + } + spin_unlock(&ubi->volumes_lock); + + /* OK, now the LEB is locked and we can safely start moving it */ + + dbg_eba("read %d bytes of data", aldata_size); + err = ubi_io_read_data(ubi, buf, from, 0, aldata_size); + if (err && err != UBI_IO_BITFLIPS) { + ubi_warn("error %d while reading data from PEB %d", + err, from); + goto out_unlock; + } + + /* + * Now we have got to calculate how much data we have to to copy. In + * case of a static volume it is fairly easy - the VID header contains + * the data size. In case of a dynamic volume it is more difficult - we + * have to read the contents, cut 0xFF bytes from the end and copy only + * the first part. We must do this to avoid writing 0xFF bytes as it + * may have some side-effects. And not only this. It is important not + * to include those 0xFFs to CRC because later the they may be filled + * by data. + */ + if (vid_hdr->vol_type == UBI_VID_DYNAMIC) + aldata_size = data_size = + ubi_calc_data_len(ubi, buf, data_size); + + cond_resched(); + crc = crc32(UBI_CRC32_INIT, buf, data_size); + cond_resched(); + + /* + * It may turn out to me that the whole @from physical eraseblock + * contains only 0xFF bytes. Then we have to only write the VID header + * and do not write any data. This also means we should not set + * @vid_hdr->copy_flag, @vid_hdr->data_size, and @vid_hdr->data_crc. 
+ */ + if (data_size > 0) { + vid_hdr->copy_flag = 1; + vid_hdr->data_size = cpu_to_ubi32(data_size); + vid_hdr->data_crc = cpu_to_ubi32(crc); + } + vid_hdr->sqnum = cpu_to_ubi64(next_sqnum(ubi)); + + err = ubi_io_write_vid_hdr(ubi, to, vid_hdr); + if (err) + goto out_unlock; + + cond_resched(); + + /* Read the VID header back and check if it was written correctly */ + err = ubi_io_read_vid_hdr(ubi, to, vid_hdr, 1); + if (err) { + if (err != UBI_IO_BITFLIPS) + ubi_warn("cannot read VID header back from PEB %d", to); + goto out_unlock; + } + + if (data_size > 0) { + err = ubi_io_write_data(ubi, buf, to, 0, aldata_size); + if (err) + goto out_unlock; + + /* + * We've written the data and are going to read it back to make + * sure it was written correctly. + */ + buf1 = kmalloc(aldata_size, GFP_KERNEL); + if (!buf1) { + err = -ENOMEM; + goto out_unlock; + } + + cond_resched(); + + err = ubi_io_read_data(ubi, buf1, to, 0, aldata_size); + if (err) { + if (err != UBI_IO_BITFLIPS) + ubi_warn("cannot read data back from PEB %d", + to); + goto out_unlock; + } + + cond_resched(); + + if (memcmp(buf, buf1, aldata_size)) { + ubi_warn("read data back from PEB %d - it is different", + to); + goto out_unlock; + } + } + + ubi_assert(vol->eba_tbl[lnum] == from); + vol->eba_tbl[lnum] = to; + + leb_write_unlock(ubi, vol_id, lnum); + kfree(buf); + kfree(buf1); + + return 0; + +out_unlock: + leb_write_unlock(ubi, vol_id, lnum); + kfree(buf); + kfree(buf1); + return err; +} + +/** + * ubi_eba_init_scan - initialize the EBA unit using scanning information. + * @ubi: UBI device description object + * @si: scanning information + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si) +{ + int i, j, err, num_volumes; + struct ubi_scan_volume *sv; + struct ubi_volume *vol; + struct ubi_scan_leb *seb; + struct rb_node *rb; + + dbg_eba("initialize EBA unit"); + + spin_lock_init(&ubi->ltree_lock); + ubi->ltree = RB_ROOT; + + if (ubi_devices_cnt == 0) { + ltree_slab = kmem_cache_create("ubi_ltree_slab", + sizeof(struct ltree_entry), 0, + 0, <ree_entry_ctor, NULL); + if (!ltree_slab) + return -ENOMEM; + } + + ubi->global_sqnum = si->max_sqnum + 1; + num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT; + + for (i = 0; i < num_volumes; i++) { + vol = ubi->volumes[i]; + if (!vol) + continue; + + cond_resched(); + + vol->eba_tbl = kmalloc(vol->reserved_pebs * sizeof(int), + GFP_KERNEL); + if (!vol->eba_tbl) { + err = -ENOMEM; + goto out_free; + } + + for (j = 0; j < vol->reserved_pebs; j++) + vol->eba_tbl[j] = UBI_LEB_UNMAPPED; + + sv = ubi_scan_find_sv(si, idx2vol_id(ubi, i)); + if (!sv) + continue; + + ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb) { + if (seb->lnum >= vol->reserved_pebs) + /* + * This may happen in case of an unclean reboot + * during re-size. 
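+ *
+ * For example (illustrative numbers only): if the volume was being
+ * shrunk from 10 to 8 logical eraseblocks when the power was cut,
+ * scanning may still find LEBs 8 and 9 on the flash; they do not fit
+ * into the new 8-entry @eba_tbl, so their physical eraseblocks are
+ * simply scheduled for erasure.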
+ */ + ubi_scan_move_to_list(sv, seb, &si->erase); + vol->eba_tbl[seb->lnum] = seb->pnum; + } + } + + if (ubi->bad_allowed) { + ubi_calculate_reserved(ubi); + + if (ubi->avail_pebs < ubi->beb_rsvd_level) { + /* No enough free physical eraseblocks */ + ubi->beb_rsvd_pebs = ubi->avail_pebs; + ubi_warn("cannot reserve enough PEBs for bad PEB " + "handling, reserved %d, need %d", + ubi->beb_rsvd_pebs, ubi->beb_rsvd_level); + } else + ubi->beb_rsvd_pebs = ubi->beb_rsvd_level; + + ubi->avail_pebs -= ubi->beb_rsvd_pebs; + ubi->rsvd_pebs += ubi->beb_rsvd_pebs; + } + + dbg_eba("EBA unit is initialized"); + return 0; + +out_free: + for (i = 0; i < num_volumes; i++) { + if (!ubi->volumes[i]) + continue; + kfree(ubi->volumes[i]->eba_tbl); + } + if (ubi_devices_cnt == 0) + kmem_cache_destroy(ltree_slab); + return err; +} + +/** + * ubi_eba_close - close EBA unit. + * @ubi: UBI device description object + */ +void ubi_eba_close(const struct ubi_device *ubi) +{ + int i, num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT; + + dbg_eba("close EBA unit"); + + for (i = 0; i < num_volumes; i++) { + if (!ubi->volumes[i]) + continue; + kfree(ubi->volumes[i]->eba_tbl); + } + if (ubi_devices_cnt == 1) + kmem_cache_destroy(ltree_slab); +} diff --git a/drivers/mtd/ubi/gluebi.c b/drivers/mtd/ubi/gluebi.c new file mode 100644 index 00000000000..fc9478d605f --- /dev/null +++ b/drivers/mtd/ubi/gluebi.c @@ -0,0 +1,323 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём), Joern Engel + */ + +/* + * This file includes implementation of fake MTD devices for each UBI volume. + * This sounds strange, but it is in fact quite useful to make MTD-oriented + * software (including all the legacy software) to work on top of UBI. + * + * Gluebi emulates MTD devices of "MTD_UBIVOLUME" type. Their minimal I/O unit + * size (mtd->writesize) is equivalent to the UBI minimal I/O unit. The + * eraseblock size is equivalent to the logical eraseblock size of the volume. + */ + +#include <asm/div64.h> +#include "ubi.h" + +/** + * gluebi_get_device - get MTD device reference. + * @mtd: the MTD device description object + * + * This function is called every time the MTD device is being opened and + * implements the MTD get_device() operation. Returns zero in case of success + * and a negative error code in case of failure. + */ +static int gluebi_get_device(struct mtd_info *mtd) +{ + struct ubi_volume *vol; + + vol = container_of(mtd, struct ubi_volume, gluebi_mtd); + + /* + * We do not introduce locks for gluebi reference count because the + * get_device()/put_device() calls are already serialized at MTD. + */ + if (vol->gluebi_refcount > 0) { + /* + * The MTD device is already referenced and this is just one + * more reference. 
MTD allows many users to open the same + * volume simultaneously and do not distinguish between + * readers/writers/exclusive openers as UBI does. So we do not + * open the UBI volume again - just increase the reference + * counter and return. + */ + vol->gluebi_refcount += 1; + return 0; + } + + /* + * This is the first reference to this UBI volume via the MTD device + * interface. Open the corresponding volume in read-write mode. + */ + vol->gluebi_desc = ubi_open_volume(vol->ubi->ubi_num, vol->vol_id, + UBI_READWRITE); + if (IS_ERR(vol->gluebi_desc)) + return PTR_ERR(vol->gluebi_desc); + vol->gluebi_refcount += 1; + return 0; +} + +/** + * gluebi_put_device - put MTD device reference. + * @mtd: the MTD device description object + * + * This function is called every time the MTD device is being put. Returns + * zero in case of success and a negative error code in case of failure. + */ +static void gluebi_put_device(struct mtd_info *mtd) +{ + struct ubi_volume *vol; + + vol = container_of(mtd, struct ubi_volume, gluebi_mtd); + vol->gluebi_refcount -= 1; + ubi_assert(vol->gluebi_refcount >= 0); + if (vol->gluebi_refcount == 0) + ubi_close_volume(vol->gluebi_desc); +} + +/** + * gluebi_read - read operation of emulated MTD devices. + * @mtd: MTD device description object + * @from: absolute offset from where to read + * @len: how many bytes to read + * @retlen: count of read bytes is returned here + * @buf: buffer to store the read data + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int gluebi_read(struct mtd_info *mtd, loff_t from, size_t len, + size_t *retlen, unsigned char *buf) +{ + int err = 0, lnum, offs, total_read; + struct ubi_volume *vol; + struct ubi_device *ubi; + uint64_t tmp = from; + + dbg_msg("read %zd bytes from offset %lld", len, from); + + if (len < 0 || from < 0 || from + len > mtd->size) + return -EINVAL; + + vol = container_of(mtd, struct ubi_volume, gluebi_mtd); + ubi = vol->ubi; + + offs = do_div(tmp, mtd->erasesize); + lnum = tmp; + + total_read = len; + while (total_read) { + size_t to_read = mtd->erasesize - offs; + + if (to_read > total_read) + to_read = total_read; + + err = ubi_eba_read_leb(ubi, vol->vol_id, lnum, buf, offs, + to_read, 0); + if (err) + break; + + lnum += 1; + offs = 0; + total_read -= to_read; + buf += to_read; + } + + *retlen = len - total_read; + return err; +} + +/** + * gluebi_write - write operation of emulated MTD devices. + * @mtd: MTD device description object + * @to: absolute offset where to write + * @len: how many bytes to write + * @retlen: count of written bytes is returned here + * @buf: buffer with data to write + * + * This function returns zero in case of success and a negative error code in + * case of failure. 
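+ *
+ * For example (illustrative numbers only): with a 128 KiB logical eraseblock
+ * size, a 150 KiB write at offset 200 KiB starts at LEB 1, offset 72 KiB; the
+ * first chunk covers the remaining 56 KiB of LEB 1 and the second chunk the
+ * first 94 KiB of LEB 2, each passed to ubi_eba_write_leb() separately.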
+ */ +static int gluebi_write(struct mtd_info *mtd, loff_t to, size_t len, + size_t *retlen, const u_char *buf) +{ + int err = 0, lnum, offs, total_written; + struct ubi_volume *vol; + struct ubi_device *ubi; + uint64_t tmp = to; + + dbg_msg("write %zd bytes to offset %lld", len, to); + + if (len < 0 || to < 0 || len + to > mtd->size) + return -EINVAL; + + vol = container_of(mtd, struct ubi_volume, gluebi_mtd); + ubi = vol->ubi; + + if (ubi->ro_mode) + return -EROFS; + + offs = do_div(tmp, mtd->erasesize); + lnum = tmp; + + if (len % mtd->writesize || offs % mtd->writesize) + return -EINVAL; + + total_written = len; + while (total_written) { + size_t to_write = mtd->erasesize - offs; + + if (to_write > total_written) + to_write = total_written; + + err = ubi_eba_write_leb(ubi, vol->vol_id, lnum, buf, offs, + to_write, UBI_UNKNOWN); + if (err) + break; + + lnum += 1; + offs = 0; + total_written -= to_write; + buf += to_write; + } + + *retlen = len - total_written; + return err; +} + +/** + * gluebi_erase - erase operation of emulated MTD devices. + * @mtd: the MTD device description object + * @instr: the erase operation description + * + * This function calls the erase callback when finishes. Returns zero in case + * of success and a negative error code in case of failure. + */ +static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr) +{ + int err, i, lnum, count; + struct ubi_volume *vol; + struct ubi_device *ubi; + + dbg_msg("erase %u bytes at offset %u", instr->len, instr->addr); + + if (instr->addr < 0 || instr->addr > mtd->size - mtd->erasesize) + return -EINVAL; + + if (instr->len < 0 || instr->addr + instr->len > mtd->size) + return -EINVAL; + + if (instr->addr % mtd->writesize || instr->len % mtd->writesize) + return -EINVAL; + + lnum = instr->addr / mtd->erasesize; + count = instr->len / mtd->erasesize; + + vol = container_of(mtd, struct ubi_volume, gluebi_mtd); + ubi = vol->ubi; + + if (ubi->ro_mode) + return -EROFS; + + for (i = 0; i < count; i++) { + err = ubi_eba_unmap_leb(ubi, vol->vol_id, lnum + i); + if (err) + goto out_err; + } + + /* + * MTD erase operations are synchronous, so we have to make sure the + * physical eraseblock is wiped out. + */ + err = ubi_wl_flush(ubi); + if (err) + goto out_err; + + instr->state = MTD_ERASE_DONE; + mtd_erase_callback(instr); + return 0; + +out_err: + instr->state = MTD_ERASE_FAILED; + instr->fail_addr = lnum * mtd->erasesize; + return err; +} + +/** + * ubi_create_gluebi - initialize gluebi for an UBI volume. + * @ubi: UBI device description object + * @vol: volume description object + * + * This function is called when an UBI volume is created in order to create + * corresponding fake MTD device. Returns zero in case of success and a + * negative error code in case of failure. 
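+ *
+ * For example (illustrative numbers only): a volume with 100 reserved
+ * physical eraseblocks and a 126 KiB usable logical eraseblock size is
+ * exposed as an MTD device with mtd->erasesize = 126 KiB, mtd->size =
+ * 12600 KiB, and mtd->writesize equal to the minimal I/O unit of the
+ * underlying flash.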
+ */ +int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol) +{ + struct mtd_info *mtd = &vol->gluebi_mtd; + + mtd->name = kmemdup(vol->name, vol->name_len + 1, GFP_KERNEL); + if (!mtd->name) + return -ENOMEM; + + mtd->type = MTD_UBIVOLUME; + if (!ubi->ro_mode) + mtd->flags = MTD_WRITEABLE; + mtd->writesize = ubi->min_io_size; + mtd->owner = THIS_MODULE; + mtd->size = vol->usable_leb_size * vol->reserved_pebs; + mtd->erasesize = vol->usable_leb_size; + mtd->read = gluebi_read; + mtd->write = gluebi_write; + mtd->erase = gluebi_erase; + mtd->get_device = gluebi_get_device; + mtd->put_device = gluebi_put_device; + + if (add_mtd_device(mtd)) { + ubi_err("cannot not add MTD device\n"); + kfree(mtd->name); + return -ENFILE; + } + + dbg_msg("added mtd%d (\"%s\"), size %u, EB size %u", + mtd->index, mtd->name, mtd->size, mtd->erasesize); + return 0; +} + +/** + * ubi_destroy_gluebi - close gluebi for an UBI volume. + * @vol: volume description object + * + * This function is called when an UBI volume is removed in order to remove + * corresponding fake MTD device. Returns zero in case of success and a + * negative error code in case of failure. + */ +int ubi_destroy_gluebi(struct ubi_volume *vol) +{ + int err; + struct mtd_info *mtd = &vol->gluebi_mtd; + + dbg_msg("remove mtd%d", mtd->index); + err = del_mtd_device(mtd); + if (err) + return err; + kfree(mtd->name); + return 0; +} diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c new file mode 100644 index 00000000000..438914d0515 --- /dev/null +++ b/drivers/mtd/ubi/io.c @@ -0,0 +1,1259 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * Copyright (c) Nokia Corporation, 2006, 2007 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * UBI input/output unit. + * + * This unit provides a uniform way to work with all kinds of the underlying + * MTD devices. It also implements handy functions for reading and writing UBI + * headers. + * + * We are trying to have a paranoid mindset and not to trust to what we read + * from the flash media in order to be more secure and robust. So this unit + * validates every single header it reads from the flash media. + * + * Some words about how the eraseblock headers are stored. + * + * The erase counter header is always stored at offset zero. By default, the + * VID header is stored after the EC header at the closest aligned offset + * (i.e. aligned to the minimum I/O unit size). Data starts next to the VID + * header at the closest aligned offset. But this default layout may be + * changed. For example, for different reasons (e.g., optimization) UBI may be + * asked to put the VID header at further offset, and even at an unaligned + * offset. Of course, if the offset of the VID header is unaligned, UBI adds + * proper padding in front of it. 
Data offset may also be changed but it has to + * be aligned. + * + * About minimal I/O units. In general, UBI assumes flash device model where + * there is only one minimal I/O unit size. E.g., in case of NOR flash it is 1, + * in case of NAND flash it is a NAND page, etc. This is reported by MTD in the + * @ubi->mtd->writesize field. But as an exception, UBI admits of using another + * (smaller) minimal I/O unit size for EC and VID headers to make it possible + * to do different optimizations. + * + * This is extremely useful in case of NAND flashes which admit of several + * write operations to one NAND page. In this case UBI can fit EC and VID + * headers at one NAND page. Thus, UBI may use "sub-page" size as the minimal + * I/O unit for the headers (the @ubi->hdrs_min_io_size field). But it still + * reports NAND page size (@ubi->min_io_size) as a minimal I/O unit for the UBI + * users. + * + * Example: some Samsung NANDs with 2KiB pages allow 4x 512-byte writes, so + * although the minimal I/O unit is 2K, UBI uses 512 bytes for EC and VID + * headers. + * + * Q: why not just to treat sub-page as a minimal I/O unit of this flash + * device, e.g., make @ubi->min_io_size = 512 in the example above? + * + * A: because when writing a sub-page, MTD still writes a full 2K page but the + * bytes which are no relevant to the sub-page are 0xFF. So, basically, writing + * 4x512 sub-pages is 4 times slower then writing one 2KiB NAND page. Thus, we + * prefer to use sub-pages only for EV and VID headers. + * + * As it was noted above, the VID header may start at a non-aligned offset. + * For example, in case of a 2KiB page NAND flash with a 512 bytes sub-page, + * the VID header may reside at offset 1984 which is the last 64 bytes of the + * last sub-page (EC header is always at offset zero). This causes some + * difficulties when reading and writing VID headers. + * + * Suppose we have a 64-byte buffer and we read a VID header at it. We change + * the data and want to write this VID header out. As we can only write in + * 512-byte chunks, we have to allocate one more buffer and copy our VID header + * to offset 448 of this buffer. + * + * The I/O unit does the following trick in order to avoid this extra copy. + * It always allocates a @ubi->vid_hdr_alsize bytes buffer for the VID header + * and returns a pointer to offset @ubi->vid_hdr_shift of this buffer. When the + * VID header is being written out, it shifts the VID header pointer back and + * writes the whole sub-page. 
+ */ + +#include <linux/crc32.h> +#include <linux/err.h> +#include "ubi.h" + +#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID +static int paranoid_check_not_bad(const struct ubi_device *ubi, int pnum); +static int paranoid_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum); +static int paranoid_check_ec_hdr(const struct ubi_device *ubi, int pnum, + const struct ubi_ec_hdr *ec_hdr); +static int paranoid_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum); +static int paranoid_check_vid_hdr(const struct ubi_device *ubi, int pnum, + const struct ubi_vid_hdr *vid_hdr); +static int paranoid_check_all_ff(const struct ubi_device *ubi, int pnum, + int offset, int len); +#else +#define paranoid_check_not_bad(ubi, pnum) 0 +#define paranoid_check_peb_ec_hdr(ubi, pnum) 0 +#define paranoid_check_ec_hdr(ubi, pnum, ec_hdr) 0 +#define paranoid_check_peb_vid_hdr(ubi, pnum) 0 +#define paranoid_check_vid_hdr(ubi, pnum, vid_hdr) 0 +#define paranoid_check_all_ff(ubi, pnum, offset, len) 0 +#endif + +/** + * ubi_io_read - read data from a physical eraseblock. + * @ubi: UBI device description object + * @buf: buffer where to store the read data + * @pnum: physical eraseblock number to read from + * @offset: offset within the physical eraseblock from where to read + * @len: how many bytes to read + * + * This function reads data from offset @offset of physical eraseblock @pnum + * and stores the read data in the @buf buffer. The following return codes are + * possible: + * + * o %0 if all the requested data were successfully read; + * o %UBI_IO_BITFLIPS if all the requested data were successfully read, but + * correctable bit-flips were detected; this is harmless but may indicate + * that this eraseblock may become bad soon (but do not have to); + * o %-EBADMSG if the MTD subsystem reported about data data integrity + * problems, for example it can me an ECC error in case of NAND; this most + * probably means that the data is corrupted; + * o %-EIO if some I/O error occurred; + * o other negative error codes in case of other errors. + */ +int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset, + int len) +{ + int err, retries = 0; + size_t read; + loff_t addr; + + dbg_io("read %d bytes from PEB %d:%d", len, pnum, offset); + + ubi_assert(pnum >= 0 && pnum < ubi->peb_count); + ubi_assert(offset >= 0 && offset + len <= ubi->peb_size); + ubi_assert(len > 0); + + err = paranoid_check_not_bad(ubi, pnum); + if (err) + return err > 0 ? -EINVAL : err; + + addr = (loff_t)pnum * ubi->peb_size + offset; +retry: + err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf); + if (err) { + if (err == -EUCLEAN) { + /* + * -EUCLEAN is reported if there was a bit-flip which + * was corrected, so this is harmless. + */ + ubi_msg("fixable bit-flip detected at PEB %d", pnum); + ubi_assert(len == read); + return UBI_IO_BITFLIPS; + } + + if (read != len && retries++ < UBI_IO_RETRIES) { + dbg_io("error %d while reading %d bytes from PEB %d:%d, " + "read only %zd bytes, retry", + err, len, pnum, offset, read); + yield(); + goto retry; + } + + ubi_err("error %d while reading %d bytes from PEB %d:%d, " + "read %zd bytes", err, len, pnum, offset, read); + ubi_dbg_dump_stack(); + } else { + ubi_assert(len == read); + + if (ubi_dbg_is_bitflip()) { + dbg_msg("bit-flip (emulated)"); + err = UBI_IO_BITFLIPS; + } + } + + return err; +} + +/** + * ubi_io_write - write data to a physical eraseblock. 
+ * @ubi: UBI device description object + * @buf: buffer with the data to write + * @pnum: physical eraseblock number to write to + * @offset: offset within the physical eraseblock where to write + * @len: how many bytes to write + * + * This function writes @len bytes of data from buffer @buf to offset @offset + * of physical eraseblock @pnum. If all the data were successfully written, + * zero is returned. If an error occurred, this function returns a negative + * error code. If %-EIO is returned, the physical eraseblock most probably went + * bad. + * + * Note, in case of an error, it is possible that something was still written + * to the flash media, but may be some garbage. + */ +int ubi_io_write(const struct ubi_device *ubi, const void *buf, int pnum, + int offset, int len) +{ + int err; + size_t written; + loff_t addr; + + dbg_io("write %d bytes to PEB %d:%d", len, pnum, offset); + + ubi_assert(pnum >= 0 && pnum < ubi->peb_count); + ubi_assert(offset >= 0 && offset + len <= ubi->peb_size); + ubi_assert(offset % ubi->hdrs_min_io_size == 0); + ubi_assert(len > 0 && len % ubi->hdrs_min_io_size == 0); + + if (ubi->ro_mode) { + ubi_err("read-only mode"); + return -EROFS; + } + + /* The below has to be compiled out if paranoid checks are disabled */ + + err = paranoid_check_not_bad(ubi, pnum); + if (err) + return err > 0 ? -EINVAL : err; + + /* The area we are writing to has to contain all 0xFF bytes */ + err = paranoid_check_all_ff(ubi, pnum, offset, len); + if (err) + return err > 0 ? -EINVAL : err; + + if (offset >= ubi->leb_start) { + /* + * We write to the data area of the physical eraseblock. Make + * sure it has valid EC and VID headers. + */ + err = paranoid_check_peb_ec_hdr(ubi, pnum); + if (err) + return err > 0 ? -EINVAL : err; + err = paranoid_check_peb_vid_hdr(ubi, pnum); + if (err) + return err > 0 ? -EINVAL : err; + } + + if (ubi_dbg_is_write_failure()) { + dbg_err("cannot write %d bytes to PEB %d:%d " + "(emulated)", len, pnum, offset); + ubi_dbg_dump_stack(); + return -EIO; + } + + addr = (loff_t)pnum * ubi->peb_size + offset; + err = ubi->mtd->write(ubi->mtd, addr, len, &written, buf); + if (err) { + ubi_err("error %d while writing %d bytes to PEB %d:%d, written" + " %zd bytes", err, len, pnum, offset, written); + ubi_dbg_dump_stack(); + } else + ubi_assert(written == len); + + return err; +} + +/** + * erase_callback - MTD erasure call-back. + * @ei: MTD erase information object. + * + * Note, even though MTD erase interface is asynchronous, all the current + * implementations are synchronous anyway. + */ +static void erase_callback(struct erase_info *ei) +{ + wake_up_interruptible((wait_queue_head_t *)ei->priv); +} + +/** + * do_sync_erase - synchronously erase a physical eraseblock. + * @ubi: UBI device description object + * @pnum: the physical eraseblock number to erase + * + * This function synchronously erases physical eraseblock @pnum and returns + * zero in case of success and a negative error code in case of failure. If + * %-EIO is returned, the physical eraseblock most probably went bad. 
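+ *
+ * For illustration, the synchronous behaviour is built on top of the
+ * asynchronous MTD erase interface roughly like this (simplified sketch of
+ * the code below):
+ *
+ *    init_waitqueue_head(&wq);
+ *    ei.callback = erase_callback;    ... wakes up &wq ...
+ *    ei.priv = (unsigned long)&wq;
+ *    ubi->mtd->erase(ubi->mtd, &ei);
+ *    wait_event_interruptible(wq, ei.state == MTD_ERASE_DONE ||
+ *                             ei.state == MTD_ERASE_FAILED);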
+ */ +static int do_sync_erase(const struct ubi_device *ubi, int pnum) +{ + int err, retries = 0; + struct erase_info ei; + wait_queue_head_t wq; + + dbg_io("erase PEB %d", pnum); + +retry: + init_waitqueue_head(&wq); + memset(&ei, 0, sizeof(struct erase_info)); + + ei.mtd = ubi->mtd; + ei.addr = pnum * ubi->peb_size; + ei.len = ubi->peb_size; + ei.callback = erase_callback; + ei.priv = (unsigned long)&wq; + + err = ubi->mtd->erase(ubi->mtd, &ei); + if (err) { + if (retries++ < UBI_IO_RETRIES) { + dbg_io("error %d while erasing PEB %d, retry", + err, pnum); + yield(); + goto retry; + } + ubi_err("cannot erase PEB %d, error %d", pnum, err); + ubi_dbg_dump_stack(); + return err; + } + + err = wait_event_interruptible(wq, ei.state == MTD_ERASE_DONE || + ei.state == MTD_ERASE_FAILED); + if (err) { + ubi_err("interrupted PEB %d erasure", pnum); + return -EINTR; + } + + if (ei.state == MTD_ERASE_FAILED) { + if (retries++ < UBI_IO_RETRIES) { + dbg_io("error while erasing PEB %d, retry", pnum); + yield(); + goto retry; + } + ubi_err("cannot erase PEB %d", pnum); + ubi_dbg_dump_stack(); + return -EIO; + } + + err = paranoid_check_all_ff(ubi, pnum, 0, ubi->peb_size); + if (err) + return err > 0 ? -EINVAL : err; + + if (ubi_dbg_is_erase_failure() && !err) { + dbg_err("cannot erase PEB %d (emulated)", pnum); + return -EIO; + } + + return 0; +} + +/** + * check_pattern - check if buffer contains only a certain byte pattern. + * @buf: buffer to check + * @patt: the pattern to check + * @size: buffer size in bytes + * + * This function returns %1 in there are only @patt bytes in @buf, and %0 if + * something else was also found. + */ +static int check_pattern(const void *buf, uint8_t patt, int size) +{ + int i; + + for (i = 0; i < size; i++) + if (((const uint8_t *)buf)[i] != patt) + return 0; + return 1; +} + +/* Patterns to write to a physical eraseblock when torturing it */ +static uint8_t patterns[] = {0xa5, 0x5a, 0x0}; + +/** + * torture_peb - test a supposedly bad physical eraseblock. + * @ubi: UBI device description object + * @pnum: the physical eraseblock number to test + * + * This function returns %-EIO if the physical eraseblock did not pass the + * test, a positive number of erase operations done if the test was + * successfully passed, and other negative error codes in case of other errors. 
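check_pattern() is the helper that the torture test below builds on; it can equally be used to confirm that an eraseblock really is erased before it is re-used. A short illustrative fragment (it would have to live in this file, since both helpers are static):

	err = ubi_io_read(ubi, buf, pnum, 0, ubi->peb_size);
	if (err && err != UBI_IO_BITFLIPS)
		return err;

	if (!check_pattern(buf, 0xFF, ubi->peb_size))
		/* Something was written here - erase before re-using the PEB */
		return do_sync_erase(ubi, pnum);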
+ */ +static int torture_peb(const struct ubi_device *ubi, int pnum) +{ + void *buf; + int err, i, patt_count; + + buf = kmalloc(ubi->peb_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + patt_count = ARRAY_SIZE(patterns); + ubi_assert(patt_count > 0); + + for (i = 0; i < patt_count; i++) { + err = do_sync_erase(ubi, pnum); + if (err) + goto out; + + /* Make sure the PEB contains only 0xFF bytes */ + err = ubi_io_read(ubi, buf, pnum, 0, ubi->peb_size); + if (err) + goto out; + + err = check_pattern(buf, 0xFF, ubi->peb_size); + if (err == 0) { + ubi_err("erased PEB %d, but a non-0xFF byte found", + pnum); + err = -EIO; + goto out; + } + + /* Write a pattern and check it */ + memset(buf, patterns[i], ubi->peb_size); + err = ubi_io_write(ubi, buf, pnum, 0, ubi->peb_size); + if (err) + goto out; + + memset(buf, ~patterns[i], ubi->peb_size); + err = ubi_io_read(ubi, buf, pnum, 0, ubi->peb_size); + if (err) + goto out; + + err = check_pattern(buf, patterns[i], ubi->peb_size); + if (err == 0) { + ubi_err("pattern %x checking failed for PEB %d", + patterns[i], pnum); + err = -EIO; + goto out; + } + } + + err = patt_count; + +out: + if (err == UBI_IO_BITFLIPS || err == -EBADMSG) + /* + * If a bit-flip or data integrity error was detected, the test + * has not passed because it happened on a freshly erased + * physical eraseblock which means something is wrong with it. + */ + err = -EIO; + kfree(buf); + return err; +} + +/** + * ubi_io_sync_erase - synchronously erase a physical eraseblock. + * @ubi: UBI device description object + * @pnum: physical eraseblock number to erase + * @torture: if this physical eraseblock has to be tortured + * + * This function synchronously erases physical eraseblock @pnum. If @torture + * flag is not zero, the physical eraseblock is checked by means of writing + * different patterns to it and reading them back. If the torturing is enabled, + * the physical eraseblock is erased more then once. + * + * This function returns the number of erasures made in case of success, %-EIO + * if the erasure failed or the torturing test failed, and other negative error + * codes in case of other errors. Note, %-EIO means that the physical + * eraseblock is bad. + */ +int ubi_io_sync_erase(const struct ubi_device *ubi, int pnum, int torture) +{ + int err, ret = 0; + + ubi_assert(pnum >= 0 && pnum < ubi->peb_count); + + err = paranoid_check_not_bad(ubi, pnum); + if (err != 0) + return err > 0 ? -EINVAL : err; + + if (ubi->ro_mode) { + ubi_err("read-only mode"); + return -EROFS; + } + + if (torture) { + ret = torture_peb(ubi, pnum); + if (ret < 0) + return ret; + } + + err = do_sync_erase(ubi, pnum); + if (err) + return err; + + return ret + 1; +} + +/** + * ubi_io_is_bad - check if a physical eraseblock is bad. + * @ubi: UBI device description object + * @pnum: the physical eraseblock number to check + * + * This function returns a positive number if the physical eraseblock is bad, + * zero if not, and a negative error code if an error occurred. + */ +int ubi_io_is_bad(const struct ubi_device *ubi, int pnum) +{ + struct mtd_info *mtd = ubi->mtd; + + ubi_assert(pnum >= 0 && pnum < ubi->peb_count); + + if (ubi->bad_allowed) { + int ret; + + ret = mtd->block_isbad(mtd, (loff_t)pnum * ubi->peb_size); + if (ret < 0) + ubi_err("error %d while checking if PEB %d is bad", + ret, pnum); + else if (ret) + dbg_io("PEB %d is bad", pnum); + return ret; + } + + return 0; +} + +/** + * ubi_io_mark_bad - mark a physical eraseblock as bad. 
+ * @ubi: UBI device description object + * @pnum: the physical eraseblock number to mark + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +int ubi_io_mark_bad(const struct ubi_device *ubi, int pnum) +{ + int err; + struct mtd_info *mtd = ubi->mtd; + + ubi_assert(pnum >= 0 && pnum < ubi->peb_count); + + if (ubi->ro_mode) { + ubi_err("read-only mode"); + return -EROFS; + } + + if (!ubi->bad_allowed) + return 0; + + err = mtd->block_markbad(mtd, (loff_t)pnum * ubi->peb_size); + if (err) + ubi_err("cannot mark PEB %d bad, error %d", pnum, err); + return err; +} + +/** + * validate_ec_hdr - validate an erase counter header. + * @ubi: UBI device description object + * @ec_hdr: the erase counter header to check + * + * This function returns zero if the erase counter header is OK, and %1 if + * not. + */ +static int validate_ec_hdr(const struct ubi_device *ubi, + const struct ubi_ec_hdr *ec_hdr) +{ + long long ec; + int vid_hdr_offset, leb_start; + + ec = ubi64_to_cpu(ec_hdr->ec); + vid_hdr_offset = ubi32_to_cpu(ec_hdr->vid_hdr_offset); + leb_start = ubi32_to_cpu(ec_hdr->data_offset); + + if (ec_hdr->version != UBI_VERSION) { + ubi_err("node with incompatible UBI version found: " + "this UBI version is %d, image version is %d", + UBI_VERSION, (int)ec_hdr->version); + goto bad; + } + + if (vid_hdr_offset != ubi->vid_hdr_offset) { + ubi_err("bad VID header offset %d, expected %d", + vid_hdr_offset, ubi->vid_hdr_offset); + goto bad; + } + + if (leb_start != ubi->leb_start) { + ubi_err("bad data offset %d, expected %d", + leb_start, ubi->leb_start); + goto bad; + } + + if (ec < 0 || ec > UBI_MAX_ERASECOUNTER) { + ubi_err("bad erase counter %lld", ec); + goto bad; + } + + return 0; + +bad: + ubi_err("bad EC header"); + ubi_dbg_dump_ec_hdr(ec_hdr); + ubi_dbg_dump_stack(); + return 1; +} + +/** + * ubi_io_read_ec_hdr - read and check an erase counter header. + * @ubi: UBI device description object + * @pnum: physical eraseblock to read from + * @ec_hdr: a &struct ubi_ec_hdr object where to store the read erase counter + * header + * @verbose: be verbose if the header is corrupted or was not found + * + * This function reads erase counter header from physical eraseblock @pnum and + * stores it in @ec_hdr. This function also checks CRC checksum of the read + * erase counter header. The following codes may be returned: + * + * o %0 if the CRC checksum is correct and the header was successfully read; + * o %UBI_IO_BITFLIPS if the CRC is correct, but bit-flips were detected + * and corrected by the flash driver; this is harmless but may indicate that + * this eraseblock may become bad soon (but may be not); + * o %UBI_IO_BAD_EC_HDR if the erase counter header is corrupted (a CRC error); + * o %UBI_IO_PEB_EMPTY if the physical eraseblock is empty; + * o a negative error code in case of failure. + */ +int ubi_io_read_ec_hdr(const struct ubi_device *ubi, int pnum, + struct ubi_ec_hdr *ec_hdr, int verbose) +{ + int err, read_err = 0; + uint32_t crc, magic, hdr_crc; + + dbg_io("read EC header from PEB %d", pnum); + ubi_assert(pnum >= 0 && pnum < ubi->peb_count); + + err = ubi_io_read(ubi, ec_hdr, pnum, 0, UBI_EC_HDR_SIZE); + if (err) { + if (err != UBI_IO_BITFLIPS && err != -EBADMSG) + return err; + + /* + * We read all the data, but either a correctable bit-flip + * occurred, or MTD reported about some data integrity error, + * like an ECC error in case of NAND. The former is harmless, + * the later may mean that the read data is corrupted. 
But we + * have a CRC check-sum and we will detect this. If the EC + * header is still OK, we just report this as there was a + * bit-flip. + */ + read_err = err; + } + + magic = ubi32_to_cpu(ec_hdr->magic); + if (magic != UBI_EC_HDR_MAGIC) { + /* + * The magic field is wrong. Let's check if we have read all + * 0xFF. If yes, this physical eraseblock is assumed to be + * empty. + * + * But if there was a read error, we do not test it for all + * 0xFFs. Even if it does contain all 0xFFs, this error + * indicates that something is still wrong with this physical + * eraseblock and we anyway cannot treat it as empty. + */ + if (read_err != -EBADMSG && + check_pattern(ec_hdr, 0xFF, UBI_EC_HDR_SIZE)) { + /* The physical eraseblock is supposedly empty */ + + /* + * The below is just a paranoid check, it has to be + * compiled out if paranoid checks are disabled. + */ + err = paranoid_check_all_ff(ubi, pnum, 0, + ubi->peb_size); + if (err) + return err > 0 ? UBI_IO_BAD_EC_HDR : err; + + if (verbose) + ubi_warn("no EC header found at PEB %d, " + "only 0xFF bytes", pnum); + return UBI_IO_PEB_EMPTY; + } + + /* + * This is not a valid erase counter header, and these are not + * 0xFF bytes. Report that the header is corrupted. + */ + if (verbose) { + ubi_warn("bad magic number at PEB %d: %08x instead of " + "%08x", pnum, magic, UBI_EC_HDR_MAGIC); + ubi_dbg_dump_ec_hdr(ec_hdr); + } + return UBI_IO_BAD_EC_HDR; + } + + crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC); + hdr_crc = ubi32_to_cpu(ec_hdr->hdr_crc); + + if (hdr_crc != crc) { + if (verbose) { + ubi_warn("bad EC header CRC at PEB %d, calculated %#08x," + " read %#08x", pnum, crc, hdr_crc); + ubi_dbg_dump_ec_hdr(ec_hdr); + } + return UBI_IO_BAD_EC_HDR; + } + + /* And of course validate what has just been read from the media */ + err = validate_ec_hdr(ubi, ec_hdr); + if (err) { + ubi_err("validation failed for PEB %d", pnum); + return -EINVAL; + } + + return read_err ? UBI_IO_BITFLIPS : 0; +} + +/** + * ubi_io_write_ec_hdr - write an erase counter header. + * @ubi: UBI device description object + * @pnum: physical eraseblock to write to + * @ec_hdr: the erase counter header to write + * + * This function writes erase counter header described by @ec_hdr to physical + * eraseblock @pnum. It also fills most fields of @ec_hdr before writing, so + * the caller do not have to fill them. Callers must only fill the @ec_hdr->ec + * field. + * + * This function returns zero in case of success and a negative error code in + * case of failure. If %-EIO is returned, the physical eraseblock most probably + * went bad. + */ +int ubi_io_write_ec_hdr(const struct ubi_device *ubi, int pnum, + struct ubi_ec_hdr *ec_hdr) +{ + int err; + uint32_t crc; + + dbg_io("write EC header to PEB %d", pnum); + ubi_assert(pnum >= 0 && pnum < ubi->peb_count); + + ec_hdr->magic = cpu_to_ubi32(UBI_EC_HDR_MAGIC); + ec_hdr->version = UBI_VERSION; + ec_hdr->vid_hdr_offset = cpu_to_ubi32(ubi->vid_hdr_offset); + ec_hdr->data_offset = cpu_to_ubi32(ubi->leb_start); + crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC); + ec_hdr->hdr_crc = cpu_to_ubi32(crc); + + err = paranoid_check_ec_hdr(ubi, pnum, ec_hdr); + if (err) + return -EINVAL; + + err = ubi_io_write(ubi, ec_hdr, pnum, 0, ubi->ec_hdr_alsize); + return err; +} + +/** + * validate_vid_hdr - validate a volume identifier header. + * @ubi: UBI device description object + * @vid_hdr: the volume identifier header to check + * + * This function checks that data stored in the volume identifier header + * @vid_hdr. 
Returns zero if the VID header is OK and %1 if not. + */ +static int validate_vid_hdr(const struct ubi_device *ubi, + const struct ubi_vid_hdr *vid_hdr) +{ + int vol_type = vid_hdr->vol_type; + int copy_flag = vid_hdr->copy_flag; + int vol_id = ubi32_to_cpu(vid_hdr->vol_id); + int lnum = ubi32_to_cpu(vid_hdr->lnum); + int compat = vid_hdr->compat; + int data_size = ubi32_to_cpu(vid_hdr->data_size); + int used_ebs = ubi32_to_cpu(vid_hdr->used_ebs); + int data_pad = ubi32_to_cpu(vid_hdr->data_pad); + int data_crc = ubi32_to_cpu(vid_hdr->data_crc); + int usable_leb_size = ubi->leb_size - data_pad; + + if (copy_flag != 0 && copy_flag != 1) { + dbg_err("bad copy_flag"); + goto bad; + } + + if (vol_id < 0 || lnum < 0 || data_size < 0 || used_ebs < 0 || + data_pad < 0) { + dbg_err("negative values"); + goto bad; + } + + if (vol_id >= UBI_MAX_VOLUMES && vol_id < UBI_INTERNAL_VOL_START) { + dbg_err("bad vol_id"); + goto bad; + } + + if (vol_id < UBI_INTERNAL_VOL_START && compat != 0) { + dbg_err("bad compat"); + goto bad; + } + + if (vol_id >= UBI_INTERNAL_VOL_START && compat != UBI_COMPAT_DELETE && + compat != UBI_COMPAT_RO && compat != UBI_COMPAT_PRESERVE && + compat != UBI_COMPAT_REJECT) { + dbg_err("bad compat"); + goto bad; + } + + if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) { + dbg_err("bad vol_type"); + goto bad; + } + + if (data_pad >= ubi->leb_size / 2) { + dbg_err("bad data_pad"); + goto bad; + } + + if (vol_type == UBI_VID_STATIC) { + /* + * Although from high-level point of view static volumes may + * contain zero bytes of data, but no VID headers can contain + * zero at these fields, because they empty volumes do not have + * mapped logical eraseblocks. + */ + if (used_ebs == 0) { + dbg_err("zero used_ebs"); + goto bad; + } + if (data_size == 0) { + dbg_err("zero data_size"); + goto bad; + } + if (lnum < used_ebs - 1) { + if (data_size != usable_leb_size) { + dbg_err("bad data_size"); + goto bad; + } + } else if (lnum == used_ebs - 1) { + if (data_size == 0) { + dbg_err("bad data_size at last LEB"); + goto bad; + } + } else { + dbg_err("too high lnum"); + goto bad; + } + } else { + if (copy_flag == 0) { + if (data_crc != 0) { + dbg_err("non-zero data CRC"); + goto bad; + } + if (data_size != 0) { + dbg_err("non-zero data_size"); + goto bad; + } + } else { + if (data_size == 0) { + dbg_err("zero data_size of copy"); + goto bad; + } + } + if (used_ebs != 0) { + dbg_err("bad used_ebs"); + goto bad; + } + } + + return 0; + +bad: + ubi_err("bad VID header"); + ubi_dbg_dump_vid_hdr(vid_hdr); + ubi_dbg_dump_stack(); + return 1; +} + +/** + * ubi_io_read_vid_hdr - read and check a volume identifier header. + * @ubi: UBI device description object + * @pnum: physical eraseblock number to read from + * @vid_hdr: &struct ubi_vid_hdr object where to store the read volume + * identifier header + * @verbose: be verbose if the header is corrupted or wasn't found + * + * This function reads the volume identifier header from physical eraseblock + * @pnum and stores it in @vid_hdr. It also checks CRC checksum of the read + * volume identifier header. 
The following codes may be returned: + * + * o %0 if the CRC checksum is correct and the header was successfully read; + * o %UBI_IO_BITFLIPS if the CRC is correct, but bit-flips were detected + * and corrected by the flash driver; this is harmless but may indicate that + * this eraseblock may become bad soon; + * o %UBI_IO_BAD_VID_HRD if the volume identifier header is corrupted (a CRC + * error detected); + * o %UBI_IO_PEB_FREE if the physical eraseblock is free (i.e., there is no VID + * header there); + * o a negative error code in case of failure. + */ +int ubi_io_read_vid_hdr(const struct ubi_device *ubi, int pnum, + struct ubi_vid_hdr *vid_hdr, int verbose) +{ + int err, read_err = 0; + uint32_t crc, magic, hdr_crc; + void *p; + + dbg_io("read VID header from PEB %d", pnum); + ubi_assert(pnum >= 0 && pnum < ubi->peb_count); + + p = (char *)vid_hdr - ubi->vid_hdr_shift; + err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset, + ubi->vid_hdr_alsize); + if (err) { + if (err != UBI_IO_BITFLIPS && err != -EBADMSG) + return err; + + /* + * We read all the data, but either a correctable bit-flip + * occurred, or MTD reported about some data integrity error, + * like an ECC error in case of NAND. The former is harmless, + * the later may mean the read data is corrupted. But we have a + * CRC check-sum and we will identify this. If the VID header is + * still OK, we just report this as there was a bit-flip. + */ + read_err = err; + } + + magic = ubi32_to_cpu(vid_hdr->magic); + if (magic != UBI_VID_HDR_MAGIC) { + /* + * If we have read all 0xFF bytes, the VID header probably does + * not exist and the physical eraseblock is assumed to be free. + * + * But if there was a read error, we do not test the data for + * 0xFFs. Even if it does contain all 0xFFs, this error + * indicates that something is still wrong with this physical + * eraseblock and it cannot be regarded as free. + */ + if (read_err != -EBADMSG && + check_pattern(vid_hdr, 0xFF, UBI_VID_HDR_SIZE)) { + /* The physical eraseblock is supposedly free */ + + /* + * The below is just a paranoid check, it has to be + * compiled out if paranoid checks are disabled. + */ + err = paranoid_check_all_ff(ubi, pnum, ubi->leb_start, + ubi->leb_size); + if (err) + return err > 0 ? UBI_IO_BAD_VID_HDR : err; + + if (verbose) + ubi_warn("no VID header found at PEB %d, " + "only 0xFF bytes", pnum); + return UBI_IO_PEB_FREE; + } + + /* + * This is not a valid VID header, and these are not 0xFF + * bytes. Report that the header is corrupted. + */ + if (verbose) { + ubi_warn("bad magic number at PEB %d: %08x instead of " + "%08x", pnum, magic, UBI_VID_HDR_MAGIC); + ubi_dbg_dump_vid_hdr(vid_hdr); + } + return UBI_IO_BAD_VID_HDR; + } + + crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC); + hdr_crc = ubi32_to_cpu(vid_hdr->hdr_crc); + + if (hdr_crc != crc) { + if (verbose) { + ubi_warn("bad CRC at PEB %d, calculated %#08x, " + "read %#08x", pnum, crc, hdr_crc); + ubi_dbg_dump_vid_hdr(vid_hdr); + } + return UBI_IO_BAD_VID_HDR; + } + + /* Validate the VID header that we have just read */ + err = validate_vid_hdr(ubi, vid_hdr); + if (err) { + ubi_err("validation failed for PEB %d", pnum); + return -EINVAL; + } + + return read_err ? UBI_IO_BITFLIPS : 0; +} + +/** + * ubi_io_write_vid_hdr - write a volume identifier header. 
+ * @ubi: UBI device description object + * @pnum: the physical eraseblock number to write to + * @vid_hdr: the volume identifier header to write + * + * This function writes the volume identifier header described by @vid_hdr to + * physical eraseblock @pnum. This function automatically fills the + * @vid_hdr->magic and the @vid_hdr->version fields, as well as calculates + * header CRC checksum and stores it at vid_hdr->hdr_crc. + * + * This function returns zero in case of success and a negative error code in + * case of failure. If %-EIO is returned, the physical eraseblock probably went + * bad. + */ +int ubi_io_write_vid_hdr(const struct ubi_device *ubi, int pnum, + struct ubi_vid_hdr *vid_hdr) +{ + int err; + uint32_t crc; + void *p; + + dbg_io("write VID header to PEB %d", pnum); + ubi_assert(pnum >= 0 && pnum < ubi->peb_count); + + err = paranoid_check_peb_ec_hdr(ubi, pnum); + if (err) + return err > 0 ? -EINVAL: err; + + vid_hdr->magic = cpu_to_ubi32(UBI_VID_HDR_MAGIC); + vid_hdr->version = UBI_VERSION; + crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC); + vid_hdr->hdr_crc = cpu_to_ubi32(crc); + + err = paranoid_check_vid_hdr(ubi, pnum, vid_hdr); + if (err) + return -EINVAL; + + p = (char *)vid_hdr - ubi->vid_hdr_shift; + err = ubi_io_write(ubi, p, pnum, ubi->vid_hdr_aloffset, + ubi->vid_hdr_alsize); + return err; +} + +#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID + +/** + * paranoid_check_not_bad - ensure that a physical eraseblock is not bad. + * @ubi: UBI device description object + * @pnum: physical eraseblock number to check + * + * This function returns zero if the physical eraseblock is good, a positive + * number if it is bad and a negative error code if an error occurred. + */ +static int paranoid_check_not_bad(const struct ubi_device *ubi, int pnum) +{ + int err; + + err = ubi_io_is_bad(ubi, pnum); + if (!err) + return err; + + ubi_err("paranoid check failed for PEB %d", pnum); + ubi_dbg_dump_stack(); + return err; +} + +/** + * paranoid_check_ec_hdr - check if an erase counter header is all right. + * @ubi: UBI device description object + * @pnum: physical eraseblock number the erase counter header belongs to + * @ec_hdr: the erase counter header to check + * + * This function returns zero if the erase counter header contains valid + * values, and %1 if not. + */ +static int paranoid_check_ec_hdr(const struct ubi_device *ubi, int pnum, + const struct ubi_ec_hdr *ec_hdr) +{ + int err; + uint32_t magic; + + magic = ubi32_to_cpu(ec_hdr->magic); + if (magic != UBI_EC_HDR_MAGIC) { + ubi_err("bad magic %#08x, must be %#08x", + magic, UBI_EC_HDR_MAGIC); + goto fail; + } + + err = validate_ec_hdr(ubi, ec_hdr); + if (err) { + ubi_err("paranoid check failed for PEB %d", pnum); + goto fail; + } + + return 0; + +fail: + ubi_dbg_dump_ec_hdr(ec_hdr); + ubi_dbg_dump_stack(); + return 1; +} + +/** + * paranoid_check_peb_ec_hdr - check that the erase counter header of a + * physical eraseblock is in-place and is all right. + * @ubi: UBI device description object + * @pnum: the physical eraseblock number to check + * + * This function returns zero if the erase counter header is all right, %1 if + * not, and a negative error code if an error occurred. 
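A note on the convention used by all paranoid_check_*() helpers in this file: zero means the check passed, a positive value means the checked data is bad, and a negative value is a real I/O or memory error. Callers translate the positive case into %-EINVAL, which is why the idiom below appears throughout; when CONFIG_MTD_UBI_DEBUG_PARANOID is not set, the macros at the top of the file turn the call into the constant 0 and the compiler removes the branch entirely.

	err = paranoid_check_peb_ec_hdr(ubi, pnum);
	if (err)
		/* err > 0: the header is bad; err < 0: a real I/O error */
		return err > 0 ? -EINVAL : err;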
+ */ +static int paranoid_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum) +{ + int err; + uint32_t crc, hdr_crc; + struct ubi_ec_hdr *ec_hdr; + + ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); + if (!ec_hdr) + return -ENOMEM; + + err = ubi_io_read(ubi, ec_hdr, pnum, 0, UBI_EC_HDR_SIZE); + if (err && err != UBI_IO_BITFLIPS && err != -EBADMSG) + goto exit; + + crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC); + hdr_crc = ubi32_to_cpu(ec_hdr->hdr_crc); + if (hdr_crc != crc) { + ubi_err("bad CRC, calculated %#08x, read %#08x", crc, hdr_crc); + ubi_err("paranoid check failed for PEB %d", pnum); + ubi_dbg_dump_ec_hdr(ec_hdr); + ubi_dbg_dump_stack(); + err = 1; + goto exit; + } + + err = paranoid_check_ec_hdr(ubi, pnum, ec_hdr); + +exit: + kfree(ec_hdr); + return err; +} + +/** + * paranoid_check_vid_hdr - check that a volume identifier header is all right. + * @ubi: UBI device description object + * @pnum: physical eraseblock number the volume identifier header belongs to + * @vid_hdr: the volume identifier header to check + * + * This function returns zero if the volume identifier header is all right, and + * %1 if not. + */ +static int paranoid_check_vid_hdr(const struct ubi_device *ubi, int pnum, + const struct ubi_vid_hdr *vid_hdr) +{ + int err; + uint32_t magic; + + magic = ubi32_to_cpu(vid_hdr->magic); + if (magic != UBI_VID_HDR_MAGIC) { + ubi_err("bad VID header magic %#08x at PEB %d, must be %#08x", + magic, pnum, UBI_VID_HDR_MAGIC); + goto fail; + } + + err = validate_vid_hdr(ubi, vid_hdr); + if (err) { + ubi_err("paranoid check failed for PEB %d", pnum); + goto fail; + } + + return err; + +fail: + ubi_err("paranoid check failed for PEB %d", pnum); + ubi_dbg_dump_vid_hdr(vid_hdr); + ubi_dbg_dump_stack(); + return 1; + +} + +/** + * paranoid_check_peb_vid_hdr - check that the volume identifier header of a + * physical eraseblock is in-place and is all right. + * @ubi: UBI device description object + * @pnum: the physical eraseblock number to check + * + * This function returns zero if the volume identifier header is all right, + * %1 if not, and a negative error code if an error occurred. + */ +static int paranoid_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum) +{ + int err; + uint32_t crc, hdr_crc; + struct ubi_vid_hdr *vid_hdr; + void *p; + + vid_hdr = ubi_zalloc_vid_hdr(ubi); + if (!vid_hdr) + return -ENOMEM; + + p = (char *)vid_hdr - ubi->vid_hdr_shift; + err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset, + ubi->vid_hdr_alsize); + if (err && err != UBI_IO_BITFLIPS && err != -EBADMSG) + goto exit; + + crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_EC_HDR_SIZE_CRC); + hdr_crc = ubi32_to_cpu(vid_hdr->hdr_crc); + if (hdr_crc != crc) { + ubi_err("bad VID header CRC at PEB %d, calculated %#08x, " + "read %#08x", pnum, crc, hdr_crc); + ubi_err("paranoid check failed for PEB %d", pnum); + ubi_dbg_dump_vid_hdr(vid_hdr); + ubi_dbg_dump_stack(); + err = 1; + goto exit; + } + + err = paranoid_check_vid_hdr(ubi, pnum, vid_hdr); + +exit: + ubi_free_vid_hdr(ubi, vid_hdr); + return err; +} + +/** + * paranoid_check_all_ff - check that a region of flash is empty. + * @ubi: UBI device description object + * @pnum: the physical eraseblock number to check + * @offset: the starting offset within the physical eraseblock to check + * @len: the length of the region to check + * + * This function returns zero if only 0xFF bytes are present at offset + * @offset of the physical eraseblock @pnum, %1 if not, and a negative error + * code if an error occurred. 
+ */ +static int paranoid_check_all_ff(const struct ubi_device *ubi, int pnum, + int offset, int len) +{ + size_t read; + int err; + void *buf; + loff_t addr = (loff_t)pnum * ubi->peb_size + offset; + + buf = kzalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf); + if (err && err != -EUCLEAN) { + ubi_err("error %d while reading %d bytes from PEB %d:%d, " + "read %zd bytes", err, len, pnum, offset, read); + goto error; + } + + err = check_pattern(buf, 0xFF, len); + if (err == 0) { + ubi_err("flash region at PEB %d:%d, length %d does not " + "contain all 0xFF bytes", pnum, offset, len); + goto fail; + } + + kfree(buf); + return 0; + +fail: + ubi_err("paranoid check failed for PEB %d", pnum); + dbg_msg("hex dump of the %d-%d region", offset, offset + len); + ubi_dbg_hexdump(buf, len); + err = 1; +error: + ubi_dbg_dump_stack(); + kfree(buf); + return err; +} + +#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */ diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c new file mode 100644 index 00000000000..d352c4575c3 --- /dev/null +++ b/drivers/mtd/ubi/kapi.c @@ -0,0 +1,575 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +/* This file mostly implements UBI kernel API functions */ + +#include <linux/module.h> +#include <linux/err.h> +#include <asm/div64.h> +#include "ubi.h" + +/** + * ubi_get_device_info - get information about UBI device. + * @ubi_num: UBI device number + * @di: the information is stored here + * + * This function returns %0 in case of success and a %-ENODEV if there is no + * such UBI device. + */ +int ubi_get_device_info(int ubi_num, struct ubi_device_info *di) +{ + const struct ubi_device *ubi; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || + !ubi_devices[ubi_num]) { + module_put(THIS_MODULE); + return -ENODEV; + } + + ubi = ubi_devices[ubi_num]; + di->ubi_num = ubi->ubi_num; + di->leb_size = ubi->leb_size; + di->min_io_size = ubi->min_io_size; + di->ro_mode = ubi->ro_mode; + di->cdev = MKDEV(ubi->major, 0); + module_put(THIS_MODULE); + return 0; +} +EXPORT_SYMBOL_GPL(ubi_get_device_info); + +/** + * ubi_get_volume_info - get information about UBI volume. 
+ * @desc: volume descriptor + * @vi: the information is stored here + */ +void ubi_get_volume_info(struct ubi_volume_desc *desc, + struct ubi_volume_info *vi) +{ + const struct ubi_volume *vol = desc->vol; + const struct ubi_device *ubi = vol->ubi; + + vi->vol_id = vol->vol_id; + vi->ubi_num = ubi->ubi_num; + vi->size = vol->reserved_pebs; + vi->used_bytes = vol->used_bytes; + vi->vol_type = vol->vol_type; + vi->corrupted = vol->corrupted; + vi->upd_marker = vol->upd_marker; + vi->alignment = vol->alignment; + vi->usable_leb_size = vol->usable_leb_size; + vi->name_len = vol->name_len; + vi->name = vol->name; + vi->cdev = MKDEV(ubi->major, vi->vol_id + 1); +} +EXPORT_SYMBOL_GPL(ubi_get_volume_info); + +/** + * ubi_open_volume - open UBI volume. + * @ubi_num: UBI device number + * @vol_id: volume ID + * @mode: open mode + * + * The @mode parameter specifies if the volume should be opened in read-only + * mode, read-write mode, or exclusive mode. The exclusive mode guarantees that + * nobody else will be able to open this volume. UBI allows to have many volume + * readers and one writer at a time. + * + * If a static volume is being opened for the first time since boot, it will be + * checked by this function, which means it will be fully read and the CRC + * checksum of each logical eraseblock will be checked. + * + * This function returns volume descriptor in case of success and a negative + * error code in case of failure. + */ +struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode) +{ + int err; + struct ubi_volume_desc *desc; + struct ubi_device *ubi = ubi_devices[ubi_num]; + struct ubi_volume *vol; + + dbg_msg("open device %d volume %d, mode %d", ubi_num, vol_id, mode); + + err = -ENODEV; + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(err); + + if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || !ubi) + goto out_put; + + err = -EINVAL; + if (vol_id < 0 || vol_id >= ubi->vtbl_slots) + goto out_put; + if (mode != UBI_READONLY && mode != UBI_READWRITE && + mode != UBI_EXCLUSIVE) + goto out_put; + + desc = kmalloc(sizeof(struct ubi_volume_desc), GFP_KERNEL); + if (!desc) { + err = -ENOMEM; + goto out_put; + } + + spin_lock(&ubi->volumes_lock); + vol = ubi->volumes[vol_id]; + if (!vol) { + err = -ENODEV; + goto out_unlock; + } + + err = -EBUSY; + switch (mode) { + case UBI_READONLY: + if (vol->exclusive) + goto out_unlock; + vol->readers += 1; + break; + + case UBI_READWRITE: + if (vol->exclusive || vol->writers > 0) + goto out_unlock; + vol->writers += 1; + break; + + case UBI_EXCLUSIVE: + if (vol->exclusive || vol->writers || vol->readers) + goto out_unlock; + vol->exclusive = 1; + break; + } + spin_unlock(&ubi->volumes_lock); + + desc->vol = vol; + desc->mode = mode; + + /* + * To prevent simultaneous checks of the same volume we use @vtbl_mutex, + * although it is not the purpose it was introduced for. + */ + mutex_lock(&ubi->vtbl_mutex); + if (!vol->checked) { + /* This is the first open - check the volume */ + err = ubi_check_volume(ubi, vol_id); + if (err < 0) { + mutex_unlock(&ubi->vtbl_mutex); + ubi_close_volume(desc); + return ERR_PTR(err); + } + if (err == 1) { + ubi_warn("volume %d on UBI device %d is corrupted", + vol_id, ubi->ubi_num); + vol->corrupted = 1; + } + vol->checked = 1; + } + mutex_unlock(&ubi->vtbl_mutex); + return desc; + +out_unlock: + spin_unlock(&ubi->volumes_lock); + kfree(desc); +out_put: + module_put(THIS_MODULE); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(ubi_open_volume); + +/** + * ubi_open_volume_nm - open UBI volume by name. 
+ * @ubi_num: UBI device number + * @name: volume name + * @mode: open mode + * + * This function is similar to 'ubi_open_volume()', but opens a volume by name. + */ +struct ubi_volume_desc *ubi_open_volume_nm(int ubi_num, const char *name, + int mode) +{ + int i, vol_id = -1, len; + struct ubi_volume_desc *ret; + struct ubi_device *ubi; + + dbg_msg("open volume %s, mode %d", name, mode); + + if (!name) + return ERR_PTR(-EINVAL); + + len = strnlen(name, UBI_VOL_NAME_MAX + 1); + if (len > UBI_VOL_NAME_MAX) + return ERR_PTR(-EINVAL); + + ret = ERR_PTR(-ENODEV); + if (!try_module_get(THIS_MODULE)) + return ret; + + if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || !ubi_devices[ubi_num]) + goto out_put; + + ubi = ubi_devices[ubi_num]; + + spin_lock(&ubi->volumes_lock); + /* Walk all volumes of this UBI device */ + for (i = 0; i < ubi->vtbl_slots; i++) { + struct ubi_volume *vol = ubi->volumes[i]; + + if (vol && len == vol->name_len && !strcmp(name, vol->name)) { + vol_id = i; + break; + } + } + spin_unlock(&ubi->volumes_lock); + + if (vol_id < 0) + goto out_put; + + ret = ubi_open_volume(ubi_num, vol_id, mode); + +out_put: + module_put(THIS_MODULE); + return ret; +} +EXPORT_SYMBOL_GPL(ubi_open_volume_nm); + +/** + * ubi_close_volume - close UBI volume. + * @desc: volume descriptor + */ +void ubi_close_volume(struct ubi_volume_desc *desc) +{ + struct ubi_volume *vol = desc->vol; + + dbg_msg("close volume %d, mode %d", vol->vol_id, desc->mode); + + spin_lock(&vol->ubi->volumes_lock); + switch (desc->mode) { + case UBI_READONLY: + vol->readers -= 1; + break; + case UBI_READWRITE: + vol->writers -= 1; + break; + case UBI_EXCLUSIVE: + vol->exclusive = 0; + } + spin_unlock(&vol->ubi->volumes_lock); + + kfree(desc); + module_put(THIS_MODULE); +} +EXPORT_SYMBOL_GPL(ubi_close_volume); + +/** + * ubi_leb_read - read data. + * @desc: volume descriptor + * @lnum: logical eraseblock number to read from + * @buf: buffer where to store the read data + * @offset: offset within the logical eraseblock to read from + * @len: how many bytes to read + * @check: whether UBI has to check the read data's CRC or not. + * + * This function reads data from offset @offset of logical eraseblock @lnum and + * stores the data at @buf. When reading from static volumes, @check specifies + * whether the data has to be checked or not. If yes, the whole logical + * eraseblock will be read and its CRC checksum will be checked (i.e., the CRC + * checksum is per-eraseblock). So checking may substantially slow down the + * read speed. The @check argument is ignored for dynamic volumes. + * + * In case of success, this function returns zero. In case of failure, this + * function returns a negative error code. + * + * %-EBADMSG error code is returned: + * o for both static and dynamic volumes if MTD driver has detected a data + * integrity problem (unrecoverable ECC checksum mismatch in case of NAND); + * o for static volumes in case of data CRC mismatch. + * + * If the volume is damaged because of an interrupted update this function just + * returns immediately with %-EBADF error code. 
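Taken together, the functions above form the in-kernel client interface of UBI. Below is a minimal, illustrative sketch of a client that opens a volume by name, reads the first bytes of LEB 0 and closes the volume again; the UBI device number, the volume name and the header path are assumptions made for the example only.

	#include <linux/mtd/ubi.h>	/* assumed location of the public UBI header */

	static int example_read_leb0(void)
	{
		struct ubi_volume_desc *desc;
		char buf[128];
		int err;

		desc = ubi_open_volume_nm(0, "config", UBI_READONLY);
		if (IS_ERR(desc))
			return PTR_ERR(desc);

		/* Read 128 bytes from offset 0 of LEB 0, with CRC checking */
		err = ubi_leb_read(desc, 0, buf, 0, sizeof(buf), 1);

		ubi_close_volume(desc);
		return err;
	}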
+ */ +int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, + int len, int check) +{ + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + int err, vol_id = vol->vol_id; + + dbg_msg("read %d bytes from LEB %d:%d:%d", len, vol_id, lnum, offset); + + if (vol_id < 0 || vol_id >= ubi->vtbl_slots || lnum < 0 || + lnum >= vol->used_ebs || offset < 0 || len < 0 || + offset + len > vol->usable_leb_size) + return -EINVAL; + + if (vol->vol_type == UBI_STATIC_VOLUME && lnum == vol->used_ebs - 1 && + offset + len > vol->last_eb_bytes) + return -EINVAL; + + if (vol->upd_marker) + return -EBADF; + if (len == 0) + return 0; + + err = ubi_eba_read_leb(ubi, vol_id, lnum, buf, offset, len, check); + if (err && err == -EBADMSG && vol->vol_type == UBI_STATIC_VOLUME) { + ubi_warn("mark volume %d as corrupted", vol_id); + vol->corrupted = 1; + } + + return err; +} +EXPORT_SYMBOL_GPL(ubi_leb_read); + +/** + * ubi_leb_write - write data. + * @desc: volume descriptor + * @lnum: logical eraseblock number to write to + * @buf: data to write + * @offset: offset within the logical eraseblock where to write + * @len: how many bytes to write + * @dtype: expected data type + * + * This function writes @len bytes of data from @buf to offset @offset of + * logical eraseblock @lnum. The @dtype argument describes expected lifetime of + * the data. + * + * This function takes care of physical eraseblock write failures. If write to + * the physical eraseblock write operation fails, the logical eraseblock is + * re-mapped to another physical eraseblock, the data is recovered, and the + * write finishes. UBI has a pool of reserved physical eraseblocks for this. + * + * If all the data were successfully written, zero is returned. If an error + * occurred and UBI has not been able to recover from it, this function returns + * a negative error code. Note, in case of an error, it is possible that + * something was still written to the flash media, but that may be some + * garbage. + * + * If the volume is damaged because of an interrupted update this function just + * returns immediately with %-EBADF code. + */ +int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, + int offset, int len, int dtype) +{ + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + int vol_id = vol->vol_id; + + dbg_msg("write %d bytes to LEB %d:%d:%d", len, vol_id, lnum, offset); + + if (vol_id < 0 || vol_id >= ubi->vtbl_slots) + return -EINVAL; + + if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME) + return -EROFS; + + if (lnum < 0 || lnum >= vol->reserved_pebs || offset < 0 || len < 0 || + offset + len > vol->usable_leb_size || offset % ubi->min_io_size || + len % ubi->min_io_size) + return -EINVAL; + + if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM && + dtype != UBI_UNKNOWN) + return -EINVAL; + + if (vol->upd_marker) + return -EBADF; + + if (len == 0) + return 0; + + return ubi_eba_write_leb(ubi, vol_id, lnum, buf, offset, len, dtype); +} +EXPORT_SYMBOL_GPL(ubi_leb_write); + +/* + * ubi_leb_change - change logical eraseblock atomically. + * @desc: volume descriptor + * @lnum: logical eraseblock number to change + * @buf: data to write + * @len: how many bytes to write + * @dtype: expected data type + * + * This function changes the contents of a logical eraseblock atomically. @buf + * has to contain new logical eraseblock data, and @len - the length of the + * data, which has to be aligned. 
The length may be shorter than the logical
+ * eraseblock size, and more data may be appended to the logical eraseblock
+ * later on. This function guarantees that in case of an unclean reboot the old
+ * contents are preserved. Returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
+ int len, int dtype)
+{
+ struct ubi_volume *vol = desc->vol;
+ struct ubi_device *ubi = vol->ubi;
+ int vol_id = vol->vol_id;
+
+ dbg_msg("atomically write %d bytes to LEB %d:%d", len, vol_id, lnum);
+
+ if (vol_id < 0 || vol_id >= ubi->vtbl_slots)
+ return -EINVAL;
+
+ if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME)
+ return -EROFS;
+
+ if (lnum < 0 || lnum >= vol->reserved_pebs || len < 0 ||
+ len > vol->usable_leb_size || len % ubi->min_io_size)
+ return -EINVAL;
+
+ if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM &&
+ dtype != UBI_UNKNOWN)
+ return -EINVAL;
+
+ if (vol->upd_marker)
+ return -EBADF;
+
+ if (len == 0)
+ return 0;
+
+ return ubi_eba_atomic_leb_change(ubi, vol_id, lnum, buf, len, dtype);
+}
+EXPORT_SYMBOL_GPL(ubi_leb_change);
+
+/**
+ * ubi_leb_erase - erase logical eraseblock.
+ * @desc: volume descriptor
+ * @lnum: logical eraseblock number
+ *
+ * This function un-maps logical eraseblock @lnum and synchronously erases the
+ * corresponding physical eraseblock. Returns zero in case of success and a
+ * negative error code in case of failure.
+ *
+ * If the volume is damaged because of an interrupted update this function just
+ * returns immediately with %-EBADF code.
+ */
+int ubi_leb_erase(struct ubi_volume_desc *desc, int lnum)
+{
+ struct ubi_volume *vol = desc->vol;
+ struct ubi_device *ubi = vol->ubi;
+ int err, vol_id = vol->vol_id;
+
+ dbg_msg("erase LEB %d:%d", vol_id, lnum);
+
+ if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME)
+ return -EROFS;
+
+ if (lnum < 0 || lnum >= vol->reserved_pebs)
+ return -EINVAL;
+
+ if (vol->upd_marker)
+ return -EBADF;
+
+ err = ubi_eba_unmap_leb(ubi, vol_id, lnum);
+ if (err)
+ return err;
+
+ return ubi_wl_flush(ubi);
+}
+EXPORT_SYMBOL_GPL(ubi_leb_erase);
+
+/**
+ * ubi_leb_unmap - un-map logical eraseblock.
+ * @desc: volume descriptor
+ * @lnum: logical eraseblock number
+ *
+ * This function un-maps logical eraseblock @lnum and schedules the
+ * corresponding physical eraseblock for erasure, so that it will eventually be
+ * physically erased in the background. This operation is much faster than the
+ * erase operation.
+ *
+ * Unlike erase, the un-map operation does not guarantee that the logical
+ * eraseblock will contain all 0xFF bytes when UBI is initialized again. For
+ * example, if several logical eraseblocks are un-mapped, and an unclean reboot
+ * happens after this, the logical eraseblocks will not necessarily be
+ * un-mapped again when this MTD device is attached. They may actually be
+ * mapped to the same physical eraseblocks again. So, this function has to be
+ * used with care.
+ *
+ * In other words, when un-mapping a logical eraseblock, UBI does not store
+ * any information about this on the flash media, it just marks the logical
+ * eraseblock as "un-mapped" in RAM. If UBI is detached before the physical
+ * eraseblock is physically erased, it will be mapped again to the same logical
+ * eraseblock when the MTD device is attached again.
+ *
+ * The main and obvious use-case of this function is when the contents of a
+ * logical eraseblock have to be re-written.
Then it is much more efficient to + * first un-map it, then write new data, rather then first erase it, then write + * new data. Note, once new data has been written to the logical eraseblock, + * UBI guarantees that the old contents has gone forever. In other words, if an + * unclean reboot happens after the logical eraseblock has been un-mapped and + * then written to, it will contain the last written data. + * + * This function returns zero in case of success and a negative error code in + * case of failure. If the volume is damaged because of an interrupted update + * this function just returns immediately with %-EBADF code. + */ +int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum) +{ + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + int vol_id = vol->vol_id; + + dbg_msg("unmap LEB %d:%d", vol_id, lnum); + + if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME) + return -EROFS; + + if (lnum < 0 || lnum >= vol->reserved_pebs) + return -EINVAL; + + if (vol->upd_marker) + return -EBADF; + + return ubi_eba_unmap_leb(ubi, vol_id, lnum); +} +EXPORT_SYMBOL_GPL(ubi_leb_unmap); + +/** + * ubi_is_mapped - check if logical eraseblock is mapped. + * @desc: volume descriptor + * @lnum: logical eraseblock number + * + * This function checks if logical eraseblock @lnum is mapped to a physical + * eraseblock. If a logical eraseblock is un-mapped, this does not necessarily + * mean it will still be un-mapped after the UBI device is re-attached. The + * logical eraseblock may become mapped to the physical eraseblock it was last + * mapped to. + * + * This function returns %1 if the LEB is mapped, %0 if not, and a negative + * error code in case of failure. If the volume is damaged because of an + * interrupted update this function just returns immediately with %-EBADF error + * code. + */ +int ubi_is_mapped(struct ubi_volume_desc *desc, int lnum) +{ + struct ubi_volume *vol = desc->vol; + + dbg_msg("test LEB %d:%d", vol->vol_id, lnum); + + if (lnum < 0 || lnum >= vol->reserved_pebs) + return -EINVAL; + + if (vol->upd_marker) + return -EBADF; + + return vol->eba_tbl[lnum] >= 0; +} +EXPORT_SYMBOL_GPL(ubi_is_mapped); diff --git a/drivers/mtd/ubi/misc.c b/drivers/mtd/ubi/misc.c new file mode 100644 index 00000000000..38d4e6757dc --- /dev/null +++ b/drivers/mtd/ubi/misc.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +/* Here we keep miscellaneous functions which are used all over the UBI code */ + +#include "ubi.h" + +/** + * calc_data_len - calculate how much real data is stored in a buffer. 
+ * @ubi: UBI device description object
+ * @buf: a buffer with the contents of the physical eraseblock
+ * @length: the buffer length
+ *
+ * This function calculates how much "real data" is stored in @buf and returns
+ * the length. Trailing 0xFF bytes in the buffer are not considered as
+ * "real data".
+ */
+int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf,
+ int length)
+{
+ int i;
+
+ ubi_assert(length % ubi->min_io_size == 0);
+
+ for (i = length - 1; i >= 0; i--)
+ if (((const uint8_t *)buf)[i] != 0xFF)
+ break;
+
+ /* The resulting length must be aligned to the minimum flash I/O size */
+ length = ALIGN(i + 1, ubi->min_io_size);
+ return length;
+}
+
+/**
+ * ubi_check_volume - check the contents of a static volume.
+ * @ubi: UBI device description object
+ * @vol_id: ID of the volume to check
+ *
+ * This function checks if static volume @vol_id is corrupted by fully reading
+ * it and checking data CRC. This function returns %0 if the volume is not
+ * corrupted, %1 if it is corrupted and a negative error code in case of
+ * failure. Dynamic volumes are not checked and zero is returned immediately.
+ */
+int ubi_check_volume(struct ubi_device *ubi, int vol_id)
+{
+ void *buf;
+ int err = 0, i;
+ struct ubi_volume *vol = ubi->volumes[vol_id];
+
+ if (vol->vol_type != UBI_STATIC_VOLUME)
+ return 0;
+
+ buf = kmalloc(vol->usable_leb_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ for (i = 0; i < vol->used_ebs; i++) {
+ int size;
+
+ if (i == vol->used_ebs - 1)
+ size = vol->last_eb_bytes;
+ else
+ size = vol->usable_leb_size;
+
+ err = ubi_eba_read_leb(ubi, vol_id, i, buf, 0, size, 1);
+ if (err) {
+ if (err == -EBADMSG)
+ err = 1;
+ break;
+ }
+ }
+
+ kfree(buf);
+ return err;
+}
+
+/**
+ * ubi_calculate_reserved - calculate how many PEBs must be reserved for bad
+ * eraseblock handling.
+ * @ubi: UBI device description object
+ */
+void ubi_calculate_reserved(struct ubi_device *ubi)
+{
+ ubi->beb_rsvd_level = ubi->good_peb_count/100;
+ ubi->beb_rsvd_level *= CONFIG_MTD_UBI_BEB_RESERVE;
+ if (ubi->beb_rsvd_level < MIN_RESEVED_PEBS)
+ ubi->beb_rsvd_level = MIN_RESEVED_PEBS;
+}
diff --git a/drivers/mtd/ubi/scan.c b/drivers/mtd/ubi/scan.c new file mode 100644 index 00000000000..473f3200b86 --- /dev/null +++ b/drivers/mtd/ubi/scan.c @@ -0,0 +1,1368 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * UBI scanning unit.
+ *
+ * This unit is responsible for scanning the flash media, checking UBI
+ * headers and providing complete information about the UBI flash image.
+ *
+ * The scanning information is represented by a &struct ubi_scan_info object.
+ * Information about found volumes is represented by &struct ubi_scan_volume + * objects which are kept in volume RB-tree with root at the @volumes field. + * The RB-tree is indexed by the volume ID. + * + * Found logical eraseblocks are represented by &struct ubi_scan_leb objects. + * These objects are kept in per-volume RB-trees with the root at the + * corresponding &struct ubi_scan_volume object. To put it differently, we keep + * an RB-tree of per-volume objects and each of these objects is the root of + * RB-tree of per-eraseblock objects. + * + * Corrupted physical eraseblocks are put to the @corr list, free physical + * eraseblocks are put to the @free list and the physical eraseblock to be + * erased are put to the @erase list. + */ + +#include <linux/err.h> +#include <linux/crc32.h> +#include "ubi.h" + +#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID +static int paranoid_check_si(const struct ubi_device *ubi, + struct ubi_scan_info *si); +#else +#define paranoid_check_si(ubi, si) 0 +#endif + +/* Temporary variables used during scanning */ +static struct ubi_ec_hdr *ech; +static struct ubi_vid_hdr *vidh; + +int ubi_scan_add_to_list(struct ubi_scan_info *si, int pnum, int ec, + struct list_head *list) +{ + struct ubi_scan_leb *seb; + + if (list == &si->free) + dbg_bld("add to free: PEB %d, EC %d", pnum, ec); + else if (list == &si->erase) + dbg_bld("add to erase: PEB %d, EC %d", pnum, ec); + else if (list == &si->corr) + dbg_bld("add to corrupted: PEB %d, EC %d", pnum, ec); + else if (list == &si->alien) + dbg_bld("add to alien: PEB %d, EC %d", pnum, ec); + else + BUG(); + + seb = kmalloc(sizeof(struct ubi_scan_leb), GFP_KERNEL); + if (!seb) + return -ENOMEM; + + seb->pnum = pnum; + seb->ec = ec; + list_add_tail(&seb->u.list, list); + return 0; +} + +/** + * commit_to_mean_value - commit intermediate results to the final mean erase + * counter value. + * @si: scanning information + * + * This is a helper function which calculates partial mean erase counter mean + * value and adds it to the resulting mean value. As we can work only in + * integer arithmetic and we want to calculate the mean value of erase counter + * accurately, we first sum erase counter values in @si->ec_sum variable and + * count these components in @si->ec_count. If this temporary @si->ec_sum is + * going to overflow, we calculate the partial mean value + * (@si->ec_sum/@si->ec_count) and add it to @si->mean_ec. + */ +static void commit_to_mean_value(struct ubi_scan_info *si) +{ + si->ec_sum /= si->ec_count; + if (si->ec_sum % si->ec_count >= si->ec_count / 2) + si->mean_ec += 1; + si->mean_ec += si->ec_sum; +} + +/** + * validate_vid_hdr - check that volume identifier header is correct and + * consistent. + * @vid_hdr: the volume identifier header to check + * @sv: information about the volume this logical eraseblock belongs to + * @pnum: physical eraseblock number the VID header came from + * + * This function checks that data stored in @vid_hdr is consistent. Returns + * non-zero if an inconsistency was found and zero if not. + * + * Note, UBI does sanity check of everything it reads from the flash media. + * Most of the checks are done in the I/O unit. Here we check that the + * information in the VID header is consistent to the information in other VID + * headers of the same volume. 
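The tree-of-trees layout described at the top of this file can be walked with the ordinary rbtree helpers. The sketch below shows how every logical eraseblock found by scanning could be visited; it is illustrative only and relies on the @volumes, @rb, @root and @u.rb fields used elsewhere in this unit.

	struct rb_node *nv, *nl;
	struct ubi_scan_volume *sv;
	struct ubi_scan_leb *seb;

	for (nv = rb_first(&si->volumes); nv; nv = rb_next(nv)) {
		sv = rb_entry(nv, struct ubi_scan_volume, rb);
		for (nl = rb_first(&sv->root); nl; nl = rb_next(nl)) {
			seb = rb_entry(nl, struct ubi_scan_leb, u.rb);
			/* PEB seb->pnum carries LEB seb->lnum of volume sv->vol_id */
		}
	}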
+ */ +static int validate_vid_hdr(const struct ubi_vid_hdr *vid_hdr, + const struct ubi_scan_volume *sv, int pnum) +{ + int vol_type = vid_hdr->vol_type; + int vol_id = ubi32_to_cpu(vid_hdr->vol_id); + int used_ebs = ubi32_to_cpu(vid_hdr->used_ebs); + int data_pad = ubi32_to_cpu(vid_hdr->data_pad); + + if (sv->leb_count != 0) { + int sv_vol_type; + + /* + * This is not the first logical eraseblock belonging to this + * volume. Ensure that the data in its VID header is consistent + * to the data in previous logical eraseblock headers. + */ + + if (vol_id != sv->vol_id) { + dbg_err("inconsistent vol_id"); + goto bad; + } + + if (sv->vol_type == UBI_STATIC_VOLUME) + sv_vol_type = UBI_VID_STATIC; + else + sv_vol_type = UBI_VID_DYNAMIC; + + if (vol_type != sv_vol_type) { + dbg_err("inconsistent vol_type"); + goto bad; + } + + if (used_ebs != sv->used_ebs) { + dbg_err("inconsistent used_ebs"); + goto bad; + } + + if (data_pad != sv->data_pad) { + dbg_err("inconsistent data_pad"); + goto bad; + } + } + + return 0; + +bad: + ubi_err("inconsistent VID header at PEB %d", pnum); + ubi_dbg_dump_vid_hdr(vid_hdr); + ubi_dbg_dump_sv(sv); + return -EINVAL; +} + +/** + * add_volume - add volume to the scanning information. + * @si: scanning information + * @vol_id: ID of the volume to add + * @pnum: physical eraseblock number + * @vid_hdr: volume identifier header + * + * If the volume corresponding to the @vid_hdr logical eraseblock is already + * present in the scanning information, this function does nothing. Otherwise + * it adds corresponding volume to the scanning information. Returns a pointer + * to the scanning volume object in case of success and a negative error code + * in case of failure. + */ +static struct ubi_scan_volume *add_volume(struct ubi_scan_info *si, int vol_id, + int pnum, + const struct ubi_vid_hdr *vid_hdr) +{ + struct ubi_scan_volume *sv; + struct rb_node **p = &si->volumes.rb_node, *parent = NULL; + + ubi_assert(vol_id == ubi32_to_cpu(vid_hdr->vol_id)); + + /* Walk the volume RB-tree to look if this volume is already present */ + while (*p) { + parent = *p; + sv = rb_entry(parent, struct ubi_scan_volume, rb); + + if (vol_id == sv->vol_id) + return sv; + + if (vol_id > sv->vol_id) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + /* The volume is absent - add it */ + sv = kmalloc(sizeof(struct ubi_scan_volume), GFP_KERNEL); + if (!sv) + return ERR_PTR(-ENOMEM); + + sv->highest_lnum = sv->leb_count = 0; + si->max_sqnum = 0; + sv->vol_id = vol_id; + sv->root = RB_ROOT; + sv->used_ebs = ubi32_to_cpu(vid_hdr->used_ebs); + sv->data_pad = ubi32_to_cpu(vid_hdr->data_pad); + sv->compat = vid_hdr->compat; + sv->vol_type = vid_hdr->vol_type == UBI_VID_DYNAMIC ? UBI_DYNAMIC_VOLUME + : UBI_STATIC_VOLUME; + if (vol_id > si->highest_vol_id) + si->highest_vol_id = vol_id; + + rb_link_node(&sv->rb, parent, p); + rb_insert_color(&sv->rb, &si->volumes); + si->vols_found += 1; + dbg_bld("added volume %d", vol_id); + return sv; +} + +/** + * compare_lebs - find out which logical eraseblock is newer. + * @ubi: UBI device description object + * @seb: first logical eraseblock to compare + * @pnum: physical eraseblock number of the second logical eraseblock to + * compare + * @vid_hdr: volume identifier header of the second logical eraseblock + * + * This function compares 2 copies of a LEB and informs which one is newer. In + * case of success this function returns a positive value, in case of failure, a + * negative error code is returned. 
The success return codes use the following + * bits: + * o bit 0 is cleared: the first PEB (described by @seb) is newer then the + * second PEB (described by @pnum and @vid_hdr); + * o bit 0 is set: the second PEB is newer; + * o bit 1 is cleared: no bit-flips were detected in the newer LEB; + * o bit 1 is set: bit-flips were detected in the newer LEB; + * o bit 2 is cleared: the older LEB is not corrupted; + * o bit 2 is set: the older LEB is corrupted. + */ +static int compare_lebs(const struct ubi_device *ubi, + const struct ubi_scan_leb *seb, int pnum, + const struct ubi_vid_hdr *vid_hdr) +{ + void *buf; + int len, err, second_is_newer, bitflips = 0, corrupted = 0; + uint32_t data_crc, crc; + struct ubi_vid_hdr *vidh = NULL; + unsigned long long sqnum2 = ubi64_to_cpu(vid_hdr->sqnum); + + if (seb->sqnum == 0 && sqnum2 == 0) { + long long abs, v1 = seb->leb_ver, v2 = ubi32_to_cpu(vid_hdr->leb_ver); + + /* + * UBI constantly increases the logical eraseblock version + * number and it can overflow. Thus, we have to bear in mind + * that versions that are close to %0xFFFFFFFF are less then + * versions that are close to %0. + * + * The UBI WL unit guarantees that the number of pending tasks + * is not greater then %0x7FFFFFFF. So, if the difference + * between any two versions is greater or equivalent to + * %0x7FFFFFFF, there was an overflow and the logical + * eraseblock with lower version is actually newer then the one + * with higher version. + * + * FIXME: but this is anyway obsolete and will be removed at + * some point. + */ + + dbg_bld("using old crappy leb_ver stuff"); + + abs = v1 - v2; + if (abs < 0) + abs = -abs; + + if (abs < 0x7FFFFFFF) + /* Non-overflow situation */ + second_is_newer = (v2 > v1); + else + second_is_newer = (v2 < v1); + } else + /* Obviously the LEB with lower sequence counter is older */ + second_is_newer = sqnum2 > seb->sqnum; + + /* + * Now we know which copy is newer. If the copy flag of the PEB with + * newer version is not set, then we just return, otherwise we have to + * check data CRC. For the second PEB we already have the VID header, + * for the first one - we'll need to re-read it from flash. + * + * FIXME: this may be optimized so that we wouldn't read twice. 
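The wrap-around rule described above can be written down in a few lines of plain C. The helper below is only an illustration of the comparison, not part of the patch, and it relies on the same assumption: the distance between any two live versions is always below 0x7FFFFFFF.

	/* Return non-zero if v2 is newer than v1, allowing 32-bit wrap-around */
	static int leb_ver_is_newer(uint32_t v1, uint32_t v2)
	{
		uint32_t dist = v2 - v1;	/* computed modulo 2^32 */

		return dist != 0 && dist < 0x7FFFFFFFU;
	}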
+ */ + + if (second_is_newer) { + if (!vid_hdr->copy_flag) { + /* It is not a copy, so it is newer */ + dbg_bld("second PEB %d is newer, copy_flag is unset", + pnum); + return 1; + } + } else { + pnum = seb->pnum; + + vidh = ubi_zalloc_vid_hdr(ubi); + if (!vidh) + return -ENOMEM; + + err = ubi_io_read_vid_hdr(ubi, pnum, vidh, 0); + if (err) { + if (err == UBI_IO_BITFLIPS) + bitflips = 1; + else { + dbg_err("VID of PEB %d header is bad, but it " + "was OK earlier", pnum); + if (err > 0) + err = -EIO; + + goto out_free_vidh; + } + } + + if (!vidh->copy_flag) { + /* It is not a copy, so it is newer */ + dbg_bld("first PEB %d is newer, copy_flag is unset", + pnum); + err = bitflips << 1; + goto out_free_vidh; + } + + vid_hdr = vidh; + } + + /* Read the data of the copy and check the CRC */ + + len = ubi32_to_cpu(vid_hdr->data_size); + buf = kmalloc(len, GFP_KERNEL); + if (!buf) { + err = -ENOMEM; + goto out_free_vidh; + } + + err = ubi_io_read_data(ubi, buf, pnum, 0, len); + if (err && err != UBI_IO_BITFLIPS) + goto out_free_buf; + + data_crc = ubi32_to_cpu(vid_hdr->data_crc); + crc = crc32(UBI_CRC32_INIT, buf, len); + if (crc != data_crc) { + dbg_bld("PEB %d CRC error: calculated %#08x, must be %#08x", + pnum, crc, data_crc); + corrupted = 1; + bitflips = 0; + second_is_newer = !second_is_newer; + } else { + dbg_bld("PEB %d CRC is OK", pnum); + bitflips = !!err; + } + + kfree(buf); + ubi_free_vid_hdr(ubi, vidh); + + if (second_is_newer) + dbg_bld("second PEB %d is newer, copy_flag is set", pnum); + else + dbg_bld("first PEB %d is newer, copy_flag is set", pnum); + + return second_is_newer | (bitflips << 1) | (corrupted << 2); + +out_free_buf: + kfree(buf); +out_free_vidh: + ubi_free_vid_hdr(ubi, vidh); + ubi_assert(err < 0); + return err; +} + +/** + * ubi_scan_add_used - add information about a physical eraseblock to the + * scanning information. + * @ubi: UBI device description object + * @si: scanning information + * @pnum: the physical eraseblock number + * @ec: erase counter + * @vid_hdr: the volume identifier header + * @bitflips: if bit-flips were detected when this physical eraseblock was read + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +int ubi_scan_add_used(const struct ubi_device *ubi, struct ubi_scan_info *si, + int pnum, int ec, const struct ubi_vid_hdr *vid_hdr, + int bitflips) +{ + int err, vol_id, lnum; + uint32_t leb_ver; + unsigned long long sqnum; + struct ubi_scan_volume *sv; + struct ubi_scan_leb *seb; + struct rb_node **p, *parent = NULL; + + vol_id = ubi32_to_cpu(vid_hdr->vol_id); + lnum = ubi32_to_cpu(vid_hdr->lnum); + sqnum = ubi64_to_cpu(vid_hdr->sqnum); + leb_ver = ubi32_to_cpu(vid_hdr->leb_ver); + + dbg_bld("PEB %d, LEB %d:%d, EC %d, sqnum %llu, ver %u, bitflips %d", + pnum, vol_id, lnum, ec, sqnum, leb_ver, bitflips); + + sv = add_volume(si, vol_id, pnum, vid_hdr); + if (IS_ERR(sv) < 0) + return PTR_ERR(sv); + + /* + * Walk the RB-tree of logical eraseblocks of volume @vol_id to look + * if this is the first instance of this logical eraseblock or not. + */ + p = &sv->root.rb_node; + while (*p) { + int cmp_res; + + parent = *p; + seb = rb_entry(parent, struct ubi_scan_leb, u.rb); + if (lnum != seb->lnum) { + if (lnum < seb->lnum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + continue; + } + + /* + * There is already a physical eraseblock describing the same + * logical eraseblock present. 
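+		 * Two physical eraseblocks carrying the same logical eraseblock
+		 * is an expected situation: it happens, for instance, when
+		 * wear-leveling was copying the LEB to another PEB and a power
+		 * cut struck before the old copy could be erased. One of the
+		 * copies has to be picked and the other one dropped.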
+ */ + + dbg_bld("this LEB already exists: PEB %d, sqnum %llu, " + "LEB ver %u, EC %d", seb->pnum, seb->sqnum, + seb->leb_ver, seb->ec); + + /* + * Make sure that the logical eraseblocks have different + * versions. Otherwise the image is bad. + */ + if (seb->leb_ver == leb_ver && leb_ver != 0) { + ubi_err("two LEBs with same version %u", leb_ver); + ubi_dbg_dump_seb(seb, 0); + ubi_dbg_dump_vid_hdr(vid_hdr); + return -EINVAL; + } + + /* + * Make sure that the logical eraseblocks have different + * sequence numbers. Otherwise the image is bad. + * + * FIXME: remove 'sqnum != 0' check when leb_ver is removed. + */ + if (seb->sqnum == sqnum && sqnum != 0) { + ubi_err("two LEBs with same sequence number %llu", + sqnum); + ubi_dbg_dump_seb(seb, 0); + ubi_dbg_dump_vid_hdr(vid_hdr); + return -EINVAL; + } + + /* + * Now we have to drop the older one and preserve the newer + * one. + */ + cmp_res = compare_lebs(ubi, seb, pnum, vid_hdr); + if (cmp_res < 0) + return cmp_res; + + if (cmp_res & 1) { + /* + * This logical eraseblock is newer then the one + * found earlier. + */ + err = validate_vid_hdr(vid_hdr, sv, pnum); + if (err) + return err; + + if (cmp_res & 4) + err = ubi_scan_add_to_list(si, seb->pnum, + seb->ec, &si->corr); + else + err = ubi_scan_add_to_list(si, seb->pnum, + seb->ec, &si->erase); + if (err) + return err; + + seb->ec = ec; + seb->pnum = pnum; + seb->scrub = ((cmp_res & 2) || bitflips); + seb->sqnum = sqnum; + seb->leb_ver = leb_ver; + + if (sv->highest_lnum == lnum) + sv->last_data_size = + ubi32_to_cpu(vid_hdr->data_size); + + return 0; + } else { + /* + * This logical eraseblock is older then the one found + * previously. + */ + if (cmp_res & 4) + return ubi_scan_add_to_list(si, pnum, ec, + &si->corr); + else + return ubi_scan_add_to_list(si, pnum, ec, + &si->erase); + } + } + + /* + * We've met this logical eraseblock for the first time, add it to the + * scanning information. + */ + + err = validate_vid_hdr(vid_hdr, sv, pnum); + if (err) + return err; + + seb = kmalloc(sizeof(struct ubi_scan_leb), GFP_KERNEL); + if (!seb) + return -ENOMEM; + + seb->ec = ec; + seb->pnum = pnum; + seb->lnum = lnum; + seb->sqnum = sqnum; + seb->scrub = bitflips; + seb->leb_ver = leb_ver; + + if (sv->highest_lnum <= lnum) { + sv->highest_lnum = lnum; + sv->last_data_size = ubi32_to_cpu(vid_hdr->data_size); + } + + if (si->max_sqnum < sqnum) + si->max_sqnum = sqnum; + + sv->leb_count += 1; + rb_link_node(&seb->u.rb, parent, p); + rb_insert_color(&seb->u.rb, &sv->root); + return 0; +} + +/** + * ubi_scan_find_sv - find information about a particular volume in the + * scanning information. + * @si: scanning information + * @vol_id: the requested volume ID + * + * This function returns a pointer to the volume description or %NULL if there + * are no data about this volume in the scanning information. + */ +struct ubi_scan_volume *ubi_scan_find_sv(const struct ubi_scan_info *si, + int vol_id) +{ + struct ubi_scan_volume *sv; + struct rb_node *p = si->volumes.rb_node; + + while (p) { + sv = rb_entry(p, struct ubi_scan_volume, rb); + + if (vol_id == sv->vol_id) + return sv; + + if (vol_id > sv->vol_id) + p = p->rb_left; + else + p = p->rb_right; + } + + return NULL; +} + +/** + * ubi_scan_find_seb - find information about a particular logical + * eraseblock in the volume scanning information. 
+ * @sv: a pointer to the volume scanning information + * @lnum: the requested logical eraseblock + * + * This function returns a pointer to the scanning logical eraseblock or %NULL + * if there are no data about it in the scanning volume information. + */ +struct ubi_scan_leb *ubi_scan_find_seb(const struct ubi_scan_volume *sv, + int lnum) +{ + struct ubi_scan_leb *seb; + struct rb_node *p = sv->root.rb_node; + + while (p) { + seb = rb_entry(p, struct ubi_scan_leb, u.rb); + + if (lnum == seb->lnum) + return seb; + + if (lnum > seb->lnum) + p = p->rb_left; + else + p = p->rb_right; + } + + return NULL; +} + +/** + * ubi_scan_rm_volume - delete scanning information about a volume. + * @si: scanning information + * @sv: the volume scanning information to delete + */ +void ubi_scan_rm_volume(struct ubi_scan_info *si, struct ubi_scan_volume *sv) +{ + struct rb_node *rb; + struct ubi_scan_leb *seb; + + dbg_bld("remove scanning information about volume %d", sv->vol_id); + + while ((rb = rb_first(&sv->root))) { + seb = rb_entry(rb, struct ubi_scan_leb, u.rb); + rb_erase(&seb->u.rb, &sv->root); + list_add_tail(&seb->u.list, &si->erase); + } + + rb_erase(&sv->rb, &si->volumes); + kfree(sv); + si->vols_found -= 1; +} + +/** + * ubi_scan_erase_peb - erase a physical eraseblock. + * @ubi: UBI device description object + * @si: scanning information + * @pnum: physical eraseblock number to erase; + * @ec: erase counter value to write (%UBI_SCAN_UNKNOWN_EC if it is unknown) + * + * This function erases physical eraseblock 'pnum', and writes the erase + * counter header to it. This function should only be used on UBI device + * initialization stages, when the EBA unit had not been yet initialized. This + * function returns zero in case of success and a negative error code in case + * of failure. + */ +int ubi_scan_erase_peb(const struct ubi_device *ubi, + const struct ubi_scan_info *si, int pnum, int ec) +{ + int err; + struct ubi_ec_hdr *ec_hdr; + + ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); + if (!ec_hdr) + return -ENOMEM; + + if ((long long)ec >= UBI_MAX_ERASECOUNTER) { + /* + * Erase counter overflow. Upgrade UBI and use 64-bit + * erase counters internally. + */ + ubi_err("erase counter overflow at PEB %d, EC %d", pnum, ec); + return -EINVAL; + } + + ec_hdr->ec = cpu_to_ubi64(ec); + + err = ubi_io_sync_erase(ubi, pnum, 0); + if (err < 0) + goto out_free; + + err = ubi_io_write_ec_hdr(ubi, pnum, ec_hdr); + +out_free: + kfree(ec_hdr); + return err; +} + +/** + * ubi_scan_get_free_peb - get a free physical eraseblock. + * @ubi: UBI device description object + * @si: scanning information + * + * This function returns a free physical eraseblock. It is supposed to be + * called on the UBI initialization stages when the wear-leveling unit is not + * initialized yet. This function picks a physical eraseblocks from one of the + * lists, writes the EC header if it is needed, and removes it from the list. + * + * This function returns scanning physical eraseblock information in case of + * success and an error code in case of failure. 
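+ *
+ * Note, eraseblocks from the @si->free list are preferred because they
+ * already carry a valid erase counter header. Only if that list is empty are
+ * candidates taken from the @si->erase and @si->corr lists, and those are
+ * erased (and given a fresh EC header) before being returned.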
+ */ +struct ubi_scan_leb *ubi_scan_get_free_peb(const struct ubi_device *ubi, + struct ubi_scan_info *si) +{ + int err = 0, i; + struct ubi_scan_leb *seb; + + if (!list_empty(&si->free)) { + seb = list_entry(si->free.next, struct ubi_scan_leb, u.list); + list_del(&seb->u.list); + dbg_bld("return free PEB %d, EC %d", seb->pnum, seb->ec); + return seb; + } + + for (i = 0; i < 2; i++) { + struct list_head *head; + struct ubi_scan_leb *tmp_seb; + + if (i == 0) + head = &si->erase; + else + head = &si->corr; + + /* + * We try to erase the first physical eraseblock from the @head + * list and pick it if we succeed, or try to erase the + * next one if not. And so forth. We don't want to take care + * about bad eraseblocks here - they'll be handled later. + */ + list_for_each_entry_safe(seb, tmp_seb, head, u.list) { + if (seb->ec == UBI_SCAN_UNKNOWN_EC) + seb->ec = si->mean_ec; + + err = ubi_scan_erase_peb(ubi, si, seb->pnum, seb->ec+1); + if (err) + continue; + + seb->ec += 1; + list_del(&seb->u.list); + dbg_bld("return PEB %d, EC %d", seb->pnum, seb->ec); + return seb; + } + } + + ubi_err("no eraseblocks found"); + return ERR_PTR(-ENOSPC); +} + +/** + * process_eb - read UBI headers, check them and add corresponding data + * to the scanning information. + * @ubi: UBI device description object + * @si: scanning information + * @pnum: the physical eraseblock number + * + * This function returns a zero if the physical eraseblock was succesfully + * handled and a negative error code in case of failure. + */ +static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, int pnum) +{ + long long ec; + int err, bitflips = 0, vol_id, ec_corr = 0; + + dbg_bld("scan PEB %d", pnum); + + /* Skip bad physical eraseblocks */ + err = ubi_io_is_bad(ubi, pnum); + if (err < 0) + return err; + else if (err) { + /* + * FIXME: this is actually duty of the I/O unit to initialize + * this, but MTD does not provide enough information. + */ + si->bad_peb_count += 1; + return 0; + } + + err = ubi_io_read_ec_hdr(ubi, pnum, ech, 0); + if (err < 0) + return err; + else if (err == UBI_IO_BITFLIPS) + bitflips = 1; + else if (err == UBI_IO_PEB_EMPTY) + return ubi_scan_add_to_list(si, pnum, UBI_SCAN_UNKNOWN_EC, + &si->erase); + else if (err == UBI_IO_BAD_EC_HDR) { + /* + * We have to also look at the VID header, possibly it is not + * corrupted. Set %bitflips flag in order to make this PEB be + * moved and EC be re-created. + */ + ec_corr = 1; + ec = UBI_SCAN_UNKNOWN_EC; + bitflips = 1; + } + + si->is_empty = 0; + + if (!ec_corr) { + /* Make sure UBI version is OK */ + if (ech->version != UBI_VERSION) { + ubi_err("this UBI version is %d, image version is %d", + UBI_VERSION, (int)ech->version); + return -EINVAL; + } + + ec = ubi64_to_cpu(ech->ec); + if (ec > UBI_MAX_ERASECOUNTER) { + /* + * Erase counter overflow. The EC headers have 64 bits + * reserved, but we anyway make use of only 31 bit + * values, as this seems to be enough for any existing + * flash. Upgrade UBI and use 64-bit erase counters + * internally. 
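+			 * Note, about 2^31 erase cycles is far beyond the
+			 * endurance of any existing NAND or NOR chip, so
+			 * reaching this limit almost certainly means the erase
+			 * counter in this header is garbage.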
+ */ + ubi_err("erase counter overflow, max is %d", + UBI_MAX_ERASECOUNTER); + ubi_dbg_dump_ec_hdr(ech); + return -EINVAL; + } + } + + /* OK, we've done with the EC header, let's look at the VID header */ + + err = ubi_io_read_vid_hdr(ubi, pnum, vidh, 0); + if (err < 0) + return err; + else if (err == UBI_IO_BITFLIPS) + bitflips = 1; + else if (err == UBI_IO_BAD_VID_HDR || + (err == UBI_IO_PEB_FREE && ec_corr)) { + /* VID header is corrupted */ + err = ubi_scan_add_to_list(si, pnum, ec, &si->corr); + if (err) + return err; + goto adjust_mean_ec; + } else if (err == UBI_IO_PEB_FREE) { + /* No VID header - the physical eraseblock is free */ + err = ubi_scan_add_to_list(si, pnum, ec, &si->free); + if (err) + return err; + goto adjust_mean_ec; + } + + vol_id = ubi32_to_cpu(vidh->vol_id); + if (vol_id > UBI_MAX_VOLUMES && vol_id != UBI_LAYOUT_VOL_ID) { + int lnum = ubi32_to_cpu(vidh->lnum); + + /* Unsupported internal volume */ + switch (vidh->compat) { + case UBI_COMPAT_DELETE: + ubi_msg("\"delete\" compatible internal volume %d:%d" + " found, remove it", vol_id, lnum); + err = ubi_scan_add_to_list(si, pnum, ec, &si->corr); + if (err) + return err; + break; + + case UBI_COMPAT_RO: + ubi_msg("read-only compatible internal volume %d:%d" + " found, switch to read-only mode", + vol_id, lnum); + ubi->ro_mode = 1; + break; + + case UBI_COMPAT_PRESERVE: + ubi_msg("\"preserve\" compatible internal volume %d:%d" + " found", vol_id, lnum); + err = ubi_scan_add_to_list(si, pnum, ec, &si->alien); + if (err) + return err; + si->alien_peb_count += 1; + return 0; + + case UBI_COMPAT_REJECT: + ubi_err("incompatible internal volume %d:%d found", + vol_id, lnum); + return -EINVAL; + } + } + + /* Both UBI headers seem to be fine */ + err = ubi_scan_add_used(ubi, si, pnum, ec, vidh, bitflips); + if (err) + return err; + +adjust_mean_ec: + if (!ec_corr) { + if (si->ec_sum + ec < ec) { + commit_to_mean_value(si); + si->ec_sum = 0; + si->ec_count = 0; + } else { + si->ec_sum += ec; + si->ec_count += 1; + } + + if (ec > si->max_ec) + si->max_ec = ec; + if (ec < si->min_ec) + si->min_ec = ec; + } + + return 0; +} + +/** + * ubi_scan - scan an MTD device. + * @ubi: UBI device description object + * + * This function does full scanning of an MTD device and returns complete + * information about it. In case of failure, an error code is returned. + */ +struct ubi_scan_info *ubi_scan(struct ubi_device *ubi) +{ + int err, pnum; + struct rb_node *rb1, *rb2; + struct ubi_scan_volume *sv; + struct ubi_scan_leb *seb; + struct ubi_scan_info *si; + + si = kzalloc(sizeof(struct ubi_scan_info), GFP_KERNEL); + if (!si) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&si->corr); + INIT_LIST_HEAD(&si->free); + INIT_LIST_HEAD(&si->erase); + INIT_LIST_HEAD(&si->alien); + si->volumes = RB_ROOT; + si->is_empty = 1; + + err = -ENOMEM; + ech = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); + if (!ech) + goto out_si; + + vidh = ubi_zalloc_vid_hdr(ubi); + if (!vidh) + goto out_ech; + + for (pnum = 0; pnum < ubi->peb_count; pnum++) { + cond_resched(); + + dbg_msg("process PEB %d", pnum); + err = process_eb(ubi, si, pnum); + if (err < 0) + goto out_vidh; + } + + dbg_msg("scanning is finished"); + + /* Finish mean erase counter calculations */ + if (si->ec_count) + commit_to_mean_value(si); + + if (si->is_empty) + ubi_msg("empty MTD device detected"); + + /* + * In case of unknown erase counter we use the mean erase counter + * value. 
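+	 * These are the eraseblocks whose erase counter is unknown because the
+	 * EC header was missing or corrupted, so the mean erase counter of all
+	 * good eraseblocks is the best available estimate and keeps the
+	 * wear-leveling statistics sensible.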
+ */ + ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) { + ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) + if (seb->ec == UBI_SCAN_UNKNOWN_EC) + seb->ec = si->mean_ec; + } + + list_for_each_entry(seb, &si->free, u.list) { + if (seb->ec == UBI_SCAN_UNKNOWN_EC) + seb->ec = si->mean_ec; + } + + list_for_each_entry(seb, &si->corr, u.list) + if (seb->ec == UBI_SCAN_UNKNOWN_EC) + seb->ec = si->mean_ec; + + list_for_each_entry(seb, &si->erase, u.list) + if (seb->ec == UBI_SCAN_UNKNOWN_EC) + seb->ec = si->mean_ec; + + err = paranoid_check_si(ubi, si); + if (err) { + if (err > 0) + err = -EINVAL; + goto out_vidh; + } + + ubi_free_vid_hdr(ubi, vidh); + kfree(ech); + + return si; + +out_vidh: + ubi_free_vid_hdr(ubi, vidh); +out_ech: + kfree(ech); +out_si: + ubi_scan_destroy_si(si); + return ERR_PTR(err); +} + +/** + * destroy_sv - free the scanning volume information + * @sv: scanning volume information + * + * This function destroys the volume RB-tree (@sv->root) and the scanning + * volume information. + */ +static void destroy_sv(struct ubi_scan_volume *sv) +{ + struct ubi_scan_leb *seb; + struct rb_node *this = sv->root.rb_node; + + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + seb = rb_entry(this, struct ubi_scan_leb, u.rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &seb->u.rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + + kfree(seb); + } + } + kfree(sv); +} + +/** + * ubi_scan_destroy_si - destroy scanning information. + * @si: scanning information + */ +void ubi_scan_destroy_si(struct ubi_scan_info *si) +{ + struct ubi_scan_leb *seb, *seb_tmp; + struct ubi_scan_volume *sv; + struct rb_node *rb; + + list_for_each_entry_safe(seb, seb_tmp, &si->alien, u.list) { + list_del(&seb->u.list); + kfree(seb); + } + list_for_each_entry_safe(seb, seb_tmp, &si->erase, u.list) { + list_del(&seb->u.list); + kfree(seb); + } + list_for_each_entry_safe(seb, seb_tmp, &si->corr, u.list) { + list_del(&seb->u.list); + kfree(seb); + } + list_for_each_entry_safe(seb, seb_tmp, &si->free, u.list) { + list_del(&seb->u.list); + kfree(seb); + } + + /* Destroy the volume RB-tree */ + rb = si->volumes.rb_node; + while (rb) { + if (rb->rb_left) + rb = rb->rb_left; + else if (rb->rb_right) + rb = rb->rb_right; + else { + sv = rb_entry(rb, struct ubi_scan_volume, rb); + + rb = rb_parent(rb); + if (rb) { + if (rb->rb_left == &sv->rb) + rb->rb_left = NULL; + else + rb->rb_right = NULL; + } + + destroy_sv(sv); + } + } + + kfree(si); +} + +#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID + +/** + * paranoid_check_si - check if the scanning information is correct and + * consistent. + * @ubi: UBI device description object + * @si: scanning information + * + * This function returns zero if the scanning information is all right, %1 if + * not and a negative error code if an error occurred. + */ +static int paranoid_check_si(const struct ubi_device *ubi, + struct ubi_scan_info *si) +{ + int pnum, err, vols_found = 0; + struct rb_node *rb1, *rb2; + struct ubi_scan_volume *sv; + struct ubi_scan_leb *seb, *last_seb; + uint8_t *buf; + + /* + * At first, check that scanning information is ok. 
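+	 * I.e., verify that the in-memory objects are self-consistent: value
+	 * ranges are sane, per-volume LEB counts match the trees, and so on.
+	 * The second pass below re-reads the VID headers from flash and checks
+	 * them against this information.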
+ */ + ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) { + int leb_count = 0; + + cond_resched(); + + vols_found += 1; + + if (si->is_empty) { + ubi_err("bad is_empty flag"); + goto bad_sv; + } + + if (sv->vol_id < 0 || sv->highest_lnum < 0 || + sv->leb_count < 0 || sv->vol_type < 0 || sv->used_ebs < 0 || + sv->data_pad < 0 || sv->last_data_size < 0) { + ubi_err("negative values"); + goto bad_sv; + } + + if (sv->vol_id >= UBI_MAX_VOLUMES && + sv->vol_id < UBI_INTERNAL_VOL_START) { + ubi_err("bad vol_id"); + goto bad_sv; + } + + if (sv->vol_id > si->highest_vol_id) { + ubi_err("highest_vol_id is %d, but vol_id %d is there", + si->highest_vol_id, sv->vol_id); + goto out; + } + + if (sv->vol_type != UBI_DYNAMIC_VOLUME && + sv->vol_type != UBI_STATIC_VOLUME) { + ubi_err("bad vol_type"); + goto bad_sv; + } + + if (sv->data_pad > ubi->leb_size / 2) { + ubi_err("bad data_pad"); + goto bad_sv; + } + + last_seb = NULL; + ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) { + cond_resched(); + + last_seb = seb; + leb_count += 1; + + if (seb->pnum < 0 || seb->ec < 0) { + ubi_err("negative values"); + goto bad_seb; + } + + if (seb->ec < si->min_ec) { + ubi_err("bad si->min_ec (%d), %d found", + si->min_ec, seb->ec); + goto bad_seb; + } + + if (seb->ec > si->max_ec) { + ubi_err("bad si->max_ec (%d), %d found", + si->max_ec, seb->ec); + goto bad_seb; + } + + if (seb->pnum >= ubi->peb_count) { + ubi_err("too high PEB number %d, total PEBs %d", + seb->pnum, ubi->peb_count); + goto bad_seb; + } + + if (sv->vol_type == UBI_STATIC_VOLUME) { + if (seb->lnum >= sv->used_ebs) { + ubi_err("bad lnum or used_ebs"); + goto bad_seb; + } + } else { + if (sv->used_ebs != 0) { + ubi_err("non-zero used_ebs"); + goto bad_seb; + } + } + + if (seb->lnum > sv->highest_lnum) { + ubi_err("incorrect highest_lnum or lnum"); + goto bad_seb; + } + } + + if (sv->leb_count != leb_count) { + ubi_err("bad leb_count, %d objects in the tree", + leb_count); + goto bad_sv; + } + + if (!last_seb) + continue; + + seb = last_seb; + + if (seb->lnum != sv->highest_lnum) { + ubi_err("bad highest_lnum"); + goto bad_seb; + } + } + + if (vols_found != si->vols_found) { + ubi_err("bad si->vols_found %d, should be %d", + si->vols_found, vols_found); + goto out; + } + + /* Check that scanning information is correct */ + ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) { + last_seb = NULL; + ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) { + int vol_type; + + cond_resched(); + + last_seb = seb; + + err = ubi_io_read_vid_hdr(ubi, seb->pnum, vidh, 1); + if (err && err != UBI_IO_BITFLIPS) { + ubi_err("VID header is not OK (%d)", err); + if (err > 0) + err = -EIO; + return err; + } + + vol_type = vidh->vol_type == UBI_VID_DYNAMIC ? 
+ UBI_DYNAMIC_VOLUME : UBI_STATIC_VOLUME; + if (sv->vol_type != vol_type) { + ubi_err("bad vol_type"); + goto bad_vid_hdr; + } + + if (seb->sqnum != ubi64_to_cpu(vidh->sqnum)) { + ubi_err("bad sqnum %llu", seb->sqnum); + goto bad_vid_hdr; + } + + if (sv->vol_id != ubi32_to_cpu(vidh->vol_id)) { + ubi_err("bad vol_id %d", sv->vol_id); + goto bad_vid_hdr; + } + + if (sv->compat != vidh->compat) { + ubi_err("bad compat %d", vidh->compat); + goto bad_vid_hdr; + } + + if (seb->lnum != ubi32_to_cpu(vidh->lnum)) { + ubi_err("bad lnum %d", seb->lnum); + goto bad_vid_hdr; + } + + if (sv->used_ebs != ubi32_to_cpu(vidh->used_ebs)) { + ubi_err("bad used_ebs %d", sv->used_ebs); + goto bad_vid_hdr; + } + + if (sv->data_pad != ubi32_to_cpu(vidh->data_pad)) { + ubi_err("bad data_pad %d", sv->data_pad); + goto bad_vid_hdr; + } + + if (seb->leb_ver != ubi32_to_cpu(vidh->leb_ver)) { + ubi_err("bad leb_ver %u", seb->leb_ver); + goto bad_vid_hdr; + } + } + + if (!last_seb) + continue; + + if (sv->highest_lnum != ubi32_to_cpu(vidh->lnum)) { + ubi_err("bad highest_lnum %d", sv->highest_lnum); + goto bad_vid_hdr; + } + + if (sv->last_data_size != ubi32_to_cpu(vidh->data_size)) { + ubi_err("bad last_data_size %d", sv->last_data_size); + goto bad_vid_hdr; + } + } + + /* + * Make sure that all the physical eraseblocks are in one of the lists + * or trees. + */ + buf = kmalloc(ubi->peb_count, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + memset(buf, 1, ubi->peb_count); + for (pnum = 0; pnum < ubi->peb_count; pnum++) { + err = ubi_io_is_bad(ubi, pnum); + if (err < 0) + return err; + else if (err) + buf[pnum] = 0; + } + + ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) + ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) + buf[seb->pnum] = 0; + + list_for_each_entry(seb, &si->free, u.list) + buf[seb->pnum] = 0; + + list_for_each_entry(seb, &si->corr, u.list) + buf[seb->pnum] = 0; + + list_for_each_entry(seb, &si->erase, u.list) + buf[seb->pnum] = 0; + + list_for_each_entry(seb, &si->alien, u.list) + buf[seb->pnum] = 0; + + err = 0; + for (pnum = 0; pnum < ubi->peb_count; pnum++) + if (buf[pnum]) { + ubi_err("PEB %d is not referred", pnum); + err = 1; + } + + kfree(buf); + if (err) + goto out; + return 0; + +bad_seb: + ubi_err("bad scanning information about LEB %d", seb->lnum); + ubi_dbg_dump_seb(seb, 0); + ubi_dbg_dump_sv(sv); + goto out; + +bad_sv: + ubi_err("bad scanning information about volume %d", sv->vol_id); + ubi_dbg_dump_sv(sv); + goto out; + +bad_vid_hdr: + ubi_err("bad scanning information about volume %d", sv->vol_id); + ubi_dbg_dump_sv(sv); + ubi_dbg_dump_vid_hdr(vidh); + +out: + ubi_dbg_dump_stack(); + return 1; +} + +#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */ diff --git a/drivers/mtd/ubi/scan.h b/drivers/mtd/ubi/scan.h new file mode 100644 index 00000000000..3949f6192c7 --- /dev/null +++ b/drivers/mtd/ubi/scan.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +#ifndef __UBI_SCAN_H__ +#define __UBI_SCAN_H__ + +/* The erase counter value for this physical eraseblock is unknown */ +#define UBI_SCAN_UNKNOWN_EC (-1) + +/** + * struct ubi_scan_leb - scanning information about a physical eraseblock. + * @ec: erase counter (%UBI_SCAN_UNKNOWN_EC if it is unknown) + * @pnum: physical eraseblock number + * @lnum: logical eraseblock number + * @scrub: if this physical eraseblock needs scrubbing + * @sqnum: sequence number + * @u: unions RB-tree or @list links + * @u.rb: link in the per-volume RB-tree of &struct ubi_scan_leb objects + * @u.list: link in one of the eraseblock lists + * @leb_ver: logical eraseblock version (obsolete) + * + * One object of this type is allocated for each physical eraseblock during + * scanning. + */ +struct ubi_scan_leb { + int ec; + int pnum; + int lnum; + int scrub; + unsigned long long sqnum; + union { + struct rb_node rb; + struct list_head list; + } u; + uint32_t leb_ver; +}; + +/** + * struct ubi_scan_volume - scanning information about a volume. + * @vol_id: volume ID + * @highest_lnum: highest logical eraseblock number in this volume + * @leb_count: number of logical eraseblocks in this volume + * @vol_type: volume type + * @used_ebs: number of used logical eraseblocks in this volume (only for + * static volumes) + * @last_data_size: amount of data in the last logical eraseblock of this + * volume (always equivalent to the usable logical eraseblock size in case of + * dynamic volumes) + * @data_pad: how many bytes at the end of logical eraseblocks of this volume + * are not used (due to volume alignment) + * @compat: compatibility flags of this volume + * @rb: link in the volume RB-tree + * @root: root of the RB-tree containing all the eraseblock belonging to this + * volume (&struct ubi_scan_leb objects) + * + * One object of this type is allocated for each volume during scanning. + */ +struct ubi_scan_volume { + int vol_id; + int highest_lnum; + int leb_count; + int vol_type; + int used_ebs; + int last_data_size; + int data_pad; + int compat; + struct rb_node rb; + struct rb_root root; +}; + +/** + * struct ubi_scan_info - UBI scanning information. + * @volumes: root of the volume RB-tree + * @corr: list of corrupted physical eraseblocks + * @free: list of free physical eraseblocks + * @erase: list of physical eraseblocks which have to be erased + * @alien: list of physical eraseblocks which should not be used by UBI (e.g., + * @bad_peb_count: count of bad physical eraseblocks + * those belonging to "preserve"-compatible internal volumes) + * @vols_found: number of volumes found during scanning + * @highest_vol_id: highest volume ID + * @alien_peb_count: count of physical eraseblocks in the @alien list + * @is_empty: flag indicating whether the MTD device is empty or not + * @min_ec: lowest erase counter value + * @max_ec: highest erase counter value + * @max_sqnum: highest sequence number value + * @mean_ec: mean erase counter value + * @ec_sum: a temporary variable used when calculating @mean_ec + * @ec_count: a temporary variable used when calculating @mean_ec + * + * This data structure contains the result of scanning and may be used by other + * UBI units to build final UBI data structures, further error-recovery and so + * on. 
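+ *
+ * Note, physical eraseblocks on both the @erase and @corr lists end up being
+ * erased and re-used; the @corr list merely records that the eraseblock got
+ * there because its headers or data were found to be corrupted.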
+ */ +struct ubi_scan_info { + struct rb_root volumes; + struct list_head corr; + struct list_head free; + struct list_head erase; + struct list_head alien; + int bad_peb_count; + int vols_found; + int highest_vol_id; + int alien_peb_count; + int is_empty; + int min_ec; + int max_ec; + unsigned long long max_sqnum; + int mean_ec; + int ec_sum; + int ec_count; +}; + +struct ubi_device; +struct ubi_vid_hdr; + +/* + * ubi_scan_move_to_list - move a physical eraseblock from the volume tree to a + * list. + * + * @sv: volume scanning information + * @seb: scanning eraseblock infprmation + * @list: the list to move to + */ +static inline void ubi_scan_move_to_list(struct ubi_scan_volume *sv, + struct ubi_scan_leb *seb, + struct list_head *list) +{ + rb_erase(&seb->u.rb, &sv->root); + list_add_tail(&seb->u.list, list); +} + +int ubi_scan_add_to_list(struct ubi_scan_info *si, int pnum, int ec, + struct list_head *list); +int ubi_scan_add_used(const struct ubi_device *ubi, struct ubi_scan_info *si, + int pnum, int ec, const struct ubi_vid_hdr *vid_hdr, + int bitflips); +struct ubi_scan_volume *ubi_scan_find_sv(const struct ubi_scan_info *si, + int vol_id); +struct ubi_scan_leb *ubi_scan_find_seb(const struct ubi_scan_volume *sv, + int lnum); +void ubi_scan_rm_volume(struct ubi_scan_info *si, struct ubi_scan_volume *sv); +struct ubi_scan_leb *ubi_scan_get_free_peb(const struct ubi_device *ubi, + struct ubi_scan_info *si); +int ubi_scan_erase_peb(const struct ubi_device *ubi, + const struct ubi_scan_info *si, int pnum, int ec); +struct ubi_scan_info *ubi_scan(struct ubi_device *ubi); +void ubi_scan_destroy_si(struct ubi_scan_info *si); + +#endif /* !__UBI_SCAN_H__ */ diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h new file mode 100644 index 00000000000..feb647f108f --- /dev/null +++ b/drivers/mtd/ubi/ubi.h @@ -0,0 +1,535 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * Copyright (c) Nokia Corporation, 2006, 2007 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +#ifndef __UBI_UBI_H__ +#define __UBI_UBI_H__ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/spinlock.h> +#include <linux/fs.h> +#include <linux/cdev.h> +#include <linux/device.h> +#include <linux/string.h> +#include <linux/mtd/mtd.h> + +#include <mtd/ubi-header.h> +#include <linux/mtd/ubi.h> + +#include "scan.h" +#include "debug.h" + +/* Maximum number of supported UBI devices */ +#define UBI_MAX_DEVICES 32 + +/* UBI name used for character devices, sysfs, etc */ +#define UBI_NAME_STR "ubi" + +/* Normal UBI messages */ +#define ubi_msg(fmt, ...) 
printk(KERN_NOTICE "UBI: " fmt "\n", ##__VA_ARGS__) +/* UBI warning messages */ +#define ubi_warn(fmt, ...) printk(KERN_WARNING "UBI warning: %s: " fmt "\n", \ + __FUNCTION__, ##__VA_ARGS__) +/* UBI error messages */ +#define ubi_err(fmt, ...) printk(KERN_ERR "UBI error: %s: " fmt "\n", \ + __FUNCTION__, ##__VA_ARGS__) + +/* Lowest number PEBs reserved for bad PEB handling */ +#define MIN_RESEVED_PEBS 2 + +/* Background thread name pattern */ +#define UBI_BGT_NAME_PATTERN "ubi_bgt%dd" + +/* This marker in the EBA table means that the LEB is um-mapped */ +#define UBI_LEB_UNMAPPED -1 + +/* + * In case of errors, UBI tries to repeat the operation several times before + * returning error. The below constant defines how many times UBI re-tries. + */ +#define UBI_IO_RETRIES 3 + +/* + * Error codes returned by the I/O unit. + * + * UBI_IO_PEB_EMPTY: the physical eraseblock is empty, i.e. it contains only + * 0xFF bytes + * UBI_IO_PEB_FREE: the physical eraseblock is free, i.e. it contains only a + * valid erase counter header, and the rest are %0xFF bytes + * UBI_IO_BAD_EC_HDR: the erase counter header is corrupted (bad magic or CRC) + * UBI_IO_BAD_VID_HDR: the volume identifier header is corrupted (bad magic or + * CRC) + * UBI_IO_BITFLIPS: bit-flips were detected and corrected + */ +enum { + UBI_IO_PEB_EMPTY = 1, + UBI_IO_PEB_FREE, + UBI_IO_BAD_EC_HDR, + UBI_IO_BAD_VID_HDR, + UBI_IO_BITFLIPS +}; + +extern int ubi_devices_cnt; +extern struct ubi_device *ubi_devices[]; + +struct ubi_volume_desc; + +/** + * struct ubi_volume - UBI volume description data structure. + * @dev: device object to make use of the the Linux device model + * @cdev: character device object to create character device + * @ubi: reference to the UBI device description object + * @vol_id: volume ID + * @readers: number of users holding this volume in read-only mode + * @writers: number of users holding this volume in read-write mode + * @exclusive: whether somebody holds this volume in exclusive mode + * @removed: if the volume was removed + * @checked: if this static volume was checked + * + * @reserved_pebs: how many physical eraseblocks are reserved for this volume + * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) + * @usable_leb_size: logical eraseblock size without padding + * @used_ebs: how many logical eraseblocks in this volume contain data + * @last_eb_bytes: how many bytes are stored in the last logical eraseblock + * @used_bytes: how many bytes of data this volume contains + * @upd_marker: non-zero if the update marker is set for this volume + * @corrupted: non-zero if the volume is corrupted (static volumes only) + * @alignment: volume alignment + * @data_pad: how many bytes are not used at the end of physical eraseblocks to + * satisfy the requested alignment + * @name_len: volume name length + * @name: volume name + * + * @updating: whether the volume is being updated + * @upd_ebs: how many eraseblocks are expected to be updated + * @upd_bytes: how many bytes are expected to be received + * @upd_received: how many update bytes were already received + * @upd_buf: update buffer which is used to collect update data + * + * @eba_tbl: EBA table of this volume (LEB->PEB mapping) + * + * @gluebi_desc: gluebi UBI volume descriptor + * @gluebi_refcount: reference count of the gluebi MTD device + * @gluebi_mtd: MTD device description object of the gluebi MTD device + * + * The @corrupted field indicates that the volume's contents is corrupted. 
+ * Since UBI protects only static volumes, this field is not relevant to + * dynamic volumes - it is user's responsibility to assure their data + * integrity. + * + * The @upd_marker flag indicates that this volume is either being updated at + * the moment or is damaged because of an unclean reboot. + */ +struct ubi_volume { + struct device dev; + struct cdev cdev; + struct ubi_device *ubi; + int vol_id; + int readers; + int writers; + int exclusive; + int removed; + int checked; + + int reserved_pebs; + int vol_type; + int usable_leb_size; + int used_ebs; + int last_eb_bytes; + long long used_bytes; + int upd_marker; + int corrupted; + int alignment; + int data_pad; + int name_len; + char name[UBI_VOL_NAME_MAX+1]; + + int updating; + int upd_ebs; + long long upd_bytes; + long long upd_received; + void *upd_buf; + + int *eba_tbl; + +#ifdef CONFIG_MTD_UBI_GLUEBI + /* Gluebi-related stuff may be compiled out */ + struct ubi_volume_desc *gluebi_desc; + int gluebi_refcount; + struct mtd_info gluebi_mtd; +#endif +}; + +/** + * struct ubi_volume_desc - descriptor of the UBI volume returned when it is + * opened. + * @vol: reference to the corresponding volume description object + * @mode: open mode (%UBI_READONLY, %UBI_READWRITE, or %UBI_EXCLUSIVE) + */ +struct ubi_volume_desc { + struct ubi_volume *vol; + int mode; +}; + +struct ubi_wl_entry; + +/** + * struct ubi_device - UBI device description structure + * @dev: class device object to use the the Linux device model + * @cdev: character device object to create character device + * @ubi_num: UBI device number + * @ubi_name: UBI device name + * @major: character device major number + * @vol_count: number of volumes in this UBI device + * @volumes: volumes of this UBI device + * @volumes_lock: protects @volumes, @rsvd_pebs, @avail_pebs, beb_rsvd_pebs, + * @beb_rsvd_level, @bad_peb_count, @good_peb_count, @vol_count, @vol->readers, + * @vol->writers, @vol->exclusive, @vol->removed, @vol->mapping and + * @vol->eba_tbl. 
+ * + * @rsvd_pebs: count of reserved physical eraseblocks + * @avail_pebs: count of available physical eraseblocks + * @beb_rsvd_pebs: how many physical eraseblocks are reserved for bad PEB + * handling + * @beb_rsvd_level: normal level of PEBs reserved for bad PEB handling + * + * @vtbl_slots: how many slots are available in the volume table + * @vtbl_size: size of the volume table in bytes + * @vtbl: in-RAM volume table copy + * + * @max_ec: current highest erase counter value + * @mean_ec: current mean erase counter value + * + * global_sqnum: global sequence number + * @ltree_lock: protects the lock tree and @global_sqnum + * @ltree: the lock tree + * @vtbl_mutex: protects on-flash volume table + * + * @used: RB-tree of used physical eraseblocks + * @free: RB-tree of free physical eraseblocks + * @scrub: RB-tree of physical eraseblocks which need scrubbing + * @prot: protection trees + * @prot.pnum: protection tree indexed by physical eraseblock numbers + * @prot.aec: protection tree indexed by absolute erase counter value + * @wl_lock: protects the @used, @free, @prot, @lookuptbl, @abs_ec, @move_from, + * @move_to, @move_to_put @erase_pending, @wl_scheduled, and @works + * fields + * @wl_scheduled: non-zero if the wear-leveling was scheduled + * @lookuptbl: a table to quickly find a &struct ubi_wl_entry object for any + * physical eraseblock + * @abs_ec: absolute erase counter + * @move_from: physical eraseblock from where the data is being moved + * @move_to: physical eraseblock where the data is being moved to + * @move_from_put: if the "from" PEB was put + * @move_to_put: if the "to" PEB was put + * @works: list of pending works + * @works_count: count of pending works + * @bgt_thread: background thread description object + * @thread_enabled: if the background thread is enabled + * @bgt_name: background thread name + * + * @flash_size: underlying MTD device size (in bytes) + * @peb_count: count of physical eraseblocks on the MTD device + * @peb_size: physical eraseblock size + * @bad_peb_count: count of bad physical eraseblocks + * @good_peb_count: count of good physical eraseblocks + * @min_io_size: minimal input/output unit size of the underlying MTD device + * @hdrs_min_io_size: minimal I/O unit size used for VID and EC headers + * @ro_mode: if the UBI device is in read-only mode + * @leb_size: logical eraseblock size + * @leb_start: starting offset of logical eraseblocks within physical + * eraseblocks + * @ec_hdr_alsize: size of the EC header aligned to @hdrs_min_io_size + * @vid_hdr_alsize: size of the VID header aligned to @hdrs_min_io_size + * @vid_hdr_offset: starting offset of the volume identifier header (might be + * unaligned) + * @vid_hdr_aloffset: starting offset of the VID header aligned to + * @hdrs_min_io_size + * @vid_hdr_shift: contains @vid_hdr_offset - @vid_hdr_aloffset + * @bad_allowed: whether the MTD device admits of bad physical eraseblocks or + * not + * @mtd: MTD device descriptor + */ +struct ubi_device { + struct cdev cdev; + struct device dev; + int ubi_num; + char ubi_name[sizeof(UBI_NAME_STR)+5]; + int major; + int vol_count; + struct ubi_volume *volumes[UBI_MAX_VOLUMES+UBI_INT_VOL_COUNT]; + spinlock_t volumes_lock; + + int rsvd_pebs; + int avail_pebs; + int beb_rsvd_pebs; + int beb_rsvd_level; + + int vtbl_slots; + int vtbl_size; + struct ubi_vtbl_record *vtbl; + struct mutex vtbl_mutex; + + int max_ec; + int mean_ec; + + /* EBA unit's stuff */ + unsigned long long global_sqnum; + spinlock_t ltree_lock; + struct rb_root ltree; + + /* Wear-leveling 
unit's stuff */ + struct rb_root used; + struct rb_root free; + struct rb_root scrub; + struct { + struct rb_root pnum; + struct rb_root aec; + } prot; + spinlock_t wl_lock; + int wl_scheduled; + struct ubi_wl_entry **lookuptbl; + unsigned long long abs_ec; + struct ubi_wl_entry *move_from; + struct ubi_wl_entry *move_to; + int move_from_put; + int move_to_put; + struct list_head works; + int works_count; + struct task_struct *bgt_thread; + int thread_enabled; + char bgt_name[sizeof(UBI_BGT_NAME_PATTERN)+2]; + + /* I/O unit's stuff */ + long long flash_size; + int peb_count; + int peb_size; + int bad_peb_count; + int good_peb_count; + int min_io_size; + int hdrs_min_io_size; + int ro_mode; + int leb_size; + int leb_start; + int ec_hdr_alsize; + int vid_hdr_alsize; + int vid_hdr_offset; + int vid_hdr_aloffset; + int vid_hdr_shift; + int bad_allowed; + struct mtd_info *mtd; +}; + +extern struct file_operations ubi_cdev_operations; +extern struct file_operations ubi_vol_cdev_operations; +extern struct class *ubi_class; + +/* vtbl.c */ +int ubi_change_vtbl_record(struct ubi_device *ubi, int idx, + struct ubi_vtbl_record *vtbl_rec); +int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_scan_info *si); + +/* vmt.c */ +int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req); +int ubi_remove_volume(struct ubi_volume_desc *desc); +int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs); +int ubi_add_volume(struct ubi_device *ubi, int vol_id); +void ubi_free_volume(struct ubi_device *ubi, int vol_id); + +/* upd.c */ +int ubi_start_update(struct ubi_device *ubi, int vol_id, long long bytes); +int ubi_more_update_data(struct ubi_device *ubi, int vol_id, + const void __user *buf, int count); + +/* misc.c */ +int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf, int length); +int ubi_check_volume(struct ubi_device *ubi, int vol_id); +void ubi_calculate_reserved(struct ubi_device *ubi); + +/* gluebi.c */ +#ifdef CONFIG_MTD_UBI_GLUEBI +int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol); +int ubi_destroy_gluebi(struct ubi_volume *vol); +#else +#define ubi_create_gluebi(ubi, vol) 0 +#define ubi_destroy_gluebi(vol) 0 +#endif + +/* eba.c */ +int ubi_eba_unmap_leb(struct ubi_device *ubi, int vol_id, int lnum); +int ubi_eba_read_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf, + int offset, int len, int check); +int ubi_eba_write_leb(struct ubi_device *ubi, int vol_id, int lnum, + const void *buf, int offset, int len, int dtype); +int ubi_eba_write_leb_st(struct ubi_device *ubi, int vol_id, int lnum, + const void *buf, int len, int dtype, + int used_ebs); +int ubi_eba_atomic_leb_change(struct ubi_device *ubi, int vol_id, int lnum, + const void *buf, int len, int dtype); +int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to, + struct ubi_vid_hdr *vid_hdr); +int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si); +void ubi_eba_close(const struct ubi_device *ubi); + +/* wl.c */ +int ubi_wl_get_peb(struct ubi_device *ubi, int dtype); +int ubi_wl_put_peb(struct ubi_device *ubi, int pnum, int torture); +int ubi_wl_flush(struct ubi_device *ubi); +int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum); +int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si); +void ubi_wl_close(struct ubi_device *ubi); + +/* io.c */ +int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset, + int len); +int ubi_io_write(const struct ubi_device *ubi, const void *buf, int pnum, + int offset, 
int len); +int ubi_io_sync_erase(const struct ubi_device *ubi, int pnum, int torture); +int ubi_io_is_bad(const struct ubi_device *ubi, int pnum); +int ubi_io_mark_bad(const struct ubi_device *ubi, int pnum); +int ubi_io_read_ec_hdr(const struct ubi_device *ubi, int pnum, + struct ubi_ec_hdr *ec_hdr, int verbose); +int ubi_io_write_ec_hdr(const struct ubi_device *ubi, int pnum, + struct ubi_ec_hdr *ec_hdr); +int ubi_io_read_vid_hdr(const struct ubi_device *ubi, int pnum, + struct ubi_vid_hdr *vid_hdr, int verbose); +int ubi_io_write_vid_hdr(const struct ubi_device *ubi, int pnum, + struct ubi_vid_hdr *vid_hdr); + +/* + * ubi_rb_for_each_entry - walk an RB-tree. + * @rb: a pointer to type 'struct rb_node' to to use as a loop counter + * @pos: a pointer to RB-tree entry type to use as a loop counter + * @root: RB-tree's root + * @member: the name of the 'struct rb_node' within the RB-tree entry + */ +#define ubi_rb_for_each_entry(rb, pos, root, member) \ + for (rb = rb_first(root), \ + pos = (rb ? container_of(rb, typeof(*pos), member) : NULL); \ + rb; \ + rb = rb_next(rb), pos = container_of(rb, typeof(*pos), member)) + +/** + * ubi_zalloc_vid_hdr - allocate a volume identifier header object. + * @ubi: UBI device description object + * + * This function returns a pointer to the newly allocated and zero-filled + * volume identifier header object in case of success and %NULL in case of + * failure. + */ +static inline struct ubi_vid_hdr *ubi_zalloc_vid_hdr(const struct ubi_device *ubi) +{ + void *vid_hdr; + + vid_hdr = kzalloc(ubi->vid_hdr_alsize, GFP_KERNEL); + if (!vid_hdr) + return NULL; + + /* + * VID headers may be stored at un-aligned flash offsets, so we shift + * the pointer. + */ + return vid_hdr + ubi->vid_hdr_shift; +} + +/** + * ubi_free_vid_hdr - free a volume identifier header object. + * @ubi: UBI device description object + * @vid_hdr: the object to free + */ +static inline void ubi_free_vid_hdr(const struct ubi_device *ubi, + struct ubi_vid_hdr *vid_hdr) +{ + void *p = vid_hdr; + + if (!p) + return; + + kfree(p - ubi->vid_hdr_shift); +} + +/* + * This function is equivalent to 'ubi_io_read()', but @offset is relative to + * the beginning of the logical eraseblock, not to the beginning of the + * physical eraseblock. + */ +static inline int ubi_io_read_data(const struct ubi_device *ubi, void *buf, + int pnum, int offset, int len) +{ + ubi_assert(offset >= 0); + return ubi_io_read(ubi, buf, pnum, offset + ubi->leb_start, len); +} + +/* + * This function is equivalent to 'ubi_io_write()', but @offset is relative to + * the beginning of the logical eraseblock, not to the beginning of the + * physical eraseblock. + */ +static inline int ubi_io_write_data(const struct ubi_device *ubi, const void *buf, + int pnum, int offset, int len) +{ + ubi_assert(offset >= 0); + return ubi_io_write(ubi, buf, pnum, offset + ubi->leb_start, len); +} + +/** + * ubi_ro_mode - switch to read-only mode. + * @ubi: UBI device description object + */ +static inline void ubi_ro_mode(struct ubi_device *ubi) +{ + ubi->ro_mode = 1; + ubi_warn("switch to read-only mode"); +} + +/** + * vol_id2idx - get table index by volume ID. + * @ubi: UBI device description object + * @vol_id: volume ID + */ +static inline int vol_id2idx(const struct ubi_device *ubi, int vol_id) +{ + if (vol_id >= UBI_INTERNAL_VOL_START) + return vol_id - UBI_INTERNAL_VOL_START + ubi->vtbl_slots; + else + return vol_id; +} + +/** + * idx2vol_id - get volume ID by table index. 
+ * @ubi: UBI device description object + * @idx: table index + */ +static inline int idx2vol_id(const struct ubi_device *ubi, int idx) +{ + if (idx >= ubi->vtbl_slots) + return idx - ubi->vtbl_slots + UBI_INTERNAL_VOL_START; + else + return idx; +} + +#endif /* !__UBI_UBI_H__ */ diff --git a/drivers/mtd/ubi/upd.c b/drivers/mtd/ubi/upd.c new file mode 100644 index 00000000000..8925b977e3d --- /dev/null +++ b/drivers/mtd/ubi/upd.c @@ -0,0 +1,348 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * Copyright (c) Nokia Corporation, 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + * + * Jan 2007: Alexander Schmidt, hacked per-volume update. + */ + +/* + * This file contains implementation of the volume update functionality. + * + * The update operation is based on the per-volume update marker which is + * stored in the volume table. The update marker is set before the update + * starts, and removed after the update has been finished. So if the update was + * interrupted by an unclean re-boot or due to some other reasons, the update + * marker stays on the flash media and UBI finds it when it attaches the MTD + * device next time. If the update marker is set for a volume, the volume is + * treated as damaged and most I/O operations are prohibited. Only a new update + * operation is allowed. + * + * Note, in general it is possible to implement the update operation as a + * transaction with a roll-back capability. + */ + +#include <linux/err.h> +#include <asm/uaccess.h> +#include <asm/div64.h> +#include "ubi.h" + +/** + * set_update_marker - set update marker. + * @ubi: UBI device description object + * @vol_id: volume ID + * + * This function sets the update marker flag for volume @vol_id. Returns zero + * in case of success and a negative error code in case of failure. + */ +static int set_update_marker(struct ubi_device *ubi, int vol_id) +{ + int err; + struct ubi_vtbl_record vtbl_rec; + struct ubi_volume *vol = ubi->volumes[vol_id]; + + dbg_msg("set update marker for volume %d", vol_id); + + if (vol->upd_marker) { + ubi_assert(ubi->vtbl[vol_id].upd_marker); + dbg_msg("already set"); + return 0; + } + + memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record)); + vtbl_rec.upd_marker = 1; + + err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); + vol->upd_marker = 1; + return err; +} + +/** + * clear_update_marker - clear update marker. + * @ubi: UBI device description object + * @vol_id: volume ID + * @bytes: new data size in bytes + * + * This function clears the update marker for volume @vol_id, sets new volume + * data size and clears the "corrupted" flag (static volumes only). Returns + * zero in case of success and a negative error code in case of failure. 
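+ *
+ * For static volumes the new data size also determines the volume's used_ebs
+ * and last_eb_bytes fields. For example, with a 128 KiB usable LEB size, a
+ * 300 KiB update results in used_ebs = 3 and last_eb_bytes = 44 KiB.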
+ */ +static int clear_update_marker(struct ubi_device *ubi, int vol_id, long long bytes) +{ + int err; + uint64_t tmp; + struct ubi_vtbl_record vtbl_rec; + struct ubi_volume *vol = ubi->volumes[vol_id]; + + dbg_msg("clear update marker for volume %d", vol_id); + + memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record)); + ubi_assert(vol->upd_marker && vtbl_rec.upd_marker); + vtbl_rec.upd_marker = 0; + + if (vol->vol_type == UBI_STATIC_VOLUME) { + vol->corrupted = 0; + vol->used_bytes = tmp = bytes; + vol->last_eb_bytes = do_div(tmp, vol->usable_leb_size); + vol->used_ebs = tmp; + if (vol->last_eb_bytes) + vol->used_ebs += 1; + else + vol->last_eb_bytes = vol->usable_leb_size; + } + + err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); + vol->upd_marker = 0; + return err; +} + +/** + * ubi_start_update - start volume update. + * @ubi: UBI device description object + * @vol_id: volume ID + * @bytes: update bytes + * + * This function starts volume update operation. If @bytes is zero, the volume + * is just wiped out. Returns zero in case of success and a negative error code + * in case of failure. + */ +int ubi_start_update(struct ubi_device *ubi, int vol_id, long long bytes) +{ + int i, err; + uint64_t tmp; + struct ubi_volume *vol = ubi->volumes[vol_id]; + + dbg_msg("start update of volume %d, %llu bytes", vol_id, bytes); + vol->updating = 1; + + err = set_update_marker(ubi, vol_id); + if (err) + return err; + + /* Before updating - wipe out the volume */ + for (i = 0; i < vol->reserved_pebs; i++) { + err = ubi_eba_unmap_leb(ubi, vol_id, i); + if (err) + return err; + } + + if (bytes == 0) { + err = clear_update_marker(ubi, vol_id, 0); + if (err) + return err; + err = ubi_wl_flush(ubi); + if (!err) + vol->updating = 0; + } + + vol->upd_buf = kmalloc(ubi->leb_size, GFP_KERNEL); + if (!vol->upd_buf) + return -ENOMEM; + + tmp = bytes; + vol->upd_ebs = !!do_div(tmp, vol->usable_leb_size); + vol->upd_ebs += tmp; + vol->upd_bytes = bytes; + vol->upd_received = 0; + return 0; +} + +/** + * write_leb - write update data. + * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + * @buf: data to write + * @len: data size + * @used_ebs: how many logical eraseblocks will this volume contain (static + * volumes only) + * + * This function writes update data to corresponding logical eraseblock. In + * case of dynamic volume, this function checks if the data contains 0xFF bytes + * at the end. If yes, the 0xFF bytes are cut and not written. So if the whole + * buffer contains only 0xFF bytes, the LEB is left unmapped. + * + * The reason why we skip the trailing 0xFF bytes in case of dynamic volume is + * that we want to make sure that more data may be appended to the logical + * eraseblock in future. Indeed, writing 0xFF bytes may have side effects and + * this PEB won't be writable anymore. So if one writes the file-system image + * to the UBI volume where 0xFFs mean free space - UBI makes sure this free + * space is writable after the update. + * + * We do not do this for static volumes because they are read-only. But this + * also cannot be done because we have to store per-LEB CRC and the correct + * data length. + * + * This function returns zero in case of success and a negative error code in + * case of failure. 
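+ *
+ * For example, if an update chunk for a dynamic volume contains real data
+ * only in its first half and 0xFF bytes in the rest, only that first half
+ * (rounded up to the minimal flash I/O unit) is programmed and the tail of
+ * the logical eraseblock stays writable.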
+ */ +static int write_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf, + int len, int used_ebs) +{ + int err, l; + struct ubi_volume *vol = ubi->volumes[vol_id]; + + if (vol->vol_type == UBI_DYNAMIC_VOLUME) { + l = ALIGN(len, ubi->min_io_size); + memset(buf + len, 0xFF, l - len); + + l = ubi_calc_data_len(ubi, buf, l); + if (l == 0) { + dbg_msg("all %d bytes contain 0xFF - skip", len); + return 0; + } + if (len != l) + dbg_msg("skip last %d bytes (0xFF)", len - l); + + err = ubi_eba_write_leb(ubi, vol_id, lnum, buf, 0, l, + UBI_UNKNOWN); + } else { + /* + * When writing static volume, and this is the last logical + * eraseblock, the length (@len) does not have to be aligned to + * the minimal flash I/O unit. The 'ubi_eba_write_leb_st()' + * function accepts exact (unaligned) length and stores it in + * the VID header. And it takes care of proper alignment by + * padding the buffer. Here we just make sure the padding will + * contain zeros, not random trash. + */ + memset(buf + len, 0, vol->usable_leb_size - len); + err = ubi_eba_write_leb_st(ubi, vol_id, lnum, buf, len, + UBI_UNKNOWN, used_ebs); + } + + return err; +} + +/** + * ubi_more_update_data - write more update data. + * @vol: volume description object + * @buf: write data (user-space memory buffer) + * @count: how much bytes to write + * + * This function writes more data to the volume which is being updated. It may + * be called arbitrary number of times until all of the update data arrive. + * This function returns %0 in case of success, number of bytes written during + * the last call if the whole volume update was successfully finished, and a + * negative error code in case of failure. + */ +int ubi_more_update_data(struct ubi_device *ubi, int vol_id, + const void __user *buf, int count) +{ + uint64_t tmp; + struct ubi_volume *vol = ubi->volumes[vol_id]; + int lnum, offs, err = 0, len, to_write = count; + + dbg_msg("write %d of %lld bytes, %lld already passed", + count, vol->upd_bytes, vol->upd_received); + + if (ubi->ro_mode) + return -EROFS; + + tmp = vol->upd_received; + offs = do_div(tmp, vol->usable_leb_size); + lnum = tmp; + + if (vol->upd_received + count > vol->upd_bytes) + to_write = count = vol->upd_bytes - vol->upd_received; + + /* + * When updating volumes, we accumulate whole logical eraseblock of + * data and write it at once. + */ + if (offs != 0) { + /* + * This is a write to the middle of the logical eraseblock. We + * copy the data to our update buffer and wait for more data or + * flush it if the whole eraseblock is written or the update + * is finished. + */ + + len = vol->usable_leb_size - offs; + if (len > count) + len = count; + + err = copy_from_user(vol->upd_buf + offs, buf, len); + if (err) + return -EFAULT; + + if (offs + len == vol->usable_leb_size || + vol->upd_received + len == vol->upd_bytes) { + int flush_len = offs + len; + + /* + * OK, we gathered either the whole eraseblock or this + * is the last chunk, it's time to flush the buffer. + */ + ubi_assert(flush_len <= vol->usable_leb_size); + err = write_leb(ubi, vol_id, lnum, vol->upd_buf, + flush_len, vol->upd_ebs); + if (err) + return err; + } + + vol->upd_received += len; + count -= len; + buf += len; + lnum += 1; + } + + /* + * If we've got more to write, let's continue. At this point we know we + * are starting from the beginning of an eraseblock. 
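+	 * Each iteration of the loop below copies up to one full LEB worth of
+	 * data from the user buffer into 'vol->upd_buf' and flushes it with
+	 * 'write_leb()'; a partial chunk is flushed only if it is the last
+	 * piece of the update data.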
+ */ + while (count) { + if (count > vol->usable_leb_size) + len = vol->usable_leb_size; + else + len = count; + + err = copy_from_user(vol->upd_buf, buf, len); + if (err) + return -EFAULT; + + if (len == vol->usable_leb_size || + vol->upd_received + len == vol->upd_bytes) { + err = write_leb(ubi, vol_id, lnum, vol->upd_buf, len, + vol->upd_ebs); + if (err) + break; + } + + vol->upd_received += len; + count -= len; + lnum += 1; + buf += len; + } + + ubi_assert(vol->upd_received <= vol->upd_bytes); + if (vol->upd_received == vol->upd_bytes) { + /* The update is finished, clear the update marker */ + err = clear_update_marker(ubi, vol_id, vol->upd_bytes); + if (err) + return err; + err = ubi_wl_flush(ubi); + if (err == 0) { + err = to_write; + kfree(vol->upd_buf); + vol->updating = 0; + } + } + + return err; +} diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c new file mode 100644 index 00000000000..622d0d18952 --- /dev/null +++ b/drivers/mtd/ubi/vmt.c @@ -0,0 +1,809 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains implementation of volume creation, deletion, updating and + * resizing. + */ + +#include <linux/err.h> +#include <asm/div64.h> +#include "ubi.h" + +#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID +static void paranoid_check_volumes(struct ubi_device *ubi); +#else +#define paranoid_check_volumes(ubi) +#endif + +static ssize_t vol_attribute_show(struct device *dev, + struct device_attribute *attr, char *buf); + +/* Device attributes corresponding to files in '/<sysfs>/class/ubi/ubiX_Y' */ +static struct device_attribute vol_reserved_ebs = + __ATTR(reserved_ebs, S_IRUGO, vol_attribute_show, NULL); +static struct device_attribute vol_type = + __ATTR(type, S_IRUGO, vol_attribute_show, NULL); +static struct device_attribute vol_name = + __ATTR(name, S_IRUGO, vol_attribute_show, NULL); +static struct device_attribute vol_corrupted = + __ATTR(corrupted, S_IRUGO, vol_attribute_show, NULL); +static struct device_attribute vol_alignment = + __ATTR(alignment, S_IRUGO, vol_attribute_show, NULL); +static struct device_attribute vol_usable_eb_size = + __ATTR(usable_eb_size, S_IRUGO, vol_attribute_show, NULL); +static struct device_attribute vol_data_bytes = + __ATTR(data_bytes, S_IRUGO, vol_attribute_show, NULL); +static struct device_attribute vol_upd_marker = + __ATTR(upd_marker, S_IRUGO, vol_attribute_show, NULL); + +/* + * "Show" method for files in '/<sysfs>/class/ubi/ubiX_Y/'. + * + * Consider a situation: + * A. process 1 opens a sysfs file related to volume Y, say + * /<sysfs>/class/ubi/ubiX_Y/reserved_ebs; + * B. process 2 removes volume Y; + * C. 
process 1 starts reading the /<sysfs>/class/ubi/ubiX_Y/reserved_ebs file; + * + * What we want to do in a situation like that is to return error when the file + * is read. This is done by means of the 'removed' flag and the 'vol_lock' of + * the UBI volume description object. + */ +static ssize_t vol_attribute_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int ret; + struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev); + + spin_lock(&vol->ubi->volumes_lock); + if (vol->removed) { + spin_unlock(&vol->ubi->volumes_lock); + return -ENODEV; + } + if (attr == &vol_reserved_ebs) + ret = sprintf(buf, "%d\n", vol->reserved_pebs); + else if (attr == &vol_type) { + const char *tp; + tp = vol->vol_type == UBI_DYNAMIC_VOLUME ? "dynamic" : "static"; + ret = sprintf(buf, "%s\n", tp); + } else if (attr == &vol_name) + ret = sprintf(buf, "%s\n", vol->name); + else if (attr == &vol_corrupted) + ret = sprintf(buf, "%d\n", vol->corrupted); + else if (attr == &vol_alignment) + ret = sprintf(buf, "%d\n", vol->alignment); + else if (attr == &vol_usable_eb_size) { + ret = sprintf(buf, "%d\n", vol->usable_leb_size); + } else if (attr == &vol_data_bytes) + ret = sprintf(buf, "%lld\n", vol->used_bytes); + else if (attr == &vol_upd_marker) + ret = sprintf(buf, "%d\n", vol->upd_marker); + else + BUG(); + spin_unlock(&vol->ubi->volumes_lock); + return ret; +} + +/* Release method for volume devices */ +static void vol_release(struct device *dev) +{ + struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev); + ubi_assert(vol->removed); + kfree(vol); +} + +/** + * volume_sysfs_init - initialize sysfs for new volume. + * @ubi: UBI device description object + * @vol: volume description object + * + * This function returns zero in case of success and a negative error code in + * case of failure. + * + * Note, this function does not free allocated resources in case of failure - + * the caller does it. This is because this would cause release() here and the + * caller would oops. + */ +static int volume_sysfs_init(struct ubi_device *ubi, struct ubi_volume *vol) +{ + int err; + + err = device_create_file(&vol->dev, &vol_reserved_ebs); + if (err) + return err; + err = device_create_file(&vol->dev, &vol_type); + if (err) + return err; + err = device_create_file(&vol->dev, &vol_name); + if (err) + return err; + err = device_create_file(&vol->dev, &vol_corrupted); + if (err) + return err; + err = device_create_file(&vol->dev, &vol_alignment); + if (err) + return err; + err = device_create_file(&vol->dev, &vol_usable_eb_size); + if (err) + return err; + err = device_create_file(&vol->dev, &vol_data_bytes); + if (err) + return err; + err = device_create_file(&vol->dev, &vol_upd_marker); + if (err) + return err; + return 0; +} + +/** + * volume_sysfs_close - close sysfs for a volume. + * @vol: volume description object + */ +static void volume_sysfs_close(struct ubi_volume *vol) +{ + device_remove_file(&vol->dev, &vol_upd_marker); + device_remove_file(&vol->dev, &vol_data_bytes); + device_remove_file(&vol->dev, &vol_usable_eb_size); + device_remove_file(&vol->dev, &vol_alignment); + device_remove_file(&vol->dev, &vol_corrupted); + device_remove_file(&vol->dev, &vol_name); + device_remove_file(&vol->dev, &vol_type); + device_remove_file(&vol->dev, &vol_reserved_ebs); + device_unregister(&vol->dev); +} + +/** + * ubi_create_volume - create volume. + * @ubi: UBI device description object + * @req: volume creation request + * + * This function creates volume described by @req. 
If @req->vol_id is %UBI_VOL_NUM_AUTO, this function automatically assigns
+ * an ID to the new volume and saves it in @req->vol_id. Returns zero in case
+ * of success and a negative error code in case of failure.
+ */
+int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req)
+{
+ int i, err, vol_id = req->vol_id;
+ struct ubi_volume *vol;
+ struct ubi_vtbl_record vtbl_rec;
+ uint64_t bytes;
+
+ if (ubi->ro_mode)
+ return -EROFS;
+
+ vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL);
+ if (!vol)
+ return -ENOMEM;
+
+ spin_lock(&ubi->volumes_lock);
+
+ if (vol_id == UBI_VOL_NUM_AUTO) {
+ /* Find unused volume ID */
+ dbg_msg("search for vacant volume ID");
+ for (i = 0; i < ubi->vtbl_slots; i++)
+ if (!ubi->volumes[i]) {
+ vol_id = i;
+ break;
+ }
+
+ if (vol_id == UBI_VOL_NUM_AUTO) {
+ dbg_err("out of volume IDs");
+ err = -ENFILE;
+ goto out_unlock;
+ }
+ req->vol_id = vol_id;
+ }
+
+ dbg_msg("volume ID %d, %llu bytes, type %d, name %s",
+ vol_id, (unsigned long long)req->bytes,
+ (int)req->vol_type, req->name);
+
+ /* Ensure that this volume does not exist */
+ err = -EEXIST;
+ if (ubi->volumes[vol_id]) {
+ dbg_err("volume %d already exists", vol_id);
+ goto out_unlock;
+ }
+
+ /* Ensure that the name is unique */
+ for (i = 0; i < ubi->vtbl_slots; i++)
+ if (ubi->volumes[i] &&
+ ubi->volumes[i]->name_len == req->name_len &&
+ strcmp(ubi->volumes[i]->name, req->name) == 0) {
+ dbg_err("volume \"%s\" exists (ID %d)", req->name, i);
+ goto out_unlock;
+ }
+
+ /* Calculate how many eraseblocks are requested */
+ vol->usable_leb_size = ubi->leb_size - ubi->leb_size % req->alignment;
+ bytes = req->bytes;
+ if (do_div(bytes, vol->usable_leb_size))
+ vol->reserved_pebs = 1;
+ vol->reserved_pebs += bytes;
+
+ /* Reserve physical eraseblocks */
+ if (vol->reserved_pebs > ubi->avail_pebs) {
+ dbg_err("not enough PEBs, only %d available", ubi->avail_pebs);
+ err = -ENOSPC;
+ goto out_unlock;
+ }
+ ubi->avail_pebs -= vol->reserved_pebs;
+ ubi->rsvd_pebs += vol->reserved_pebs;
+
+ vol->vol_id = vol_id;
+ vol->alignment = req->alignment;
+ vol->data_pad = ubi->leb_size % vol->alignment;
+ vol->vol_type = req->vol_type;
+ vol->name_len = req->name_len;
+ memcpy(vol->name, req->name, vol->name_len + 1);
+ vol->exclusive = 1;
+ vol->ubi = ubi;
+ ubi->volumes[vol_id] = vol;
+ spin_unlock(&ubi->volumes_lock);
+
+ /*
+ * Finish all pending erases because there may be some LEBs belonging
+ * to the same volume ID.
+ */ + err = ubi_wl_flush(ubi); + if (err) + goto out_acc; + + vol->eba_tbl = kmalloc(vol->reserved_pebs * sizeof(int), GFP_KERNEL); + if (!vol->eba_tbl) { + err = -ENOMEM; + goto out_acc; + } + + for (i = 0; i < vol->reserved_pebs; i++) + vol->eba_tbl[i] = UBI_LEB_UNMAPPED; + + if (vol->vol_type == UBI_DYNAMIC_VOLUME) { + vol->used_ebs = vol->reserved_pebs; + vol->last_eb_bytes = vol->usable_leb_size; + vol->used_bytes = vol->used_ebs * vol->usable_leb_size; + } else { + bytes = vol->used_bytes; + vol->last_eb_bytes = do_div(bytes, vol->usable_leb_size); + vol->used_ebs = bytes; + if (vol->last_eb_bytes) + vol->used_ebs += 1; + else + vol->last_eb_bytes = vol->usable_leb_size; + } + + /* Register character device for the volume */ + cdev_init(&vol->cdev, &ubi_vol_cdev_operations); + vol->cdev.owner = THIS_MODULE; + err = cdev_add(&vol->cdev, MKDEV(ubi->major, vol_id + 1), 1); + if (err) { + ubi_err("cannot add character device for volume %d", vol_id); + goto out_mapping; + } + + err = ubi_create_gluebi(ubi, vol); + if (err) + goto out_cdev; + + vol->dev.release = vol_release; + vol->dev.parent = &ubi->dev; + vol->dev.devt = MKDEV(ubi->major, vol->vol_id + 1); + vol->dev.class = ubi_class; + sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id); + err = device_register(&vol->dev); + if (err) + goto out_gluebi; + + err = volume_sysfs_init(ubi, vol); + if (err) + goto out_sysfs; + + /* Fill volume table record */ + memset(&vtbl_rec, 0, sizeof(struct ubi_vtbl_record)); + vtbl_rec.reserved_pebs = cpu_to_ubi32(vol->reserved_pebs); + vtbl_rec.alignment = cpu_to_ubi32(vol->alignment); + vtbl_rec.data_pad = cpu_to_ubi32(vol->data_pad); + vtbl_rec.name_len = cpu_to_ubi16(vol->name_len); + if (vol->vol_type == UBI_DYNAMIC_VOLUME) + vtbl_rec.vol_type = UBI_VID_DYNAMIC; + else + vtbl_rec.vol_type = UBI_VID_STATIC; + memcpy(vtbl_rec.name, vol->name, vol->name_len + 1); + + err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); + if (err) + goto out_sysfs; + + spin_lock(&ubi->volumes_lock); + ubi->vol_count += 1; + vol->exclusive = 0; + spin_unlock(&ubi->volumes_lock); + + paranoid_check_volumes(ubi); + return 0; + +out_gluebi: + err = ubi_destroy_gluebi(vol); +out_cdev: + cdev_del(&vol->cdev); +out_mapping: + kfree(vol->eba_tbl); +out_acc: + spin_lock(&ubi->volumes_lock); + ubi->rsvd_pebs -= vol->reserved_pebs; + ubi->avail_pebs += vol->reserved_pebs; +out_unlock: + spin_unlock(&ubi->volumes_lock); + kfree(vol); + return err; + + /* + * We are registered, so @vol is destroyed in the release function and + * we have to de-initialize differently. + */ +out_sysfs: + err = ubi_destroy_gluebi(vol); + cdev_del(&vol->cdev); + kfree(vol->eba_tbl); + spin_lock(&ubi->volumes_lock); + ubi->rsvd_pebs -= vol->reserved_pebs; + ubi->avail_pebs += vol->reserved_pebs; + spin_unlock(&ubi->volumes_lock); + volume_sysfs_close(vol); + return err; +} + +/** + * ubi_remove_volume - remove volume. + * @desc: volume descriptor + * + * This function removes volume described by @desc. The volume has to be opened + * in "exclusive" mode. Returns zero in case of success and a negative error + * code in case of failure. 
+ */ +int ubi_remove_volume(struct ubi_volume_desc *desc) +{ + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + int i, err, vol_id = vol->vol_id, reserved_pebs = vol->reserved_pebs; + + dbg_msg("remove UBI volume %d", vol_id); + ubi_assert(desc->mode == UBI_EXCLUSIVE); + ubi_assert(vol == ubi->volumes[vol_id]); + + if (ubi->ro_mode) + return -EROFS; + + err = ubi_destroy_gluebi(vol); + if (err) + return err; + + err = ubi_change_vtbl_record(ubi, vol_id, NULL); + if (err) + return err; + + for (i = 0; i < vol->reserved_pebs; i++) { + err = ubi_eba_unmap_leb(ubi, vol_id, i); + if (err) + return err; + } + + spin_lock(&ubi->volumes_lock); + vol->removed = 1; + ubi->volumes[vol_id] = NULL; + spin_unlock(&ubi->volumes_lock); + + kfree(vol->eba_tbl); + vol->eba_tbl = NULL; + cdev_del(&vol->cdev); + volume_sysfs_close(vol); + kfree(desc); + + spin_lock(&ubi->volumes_lock); + ubi->rsvd_pebs -= reserved_pebs; + ubi->avail_pebs += reserved_pebs; + i = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs; + if (i > 0) { + i = ubi->avail_pebs >= i ? i : ubi->avail_pebs; + ubi->avail_pebs -= i; + ubi->rsvd_pebs += i; + ubi->beb_rsvd_pebs += i; + if (i > 0) + ubi_msg("reserve more %d PEBs", i); + } + ubi->vol_count -= 1; + spin_unlock(&ubi->volumes_lock); + + paranoid_check_volumes(ubi); + module_put(THIS_MODULE); + return 0; +} + +/** + * ubi_resize_volume - re-size volume. + * @desc: volume descriptor + * @reserved_pebs: new size in physical eraseblocks + * + * This function returns zero in case of success, and a negative error code in + * case of failure. + */ +int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) +{ + int i, err, pebs, *new_mapping; + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + struct ubi_vtbl_record vtbl_rec; + int vol_id = vol->vol_id; + + if (ubi->ro_mode) + return -EROFS; + + dbg_msg("re-size volume %d to from %d to %d PEBs", + vol_id, vol->reserved_pebs, reserved_pebs); + ubi_assert(desc->mode == UBI_EXCLUSIVE); + ubi_assert(vol == ubi->volumes[vol_id]); + + if (vol->vol_type == UBI_STATIC_VOLUME && + reserved_pebs < vol->used_ebs) { + dbg_err("too small size %d, %d LEBs contain data", + reserved_pebs, vol->used_ebs); + return -EINVAL; + } + + /* If the size is the same, we have nothing to do */ + if (reserved_pebs == vol->reserved_pebs) + return 0; + + new_mapping = kmalloc(reserved_pebs * sizeof(int), GFP_KERNEL); + if (!new_mapping) + return -ENOMEM; + + for (i = 0; i < reserved_pebs; i++) + new_mapping[i] = UBI_LEB_UNMAPPED; + + /* Reserve physical eraseblocks */ + pebs = reserved_pebs - vol->reserved_pebs; + if (pebs > 0) { + spin_lock(&ubi->volumes_lock); + if (pebs > ubi->avail_pebs) { + dbg_err("not enough PEBs: requested %d, available %d", + pebs, ubi->avail_pebs); + spin_unlock(&ubi->volumes_lock); + err = -ENOSPC; + goto out_free; + } + ubi->avail_pebs -= pebs; + ubi->rsvd_pebs += pebs; + for (i = 0; i < vol->reserved_pebs; i++) + new_mapping[i] = vol->eba_tbl[i]; + kfree(vol->eba_tbl); + vol->eba_tbl = new_mapping; + spin_unlock(&ubi->volumes_lock); + } + + /* Change volume table record */ + memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record)); + vtbl_rec.reserved_pebs = cpu_to_ubi32(reserved_pebs); + err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); + if (err) + goto out_acc; + + if (pebs < 0) { + for (i = 0; i < -pebs; i++) { + err = ubi_eba_unmap_leb(ubi, vol_id, reserved_pebs + i); + if (err) + goto out_acc; + } + spin_lock(&ubi->volumes_lock); + ubi->rsvd_pebs += pebs; + 
ubi->avail_pebs -= pebs; + pebs = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs; + if (pebs > 0) { + pebs = ubi->avail_pebs >= pebs ? pebs : ubi->avail_pebs; + ubi->avail_pebs -= pebs; + ubi->rsvd_pebs += pebs; + ubi->beb_rsvd_pebs += pebs; + if (pebs > 0) + ubi_msg("reserve more %d PEBs", pebs); + } + for (i = 0; i < reserved_pebs; i++) + new_mapping[i] = vol->eba_tbl[i]; + kfree(vol->eba_tbl); + vol->eba_tbl = new_mapping; + spin_unlock(&ubi->volumes_lock); + } + + vol->reserved_pebs = reserved_pebs; + if (vol->vol_type == UBI_DYNAMIC_VOLUME) { + vol->used_ebs = reserved_pebs; + vol->last_eb_bytes = vol->usable_leb_size; + vol->used_bytes = vol->used_ebs * vol->usable_leb_size; + } + + paranoid_check_volumes(ubi); + return 0; + +out_acc: + if (pebs > 0) { + spin_lock(&ubi->volumes_lock); + ubi->rsvd_pebs -= pebs; + ubi->avail_pebs += pebs; + spin_unlock(&ubi->volumes_lock); + } +out_free: + kfree(new_mapping); + return err; +} + +/** + * ubi_add_volume - add volume. + * @ubi: UBI device description object + * @vol_id: volume ID + * + * This function adds an existin volume and initializes all its data + * structures. Returnes zero in case of success and a negative error code in + * case of failure. + */ +int ubi_add_volume(struct ubi_device *ubi, int vol_id) +{ + int err; + struct ubi_volume *vol = ubi->volumes[vol_id]; + + dbg_msg("add volume %d", vol_id); + ubi_dbg_dump_vol_info(vol); + ubi_assert(vol); + + /* Register character device for the volume */ + cdev_init(&vol->cdev, &ubi_vol_cdev_operations); + vol->cdev.owner = THIS_MODULE; + err = cdev_add(&vol->cdev, MKDEV(ubi->major, vol->vol_id + 1), 1); + if (err) { + ubi_err("cannot add character device for volume %d", vol_id); + return err; + } + + err = ubi_create_gluebi(ubi, vol); + if (err) + goto out_cdev; + + vol->dev.release = vol_release; + vol->dev.parent = &ubi->dev; + vol->dev.devt = MKDEV(ubi->major, vol->vol_id + 1); + vol->dev.class = ubi_class; + sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id); + err = device_register(&vol->dev); + if (err) + goto out_gluebi; + + err = volume_sysfs_init(ubi, vol); + if (err) { + cdev_del(&vol->cdev); + err = ubi_destroy_gluebi(vol); + volume_sysfs_close(vol); + return err; + } + + paranoid_check_volumes(ubi); + return 0; + +out_gluebi: + err = ubi_destroy_gluebi(vol); +out_cdev: + cdev_del(&vol->cdev); + return err; +} + +/** + * ubi_free_volume - free volume. + * @ubi: UBI device description object + * @vol_id: volume ID + * + * This function frees all resources for volume @vol_id but does not remove it. + * Used only when the UBI device is detached. + */ +void ubi_free_volume(struct ubi_device *ubi, int vol_id) +{ + int err; + struct ubi_volume *vol = ubi->volumes[vol_id]; + + dbg_msg("free volume %d", vol_id); + ubi_assert(vol); + + vol->removed = 1; + err = ubi_destroy_gluebi(vol); + ubi->volumes[vol_id] = NULL; + cdev_del(&vol->cdev); + volume_sysfs_close(vol); +} + +#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID + +/** + * paranoid_check_volume - check volume information. 
+ * @ubi: UBI device description object + * @vol_id: volume ID + */ +static void paranoid_check_volume(const struct ubi_device *ubi, int vol_id) +{ + int idx = vol_id2idx(ubi, vol_id); + int reserved_pebs, alignment, data_pad, vol_type, name_len, upd_marker; + const struct ubi_volume *vol = ubi->volumes[idx]; + long long n; + const char *name; + + reserved_pebs = ubi32_to_cpu(ubi->vtbl[vol_id].reserved_pebs); + + if (!vol) { + if (reserved_pebs) { + ubi_err("no volume info, but volume exists"); + goto fail; + } + return; + } + + if (vol->reserved_pebs < 0 || vol->alignment < 0 || vol->data_pad < 0 || + vol->name_len < 0) { + ubi_err("negative values"); + goto fail; + } + if (vol->alignment > ubi->leb_size || vol->alignment == 0) { + ubi_err("bad alignment"); + goto fail; + } + + n = vol->alignment % ubi->min_io_size; + if (vol->alignment != 1 && n) { + ubi_err("alignment is not multiple of min I/O unit"); + goto fail; + } + + n = ubi->leb_size % vol->alignment; + if (vol->data_pad != n) { + ubi_err("bad data_pad, has to be %lld", n); + goto fail; + } + + if (vol->vol_type != UBI_DYNAMIC_VOLUME && + vol->vol_type != UBI_STATIC_VOLUME) { + ubi_err("bad vol_type"); + goto fail; + } + + if (vol->upd_marker != 0 && vol->upd_marker != 1) { + ubi_err("bad upd_marker"); + goto fail; + } + + if (vol->upd_marker && vol->corrupted) { + dbg_err("update marker and corrupted simultaneously"); + goto fail; + } + + if (vol->reserved_pebs > ubi->good_peb_count) { + ubi_err("too large reserved_pebs"); + goto fail; + } + + n = ubi->leb_size - vol->data_pad; + if (vol->usable_leb_size != ubi->leb_size - vol->data_pad) { + ubi_err("bad usable_leb_size, has to be %lld", n); + goto fail; + } + + if (vol->name_len > UBI_VOL_NAME_MAX) { + ubi_err("too long volume name, max is %d", UBI_VOL_NAME_MAX); + goto fail; + } + + if (!vol->name) { + ubi_err("NULL volume name"); + goto fail; + } + + n = strnlen(vol->name, vol->name_len + 1); + if (n != vol->name_len) { + ubi_err("bad name_len %lld", n); + goto fail; + } + + n = vol->used_ebs * vol->usable_leb_size; + if (vol->vol_type == UBI_DYNAMIC_VOLUME) { + if (vol->corrupted != 0) { + ubi_err("corrupted dynamic volume"); + goto fail; + } + if (vol->used_ebs != vol->reserved_pebs) { + ubi_err("bad used_ebs"); + goto fail; + } + if (vol->last_eb_bytes != vol->usable_leb_size) { + ubi_err("bad last_eb_bytes"); + goto fail; + } + if (vol->used_bytes != n) { + ubi_err("bad used_bytes"); + goto fail; + } + } else { + if (vol->corrupted != 0 && vol->corrupted != 1) { + ubi_err("bad corrupted"); + goto fail; + } + if (vol->used_ebs < 0 || vol->used_ebs > vol->reserved_pebs) { + ubi_err("bad used_ebs"); + goto fail; + } + if (vol->last_eb_bytes < 0 || + vol->last_eb_bytes > vol->usable_leb_size) { + ubi_err("bad last_eb_bytes"); + goto fail; + } + if (vol->used_bytes < 0 || vol->used_bytes > n || + vol->used_bytes < n - vol->usable_leb_size) { + ubi_err("bad used_bytes"); + goto fail; + } + } + + alignment = ubi32_to_cpu(ubi->vtbl[vol_id].alignment); + data_pad = ubi32_to_cpu(ubi->vtbl[vol_id].data_pad); + name_len = ubi16_to_cpu(ubi->vtbl[vol_id].name_len); + upd_marker = ubi->vtbl[vol_id].upd_marker; + name = &ubi->vtbl[vol_id].name[0]; + if (ubi->vtbl[vol_id].vol_type == UBI_VID_DYNAMIC) + vol_type = UBI_DYNAMIC_VOLUME; + else + vol_type = UBI_STATIC_VOLUME; + + if (alignment != vol->alignment || data_pad != vol->data_pad || + upd_marker != vol->upd_marker || vol_type != vol->vol_type || + name_len!= vol->name_len || strncmp(name, vol->name, name_len)) { + ubi_err("volume 
info is different"); + goto fail; + } + + return; + +fail: + ubi_err("paranoid check failed"); + ubi_dbg_dump_vol_info(vol); + ubi_dbg_dump_vtbl_record(&ubi->vtbl[vol_id], vol_id); + BUG(); +} + +/** + * paranoid_check_volumes - check information about all volumes. + * @ubi: UBI device description object + */ +static void paranoid_check_volumes(struct ubi_device *ubi) +{ + int i; + + mutex_lock(&ubi->vtbl_mutex); + spin_lock(&ubi->volumes_lock); + for (i = 0; i < ubi->vtbl_slots; i++) + paranoid_check_volume(ubi, i); + spin_unlock(&ubi->volumes_lock); + mutex_unlock(&ubi->vtbl_mutex); +} +#endif diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c new file mode 100644 index 00000000000..b6fd6bbd941 --- /dev/null +++ b/drivers/mtd/ubi/vtbl.c @@ -0,0 +1,809 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * Copyright (c) Nokia Corporation, 2006, 2007 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file includes volume table manipulation code. The volume table is an + * on-flash table containing volume meta-data like name, number of reserved + * physical eraseblocks, type, etc. The volume table is stored in the so-called + * "layout volume". + * + * The layout volume is an internal volume which is organized as follows. It + * consists of two logical eraseblocks - LEB 0 and LEB 1. Each logical + * eraseblock stores one volume table copy, i.e. LEB 0 and LEB 1 duplicate each + * other. This redundancy guarantees robustness to unclean reboots. The volume + * table is basically an array of volume table records. Each record contains + * full information about the volume and protected by a CRC checksum. + * + * The volume table is changed, it is first changed in RAM. Then LEB 0 is + * erased, and the updated volume table is written back to LEB 0. Then same for + * LEB 1. This scheme guarantees recoverability from unclean reboots. + * + * In this UBI implementation the on-flash volume table does not contain any + * information about how many data static volumes contain. This information may + * be found from the scanning data. + * + * But it would still be beneficial to store this information in the volume + * table. For example, suppose we have a static volume X, and all its physical + * eraseblocks became bad for some reasons. Suppose we are attaching the + * corresponding MTD device, the scanning has found no logical eraseblocks + * corresponding to the volume X. According to the volume table volume X does + * exist. So we don't know whether it is just empty or all its physical + * eraseblocks went bad. So we cannot alarm the user about this corruption. + * + * The volume table also stores so-called "update marker", which is used for + * volume updates. 
Before updating the volume, the update marker is set, and after the update
+ * operation is finished, the update marker is cleared. So if the update
+ * operation was interrupted (e.g. by an unclean reboot), the update marker is
+ * still there and we know that the volume's contents are damaged.
+ */
+
+#include <linux/crc32.h>
+#include <linux/err.h>
+#include <asm/div64.h>
+#include "ubi.h"
+
+#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
+static void paranoid_vtbl_check(const struct ubi_device *ubi);
+#else
+#define paranoid_vtbl_check(ubi)
+#endif
+
+/* Empty volume table record */
+static struct ubi_vtbl_record empty_vtbl_record;
+
+/**
+ * ubi_change_vtbl_record - change volume table record.
+ * @ubi: UBI device description object
+ * @idx: table index to change
+ * @vtbl_rec: new volume table record
+ *
+ * This function changes volume table record @idx. If @vtbl_rec is %NULL, an
+ * empty volume table record is written. The caller does not have to calculate
+ * the CRC of the record as it is done by this function. Returns zero in case
+ * of success and a negative error code in case of failure.
+ */
+int ubi_change_vtbl_record(struct ubi_device *ubi, int idx,
+ struct ubi_vtbl_record *vtbl_rec)
+{
+ int i, err;
+ uint32_t crc;
+
+ ubi_assert(idx >= 0 && idx < ubi->vtbl_slots);
+
+ if (!vtbl_rec)
+ vtbl_rec = &empty_vtbl_record;
+ else {
+ crc = crc32(UBI_CRC32_INIT, vtbl_rec, UBI_VTBL_RECORD_SIZE_CRC);
+ vtbl_rec->crc = cpu_to_ubi32(crc);
+ }
+
+ dbg_msg("change record %d", idx);
+ ubi_dbg_dump_vtbl_record(vtbl_rec, idx);
+
+ mutex_lock(&ubi->vtbl_mutex);
+ memcpy(&ubi->vtbl[idx], vtbl_rec, sizeof(struct ubi_vtbl_record));
+ for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) {
+ err = ubi_eba_unmap_leb(ubi, UBI_LAYOUT_VOL_ID, i);
+ if (err) {
+ mutex_unlock(&ubi->vtbl_mutex);
+ return err;
+ }
+ err = ubi_eba_write_leb(ubi, UBI_LAYOUT_VOL_ID, i, ubi->vtbl, 0,
+ ubi->vtbl_size, UBI_LONGTERM);
+ if (err) {
+ mutex_unlock(&ubi->vtbl_mutex);
+ return err;
+ }
+ }
+
+ paranoid_vtbl_check(ubi);
+ mutex_unlock(&ubi->vtbl_mutex);
+ return ubi_wl_flush(ubi);
+}
+
+/**
+ * vtbl_check - check if the volume table is not corrupted and contains
+ * sensible data.
+ *
+ * @ubi: UBI device description object
+ * @vtbl: volume table
+ *
+ * This function returns zero if @vtbl is all right, %1 if CRC is incorrect,
+ * and %-EINVAL if it contains inconsistent data.
+ */ +static int vtbl_check(const struct ubi_device *ubi, + const struct ubi_vtbl_record *vtbl) +{ + int i, n, reserved_pebs, alignment, data_pad, vol_type, name_len; + int upd_marker; + uint32_t crc; + const char *name; + + for (i = 0; i < ubi->vtbl_slots; i++) { + cond_resched(); + + reserved_pebs = ubi32_to_cpu(vtbl[i].reserved_pebs); + alignment = ubi32_to_cpu(vtbl[i].alignment); + data_pad = ubi32_to_cpu(vtbl[i].data_pad); + upd_marker = vtbl[i].upd_marker; + vol_type = vtbl[i].vol_type; + name_len = ubi16_to_cpu(vtbl[i].name_len); + name = &vtbl[i].name[0]; + + crc = crc32(UBI_CRC32_INIT, &vtbl[i], UBI_VTBL_RECORD_SIZE_CRC); + if (ubi32_to_cpu(vtbl[i].crc) != crc) { + ubi_err("bad CRC at record %u: %#08x, not %#08x", + i, crc, ubi32_to_cpu(vtbl[i].crc)); + ubi_dbg_dump_vtbl_record(&vtbl[i], i); + return 1; + } + + if (reserved_pebs == 0) { + if (memcmp(&vtbl[i], &empty_vtbl_record, + UBI_VTBL_RECORD_SIZE)) { + dbg_err("bad empty record"); + goto bad; + } + continue; + } + + if (reserved_pebs < 0 || alignment < 0 || data_pad < 0 || + name_len < 0) { + dbg_err("negative values"); + goto bad; + } + + if (alignment > ubi->leb_size || alignment == 0) { + dbg_err("bad alignment"); + goto bad; + } + + n = alignment % ubi->min_io_size; + if (alignment != 1 && n) { + dbg_err("alignment is not multiple of min I/O unit"); + goto bad; + } + + n = ubi->leb_size % alignment; + if (data_pad != n) { + dbg_err("bad data_pad, has to be %d", n); + goto bad; + } + + if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) { + dbg_err("bad vol_type"); + goto bad; + } + + if (upd_marker != 0 && upd_marker != 1) { + dbg_err("bad upd_marker"); + goto bad; + } + + if (reserved_pebs > ubi->good_peb_count) { + dbg_err("too large reserved_pebs, good PEBs %d", + ubi->good_peb_count); + goto bad; + } + + if (name_len > UBI_VOL_NAME_MAX) { + dbg_err("too long volume name, max %d", + UBI_VOL_NAME_MAX); + goto bad; + } + + if (name[0] == '\0') { + dbg_err("NULL volume name"); + goto bad; + } + + if (name_len != strnlen(name, name_len + 1)) { + dbg_err("bad name_len"); + goto bad; + } + } + + /* Checks that all names are unique */ + for (i = 0; i < ubi->vtbl_slots - 1; i++) { + for (n = i + 1; n < ubi->vtbl_slots; n++) { + int len1 = ubi16_to_cpu(vtbl[i].name_len); + int len2 = ubi16_to_cpu(vtbl[n].name_len); + + if (len1 > 0 && len1 == len2 && + !strncmp(vtbl[i].name, vtbl[n].name, len1)) { + ubi_err("volumes %d and %d have the same name" + " \"%s\"", i, n, vtbl[i].name); + ubi_dbg_dump_vtbl_record(&vtbl[i], i); + ubi_dbg_dump_vtbl_record(&vtbl[n], n); + return -EINVAL; + } + } + } + + return 0; + +bad: + ubi_err("volume table check failed, record %d", i); + ubi_dbg_dump_vtbl_record(&vtbl[i], i); + return -EINVAL; +} + +/** + * create_vtbl - create a copy of volume table. + * @ubi: UBI device description object + * @si: scanning information + * @copy: number of the volume table copy + * @vtbl: contents of the volume table + * + * This function returns zero in case of success and a negative error code in + * case of failure. 
+ */
+static int create_vtbl(const struct ubi_device *ubi, struct ubi_scan_info *si,
+ int copy, void *vtbl)
+{
+ int err, tries = 0;
+ struct ubi_vid_hdr *vid_hdr;
+ struct ubi_scan_volume *sv;
+ struct ubi_scan_leb *new_seb, *old_seb = NULL;
+
+ ubi_msg("create volume table (copy #%d)", copy + 1);
+
+ vid_hdr = ubi_zalloc_vid_hdr(ubi);
+ if (!vid_hdr)
+ return -ENOMEM;
+
+ /*
+ * Check whether a logical eraseblock which would have to contain this
+ * volume table copy was found during scanning. If so, it has to be
+ * wiped out.
+ */
+ sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOL_ID);
+ if (sv)
+ old_seb = ubi_scan_find_seb(sv, copy);
+
+retry:
+ new_seb = ubi_scan_get_free_peb(ubi, si);
+ if (IS_ERR(new_seb)) {
+ err = PTR_ERR(new_seb);
+ goto out_free;
+ }
+
+ vid_hdr->vol_type = UBI_VID_DYNAMIC;
+ vid_hdr->vol_id = cpu_to_ubi32(UBI_LAYOUT_VOL_ID);
+ vid_hdr->compat = UBI_LAYOUT_VOLUME_COMPAT;
+ vid_hdr->data_size = vid_hdr->used_ebs =
+ vid_hdr->data_pad = cpu_to_ubi32(0);
+ vid_hdr->lnum = cpu_to_ubi32(copy);
+ vid_hdr->sqnum = cpu_to_ubi64(++si->max_sqnum);
+ vid_hdr->leb_ver = cpu_to_ubi32(old_seb ? old_seb->leb_ver + 1 : 0);
+
+ /* The EC header is already there, write the VID header */
+ err = ubi_io_write_vid_hdr(ubi, new_seb->pnum, vid_hdr);
+ if (err)
+ goto write_error;
+
+ /* Write the layout volume contents */
+ err = ubi_io_write_data(ubi, vtbl, new_seb->pnum, 0, ubi->vtbl_size);
+ if (err)
+ goto write_error;
+
+ /*
+ * And add it to the scanning information. Don't delete the old
+ * @old_seb as it will be deleted and freed in 'ubi_scan_add_used()'.
+ */
+ err = ubi_scan_add_used(ubi, si, new_seb->pnum, new_seb->ec,
+ vid_hdr, 0);
+ kfree(new_seb);
+ ubi_free_vid_hdr(ubi, vid_hdr);
+ return err;
+
+write_error:
+ /* Maybe this physical eraseblock went bad, try to pick another one */
+ if (++tries <= 5) {
+ err = ubi_scan_add_to_list(si, new_seb->pnum, new_seb->ec,
+ &si->corr);
+ kfree(new_seb);
+ if (!err)
+ goto retry;
+ } else
+ kfree(new_seb);
+out_free:
+ ubi_free_vid_hdr(ubi, vid_hdr);
+ return err;
+}
+
+/**
+ * process_lvol - process the layout volume.
+ * @ubi: UBI device description object
+ * @si: scanning information
+ * @sv: layout volume scanning information
+ *
+ * This function is responsible for reading the layout volume, ensuring it is
+ * not corrupted, and recovering from corruptions if needed. Returns the volume
+ * table in case of success and a negative error code in case of failure.
+ */
+static struct ubi_vtbl_record *process_lvol(const struct ubi_device *ubi,
+ struct ubi_scan_info *si,
+ struct ubi_scan_volume *sv)
+{
+ int err;
+ struct rb_node *rb;
+ struct ubi_scan_leb *seb;
+ struct ubi_vtbl_record *leb[UBI_LAYOUT_VOLUME_EBS] = { NULL, NULL };
+ int leb_corrupted[UBI_LAYOUT_VOLUME_EBS] = {1, 1};
+
+ /*
+ * UBI goes through the following steps when it changes the layout
+ * volume:
+ * a. erase LEB 0;
+ * b. write new data to LEB 0;
+ * c. erase LEB 1;
+ * d. write new data to LEB 1.
+ *
+ * Before the change, both LEBs contain the same data.
+ *
+ * Due to unclean reboots, the contents of LEB 0 may be lost, but there
+ * should still be LEB 1. So it is OK if LEB 0 is corrupted while LEB 1
+ * is not. Similarly, LEB 1 may be lost, but there should be LEB 0. And
+ * finally, unclean reboots may result in a situation when neither LEB
+ * 0 nor LEB 1 is corrupted, but they are different. In this case, LEB
+ * 0 contains more recent information.
+ *
+ * So the plan is to first check LEB 0. Then:
+ * a. if LEB 0 is OK, it must contain the most recent data; then we
+ * compare it with LEB 1, and if they are different, we copy LEB 0
+ * to LEB 1;
+ * b. if LEB 0 is corrupted but LEB 1 is OK, we copy LEB 1 to LEB 0.
+ */
+
+ dbg_msg("check layout volume");
+
+ /* Read both LEB 0 and LEB 1 into memory */
+ ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb) {
+ leb[seb->lnum] = kzalloc(ubi->vtbl_size, GFP_KERNEL);
+ if (!leb[seb->lnum]) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ err = ubi_io_read_data(ubi, leb[seb->lnum], seb->pnum, 0,
+ ubi->vtbl_size);
+ if (err == UBI_IO_BITFLIPS || err == -EBADMSG)
+ /* Scrub the PEB later */
+ seb->scrub = 1;
+ else if (err)
+ goto out_free;
+ }
+
+ err = -EINVAL;
+ if (leb[0]) {
+ leb_corrupted[0] = vtbl_check(ubi, leb[0]);
+ if (leb_corrupted[0] < 0)
+ goto out_free;
+ }
+
+ if (!leb_corrupted[0]) {
+ /* LEB 0 is OK */
+ if (leb[1])
+ leb_corrupted[1] = memcmp(leb[0], leb[1], ubi->vtbl_size);
+ if (leb_corrupted[1]) {
+ ubi_warn("volume table copy #2 is corrupted");
+ err = create_vtbl(ubi, si, 1, leb[0]);
+ if (err)
+ goto out_free;
+ ubi_msg("volume table was restored");
+ }
+
+ /* Both LEB 0 and LEB 1 are OK and consistent */
+ kfree(leb[1]);
+ return leb[0];
+ } else {
+ /* LEB 0 is corrupted or does not exist */
+ if (leb[1]) {
+ leb_corrupted[1] = vtbl_check(ubi, leb[1]);
+ if (leb_corrupted[1] < 0)
+ goto out_free;
+ }
+ if (leb_corrupted[1]) {
+ /* Both LEB 0 and LEB 1 are corrupted */
+ ubi_err("both volume tables are corrupted");
+ goto out_free;
+ }
+
+ ubi_warn("volume table copy #1 is corrupted");
+ err = create_vtbl(ubi, si, 0, leb[1]);
+ if (err)
+ goto out_free;
+ ubi_msg("volume table was restored");
+
+ kfree(leb[0]);
+ return leb[1];
+ }
+
+out_free:
+ kfree(leb[0]);
+ kfree(leb[1]);
+ return ERR_PTR(err);
+}
+
+/**
+ * create_empty_lvol - create empty layout volume.
+ * @ubi: UBI device description object
+ * @si: scanning information
+ *
+ * This function returns the volume table contents in case of success and a
+ * negative error code in case of failure.
+ */
+static struct ubi_vtbl_record *create_empty_lvol(const struct ubi_device *ubi,
+ struct ubi_scan_info *si)
+{
+ int i;
+ struct ubi_vtbl_record *vtbl;
+
+ vtbl = kzalloc(ubi->vtbl_size, GFP_KERNEL);
+ if (!vtbl)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < ubi->vtbl_slots; i++)
+ memcpy(&vtbl[i], &empty_vtbl_record, UBI_VTBL_RECORD_SIZE);
+
+ for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) {
+ int err;
+
+ err = create_vtbl(ubi, si, i, vtbl);
+ if (err) {
+ kfree(vtbl);
+ return ERR_PTR(err);
+ }
+ }
+
+ return vtbl;
+}
+
+/**
+ * init_volumes - initialize volume information for existing volumes.
+ * @ubi: UBI device description object
+ * @si: scanning information
+ * @vtbl: volume table
+ *
+ * This function allocates volume description objects for existing volumes.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */ +static int init_volumes(struct ubi_device *ubi, const struct ubi_scan_info *si, + const struct ubi_vtbl_record *vtbl) +{ + int i, reserved_pebs = 0; + struct ubi_scan_volume *sv; + struct ubi_volume *vol; + + for (i = 0; i < ubi->vtbl_slots; i++) { + cond_resched(); + + if (ubi32_to_cpu(vtbl[i].reserved_pebs) == 0) + continue; /* Empty record */ + + vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL); + if (!vol) + return -ENOMEM; + + vol->reserved_pebs = ubi32_to_cpu(vtbl[i].reserved_pebs); + vol->alignment = ubi32_to_cpu(vtbl[i].alignment); + vol->data_pad = ubi32_to_cpu(vtbl[i].data_pad); + vol->vol_type = vtbl[i].vol_type == UBI_VID_DYNAMIC ? + UBI_DYNAMIC_VOLUME : UBI_STATIC_VOLUME; + vol->name_len = ubi16_to_cpu(vtbl[i].name_len); + vol->usable_leb_size = ubi->leb_size - vol->data_pad; + memcpy(vol->name, vtbl[i].name, vol->name_len); + vol->name[vol->name_len] = '\0'; + vol->vol_id = i; + + ubi_assert(!ubi->volumes[i]); + ubi->volumes[i] = vol; + ubi->vol_count += 1; + vol->ubi = ubi; + reserved_pebs += vol->reserved_pebs; + + /* + * In case of dynamic volume UBI knows nothing about how many + * data is stored there. So assume the whole volume is used. + */ + if (vol->vol_type == UBI_DYNAMIC_VOLUME) { + vol->used_ebs = vol->reserved_pebs; + vol->last_eb_bytes = vol->usable_leb_size; + vol->used_bytes = vol->used_ebs * vol->usable_leb_size; + continue; + } + + /* Static volumes only */ + sv = ubi_scan_find_sv(si, i); + if (!sv) { + /* + * No eraseblocks belonging to this volume found. We + * don't actually know whether this static volume is + * completely corrupted or just contains no data. And + * we cannot know this as long as data size is not + * stored on flash. So we just assume the volume is + * empty. FIXME: this should be handled. + */ + continue; + } + + if (sv->leb_count != sv->used_ebs) { + /* + * We found a static volume which misses several + * eraseblocks. Treat it as corrupted. + */ + ubi_warn("static volume %d misses %d LEBs - corrupted", + sv->vol_id, sv->used_ebs - sv->leb_count); + vol->corrupted = 1; + continue; + } + + vol->used_ebs = sv->used_ebs; + vol->used_bytes = (vol->used_ebs - 1) * vol->usable_leb_size; + vol->used_bytes += sv->last_data_size; + vol->last_eb_bytes = sv->last_data_size; + } + + vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL); + if (!vol) + return -ENOMEM; + + vol->reserved_pebs = UBI_LAYOUT_VOLUME_EBS; + vol->alignment = 1; + vol->vol_type = UBI_DYNAMIC_VOLUME; + vol->name_len = sizeof(UBI_LAYOUT_VOLUME_NAME) - 1; + memcpy(vol->name, UBI_LAYOUT_VOLUME_NAME, vol->name_len + 1); + vol->usable_leb_size = ubi->leb_size; + vol->used_ebs = vol->reserved_pebs; + vol->last_eb_bytes = vol->reserved_pebs; + vol->used_bytes = vol->used_ebs * (ubi->leb_size - vol->data_pad); + vol->vol_id = UBI_LAYOUT_VOL_ID; + + ubi_assert(!ubi->volumes[i]); + ubi->volumes[vol_id2idx(ubi, vol->vol_id)] = vol; + reserved_pebs += vol->reserved_pebs; + ubi->vol_count += 1; + vol->ubi = ubi; + + if (reserved_pebs > ubi->avail_pebs) + ubi_err("not enough PEBs, required %d, available %d", + reserved_pebs, ubi->avail_pebs); + ubi->rsvd_pebs += reserved_pebs; + ubi->avail_pebs -= reserved_pebs; + + return 0; +} + +/** + * check_sv - check volume scanning information. + * @vol: UBI volume description object + * @sv: volume scanning information + * + * This function returns zero if the volume scanning information is consistent + * to the data read from the volume tabla, and %-EINVAL if not. 
+ */ +static int check_sv(const struct ubi_volume *vol, + const struct ubi_scan_volume *sv) +{ + if (sv->highest_lnum >= vol->reserved_pebs) { + dbg_err("bad highest_lnum"); + goto bad; + } + if (sv->leb_count > vol->reserved_pebs) { + dbg_err("bad leb_count"); + goto bad; + } + if (sv->vol_type != vol->vol_type) { + dbg_err("bad vol_type"); + goto bad; + } + if (sv->used_ebs > vol->reserved_pebs) { + dbg_err("bad used_ebs"); + goto bad; + } + if (sv->data_pad != vol->data_pad) { + dbg_err("bad data_pad"); + goto bad; + } + return 0; + +bad: + ubi_err("bad scanning information"); + ubi_dbg_dump_sv(sv); + ubi_dbg_dump_vol_info(vol); + return -EINVAL; +} + +/** + * check_scanning_info - check that scanning information. + * @ubi: UBI device description object + * @si: scanning information + * + * Even though we protect on-flash data by CRC checksums, we still don't trust + * the media. This function ensures that scanning information is consistent to + * the information read from the volume table. Returns zero if the scanning + * information is OK and %-EINVAL if it is not. + */ +static int check_scanning_info(const struct ubi_device *ubi, + struct ubi_scan_info *si) +{ + int err, i; + struct ubi_scan_volume *sv; + struct ubi_volume *vol; + + if (si->vols_found > UBI_INT_VOL_COUNT + ubi->vtbl_slots) { + ubi_err("scanning found %d volumes, maximum is %d + %d", + si->vols_found, UBI_INT_VOL_COUNT, ubi->vtbl_slots); + return -EINVAL; + } + + if (si->highest_vol_id >= ubi->vtbl_slots + UBI_INT_VOL_COUNT&& + si->highest_vol_id < UBI_INTERNAL_VOL_START) { + ubi_err("too large volume ID %d found by scanning", + si->highest_vol_id); + return -EINVAL; + } + + + for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { + cond_resched(); + + sv = ubi_scan_find_sv(si, i); + vol = ubi->volumes[i]; + if (!vol) { + if (sv) + ubi_scan_rm_volume(si, sv); + continue; + } + + if (vol->reserved_pebs == 0) { + ubi_assert(i < ubi->vtbl_slots); + + if (!sv) + continue; + + /* + * During scanning we found a volume which does not + * exist according to the information in the volume + * table. This must have happened due to an unclean + * reboot while the volume was being removed. Discard + * these eraseblocks. + */ + ubi_msg("finish volume %d removal", sv->vol_id); + ubi_scan_rm_volume(si, sv); + } else if (sv) { + err = check_sv(vol, sv); + if (err) + return err; + } + } + + return 0; +} + +/** + * ubi_read_volume_table - read volume table. + * information. + * @ubi: UBI device description object + * @si: scanning information + * + * This function reads volume table, checks it, recover from errors if needed, + * or creates it if needed. Returns zero in case of success and a negative + * error code in case of failure. + */ +int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_scan_info *si) +{ + int i, err; + struct ubi_scan_volume *sv; + + empty_vtbl_record.crc = cpu_to_ubi32(0xf116c36b); + + /* + * The number of supported volumes is limited by the eraseblock size + * and by the UBI_MAX_VOLUMES constant. + */ + ubi->vtbl_slots = ubi->leb_size / UBI_VTBL_RECORD_SIZE; + if (ubi->vtbl_slots > UBI_MAX_VOLUMES) + ubi->vtbl_slots = UBI_MAX_VOLUMES; + + ubi->vtbl_size = ubi->vtbl_slots * UBI_VTBL_RECORD_SIZE; + ubi->vtbl_size = ALIGN(ubi->vtbl_size, ubi->min_io_size); + + sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOL_ID); + if (!sv) { + /* + * No logical eraseblocks belonging to the layout volume were + * found. This could mean that the flash is just empty. In + * this case we create empty layout volume. 
+ * + * But if flash is not empty this must be a corruption or the + * MTD device just contains garbage. + */ + if (si->is_empty) { + ubi->vtbl = create_empty_lvol(ubi, si); + if (IS_ERR(ubi->vtbl)) + return PTR_ERR(ubi->vtbl); + } else { + ubi_err("the layout volume was not found"); + return -EINVAL; + } + } else { + if (sv->leb_count > UBI_LAYOUT_VOLUME_EBS) { + /* This must not happen with proper UBI images */ + dbg_err("too many LEBs (%d) in layout volume", + sv->leb_count); + return -EINVAL; + } + + ubi->vtbl = process_lvol(ubi, si, sv); + if (IS_ERR(ubi->vtbl)) + return PTR_ERR(ubi->vtbl); + } + + ubi->avail_pebs = ubi->good_peb_count; + + /* + * The layout volume is OK, initialize the corresponding in-RAM data + * structures. + */ + err = init_volumes(ubi, si, ubi->vtbl); + if (err) + goto out_free; + + /* + * Get sure that the scanning information is consistent to the + * information stored in the volume table. + */ + err = check_scanning_info(ubi, si); + if (err) + goto out_free; + + return 0; + +out_free: + kfree(ubi->vtbl); + for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) + if (ubi->volumes[i]) { + kfree(ubi->volumes[i]); + ubi->volumes[i] = NULL; + } + return err; +} + +#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID + +/** + * paranoid_vtbl_check - check volume table. + * @ubi: UBI device description object + */ +static void paranoid_vtbl_check(const struct ubi_device *ubi) +{ + if (vtbl_check(ubi, ubi->vtbl)) { + ubi_err("paranoid check failed"); + BUG(); + } +} + +#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */ diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c new file mode 100644 index 00000000000..9ecaf77eca9 --- /dev/null +++ b/drivers/mtd/ubi/wl.c @@ -0,0 +1,1671 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём), Thomas Gleixner + */ + +/* + * UBI wear-leveling unit. + * + * This unit is responsible for wear-leveling. It works in terms of physical + * eraseblocks and erase counters and knows nothing about logical eraseblocks, + * volumes, etc. From this unit's perspective all physical eraseblocks are of + * two types - used and free. Used physical eraseblocks are those that were + * "get" by the 'ubi_wl_get_peb()' function, and free physical eraseblocks are + * those that were put by the 'ubi_wl_put_peb()' function. + * + * Physical eraseblocks returned by 'ubi_wl_get_peb()' have only erase counter + * header. The rest of the physical eraseblock contains only 0xFF bytes. + * + * When physical eraseblocks are returned to the WL unit by means of the + * 'ubi_wl_put_peb()' function, they are scheduled for erasure. The erasure is + * done asynchronously in context of the per-UBI device background thread, + * which is also managed by the WL unit. 
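+ *
+ * As a rough sketch of that lifecycle (assuming the interfaces described
+ * elsewhere in this patch): a user of this unit calls 'ubi_wl_get_peb()' to
+ * obtain a physical eraseblock, writes to it, and eventually returns it with
+ * 'ubi_wl_put_peb()'; the WL unit then queues an erase work which the
+ * background thread, or a synchronous 'ubi_wl_flush()', later executes.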
+ *
+ * The wear-leveling is ensured by means of moving the contents of used
+ * physical eraseblocks with low erase counter to free physical eraseblocks
+ * with high erase counter.
+ *
+ * The 'ubi_wl_get_peb()' function accepts data type hints which help to pick
+ * an "optimal" physical eraseblock. For example, when it is known that the
+ * physical eraseblock will be "put" soon because it contains short-term data,
+ * the WL unit may pick a free physical eraseblock with low erase counter, and
+ * so forth.
+ *
+ * If the WL unit fails to erase a physical eraseblock, it marks it as bad.
+ *
+ * This unit is also responsible for scrubbing. If a bit-flip is detected in a
+ * physical eraseblock, it has to be moved. Technically this is the same as
+ * moving it for wear-leveling reasons.
+ *
+ * As mentioned above, for the WL unit all physical eraseblocks are either
+ * "free" or "used". Free eraseblocks are kept in the @wl->free RB-tree, while
+ * used eraseblocks are kept in a set of different RB-trees: @wl->used,
+ * @wl->prot.pnum, @wl->prot.aec, and @wl->scrub.
+ *
+ * Note, in this implementation, we keep a small in-RAM object for each
+ * physical eraseblock. This is surely not a scalable solution. But it appears
+ * to be good enough for moderately large flashes and it is simple. In the
+ * future, one may re-work this unit to make it more scalable.
+ *
+ * At the moment this unit does not utilize the sequence number, which was
+ * introduced relatively recently. But it would be wise to do this because the
+ * sequence number of a logical eraseblock characterizes how old it is. For
+ * example, when we move a PEB with low erase counter, and we need to pick the
+ * target PEB, we pick a PEB with the highest EC if our PEB is "old" and we
+ * pick a target PEB with an average EC if our PEB is not very "old". This
+ * leaves room for future re-work of the WL unit.
+ *
+ * FIXME: looks too complex, should be simplified (later).
+ */
+
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include "ubi.h"
+
+/* Number of physical eraseblocks reserved for wear-leveling purposes */
+#define WL_RESERVED_PEBS 1
+
+/*
+ * For how many erase cycles short term, unknown, and long term physical
+ * eraseblocks are protected.
+ */
+#define ST_PROTECTION 16
+#define U_PROTECTION 10
+#define LT_PROTECTION 4
+
+/*
+ * Maximum difference between two erase counters. If this threshold is
+ * exceeded, the WL unit starts moving data from used physical eraseblocks with
+ * low erase counter to free physical eraseblocks with high erase counter.
+ */
+#define UBI_WL_THRESHOLD CONFIG_MTD_UBI_WL_THRESHOLD
+
+/*
+ * When a physical eraseblock is moved, the WL unit has to pick the target
+ * physical eraseblock to move to. The simplest way would be just to pick the
+ * one with the highest erase counter. But in certain workloads this could lead
+ * to unlimited wear of one or a few physical eraseblocks. Indeed, imagine a
+ * situation in which the picked physical eraseblock is constantly erased after
+ * the data is written to it. So, we have a constant which limits the highest
+ * erase counter of the free physical eraseblock to pick. Namely, the WL unit
+ * does not pick eraseblocks with erase counter greater than the lowest erase
+ * counter plus %WL_FREE_MAX_DIFF.
+ */
+#define WL_FREE_MAX_DIFF (2*UBI_WL_THRESHOLD)
+
+/*
+ * Maximum number of consecutive background thread failures which is enough to
+ * switch to read-only mode.
+ */ +#define WL_MAX_FAILURES 32 + +/** + * struct ubi_wl_entry - wear-leveling entry. + * @rb: link in the corresponding RB-tree + * @ec: erase counter + * @pnum: physical eraseblock number + * + * Each physical eraseblock has a corresponding &struct wl_entry object which + * may be kept in different RB-trees. + */ +struct ubi_wl_entry { + struct rb_node rb; + int ec; + int pnum; +}; + +/** + * struct ubi_wl_prot_entry - PEB protection entry. + * @rb_pnum: link in the @wl->prot.pnum RB-tree + * @rb_aec: link in the @wl->prot.aec RB-tree + * @abs_ec: the absolute erase counter value when the protection ends + * @e: the wear-leveling entry of the physical eraseblock under protection + * + * When the WL unit returns a physical eraseblock, the physical eraseblock is + * protected from being moved for some "time". For this reason, the physical + * eraseblock is not directly moved from the @wl->free tree to the @wl->used + * tree. There is one more tree in between where this physical eraseblock is + * temporarily stored (@wl->prot). + * + * All this protection stuff is needed because: + * o we don't want to move physical eraseblocks just after we have given them + * to the user; instead, we first want to let users fill them up with data; + * + * o there is a chance that the user will put the physical eraseblock very + * soon, so it makes sense not to move it for some time, but wait; this is + * especially important in case of "short term" physical eraseblocks. + * + * Physical eraseblocks stay protected only for limited time. But the "time" is + * measured in erase cycles in this case. This is implemented with help of the + * absolute erase counter (@wl->abs_ec). When it reaches certain value, the + * physical eraseblocks are moved from the protection trees (@wl->prot.*) to + * the @wl->used tree. + * + * Protected physical eraseblocks are searched by physical eraseblock number + * (when they are put) and by the absolute erase counter (to check if it is + * time to move them to the @wl->used tree). So there are actually 2 RB-trees + * storing the protected physical eraseblocks: @wl->prot.pnum and + * @wl->prot.aec. They are referred to as the "protection" trees. The + * first one is indexed by the physical eraseblock number. The second one is + * indexed by the absolute erase counter. Both trees store + * &struct ubi_wl_prot_entry objects. + * + * Each physical eraseblock has 2 main states: free and used. The former state + * corresponds to the @wl->free tree. The latter state is split up on several + * sub-states: + * o the WL movement is allowed (@wl->used tree); + * o the WL movement is temporarily prohibited (@wl->prot.pnum and + * @wl->prot.aec trees); + * o scrubbing is needed (@wl->scrub tree). + * + * Depending on the sub-state, wear-leveling entries of the used physical + * eraseblocks may be kept in one of those trees. + */ +struct ubi_wl_prot_entry { + struct rb_node rb_pnum; + struct rb_node rb_aec; + unsigned long long abs_ec; + struct ubi_wl_entry *e; +}; + +/** + * struct ubi_work - UBI work description data structure. + * @list: a link in the list of pending works + * @func: worker function + * @priv: private data of the worker function + * + * @e: physical eraseblock to erase + * @torture: if the physical eraseblock has to be tortured + * + * The @func pointer points to the worker function. If the @cancel argument is + * not zero, the worker has to free the resources and exit immediately. 
The + * worker has to return zero in case of success and a negative error code in + * case of failure. + */ +struct ubi_work { + struct list_head list; + int (*func)(struct ubi_device *ubi, struct ubi_work *wrk, int cancel); + /* The below fields are only relevant to erasure works */ + struct ubi_wl_entry *e; + int torture; +}; + +#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID +static int paranoid_check_ec(const struct ubi_device *ubi, int pnum, int ec); +static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e, + struct rb_root *root); +#else +#define paranoid_check_ec(ubi, pnum, ec) 0 +#define paranoid_check_in_wl_tree(e, root) +#endif + +/* Slab cache for wear-leveling entries */ +static struct kmem_cache *wl_entries_slab; + +/** + * tree_empty - a helper function to check if an RB-tree is empty. + * @root: the root of the tree + * + * This function returns non-zero if the RB-tree is empty and zero if not. + */ +static inline int tree_empty(struct rb_root *root) +{ + return root->rb_node == NULL; +} + +/** + * wl_tree_add - add a wear-leveling entry to a WL RB-tree. + * @e: the wear-leveling entry to add + * @root: the root of the tree + * + * Note, we use (erase counter, physical eraseblock number) pairs as keys in + * the @ubi->used and @ubi->free RB-trees. + */ +static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root) +{ + struct rb_node **p, *parent = NULL; + + p = &root->rb_node; + while (*p) { + struct ubi_wl_entry *e1; + + parent = *p; + e1 = rb_entry(parent, struct ubi_wl_entry, rb); + + if (e->ec < e1->ec) + p = &(*p)->rb_left; + else if (e->ec > e1->ec) + p = &(*p)->rb_right; + else { + ubi_assert(e->pnum != e1->pnum); + if (e->pnum < e1->pnum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + } + + rb_link_node(&e->rb, parent, p); + rb_insert_color(&e->rb, root); +} + + +/* + * Helper functions to add and delete wear-leveling entries from different + * trees. + */ + +static void free_tree_add(struct ubi_device *ubi, struct ubi_wl_entry *e) +{ + wl_tree_add(e, &ubi->free); +} +static inline void used_tree_add(struct ubi_device *ubi, + struct ubi_wl_entry *e) +{ + wl_tree_add(e, &ubi->used); +} +static inline void scrub_tree_add(struct ubi_device *ubi, + struct ubi_wl_entry *e) +{ + wl_tree_add(e, &ubi->scrub); +} +static inline void free_tree_del(struct ubi_device *ubi, + struct ubi_wl_entry *e) +{ + paranoid_check_in_wl_tree(e, &ubi->free); + rb_erase(&e->rb, &ubi->free); +} +static inline void used_tree_del(struct ubi_device *ubi, + struct ubi_wl_entry *e) +{ + paranoid_check_in_wl_tree(e, &ubi->used); + rb_erase(&e->rb, &ubi->used); +} +static inline void scrub_tree_del(struct ubi_device *ubi, + struct ubi_wl_entry *e) +{ + paranoid_check_in_wl_tree(e, &ubi->scrub); + rb_erase(&e->rb, &ubi->scrub); +} + +/** + * do_work - do one pending work. + * @ubi: UBI device description object + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int do_work(struct ubi_device *ubi) +{ + int err; + struct ubi_work *wrk; + + spin_lock(&ubi->wl_lock); + + if (list_empty(&ubi->works)) { + spin_unlock(&ubi->wl_lock); + return 0; + } + + wrk = list_entry(ubi->works.next, struct ubi_work, list); + list_del(&wrk->list); + spin_unlock(&ubi->wl_lock); + + /* + * Call the worker function. Do not touch the work structure + * after this call as it will have been freed or reused by that + * time by the worker function. 
+ */ + err = wrk->func(ubi, wrk, 0); + if (err) + ubi_err("work failed with error code %d", err); + + spin_lock(&ubi->wl_lock); + ubi->works_count -= 1; + ubi_assert(ubi->works_count >= 0); + spin_unlock(&ubi->wl_lock); + return err; +} + +/** + * produce_free_peb - produce a free physical eraseblock. + * @ubi: UBI device description object + * + * This function tries to make a free PEB by means of synchronous execution of + * pending works. This may be needed if, for example the background thread is + * disabled. Returns zero in case of success and a negative error code in case + * of failure. + */ +static int produce_free_peb(struct ubi_device *ubi) +{ + int err; + + spin_lock(&ubi->wl_lock); + while (tree_empty(&ubi->free)) { + spin_unlock(&ubi->wl_lock); + + dbg_wl("do one work synchronously"); + err = do_work(ubi); + if (err) + return err; + + spin_lock(&ubi->wl_lock); + } + spin_unlock(&ubi->wl_lock); + + return 0; +} + +/** + * in_wl_tree - check if wear-leveling entry is present in a WL RB-tree. + * @e: the wear-leveling entry to check + * @root: the root of the tree + * + * This function returns non-zero if @e is in the @root RB-tree and zero if it + * is not. + */ +static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root) +{ + struct rb_node *p; + + p = root->rb_node; + while (p) { + struct ubi_wl_entry *e1; + + e1 = rb_entry(p, struct ubi_wl_entry, rb); + + if (e->pnum == e1->pnum) { + ubi_assert(e == e1); + return 1; + } + + if (e->ec < e1->ec) + p = p->rb_left; + else if (e->ec > e1->ec) + p = p->rb_right; + else { + ubi_assert(e->pnum != e1->pnum); + if (e->pnum < e1->pnum) + p = p->rb_left; + else + p = p->rb_right; + } + } + + return 0; +} + +/** + * prot_tree_add - add physical eraseblock to protection trees. + * @ubi: UBI device description object + * @e: the physical eraseblock to add + * @pe: protection entry object to use + * @abs_ec: absolute erase counter value when this physical eraseblock has + * to be removed from the protection trees. + * + * @wl->lock has to be locked. + */ +static void prot_tree_add(struct ubi_device *ubi, struct ubi_wl_entry *e, + struct ubi_wl_prot_entry *pe, int abs_ec) +{ + struct rb_node **p, *parent = NULL; + struct ubi_wl_prot_entry *pe1; + + pe->e = e; + pe->abs_ec = ubi->abs_ec + abs_ec; + + p = &ubi->prot.pnum.rb_node; + while (*p) { + parent = *p; + pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_pnum); + + if (e->pnum < pe1->e->pnum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&pe->rb_pnum, parent, p); + rb_insert_color(&pe->rb_pnum, &ubi->prot.pnum); + + p = &ubi->prot.aec.rb_node; + parent = NULL; + while (*p) { + parent = *p; + pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_aec); + + if (pe->abs_ec < pe1->abs_ec) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&pe->rb_aec, parent, p); + rb_insert_color(&pe->rb_aec, &ubi->prot.aec); +} + +/** + * find_wl_entry - find wear-leveling entry closest to certain erase counter. + * @root: the RB-tree where to look for + * @max: highest possible erase counter + * + * This function looks for a wear leveling entry with erase counter closest to + * @max and less then @max. 
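produce_free_peb() simply keeps running queued works, one at a time, until a free eraseblock appears or a work fails. Stripped of locking, it is a drain loop; a toy userspace model in which every pending "erase work" yields one free PEB when run (all names and counts invented):

    #include <stdio.h>

    static int free_pebs;
    static int pending_works = 3;

    static int do_one_work(void)
    {
        if (pending_works == 0)
            return -1;        /* nothing left to run */
        pending_works -= 1;
        free_pebs += 1;       /* an erase work produced a free PEB */
        return 0;
    }

    /* Mimics produce_free_peb(): run works synchronously until one is free. */
    static int produce_free_peb(void)
    {
        while (free_pebs == 0) {
            int err = do_one_work();

            if (err)
                return err;   /* give up on the first failure */
        }
        return 0;
    }

    int main(void)
    {
        printf("%d (free PEBs: %d)\n", produce_free_peb(), free_pebs);
        return 0;
    }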
+ */ +static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max) +{ + struct rb_node *p; + struct ubi_wl_entry *e; + + e = rb_entry(rb_first(root), struct ubi_wl_entry, rb); + max += e->ec; + + p = root->rb_node; + while (p) { + struct ubi_wl_entry *e1; + + e1 = rb_entry(p, struct ubi_wl_entry, rb); + if (e1->ec >= max) + p = p->rb_left; + else { + p = p->rb_right; + e = e1; + } + } + + return e; +} + +/** + * ubi_wl_get_peb - get a physical eraseblock. + * @ubi: UBI device description object + * @dtype: type of data which will be stored in this physical eraseblock + * + * This function returns a physical eraseblock in case of success and a + * negative error code in case of failure. Might sleep. + */ +int ubi_wl_get_peb(struct ubi_device *ubi, int dtype) +{ + int err, protect, medium_ec; + struct ubi_wl_entry *e, *first, *last; + struct ubi_wl_prot_entry *pe; + + ubi_assert(dtype == UBI_LONGTERM || dtype == UBI_SHORTTERM || + dtype == UBI_UNKNOWN); + + pe = kmalloc(sizeof(struct ubi_wl_prot_entry), GFP_KERNEL); + if (!pe) + return -ENOMEM; + +retry: + spin_lock(&ubi->wl_lock); + if (tree_empty(&ubi->free)) { + if (ubi->works_count == 0) { + ubi_assert(list_empty(&ubi->works)); + ubi_err("no free eraseblocks"); + spin_unlock(&ubi->wl_lock); + kfree(pe); + return -ENOSPC; + } + spin_unlock(&ubi->wl_lock); + + err = produce_free_peb(ubi); + if (err < 0) { + kfree(pe); + return err; + } + goto retry; + } + + switch (dtype) { + case UBI_LONGTERM: + /* + * For long term data we pick a physical eraseblock + * with high erase counter. But the highest erase + * counter we can pick is bounded by the the lowest + * erase counter plus %WL_FREE_MAX_DIFF. + */ + e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); + protect = LT_PROTECTION; + break; + case UBI_UNKNOWN: + /* + * For unknown data we pick a physical eraseblock with + * medium erase counter. But we by no means can pick a + * physical eraseblock with erase counter greater or + * equivalent than the lowest erase counter plus + * %WL_FREE_MAX_DIFF. + */ + first = rb_entry(rb_first(&ubi->free), + struct ubi_wl_entry, rb); + last = rb_entry(rb_last(&ubi->free), + struct ubi_wl_entry, rb); + + if (last->ec - first->ec < WL_FREE_MAX_DIFF) + e = rb_entry(ubi->free.rb_node, + struct ubi_wl_entry, rb); + else { + medium_ec = (first->ec + WL_FREE_MAX_DIFF)/2; + e = find_wl_entry(&ubi->free, medium_ec); + } + protect = U_PROTECTION; + break; + case UBI_SHORTTERM: + /* + * For short term data we pick a physical eraseblock + * with the lowest erase counter as we expect it will + * be erased soon. + */ + e = rb_entry(rb_first(&ubi->free), + struct ubi_wl_entry, rb); + protect = ST_PROTECTION; + break; + default: + protect = 0; + e = NULL; + BUG(); + } + + /* + * Move the physical eraseblock to the protection trees where it will + * be protected from being moved for some time. 
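ubi_wl_get_peb() picks differently worn blocks depending on the expected data lifetime: long-term data goes to a highly worn block (bounded by the lowest EC plus WL_FREE_MAX_DIFF), short-term data to the least worn block, and unknown data to a medium one. With the free tree reduced to a sorted array of erase counters, the selection looks roughly like the sketch below; the constant is illustrative and the medium pick is simplified compared to the exact arithmetic in the patch:

    #include <stdio.h>

    #define MAX_DIFF 8   /* stands in for WL_FREE_MAX_DIFF; value is made up */

    /* Roughly find_wl_entry(): highest EC still below lowest EC + diff. */
    static int pick_below(const int *ec, int n, int diff)
    {
        int limit = ec[0] + diff, best = ec[0];

        for (int i = 0; i < n; i++)
            if (ec[i] < limit)
                best = ec[i];
        return best;
    }

    int main(void)
    {
        int ec[] = { 10, 11, 14, 16, 25 };  /* sorted ECs of free PEBs */
        int n = 5;

        int longterm  = pick_below(ec, n, MAX_DIFF);      /* worn: 16 */
        int shortterm = ec[0];                            /* least worn: 10 */
        int unknown   = pick_below(ec, n, MAX_DIFF / 2);  /* medium: 11 */

        printf("long %d, short %d, unknown %d\n", longterm, shortterm, unknown);
        return 0;
    }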
+ */ + free_tree_del(ubi, e); + prot_tree_add(ubi, e, pe, protect); + + dbg_wl("PEB %d EC %d, protection %d", e->pnum, e->ec, protect); + spin_unlock(&ubi->wl_lock); + + return e->pnum; +} + +/** + * prot_tree_del - remove a physical eraseblock from the protection trees + * @ubi: UBI device description object + * @pnum: the physical eraseblock to remove + */ +static void prot_tree_del(struct ubi_device *ubi, int pnum) +{ + struct rb_node *p; + struct ubi_wl_prot_entry *pe = NULL; + + p = ubi->prot.pnum.rb_node; + while (p) { + + pe = rb_entry(p, struct ubi_wl_prot_entry, rb_pnum); + + if (pnum == pe->e->pnum) + break; + + if (pnum < pe->e->pnum) + p = p->rb_left; + else + p = p->rb_right; + } + + ubi_assert(pe->e->pnum == pnum); + rb_erase(&pe->rb_aec, &ubi->prot.aec); + rb_erase(&pe->rb_pnum, &ubi->prot.pnum); + kfree(pe); +} + +/** + * sync_erase - synchronously erase a physical eraseblock. + * @ubi: UBI device description object + * @e: the the physical eraseblock to erase + * @torture: if the physical eraseblock has to be tortured + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int torture) +{ + int err; + struct ubi_ec_hdr *ec_hdr; + unsigned long long ec = e->ec; + + dbg_wl("erase PEB %d, old EC %llu", e->pnum, ec); + + err = paranoid_check_ec(ubi, e->pnum, e->ec); + if (err > 0) + return -EINVAL; + + ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); + if (!ec_hdr) + return -ENOMEM; + + err = ubi_io_sync_erase(ubi, e->pnum, torture); + if (err < 0) + goto out_free; + + ec += err; + if (ec > UBI_MAX_ERASECOUNTER) { + /* + * Erase counter overflow. Upgrade UBI and use 64-bit + * erase counters internally. + */ + ubi_err("erase counter overflow at PEB %d, EC %llu", + e->pnum, ec); + err = -EINVAL; + goto out_free; + } + + dbg_wl("erased PEB %d, new EC %llu", e->pnum, ec); + + ec_hdr->ec = cpu_to_ubi64(ec); + + err = ubi_io_write_ec_hdr(ubi, e->pnum, ec_hdr); + if (err) + goto out_free; + + e->ec = ec; + spin_lock(&ubi->wl_lock); + if (e->ec > ubi->max_ec) + ubi->max_ec = e->ec; + spin_unlock(&ubi->wl_lock); + +out_free: + kfree(ec_hdr); + return err; +} + +/** + * check_protection_over - check if it is time to stop protecting some + * physical eraseblocks. + * @ubi: UBI device description object + * + * This function is called after each erase operation, when the absolute erase + * counter is incremented, to check if some physical eraseblock have not to be + * protected any longer. These physical eraseblocks are moved from the + * protection trees to the used tree. + */ +static void check_protection_over(struct ubi_device *ubi) +{ + struct ubi_wl_prot_entry *pe; + + /* + * There may be several protected physical eraseblock to remove, + * process them all. + */ + while (1) { + spin_lock(&ubi->wl_lock); + if (tree_empty(&ubi->prot.aec)) { + spin_unlock(&ubi->wl_lock); + break; + } + + pe = rb_entry(rb_first(&ubi->prot.aec), + struct ubi_wl_prot_entry, rb_aec); + + if (pe->abs_ec > ubi->abs_ec) { + spin_unlock(&ubi->wl_lock); + break; + } + + dbg_wl("PEB %d protection over, abs_ec %llu, PEB abs_ec %llu", + pe->e->pnum, ubi->abs_ec, pe->abs_ec); + rb_erase(&pe->rb_aec, &ubi->prot.aec); + rb_erase(&pe->rb_pnum, &ubi->prot.pnum); + used_tree_add(ubi, pe->e); + spin_unlock(&ubi->wl_lock); + + kfree(pe); + cond_resched(); + } +} + +/** + * schedule_ubi_work - schedule a work. 
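sync_erase() adds the number of erase operations reported by the I/O layer to the 64-bit in-memory erase counter and refuses to let it pass the on-media maximum before writing the new EC header back. The update and the overflow guard in isolation, as a userspace sketch with an illustrative limit:

    #include <stdio.h>

    /* Illustrative cap; the real limit comes from the UBI media definitions. */
    #define MAX_ERASECOUNTER 0x7FFFFFFFULL

    /*
     * @erases is what the erase routine reports for this PEB (normally 1,
     * more if torture testing re-erased it several times).
     */
    static int bump_ec(unsigned long long *ec, int erases)
    {
        unsigned long long new_ec = *ec + erases;

        if (new_ec > MAX_ERASECOUNTER)
            return -1;    /* overflow: the EC header cannot store this */
        *ec = new_ec;     /* would now be written back in the EC header */
        return 0;
    }

    int main(void)
    {
        unsigned long long ec = 41;

        if (bump_ec(&ec, 1) == 0)
            printf("new EC %llu\n", ec);   /* prints 42 */
        return 0;
    }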
+ * @ubi: UBI device description object + * @wrk: the work to schedule + * + * This function enqueues a work defined by @wrk to the tail of the pending + * works list. + */ +static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk) +{ + spin_lock(&ubi->wl_lock); + list_add_tail(&wrk->list, &ubi->works); + ubi_assert(ubi->works_count >= 0); + ubi->works_count += 1; + if (ubi->thread_enabled) + wake_up_process(ubi->bgt_thread); + spin_unlock(&ubi->wl_lock); +} + +static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, + int cancel); + +/** + * schedule_erase - schedule an erase work. + * @ubi: UBI device description object + * @e: the WL entry of the physical eraseblock to erase + * @torture: if the physical eraseblock has to be tortured + * + * This function returns zero in case of success and a %-ENOMEM in case of + * failure. + */ +static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, + int torture) +{ + struct ubi_work *wl_wrk; + + dbg_wl("schedule erasure of PEB %d, EC %d, torture %d", + e->pnum, e->ec, torture); + + wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_KERNEL); + if (!wl_wrk) + return -ENOMEM; + + wl_wrk->func = &erase_worker; + wl_wrk->e = e; + wl_wrk->torture = torture; + + schedule_ubi_work(ubi, wl_wrk); + return 0; +} + +/** + * wear_leveling_worker - wear-leveling worker function. + * @ubi: UBI device description object + * @wrk: the work object + * @cancel: non-zero if the worker has to free memory and exit + * + * This function copies a more worn out physical eraseblock to a less worn out + * one. Returns zero in case of success and a negative error code in case of + * failure. + */ +static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk, + int cancel) +{ + int err, put = 0; + struct ubi_wl_entry *e1, *e2; + struct ubi_vid_hdr *vid_hdr; + + kfree(wrk); + + if (cancel) + return 0; + + vid_hdr = ubi_zalloc_vid_hdr(ubi); + if (!vid_hdr) + return -ENOMEM; + + spin_lock(&ubi->wl_lock); + + /* + * Only one WL worker at a time is supported at this implementation, so + * make sure a PEB is not being moved already. + */ + if (ubi->move_to || tree_empty(&ubi->free) || + (tree_empty(&ubi->used) && tree_empty(&ubi->scrub))) { + /* + * Only one WL worker at a time is supported at this + * implementation, so if a LEB is already being moved, cancel. + * + * No free physical eraseblocks? Well, we cancel wear-leveling + * then. It will be triggered again when a free physical + * eraseblock appears. + * + * No used physical eraseblocks? They must be temporarily + * protected from being moved. They will be moved to the + * @ubi->used tree later and the wear-leveling will be + * triggered again. + */ + dbg_wl("cancel WL, a list is empty: free %d, used %d", + tree_empty(&ubi->free), tree_empty(&ubi->used)); + ubi->wl_scheduled = 0; + spin_unlock(&ubi->wl_lock); + ubi_free_vid_hdr(ubi, vid_hdr); + return 0; + } + + if (tree_empty(&ubi->scrub)) { + /* + * Now pick the least worn-out used physical eraseblock and a + * highly worn-out free physical eraseblock. If the erase + * counters differ much enough, start wear-leveling. 
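Both wear_leveling_worker() and ensure_wear_leveling() use the same trigger: compare the lowest erase counter in the used tree with the chosen candidate from the free tree and only move data when the gap reaches the wear-leveling threshold. Reduced to a predicate, with an illustrative threshold value:

    #include <stdbool.h>
    #include <stdio.h>

    #define WL_THRESHOLD 4   /* illustrative; the real value is configurable */

    /* Move data from the least worn used PEB to a worn free PEB? */
    static bool needs_wear_leveling(int min_used_ec, int max_free_ec)
    {
        return max_free_ec - min_used_ec >= WL_THRESHOLD;
    }

    int main(void)
    {
        printf("%d\n", needs_wear_leveling(10, 12));  /* 0: gap too small */
        printf("%d\n", needs_wear_leveling(10, 17));  /* 1: schedule a move */
        return 0;
    }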
+ */ + e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb); + e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); + + if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) { + dbg_wl("no WL needed: min used EC %d, max free EC %d", + e1->ec, e2->ec); + ubi->wl_scheduled = 0; + spin_unlock(&ubi->wl_lock); + ubi_free_vid_hdr(ubi, vid_hdr); + return 0; + } + used_tree_del(ubi, e1); + dbg_wl("move PEB %d EC %d to PEB %d EC %d", + e1->pnum, e1->ec, e2->pnum, e2->ec); + } else { + e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, rb); + e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); + scrub_tree_del(ubi, e1); + dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum); + } + + free_tree_del(ubi, e2); + ubi_assert(!ubi->move_from && !ubi->move_to); + ubi_assert(!ubi->move_to_put && !ubi->move_from_put); + ubi->move_from = e1; + ubi->move_to = e2; + spin_unlock(&ubi->wl_lock); + + /* + * Now we are going to copy physical eraseblock @e1->pnum to @e2->pnum. + * We so far do not know which logical eraseblock our physical + * eraseblock (@e1) belongs to. We have to read the volume identifier + * header first. + */ + + err = ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0); + if (err && err != UBI_IO_BITFLIPS) { + if (err == UBI_IO_PEB_FREE) { + /* + * We are trying to move PEB without a VID header. UBI + * always write VID headers shortly after the PEB was + * given, so we have a situation when it did not have + * chance to write it down because it was preempted. + * Just re-schedule the work, so that next time it will + * likely have the VID header in place. + */ + dbg_wl("PEB %d has no VID header", e1->pnum); + err = 0; + } else { + ubi_err("error %d while reading VID header from PEB %d", + err, e1->pnum); + if (err > 0) + err = -EIO; + } + goto error; + } + + err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr); + if (err) { + if (err == UBI_IO_BITFLIPS) + err = 0; + goto error; + } + + ubi_free_vid_hdr(ubi, vid_hdr); + spin_lock(&ubi->wl_lock); + if (!ubi->move_to_put) + used_tree_add(ubi, e2); + else + put = 1; + ubi->move_from = ubi->move_to = NULL; + ubi->move_from_put = ubi->move_to_put = 0; + ubi->wl_scheduled = 0; + spin_unlock(&ubi->wl_lock); + + if (put) { + /* + * Well, the target PEB was put meanwhile, schedule it for + * erasure. + */ + dbg_wl("PEB %d was put meanwhile, erase", e2->pnum); + err = schedule_erase(ubi, e2, 0); + if (err) { + kmem_cache_free(wl_entries_slab, e2); + ubi_ro_mode(ubi); + } + } + + err = schedule_erase(ubi, e1, 0); + if (err) { + kmem_cache_free(wl_entries_slab, e1); + ubi_ro_mode(ubi); + } + + dbg_wl("done"); + return err; + + /* + * Some error occurred. @e1 was not changed, so return it back. @e2 + * might be changed, schedule it for erasure. + */ +error: + if (err) + dbg_wl("error %d occurred, cancel operation", err); + ubi_assert(err <= 0); + + ubi_free_vid_hdr(ubi, vid_hdr); + spin_lock(&ubi->wl_lock); + ubi->wl_scheduled = 0; + if (ubi->move_from_put) + put = 1; + else + used_tree_add(ubi, e1); + ubi->move_from = ubi->move_to = NULL; + ubi->move_from_put = ubi->move_to_put = 0; + spin_unlock(&ubi->wl_lock); + + if (put) { + /* + * Well, the target PEB was put meanwhile, schedule it for + * erasure. 
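The tail of wear_leveling_worker() has to cope with the logical eraseblock being put while its data was in flight: if the corresponding put flag was set, the freshly written target PEB is no longer referenced and goes straight back to erasure, otherwise it joins the used tree. The decision on its own, as a tiny sketch:

    #include <stdbool.h>
    #include <stdio.h>

    enum disposition { ADD_TO_USED, SCHEDULE_ERASE };

    /* What happens to the copy target once the data move has finished. */
    static enum disposition settle_target(bool target_was_put)
    {
        return target_was_put ? SCHEDULE_ERASE : ADD_TO_USED;
    }

    int main(void)
    {
        printf("%d\n", settle_target(false));  /* 0: keep it in the used tree */
        printf("%d\n", settle_target(true));   /* 1: contents are stale, erase */
        return 0;
    }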
+ */ + dbg_wl("PEB %d was put meanwhile, erase", e1->pnum); + err = schedule_erase(ubi, e1, 0); + if (err) { + kmem_cache_free(wl_entries_slab, e1); + ubi_ro_mode(ubi); + } + } + + err = schedule_erase(ubi, e2, 0); + if (err) { + kmem_cache_free(wl_entries_slab, e2); + ubi_ro_mode(ubi); + } + + yield(); + return err; +} + +/** + * ensure_wear_leveling - schedule wear-leveling if it is needed. + * @ubi: UBI device description object + * + * This function checks if it is time to start wear-leveling and schedules it + * if yes. This function returns zero in case of success and a negative error + * code in case of failure. + */ +static int ensure_wear_leveling(struct ubi_device *ubi) +{ + int err = 0; + struct ubi_wl_entry *e1; + struct ubi_wl_entry *e2; + struct ubi_work *wrk; + + spin_lock(&ubi->wl_lock); + if (ubi->wl_scheduled) + /* Wear-leveling is already in the work queue */ + goto out_unlock; + + /* + * If the ubi->scrub tree is not empty, scrubbing is needed, and the + * the WL worker has to be scheduled anyway. + */ + if (tree_empty(&ubi->scrub)) { + if (tree_empty(&ubi->used) || tree_empty(&ubi->free)) + /* No physical eraseblocks - no deal */ + goto out_unlock; + + /* + * We schedule wear-leveling only if the difference between the + * lowest erase counter of used physical eraseblocks and a high + * erase counter of free physical eraseblocks is greater then + * %UBI_WL_THRESHOLD. + */ + e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb); + e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); + + if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) + goto out_unlock; + dbg_wl("schedule wear-leveling"); + } else + dbg_wl("schedule scrubbing"); + + ubi->wl_scheduled = 1; + spin_unlock(&ubi->wl_lock); + + wrk = kmalloc(sizeof(struct ubi_work), GFP_KERNEL); + if (!wrk) { + err = -ENOMEM; + goto out_cancel; + } + + wrk->func = &wear_leveling_worker; + schedule_ubi_work(ubi, wrk); + return err; + +out_cancel: + spin_lock(&ubi->wl_lock); + ubi->wl_scheduled = 0; +out_unlock: + spin_unlock(&ubi->wl_lock); + return err; +} + +/** + * erase_worker - physical eraseblock erase worker function. + * @ubi: UBI device description object + * @wl_wrk: the work object + * @cancel: non-zero if the worker has to free memory and exit + * + * This function erases a physical eraseblock and perform torture testing if + * needed. It also takes care about marking the physical eraseblock bad if + * needed. Returns zero in case of success and a negative error code in case of + * failure. + */ +static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, + int cancel) +{ + int err; + struct ubi_wl_entry *e = wl_wrk->e; + int pnum = e->pnum; + + if (cancel) { + dbg_wl("cancel erasure of PEB %d EC %d", pnum, e->ec); + kfree(wl_wrk); + kmem_cache_free(wl_entries_slab, e); + return 0; + } + + dbg_wl("erase PEB %d EC %d", pnum, e->ec); + + err = sync_erase(ubi, e, wl_wrk->torture); + if (!err) { + /* Fine, we've erased it successfully */ + kfree(wl_wrk); + + spin_lock(&ubi->wl_lock); + ubi->abs_ec += 1; + free_tree_add(ubi, e); + spin_unlock(&ubi->wl_lock); + + /* + * One more erase operation has happened, take care about protected + * physical eraseblocks. + */ + check_protection_over(ubi); + + /* And take care about wear-leveling */ + err = ensure_wear_leveling(ubi); + return err; + } + + kfree(wl_wrk); + kmem_cache_free(wl_entries_slab, e); + + if (err != -EIO) { + /* + * If this is not %-EIO, we have no idea what to do. 
Scheduling + * this physical eraseblock for erasure again would cause + * errors again and again. Well, lets switch to RO mode. + */ + ubi_ro_mode(ubi); + return err; + } + + /* It is %-EIO, the PEB went bad */ + + if (!ubi->bad_allowed) { + ubi_err("bad physical eraseblock %d detected", pnum); + ubi_ro_mode(ubi); + err = -EIO; + } else { + int need; + + spin_lock(&ubi->volumes_lock); + need = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs + 1; + if (need > 0) { + need = ubi->avail_pebs >= need ? need : ubi->avail_pebs; + ubi->avail_pebs -= need; + ubi->rsvd_pebs += need; + ubi->beb_rsvd_pebs += need; + if (need > 0) + ubi_msg("reserve more %d PEBs", need); + } + + if (ubi->beb_rsvd_pebs == 0) { + spin_unlock(&ubi->volumes_lock); + ubi_err("no reserved physical eraseblocks"); + ubi_ro_mode(ubi); + return -EIO; + } + + spin_unlock(&ubi->volumes_lock); + ubi_msg("mark PEB %d as bad", pnum); + + err = ubi_io_mark_bad(ubi, pnum); + if (err) { + ubi_ro_mode(ubi); + return err; + } + + spin_lock(&ubi->volumes_lock); + ubi->beb_rsvd_pebs -= 1; + ubi->bad_peb_count += 1; + ubi->good_peb_count -= 1; + ubi_calculate_reserved(ubi); + if (ubi->beb_rsvd_pebs == 0) + ubi_warn("last PEB from the reserved pool was used"); + spin_unlock(&ubi->volumes_lock); + } + + return err; +} + +/** + * ubi_wl_put_peb - return a physical eraseblock to the wear-leveling + * unit. + * @ubi: UBI device description object + * @pnum: physical eraseblock to return + * @torture: if this physical eraseblock has to be tortured + * + * This function is called to return physical eraseblock @pnum to the pool of + * free physical eraseblocks. The @torture flag has to be set if an I/O error + * occurred to this @pnum and it has to be tested. This function returns zero + * in case of success and a negative error code in case of failure. + */ +int ubi_wl_put_peb(struct ubi_device *ubi, int pnum, int torture) +{ + int err; + struct ubi_wl_entry *e; + + dbg_wl("PEB %d", pnum); + ubi_assert(pnum >= 0); + ubi_assert(pnum < ubi->peb_count); + + spin_lock(&ubi->wl_lock); + + e = ubi->lookuptbl[pnum]; + if (e == ubi->move_from) { + /* + * User is putting the physical eraseblock which was selected to + * be moved. It will be scheduled for erasure in the + * wear-leveling worker. + */ + dbg_wl("PEB %d is being moved", pnum); + ubi_assert(!ubi->move_from_put); + ubi->move_from_put = 1; + spin_unlock(&ubi->wl_lock); + return 0; + } else if (e == ubi->move_to) { + /* + * User is putting the physical eraseblock which was selected + * as the target the data is moved to. It may happen if the EBA + * unit already re-mapped the LEB but the WL unit did has not + * put the PEB to the "used" tree. + */ + dbg_wl("PEB %d is the target of data moving", pnum); + ubi_assert(!ubi->move_to_put); + ubi->move_to_put = 1; + spin_unlock(&ubi->wl_lock); + return 0; + } else { + if (in_wl_tree(e, &ubi->used)) + used_tree_del(ubi, e); + else if (in_wl_tree(e, &ubi->scrub)) + scrub_tree_del(ubi, e); + else + prot_tree_del(ubi, e->pnum); + } + spin_unlock(&ubi->wl_lock); + + err = schedule_erase(ubi, e, torture); + if (err) { + spin_lock(&ubi->wl_lock); + used_tree_add(ubi, e); + spin_unlock(&ubi->wl_lock); + } + + return err; +} + +/** + * ubi_wl_scrub_peb - schedule a physical eraseblock for scrubbing. + * @ubi: UBI device description object + * @pnum: the physical eraseblock to schedule + * + * If a bit-flip in a physical eraseblock is detected, this physical eraseblock + * needs scrubbing. 
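When an erase fails with -EIO, erase_worker() marks the PEB bad and tops up the bad-eraseblock reserve from the pool of available PEBs, taking at most what is actually available. The accounting step on its own, in a userspace sketch whose field names mirror the counters used in the patch but whose values are invented:

    #include <stdio.h>

    struct counters {
        int avail_pebs;      /* PEBs not yet given to any volume */
        int rsvd_pebs;       /* PEBs reserved for internal use */
        int beb_rsvd_pebs;   /* PEBs currently reserved for bad blocks */
        int beb_rsvd_level;  /* how many we would like to have reserved */
    };

    /* Claim up to the shortfall (plus one) from the available pool. */
    static int top_up_beb_reserve(struct counters *c)
    {
        int need = c->beb_rsvd_level - c->beb_rsvd_pebs + 1;

        if (need <= 0)
            return 0;
        if (need > c->avail_pebs)
            need = c->avail_pebs;   /* cannot take more than we have */
        c->avail_pebs -= need;
        c->rsvd_pebs += need;
        c->beb_rsvd_pebs += need;
        return need;
    }

    int main(void)
    {
        struct counters c = { .avail_pebs = 5, .rsvd_pebs = 20,
                              .beb_rsvd_pebs = 0, .beb_rsvd_level = 2 };

        printf("reserved %d more PEBs, %d still available\n",
               top_up_beb_reserve(&c), c.avail_pebs);  /* 3 more, 2 left */
        return 0;
    }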
This function schedules a physical eraseblock for + * scrubbing which is done in background. This function returns zero in case of + * success and a negative error code in case of failure. + */ +int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum) +{ + struct ubi_wl_entry *e; + + ubi_msg("schedule PEB %d for scrubbing", pnum); + +retry: + spin_lock(&ubi->wl_lock); + e = ubi->lookuptbl[pnum]; + if (e == ubi->move_from || in_wl_tree(e, &ubi->scrub)) { + spin_unlock(&ubi->wl_lock); + return 0; + } + + if (e == ubi->move_to) { + /* + * This physical eraseblock was used to move data to. The data + * was moved but the PEB was not yet inserted to the proper + * tree. We should just wait a little and let the WL worker + * proceed. + */ + spin_unlock(&ubi->wl_lock); + dbg_wl("the PEB %d is not in proper tree, retry", pnum); + yield(); + goto retry; + } + + if (in_wl_tree(e, &ubi->used)) + used_tree_del(ubi, e); + else + prot_tree_del(ubi, pnum); + + scrub_tree_add(ubi, e); + spin_unlock(&ubi->wl_lock); + + /* + * Technically scrubbing is the same as wear-leveling, so it is done + * by the WL worker. + */ + return ensure_wear_leveling(ubi); +} + +/** + * ubi_wl_flush - flush all pending works. + * @ubi: UBI device description object + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +int ubi_wl_flush(struct ubi_device *ubi) +{ + int err, pending_count; + + pending_count = ubi->works_count; + + dbg_wl("flush (%d pending works)", pending_count); + + /* + * Erase while the pending works queue is not empty, but not more then + * the number of currently pending works. + */ + while (pending_count-- > 0) { + err = do_work(ubi); + if (err) + return err; + } + + return 0; +} + +/** + * tree_destroy - destroy an RB-tree. + * @root: the root of the tree to destroy + */ +static void tree_destroy(struct rb_root *root) +{ + struct rb_node *rb; + struct ubi_wl_entry *e; + + rb = root->rb_node; + while (rb) { + if (rb->rb_left) + rb = rb->rb_left; + else if (rb->rb_right) + rb = rb->rb_right; + else { + e = rb_entry(rb, struct ubi_wl_entry, rb); + + rb = rb_parent(rb); + if (rb) { + if (rb->rb_left == &e->rb) + rb->rb_left = NULL; + else + rb->rb_right = NULL; + } + + kmem_cache_free(wl_entries_slab, e); + } + } +} + +/** + * ubi_thread - UBI background thread. + * @u: the UBI device description object pointer + */ +static int ubi_thread(void *u) +{ + int failures = 0; + struct ubi_device *ubi = u; + + ubi_msg("background thread \"%s\" started, PID %d", + ubi->bgt_name, current->pid); + + for (;;) { + int err; + + if (kthread_should_stop()) + goto out; + + if (try_to_freeze()) + continue; + + spin_lock(&ubi->wl_lock); + if (list_empty(&ubi->works) || ubi->ro_mode || + !ubi->thread_enabled) { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&ubi->wl_lock); + schedule(); + continue; + } + spin_unlock(&ubi->wl_lock); + + err = do_work(ubi); + if (err) { + ubi_err("%s: work failed with error code %d", + ubi->bgt_name, err); + if (failures++ > WL_MAX_FAILURES) { + /* + * Too many failures, disable the thread and + * switch to read-only mode. + */ + ubi_msg("%s: %d consecutive failures", + ubi->bgt_name, WL_MAX_FAILURES); + ubi_ro_mode(ubi); + break; + } + } else + failures = 0; + + cond_resched(); + } + +out: + dbg_wl("background thread \"%s\" is killed", ubi->bgt_name); + return 0; +} + +/** + * cancel_pending - cancel all pending works. 
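tree_destroy() frees an entire RB-tree without recursion: it walks down to a leaf, frees it, and clears the parent's child pointer so the parent eventually becomes a leaf itself. The same bottom-up idea on an ordinary binary tree, as a self-contained userspace sketch with plain parent pointers instead of rb_node:

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        struct node *left, *right, *parent;
        int val;
    };

    static struct node *new_node(struct node *parent, int val)
    {
        struct node *n = calloc(1, sizeof(*n));

        n->parent = parent;
        n->val = val;
        return n;
    }

    /* Free every node without recursion, in the spirit of tree_destroy(). */
    static void destroy(struct node *n)
    {
        while (n) {
            if (n->left)
                n = n->left;
            else if (n->right)
                n = n->right;
            else {
                struct node *parent = n->parent;

                if (parent) {          /* detach the leaf we free */
                    if (parent->left == n)
                        parent->left = NULL;
                    else
                        parent->right = NULL;
                }
                printf("freeing %d\n", n->val);
                free(n);
                n = parent;
            }
        }
    }

    int main(void)
    {
        struct node *root = new_node(NULL, 1);

        root->left = new_node(root, 2);
        root->right = new_node(root, 3);
        destroy(root);   /* frees 2, 3, then 1 */
        return 0;
    }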
+ * @ubi: UBI device description object + */ +static void cancel_pending(struct ubi_device *ubi) +{ + while (!list_empty(&ubi->works)) { + struct ubi_work *wrk; + + wrk = list_entry(ubi->works.next, struct ubi_work, list); + list_del(&wrk->list); + wrk->func(ubi, wrk, 1); + ubi->works_count -= 1; + ubi_assert(ubi->works_count >= 0); + } +} + +/** + * ubi_wl_init_scan - initialize the wear-leveling unit using scanning + * information. + * @ubi: UBI device description object + * @si: scanning information + * + * This function returns zero in case of success, and a negative error code in + * case of failure. + */ +int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si) +{ + int err; + struct rb_node *rb1, *rb2; + struct ubi_scan_volume *sv; + struct ubi_scan_leb *seb, *tmp; + struct ubi_wl_entry *e; + + + ubi->used = ubi->free = ubi->scrub = RB_ROOT; + ubi->prot.pnum = ubi->prot.aec = RB_ROOT; + spin_lock_init(&ubi->wl_lock); + ubi->max_ec = si->max_ec; + INIT_LIST_HEAD(&ubi->works); + + sprintf(ubi->bgt_name, UBI_BGT_NAME_PATTERN, ubi->ubi_num); + + ubi->bgt_thread = kthread_create(ubi_thread, ubi, ubi->bgt_name); + if (IS_ERR(ubi->bgt_thread)) { + err = PTR_ERR(ubi->bgt_thread); + ubi_err("cannot spawn \"%s\", error %d", ubi->bgt_name, + err); + return err; + } + + if (ubi_devices_cnt == 0) { + wl_entries_slab = kmem_cache_create("ubi_wl_entry_slab", + sizeof(struct ubi_wl_entry), + 0, 0, NULL, NULL); + if (!wl_entries_slab) + return -ENOMEM; + } + + err = -ENOMEM; + ubi->lookuptbl = kzalloc(ubi->peb_count * sizeof(void *), GFP_KERNEL); + if (!ubi->lookuptbl) + goto out_free; + + list_for_each_entry_safe(seb, tmp, &si->erase, u.list) { + cond_resched(); + + e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); + if (!e) + goto out_free; + + e->pnum = seb->pnum; + e->ec = seb->ec; + ubi->lookuptbl[e->pnum] = e; + if (schedule_erase(ubi, e, 0)) { + kmem_cache_free(wl_entries_slab, e); + goto out_free; + } + } + + list_for_each_entry(seb, &si->free, u.list) { + cond_resched(); + + e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); + if (!e) + goto out_free; + + e->pnum = seb->pnum; + e->ec = seb->ec; + ubi_assert(e->ec >= 0); + free_tree_add(ubi, e); + ubi->lookuptbl[e->pnum] = e; + } + + list_for_each_entry(seb, &si->corr, u.list) { + cond_resched(); + + e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); + if (!e) + goto out_free; + + e->pnum = seb->pnum; + e->ec = seb->ec; + ubi->lookuptbl[e->pnum] = e; + if (schedule_erase(ubi, e, 0)) { + kmem_cache_free(wl_entries_slab, e); + goto out_free; + } + } + + ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) { + ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) { + cond_resched(); + + e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); + if (!e) + goto out_free; + + e->pnum = seb->pnum; + e->ec = seb->ec; + ubi->lookuptbl[e->pnum] = e; + if (!seb->scrub) { + dbg_wl("add PEB %d EC %d to the used tree", + e->pnum, e->ec); + used_tree_add(ubi, e); + } else { + dbg_wl("add PEB %d EC %d to the scrub tree", + e->pnum, e->ec); + scrub_tree_add(ubi, e); + } + } + } + + if (WL_RESERVED_PEBS > ubi->avail_pebs) { + ubi_err("no enough physical eraseblocks (%d, need %d)", + ubi->avail_pebs, WL_RESERVED_PEBS); + goto out_free; + } + ubi->avail_pebs -= WL_RESERVED_PEBS; + ubi->rsvd_pebs += WL_RESERVED_PEBS; + + /* Schedule wear-leveling if needed */ + err = ensure_wear_leveling(ubi); + if (err) + goto out_free; + + return 0; + +out_free: + cancel_pending(ubi); + tree_destroy(&ubi->used); + tree_destroy(&ubi->free); + tree_destroy(&ubi->scrub); + 
kfree(ubi->lookuptbl); + if (ubi_devices_cnt == 0) + kmem_cache_destroy(wl_entries_slab); + return err; +} + +/** + * protection_trees_destroy - destroy the protection RB-trees. + * @ubi: UBI device description object + */ +static void protection_trees_destroy(struct ubi_device *ubi) +{ + struct rb_node *rb; + struct ubi_wl_prot_entry *pe; + + rb = ubi->prot.aec.rb_node; + while (rb) { + if (rb->rb_left) + rb = rb->rb_left; + else if (rb->rb_right) + rb = rb->rb_right; + else { + pe = rb_entry(rb, struct ubi_wl_prot_entry, rb_aec); + + rb = rb_parent(rb); + if (rb) { + if (rb->rb_left == &pe->rb_aec) + rb->rb_left = NULL; + else + rb->rb_right = NULL; + } + + kmem_cache_free(wl_entries_slab, pe->e); + kfree(pe); + } + } +} + +/** + * ubi_wl_close - close the wear-leveling unit. + * @ubi: UBI device description object + */ +void ubi_wl_close(struct ubi_device *ubi) +{ + dbg_wl("disable \"%s\"", ubi->bgt_name); + if (ubi->bgt_thread) + kthread_stop(ubi->bgt_thread); + + dbg_wl("close the UBI wear-leveling unit"); + + cancel_pending(ubi); + protection_trees_destroy(ubi); + tree_destroy(&ubi->used); + tree_destroy(&ubi->free); + tree_destroy(&ubi->scrub); + kfree(ubi->lookuptbl); + if (ubi_devices_cnt == 1) + kmem_cache_destroy(wl_entries_slab); +} + +#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID + +/** + * paranoid_check_ec - make sure that the erase counter of a physical eraseblock + * is correct. + * @ubi: UBI device description object + * @pnum: the physical eraseblock number to check + * @ec: the erase counter to check + * + * This function returns zero if the erase counter of physical eraseblock @pnum + * is equivalent to @ec, %1 if not, and a negative error code if an error + * occurred. + */ +static int paranoid_check_ec(const struct ubi_device *ubi, int pnum, int ec) +{ + int err; + long long read_ec; + struct ubi_ec_hdr *ec_hdr; + + ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL); + if (!ec_hdr) + return -ENOMEM; + + err = ubi_io_read_ec_hdr(ubi, pnum, ec_hdr, 0); + if (err && err != UBI_IO_BITFLIPS) { + /* The header does not have to exist */ + err = 0; + goto out_free; + } + + read_ec = ubi64_to_cpu(ec_hdr->ec); + if (ec != read_ec) { + ubi_err("paranoid check failed for PEB %d", pnum); + ubi_err("read EC is %lld, should be %d", read_ec, ec); + ubi_dbg_dump_stack(); + err = 1; + } else + err = 0; + +out_free: + kfree(ec_hdr); + return err; +} + +/** + * paranoid_check_in_wl_tree - make sure that a wear-leveling entry is present + * in a WL RB-tree. + * @e: the wear-leveling entry to check + * @root: the root of the tree + * + * This function returns zero if @e is in the @root RB-tree and %1 if it + * is not. 
+ */ +static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e, + struct rb_root *root) +{ + if (in_wl_tree(e, root)) + return 0; + + ubi_err("paranoid check failed for PEB %d, EC %d, RB-tree %p ", + e->pnum, e->ec, root); + ubi_dbg_dump_stack(); + return 1; +} + +#endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */ diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index abb90c0c09c..8a649f60276 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -672,6 +672,13 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) { return ret; } + /* and an UBI volume */ + if (jffs2_ubivol(c)) { + ret = jffs2_ubivol_setup(c); + if (ret) + return ret; + } + return ret; } @@ -690,4 +697,9 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) { if (jffs2_nor_wbuf_flash(c)) { jffs2_nor_wbuf_flash_cleanup(c); } + + /* and an UBI volume */ + if (jffs2_ubivol(c)) { + jffs2_ubivol_cleanup(c); + } } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index e07a0edcdb4..8d92e45168c 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -98,6 +98,9 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) #define jffs2_nor_wbuf_flash(c) (0) #define jffs2_nor_wbuf_flash_setup(c) (0) #define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0) +#define jffs2_ubivol(c) (0) +#define jffs2_ubivol_setup(c) (0) +#define jffs2_ubivol_cleanup(c) do {} while (0) #else /* NAND and/or ECC'd NOR support present */ @@ -133,6 +136,9 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c); #define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH) int jffs2_dataflash_setup(struct jffs2_sb_info *c); void jffs2_dataflash_cleanup(struct jffs2_sb_info *c); +#define jffs2_ubivol(c) (c->mtd->type == MTD_UBIVOLUME) +int jffs2_ubivol_setup(struct jffs2_sb_info *c); +void jffs2_ubivol_cleanup(struct jffs2_sb_info *c); #define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! 
(c->mtd->flags & MTD_BIT_WRITEABLE)) int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 4fac6dd5395..ab86031b3c0 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1208,3 +1208,27 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) { void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { kfree(c->wbuf); } + +int jffs2_ubivol_setup(struct jffs2_sb_info *c) { + c->cleanmarker_size = 0; + + if (c->mtd->writesize == 1) + /* We do not need write-buffer */ + return 0; + + init_rwsem(&c->wbuf_sem); + + c->wbuf_pagesize = c->mtd->writesize; + c->wbuf_ofs = 0xFFFFFFFF; + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + + printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size); + + return 0; +} + +void jffs2_ubivol_cleanup(struct jffs2_sb_info *c) { + kfree(c->wbuf); +} diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f27e5378caf..a0c8667caa7 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -27,6 +27,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/swap.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -34,6 +35,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "aops.h" #include "dlmglue.h" #include "extent_map.h" #include "inode.h" @@ -47,63 +49,243 @@ #include "buffer_head_io.h" -static int ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - u64 blkno); +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); -static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - int wanted, - struct ocfs2_alloc_context *meta_ac, - struct buffer_head *bhs[]); +/* + * Structures which describe a path through a btree, and functions to + * manipulate them. + * + * The idea here is to be as generic as possible with the tree + * manipulation code. + */ +struct ocfs2_path_item { + struct buffer_head *bh; + struct ocfs2_extent_list *el; +}; -static int ocfs2_add_branch(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - struct buffer_head *eb_bh, - struct buffer_head *last_eb_bh, - struct ocfs2_alloc_context *meta_ac); +#define OCFS2_MAX_PATH_DEPTH 5 -static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_alloc_context *meta_ac, - struct buffer_head **ret_new_eb_bh); +struct ocfs2_path { + int p_tree_depth; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +}; -static int ocfs2_do_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 blkno, - u32 new_clusters); +#define path_root_bh(_path) ((_path)->p_node[0].bh) +#define path_root_el(_path) ((_path)->p_node[0].el) +#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) +#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) +#define path_num_items(_path) ((_path)->p_tree_depth + 1) -static int ocfs2_find_branch_target(struct ocfs2_super *osb, - struct inode *inode, - struct buffer_head *fe_bh, - struct buffer_head **target_bh); +/* + * Reset the actual path elements so that we can re-use the structure + * to build another path. Generally, this involves freeing the buffer + * heads. 
+ */ +static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +{ + int i, start = 0, depth = 0; + struct ocfs2_path_item *node; -static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, - struct inode *inode, - struct ocfs2_dinode *fe, - unsigned int new_i_clusters, - struct buffer_head *old_last_eb, - struct buffer_head **new_last_eb); + if (keep_root) + start = 1; + + for(i = start; i < path_num_items(path); i++) { + node = &path->p_node[i]; + + brelse(node->bh); + node->bh = NULL; + node->el = NULL; + } + + /* + * Tree depth may change during truncate, or insert. If we're + * keeping the root extent list, then make sure that our path + * structure reflects the proper depth. + */ + if (keep_root) + depth = le16_to_cpu(path_root_el(path)->l_tree_depth); + + path->p_tree_depth = depth; +} + +static void ocfs2_free_path(struct ocfs2_path *path) +{ + if (path) { + ocfs2_reinit_path(path, 0); + kfree(path); + } +} + +/* + * Make the *dest path the same as src and re-initialize src path to + * have a root only. + */ +static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + brelse(dest->p_node[i].bh); + + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + src->p_node[i].bh = NULL; + src->p_node[i].el = NULL; + } +} + +/* + * Insert an extent block at given index. + * + * This will not take an additional reference on eb_bh. + */ +static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, + struct buffer_head *eb_bh) +{ + struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; + + /* + * Right now, no root bh is an extent block, so this helps + * catch code errors with dinode trees. The assertion can be + * safely removed if we ever need to insert extent block + * structures at the root. + */ + BUG_ON(index == 0); + + path->p_node[index].bh = eb_bh; + path->p_node[index].el = &eb->h_list; +} + +static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, + struct ocfs2_extent_list *root_el) +{ + struct ocfs2_path *path; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); + + path = kzalloc(sizeof(*path), GFP_NOFS); + if (path) { + path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); + get_bh(root_bh); + path_root_bh(path) = root_bh; + path_root_el(path) = root_el; + } + + return path; +} + +/* + * Allocate and initialize a new path based on a disk inode tree. + */ +static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *el = &di->id2.i_list; + + return ocfs2_new_path(di_bh, el); +} + +/* + * Convenience function to journal all components in a path. 
+ */ +static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, + struct ocfs2_path *path) +{ + int i, ret = 0; + + if (!path) + goto out; + + for(i = 0; i < path_num_items(path); i++) { + ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + +out: + return ret; +} + +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT +}; -static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); -static int ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - u64 blkno) +/* + * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and + * ocfs2_extent_contig only work properly against leaf nodes! + */ +static int ocfs2_block_extent_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + u64 blkno) +{ + u64 blk_end = le64_to_cpu(ext->e_blkno); + + blk_end += ocfs2_clusters_to_blocks(sb, + le16_to_cpu(ext->e_leaf_clusters)); + + return blkno == blk_end; +} + +static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, + struct ocfs2_extent_rec *right) +{ + u32 left_range; + + left_range = le32_to_cpu(left->e_cpos) + + le16_to_cpu(left->e_leaf_clusters); + + return (left_range == le32_to_cpu(right->e_cpos)); +} + +static enum ocfs2_contig_type + ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) { - return blkno == (le64_to_cpu(ext->e_blkno) + - ocfs2_clusters_to_blocks(inode->i_sb, - le32_to_cpu(ext->e_clusters))); + u64 blkno = le64_to_cpu(insert_rec->e_blkno); + + if (ocfs2_extents_adjacent(ext, insert_rec) && + ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) + return CONTIG_RIGHT; + + blkno = le64_to_cpu(ext->e_blkno); + if (ocfs2_extents_adjacent(insert_rec, ext) && + ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) + return CONTIG_LEFT; + + return CONTIG_NONE; } /* + * NOTE: We can have pretty much any combination of contiguousness and + * appending. + * + * The usefulness of APPEND_TAIL is more in that it lets us know that + * we'll have to update the path to that leaf. + */ +enum ocfs2_append_type { + APPEND_NONE = 0, + APPEND_TAIL, +}; + +struct ocfs2_insert_type { + enum ocfs2_append_type ins_appending; + enum ocfs2_contig_type ins_contig; + int ins_contig_index; + int ins_free_records; + int ins_tree_depth; +}; + +/* * How many free extents have we got before we need more meta data? */ int ocfs2_num_free_extents(struct ocfs2_super *osb, @@ -242,6 +424,28 @@ bail: } /* + * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). + * + * Returns the sum of the rightmost extent rec logical offset and + * cluster count. + * + * ocfs2_add_branch() uses this to determine what logical cluster + * value should be populated into the leftmost new branch records. + * + * ocfs2_shift_tree_depth() uses this to determine the # clusters + * value for the new topmost tree record. + */ +static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) +{ + int i; + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + return le32_to_cpu(el->l_recs[i].e_cpos) + + ocfs2_rec_clusters(el, &el->l_recs[i]); +} + +/* * Add an entire tree branch to our inode. eb_bh is the extent block * to start at, if we don't want to start the branch at the dinode * structure. @@ -250,7 +454,7 @@ bail: * for the new last extent block. * * the new branch will be 'empty' in the sense that every block will - * contain a single record with e_clusters == 0. 
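The new ocfs2_extent_contig() classifies how a record to be inserted relates to an existing one: CONTIG_RIGHT when it begins exactly where the existing record ends, both in logical clusters and in disk blocks, CONTIG_LEFT for the mirror case, and CONTIG_NONE otherwise. With byte-order conversion and the real cluster-to-block translation stripped away, the check reduces to the sketch below; the blocks-per-cluster constant is illustrative only:

    #include <stdio.h>

    #define BLOCKS_PER_CLUSTER 8   /* illustrative; depends on the filesystem */

    struct rec {
        unsigned int cpos;         /* logical start, in clusters */
        unsigned int clusters;     /* length, in clusters */
        unsigned long long blkno;  /* physical start, in blocks */
    };

    enum contig { CONTIG_NONE, CONTIG_LEFT, CONTIG_RIGHT };

    /* Does @r start exactly where @l ends, logically and physically? */
    static int adjacent(const struct rec *l, const struct rec *r)
    {
        return l->cpos + l->clusters == r->cpos &&
               l->blkno + (unsigned long long)l->clusters * BLOCKS_PER_CLUSTER
                       == r->blkno;
    }

    static enum contig classify(const struct rec *ext, const struct rec *ins)
    {
        if (adjacent(ext, ins))
            return CONTIG_RIGHT;  /* insert glues onto ext's right end */
        if (adjacent(ins, ext))
            return CONTIG_LEFT;   /* insert glues onto ext's left end */
        return CONTIG_NONE;
    }

    int main(void)
    {
        struct rec ext = { .cpos = 0, .clusters = 4, .blkno = 100 };
        struct rec ins = { .cpos = 4, .clusters = 2, .blkno = 132 };

        printf("%d\n", classify(&ext, &ins));  /* 2 == CONTIG_RIGHT */
        return 0;
    }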
+ * contain a single record with cluster count == 0. */ static int ocfs2_add_branch(struct ocfs2_super *osb, handle_t *handle, @@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; + u32 new_cpos; mlog_entry_void(); @@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } + eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; + new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. * conversly, new_eb_bhs[0] is the new bottommost leaf. @@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, eb->h_next_leaf_blk = 0; eb_el->l_tree_depth = cpu_to_le16(i); eb_el->l_next_free_rec = cpu_to_le16(1); - eb_el->l_recs[0].e_cpos = fe->i_clusters; + /* + * This actually counts as an empty extent as + * c_clusters == 0 + */ + eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); - eb_el->l_recs[0].e_clusters = cpu_to_le32(0); + /* + * eb_el isn't always an interior node, but even leaf + * nodes want a zero'd flags and reserved field so + * this gets the whole 32 bits regardless of use. + */ + eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); if (!eb_el->l_tree_depth) new_last_eb_blk = le64_to_cpu(eb->h_blkno); @@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * either be on the fe, or the extent block passed in. */ i = le16_to_cpu(el->l_next_free_rec); el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); - el->l_recs[i].e_cpos = fe->i_clusters; - el->l_recs[i].e_clusters = 0; + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); + el->l_recs[i].e_int_clusters = 0; le16_add_cpu(&el->l_next_free_rec, 1); /* fe needs a new last extent block pointer, as does the @@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, struct buffer_head **ret_new_eb_bh) { int status, i; + u32 new_clusters; struct buffer_head *new_eb_bh = NULL; struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; @@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, /* copy the fe data into the new extent block */ eb_el->l_tree_depth = fe_el->l_tree_depth; eb_el->l_next_free_rec = fe_el->l_next_free_rec; - for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { - eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; - eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters; - eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno; - } + for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) + eb_el->l_recs[i] = fe_el->l_recs[i]; status = ocfs2_journal_dirty(handle, new_eb_bh); if (status < 0) { @@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } + new_clusters = ocfs2_sum_rightmost_rec(eb_el); + /* update fe now */ le16_add_cpu(&fe_el->l_tree_depth, 1); fe_el->l_recs[0].e_cpos = 0; fe_el->l_recs[0].e_blkno = eb->h_blkno; - fe_el->l_recs[0].e_clusters = fe->i_clusters; - for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { - fe_el->l_recs[i].e_cpos = 0; - fe_el->l_recs[i].e_clusters = 0; - fe_el->l_recs[i].e_blkno = 0; - } + fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); + for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) + memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); fe_el->l_next_free_rec = cpu_to_le16(1); /* If this is our 1st tree depth shift, then last_eb_blk @@ -515,199 +729,6 @@ bail: } /* 
- * Expects the tree to already have room in the rightmost leaf for the - * extent. Updates all the extent blocks (and the dinode) on the way - * down. - */ -static int ocfs2_do_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 start_blk, - u32 new_clusters) -{ - int status, i, num_bhs = 0; - u64 next_blkno; - u16 next_free; - struct buffer_head **eb_bhs = NULL; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; - - mlog_entry_void(); - - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; - if (el->l_tree_depth) { - /* This is another operation where we want to be - * careful about our tree updates. An error here means - * none of the previous changes we made should roll - * forward. As a result, we have to record the buffers - * for this part of the tree in an array and reserve a - * journal write to them before making any changes. */ - num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); - eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), - GFP_KERNEL); - if (!eb_bhs) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - i = 0; - while(el->l_tree_depth) { - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = -EIO; - goto bail; - } - next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); - - BUG_ON(i >= num_bhs); - status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], - OCFS2_BH_CACHED, inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, - eb); - status = -EIO; - goto bail; - } - - status = ocfs2_journal_access(handle, inode, eb_bhs[i], - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - el = &eb->h_list; - i++; - /* When we leave this loop, eb_bhs[num_bhs - 1] will - * hold the bottom-most leaf extent block. */ - } - BUG_ON(el->l_tree_depth); - - el = &fe->id2.i_list; - /* If we have tree depth, then the fe update is - * trivial, and we want to switch el out for the - * bottom-most leaf in order to update it with the - * actual extent data below. */ - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = -EIO; - goto bail; - } - le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, - new_clusters); - /* (num_bhs - 1) to avoid the leaf */ - for(i = 0; i < (num_bhs - 1); i++) { - eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; - el = &eb->h_list; - - /* finally, make our actual change to the - * intermediate extent blocks. */ - next_free = le16_to_cpu(el->l_next_free_rec); - le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, - new_clusters); - - status = ocfs2_journal_dirty(handle, eb_bhs[i]); - if (status < 0) - mlog_errno(status); - } - BUG_ON(i != (num_bhs - 1)); - /* note that the leaf block wasn't touched in - * the loop above */ - eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; - el = &eb->h_list; - BUG_ON(el->l_tree_depth); - } - - /* yay, we can finally add the actual extent now! 
*/ - i = le16_to_cpu(el->l_next_free_rec) - 1; - if (le16_to_cpu(el->l_next_free_rec) && - ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { - le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); - } else if (le16_to_cpu(el->l_next_free_rec) && - (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { - /* having an empty extent at eof is legal. */ - if (el->l_recs[i].e_cpos != fe->i_clusters) { - ocfs2_error(inode->i_sb, - "Dinode %llu trailing extent is bad: " - "cpos (%u) != number of clusters (%u)", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - le32_to_cpu(el->l_recs[i].e_cpos), - le32_to_cpu(fe->i_clusters)); - status = -EIO; - goto bail; - } - el->l_recs[i].e_blkno = cpu_to_le64(start_blk); - el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); - } else { - /* No contiguous record, or no empty record at eof, so - * we add a new one. */ - - BUG_ON(le16_to_cpu(el->l_next_free_rec) >= - le16_to_cpu(el->l_count)); - i = le16_to_cpu(el->l_next_free_rec); - - el->l_recs[i].e_blkno = cpu_to_le64(start_blk); - el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); - el->l_recs[i].e_cpos = fe->i_clusters; - le16_add_cpu(&el->l_next_free_rec, 1); - } - - /* - * extent_map errors are not fatal, so they are ignored outside - * of flushing the thing. - */ - status = ocfs2_extent_map_append(inode, &el->l_recs[i], - new_clusters); - if (status) { - mlog_errno(status); - ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); - } - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) - mlog_errno(status); - if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); - if (status < 0) - mlog_errno(status); - } - - status = 0; -bail: - if (eb_bhs) { - for (i = 0; i < num_bhs; i++) - if (eb_bhs[i]) - brelse(eb_bhs[i]); - kfree(eb_bhs); - } - - mlog_exit(status); - return status; -} - -/* * Should only be called when there is no space left in any of the * leaf nodes. What we want to do is find the lowest tree depth * non-leaf extent block with room for new records. There are three @@ -807,53 +828,1548 @@ bail: return status; } -/* the caller needs to update fe->i_clusters */ -int ocfs2_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 start_blk, - u32 new_clusters, - struct ocfs2_alloc_context *meta_ac) +/* + * This is only valid for leaf nodes, which are the only ones that can + * have empty extents anyway. + */ +static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) { - int status, i, shift; - struct buffer_head *last_eb_bh = NULL; + return !rec->e_leaf_clusters; +} + +/* + * This function will discard the rightmost extent record. + */ +static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + int count = le16_to_cpu(el->l_count); + unsigned int num_bytes; + + BUG_ON(!next_free); + /* This will cause us to go off the end of our extent list. 
*/ + BUG_ON(next_free >= count); + + num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; + + memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); +} + +static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i, insert_index, next_free, has_empty, num_bytes; + u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + next_free = le16_to_cpu(el->l_next_free_rec); + has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); + + BUG_ON(!next_free); + + /* The tree code before us didn't allow enough room in the leaf. */ + if (el->l_next_free_rec == el->l_count && !has_empty) + BUG(); + + /* + * The easiest way to approach this is to just remove the + * empty extent and temporarily decrement next_free. + */ + if (has_empty) { + /* + * If next_free was 1 (only an empty extent), this + * loop won't execute, which is fine. We still want + * the decrement above to happen. + */ + for(i = 0; i < (next_free - 1); i++) + el->l_recs[i] = el->l_recs[i+1]; + + next_free--; + } + + /* + * Figure out what the new record index should be. + */ + for(i = 0; i < next_free; i++) { + rec = &el->l_recs[i]; + + if (insert_cpos < le32_to_cpu(rec->e_cpos)) + break; + } + insert_index = i; + + mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", + insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); + + BUG_ON(insert_index < 0); + BUG_ON(insert_index >= le16_to_cpu(el->l_count)); + BUG_ON(insert_index > next_free); + + /* + * No need to memmove if we're just adding to the tail. + */ + if (insert_index != next_free) { + BUG_ON(next_free >= le16_to_cpu(el->l_count)); + + num_bytes = next_free - insert_index; + num_bytes *= sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[insert_index + 1], + &el->l_recs[insert_index], + num_bytes); + } + + /* + * Either we had an empty extent, and need to re-increment or + * there was no empty extent on a non full rightmost leaf node, + * in which case we still need to increment. + */ + next_free++; + el->l_next_free_rec = cpu_to_le16(next_free); + /* + * Make sure none of the math above just messed up our tree. + */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); + + el->l_recs[insert_index] = *insert_rec; + +} + +/* + * Create an empty extent record . + * + * l_next_free_rec may be updated. + * + * If an empty extent already exists do nothing. + */ +static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (next_free == 0) + goto set_and_inc; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) + return; + + mlog_bug_on_msg(el->l_count == el->l_next_free_rec, + "Asked to create an empty extent in a full list:\n" + "count = %u, tree depth = %u", + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_tree_depth)); + + ocfs2_shift_records_right(el); + +set_and_inc: + le16_add_cpu(&el->l_next_free_rec, 1); + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); +} + +/* + * For a rotation which involves two leaf nodes, the "root node" is + * the lowest level tree node which contains a path to both leafs. This + * resulting set of information can be used to form a complete "subtree" + * + * This function is passed two full paths from the dinode down to a + * pair of adjacent leaves. It's task is to figure out which path + * index contains the subtree root - this can be the root index itself + * in a worst-case rotation. 
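ocfs2_rotate_leaf() makes room for a new record in a leaf kept sorted by e_cpos: it drops a leading empty record if one exists, finds the first record whose cpos is larger than the one being inserted, and shifts the tail of the array right by one slot. The index search and shift on a plain array, as a simplified userspace sketch without the empty-extent handling:

    #include <stdio.h>
    #include <string.h>

    struct rec { unsigned int cpos; unsigned int clusters; };

    /* Insert @ins into @recs, sorted by cpos, @n used slots, room for one more. */
    static int insert_sorted(struct rec *recs, int n, struct rec ins)
    {
        int idx = 0;

        while (idx < n && recs[idx].cpos <= ins.cpos)
            idx++;
        /* shift the tail right by one record, as ocfs2_rotate_leaf() does */
        memmove(&recs[idx + 1], &recs[idx], (n - idx) * sizeof(*recs));
        recs[idx] = ins;
        return n + 1;
    }

    int main(void)
    {
        struct rec recs[4] = { { 0, 4 }, { 10, 2 } };
        int n = 2;

        n = insert_sorted(recs, n, (struct rec){ 6, 3 });
        for (int i = 0; i < n; i++)
            printf("cpos %u len %u\n", recs[i].cpos, recs[i].clusters);
        return 0;
    }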
+ * + * The array index of the subtree root is passed back. + */ +static int ocfs2_find_subtree_root(struct inode *inode, + struct ocfs2_path *left, + struct ocfs2_path *right) +{ + int i = 0; + + /* + * Check that the caller passed in two paths from the same tree. + */ + BUG_ON(path_root_bh(left) != path_root_bh(right)); + + do { + i++; + + /* + * The caller didn't pass two adjacent paths. + */ + mlog_bug_on_msg(i > left->p_tree_depth, + "Inode %lu, left depth %u, right depth %u\n" + "left leaf blk %llu, right leaf blk %llu\n", + inode->i_ino, left->p_tree_depth, + right->p_tree_depth, + (unsigned long long)path_leaf_bh(left)->b_blocknr, + (unsigned long long)path_leaf_bh(right)->b_blocknr); + } while (left->p_node[i].bh->b_blocknr == + right->p_node[i].bh->b_blocknr); + + return i - 1; +} + +typedef void (path_insert_t)(void *, struct buffer_head *); + +/* + * Traverse a btree path in search of cpos, starting at root_el. + * + * This code can be called with a cpos larger than the tree, in which + * case it will return the rightmost path. + */ +static int __ocfs2_find_path(struct inode *inode, + struct ocfs2_extent_list *root_el, u32 cpos, + path_insert_t *func, void *data) +{ + int i, ret = 0; + u32 range; + u64 blkno; struct buffer_head *bh = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct ocfs2_inode_info *oi = OCFS2_I(inode); - mlog_entry_void(); + el = root_el; + while (el->l_tree_depth) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent list at " + "depth %u\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; - mlog(0, "add %u clusters starting at block %llu to inode %llu\n", - new_clusters, (unsigned long long)start_blk, - (unsigned long long)OCFS2_I(inode)->ip_blkno); + } - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; + for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { + rec = &el->l_recs[i]; + + /* + * In the case that cpos is off the allocation + * tree, this should just wind up returning the + * rightmost record. 
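+ * (If no record's range covers cpos, the loop simply terminates with + * i pointing at the last used record.)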
+ */ + range = le32_to_cpu(rec->e_cpos) + + ocfs2_rec_clusters(el, rec); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + break; + } - if (el->l_tree_depth) { - /* jump to end of tree */ - status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); - if (status < 0) { - mlog_exit(status); - goto bail; + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (blkno == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has bad blkno in extent list " + "at depth %u (index %d)\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(el->l_tree_depth), i); + ret = -EROFS; + goto out; } - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + + brelse(bh); + bh = NULL; + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, + &bh, OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; el = &eb->h_list; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + ret = -EIO; + goto out; + } + + if (le16_to_cpu(el->l_next_free_rec) > + le16_to_cpu(el->l_count)) { + ocfs2_error(inode->i_sb, + "Inode %llu has bad count in extent list " + "at block %llu (next free=%u, count=%u)\n", + (unsigned long long)oi->ip_blkno, + (unsigned long long)bh->b_blocknr, + le16_to_cpu(el->l_next_free_rec), + le16_to_cpu(el->l_count)); + ret = -EROFS; + goto out; + } + + if (func) + func(data, bh); + } + +out: + /* + * Catch any trailing bh that the loop didn't handle. + */ + brelse(bh); + + return ret; +} + +/* + * Given an initialized path (that is, it has a valid root extent + * list), this function will traverse the btree in search of the path + * which would contain cpos. + * + * The path traveled is recorded in the path structure. + * + * Note that this will not do any comparisons on leaf node extent + * records, so it will work fine in the case that we just added a tree + * branch. + */ +struct find_path_data { + int index; + struct ocfs2_path *path; +}; +static void find_path_ins(void *data, struct buffer_head *bh) +{ + struct find_path_data *fp = data; + + get_bh(bh); + ocfs2_path_insert_eb(fp->path, fp->index, bh); + fp->index++; +} +static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, + u32 cpos) +{ + struct find_path_data data; + + data.index = 1; + data.path = path; + return __ocfs2_find_path(inode, path_root_el(path), cpos, + find_path_ins, &data); +} + +static void find_leaf_ins(void *data, struct buffer_head *bh) +{ + struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; + struct ocfs2_extent_list *el = &eb->h_list; + struct buffer_head **ret = data; + + /* We want to retain only the leaf block. */ + if (le16_to_cpu(el->l_tree_depth) == 0) { + get_bh(bh); + *ret = bh; + } +} +/* + * Find the leaf block in the tree which would contain cpos. No + * checking of the actual leaf is done. + * + * Some paths want to call this instead of allocating a path structure + * and calling ocfs2_find_path(). + * + * This function doesn't handle non btree extent lists. + */ +int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, + u32 cpos, struct buffer_head **leaf_bh) +{ + int ret; + struct buffer_head *bh = NULL; + + ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *leaf_bh = bh; +out: + return ret; +} + +/* + * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. 
+ * + * Basically, we've moved stuff around at the bottom of the tree and + * we need to fix up the extent records above the changes to reflect + * the new changes. + * + * left_rec: the record on the left. + * left_child_el: is the child list pointed to by left_rec + * right_rec: the record to the right of left_rec + * right_child_el: is the child list pointed to by right_rec + * + * By definition, this only works on interior nodes. + */ +static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, + struct ocfs2_extent_list *left_child_el, + struct ocfs2_extent_rec *right_rec, + struct ocfs2_extent_list *right_child_el) +{ + u32 left_clusters, right_end; + + /* + * Interior nodes never have holes. Their cpos is the cpos of + * the leftmost record in their child list. Their cluster + * count covers the full theoretical range of their child list + * - the range between their cpos and the cpos of the record + * immediately to their right. + */ + left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); + left_clusters -= le32_to_cpu(left_rec->e_cpos); + left_rec->e_int_clusters = cpu_to_le32(left_clusters); + + /* + * Calculate the rightmost cluster count boundary before + * moving cpos - we will need to adjust clusters after + * updating e_cpos to keep the same highest cluster count. + */ + right_end = le32_to_cpu(right_rec->e_cpos); + right_end += le32_to_cpu(right_rec->e_int_clusters); + + right_rec->e_cpos = left_rec->e_cpos; + le32_add_cpu(&right_rec->e_cpos, left_clusters); + + right_end -= le32_to_cpu(right_rec->e_cpos); + right_rec->e_int_clusters = cpu_to_le32(right_end); +} + +/* + * Adjust the adjacent root node records involved in a + * rotation. left_el_blkno is passed in as a key so that we can easily + * find it's index in the root list. + */ +static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, + struct ocfs2_extent_list *left_el, + struct ocfs2_extent_list *right_el, + u64 left_el_blkno) +{ + int i; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= + le16_to_cpu(left_el->l_tree_depth)); + + for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { + if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) + break; + } + + /* + * The path walking code should have never returned a root and + * two paths which are not adjacent. + */ + BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); + + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + &root_el->l_recs[i + 1], right_el); +} + +/* + * We've changed a leaf block (in right_path) and need to reflect that + * change back up the subtree. + * + * This happens in multiple places: + * - When we've moved an extent record from the left path leaf to the right + * path leaf to make room for an empty extent in the left path leaf. + * - When our insert into the right path leaf is at the leftmost edge + * and requires an update of the path immediately to it's left. This + * can occur at the end of some types of rotation and appending inserts. + */ +static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int ret, i, idx; + struct ocfs2_extent_list *el, *left_el, *right_el; + struct ocfs2_extent_rec *left_rec, *right_rec; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + + /* + * Update the counts and position values within all the + * interior nodes to reflect the leaf rotation we just did. + * + * The root node is handled below the loop. 
+ * + * We begin the loop with right_el and left_el pointing to the + * leaf lists and work our way up. + * + * NOTE: within this loop, left_el and right_el always refer + * to the *child* lists. + */ + left_el = path_leaf_el(left_path); + right_el = path_leaf_el(right_path); + for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { + mlog(0, "Adjust records at index %u\n", i); + + /* + * One nice property of knowing that all of these + * nodes are below the root is that we only deal with + * the leftmost right node record and the rightmost + * left node record. + */ + el = left_path->p_node[i].el; + idx = le16_to_cpu(left_el->l_next_free_rec) - 1; + left_rec = &el->l_recs[idx]; + + el = right_path->p_node[i].el; + right_rec = &el->l_recs[0]; + + ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, + right_el); + + ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); + if (ret) + mlog_errno(ret); + + ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); + if (ret) + mlog_errno(ret); + + /* + * Setup our list pointers now so that the current + * parents become children in the next iteration. + */ + left_el = left_path->p_node[i].el; + right_el = right_path->p_node[i].el; + } + + /* + * At the root node, adjust the two adjacent records which + * begin our path to the leaves. + */ + + el = left_path->p_node[subtree_index].el; + left_el = left_path->p_node[subtree_index + 1].el; + right_el = right_path->p_node[subtree_index + 1].el; + + ocfs2_adjust_root_records(el, left_el, right_el, + left_path->p_node[subtree_index + 1].bh->b_blocknr); + + root_bh = left_path->p_node[subtree_index].bh; + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret) + mlog_errno(ret); +} + +static int ocfs2_rotate_subtree_right(struct inode *inode, + handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int ret, i; + struct buffer_head *right_leaf_bh; + struct buffer_head *left_leaf_bh = NULL; + struct buffer_head *root_bh; + struct ocfs2_extent_list *right_el, *left_el; + struct ocfs2_extent_rec move_rec; + + left_leaf_bh = path_leaf_bh(left_path); + left_el = path_leaf_el(left_path); + + if (left_el->l_next_free_rec != left_el->l_count) { + ocfs2_error(inode->i_sb, + "Inode %llu has non-full interior leaf node %llu" + "(next free = %u)", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)left_leaf_bh->b_blocknr, + le16_to_cpu(left_el->l_next_free_rec)); + return -EROFS; + } + + /* + * This extent block may already have an empty record, so we + * return early if so. + */ + if (ocfs2_is_empty_extent(&left_el->l_recs[0])) + return 0; + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for(i = subtree_index + 1; i < path_num_items(right_path); i++) { + ret = ocfs2_journal_access(handle, inode, + right_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, + left_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + right_leaf_bh = path_leaf_bh(right_path); + right_el = path_leaf_el(right_path); + + /* This is a code error, not a disk corruption. 
*/ + mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " + "because rightmost leaf block %llu is empty\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)right_leaf_bh->b_blocknr); + + ocfs2_create_empty_extent(right_el); + + ret = ocfs2_journal_dirty(handle, right_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Do the copy now. */ + i = le16_to_cpu(left_el->l_next_free_rec) - 1; + move_rec = left_el->l_recs[i]; + right_el->l_recs[0] = move_rec; + + /* + * Clear out the record we just copied and shift everything + * over, leaving an empty extent in the left leaf. + * + * We temporarily subtract from next_free_rec so that the + * shift will lose the tail record (which is now defunct). + */ + le16_add_cpu(&left_el->l_next_free_rec, -1); + ocfs2_shift_records_right(left_el); + memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&left_el->l_next_free_rec, 1); + + ret = ocfs2_journal_dirty(handle, left_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_complete_edge_insert(inode, handle, left_path, right_path, + subtree_index); + +out: + return ret; +} + +/* + * Given a full path, determine what cpos value would return us a path + * containing the leaf immediately to the left of the current one. + * + * Will return zero if the path passed in is already the leftmost path. + */ +static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) +{ + int i, j, ret = 0; + u64 blkno; + struct ocfs2_extent_list *el; + + BUG_ON(path->p_tree_depth == 0); + + *cpos = 0; + + blkno = path_leaf_bh(path)->b_blocknr; + + /* Start at the tree node just above the leaf and work our way up. */ + i = path->p_tree_depth - 1; + while (i >= 0) { + el = path->p_node[i].el; + + /* + * Find the extent record just before the one in our + * path. + */ + for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { + if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { + if (j == 0) { + if (i == 0) { + /* + * We've determined that the + * path specified is already + * the leftmost one - return a + * cpos of zero. + */ + goto out; + } + /* + * The leftmost record points to our + * leaf - we need to travel up the + * tree one level. + */ + goto next_node; + } + + *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); + *cpos = *cpos + ocfs2_rec_clusters(el, + &el->l_recs[j - 1]); + *cpos = *cpos - 1; + goto out; + } + } + + /* + * If we got here, we never found a valid node where + * the tree indicated one should be. + */ + ocfs2_error(sb, + "Invalid extent tree at extent block %llu\n", + (unsigned long long)blkno); + ret = -EROFS; + goto out; + +next_node: + blkno = path->p_node[i].bh->b_blocknr; + i--; + } + +out: + return ret; +} + +static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, + struct ocfs2_path *path) +{ + int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; + + if (handle->h_buffer_credits < credits) + return ocfs2_extend_trans(handle, credits); + + return 0; +} + +/* + * Trap the case where we're inserting into the theoretical range past + * the _actual_ left leaf range. Otherwise, we'll rotate a record + * whose cpos is less than ours into the right leaf. + * + * It's only necessary to look at the rightmost record of the left + * leaf because the logic that calls us should ensure that the + * theoretical ranges in the path components above the leaves are + * correct. 
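+ * An interior record's cluster count spans the theoretical range up + * to the cpos of the record to its right, so insert_cpos may land + * beyond the last cluster actually mapped by the left leaf.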
+ */ +static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, + u32 insert_cpos) +{ + struct ocfs2_extent_list *left_el; + struct ocfs2_extent_rec *rec; + int next_free; + + left_el = path_leaf_el(left_path); + next_free = le16_to_cpu(left_el->l_next_free_rec); + rec = &left_el->l_recs[next_free - 1]; + + if (insert_cpos > le32_to_cpu(rec->e_cpos)) + return 1; + return 0; +} + +/* + * Rotate all the records in a btree right one record, starting at insert_cpos. + * + * The path to the rightmost leaf should be passed in. + * + * The array is assumed to be large enough to hold an entire path (tree depth). + * + * Upon succesful return from this function: + * + * - The 'right_path' array will contain a path to the leaf block + * whose range contains e_cpos. + * - That leaf block will have a single empty extent in list index 0. + * - In the case that the rotation requires a post-insert update, + * *ret_left_path will contain a valid path which can be passed to + * ocfs2_insert_path(). + */ +static int ocfs2_rotate_tree_right(struct inode *inode, + handle_t *handle, + u32 insert_cpos, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, start; + u32 cpos; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + left_path = ocfs2_new_path(path_root_bh(right_path), + path_root_el(right_path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); + + /* + * What we want to do here is: + * + * 1) Start with the rightmost path. + * + * 2) Determine a path to the leaf block directly to the left + * of that leaf. + * + * 3) Determine the 'subtree root' - the lowest level tree node + * which contains a path to both leaves. + * + * 4) Rotate the subtree. + * + * 5) Find the next subtree by considering the left path to be + * the new right path. + * + * The check at the top of this while loop also accepts + * insert_cpos == cpos because cpos is only a _theoretical_ + * value to get us the left path - insert_cpos might very well + * be filling that hole. + * + * Stop at a cpos of '0' because we either started at the + * leftmost branch (i.e., a tree with one branch and a + * rotation inside of it), or we've gone as far as we can in + * rotating subtrees. + */ + while (cpos && insert_cpos <= cpos) { + mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", + insert_cpos, cpos); + + ret = ocfs2_find_path(inode, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog_bug_on_msg(path_leaf_bh(left_path) == + path_leaf_bh(right_path), + "Inode %lu: error during insert of %u " + "(left path cpos %u) results in two identical " + "paths ending at %llu\n", + inode->i_ino, insert_cpos, cpos, + (unsigned long long) + path_leaf_bh(left_path)->b_blocknr); + + if (ocfs2_rotate_requires_path_adjustment(left_path, + insert_cpos)) { + mlog(0, "Path adjustment required\n"); + + /* + * We've rotated the tree as much as we + * should. The rest is up to + * ocfs2_insert_path() to complete, after the + * record insertion. We indicate this + * situation by returning the left path. + * + * The reason we don't adjust the records here + * before the record insert is that an error + * later might break the rule where a parent + * record e_cpos will reflect the actual + * e_cpos of the 1st nonempty record of the + * child list. 
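+ * Instead, ocfs2_insert_path() calls ocfs2_complete_edge_insert() + * once the new record has actually been placed in the right leaf.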
+ */ + *ret_left_path = left_path; + goto out_ret_path; + } + + start = ocfs2_find_subtree_root(inode, left_path, right_path); + + mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", + start, + (unsigned long long) right_path->p_node[start].bh->b_blocknr, + right_path->p_tree_depth); + + ret = ocfs2_extend_rotate_transaction(handle, start, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_subtree_right(inode, handle, left_path, + right_path, start); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * There is no need to re-read the next right path + * as we know that it'll be our current left + * path. Optimize by copying values instead. + */ + ocfs2_mv_path(right_path, left_path); + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, + &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(left_path); + +out_ret_path: + return ret; +} + +/* + * Do the final bits of extent record insertion at the target leaf + * list. If this leaf is part of an allocation tree, it is assumed + * that the tree above has been prepared. + */ +static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, + struct ocfs2_extent_list *el, + struct ocfs2_insert_type *insert, + struct inode *inode) +{ + int i = insert->ins_contig_index; + unsigned int range; + struct ocfs2_extent_rec *rec; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + /* + * Contiguous insert - either left or right. + */ + if (insert->ins_contig != CONTIG_NONE) { + rec = &el->l_recs[i]; + if (insert->ins_contig == CONTIG_LEFT) { + rec->e_blkno = insert_rec->e_blkno; + rec->e_cpos = insert_rec->e_cpos; + } + le16_add_cpu(&rec->e_leaf_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + return; + } + + /* + * Handle insert into an empty leaf. + */ + if (le16_to_cpu(el->l_next_free_rec) == 0 || + ((le16_to_cpu(el->l_next_free_rec) == 1) && + ocfs2_is_empty_extent(&el->l_recs[0]))) { + el->l_recs[0] = *insert_rec; + el->l_next_free_rec = cpu_to_le16(1); + return; + } + + /* + * Appending insert. + */ + if (insert->ins_appending == APPEND_TAIL) { + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + range = le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters); + BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); + + mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= + le16_to_cpu(el->l_count), + "inode %lu, depth %u, count %u, next free %u, " + "rec.cpos %u, rec.clusters %u, " + "insert.cpos %u, insert.clusters %u\n", + inode->i_ino, + le16_to_cpu(el->l_tree_depth), + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_next_free_rec), + le32_to_cpu(el->l_recs[i].e_cpos), + le16_to_cpu(el->l_recs[i].e_leaf_clusters), + le32_to_cpu(insert_rec->e_cpos), + le16_to_cpu(insert_rec->e_leaf_clusters)); + i++; + el->l_recs[i] = *insert_rec; + le16_add_cpu(&el->l_next_free_rec, 1); + return; + } + + /* + * Ok, we have to rotate. + * + * At this point, it is safe to assume that inserting into an + * empty leaf and appending to a leaf have both been handled + * above. + * + * This leaf needs to have space, either by the empty 1st + * extent record, or by virtue of an l_next_rec < l_count. 
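+ * ocfs2_rotate_leaf() will consume the empty extent if one exists and + * shift the remaining records over to open a slot at the correct + * index.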
+ */ + ocfs2_rotate_leaf(el, insert_rec); +} + +static inline void ocfs2_update_dinode_clusters(struct inode *inode, + struct ocfs2_dinode *di, + u32 clusters) +{ + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, i, next_free; + struct buffer_head *bh; + struct ocfs2_extent_list *el; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* + * This shouldn't happen for non-trees. The extent rec cluster + * count manipulation below only works for interior nodes. + */ + BUG_ON(right_path->p_tree_depth == 0); + + /* + * If our appending insert is at the leftmost edge of a leaf, + * then we might need to update the rightmost records of the + * neighboring path. + */ + el = path_leaf_el(right_path); + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { + u32 left_cpos; + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, + &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Append may need a left path update. cpos: %u, " + "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), + left_cpos); + + /* + * No need to worry if the append is already in the + * leftmost leaf. + */ + if (left_cpos) { + left_path = ocfs2_new_path(path_root_bh(right_path), + path_root_el(right_path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(inode, left_path, left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * ocfs2_insert_path() will pass the left_path to the + * journal for us. + */ + } + } + + ret = ocfs2_journal_access_path(inode, handle, right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_root_el(right_path); + bh = path_root_bh(right_path); + i = 0; + while (1) { + struct ocfs2_extent_rec *rec; + + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %llu has a bad extent list", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EIO; + goto out; + } + + rec = &el->l_recs[next_free - 1]; + + rec->e_int_clusters = insert_rec->e_cpos; + le32_add_cpu(&rec->e_int_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + le32_add_cpu(&rec->e_int_clusters, + -le32_to_cpu(rec->e_cpos)); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret) + mlog_errno(ret); + + /* Don't touch the leaf node */ + if (++i >= right_path->p_tree_depth) + break; + + bh = right_path->p_node[i].bh; + el = right_path->p_node[i].el; + } + + *ret_left_path = left_path; + ret = 0; +out: + if (ret != 0) + ocfs2_free_path(left_path); + + return ret; +} + +/* + * This function only does inserts on an allocation b-tree. For dinode + * lists, ocfs2_insert_at_leaf() is called directly. + * + * right_path is the path we want to do the actual insert + * in. left_path should only be passed in if we need to update that + * portion of the tree after an edge insert. 
+ */ +static int ocfs2_insert_path(struct inode *inode, + handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *insert) +{ + int ret, subtree_index; + struct buffer_head *leaf_bh = path_leaf_bh(right_path); + struct ocfs2_extent_list *el; + + /* + * Pass both paths to the journal. The majority of inserts + * will be touching all components anyway. + */ + ret = ocfs2_journal_access_path(inode, handle, right_path); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (left_path) { + int credits = handle->h_buffer_credits; + + /* + * There's a chance that left_path got passed back to + * us without being accounted for in the + * journal. Extend our transaction here to be sure we + * can change those blocks. + */ + credits += left_path->p_tree_depth; + + ret = ocfs2_extend_trans(handle, credits); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, left_path); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + el = path_leaf_el(right_path); + + ocfs2_insert_at_leaf(insert_rec, el, insert, inode); + ret = ocfs2_journal_dirty(handle, leaf_bh); + if (ret) + mlog_errno(ret); + + if (left_path) { + /* + * The rotate code has indicated that we need to fix + * up portions of the tree after the insert. + * + * XXX: Should we extend the transaction here? + */ + subtree_index = ocfs2_find_subtree_root(inode, left_path, + right_path); + ocfs2_complete_edge_insert(inode, handle, left_path, + right_path, subtree_index); + } + + ret = 0; +out: + return ret; +} + +static int ocfs2_do_insert_extent(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *type) +{ + int ret, rotate = 0; + u32 cpos; + struct ocfs2_path *right_path = NULL; + struct ocfs2_path *left_path = NULL; + struct ocfs2_dinode *di; + struct ocfs2_extent_list *el; + + di = (struct ocfs2_dinode *) di_bh->b_data; + el = &di->id2.i_list; + + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le16_to_cpu(el->l_tree_depth) == 0) { + ocfs2_insert_at_leaf(insert_rec, el, type, inode); + goto out_update_clusters; + } + + right_path = ocfs2_new_inode_path(di_bh); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* + * Determine the path to start with. Rotations need the + * rightmost path, everything else can go directly to the + * target leaf. + */ + cpos = le32_to_cpu(insert_rec->e_cpos); + if (type->ins_appending == APPEND_NONE && + type->ins_contig == CONTIG_NONE) { + rotate = 1; + cpos = UINT_MAX; + } + + ret = ocfs2_find_path(inode, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Rotations and appends need special treatment - they modify + * parts of the tree's above them. + * + * Both might pass back a path immediate to the left of the + * one being inserted to. This will be cause + * ocfs2_insert_path() to modify the rightmost records of + * left_path to account for an edge insert. + * + * XXX: When modifying this code, keep in mind that an insert + * can wind up skipping both of these two special cases... 
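+ * (A contiguous insert which is not an append, for instance, takes + * neither branch.)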
+ */ + if (rotate) { + ret = ocfs2_rotate_tree_right(inode, handle, + le32_to_cpu(insert_rec->e_cpos), + right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (type->ins_appending == APPEND_TAIL + && type->ins_contig != CONTIG_LEFT) { + ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, + right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_insert_path(inode, handle, left_path, right_path, + insert_rec, type); + if (ret) { + mlog_errno(ret); + goto out; + } + +out_update_clusters: + ocfs2_update_dinode_clusters(inode, di, + le16_to_cpu(insert_rec->e_leaf_clusters)); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + + return ret; +} + +static void ocfs2_figure_contig_type(struct inode *inode, + struct ocfs2_insert_type *insert, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i; + enum ocfs2_contig_type contig_type = CONTIG_NONE; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], + insert_rec); + if (contig_type != CONTIG_NONE) { + insert->ins_contig_index = i; + break; + } + } + insert->ins_contig = contig_type; +} + +/* + * This should only be called against the righmost leaf extent list. + * + * ocfs2_figure_appending_type() will figure out whether we'll have to + * insert at the tail of the rightmost leaf. + * + * This should also work against the dinode list for tree's with 0 + * depth. If we consider the dinode list to be the rightmost leaf node + * then the logic here makes sense. + */ +static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i; + u32 cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + insert->ins_appending = APPEND_NONE; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (!el->l_next_free_rec) + goto set_tail_append; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + /* Were all records empty? */ + if (le16_to_cpu(el->l_next_free_rec) == 1) + goto set_tail_append; } - /* Can we allocate without adding/shifting tree bits? */ i = le16_to_cpu(el->l_next_free_rec) - 1; - if (le16_to_cpu(el->l_next_free_rec) == 0 - || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) - || le32_to_cpu(el->l_recs[i].e_clusters) == 0 - || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) - goto out_add; + rec = &el->l_recs[i]; + + if (cpos >= + (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) + goto set_tail_append; + + return; + +set_tail_append: + insert->ins_appending = APPEND_TAIL; +} + +/* + * Helper function called at the begining of an insert. + * + * This computes a few things that are commonly used in the process of + * inserting into the btree: + * - Whether the new extent is contiguous with an existing one. + * - The current tree depth. + * - Whether the insert is an appending one. + * - The total # of free records in the tree. + * + * All of the information is stored on the ocfs2_insert_type + * structure. 
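+ * (ins_contig / ins_contig_index, ins_tree_depth, ins_appending and + * ins_free_records, respectively.)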
+ */ +static int ocfs2_figure_insert_type(struct inode *inode, + struct buffer_head *di_bh, + struct buffer_head **last_eb_bh, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *insert) +{ + int ret; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct ocfs2_path *path = NULL; + struct buffer_head *bh = NULL; + + el = &di->id2.i_list; + insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); + + if (el->l_tree_depth) { + /* + * If we have tree depth, we read in the + * rightmost extent block ahead of time as + * ocfs2_figure_insert_type() and ocfs2_add_branch() + * may want it later. + */ + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_last_eb_blk), &bh, + OCFS2_BH_CACHED, inode); + if (ret) { + mlog_exit(ret); + goto out; + } + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + } + + /* + * Unless we have a contiguous insert, we'll need to know if + * there is room left in our allocation tree for another + * extent record. + * + * XXX: This test is simplistic, we can search for empty + * extent records too. + */ + insert->ins_free_records = le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec); + + if (!insert->ins_tree_depth) { + ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_appending_type(insert, el, insert_rec); + return 0; + } + + path = ocfs2_new_inode_path(di_bh); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* + * In the case that we're inserting past what the tree + * currently accounts for, ocfs2_find_path() will return for + * us the rightmost tree path. This is accounted for below in + * the appending code. + */ + ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + /* + * Now that we have the path, there's two things we want to determine: + * 1) Contiguousness (also set contig_index if this is so) + * + * 2) Are we doing an append? We can trivially break this up + * into two types of appends: simple record append, or a + * rotate inside the tail leaf. + */ + ocfs2_figure_contig_type(inode, insert, el, insert_rec); + + /* + * The insert code isn't quite ready to deal with all cases of + * left contiguousness. Specifically, if it's an insert into + * the 1st record in a leaf, it will require the adjustment of + * cluster count on the last record of the path directly to it's + * left. For now, just catch that case and fool the layers + * above us. This works just fine for tree_depth == 0, which + * is why we allow that above. + */ + if (insert->ins_contig == CONTIG_LEFT && + insert->ins_contig_index == 0) + insert->ins_contig = CONTIG_NONE; + + /* + * Ok, so we can simply compare against last_eb to figure out + * whether the path doesn't exist. This will only happen in + * the case that we're doing a tail append, so maybe we can + * take advantage of that information somehow. + */ + if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { + /* + * Ok, ocfs2_find_path() returned us the rightmost + * tree path. This might be an appending insert. 
There are + * two cases: + * 1) We're doing a true append at the tail: + * -This might even be off the end of the leaf + * 2) We're "appending" by rotating in the tail + */ + ocfs2_figure_appending_type(insert, el, insert_rec); + } + +out: + ocfs2_free_path(path); + + if (ret == 0) + *last_eb_bh = bh; + else + brelse(bh); + return ret; +} + +/* + * Insert an extent into an inode btree. + * + * The caller needs to update fe->i_clusters + */ +int ocfs2_insert_extent(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u32 cpos, + u64 start_blk, + u32 new_clusters, + struct ocfs2_alloc_context *meta_ac) +{ + int status, shift; + struct buffer_head *last_eb_bh = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_insert_type insert = {0, }; + struct ocfs2_extent_rec rec; + + mlog(0, "add %u clusters at position %u to inode %llu\n", + new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); + + mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && + (OCFS2_I(inode)->ip_clusters != cpos), + "Device %s, asking for sparse allocation: inode %llu, " + "cpos %u, clusters %u\n", + osb->dev_str, + (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, + OCFS2_I(inode)->ip_clusters); + + memset(&rec, 0, sizeof(rec)); + rec.e_cpos = cpu_to_le32(cpos); + rec.e_blkno = cpu_to_le64(start_blk); + rec.e_leaf_clusters = cpu_to_le16(new_clusters); + + status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, + &insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } - mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " - "tree now.\n"); + mlog(0, "Insert.appending: %u, Insert.Contig: %u, " + "Insert.contig_index: %d, Insert.free_records: %d, " + "Insert.tree_depth: %d\n", + insert.ins_appending, insert.ins_contig, insert.ins_contig_index, + insert.ins_free_records, insert.ins_tree_depth); + + /* + * Avoid growing the tree unless we're out of records and the + * insert type requres one. + */ + if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) + goto out_add; shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); if (shift < 0) { @@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, * and didn't find room for any more extents - we need to add * another tree level */ if (shift) { - /* if we hit a leaf, we'd better be empty :) */ - BUG_ON(le16_to_cpu(el->l_next_free_rec) != - le16_to_cpu(el->l_count)); BUG_ON(bh); - mlog(0, "ocfs2_allocate_extent: need to shift tree depth " - "(current = %u)\n", - le16_to_cpu(fe->id2.i_list.l_tree_depth)); + mlog(0, "need to shift tree depth " + "(current = %d)\n", insert.ins_tree_depth); /* ocfs2_shift_tree_depth will return us a buffer with * the new extent block (so we can pass that to @@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, mlog_errno(status); goto bail; } + insert.ins_tree_depth++; /* Special case: we have room now if we shifted from * tree_depth 0 */ - if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) + if (insert.ins_tree_depth == 1) goto out_add; } /* call ocfs2_add_branch to add the final part of the tree with * the new data. */ - mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); + mlog(0, "add branch. bh = %p\n", bh); status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, meta_ac); if (status < 0) { @@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, } out_add: - /* Finally, we can add clusters. 
*/ - status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, - start_blk, new_clusters); + /* Finally, we can add clusters. This might rotate the tree for us. */ + status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); if (status < 0) mlog_errno(status); + else + ocfs2_extent_map_insert_rec(inode, &rec); bail: if (bh) @@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) * block will be deleted, and if it will, what the new last extent * block will be so we can update his h_next_leaf_blk field, as well * as the dinodes i_last_eb_blk */ -static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, - struct inode *inode, - struct ocfs2_dinode *fe, - u32 new_i_clusters, - struct buffer_head *old_last_eb, +static int ocfs2_find_new_last_ext_blk(struct inode *inode, + unsigned int clusters_to_del, + struct ocfs2_path *path, struct buffer_head **new_last_eb) { - int i, status = 0; - u64 block = 0; + int next_free, ret = 0; + u32 cpos; + struct ocfs2_extent_rec *rec; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct buffer_head *bh = NULL; *new_last_eb = NULL; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto bail; - } - /* we have no tree, so of course, no last_eb. */ - if (!fe->id2.i_list.l_tree_depth) - goto bail; + if (!path->p_tree_depth) + goto out; /* trunc to zero special case - this makes tree_depth = 0 * regardless of what it is. */ - if (!new_i_clusters) - goto bail; + if (OCFS2_I(inode)->ip_clusters == clusters_to_del) + goto out; - eb = (struct ocfs2_extent_block *) old_last_eb->b_data; - el = &(eb->h_list); + el = path_leaf_el(path); BUG_ON(!el->l_next_free_rec); - /* Make sure that this guy will actually be empty after we - * clear away the data. */ - if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) - goto bail; + /* + * Make sure that this extent list will actually be empty + * after we clear away the data. We can shortcut out if + * there's more than one non-empty extent in the + * list. Otherwise, a check of the remaining extent is + * necessary. + */ + next_free = le16_to_cpu(el->l_next_free_rec); + rec = NULL; + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + if (next_free > 2) + goto out; - /* Ok, at this point, we know that last_eb will definitely - * change, so lets traverse the tree and find the second to - * last extent block. */ - el = &(fe->id2.i_list); - /* go down the tree, */ - do { - for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { - if (le32_to_cpu(el->l_recs[i].e_cpos) < - new_i_clusters) { - block = le64_to_cpu(el->l_recs[i].e_blkno); - break; - } + /* We may have a valid extent in index 1, check it. */ + if (next_free == 2) + rec = &el->l_recs[1]; + + /* + * Fall through - no more nonempty extents, so we want + * to delete this leaf. + */ + } else { + if (next_free > 1) + goto out; + + rec = &el->l_recs[0]; + } + + if (rec) { + /* + * Check it we'll only be trimming off the end of this + * cluster. 
+ */ + if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + ret = -EROFS; + goto out; + } + + *new_last_eb = bh; + get_bh(*new_last_eb); + mlog(0, "returning block %llu, (cpos: %u)\n", + (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); +out: + brelse(bh); + + return ret; +} + +/* + * Trim some clusters off the rightmost edge of a tree. Only called + * during truncate. + * + * The caller needs to: + * - start journaling of each path component. + * - compute and fully set up any new last ext block + */ +static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, + handle_t *handle, struct ocfs2_truncate_context *tc, + u32 clusters_to_del, u64 *delete_start) +{ + int ret, i, index = path->p_tree_depth; + u32 new_edge = 0; + u64 deleted_eb = 0; + struct buffer_head *bh; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + *delete_start = 0; + + while (index >= 0) { + bh = path->p_node[index].bh; + el = path->p_node[index].el; + + mlog(0, "traveling tree (index = %d, block = %llu)\n", + index, (unsigned long long)bh->b_blocknr); + + BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); + + if (index != + (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { + ocfs2_error(inode->i_sb, + "Inode %lu has invalid ext. block %llu", + inode->i_ino, + (unsigned long long)bh->b_blocknr); + ret = -EROFS; + goto out; } - BUG_ON(i < 0); - if (bh) { - brelse(bh); - bh = NULL; +find_tail_record: + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + + mlog(0, "Extent list before: record %d: (%u, %u, %llu), " + "next = %u\n", i, le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec), + (unsigned long long)le64_to_cpu(rec->e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); + + if (le16_to_cpu(el->l_tree_depth) == 0) { + /* + * If the leaf block contains a single empty + * extent and no records, we can just remove + * the block. + */ + if (i == 0 && ocfs2_is_empty_extent(rec)) { + memset(rec, 0, + sizeof(struct ocfs2_extent_rec)); + el->l_next_free_rec = cpu_to_le16(0); + + goto delete; + } + + /* + * Remove any empty extents by shifting things + * left. That should make life much easier on + * the code below. This condition is rare + * enough that we shouldn't see a performance + * hit. + */ + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + le16_add_cpu(&el->l_next_free_rec, -1); + + for(i = 0; + i < le16_to_cpu(el->l_next_free_rec); i++) + el->l_recs[i] = el->l_recs[i + 1]; + + memset(&el->l_recs[i], 0, + sizeof(struct ocfs2_extent_rec)); + + /* + * We've modified our extent list. The + * simplest way to handle this change + * is to being the search from the + * start again. + */ + goto find_tail_record; + } + + le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); + + /* + * We'll use "new_edge" on our way back up the + * tree to know what our rightmost cpos is. + */ + new_edge = le16_to_cpu(rec->e_leaf_clusters); + new_edge += le32_to_cpu(rec->e_cpos); + + /* + * The caller will use this to delete data blocks. 
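+ * It points at the first block past the record's remaining clusters, + * i.e. the start of the clusters_to_del clusters being removed.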
+ */ + *delete_start = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, + le16_to_cpu(rec->e_leaf_clusters)); + + /* + * If it's now empty, remove this record. + */ + if (le16_to_cpu(rec->e_leaf_clusters) == 0) { + memset(rec, 0, + sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&el->l_next_free_rec, -1); + } + } else { + if (le64_to_cpu(rec->e_blkno) == deleted_eb) { + memset(rec, 0, + sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&el->l_next_free_rec, -1); + + goto delete; + } + + /* Can this actually happen? */ + if (le16_to_cpu(el->l_next_free_rec) == 0) + goto delete; + + /* + * We never actually deleted any clusters + * because our leaf was empty. There's no + * reason to adjust the rightmost edge then. + */ + if (new_edge == 0) + goto delete; + + rec->e_int_clusters = cpu_to_le32(new_edge); + le32_add_cpu(&rec->e_int_clusters, + -le32_to_cpu(rec->e_cpos)); + + /* + * A deleted child record should have been + * caught above. + */ + BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); } - status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, - inode); - if (status < 0) { - mlog_errno(status); - goto bail; +delete: + ret = ocfs2_journal_dirty(handle, bh); + if (ret) { + mlog_errno(ret); + goto out; } - eb = (struct ocfs2_extent_block *) bh->b_data; - el = &eb->h_list; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; + + mlog(0, "extent list container %llu, after: record %d: " + "(%u, %u, %llu), next = %u.\n", + (unsigned long long)bh->b_blocknr, i, + le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), + (unsigned long long)le64_to_cpu(rec->e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + /* + * We must be careful to only attempt delete of an + * extent block (and not the root inode block). + */ + if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { + struct ocfs2_extent_block *eb = + (struct ocfs2_extent_block *)bh->b_data; + + /* + * Save this for use when processing the + * parent block. + */ + deleted_eb = le64_to_cpu(eb->h_blkno); + + mlog(0, "deleting this extent block.\n"); + + ocfs2_remove_from_cache(inode, bh); + + BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); + BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); + BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); + + if (le16_to_cpu(eb->h_suballoc_slot) == 0) { + /* + * This code only understands how to + * lock the suballocator in slot 0, + * which is fine because allocation is + * only ever done out of that + * suballocator too. A future version + * might change that however, so avoid + * a free if we don't know how to + * handle it. This way an fs incompat + * bit will not be necessary. + */ + ret = ocfs2_free_extent_block(handle, + tc->tc_ext_alloc_inode, + tc->tc_ext_alloc_bh, + eb); + + /* An error here is not fatal. 
*/ + if (ret < 0) + mlog_errno(ret); + } + } else { + deleted_eb = 0; } - } while (el->l_tree_depth); - *new_last_eb = bh; - get_bh(*new_last_eb); - mlog(0, "returning block %llu\n", - (unsigned long long)le64_to_cpu(eb->h_blkno)); -bail: - if (bh) - brelse(bh); + index--; + } - return status; + ret = 0; +out: + return ret; } static int ocfs2_do_truncate(struct ocfs2_super *osb, unsigned int clusters_to_del, struct inode *inode, struct buffer_head *fe_bh, - struct buffer_head *old_last_eb_bh, handle_t *handle, - struct ocfs2_truncate_context *tc) + struct ocfs2_truncate_context *tc, + struct ocfs2_path *path) { - int status, i, depth; + int status; struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; struct ocfs2_extent_block *last_eb = NULL; struct ocfs2_extent_list *el; - struct buffer_head *eb_bh = NULL; struct buffer_head *last_eb_bh = NULL; - u64 next_eb = 0; u64 delete_blk = 0; fe = (struct ocfs2_dinode *) fe_bh->b_data; - status = ocfs2_find_new_last_ext_blk(osb, - inode, - fe, - le32_to_cpu(fe->i_clusters) - - clusters_to_del, - old_last_eb_bh, - &last_eb_bh); + status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, + path, &last_eb_bh); if (status < 0) { mlog_errno(status); goto bail; } - if (last_eb_bh) - last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + /* + * Each component will be touched, so we might as well journal + * here to avoid having to handle errors later. + */ + status = ocfs2_journal_access_path(inode, handle, path); if (status < 0) { mlog_errno(status); goto bail; } + + if (last_eb_bh) { + status = ocfs2_journal_access(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + } + el = &(fe->id2.i_list); + /* + * Lower levels depend on this never happening, but it's best + * to check it up here before changing the tree. + */ + if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { + ocfs2_error(inode->i_sb, + "Inode %lu has an empty extent record, depth %u\n", + inode->i_ino, le16_to_cpu(el->l_tree_depth)); + status = -EROFS; + goto bail; + } + spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - clusters_to_del; spin_unlock(&OCFS2_I(inode)->ip_lock); le32_add_cpu(&fe->i_clusters, -clusters_to_del); - fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); - fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec); - - i = le16_to_cpu(el->l_next_free_rec) - 1; - - BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); - le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); - /* tree depth zero, we can just delete the clusters, otherwise - * we need to record the offset of the next level extent block - * as we may overwrite it. */ - if (!el->l_tree_depth) - delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) - + ocfs2_clusters_to_blocks(osb->sb, - le32_to_cpu(el->l_recs[i].e_clusters)); - else - next_eb = le64_to_cpu(el->l_recs[i].e_blkno); - if (!el->l_recs[i].e_clusters) { - /* if we deleted the whole extent record, then clear - * out the other fields and update the extent - * list. 
For depth > 0 trees, we've already recorded - * the extent block in 'next_eb' */ - el->l_recs[i].e_cpos = 0; - el->l_recs[i].e_blkno = 0; - BUG_ON(!el->l_next_free_rec); - le16_add_cpu(&el->l_next_free_rec, -1); + status = ocfs2_trim_tree(inode, path, handle, tc, + clusters_to_del, &delete_blk); + if (status) { + mlog_errno(status); + goto bail; } - depth = le16_to_cpu(el->l_tree_depth); - if (!fe->i_clusters) { + if (le32_to_cpu(fe->i_clusters) == 0) { /* trunc to zero is a special case. */ el->l_tree_depth = 0; fe->i_last_eb_blk = 0; @@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, /* If there will be a new last extent block, then by * definition, there cannot be any leaves to the right of * him. */ - status = ocfs2_journal_access(handle, inode, last_eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } last_eb->h_next_leaf_blk = 0; status = ocfs2_journal_dirty(handle, last_eb_bh); if (status < 0) { @@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } } - /* if our tree depth > 0, update all the tree blocks below us. */ - while (depth) { - mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n", - depth, (unsigned long long)next_eb); - status = ocfs2_read_block(osb, next_eb, &eb_bh, - OCFS2_BH_CACHED, inode); + if (delete_blk) { + status = ocfs2_truncate_log_append(osb, handle, delete_blk, + clusters_to_del); if (status < 0) { mlog_errno(status); goto bail; } - eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; + } + status = 0; +bail: + + mlog_exit(status); + return status; +} + +static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) +{ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + return 0; +} + +static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) +{ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + return ocfs2_journal_dirty_data(handle, bh); +} + +static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, + struct page **pages, int numpages, + u64 phys, handle_t *handle) +{ + int i, ret, partial = 0; + void *kaddr; + struct page *page; + unsigned int from, to = PAGE_CACHE_SIZE; + struct super_block *sb = inode->i_sb; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); + + if (numpages == 0) + goto out; + + from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ + if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) { + /* + * Since 'from' has been capped to a value below page + * size, this calculation won't be able to overflow + * 'to' + */ + to = ocfs2_align_bytes_to_clusters(sb, from); + + /* + * The truncate tail in this case should never contain + * more than one page at maximum. The loop below also + * assumes this. 
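+ * (With a cluster smaller than a page, the bytes from isize up to the + * next cluster boundary always lie within the page containing isize.)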
+ */ + BUG_ON(numpages != 1); + } + + for(i = 0; i < numpages; i++) { + page = pages[i]; + + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + + ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0); + if (ret) + mlog_errno(ret); + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + from, 0, to - from); + kunmap_atomic(kaddr, KM_USER0); + + /* + * Need to set the buffers we zero'd into uptodate + * here if they aren't - ocfs2_map_page_blocks() + * might've skipped some + */ + if (ocfs2_should_order_data(inode)) { + ret = walk_page_buffers(handle, + page_buffers(page), + from, to, &partial, + ocfs2_ordered_zero_func); + if (ret < 0) + mlog_errno(ret); + } else { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_writeback_zero_func); + if (ret < 0) + mlog_errno(ret); } - el = &(eb->h_list); - status = ocfs2_journal_access(handle, inode, eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; + if (!partial) + SetPageUptodate(page); + + flush_dcache_page(page); + + /* + * Every page after the 1st one should be completely zero'd. + */ + from = 0; + } +out: + if (pages) { + for (i = 0; i < numpages; i++) { + page = pages[i]; + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); } + } +} - BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); - BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); +static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, + int *num, u64 *phys) +{ + int i, numpages = 0, ret = 0; + unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize; + unsigned int ext_flags; + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + unsigned long index; + u64 next_cluster_bytes; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); + + /* Cluster boundary, so we don't need to grab any pages. */ + if ((isize & (csize - 1)) == 0) + goto out; - i = le16_to_cpu(el->l_next_free_rec) - 1; + ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, + phys, NULL, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } - mlog(0, "extent block %llu, before: record %d: " - "(%u, %u, %llu), next = %u\n", - (unsigned long long)le64_to_cpu(eb->h_blkno), i, - le32_to_cpu(el->l_recs[i].e_cpos), - le32_to_cpu(el->l_recs[i].e_clusters), - (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno), - le16_to_cpu(el->l_next_free_rec)); + /* Tail is a hole. */ + if (*phys == 0) + goto out; - BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); - le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); - - next_eb = le64_to_cpu(el->l_recs[i].e_blkno); - /* bottom-most block requires us to delete data.*/ - if (!el->l_tree_depth) - delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) - + ocfs2_clusters_to_blocks(osb->sb, - le32_to_cpu(el->l_recs[i].e_clusters)); - if (!el->l_recs[i].e_clusters) { - el->l_recs[i].e_cpos = 0; - el->l_recs[i].e_blkno = 0; - BUG_ON(!el->l_next_free_rec); - le16_add_cpu(&el->l_next_free_rec, -1); - } - mlog(0, "extent block %llu, after: record %d: " - "(%u, %u, %llu), next = %u\n", - (unsigned long long)le64_to_cpu(eb->h_blkno), i, - le32_to_cpu(el->l_recs[i].e_cpos), - le32_to_cpu(el->l_recs[i].e_clusters), - (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno), - le16_to_cpu(el->l_next_free_rec)); + /* Tail is marked as unwritten, we can count on write to zero + * in that case. 
*/ + if (ext_flags & OCFS2_EXT_UNWRITTEN) + goto out; - status = ocfs2_journal_dirty(handle, eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; + next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); + index = isize >> PAGE_CACHE_SHIFT; + do { + pages[numpages] = grab_cache_page(mapping, index); + if (!pages[numpages]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; } - if (!el->l_next_free_rec) { - mlog(0, "deleting this extent block.\n"); - - ocfs2_remove_from_cache(inode, eb_bh); + numpages++; + index++; + } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); - BUG_ON(el->l_recs[0].e_clusters); - BUG_ON(el->l_recs[0].e_cpos); - BUG_ON(el->l_recs[0].e_blkno); - if (eb->h_suballoc_slot == 0) { - /* - * This code only understands how to - * lock the suballocator in slot 0, - * which is fine because allocation is - * only ever done out of that - * suballocator too. A future version - * might change that however, so avoid - * a free if we don't know how to - * handle it. This way an fs incompat - * bit will not be necessary. - */ - status = ocfs2_free_extent_block(handle, - tc->tc_ext_alloc_inode, - tc->tc_ext_alloc_bh, - eb); - if (status < 0) { - mlog_errno(status); - goto bail; +out: + if (ret != 0) { + if (pages) { + for (i = 0; i < numpages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + page_cache_release(pages[i]); } } } - brelse(eb_bh); - eb_bh = NULL; - depth--; + numpages = 0; } - BUG_ON(!delete_blk); - status = ocfs2_truncate_log_append(osb, handle, delete_blk, - clusters_to_del); - if (status < 0) { - mlog_errno(status); - goto bail; + *num = numpages; + + return ret; +} + +/* + * Zero the area past i_size but still within an allocated + * cluster. This avoids exposing nonzero data on subsequent file + * extends. + * + * We need to call this before i_size is updated on the inode because + * otherwise block_write_full_page() will skip writeout of pages past + * i_size. The new_i_size parameter is passed for this reason. + */ +int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, + u64 new_i_size) +{ + int ret, numpages; + loff_t endbyte; + struct page **pages = NULL; + u64 phys; + + /* + * File systems which don't support sparse files zero on every + * extend. + */ + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + return 0; + + pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb), + sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; } - status = 0; -bail: - if (!status) - ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); - else - ocfs2_extent_map_drop(inode, 0); - mlog_exit(status); - return status; + + ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (numpages == 0) + goto out; + + ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, + handle); + + /* + * Initiate writeout of the pages we zero'd here. We don't + * wait on them - the truncate_inode_pages() call later will + * do that for us. 
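+ *
+ * The range written back runs from new_i_size to the end of its
+ * cluster, covering the pages grabbed and zeroed above.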
+ */ + endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); + ret = do_sync_mapping_range(inode->i_mapping, new_i_size, + endbyte - 1, SYNC_FILE_RANGE_WRITE); + if (ret) + mlog_errno(ret); + +out: + if (pages) + kfree(pages); + + return ret; } /* @@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct ocfs2_truncate_context *tc) { int status, i, credits, tl_sem = 0; - u32 clusters_to_del, target_i_clusters; - u64 last_eb = 0; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; + u32 clusters_to_del, new_highest_cpos, range; struct ocfs2_extent_list *el; - struct buffer_head *last_eb_bh; handle_t *handle = NULL; struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_path *path = NULL; mlog_entry_void(); down_write(&OCFS2_I(inode)->ip_alloc_sem); - target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - last_eb_bh = tc->tc_last_eb_bh; - tc->tc_last_eb_bh = NULL; + path = ocfs2_new_inode_path(fe_bh); + if (!path) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } - fe = (struct ocfs2_dinode *) fe_bh->b_data; + ocfs2_extent_map_trunc(inode, new_highest_cpos); - if (fe->id2.i_list.l_tree_depth) { - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - el = &eb->h_list; - } else - el = &fe->id2.i_list; - last_eb = le64_to_cpu(fe->i_last_eb_blk); start: - mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " - "last_eb = %llu, fe->i_last_eb_blk = %llu, " - "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", - le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb, - (unsigned long long)le64_to_cpu(fe->i_last_eb_blk), - le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); - - if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) { - mlog(0, "last_eb changed!\n"); - BUG_ON(!fe->id2.i_list.l_tree_depth); - last_eb = le64_to_cpu(fe->i_last_eb_blk); - /* i_last_eb_blk may have changed, read it if - * necessary. We don't have to worry about the - * truncate to zero case here (where there becomes no - * last_eb) because we never loop back after our work - * is done. */ - if (last_eb_bh) { - brelse(last_eb_bh); - last_eb_bh = NULL; - } + /* + * Check that we still have allocation to delete. + */ + if (OCFS2_I(inode)->ip_clusters == 0) { + status = 0; + goto bail; + } - status = ocfs2_read_block(osb, last_eb, - &last_eb_bh, OCFS2_BH_CACHED, - inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } - el = &(eb->h_list); + /* + * Truncate always works against the rightmost tree branch. + */ + status = ocfs2_find_path(inode, path, UINT_MAX); + if (status) { + mlog_errno(status); + goto bail; + } + + mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", + OCFS2_I(inode)->ip_clusters, path->p_tree_depth); + + /* + * By now, el will point to the extent list on the bottom most + * portion of this tree. Only the tail record is considered in + * each pass. 
+ * + * We handle the following cases, in order: + * - empty extent: delete the remaining branch + * - remove the entire record + * - remove a partial record + * - no record needs to be removed (truncate has completed) + */ + el = path_leaf_el(path); + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent block at %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)path_leaf_bh(path)->b_blocknr); + status = -EROFS; + goto bail; } - /* by now, el will point to the extent list on the bottom most - * portion of this tree. */ i = le16_to_cpu(el->l_next_free_rec) - 1; - if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) - clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); - else - clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + + range = le32_to_cpu(el->l_recs[i].e_cpos) + + ocfs2_rec_clusters(el, &el->l_recs[i]); + if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { + clusters_to_del = 0; + } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { + clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); + } else if (range > new_highest_cpos) { + clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + le32_to_cpu(el->l_recs[i].e_cpos)) - - target_i_clusters; + new_highest_cpos; + } else { + status = 0; + goto bail; + } - mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); + mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", + clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); + + BUG_ON(clusters_to_del == 0); mutex_lock(&tl_inode->i_mutex); tl_sem = 1; @@ -1861,7 +3722,8 @@ start: } credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, - fe, el); + (struct ocfs2_dinode *)fe_bh->b_data, + el); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -1870,13 +3732,8 @@ start: goto bail; } - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); - if (status < 0) - mlog_errno(status); - - status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, - last_eb_bh, handle, tc); + status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, + tc, path); if (status < 0) { mlog_errno(status); goto bail; @@ -1888,9 +3745,14 @@ start: ocfs2_commit_trans(osb, handle); handle = NULL; - BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); - if (le32_to_cpu(fe->i_clusters) > target_i_clusters) - goto start; + ocfs2_reinit_path(path, 1); + + /* + * The check above will catch the case where we've truncated + * away all allocation. + */ + goto start; + bail: up_write(&OCFS2_I(inode)->ip_alloc_sem); @@ -1902,8 +3764,7 @@ bail: if (handle) ocfs2_commit_trans(osb, handle); - if (last_eb_bh) - brelse(last_eb_bh); + ocfs2_free_path(path); /* This will drop the ext_alloc cluster lock for us */ ocfs2_free_truncate_context(tc); @@ -1912,7 +3773,6 @@ bail: return status; } - /* * Expects the inode to already be locked. 
This will figure out which * inodes need to be locked and will put them on the returned truncate @@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, struct buffer_head *fe_bh, struct ocfs2_truncate_context **tc) { - int status, metadata_delete; + int status, metadata_delete, i; unsigned int new_i_clusters; struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; @@ -1944,21 +3804,6 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, "%llu\n", fe->i_clusters, new_i_clusters, (unsigned long long)fe->i_size); - if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { - ocfs2_error(inode->i_sb, "Dinode %llu has cluster count " - "%u and size %llu whereas struct inode has " - "cluster count %u and size %llu which caused an " - "invalid truncate to %u clusters.", - (unsigned long long)le64_to_cpu(fe->i_blkno), - le32_to_cpu(fe->i_clusters), - (unsigned long long)le64_to_cpu(fe->i_size), - OCFS2_I(inode)->ip_clusters, i_size_read(inode), - new_i_clusters); - mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres); - status = -EIO; - goto bail; - } - *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); if (!(*tc)) { status = -ENOMEM; @@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, goto bail; } el = &(eb->h_list); - if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) + + i = 0; + if (ocfs2_is_empty_extent(&el->l_recs[0])) + i = 1; + /* + * XXX: Should we check that next_free_rec contains + * the extent? + */ + if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters) metadata_delete = 1; } diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 0b82e804432..fbcb5934a08 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, struct buffer_head *fe_bh, - u64 blkno, + u32 cpos, + u64 start_blk, u32 new_clusters, struct ocfs2_alloc_context *meta_ac); int ocfs2_num_free_extents(struct ocfs2_super *osb, @@ -70,6 +71,8 @@ struct ocfs2_truncate_context { struct buffer_head *tc_last_eb_bh; }; +int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, + u64 new_i_size); int ocfs2_prepare_truncate(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh, @@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct buffer_head *fe_bh, struct ocfs2_truncate_context *tc); +int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, + u32 cpos, struct buffer_head **leaf_bh); + +/* + * Helper function to look at the # of clusters in an extent record. + */ +static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *rec) +{ + /* + * Cluster count in extent records is slightly different + * between interior nodes and leaf nodes. This is to support + * unwritten extents which need a flags field in leaf node + * records, thus shrinking the available space for a clusters + * field. 
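+ *
+ * Interior records keep a full 32-bit count in e_int_clusters;
+ * leaf records give up half of that space to make room for the
+ * flags field and carry a 16-bit e_leaf_clusters instead.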
+ */ + if (el->l_tree_depth) + return le32_to_cpu(rec->e_int_clusters); + else + return le16_to_cpu(rec->e_leaf_clusters); +} + #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 875c1144381..56963e6c46c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -24,6 +24,8 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <asm/byteorder.h> +#include <linux/swap.h> +#include <linux/pipe_fs_i.h> #define MLOG_MASK_PREFIX ML_FILE_IO #include <cluster/masklog.h> @@ -37,6 +39,7 @@ #include "file.h" #include "inode.h" #include "journal.h" +#include "suballoc.h" #include "super.h" #include "symlink.h" @@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int err = 0; + unsigned int ext_flags; u64 p_blkno, past_eof; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, (unsigned long long)iblock, bh_result, create); @@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, goto bail; } - /* this can happen if another node truncs after our extend! */ - spin_lock(&OCFS2_I(inode)->ip_lock); - if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb, - OCFS2_I(inode)->ip_clusters)) - err = -EIO; - spin_unlock(&OCFS2_I(inode)->ip_lock); - if (err) - goto bail; - - err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, - NULL); + err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, + &ext_flags); if (err) { mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " "%llu, NULL)\n", err, inode, (unsigned long long)iblock, @@ -167,22 +163,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, goto bail; } - map_bh(bh_result, inode->i_sb, p_blkno); - - if (bh_result->b_blocknr == 0) { - err = -EIO; - mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n", - (unsigned long long)iblock, - (unsigned long long)p_blkno, - (unsigned long long)OCFS2_I(inode)->ip_blkno); - } + /* + * ocfs2 never allocates in this function - the only time we + * need to use BH_New is when we're extending i_size on a file + * system which doesn't support holes, in which case BH_New + * allows block_prepare_write() to zero. + */ + mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), + "ino %lu, iblock %llu\n", inode->i_ino, + (unsigned long long)iblock); + + /* Treat the unwritten extent as a hole for zeroing purposes. 
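+ * Leaving the buffer unmapped means the generic block code will
+ * zero-fill it on read, just as it would for a real hole.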
*/ + if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + map_bh(bh_result, inode->i_sb, p_blkno); + + if (!ocfs2_sparse_alloc(osb)) { + if (p_blkno == 0) { + err = -EIO; + mlog(ML_ERROR, + "iblock = %llu p_blkno = %llu blkno=(%llu)\n", + (unsigned long long)iblock, + (unsigned long long)p_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); + dump_stack(); + } - past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, - (unsigned long long)past_eof); + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, + (unsigned long long)past_eof); - if (create && (iblock >= past_eof)) - set_buffer_new(bh_result); + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + } bail: if (err < 0) @@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) return ret; } -/* This can also be called from ocfs2_write_zero_page() which has done - * it's own cluster locking. */ +/* + * This is called from ocfs2_write_zero_page() which has handled it's + * own cluster locking and has ensured allocation exists for those + * blocks to be written. + */ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, unsigned from, unsigned to) { @@ -292,44 +308,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, return ret; } -/* - * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called - * from loopback. It must be able to perform its own locking around - * ocfs2_get_block(). - */ -static int ocfs2_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - struct inode *inode = page->mapping->host; - int ret; - - mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); - - ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); - if (ret != 0) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_prepare_write_nolock(inode, page, from, to); - - ocfs2_meta_unlock(inode, 0); -out: - mlog_exit(ret); - return ret; -} - /* Taken from ext3. We don't necessarily need the full blown * functionality yet, but IMHO it's better to cut and paste the whole * thing so we can avoid introducing our own bugs (and easily pick up * their fixes when they happen) --Mark */ -static int walk_page_buffers( handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)( handle_t *handle, - struct buffer_head *bh)) +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -388,95 +377,6 @@ out: return handle; } -static int ocfs2_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - int ret; - struct buffer_head *di_bh = NULL; - struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - struct ocfs2_dinode *di; - - mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); - - /* NOTE: ocfs2_file_aio_write has ensured that it's safe for - * us to continue here without rechecking the I/O against - * changed inode values. - * - * 1) We're currently holding the inode alloc lock, so no - * nodes can change it underneath us. 
- * - * 2) We've had to take the metadata lock at least once - * already to check for extending writes, suid removal, etc. - * The meta data update code then ensures that we don't get a - * stale inode allocation image (i_size, i_clusters, etc). - */ - - ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page); - if (ret != 0) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_data_lock_with_page(inode, 1, page); - if (ret != 0) { - mlog_errno(ret); - goto out_unlock_meta; - } - - handle = ocfs2_start_walk_page_trans(inode, page, from, to); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_unlock_data; - } - - /* Mark our buffer early. We'd rather catch this error up here - * as opposed to after a successful commit_write which would - * require us to set back inode->i_size. */ - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } - - /* might update i_size */ - ret = generic_commit_write(file, page, from, to); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } - - di = (struct ocfs2_dinode *)di_bh->b_data; - - /* ocfs2_mark_inode_dirty() is too heavy to use here. */ - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - - inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode))); - di->i_size = cpu_to_le64((u64)i_size_read(inode)); - - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } - -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); -out_unlock_data: - ocfs2_data_unlock(inode, 1); -out_unlock_meta: - ocfs2_meta_unlock(inode, 1); -out: - if (di_bh) - brelse(di_bh); - - mlog_exit(ret); - return ret; -} - static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) { sector_t status; @@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) down_read(&OCFS2_I(inode)->ip_alloc_sem); } - err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, - NULL); + err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); if (!INODE_JOURNAL(inode)) { up_read(&OCFS2_I(inode)->ip_alloc_sem); @@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret; - u64 p_blkno, inode_blocks; - int contig_blocks; + u64 p_blkno, inode_blocks, contig_blocks; + unsigned int ext_flags; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; @@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, * nicely aligned and of the right size, so there's no need * for us to check any of that. */ - spin_lock(&OCFS2_I(inode)->ip_lock); - inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb, - OCFS2_I(inode)->ip_clusters); - - /* - * For a read which begins past the end of file, we return a hole. - */ - if (!create && (iblock >= inode_blocks)) { - spin_unlock(&OCFS2_I(inode)->ip_lock); - ret = 0; - goto bail; - } + inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); /* * Any write past EOF is not allowed because we'd be extending. 
*/ if (create && (iblock + max_blocks) > inode_blocks) { - spin_unlock(&OCFS2_I(inode)->ip_lock); ret = -EIO; goto bail; } - spin_unlock(&OCFS2_I(inode)->ip_lock); /* This figures out the size of the next contiguous block, and * our logical offset */ - ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, - &contig_blocks); + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, + &contig_blocks, &ext_flags); if (ret) { mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long long)iblock); @@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } - map_bh(bh_result, inode->i_sb, p_blkno); + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { + ocfs2_error(inode->i_sb, + "Inode %llu has a hole at block %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock); + ret = -EROFS; + goto bail; + } + + /* + * get_more_blocks() expects us to describe a hole by clearing + * the mapped bit on bh_result(). + * + * Consider an unwritten extent as a hole. + */ + if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + map_bh(bh_result, inode->i_sb, p_blkno); + else { + /* + * ocfs2_prepare_inode_for_write() should have caught + * the case where we'd be filling a hole and triggered + * a buffered write instead. + */ + if (create) { + ret = -EIO; + mlog_errno(ret); + goto bail; + } + + clear_buffer_mapped(bh_result); + } /* make sure we don't map more than max_blocks blocks here as that's all the kernel will handle at this point. */ @@ -606,12 +522,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, void *private) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + int level; /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + ocfs2_iocb_clear_rw_locked(iocb); - up_read(&inode->i_alloc_sem); - ocfs2_rw_unlock(inode, 0); + + level = ocfs2_iocb_rw_locked_level(iocb); + if (!level) + up_read(&inode->i_alloc_sem); + ocfs2_rw_unlock(inode, level); } /* @@ -647,23 +568,27 @@ static ssize_t ocfs2_direct_IO(int rw, mlog_entry_void(); - /* - * We get PR data locks even for O_DIRECT. This allows - * concurrent O_DIRECT I/O but doesn't let O_DIRECT with - * extending and buffered zeroing writes race. If they did - * race then the buffered zeroing could be written back after - * the O_DIRECT I/O. It's one thing to tell people not to mix - * buffered and O_DIRECT writes, but expecting them to - * understand that file extension is also an implicit buffered - * write is too much. By getting the PR we force writeback of - * the buffered zeroing before proceeding. - */ - ret = ocfs2_data_lock(inode, 0); - if (ret < 0) { - mlog_errno(ret); - goto out; + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { + /* + * We get PR data locks even for O_DIRECT. This + * allows concurrent O_DIRECT I/O but doesn't let + * O_DIRECT with extending and buffered zeroing writes + * race. If they did race then the buffered zeroing + * could be written back after the O_DIRECT I/O. It's + * one thing to tell people not to mix buffered and + * O_DIRECT writes, but expecting them to understand + * that file extension is also an implicit buffered + * write is too much. By getting the PR we force + * writeback of the buffered zeroing before + * proceeding. 
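+ *
+ * With sparse allocation an extending write leaves a hole
+ * rather than zeroing through the page cache, so this PR
+ * lock/unlock cycle is skipped when sparse files are enabled.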
+ */ + ret = ocfs2_data_lock(inode, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + ocfs2_data_unlock(inode, 0); } - ocfs2_data_unlock(inode, 0); ret = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, @@ -675,11 +600,715 @@ out: return ret; } +static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, + u32 cpos, + unsigned int *start, + unsigned int *end) +{ + unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; + + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { + unsigned int cpp; + + cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); + + cluster_start = cpos % cpp; + cluster_start = cluster_start << osb->s_clustersize_bits; + + cluster_end = cluster_start + osb->s_clustersize; + } + + BUG_ON(cluster_start > PAGE_SIZE); + BUG_ON(cluster_end > PAGE_SIZE); + + if (start) + *start = cluster_start; + if (end) + *end = cluster_end; +} + +/* + * 'from' and 'to' are the region in the page to avoid zeroing. + * + * If pagesize > clustersize, this function will avoid zeroing outside + * of the cluster boundary. + * + * from == to == 0 is code for "zero the entire cluster region" + */ +static void ocfs2_clear_page_regions(struct page *page, + struct ocfs2_super *osb, u32 cpos, + unsigned from, unsigned to) +{ + void *kaddr; + unsigned int cluster_start, cluster_end; + + ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); + + kaddr = kmap_atomic(page, KM_USER0); + + if (from || to) { + if (from > cluster_start) + memset(kaddr + cluster_start, 0, from - cluster_start); + if (to < cluster_end) + memset(kaddr + to, 0, cluster_end - to); + } else { + memset(kaddr + cluster_start, 0, cluster_end - cluster_start); + } + + kunmap_atomic(kaddr, KM_USER0); +} + +/* + * Some of this taken from block_prepare_write(). We already have our + * mapping by now though, and the entire write will be allocating or + * it won't, so not much need to use BH_New. + * + * This will also skip zeroing, which is handled externally. + */ +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new) +{ + int ret = 0; + struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; + unsigned int block_end, block_start; + unsigned int bsize = 1 << inode->i_blkbits; + + if (!page_has_buffers(page)) + create_empty_buffers(page, bsize, 0); + + head = page_buffers(page); + for (bh = head, block_start = 0; bh != head || !block_start; + bh = bh->b_this_page, block_start += bsize) { + block_end = block_start + bsize; + + /* + * Ignore blocks outside of our i/o range - + * they may belong to unallocated clusters. + */ + if (block_start >= to || block_end <= from) { + if (PageUptodate(page)) + set_buffer_uptodate(bh); + continue; + } + + /* + * For an allocating write with cluster size >= page + * size, we always write the entire page. + */ + + if (buffer_new(bh)) + clear_buffer_new(bh); + + if (!buffer_mapped(bh)) { + map_bh(bh, inode->i_sb, *p_blkno); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + + *p_blkno = *p_blkno + 1; + } + + /* + * If we issued read requests - let them complete. 
+ */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + ret = -EIO; + } + + if (ret == 0 || !new) + return ret; + + /* + * If we get -EIO above, zero out any newly allocated blocks + * to avoid exposing stale data. + */ + bh = head; + block_start = 0; + do { + void *kaddr; + + block_end = block_start + bsize; + if (block_end <= from) + goto next_bh; + if (block_start >= to) + break; + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+block_start, 0, bh->b_size); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + +next_bh: + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + return ret; +} + +/* + * This will copy user data from the buffer page in the splice + * context. + * + * For now, we ignore SPLICE_F_MOVE as that would require some extra + * communication out all the way to ocfs2_write(). + */ +int ocfs2_map_and_write_splice_data(struct inode *inode, + struct ocfs2_write_ctxt *wc, u64 *p_blkno, + unsigned int *ret_from, unsigned int *ret_to) +{ + int ret; + unsigned int to, from, cluster_start, cluster_end; + char *src, *dst; + struct ocfs2_splice_write_priv *sp = wc->w_private; + struct pipe_buffer *buf = sp->s_buf; + unsigned long bytes, src_from; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, + &cluster_end); + + from = sp->s_offset; + src_from = sp->s_buf_offset; + bytes = wc->w_count; + + if (wc->w_large_pages) { + /* + * For cluster size < page size, we have to + * calculate pos within the cluster and obey + * the rightmost boundary. + */ + bytes = min(bytes, (unsigned long)(osb->s_clustersize + - (wc->w_pos & (osb->s_clustersize - 1)))); + } + to = from + bytes; + + if (wc->w_this_page_new) + ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, + cluster_start, cluster_end, 1); + else + ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, + from, to, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > osb->s_clustersize); + BUG_ON(to > osb->s_clustersize); + + src = buf->ops->map(sp->s_pipe, buf, 1); + dst = kmap_atomic(wc->w_this_page, KM_USER1); + memcpy(dst + from, src + src_from, bytes); + kunmap_atomic(wc->w_this_page, KM_USER1); + buf->ops->unmap(sp->s_pipe, buf, src); + + wc->w_finished_copy = 1; + + *ret_from = from; + *ret_to = to; +out: + + return bytes ? (unsigned int)bytes : ret; +} + +/* + * This will copy user data from the iovec in the buffered write + * context. + */ +int ocfs2_map_and_write_user_data(struct inode *inode, + struct ocfs2_write_ctxt *wc, u64 *p_blkno, + unsigned int *ret_from, unsigned int *ret_to) +{ + int ret; + unsigned int to, from, cluster_start, cluster_end; + unsigned long bytes, src_from; + char *dst; + struct ocfs2_buffered_write_priv *bp = wc->w_private; + const struct iovec *cur_iov = bp->b_cur_iov; + char __user *buf; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, + &cluster_end); + + buf = cur_iov->iov_base + bp->b_cur_off; + src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; + + from = wc->w_pos & (PAGE_CACHE_SIZE - 1); + + /* + * This is a lot of comparisons, but it reads quite + * easily, which is important here. 
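+ *
+ * The net effect is that 'bytes' becomes the largest copy that
+ * fits within the source page, the current iovec, the remaining
+ * count, and the destination page or cluster.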
+ */ + /* Stay within the src page */ + bytes = PAGE_SIZE - src_from; + /* Stay within the vector */ + bytes = min(bytes, + (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); + /* Stay within count */ + bytes = min(bytes, (unsigned long)wc->w_count); + /* + * For clustersize > page size, just stay within + * target page, otherwise we have to calculate pos + * within the cluster and obey the rightmost + * boundary. + */ + if (wc->w_large_pages) { + /* + * For cluster size < page size, we have to + * calculate pos within the cluster and obey + * the rightmost boundary. + */ + bytes = min(bytes, (unsigned long)(osb->s_clustersize + - (wc->w_pos & (osb->s_clustersize - 1)))); + } else { + /* + * cluster size > page size is the most common + * case - we just stay within the target page + * boundary. + */ + bytes = min(bytes, PAGE_CACHE_SIZE - from); + } + + to = from + bytes; + + if (wc->w_this_page_new) + ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, + cluster_start, cluster_end, 1); + else + ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, + from, to, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > osb->s_clustersize); + BUG_ON(to > osb->s_clustersize); + + dst = kmap(wc->w_this_page); + memcpy(dst + from, bp->b_src_buf + src_from, bytes); + kunmap(wc->w_this_page); + + /* + * XXX: This is slow, but simple. The caller of + * ocfs2_buffered_write_cluster() is responsible for + * passing through the iovecs, so it's difficult to + * predict what our next step is in here after our + * initial write. A future version should be pushing + * that iovec manipulation further down. + * + * By setting this, we indicate that a copy from user + * data was done, and subsequent calls for this + * cluster will skip copying more data. + */ + wc->w_finished_copy = 1; + + *ret_from = from; + *ret_to = to; +out: + + return bytes ? (unsigned int)bytes : ret; +} + +/* + * Map, fill and write a page to disk. + * + * The work of copying data is done via callback. Newly allocated + * pages which don't take user data will be zero'd (set 'new' to + * indicate an allocating write) + * + * Returns a negative error code or the number of bytes copied into + * the page. + */ +int ocfs2_write_data_page(struct inode *inode, handle_t *handle, + u64 *p_blkno, struct page *page, + struct ocfs2_write_ctxt *wc, int new) +{ + int ret, copied = 0; + unsigned int from = 0, to = 0; + unsigned int cluster_start, cluster_end; + unsigned int zero_from = 0, zero_to = 0; + + ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, + &cluster_start, &cluster_end); + + if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index + && !wc->w_finished_copy) { + + wc->w_this_page = page; + wc->w_this_page_new = new; + ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + copied = ret; + + zero_from = from; + zero_to = to; + if (new) { + from = cluster_start; + to = cluster_end; + } + } else { + /* + * If we haven't allocated the new page yet, we + * shouldn't be writing it out without copying user + * data. This is likely a math error from the caller. + */ + BUG_ON(!new); + + from = cluster_start; + to = cluster_end; + + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + cluster_start, cluster_end, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Parts of newly allocated pages need to be zero'd. 
+ * + * Above, we have also rewritten 'to' and 'from' - as far as + * the rest of the function is concerned, the entire cluster + * range inside of a page needs to be written. + * + * We can skip this if the page is up to date - it's already + * been zero'd from being read in as a hole. + */ + if (new && !PageUptodate(page)) + ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), + wc->w_cpos, zero_from, zero_to); + + flush_dcache_page(page); + + if (ocfs2_should_order_data(inode)) { + ret = walk_page_buffers(handle, + page_buffers(page), + from, to, NULL, + ocfs2_journal_dirty_data); + if (ret < 0) + mlog_errno(ret); + } + + /* + * We don't use generic_commit_write() because we need to + * handle our own i_size update. + */ + ret = block_commit_write(page, from, to); + if (ret) + mlog_errno(ret); +out: + + return copied ? copied : ret; +} + +/* + * Do the actual write of some data into an inode. Optionally allocate + * in order to fulfill the write. + * + * cpos is the logical cluster offset within the file to write at + * + * 'phys' is the physical mapping of that offset. a 'phys' value of + * zero indicates that allocation is required. In this case, data_ac + * and meta_ac should be valid (meta_ac can be null if metadata + * allocation isn't required). + */ +static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, + struct buffer_head *di_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc) +{ + int ret, i, numpages = 1, new; + unsigned int copied = 0; + u32 tmp_pos; + u64 v_blkno, p_blkno; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned long index, start; + struct page **cpages; + + new = phys == 0 ? 1 : 0; + + /* + * Figure out how many pages we'll be manipulating here. For + * non allocating write, we just change the one + * page. Otherwise, we'll need a whole clusters worth. + */ + if (new) + numpages = ocfs2_pages_per_cluster(inode->i_sb); + + cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); + if (!cpages) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + /* + * Fill our page array first. That way we've grabbed enough so + * that we can zero and flush if we error after adding the + * extent. + */ + if (new) { + start = ocfs2_align_clusters_to_page_index(inode->i_sb, + wc->w_cpos); + v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); + } else { + start = wc->w_pos >> PAGE_CACHE_SHIFT; + v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; + } + + for(i = 0; i < numpages; i++) { + index = start + i; + + cpages[i] = grab_cache_page(mapping, index); + if (!cpages[i]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + } + + if (new) { + /* + * This is safe to call with the page locks - it won't take + * any additional semaphores or cluster locks. + */ + tmp_pos = wc->w_cpos; + ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, + &tmp_pos, 1, di_bh, handle, + data_ac, meta_ac, NULL); + /* + * This shouldn't happen because we must have already + * calculated the correct meta data allocation required. The + * internal tree allocation code should know how to increase + * transaction credits itself. + * + * If need be, we could handle -EAGAIN for a + * RESTART_TRANS here. 
+ */ + mlog_bug_on_msg(ret == -EAGAIN, + "Inode %llu: EAGAIN return during allocation.\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, + NULL); + if (ret < 0) { + + /* + * XXX: Should we go readonly here? + */ + + mlog_errno(ret); + goto out; + } + + BUG_ON(p_blkno == 0); + + for(i = 0; i < numpages; i++) { + ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], + wc, new); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + copied += ret; + } + +out: + for(i = 0; i < numpages; i++) { + unlock_page(cpages[i]); + mark_page_accessed(cpages[i]); + page_cache_release(cpages[i]); + } + kfree(cpages); + + return copied ? copied : ret; +} + +static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, + struct ocfs2_super *osb, loff_t pos, + size_t count, ocfs2_page_writer *cb, + void *cb_priv) +{ + wc->w_count = count; + wc->w_pos = pos; + wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; + wc->w_finished_copy = 0; + + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + wc->w_large_pages = 1; + else + wc->w_large_pages = 0; + + wc->w_write_data_page = cb; + wc->w_private = cb_priv; +} + +/* + * Write a cluster to an inode. The cluster may not be allocated yet, + * in which case it will be. This only exists for buffered writes - + * O_DIRECT takes a more "traditional" path through the kernel. + * + * The caller is responsible for incrementing pos, written counts, etc + * + * For file systems that don't support sparse files, pre-allocation + * and page zeroing up until cpos should be done prior to this + * function call. + * + * Callers should be holding i_sem, and the rw cluster lock. + * + * Returns the number of user bytes written, or less than zero for + * error. + */ +ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, + size_t count, ocfs2_page_writer *actor, + void *priv) +{ + int ret, credits = OCFS2_INODE_UPDATE_CREDITS; + ssize_t written = 0; + u32 phys; + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle; + struct ocfs2_write_ctxt wc; + + ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); + + ret = ocfs2_meta_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + di = (struct ocfs2_dinode *)di_bh->b_data; + + /* + * Take alloc sem here to prevent concurrent lookups. That way + * the mapping, zeroing and tree manipulation within + * ocfs2_write() will be safe against ->readpage(). This + * should also serve to lock out allocation from a shared + * writeable region. + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); + if (ret) { + mlog_errno(ret); + goto out_meta; + } + + /* phys == 0 means that allocation is required. 
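+ * In that case a data allocator (and, if the tree may need to
+ * grow, a metadata allocator) is reserved up front and the
+ * transaction is sized for a single cluster extend.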
*/ + if (phys == 0) { + ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out_meta; + } + + credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); + } + + ret = ocfs2_data_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto out_meta; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_data; + } + + written = ocfs2_write(file, phys, handle, di_bh, data_ac, + meta_ac, &wc); + if (written < 0) { + ret = written; + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + pos += written; + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_data: + ocfs2_data_unlock(inode, 1); + +out_meta: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_meta_unlock(inode, 1); + +out: + brelse(di_bh); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return written ? written : ret; +} + const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, .writepage = ocfs2_writepage, - .prepare_write = ocfs2_prepare_write, - .commit_write = ocfs2_commit_write, .bmap = ocfs2_bmap, .sync_page = block_sync_page, .direct_IO = ocfs2_direct_IO, diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index f446a15eab8..45821d479b5 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, unsigned from, unsigned to); +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new); + +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)); + +struct ocfs2_write_ctxt; +typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, + u64 *, unsigned int *, unsigned int *); + +ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, + size_t count, ocfs2_page_writer *actor, + void *priv); + +struct ocfs2_write_ctxt { + size_t w_count; + loff_t w_pos; + u32 w_cpos; + unsigned int w_finished_copy; + + /* This is true if page_size > cluster_size */ + unsigned int w_large_pages; + + /* Filler callback and private data */ + ocfs2_page_writer *w_write_data_page; + void *w_private; + + /* Only valid for the filler callback */ + struct page *w_this_page; + unsigned int w_this_page_new; +}; + +struct ocfs2_buffered_write_priv { + char *b_src_buf; + const struct iovec *b_cur_iov; /* Current iovec */ + size_t b_cur_off; /* Offset in the + * current iovec */ +}; +int ocfs2_map_and_write_user_data(struct inode *inode, + struct ocfs2_write_ctxt *wc, + u64 *p_blkno, + unsigned int *ret_from, + unsigned int *ret_to); + +struct ocfs2_splice_write_priv { + struct splice_desc *s_sd; + struct pipe_buffer *s_buf; + struct pipe_inode_info *s_pipe; + /* Neither offset value is ever larger than 
one page */ + unsigned int s_offset; + unsigned int s_buf_offset; +}; +int ocfs2_map_and_write_splice_data(struct inode *inode, + struct ocfs2_write_ctxt *wc, + u64 *p_blkno, + unsigned int *ret_from, + unsigned int *ret_to); + /* all ocfs2_dio_end_io()'s fault */ #define ocfs2_iocb_is_rw_locked(iocb) \ test_bit(0, (unsigned long *)&iocb->private) -#define ocfs2_iocb_set_rw_locked(iocb) \ - set_bit(0, (unsigned long *)&iocb->private) +static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) +{ + set_bit(0, (unsigned long *)&iocb->private); + if (level) + set_bit(1, (unsigned long *)&iocb->private); + else + clear_bit(1, (unsigned long *)&iocb->private); +} #define ocfs2_iocb_clear_rw_locked(iocb) \ clear_bit(0, (unsigned long *)&iocb->private) - +#define ocfs2_iocb_rw_locked_level(iocb) \ + test_bit(1, (unsigned long *)&iocb->private) #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 4705d659fe5..bbacf7da48a 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -46,6 +46,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/workqueue.h> +#include <linux/reboot.h> #include "heartbeat.h" #include "nodemanager.h" @@ -72,7 +73,9 @@ static void o2quo_fence_self(void) /* panic spins with interrupts enabled. with preempt * threads can still schedule, etc, etc */ o2hb_stop_all_regions(); - panic("ocfs2 is very sorry to be fencing this system by panicing\n"); + + printk("ocfs2 is very sorry to be fencing this system by restarting\n"); + emergency_restart(); } /* Indicate that a timeout occured on a hearbeat region write. The diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4dae5df5e46..9606111fe89 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -38,6 +38,9 @@ * locking semantics of the file system using the protocol. It should * be somewhere else, I'm sure, but right now it isn't. 
* + * New in version 8: + * - Replace delete inode votes with a cluster lock + * * New in version 7: * - DLM join domain includes the live nodemap * @@ -57,7 +60,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 7ULL +#define O2NET_PROTOCOL_VERSION 8ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 66821e17816..67e6866a2a4 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb, { int status; int extend; - u64 p_blkno; + u64 p_blkno, v_blkno; spin_lock(&OCFS2_I(dir)->ip_lock); extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); spin_unlock(&OCFS2_I(dir)->ip_lock); if (extend) { - status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, - parent_fe_bh, handle, + u32 offset = OCFS2_I(dir)->ip_clusters; + + status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, + 1, parent_fe_bh, handle, data_ac, meta_ac, NULL); BUG_ON(status == -EAGAIN); if (status < 0) { @@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb, } } - status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> - (sb->s_blocksize_bits - 9)), - 1, &p_blkno, NULL); + v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir)); + status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL); if (status < 0) { mlog_errno(status); goto bail; @@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, dir_i_size += dir->i_sb->s_blocksize; i_size_write(dir, dir_i_size); - dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); + dir->i_blocks = ocfs2_inode_sector_count(dir); status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index c558442a0b4..d836b98dd99 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -430,11 +430,10 @@ redo_bucket: dlm_lockres_put(res); - cond_resched_lock(&dlm->spinlock); - if (dropped) goto redo_bucket; } + cond_resched_lock(&dlm->spinlock); num += n; mlog(0, "%s: touched %d lockreses in bucket %d " "(tot=%d)\n", dlm->name, n, i, num); @@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) { int status = 0, tmpstat, node; struct domain_join_ctxt *ctxt; - enum dlm_query_join_response response; + enum dlm_query_join_response response = JOIN_DISALLOW; mlog_entry("%p", dlm); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 6d4a83d5015..c1807a42c49 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) } } while (status != 0); + spin_lock(&dlm_reco_state_lock); switch (ndata->state) { case DLM_RECO_NODE_DATA_INIT: case DLM_RECO_NODE_DATA_FINALIZE_SENT: @@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) ndata->node_num, dead_node); break; } + spin_unlock(&dlm_reco_state_lock); } mlog(0, "done requesting all lock info\n"); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index e335541727f..27e43b0c0ea 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { + .get_osb = ocfs2_get_inode_osb, + .flags = 0, +}; + static inline int 
ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) { return lockres->l_type == OCFS2_LOCK_TYPE_META || lockres->l_type == OCFS2_LOCK_TYPE_DATA || - lockres->l_type == OCFS2_LOCK_TYPE_RW; + lockres->l_type == OCFS2_LOCK_TYPE_RW || + lockres->l_type == OCFS2_LOCK_TYPE_OPEN; } static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) @@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, case OCFS2_LOCK_TYPE_DATA: ops = &ocfs2_inode_data_lops; break; + case OCFS2_LOCK_TYPE_OPEN: + ops = &ocfs2_inode_open_lops; + break; default: mlog_bug_on_msg(1, "type: %d\n", type); ops = NULL; /* thanks, gcc */ @@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode) goto bail; } + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); + if (ret) { + mlog_errno(ret); + goto bail; + } + bail: mlog_exit(ret); return ret; @@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write) mlog_exit_void(); } +/* + * ocfs2_open_lock always get PR mode lock. + */ +int ocfs2_open_lock(struct inode *inode) +{ + int status = 0; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %llu take PRMODE open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + LKM_PRMODE, 0, 0); + if (status < 0) + mlog_errno(status); + +out: + mlog_exit(status); + return status; +} + +int ocfs2_try_open_lock(struct inode *inode, int write) +{ + int status = 0, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %llu try to take %s open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + level = write ? LKM_EXMODE : LKM_PRMODE; + + /* + * The file system may already holding a PRMODE/EXMODE open lock. + * Since we pass LKM_NOQUEUE, the request won't block waiting on + * other nodes and the -EAGAIN will indicate to the caller that + * this inode is still in use. + */ + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + level, LKM_NOQUEUE, 0); + +out: + mlog_exit(status); + return status; +} + +/* + * ocfs2_open_unlock unlock PR and EX mode open locks. 
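+ *
+ * Both holder counts are checked because the open lock may have
+ * been acquired in either mode over the inode's lifetime.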
+ */ +void ocfs2_open_unlock(struct inode *inode) +{ + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry_void(); + + mlog(0, "inode %llu drop open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_mount_local(osb)) + goto out; + + if(lockres->l_ro_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + LKM_PRMODE); + if(lockres->l_ex_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + LKM_EXMODE); + +out: + mlog_exit_void(); +} + int ocfs2_data_lock_full(struct inode *inode, int write, int arg_flags) @@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) inode->i_blocks = 0; else - inode->i_blocks = - ocfs2_align_bytes_to_sectors(i_size_read(inode)); + inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_uid = be32_to_cpu(lvb->lvb_iuid); inode->i_gid = be32_to_cpu(lvb->lvb_igid); @@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode, { int status = 0; struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_lock_res *lockres = NULL; + struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; struct ocfs2_dinode *fe; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry_void(); + if (ocfs2_mount_local(osb)) + goto bail; + spin_lock(&oi->ip_lock); if (oi->ip_flags & OCFS2_INODE_DELETED) { mlog(0, "Orphaned inode %llu was deleted while we " @@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode, } spin_unlock(&oi->ip_lock); - if (!ocfs2_mount_local(osb)) { - lockres = &oi->ip_meta_lockres; - - if (!ocfs2_should_refresh_lock_res(lockres)) - goto bail; - } + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; /* This will discard any caching information we might have had * for the inode metadata. */ ocfs2_metadata_cache_purge(inode); - /* will do nothing for inode types that don't use the extent - * map (directories, bitmap files, etc) */ ocfs2_extent_map_trunc(inode, 0); - if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) { + if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { mlog(0, "Trusting LVB on inode %llu\n", (unsigned long long)oi->ip_blkno); ocfs2_refresh_inode_from_lvb(inode); @@ -1558,8 +1662,7 @@ static int ocfs2_meta_lock_update(struct inode *inode, status = 0; bail_refresh: - if (lockres) - ocfs2_complete_lock_res_refresh(lockres, status); + ocfs2_complete_lock_res_refresh(lockres, status); bail: mlog_exit(status); return status; @@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode, wait_event(osb->recovery_event, ocfs2_node_map_is_empty(osb, &osb->recovery_map)); - acquired = 0; lockres = &OCFS2_I(inode)->ip_meta_lockres; level = ex ? LKM_EXMODE : LKM_PRMODE; dlm_flags = 0; @@ -2458,13 +2560,20 @@ int ocfs2_drop_inode_locks(struct inode *inode) * ocfs2_clear_inode has done it for us. 
*/ err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), - &OCFS2_I(inode)->ip_data_lockres); + &OCFS2_I(inode)->ip_open_lockres); if (err < 0) mlog_errno(err); status = err; err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_data_lockres); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), &OCFS2_I(inode)->ip_meta_lockres); if (err < 0) mlog_errno(err); diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index c343fca68cf..59cb566e798 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode, int write); int ocfs2_rw_lock(struct inode *inode, int write); void ocfs2_rw_unlock(struct inode *inode, int write); +int ocfs2_open_lock(struct inode *inode); +int ocfs2_try_open_lock(struct inode *inode, int write); +void ocfs2_open_unlock(struct inode *inode); int ocfs2_meta_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, int *level); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 80ac69f11d9..ba2b2ab1c6e 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -3,8 +3,7 @@ * * extent_map.c * - * In-memory extent map for OCFS2. Man, this code was prettier in - * the library. + * Block/Cluster mapping functions * * Copyright (C) 2004 Oracle. All rights reserved. * @@ -26,1016 +25,528 @@ #include <linux/fs.h> #include <linux/init.h> #include <linux/types.h> -#include <linux/slab.h> -#include <linux/rbtree.h> #define MLOG_MASK_PREFIX ML_EXTENT_MAP #include <cluster/masklog.h> #include "ocfs2.h" +#include "alloc.h" #include "extent_map.h" #include "inode.h" #include "super.h" #include "buffer_head_io.h" - /* - * SUCK SUCK SUCK - * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h - */ - -struct ocfs2_extent_map_entry { - struct rb_node e_node; - int e_tree_depth; - struct ocfs2_extent_rec e_rec; -}; - -struct ocfs2_em_insert_context { - int need_left; - int need_right; - struct ocfs2_extent_map_entry *new_ent; - struct ocfs2_extent_map_entry *old_ent; - struct ocfs2_extent_map_entry *left_ent; - struct ocfs2_extent_map_entry *right_ent; -}; - -static struct kmem_cache *ocfs2_em_ent_cachep = NULL; - - -static struct ocfs2_extent_map_entry * -ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, - u32 cpos, u32 clusters, - struct rb_node ***ret_p, - struct rb_node **ret_parent); -static int ocfs2_extent_map_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth); -static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, - struct ocfs2_extent_map_entry *ent); -static int ocfs2_extent_map_find_leaf(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_list *el); -static int ocfs2_extent_map_lookup_read(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_map_entry **ret_ent); -static int ocfs2_extent_map_try_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth, - struct ocfs2_em_insert_context *ctxt); - -/* returns 1 only if the rec contains all the given clusters -- that is that - * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + - * clusters) is >= the argument's endpoint */ -static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, - u32 cpos, u32 clusters) -{ - if (le32_to_cpu(rec->e_cpos) > cpos) - return 0; - if (cpos + clusters > le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) - return 0; - return 1; -} - - -/* - * Find an entry in the tree that intersects the region passed 
in. - * Note that this will find straddled intervals, it is up to the - * callers to enforce any boundary conditions. - * - * Callers must hold ip_lock. This lookup is not guaranteed to return - * a tree_depth 0 match, and as such can race inserts if the lock - * were not held. + * The extent caching implementation is intentionally trivial. * - * The rb_node garbage lets insertion share the search. Trivial - * callers pass NULL. + * We only cache a small number of extents stored directly on the + * inode, so linear order operations are acceptable. If we ever want + * to increase the size of the extent map, then these algorithms must + * get smarter. */ -static struct ocfs2_extent_map_entry * -ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, - u32 cpos, u32 clusters, - struct rb_node ***ret_p, - struct rb_node **ret_parent) + +void ocfs2_extent_map_init(struct inode *inode) { - struct rb_node **p = &em->em_extents.rb_node; - struct rb_node *parent = NULL; - struct ocfs2_extent_map_entry *ent = NULL; - - while (*p) - { - parent = *p; - ent = rb_entry(parent, struct ocfs2_extent_map_entry, - e_node); - if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { - p = &(*p)->rb_left; - ent = NULL; - } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + - le32_to_cpu(ent->e_rec.e_clusters))) { - p = &(*p)->rb_right; - ent = NULL; - } else - break; - } + struct ocfs2_inode_info *oi = OCFS2_I(inode); - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; - return ent; + oi->ip_extent_map.em_num_items = 0; + INIT_LIST_HEAD(&oi->ip_extent_map.em_list); } -/* - * Find the leaf containing the interval we want. While we're on our - * way down the tree, fill in every record we see at any depth, because - * we might want it later. - * - * Note that this code is run without ip_lock. That's because it - * sleeps while reading. If someone is also filling the extent list at - * the same time we are, we might have to restart. - */ -static int ocfs2_extent_map_find_leaf(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_list *el) +static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, + unsigned int cpos, + struct ocfs2_extent_map_item **ret_emi) { - int i, ret; - struct buffer_head *eb_bh = NULL; - u64 blkno; - u32 rec_end; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_rec *rec; - - /* - * The bh data containing the el cannot change here, because - * we hold alloc_sem. So we can do this without other - * locks. 
- */ - while (el->l_tree_depth) - { - blkno = 0; - for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - rec = &el->l_recs[i]; - rec_end = (le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)); - - ret = -EBADR; - if (rec_end > OCFS2_I(inode)->ip_clusters) { - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", - i, - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno, - OCFS2_I(inode)->ip_clusters); - goto out_free; - } - - if (rec_end <= cpos) { - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } - continue; - } - if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } - continue; - } + unsigned int range; + struct ocfs2_extent_map_item *emi; - /* - * We've found a record that matches our - * interval. We don't insert it because we're - * about to traverse it. - */ - - /* Check to see if we're stradling */ - ret = -ESRCH; - if (!ocfs2_extent_rec_contains_clusters(rec, - cpos, - clusters)) { - mlog_errno(ret); - goto out_free; - } + *ret_emi = NULL; - /* - * If we've already found a record, the el has - * two records covering the same interval. - * EEEK! - */ - ret = -EBADR; - if (blkno) { - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n", - cpos, clusters, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)blkno, i, - (unsigned long long)le64_to_cpu(rec->e_blkno)); - goto out_free; - } + list_for_each_entry(emi, &em->em_list, ei_list) { + range = emi->ei_cpos + emi->ei_clusters; - blkno = le64_to_cpu(rec->e_blkno); - } + if (cpos >= emi->ei_cpos && cpos < range) { + list_move(&emi->ei_list, &em->em_list); - /* - * We don't support holes, and we're still up - * in the branches, so we'd better have found someone - */ - ret = -EBADR; - if (!blkno) { - ocfs2_error(inode->i_sb, - "No record found for (cpos = %u, clusters = %u) on inode %llu\n", - cpos, clusters, - (unsigned long long)OCFS2_I(inode)->ip_blkno); - mlog_errno(ret); - goto out_free; - } - - if (eb_bh) { - brelse(eb_bh); - eb_bh = NULL; - } - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - blkno, &eb_bh, OCFS2_BH_CACHED, - inode); - if (ret) { - mlog_errno(ret); - goto out_free; - } - eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EIO; - goto out_free; + *ret_emi = emi; + break; } - el = &eb->h_list; } +} - BUG_ON(el->l_tree_depth); - - for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - rec = &el->l_recs[i]; - - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > - OCFS2_I(inode)->ip_clusters) { - ret = -EBADR; - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", - i, - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno, - OCFS2_I(inode)->ip_clusters); - return ret; - } - - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } +static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, + unsigned int *phys, 
unsigned int *len, + unsigned int *flags) +{ + unsigned int coff; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map_item *emi; + + spin_lock(&oi->ip_lock); + + __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); + if (emi) { + coff = cpos - emi->ei_cpos; + *phys = emi->ei_phys + coff; + if (len) + *len = emi->ei_clusters - coff; + if (flags) + *flags = emi->ei_flags; } - ret = 0; + spin_unlock(&oi->ip_lock); -out_free: - if (eb_bh) - brelse(eb_bh); + if (emi == NULL) + return -ENOENT; - return ret; + return 0; } /* - * This lookup actually will read from disk. It has one invariant: - * It will never re-traverse blocks. This means that all inserts should - * be new regions or more granular regions (both allowed by insert). + * Forget about all clusters equal to or greater than cpos. */ -static int ocfs2_extent_map_lookup_read(struct inode *inode, - u32 cpos, - u32 clusters, - struct ocfs2_extent_map_entry **ret_ent) +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) { - int ret; - u64 blkno; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - struct buffer_head *bh = NULL; - struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di; - struct ocfs2_extent_list *el; - - spin_lock(&OCFS2_I(inode)->ip_lock); - ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); - if (ent) { - if (!ent->e_tree_depth) { - spin_unlock(&OCFS2_I(inode)->ip_lock); - *ret_ent = ent; - return 0; - } - blkno = le64_to_cpu(ent->e_rec.e_blkno); - spin_unlock(&OCFS2_I(inode)->ip_lock); - - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, - OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - if (bh) - brelse(bh); - return ret; + struct list_head *p, *n; + struct ocfs2_extent_map_item *emi; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + LIST_HEAD(tmp_list); + unsigned int range; + + spin_lock(&oi->ip_lock); + list_for_each_safe(p, n, &em->em_list) { + emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); + + if (emi->ei_cpos >= cpos) { + /* Full truncate of this record. 
*/ + list_move(&emi->ei_list, &tmp_list); + BUG_ON(em->em_num_items == 0); + em->em_num_items--; + continue; } - eb = (struct ocfs2_extent_block *)bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - brelse(bh); - return -EIO; - } - el = &eb->h_list; - } else { - spin_unlock(&OCFS2_I(inode)->ip_lock); - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - if (bh) - brelse(bh); - return ret; + range = emi->ei_cpos + emi->ei_clusters; + if (range > cpos) { + /* Partial truncate */ + emi->ei_clusters = cpos - emi->ei_cpos; } - di = (struct ocfs2_dinode *)bh->b_data; - if (!OCFS2_IS_VALID_DINODE(di)) { - brelse(bh); - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); - return -EIO; - } - el = &di->id2.i_list; - } - - ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); - brelse(bh); - if (ret) { - mlog_errno(ret); - return ret; } + spin_unlock(&oi->ip_lock); - ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); - if (!ent) { - ret = -ESRCH; - mlog_errno(ret); - return ret; + list_for_each_safe(p, n, &tmp_list) { + emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); + list_del(&emi->ei_list); + kfree(emi); } - - /* FIXME: Make sure this isn't a corruption */ - BUG_ON(ent->e_tree_depth); - - *ret_ent = ent; - - return 0; } /* - * Callers must hold ip_lock. This can insert pieces of the tree, - * thus racing lookup if the lock weren't held. + * Is any part of emi2 contained within emi1 */ -static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, - struct ocfs2_extent_map_entry *ent) +static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, + struct ocfs2_extent_map_item *emi2) { - struct rb_node **p, *parent; - struct ocfs2_extent_map_entry *old_ent; + unsigned int range1, range2; - old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), - le32_to_cpu(ent->e_rec.e_clusters), - &p, &parent); - if (old_ent) - return -EEXIST; + /* + * Check if logical start of emi2 is inside emi1 + */ + range1 = emi1->ei_cpos + emi1->ei_clusters; + if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) + return 1; - rb_link_node(&ent->e_node, parent, p); - rb_insert_color(&ent->e_node, &em->em_extents); + /* + * Check if logical end of emi2 is inside emi1 + */ + range2 = emi2->ei_cpos + emi2->ei_clusters; + if (range2 > emi1->ei_cpos && range2 <= range1) + return 1; return 0; } +static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, + struct ocfs2_extent_map_item *src) +{ + dest->ei_cpos = src->ei_cpos; + dest->ei_phys = src->ei_phys; + dest->ei_clusters = src->ei_clusters; + dest->ei_flags = src->ei_flags; +} /* - * Simple rule: on any return code other than -EAGAIN, anything left - * in the insert_context will be freed. - * - * Simple rule #2: A return code of -EEXIST from this function or - * its calls to ocfs2_extent_map_insert_entry() signifies that another - * thread beat us to the insert. It is not an actual error, but it - * tells the caller we have no more work to do. + * Try to merge emi with ins. Returns 1 if merge succeeds, zero + * otherwise. 
*/ -static int ocfs2_extent_map_try_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth, - struct ocfs2_em_insert_context *ctxt) +static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, + struct ocfs2_extent_map_item *ins) { - int ret; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *old_ent; - - ctxt->need_left = 0; - ctxt->need_right = 0; - ctxt->old_ent = NULL; - - spin_lock(&OCFS2_I(inode)->ip_lock); - ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); - if (!ret) { - ctxt->new_ent = NULL; - goto out_unlock; - } - - /* Since insert_entry failed, the map MUST have old_ent */ - old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), - le32_to_cpu(rec->e_clusters), - NULL, NULL); - - BUG_ON(!old_ent); - - if (old_ent->e_tree_depth < tree_depth) { - /* Another thread beat us to the lower tree_depth */ - ret = -EEXIST; - goto out_unlock; - } - - if (old_ent->e_tree_depth == tree_depth) { - /* - * Another thread beat us to this tree_depth. - * Let's make sure we agree with that thread (the - * extent_rec should be identical). - */ - if (!memcmp(rec, &old_ent->e_rec, - sizeof(struct ocfs2_extent_rec))) - ret = 0; - else - /* FIXME: Should this be ESRCH/EBADR??? */ - ret = -EEXIST; - - goto out_unlock; - } - /* - * We do it in this order specifically so that no actual tree - * changes occur until we have all the pieces we need. We - * don't want malloc failures to leave an inconsistent tree. - * Whenever we drop the lock, another process could be - * inserting. Also note that, if another process just beat us - * to an insert, we might not need the same pieces we needed - * the first go round. In the end, the pieces we need will - * be used, and the pieces we don't will be freed. 
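The new lookup path above replaces the rb-tree with a short, most-recently-used list guarded by ip_lock. A minimal userspace sketch of that idea, with illustrative names (ext_item, ext_cache) rather than the kernel types, and with insertion/eviction omitted:

#include <stddef.h>

#define MAX_ITEMS 3    /* mirrors OCFS2_MAX_EXTENT_MAP_ITEMS; eviction not shown */

struct ext_item {
    unsigned int cpos;        /* logical start cluster */
    unsigned int phys;        /* physical start cluster */
    unsigned int clusters;    /* length in clusters */
    struct ext_item *next;
};

struct ext_cache {
    int num_items;
    struct ext_item *head;    /* most recently used first */
};

/* Return 0 and the mapped physical cluster on a hit, -1 on a miss.
 * A hit moves the item to the front so it ages out last. */
static int cache_lookup(struct ext_cache *ec, unsigned int cpos,
                        unsigned int *phys)
{
    struct ext_item **pp, *it;

    for (pp = &ec->head; (it = *pp) != NULL; pp = &it->next) {
        if (cpos >= it->cpos && cpos < it->cpos + it->clusters) {
            *pp = it->next;          /* unlink from current position */
            it->next = ec->head;     /* move to front */
            ec->head = it;
            *phys = it->phys + (cpos - it->cpos);
            return 0;
        }
    }
    return -1;
}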
+ * Handle contiguousness */ - ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > - le32_to_cpu(old_ent->e_rec.e_cpos)); - ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + - le32_to_cpu(old_ent->e_rec.e_clusters)) > - (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); - ret = -EAGAIN; - if (ctxt->need_left) { - if (!ctxt->left_ent) - goto out_unlock; - *(ctxt->left_ent) = *old_ent; - ctxt->left_ent->e_rec.e_clusters = - cpu_to_le32(le32_to_cpu(rec->e_cpos) - - le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); - } - if (ctxt->need_right) { - if (!ctxt->right_ent) - goto out_unlock; - *(ctxt->right_ent) = *old_ent; - ctxt->right_ent->e_rec.e_cpos = - cpu_to_le32(le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)); - ctxt->right_ent->e_rec.e_clusters = - cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + - le32_to_cpu(old_ent->e_rec.e_clusters)) - - le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); - } - - rb_erase(&old_ent->e_node, &em->em_extents); - /* Now that he's erased, set him up for deletion */ - ctxt->old_ent = old_ent; - - if (ctxt->need_left) { - ret = ocfs2_extent_map_insert_entry(em, - ctxt->left_ent); - if (ret) - goto out_unlock; - ctxt->left_ent = NULL; + if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && + ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && + ins->ei_flags == emi->ei_flags) { + emi->ei_clusters += ins->ei_clusters; + return 1; + } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && + (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys && + ins->ei_flags == emi->ei_flags) { + emi->ei_phys = ins->ei_phys; + emi->ei_cpos = ins->ei_cpos; + emi->ei_clusters += ins->ei_clusters; + return 1; } - if (ctxt->need_right) { - ret = ocfs2_extent_map_insert_entry(em, - ctxt->right_ent); - if (ret) - goto out_unlock; - ctxt->right_ent = NULL; + /* + * Overlapping extents - this shouldn't happen unless we've + * split an extent to change it's flags. That is exceedingly + * rare, so there's no sense in trying to optimize it yet. + */ + if (ocfs2_ei_is_contained(emi, ins) || + ocfs2_ei_is_contained(ins, emi)) { + ocfs2_copy_emi_fields(emi, ins); + return 1; } - ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); - - if (!ret) - ctxt->new_ent = NULL; - -out_unlock: - spin_unlock(&OCFS2_I(inode)->ip_lock); - - return ret; + /* No merge was possible. */ + return 0; } - -static int ocfs2_extent_map_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth) +/* + * In order to reduce complexity on the caller, this insert function + * is intentionally liberal in what it will accept. + * + * The only rule is that the truncate call *must* be used whenever + * records have been deleted. This avoids inserting overlapping + * records with different physical mappings. 
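A compact illustration of the three merge cases handled by the new code above (right-contiguous, left-contiguous, and overlap), written against plain structs with hypothetical names rather than the kernel API:

struct extent {
    unsigned int cpos;        /* logical start */
    unsigned int phys;        /* physical start */
    unsigned int clusters;    /* length */
    unsigned int flags;
};

/* Return 1 and widen *cur when ins can be folded into it. */
static int try_merge(struct extent *cur, const struct extent *ins)
{
    if (ins->flags != cur->flags)
        return 0;

    /* ins continues cur on the right, both logically and physically */
    if (ins->cpos == cur->cpos + cur->clusters &&
        ins->phys == cur->phys + cur->clusters) {
        cur->clusters += ins->clusters;
        return 1;
    }

    /* ins ends exactly where cur begins */
    if (ins->cpos + ins->clusters == cur->cpos &&
        ins->phys + ins->clusters == cur->phys) {
        cur->cpos = ins->cpos;
        cur->phys = ins->phys;
        cur->clusters += ins->clusters;
        return 1;
    }

    /* overlapping ranges: take the newer mapping wholesale */
    if (ins->cpos < cur->cpos + cur->clusters &&
        ins->cpos + ins->clusters > cur->cpos) {
        *cur = *ins;
        return 1;
    }

    return 0;
}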
+ */ +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec) { - int ret; - struct ocfs2_em_insert_context ctxt = {0, }; - - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > - OCFS2_I(inode)->ip_map.em_clusters) { - ret = -EBADR; - mlog_errno(ret); - return ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + struct ocfs2_extent_map_item *emi, *new_emi = NULL; + struct ocfs2_extent_map_item ins; + + ins.ei_cpos = le32_to_cpu(rec->e_cpos); + ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); + ins.ei_flags = rec->e_flags; + +search: + spin_lock(&oi->ip_lock); + + list_for_each_entry(emi, &em->em_list, ei_list) { + if (ocfs2_try_to_merge_extent_map(emi, &ins)) { + list_move(&emi->ei_list, &em->em_list); + spin_unlock(&oi->ip_lock); + goto out; + } } - /* Zero e_clusters means a truncated tail record. It better be EOF */ - if (!rec->e_clusters) { - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != - OCFS2_I(inode)->ip_map.em_clusters) { - ret = -EBADR; - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n", - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno); - return ret; - } + /* + * No item could be merged. + * + * Either allocate and add a new item, or overwrite the last recently + * inserted. + */ - /* Ignore the truncated tail */ - return 0; - } + if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { + if (new_emi == NULL) { + spin_unlock(&oi->ip_lock); - ret = -ENOMEM; - ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.new_ent) { - mlog_errno(ret); - return ret; - } + new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); + if (new_emi == NULL) + goto out; - ctxt.new_ent->e_rec = *rec; - ctxt.new_ent->e_tree_depth = tree_depth; - - do { - ret = -ENOMEM; - if (ctxt.need_left && !ctxt.left_ent) { - ctxt.left_ent = - kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.left_ent) - break; - } - if (ctxt.need_right && !ctxt.right_ent) { - ctxt.right_ent = - kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.right_ent) - break; + goto search; } - ret = ocfs2_extent_map_try_insert(inode, rec, - tree_depth, &ctxt); - } while (ret == -EAGAIN); - - if ((ret < 0) && (ret != -EEXIST)) - mlog_errno(ret); + ocfs2_copy_emi_fields(new_emi, &ins); + list_add(&new_emi->ei_list, &em->em_list); + em->em_num_items++; + new_emi = NULL; + } else { + BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); + emi = list_entry(em->em_list.prev, + struct ocfs2_extent_map_item, ei_list); + list_move(&emi->ei_list, &em->em_list); + ocfs2_copy_emi_fields(emi, &ins); + } - if (ctxt.left_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); - if (ctxt.right_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); - if (ctxt.old_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); - if (ctxt.new_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); + spin_unlock(&oi->ip_lock); - return ret; +out: + if (new_emi) + kfree(new_emi); } /* - * Append this record to the tail of the extent map. It must be - * tree_depth 0. The record might be an extension of an existing - * record, and as such that needs to be handled. 
eg: - * - * Existing record in the extent map: - * - * cpos = 10, len = 10 - * |---------| - * - * New Record: - * - * cpos = 10, len = 20 - * |------------------| - * - * The passed record is the new on-disk record. The new_clusters value - * is how many clusters were added to the file. If the append is a - * contiguous append, the new_clusters has been added to - * rec->e_clusters. If the append is an entirely new extent, then - * rec->e_clusters is == new_clusters. + * Return the 1st index within el which contains an extent start + * larger than v_cluster. */ -int ocfs2_extent_map_append(struct inode *inode, - struct ocfs2_extent_rec *rec, - u32 new_clusters) +static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, + u32 v_cluster) { - int ret; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - struct ocfs2_extent_rec *old; - - BUG_ON(!new_clusters); - BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); + int i; + struct ocfs2_extent_rec *rec; - if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { - /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; - } + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; - mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) != - (em->em_clusters + new_clusters), - "Inode %llu:\n" - "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" - "em->em_clusters = %u + new_clusters = %u = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), - le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), - em->em_clusters, new_clusters, - em->em_clusters + new_clusters); - - em->em_clusters += new_clusters; - - ret = -ENOENT; - if (le32_to_cpu(rec->e_clusters) > new_clusters) { - /* This is a contiguous append */ - ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, - NULL, NULL); - if (ent) { - old = &ent->e_rec; - BUG_ON((le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) != - (le32_to_cpu(old->e_cpos) + - le32_to_cpu(old->e_clusters) + - new_clusters)); - if (ent->e_tree_depth == 0) { - BUG_ON(le32_to_cpu(old->e_cpos) != - le32_to_cpu(rec->e_cpos)); - BUG_ON(le64_to_cpu(old->e_blkno) != - le64_to_cpu(rec->e_blkno)); - ret = 0; - } - /* - * Let non-leafs fall through as -ENOENT to - * force insertion of the new leaf. - */ - le32_add_cpu(&old->e_clusters, new_clusters); - } + if (v_cluster < le32_to_cpu(rec->e_cpos)) + break; } - if (ret == -ENOENT) - ret = ocfs2_extent_map_insert(inode, rec, 0); - if (ret < 0) - mlog_errno(ret); - return ret; + return i; } -#if 0 -/* Code here is included but defined out as it completes the extent - * map api and may be used in the future. */ - /* - * Look up the record containing this cluster offset. This record is - * part of the extent map. Do not free it. Any changes you make to - * it will reflect in the extent map. So, if your last extent - * is (cpos = 10, clusters = 10) and you truncate the file by 5 - * clusters, you can do: + * Figure out the size of a hole which starts at v_cluster within the given + * extent list. * - * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); - * rec->e_clusters -= 5; + * If there is no more allocation past v_cluster, we return the maximum + * cluster size minus v_cluster. * - * The lookup does not read from disk. 
If the map isn't filled in for - * an entry, you won't find it. - * - * Also note that the returned record is valid until alloc_sem is - * dropped. After that, truncate and extend can happen. Caveat Emptor. + * If we have in-inode extents, then el points to the dinode list and + * eb_bh is NULL. Otherwise, eb_bh should point to the extent block + * containing el. */ -int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, - struct ocfs2_extent_rec **rec, - int *tree_depth) +static int ocfs2_figure_hole_clusters(struct inode *inode, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters) { - int ret = -ENOENT; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; + int ret, i; + struct buffer_head *next_eb_bh = NULL; + struct ocfs2_extent_block *eb, *next_eb; - *rec = NULL; + i = ocfs2_search_for_hole_index(el, v_cluster); - if (cpos >= OCFS2_I(inode)->ip_clusters) - return -EINVAL; + if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { + eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (cpos >= em->em_clusters) { /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters + * Check the next leaf for any extents. */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters ; - } - - ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, - NULL, NULL); - if (ent) { - *rec = &ent->e_rec; - if (tree_depth) - *tree_depth = ent->e_tree_depth; - ret = 0; - } + if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) + goto no_more_extents; - return ret; -} + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + le64_to_cpu(eb->h_next_leaf_blk), + &next_eb_bh, OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + goto out; + } + next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; -int ocfs2_extent_map_get_clusters(struct inode *inode, - u32 v_cpos, int count, - u32 *p_cpos, int *ret_count) -{ - int ret; - u32 coff, ccount; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent = NULL; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) { + ret = -EROFS; + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb); + goto out; + } - *p_cpos = ccount = 0; + el = &next_eb->h_list; - if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) - return -EINVAL; + i = ocfs2_search_for_hole_index(el, v_cluster); + } - if ((v_cpos + count) > em->em_clusters) { +no_more_extents: + if (i == le16_to_cpu(el->l_next_free_rec)) { /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters + * We're at the end of our existing allocation. Just + * return the maximum number of clusters we could + * possibly allocate. */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; + *num_clusters = UINT_MAX - v_cluster; + } else { + *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; } + ret = 0; +out: + brelse(next_eb_bh); + return ret; +} - ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); - if (ret) - return ret; +/* + * Return the index of the extent record which contains cluster #v_cluster. + * -1 is returned if it was not found. + * + * Should work fine on interior and exterior nodes. 
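The hole sizing done by ocfs2_search_for_hole_index() and ocfs2_figure_hole_clusters() above can be summarized with a small sketch over a sorted in-memory extent array (the on-disk list walk and next-leaf hop are omitted; names are illustrative):

#include <limits.h>

struct rec {
    unsigned int cpos;        /* logical start */
    unsigned int clusters;    /* length */
};

/* How many clusters of hole start at v_cluster, given records sorted
 * by cpos and known not to cover v_cluster. */
static unsigned int hole_clusters(const struct rec *recs, int nr,
                                  unsigned int v_cluster)
{
    int i;

    for (i = 0; i < nr; i++)
        if (v_cluster < recs[i].cpos)
            break;

    if (i == nr)                       /* nothing allocated past v_cluster */
        return UINT_MAX - v_cluster;

    return recs[i].cpos - v_cluster;   /* gap up to the next extent */
}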
+ */ +static int ocfs2_search_extent_list(struct ocfs2_extent_list *el, + u32 v_cluster) +{ + int ret = -1; + int i; + struct ocfs2_extent_rec *rec; + u32 rec_end, rec_start, clusters; - if (ent) { - /* We should never find ourselves straddling an interval */ - if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, - v_cpos, - count)) - return -ESRCH; + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; - coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); - *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, - le64_to_cpu(ent->e_rec.e_blkno)) + - coff; + rec_start = le32_to_cpu(rec->e_cpos); + clusters = ocfs2_rec_clusters(el, rec); - if (ret_count) - *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; + rec_end = rec_start + clusters; - return 0; + if (v_cluster >= rec_start && v_cluster < rec_end) { + ret = i; + break; + } } - - return -ENOENT; + return ret; } -#endif /* 0 */ - -int ocfs2_extent_map_get_blocks(struct inode *inode, - u64 v_blkno, int count, - u64 *p_blkno, int *ret_count) +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + unsigned int *extent_flags) { - int ret; - u64 boff; - u32 cpos, clusters; - int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); - struct ocfs2_extent_map_entry *ent = NULL; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + int ret, i; + unsigned int flags = 0; + struct buffer_head *di_bh = NULL; + struct buffer_head *eb_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; + u32 coff; - *p_blkno = 0; - - cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); - clusters = ocfs2_blocks_to_clusters(inode->i_sb, - (u64)count + bpc - 1); - if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { - ret = -EINVAL; - mlog_errno(ret); - return ret; - } - - if ((cpos + clusters) > em->em_clusters) { - /* - * Size changed underneath us on disk. 
Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; - } + ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, + num_clusters, extent_flags); + if (ret == 0) + goto out; - ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, + &di_bh, OCFS2_BH_CACHED, inode); if (ret) { mlog_errno(ret); - return ret; + goto out; } - if (ent) - { - rec = &ent->e_rec; + di = (struct ocfs2_dinode *) di_bh->b_data; + el = &di->id2.i_list; - /* We should never find ourselves straddling an interval */ - if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { - ret = -ESRCH; + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + if (ret) { mlog_errno(ret); - return ret; + goto out; } - boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - - le32_to_cpu(rec->e_cpos)); - boff += (v_blkno & (u64)(bpc - 1)); - *p_blkno = le64_to_cpu(rec->e_blkno) + boff; + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; - if (ret_count) { - *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, - le32_to_cpu(rec->e_clusters)) - boff; + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; } - - return 0; } - return -ENOENT; -} - -int ocfs2_extent_map_init(struct inode *inode) -{ - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - - em->em_extents = RB_ROOT; - em->em_clusters = 0; - - return 0; -} - -/* Needs the lock */ -static void __ocfs2_extent_map_drop(struct inode *inode, - u32 new_clusters, - struct rb_node **free_head, - struct ocfs2_extent_map_entry **tail_ent) -{ - struct rb_node *node, *next; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + /* + * A hole was found. Return some canned values that + * callers can key on. If asked for, num_clusters will + * be populated with the size of the hole. + */ + *p_cluster = 0; + if (num_clusters) { + ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, + v_cluster, + num_clusters); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } else { + rec = &el->l_recs[i]; - *free_head = NULL; + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); - ent = NULL; - node = rb_last(&em->em_extents); - while (node) - { - next = rb_prev(node); + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0)", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } - ent = rb_entry(node, struct ocfs2_extent_map_entry, - e_node); - if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) - break; + coff = v_cluster - le32_to_cpu(rec->e_cpos); - rb_erase(&ent->e_node, &em->em_extents); + *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; - node->rb_right = *free_head; - *free_head = node; + if (num_clusters) + *num_clusters = ocfs2_rec_clusters(el, rec) - coff; - ent = NULL; - node = next; - } + flags = rec->e_flags; - /* Do we have an entry straddling new_clusters? 
*/ - if (tail_ent) { - if (ent && - ((le32_to_cpu(ent->e_rec.e_cpos) + - le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) - *tail_ent = ent; - else - *tail_ent = NULL; + ocfs2_extent_map_insert_rec(inode, rec); } -} - -static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) -{ - struct rb_node *node; - struct ocfs2_extent_map_entry *ent; - while (free_head) { - node = free_head; - free_head = node->rb_right; + if (extent_flags) + *extent_flags = flags; - ent = rb_entry(node, struct ocfs2_extent_map_entry, - e_node); - kmem_cache_free(ocfs2_em_ent_cachep, ent); - } +out: + brelse(di_bh); + brelse(eb_bh); + return ret; } /* - * Remove all entries past new_clusters, inclusive of an entry that - * contains new_clusters. This is effectively a cache forget. - * - * If you want to also clip the last extent by some number of clusters, - * you need to call ocfs2_extent_map_trunc(). - * This code does not check or modify ip_clusters. + * This expects alloc_sem to be held. The allocation cannot change at + * all while the map is in the process of being updated. */ -int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags) { - struct rb_node *free_head = NULL; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - - spin_lock(&OCFS2_I(inode)->ip_lock); + int ret; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 cpos, num_clusters, p_cluster; + u64 boff = 0; - __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); + cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); - if (ent) { - rb_erase(&ent->e_node, &em->em_extents); - ent->e_node.rb_right = free_head; - free_head = &ent->e_node; + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, + extent_flags); + if (ret) { + mlog_errno(ret); + goto out; } - spin_unlock(&OCFS2_I(inode)->ip_lock); - - if (free_head) - __ocfs2_extent_map_drop_cleanup(free_head); - - return 0; -} - -/* - * Remove all entries past new_clusters and also clip any extent - * straddling new_clusters, if there is one. This does not check - * or modify ip_clusters - */ -int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) -{ - struct rb_node *free_head = NULL; - struct ocfs2_extent_map_entry *ent = NULL; - - spin_lock(&OCFS2_I(inode)->ip_lock); - - __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); - - if (ent) - ent->e_rec.e_clusters = cpu_to_le32(new_clusters - - le32_to_cpu(ent->e_rec.e_cpos)); - - OCFS2_I(inode)->ip_map.em_clusters = new_clusters; - - spin_unlock(&OCFS2_I(inode)->ip_lock); - - if (free_head) - __ocfs2_extent_map_drop_cleanup(free_head); - - return 0; -} + /* + * p_cluster == 0 indicates a hole. 
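The arithmetic in ocfs2_extent_map_get_blocks() here converts a virtual block to a cluster, maps the cluster, then re-applies the sub-cluster offset. A minimal sketch assuming a power-of-two blocks-per-cluster value; lookup_cluster() stands in for ocfs2_get_clusters() and returns 0 for a hole:

#include <stdint.h>

static uint64_t map_block(uint64_t v_blkno, unsigned int bpc /* power of two */,
                          uint32_t (*lookup_cluster)(uint32_t cpos))
{
    uint32_t cpos = (uint32_t)(v_blkno / bpc);      /* virtual cluster */
    uint32_t p_cluster = lookup_cluster(cpos);
    uint64_t p_blkno = 0;

    if (p_cluster) {
        p_blkno = (uint64_t)p_cluster * bpc;         /* cluster -> block */
        p_blkno += v_blkno & (uint64_t)(bpc - 1);    /* offset in cluster */
    }
    return p_blkno;    /* 0 indicates a hole */
}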
+ */ + if (p_cluster) { + boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + boff += (v_blkno & (u64)(bpc - 1)); + } -int __init init_ocfs2_extent_maps(void) -{ - ocfs2_em_ent_cachep = - kmem_cache_create("ocfs2_em_ent", - sizeof(struct ocfs2_extent_map_entry), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!ocfs2_em_ent_cachep) - return -ENOMEM; + *p_blkno = boff; - return 0; -} + if (ret_count) { + *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + *ret_count -= v_blkno & (u64)(bpc - 1); + } -void exit_ocfs2_extent_maps(void) -{ - kmem_cache_destroy(ocfs2_em_ent_cachep); +out: + return ret; } diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index fa3745efa88..de91e3e41a2 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -25,22 +25,29 @@ #ifndef _EXTENT_MAP_H #define _EXTENT_MAP_H -int init_ocfs2_extent_maps(void); -void exit_ocfs2_extent_maps(void); +struct ocfs2_extent_map_item { + unsigned int ei_cpos; + unsigned int ei_phys; + unsigned int ei_clusters; + unsigned int ei_flags; -/* - * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem - * to be held. The allocation cannot change at all while the map is - * in the process of being updated. - */ -int ocfs2_extent_map_init(struct inode *inode); -int ocfs2_extent_map_append(struct inode *inode, - struct ocfs2_extent_rec *rec, - u32 new_clusters); -int ocfs2_extent_map_get_blocks(struct inode *inode, - u64 v_blkno, int count, - u64 *p_blkno, int *ret_count); -int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); -int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); + struct list_head ei_list; +}; + +#define OCFS2_MAX_EXTENT_MAP_ITEMS 3 +struct ocfs2_extent_map { + unsigned int em_num_items; + struct list_head em_list; +}; + +void ocfs2_extent_map_init(struct inode *inode); +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster); +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec); + +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, unsigned int *extent_flags); +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags); #endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f2cd3bf9efb..520a2a6d767 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -33,6 +33,7 @@ #include <linux/sched.h> #include <linux/pipe_fs_i.h> #include <linux/mount.h> +#include <linux/writeback.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle, mlog_entry_void(); i_size_write(inode, new_i_size); - inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); + inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_ctime = inode->i_mtime = CURRENT_TIME; status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); @@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, { int status; handle_t *handle; + struct ocfs2_dinode *di; mlog_entry_void(); @@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, goto out; } - status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + /* + * Do this before setting i_size. 
+ */ + status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); + if (status) { + mlog_errno(status); + goto out_commit; + } + + i_size_write(inode, new_i_size); + inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di = (struct ocfs2_dinode *) fe_bh->b_data; + di->i_size = cpu_to_le64(new_i_size); + di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + status = ocfs2_journal_dirty(handle, fe_bh); if (status < 0) mlog_errno(status); +out_commit: ocfs2_commit_trans(osb, handle); out: + mlog_exit(status); return status; } @@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode, mlog_errno(status); goto bail; } - ocfs2_data_unlock(inode, 1); - - if (le32_to_cpu(fe->i_clusters) == - ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { - mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", - fe->i_clusters); - /* No allocation change is required, so lets fast path - * this truncate. */ - status = ocfs2_simple_size_update(inode, di_bh, new_i_size); - if (status < 0) - mlog_errno(status); - goto bail; - } /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the @@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode, status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); if (status < 0) { mlog_errno(status); - goto bail; + goto bail_unlock_data; } status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); if (status < 0) { mlog_errno(status); - goto bail; + goto bail_unlock_data; } status = ocfs2_commit_truncate(osb, inode, di_bh, tc); if (status < 0) { mlog_errno(status); - goto bail; + goto bail_unlock_data; } /* TODO: orphan dir cleanup here. */ +bail_unlock_data: + ocfs2_data_unlock(inode, 1); + bail: mlog_exit(status); @@ -397,6 +416,7 @@ bail: */ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, struct inode *inode, + u32 *logical_offset, u32 clusters_to_add, struct buffer_head *fe_bh, handle_t *handle, @@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, block = ocfs2_clusters_to_blocks(osb->sb, bit_off); mlog(0, "Allocating %u clusters at block %u for inode %llu\n", num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, - num_bits, meta_ac); + status = ocfs2_insert_extent(osb, handle, inode, fe_bh, + *logical_offset, block, num_bits, + meta_ac); if (status < 0) { mlog_errno(status); goto leave; } - le32_add_cpu(&fe->i_clusters, num_bits); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - spin_unlock(&OCFS2_I(inode)->ip_lock); - status = ocfs2_journal_dirty(handle, fe_bh); if (status < 0) { mlog_errno(status); @@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, } clusters_to_add -= num_bits; + *logical_offset += num_bits; if (clusters_to_add) { mlog(0, "need to alloc once more, clusters = %u, wanted = " @@ -494,14 +511,87 @@ leave: return status; } +/* + * For a given allocation, determine which allocators will need to be + * accessed, and lock them, reserving the appropriate number of bits. + * + * Called from ocfs2_extend_allocation() for file systems which don't + * support holes, and from ocfs2_write() for file systems which + * understand sparse inodes. 
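The reservation policy described for ocfs2_lock_allocators() boils down to a small decision: always reserve data clusters, and reserve metadata only when the extent list may need to grow. A hedged sketch of that decision, with illustrative names and return conventions:

static int plan_reservation(int num_free_extents, unsigned int clusters_to_add,
                            int sparse_alloc,
                            int *want_meta, unsigned int *want_clusters)
{
    if (num_free_extents < 0)
        return num_free_extents;    /* propagate lookup error */

    /*
     * Sparse file systems reserve metadata more aggressively: the real
     * allocation happens with a transaction open, so running out of
     * extent records mid-write cannot be fixed by re-taking locks.
     */
    *want_meta = (num_free_extents == 0 ||
                  (sparse_alloc &&
                   (unsigned int)num_free_extents < clusters_to_add));
    *want_clusters = clusters_to_add;
    return 0;
}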
+ */ +int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, + u32 clusters_to_add, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac) +{ + int ret, num_free_extents; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *meta_ac = NULL; + *data_ac = NULL; + + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " + "clusters_to_add = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), + le32_to_cpu(di->i_clusters), clusters_to_add); + + num_free_extents = ocfs2_num_free_extents(osb, inode, di); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + /* + * Sparse allocation file systems need to be more conservative + * with reserving room for expansion - the actual allocation + * happens while we've got a journal handle open so re-taking + * a cluster lock (because we ran out of room for another + * extent) will violate ordering rules. + * + * Most of the time we'll only be seeing this 1 cluster at a time + * anyway. + */ + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { + ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + + /* + * We cannot have an error and a non null *data_ac. + */ + } + + return ret; +} + static int ocfs2_extend_allocation(struct inode *inode, u32 clusters_to_add) { int status = 0; int restart_func = 0; int drop_alloc_sem = 0; - int credits, num_free_extents; - u32 prev_clusters; + int credits; + u32 prev_clusters, logical_start; struct buffer_head *bh = NULL; struct ocfs2_dinode *fe = NULL; handle_t *handle = NULL; @@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode, mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); + /* + * This function only exists for file systems which don't + * support holes. + */ + BUG_ON(ocfs2_sparse_alloc(osb)); + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, OCFS2_BH_CACHED, inode); if (status < 0) { @@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode, goto leave; } + logical_start = OCFS2_I(inode)->ip_clusters; + restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); - mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " - "clusters_to_add = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), - fe->i_clusters, clusters_to_add); - - num_free_extents = ocfs2_num_free_extents(osb, - inode, - fe); - if (num_free_extents < 0) { - status = num_free_extents; - mlog_errno(status); - goto leave; - } - - if (!num_free_extents) { - status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); - goto leave; - } - } - - status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); - goto leave; - } - /* blocks peope in read/write from reading our allocation * until we're done changing it. We depend on i_mutex to block * other extend/truncate calls while we're here. 
Ordering wrt @@ -566,6 +634,13 @@ restart_all: down_write(&OCFS2_I(inode)->ip_alloc_sem); drop_alloc_sem = 1; + status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, + &meta_ac); + if (status) { + mlog_errno(status); + goto leave; + } + credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -590,6 +665,7 @@ restarted_transaction: status = ocfs2_do_extend_allocation(osb, inode, + &logical_start, clusters_to_add, bh, handle, @@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode, size_t tail_to_skip) { int ret = 0; - u32 clusters_to_add; + u32 clusters_to_add = 0; BUG_ON(!tail_to_skip && !di_bh); @@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode, goto out; BUG_ON(new_i_size < i_size_read(inode)); + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { + BUG_ON(tail_to_skip != 0); + goto out_update_size; + } + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - OCFS2_I(inode)->ip_clusters; @@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode, goto out_unlock; } +out_update_size: if (!tail_to_skip) { /* We're being called from ocfs2_setattr() which wants * us to update i_size */ @@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode, } out_unlock: - ocfs2_data_unlock(inode, 1); + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ocfs2_data_unlock(inode, 1); out: return ret; @@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) ret = ocfs2_meta_lock(inode, NULL, 0); if (ret) { - mlog_errno(ret); + if (ret != -ENOENT) + mlog_errno(ret); goto out; } @@ -1035,10 +1119,49 @@ out: return ret; } +/* + * Will look for holes and unwritten extents in the range starting at + * pos for count bytes (inclusive). + */ +static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + static int ocfs2_prepare_inode_for_write(struct dentry *dentry, loff_t *ppos, size_t count, - int appending) + int appending, + int *direct_io) { int ret = 0, meta_level = appending; struct inode *inode = dentry->d_inode; @@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, } else { saved_pos = *ppos; } + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { + loff_t end = saved_pos + count; + + /* + * Skip the O_DIRECT checks if we don't need + * them. + */ + if (!direct_io || !(*direct_io)) + break; + + /* + * Allowing concurrent direct writes means + * i_size changes wouldn't be synchronized, so + * one node could wind up truncating another + * nodes writes. + */ + if (end > i_size_read(inode)) { + *direct_io = 0; + break; + } + + /* + * We don't fill holes during direct io, so + * check for them here. If any are found, the + * caller will have to retake some cluster + * locks and initiate the io as buffered. 
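The scan performed by ocfs2_check_range_for_holes() above reduces to walking the clusters backing the write range and stopping at the first unallocated or unwritten extent. A small sketch with get_extent() standing in for ocfs2_get_clusters(); names are illustrative:

struct mapping { unsigned int phys; unsigned int len; int unwritten; };

static int range_has_holes(unsigned int cpos, unsigned int clusters,
                           void (*get_extent)(unsigned int cpos, struct mapping *m))
{
    struct mapping m;

    while (clusters) {
        get_extent(cpos, &m);

        if (m.phys == 0 || m.unwritten)
            return 1;    /* caller must fall back to buffered I/O */

        if (m.len > clusters)
            m.len = clusters;
        clusters -= m.len;
        cpos += m.len;
    }
    return 0;
}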
+ */ + ret = ocfs2_check_range_for_holes(inode, saved_pos, + count); + if (ret == 1) { + *direct_io = 0; + ret = 0; + } else if (ret < 0) + mlog_errno(ret); + break; + } + + /* + * The rest of this loop is concerned with legacy file + * systems which don't support sparse files. + */ + newsize = count + saved_pos; mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", @@ -1141,55 +1307,264 @@ out: return ret; } +static inline void +ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) +{ + const struct iovec *iov = *iovp; + size_t base = *basep; + + do { + int copy = min(bytes, iov->iov_len - base); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } while (bytes); + *iovp = iov; + *basep = base; +} + +static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, + const struct iovec *cur_iov, + size_t iov_offset) +{ + int ret; + char *buf; + struct page *src_page = NULL; + + buf = cur_iov->iov_base + iov_offset; + + if (!segment_eq(get_fs(), KERNEL_DS)) { + /* + * Pull in the user page. We want to do this outside + * of the meta data locks in order to preserve locking + * order in case of page fault. + */ + ret = get_user_pages(current, current->mm, + (unsigned long)buf & PAGE_CACHE_MASK, 1, + 0, 0, &src_page, NULL); + if (ret == 1) + bp->b_src_buf = kmap(src_page); + else + src_page = ERR_PTR(-EFAULT); + } else { + bp->b_src_buf = buf; + } + + return src_page; +} + +static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, + struct page *page) +{ + if (page) { + kunmap(page); + page_cache_release(page); + } +} + +static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, + const struct iovec *iov, + unsigned long nr_segs, + size_t count, + ssize_t o_direct_written) +{ + int ret = 0; + ssize_t copied, total = 0; + size_t iov_offset = 0; + const struct iovec *cur_iov = iov; + struct ocfs2_buffered_write_priv bp; + struct page *page; + + /* + * handle partial DIO write. Adjust cur_iov if needed. + */ + ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); + + do { + bp.b_cur_off = iov_offset; + bp.b_cur_iov = cur_iov; + + page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto out; + } + + copied = ocfs2_buffered_write_cluster(file, *ppos, count, + ocfs2_map_and_write_user_data, + &bp); + + ocfs2_put_write_source(&bp, page); + + if (copied < 0) { + mlog_errno(copied); + ret = copied; + goto out; + } + + total += copied; + *ppos = *ppos + copied; + count -= copied; + + ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); + } while(count); + +out: + return total ? total : ret; +} + +static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted, + unsigned long *nr_segs) +{ + size_t ocount; /* original count */ + unsigned long seg; + + ocount = 0; + for (seg = 0; seg < *nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
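The iovec bookkeeping introduced here (ocfs2_set_next_iovec) advances a segment pointer and intra-segment offset after some bytes have been consumed, for example by a partial O_DIRECT write. A userspace sketch of the same stepping logic:

#include <stddef.h>
#include <sys/uio.h>

static void advance_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
    const struct iovec *iov = *iovp;
    size_t base = *basep;

    while (bytes) {
        size_t step = iov->iov_len - base;    /* room left in this segment */

        if (step > bytes)
            step = bytes;
        bytes -= step;
        base += step;
        if (base == iov->iov_len) {           /* segment exhausted */
            iov++;
            base = 0;
        }
    }
    *iovp = iov;
    *basep = base;
}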
+ */ + ocount += iv->iov_len; + if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + *nr_segs = seg; + ocount -= iv->iov_len; /* This segment is no good */ + break; + } + + *counted = ocount; + return 0; +} + static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - int ret, rw_level, have_alloc_sem = 0; - struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_path.dentry->d_inode; - int appending = filp->f_flags & O_APPEND ? 1 : 0; - - mlog_entry("(0x%p, %u, '%.*s')\n", filp, + int ret, direct_io, appending, rw_level, have_alloc_sem = 0; + int can_do_direct, sync = 0; + ssize_t written = 0; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ + loff_t *ppos = &iocb->ki_pos; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode; + + mlog_entry("(0x%p, %u, '%.*s')\n", file, (unsigned int)nr_segs, - filp->f_path.dentry->d_name.len, - filp->f_path.dentry->d_name.name); + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name); - /* happy write of zero bytes */ if (iocb->ki_left == 0) return 0; + ret = ocfs2_check_iovec(iov, &ocount, &nr_segs); + if (ret) + return ret; + + count = ocount; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + appending = file->f_flags & O_APPEND ? 1 : 0; + direct_io = file->f_flags & O_DIRECT ? 1 : 0; + mutex_lock(&inode->i_mutex); + +relock: /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ - if (filp->f_flags & O_DIRECT) { - have_alloc_sem = 1; + if (direct_io) { down_read(&inode->i_alloc_sem); + have_alloc_sem = 1; } /* concurrent O_DIRECT writes are allowed */ - rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; + rw_level = !direct_io; ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { - rw_level = -1; mlog_errno(ret); - goto out; + goto out_sems; } - ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, - iocb->ki_left, appending); + can_do_direct = direct_io; + ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, + iocb->ki_left, appending, + &can_do_direct); if (ret < 0) { mlog_errno(ret); goto out; } - /* communicate with ocfs2_dio_end_io */ - ocfs2_iocb_set_rw_locked(iocb); + /* + * We can't complete the direct I/O as requested, fall back to + * buffered I/O. + */ + if (direct_io && !can_do_direct) { + ocfs2_rw_unlock(inode, rw_level); + up_read(&inode->i_alloc_sem); + + have_alloc_sem = 0; + rw_level = -1; - ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); + direct_io = 0; + sync = 1; + goto relock; + } + + if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) + sync = 1; + + /* + * XXX: Is it ok to execute these checks a second time? + */ + ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); + if (ret) + goto out; + + /* + * Set pos so that sync_page_range_nolock() below understands + * where to start from. We might've moved it around via the + * calls above. The range we want to actually sync starts from + * *ppos here. 
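The fallback taken above when an O_DIRECT write cannot be honoured (it extends i_size or crosses a hole) amounts to choosing an I/O mode and remembering whether the buffered path must sync the range itself. A purely illustrative sketch of that decision:

enum io_mode { IO_DIRECT, IO_BUFFERED };

static enum io_mode choose_mode(int requested_direct, int extends_i_size,
                                int crosses_hole, int *need_sync)
{
    if (requested_direct && !extends_i_size && !crosses_hole)
        return IO_DIRECT;

    /* fall back: buffered path syncs the written range afterwards */
    *need_sync = 1;
    return IO_BUFFERED;
}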
+ * + */ + pos = *ppos; + + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb, rw_level); + + if (direct_io) { + written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, + ppos, count, ocount); + if (written < 0) { + ret = written; + goto out_dio; + } + } else { + written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, + count, written); + if (written < 0) { + ret = written; + if (ret != -EFAULT || ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } +out_dio: /* buffered aio wouldn't have proper lock coverage today */ - BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); + BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io @@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, } out: + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + +out_sems: if (have_alloc_sem) up_read(&inode->i_alloc_sem); - if (rw_level != -1) - ocfs2_rw_unlock(inode, rw_level); + + if (written > 0 && sync) { + ssize_t err; + + err = sync_page_range_nolock(inode, file->f_mapping, pos, count); + if (err < 0) + written = err; + } + mutex_unlock(&inode->i_mutex); mlog_exit(ret); + return written ? written : ret; +} + +static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, + struct splice_desc *sd) +{ + int ret, count, total = 0; + ssize_t copied = 0; + struct ocfs2_splice_write_priv sp; + + ret = buf->ops->pin(pipe, buf); + if (ret) + goto out; + + sp.s_sd = sd; + sp.s_buf = buf; + sp.s_pipe = pipe; + sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; + sp.s_buf_offset = buf->offset; + + count = sd->len; + if (count + sp.s_offset > PAGE_CACHE_SIZE) + count = PAGE_CACHE_SIZE - sp.s_offset; + + do { + /* + * splice wants us to copy up to one page at a + * time. For pagesize > cluster size, this means we + * might enter ocfs2_buffered_write_cluster() more + * than once, so keep track of our progress here. + */ + copied = ocfs2_buffered_write_cluster(sd->file, + (loff_t)sd->pos + total, + count, + ocfs2_map_and_write_splice_data, + &sp); + if (copied < 0) { + mlog_errno(copied); + ret = copied; + goto out; + } + + count -= copied; + sp.s_offset += copied; + sp.s_buf_offset += copied; + total += copied; + } while (count); + + ret = 0; +out: + + return total ? 
total : ret; +} + +static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, + loff_t *ppos, + size_t len, + unsigned int flags) +{ + int ret, err; + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + + ret = __splice_from_pipe(pipe, out, ppos, len, flags, + ocfs2_splice_write_actor); + if (ret > 0) { + *ppos += ret; + + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + if (err) + ret = err; + } + } + return ret; } @@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, goto out; } - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); + ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, + NULL); if (ret < 0) { mlog_errno(ret); goto out_unlock; } /* ok, we're done with i_size and alloc work */ - ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); + ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); out_unlock: ocfs2_rw_unlock(inode, 1); @@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } rw_level = 0; /* communicate with ocfs2_dio_end_io */ - ocfs2_iocb_set_rw_locked(iocb); + ocfs2_iocb_set_rw_locked(iocb, rw_level); } /* diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index cc973f01f6c..2c4460fced5 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted { }; int ocfs2_do_extend_allocation(struct ocfs2_super *osb, struct inode *inode, + u32 *cluster_start, u32 clusters_to_add, struct buffer_head *fe_bh, handle_t *handle, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason); +int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, + u32 clusters_to_add, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 28ab56f2b98..21a605079c6 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode) inode->i_flags |= S_DIRSYNC; } -struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, - u64 blkno, - int delete_vote) -{ - struct ocfs2_find_inode_args args; - - /* ocfs2_ilookup_for_vote should *only* be called from the - * vote thread */ - BUG_ON(current != osb->vote_task); - - args.fi_blkno = blkno; - args.fi_flags = OCFS2_FI_FLAG_NOWAIT; - if (delete_vote) - args.fi_flags |= OCFS2_FI_FLAG_DELETE; - args.fi_ino = ino_from_blkno(osb->sb, blkno); - return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); -} - struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) { struct inode *inode = NULL; @@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque) if (oi->ip_blkno != args->fi_blkno) goto bail; - /* OCFS2_FI_FLAG_NOWAIT is *only* set from - * ocfs2_ilookup_for_vote which won't create an inode for one - * that isn't found. 
The vote thread which doesn't want to get - * an inode which is in the process of going away - otherwise - * the call to __wait_on_freeing_inode in find_inode_fast will - * cause it to deadlock on an inode which may be waiting on a - * vote (or lock release) in delete_inode */ - if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) && - (inode->i_state & (I_FREEING|I_CLEAR))) { - /* As stated above, we're not going to return an - * inode. In the case of a delete vote, the voting - * code is going to signal the other node to go - * ahead. Mark that state here, so this freeing inode - * has the state when it gets to delete_inode. */ - if (args->fi_flags & OCFS2_FI_FLAG_DELETE) { - spin_lock(&oi->ip_lock); - ocfs2_mark_inode_remotely_deleted(inode); - spin_unlock(&oi->ip_lock); - } - goto bail; - } - ret = 1; bail: mlog_exit(ret); @@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, goto bail; } + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); + inode->i_version = 1; inode->i_generation = le32_to_cpu(fe->i_generation); inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); @@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, if (S_ISLNK(inode->i_mode) && !fe->i_clusters) inode->i_blocks = 0; else - inode->i_blocks = - ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size)); + inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_mapping->a_ops = &ocfs2_aops; inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); @@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)fe->i_blkno); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; - OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); - inode->i_nlink = le16_to_cpu(fe->i_links_count); if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) @@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, OCFS2_LOCK_TYPE_META, 0, inode); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, 0, inode); } ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, @@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, * cluster lock before trusting anything anyway. */ can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) - && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) + && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) && !ocfs2_mount_local(osb); /* @@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode, OCFS2_LOCK_TYPE_META, generation, inode); + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, + 0, inode); + if (can_lock) { + status = ocfs2_open_lock(inode); + if (status) { + make_bad_inode(inode); + mlog_errno(status); + return status; + } status = ocfs2_meta_lock(inode, NULL, 0); if (status) { make_bad_inode(inode); @@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode, } } + if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { + status = ocfs2_try_open_lock(inode, 0); + if (status) { + make_bad_inode(inode); + return status; + } + } + status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, can_lock ? 
inode : NULL); if (status < 0) { @@ -507,50 +486,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct buffer_head *fe_bh) { int status = 0; - handle_t *handle = NULL; struct ocfs2_truncate_context *tc = NULL; struct ocfs2_dinode *fe; + handle_t *handle = NULL; mlog_entry_void(); fe = (struct ocfs2_dinode *) fe_bh->b_data; - /* zero allocation, zero truncate :) */ - if (!fe->i_clusters) - goto bail; + if (fe->i_clusters) { + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - handle = NULL; - mlog_errno(status); - goto bail; - } + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out; + } - status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); - if (status < 0) { - mlog_errno(status); - goto bail; - } + i_size_write(inode, 0); - ocfs2_commit_trans(osb, handle); - handle = NULL; + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out; + } - status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_commit_trans(osb, handle); + handle = NULL; - status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); - if (status < 0) { - mlog_errno(status); - goto bail; + status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); + if (status < 0) { + mlog_errno(status); + goto out; + } } -bail: + +out: if (handle) ocfs2_commit_trans(osb, handle); - mlog_exit(status); return status; } @@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode, struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; - /* We've already voted on this so it should be readonly - no - * spinlock needed. */ - orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + di = (struct ocfs2_dinode *) di_bh->b_data; + orphaned_slot = le16_to_cpu(di->i_orphaned_slot); status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); if (status) @@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode, goto bail; } - status = ocfs2_request_delete_vote(inode); - /* -EBUSY means that other nodes are still using the - * inode. We're done here though, so avoid doing anything on - * disk and let them worry about deleting it. */ - if (status == -EBUSY) { + /* + * This is how ocfs2 determines whether an inode is still live + * within the cluster. Every node takes a shared read lock on + * the inode open lock in ocfs2_read_locked_inode(). When we + * get to ->delete_inode(), each node tries to convert it's + * lock to an exclusive. Trylocks are serialized by the inode + * meta data lock. If the upconvert suceeds, we know the inode + * is no longer live and can be deleted. + * + * Though we call this with the meta data lock held, the + * trylock keeps us from ABBA deadlock. 
+ */ + status = ocfs2_try_open_lock(inode, 1); + if (status == -EAGAIN) { status = 0; mlog(0, "Skipping delete of %llu because it is in use on" "other nodes\n", (unsigned long long)oi->ip_blkno); @@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode, goto bail; } - spin_lock(&oi->ip_lock); - if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { - /* Nobody knew which slot this inode was orphaned - * into. This may happen during node death and - * recovery knows how to clean it up so we can safely - * ignore this inode for now on. */ - mlog(0, "Nobody knew where inode %llu was orphaned!\n", - (unsigned long long)oi->ip_blkno); - } else { - *wipe = 1; - - mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n", - (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot); - } - spin_unlock(&oi->ip_lock); + *wipe = 1; + mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(di->i_orphaned_slot)); bail: return status; @@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); + /* For remove delete_inode vote, we hold open lock before, + * now it is time to unlock PR and EX open locks. */ + ocfs2_open_unlock(inode); + /* Do these before all the other work so that we don't bounce * the vote thread while waiting to destroy the locks. */ ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); /* We very well may get a clear_inode before all an inodes * metadata has hit disk. Of course, we can't drop any cluster @@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode) "Clear inode of %llu, inode has io markers\n", (unsigned long long)oi->ip_blkno); - ocfs2_extent_map_drop(inode, 0); - ocfs2_extent_map_init(inode); + ocfs2_extent_map_trunc(inode, 0); status = ocfs2_drop_inode_locks(inode); if (status < 0) @@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode) ocfs2_lock_res_free(&oi->ip_rw_lockres); ocfs2_lock_res_free(&oi->ip_meta_lockres); ocfs2_lock_res_free(&oi->ip_data_lockres); + ocfs2_lock_res_free(&oi->ip_open_lockres); ocfs2_metadata_cache_purge(inode); @@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode) mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); - /* Testing ip_orphaned_slot here wouldn't work because we may - * not have gotten a delete_inode vote from any other nodes - * yet. 
*/ if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) generic_delete_inode(inode); else @@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode, return NULL; } - tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, - &p_blkno, NULL); + tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, + NULL); if (tmperr < 0) { mlog_errno(tmperr); goto fail; @@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode, if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) inode->i_blocks = 0; else - inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); + inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 1a7dd2945b3..03ae075869e 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -26,6 +26,8 @@ #ifndef OCFS2_INODE_H #define OCFS2_INODE_H +#include "extent_map.h" + /* OCFS2 Inode Private Data */ struct ocfs2_inode_info { @@ -34,6 +36,7 @@ struct ocfs2_inode_info struct ocfs2_lock_res ip_rw_lockres; struct ocfs2_lock_res ip_meta_lockres; struct ocfs2_lock_res ip_data_lockres; + struct ocfs2_lock_res ip_open_lockres; /* protects allocation changes on this inode. */ struct rw_semaphore ip_alloc_sem; @@ -42,9 +45,7 @@ struct ocfs2_inode_info spinlock_t ip_lock; u32 ip_open_count; u32 ip_clusters; - struct ocfs2_extent_map ip_map; struct list_head ip_io_markers; - int ip_orphaned_slot; struct mutex ip_io_mutex; @@ -64,6 +65,8 @@ struct ocfs2_inode_info struct ocfs2_caching_info ip_metadata_cache; + struct ocfs2_extent_map ip_extent_map; + struct inode vfs_inode; }; @@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode); void ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ -#define OCFS2_FI_FLAG_NOWAIT 0x1 -#define OCFS2_FI_FLAG_DELETE 0x2 -#define OCFS2_FI_FLAG_SYSFILE 0x4 -#define OCFS2_FI_FLAG_NOLOCK 0x8 +#define OCFS2_FI_FLAG_SYSFILE 0x4 +#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); -struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, - u64 blkno, - int delete_vote); int ocfs2_inode_init_private(struct inode *inode); int ocfs2_inode_revalidate(struct dentry *dentry); int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, @@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); void ocfs2_set_inode_flags(struct inode *inode); +static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) +{ + int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; + + return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); +} + #endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 825cb0ae1b4..5a8a90d1c78 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -649,29 +649,20 @@ bail: static int ocfs2_force_read_journal(struct inode *inode) { int status = 0; - int i, p_blocks; - u64 v_blkno, p_blkno; -#define CONCURRENT_JOURNAL_FILL 32 + int i; + u64 v_blkno, p_blkno, p_blocks, num_blocks; +#define CONCURRENT_JOURNAL_FILL 32ULL struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; mlog_entry_void(); - BUG_ON(inode->i_blocks != - ocfs2_align_bytes_to_sectors(i_size_read(inode))); - memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); - mlog(0, "Force reading %llu blocks\n", - (unsigned long long)(inode->i_blocks >> - 
(inode->i_sb->s_blocksize_bits - 9))); - + num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); v_blkno = 0; - while (v_blkno < - (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { - + while (v_blkno < num_blocks) { status = ocfs2_extent_map_get_blocks(inode, v_blkno, - 1, &p_blkno, - &p_blocks); + &p_blkno, &p_blocks, NULL); if (status < 0) { mlog_errno(status); goto bail; @@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, continue; iter = ocfs2_iget(osb, le64_to_cpu(de->inode), - OCFS2_FI_FLAG_NOLOCK); + OCFS2_FI_FLAG_ORPHAN_RECOVERY); if (IS_ERR(iter)) continue; @@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, /* Set the proper information to get us going into * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; - oi->ip_orphaned_slot = slot; spin_unlock(&oi->ip_lock); iput(inode); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index d026b4f2775..3db5de4506d 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, /* We may be deleting metadata blocks, so metadata alloc dinode + one desc. block for each possible delete. */ if (tree_depth && next_free == 1 && - le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) + ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) credits += 1 + tree_depth; /* update to the truncate log. */ diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 51b02044768..af01158b39f 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) int ret = 0, lock_level = 0; struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); - /* We don't want to support shared writable mappings yet. */ - if (!ocfs2_mount_local(osb) && + /* + * Only support shared writeable mmap for local mounts which + * don't know about holes. + */ + if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 28dd757ff67..2bcf353fd7c 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); if (IS_ERR(inode)) { - mlog(ML_ERROR, "Unable to create inode %llu\n", - (unsigned long long)blkno); ret = ERR_PTR(-EACCES); goto bail_unlock; } @@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, * unlink. 
*/ spin_lock(&oi->ip_lock); oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; - oi->ip_orphaned_slot = OCFS2_INVALID_SLOT; spin_unlock(&oi->ip_lock); bail_add: @@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb, i_size_write(inode, inode->i_sb->s_blocksize); inode->i_nlink = 2; - inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); + inode->i_blocks = ocfs2_inode_sector_count(inode); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { mlog_errno(status); @@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, struct buffer_head **bhs = NULL; const char *c; struct super_block *sb = osb->sb; - u64 p_blkno; - int p_blocks; + u64 p_blkno, p_blocks; int virtual, blocks, status, i, bytes_left; bytes_left = i_size_read(inode) + 1; @@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, goto bail; } - status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, - &p_blocks); + status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks, + NULL); if (status < 0) { mlog_errno(status); goto bail; @@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir, inode->i_rdev = 0; newsize = l - 1; if (l > ocfs2_fast_symlink_chars(sb)) { + u32 offset = 0; + inode->i_op = &ocfs2_symlink_inode_operations; - status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, + status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, + new_fe_bh, handle, data_ac, NULL, NULL); if (status < 0) { @@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } i_size_write(inode, newsize); - inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); + inode->i_blocks = ocfs2_inode_sector_count(inode); } else { inode->i_op = &ocfs2_fast_symlink_inode_operations; memcpy((char *) fe->id2.i_symlink, symname, l); @@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, /* Record which orphan dir our inode now resides * in. delete_inode will use this to determine which orphan * dir to lock. */ - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num; - spin_unlock(&OCFS2_I(inode)->ip_lock); + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); mlog(0, "Inode %llu orphaned in slot %d\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index db8e77cd35d..82cc92dcf8a 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -46,11 +46,6 @@ #include "endian.h" #include "ocfs2_lockid.h" -struct ocfs2_extent_map { - u32 em_clusters; - struct rb_root em_extents; -}; - /* Most user visible OCFS2 inodes will have very few pieces of * metadata, but larger files (including bitmaps, etc) must be taken * into account when designing an access scheme. We allow a small @@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode) return 1; } +static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. 
this also * means that any future flags osb_flags must be protected by spinlock @@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) return (unsigned long)((bytes + 511) >> 9); } +static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, + unsigned long pg_index) +{ + u32 clusters = pg_index; + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + + if (unlikely(PAGE_CACHE_SHIFT > cbits)) + clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); + else if (PAGE_CACHE_SHIFT < cbits) + clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); + + return clusters; +} + +/* + * Find the 1st page index which covers the given clusters. + */ +static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb, + u32 clusters) +{ + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned long index = clusters; + + if (PAGE_CACHE_SHIFT > cbits) { + index = clusters >> (PAGE_CACHE_SHIFT - cbits); + } else if (PAGE_CACHE_SHIFT < cbits) { + index = clusters << (cbits - PAGE_CACHE_SHIFT); + } + + return index; +} + +static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) +{ + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int pages_per_cluster = 1; + + if (PAGE_CACHE_SHIFT < cbits) + pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); + + return pages_per_cluster; +} + #define ocfs2_set_bit ext2_set_bit #define ocfs2_clear_bit ext2_clear_bit #define ocfs2_test_bit ext2_test_bit diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e61e218f5e0..71306479c68 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -86,7 +86,8 @@ OCFS2_SB(sb)->s_feature_incompat &= ~(mask) #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB -#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT +#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ + | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 /* @@ -155,6 +156,12 @@ #define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ /* + * Extent record flags (e_node.leaf.flags) + */ +#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but + * unwritten */ + +/* * ioctl commands */ #define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) @@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { /* * On disk extent record for OCFS2 * It describes a range of clusters on disk. + * + * Length fields are divided into interior and leaf node versions. + * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes. */ struct ocfs2_extent_rec { /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ - __le32 e_clusters; /* Clusters covered by this extent */ + union { + __le32 e_int_clusters; /* Clusters covered by all children */ + struct { + __le16 e_leaf_clusters; /* Clusters covered by this + extent */ + __u8 e_reserved1; + __u8 e_flags; /* Extent flags */ + }; + }; __le64 e_blkno; /* Physical disk offset, in blocks */ /*10*/ }; @@ -311,7 +329,10 @@ struct ocfs2_extent_list { /*00*/ __le16 l_tree_depth; /* Extent tree depth from this point. 0 means data extents hang directly off this - header (a leaf) */ + header (a leaf) + NOTE: The high 8 bits cannot be + used - tree_depth is never that big. 
+ */ __le16 l_count; /* Number of extent records */ __le16 l_next_free_rec; /* Next unused extent slot */ __le16 l_reserved1; @@ -446,7 +467,9 @@ struct ocfs2_dinode { __le32 i_ctime_nsec; __le32 i_mtime_nsec; __le32 i_attr; - __le32 i_reserved1; + __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL + was set in i_flags */ + __le16 i_reserved1; /*70*/ __le64 i_reserved2[8]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 4d5d5655c18..4ca02b1c38a 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -44,6 +44,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_RENAME, OCFS2_LOCK_TYPE_RW, OCFS2_LOCK_TYPE_DENTRY, + OCFS2_LOCK_TYPE_OPEN, OCFS2_NUM_LOCK_TYPES }; @@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_DENTRY: c = 'N'; break; + case OCFS2_LOCK_TYPE_OPEN: + c = 'O'; + break; default: c = '\0'; } @@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = { * important job it does, anyway. */ [OCFS2_LOCK_TYPE_RW] = "Write/Read", [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", + [OCFS2_LOCK_TYPE_OPEN] = "Open", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 2d3ac32cb74..d921a28329d 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) goto bail; } - status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); + status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 6dbb1176275..0da655ae5d6 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, le32_to_cpu(fe->i_clusters))); spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); - alloc_inode->i_blocks = - ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode)); + alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); status = 0; bail: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 6534f92424d..5c9e8243691 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -806,9 +806,6 @@ static int __init ocfs2_init(void) ocfs2_print_version(); - if (init_ocfs2_extent_maps()) - return -ENOMEM; - status = init_ocfs2_uptodate_cache(); if (status < 0) { mlog_errno(status); @@ -837,7 +834,6 @@ leave: if (status < 0) { ocfs2_free_mem_caches(); exit_ocfs2_uptodate_cache(); - exit_ocfs2_extent_maps(); } mlog_exit(status); @@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void) unregister_filesystem(&ocfs2_fs_type); - exit_ocfs2_extent_maps(); - exit_ocfs2_uptodate_cache(); mlog_exit_void(); @@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data, ocfs2_lock_res_init_once(&oi->ip_rw_lockres); ocfs2_lock_res_init_once(&oi->ip_meta_lockres); ocfs2_lock_res_init_once(&oi->ip_data_lockres); + ocfs2_lock_res_init_once(&oi->ip_open_lockres); ocfs2_metadata_cache_init(&oi->vfs_inode); diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index f30e63b9910..4f82a2f0efe 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c @@ -63,17 +63,10 @@ struct ocfs2_msg_hdr __be32 h_node_num; /* node sending this particular message. */ }; -/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this - * for the network. 
*/ -#define OCFS2_VOTE_FILENAME_LEN 256 struct ocfs2_vote_msg { struct ocfs2_msg_hdr v_hdr; - union { - __be32 v_generic1; - __be32 v_orphaned_slot; /* Used during delete votes */ - __be32 v_nlink; /* Used during unlink votes */ - } md1; /* Message type dependant 1 */ + __be32 v_reserved1; }; /* Responses are given these values to maintain backwards @@ -86,7 +79,6 @@ struct ocfs2_response_msg { struct ocfs2_msg_hdr r_hdr; __be32 r_response; - __be32 r_orphaned_slot; }; struct ocfs2_vote_work { @@ -96,7 +88,6 @@ struct ocfs2_vote_work { enum ocfs2_vote_request { OCFS2_VOTE_REQ_INVALID = 0, - OCFS2_VOTE_REQ_DELETE, OCFS2_VOTE_REQ_MOUNT, OCFS2_VOTE_REQ_UMOUNT, OCFS2_VOTE_REQ_LAST @@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb, ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); } -void ocfs2_mark_inode_remotely_deleted(struct inode *inode) -{ - struct ocfs2_inode_info *oi = OCFS2_I(inode); - - assert_spin_locked(&oi->ip_lock); - /* We set the SKIP_DELETE flag on the inode so we don't try to - * delete it in delete_inode ourselves, thus avoiding - * unecessary lock pinging. If the other node failed to wipe - * the inode as a result of a crash, then recovery will pick - * up the slack. */ - oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE; -} - -static int ocfs2_process_delete_request(struct inode *inode, - int *orphaned_slot) -{ - int response = OCFS2_RESPONSE_BUSY; - - mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n", - inode->i_ino, inode->i_nlink, *orphaned_slot); - - spin_lock(&OCFS2_I(inode)->ip_lock); - - /* Whatever our vote response is, we want to make sure that - * the orphaned slot is recorded properly on this node *and* - * on the requesting node. Technically, if the requesting node - * did not know which slot the inode is orphaned in but we - * respond with BUSY he doesn't actually need the orphaned - * slot, but it doesn't hurt to do it here anyway. */ - if ((*orphaned_slot) != OCFS2_INVALID_SLOT) { - mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != - OCFS2_INVALID_SLOT && - OCFS2_I(inode)->ip_orphaned_slot != - (*orphaned_slot), - "Inode %llu: This node thinks it's " - "orphaned in slot %d, messaged it's in %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - OCFS2_I(inode)->ip_orphaned_slot, - *orphaned_slot); - - mlog(0, "Setting orphaned slot for inode %llu to %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - *orphaned_slot); - - OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot; - } else { - mlog(0, "Sending back orphaned slot %d for inode %llu\n", - OCFS2_I(inode)->ip_orphaned_slot, - (unsigned long long)OCFS2_I(inode)->ip_blkno); - - *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; - } - - /* vote no if the file is still open. */ - if (OCFS2_I(inode)->ip_open_count) { - mlog(0, "open count = %u\n", - OCFS2_I(inode)->ip_open_count); - spin_unlock(&OCFS2_I(inode)->ip_lock); - goto done; - } - spin_unlock(&OCFS2_I(inode)->ip_lock); - - /* directories are a bit ugly... What if someone is sitting in - * it? We want to make sure the inode is removed completely as - * a result of the iput in process_vote. 
*/ - if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) { - mlog(0, "i_count = %u\n", atomic_read(&inode->i_count)); - goto done; - } - - if (filemap_fdatawrite(inode->i_mapping)) { - mlog(ML_ERROR, "Could not sync inode %llu for delete!\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - goto done; - } - sync_mapping_buffers(inode->i_mapping); - truncate_inode_pages(inode->i_mapping, 0); - ocfs2_extent_map_trunc(inode, 0); - - spin_lock(&OCFS2_I(inode)->ip_lock); - /* double check open count - someone might have raced this - * thread into ocfs2_file_open while we were writing out - * data. If we're to allow a wipe of this inode now, we *must* - * hold the spinlock until we've marked it. */ - if (OCFS2_I(inode)->ip_open_count) { - mlog(0, "Raced to wipe! open count = %u\n", - OCFS2_I(inode)->ip_open_count); - spin_unlock(&OCFS2_I(inode)->ip_lock); - goto done; - } - - /* Mark the inode as being wiped from disk. */ - ocfs2_mark_inode_remotely_deleted(inode); - spin_unlock(&OCFS2_I(inode)->ip_lock); - - /* Not sure this is necessary anymore. */ - d_prune_aliases(inode); - - /* If we get here, then we're voting 'yes', so commit the - * delete on our side. */ - response = OCFS2_RESPONSE_OK; -done: - return response; -} - static void ocfs2_process_vote(struct ocfs2_super *osb, struct ocfs2_vote_msg *msg) { int net_status, vote_response; - int orphaned_slot = 0; - unsigned int node_num, generation; + unsigned int node_num; u64 blkno; enum ocfs2_vote_request request; - struct inode *inode = NULL; struct ocfs2_msg_hdr *hdr = &msg->v_hdr; struct ocfs2_response_msg response; /* decode the network mumbo jumbo into local variables. */ request = be32_to_cpu(hdr->h_request); blkno = be64_to_cpu(hdr->h_blkno); - generation = be32_to_cpu(hdr->h_generation); node_num = be32_to_cpu(hdr->h_node_num); - if (request == OCFS2_VOTE_REQ_DELETE) - orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot); - mlog(0, "processing vote: request = %u, blkno = %llu, " - "generation = %u, node_num = %u, priv1 = %u\n", request, - (unsigned long long)blkno, generation, node_num, - be32_to_cpu(msg->md1.v_generic1)); + mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n", + request, (unsigned long long)blkno, node_num); if (!ocfs2_is_valid_vote_request(request)) { mlog(ML_ERROR, "Invalid vote request %d from node %u\n", @@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb, break; } - /* We cannot process the remaining message types before we're - * fully mounted. It's perfectly safe however to send a 'yes' - * response as we can't possibly have any of the state they're - * asking us to modify yet. */ - if (atomic_read(&osb->vol_state) == VOLUME_INIT) - goto respond; - - /* If we get here, then the request is against an inode. */ - inode = ocfs2_ilookup_for_vote(osb, blkno, - request == OCFS2_VOTE_REQ_DELETE); - - /* Not finding the inode is perfectly valid - it means we're - * not interested in what the other node is about to do to it - * so in those cases we automatically respond with an - * affirmative. Cluster locking ensures that we won't race - * interest in the inode with this vote request. */ - if (!inode) - goto respond; - - /* Check generation values. It's possible for us to get a - * request against a stale inode. If so then we proceed as if - * we had not found an inode in the first place. 
*/ - if (inode->i_generation != generation) { - mlog(0, "generation passed %u != inode generation = %u, " - "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, " - "message type = %u\n", generation, inode->i_generation, - OCFS2_I(inode)->ip_flags, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)blkno, atomic_read(&inode->i_count), - request); - iput(inode); - inode = NULL; - goto respond; - } - - switch (request) { - case OCFS2_VOTE_REQ_DELETE: - vote_response = ocfs2_process_delete_request(inode, - &orphaned_slot); - break; - default: - mlog(ML_ERROR, "node %u, invalid request: %u\n", - node_num, request); - vote_response = OCFS2_RESPONSE_BAD_MSG; - } - respond: /* Response struture is small so we just put it on the stack * and stuff it inline. */ @@ -357,7 +190,6 @@ respond: response.r_hdr.h_generation = hdr->h_generation; response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); response.r_response = cpu_to_be32(vote_response); - response.r_orphaned_slot = cpu_to_be32(orphaned_slot); net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, osb->net_key, @@ -373,9 +205,6 @@ respond: && net_status != -ENOTCONN) mlog(ML_ERROR, "message to node %u fails with error %d!\n", node_num, net_status); - - if (inode) - iput(inode); } static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) @@ -634,8 +463,7 @@ bail: static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, u64 blkno, unsigned int generation, - enum ocfs2_vote_request type, - u32 priv) + enum ocfs2_vote_request type) { struct ocfs2_vote_msg *request; struct ocfs2_msg_hdr *hdr; @@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, hdr->h_request = cpu_to_be32(type); hdr->h_blkno = cpu_to_be64(blkno); hdr->h_generation = cpu_to_be32(generation); - - request->md1.v_generic1 = cpu_to_be32(priv); } return request; @@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb, struct ocfs2_vote_msg *request, struct ocfs2_net_response_cb *callback) { - int status, response; + int status, response = -EBUSY; unsigned int response_id; struct ocfs2_msg_hdr *hdr; @@ -686,109 +512,12 @@ bail: return status; } -static int ocfs2_request_vote(struct inode *inode, - struct ocfs2_vote_msg *request, - struct ocfs2_net_response_cb *callback) -{ - int status; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - if (ocfs2_inode_is_new(inode)) - return 0; - - status = -EAGAIN; - while (status == -EAGAIN) { - if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && - signal_pending(current)) - return -ERESTARTSYS; - - status = ocfs2_super_lock(osb, 0); - if (status < 0) { - mlog_errno(status); - break; - } - - status = 0; - if (!ocfs2_node_map_is_only(osb, &osb->mounted_map, - osb->node_num)) - status = ocfs2_do_request_vote(osb, request, callback); - - ocfs2_super_unlock(osb, 0); - } - return status; -} - -static void ocfs2_delete_response_cb(void *priv, - struct ocfs2_response_msg *resp) -{ - int orphaned_slot, node; - struct inode *inode = priv; - - orphaned_slot = be32_to_cpu(resp->r_orphaned_slot); - node = be32_to_cpu(resp->r_hdr.h_node_num); - mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n", - node, (unsigned long long)OCFS2_I(inode)->ip_blkno, - orphaned_slot); - - /* The other node may not actually know which slot the inode - * is orphaned in. */ - if (orphaned_slot == OCFS2_INVALID_SLOT) - return; - - /* Ok, the responding node knows which slot this inode is - * orphaned in. 
We verify that the information is correct and - * then record this in the inode. ocfs2_delete_inode will use - * this information to determine which lock to take. */ - spin_lock(&OCFS2_I(inode)->ip_lock); - mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot && - OCFS2_I(inode)->ip_orphaned_slot - != OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's " - "orphaned in slot %d, we think it's in %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - be32_to_cpu(resp->r_hdr.h_node_num), - orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot); - - OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot; - spin_unlock(&OCFS2_I(inode)->ip_lock); -} - -int ocfs2_request_delete_vote(struct inode *inode) -{ - int orphaned_slot, status; - struct ocfs2_net_response_cb delete_cb; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_vote_msg *request; - - spin_lock(&OCFS2_I(inode)->ip_lock); - orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; - spin_unlock(&OCFS2_I(inode)->ip_lock); - - delete_cb.rc_cb = ocfs2_delete_response_cb; - delete_cb.rc_priv = inode; - - mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot); - - status = -ENOMEM; - request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, - inode->i_generation, - OCFS2_VOTE_REQ_DELETE, orphaned_slot); - if (request) { - status = ocfs2_request_vote(inode, request, &delete_cb); - - kfree(request); - } - - return status; -} - int ocfs2_request_mount_vote(struct ocfs2_super *osb) { int status; struct ocfs2_vote_msg *request = NULL; - request = ocfs2_new_vote_request(osb, 0ULL, 0, - OCFS2_VOTE_REQ_MOUNT, 0); + request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT); if (!request) { status = -ENOMEM; goto bail; @@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb) int status; struct ocfs2_vote_msg *request = NULL; - request = ocfs2_new_vote_request(osb, 0ULL, 0, - OCFS2_VOTE_REQ_UMOUNT, 0); + request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT); if (!request) { status = -ENOMEM; goto bail; @@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg, be32_to_cpu(work->w_msg.v_hdr.h_generation)); mlog(0, "h_node_num = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_node_num)); - mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1)); spin_lock(&osb->vote_task_lock); list_add_tail(&work->w_list, &osb->vote_list); diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h index 53ebc1c69e5..9ea46f62de3 100644 --- a/fs/ocfs2/vote.h +++ b/fs/ocfs2/vote.h @@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) wake_up(&osb->vote_event); } -int ocfs2_request_delete_vote(struct inode *inode); int ocfs2_request_mount_vote(struct ocfs2_super *osb); int ocfs2_request_umount_vote(struct ocfs2_super *osb); int ocfs2_register_net_handlers(struct ocfs2_super *osb); void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); -void ocfs2_mark_inode_remotely_deleted(struct inode *inode); - void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, int node_num); #endif diff --git a/fs/sync.c b/fs/sync.c index d0feff61e6a..5cb9e7e4338 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -239,13 +239,11 @@ out: /* * `endbyte' is inclusive */ -int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, - unsigned int flags) +int do_sync_mapping_range(struct address_space *mapping, loff_t offset, + loff_t endbyte, unsigned int flags) { int ret; - struct address_space *mapping; - 
mapping = file->f_mapping; if (!mapping) { ret = -EINVAL; goto out; @@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, out: return ret; } -EXPORT_SYMBOL_GPL(do_sync_file_range); +EXPORT_SYMBOL_GPL(do_sync_mapping_range); diff --git a/include/linux/fs.h b/include/linux/fs.h index 86ec3f4a7da..095a9c9a64f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -843,8 +843,13 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); extern int fcntl_getlease(struct file *filp); /* fs/sync.c */ -extern int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, - unsigned int flags); +extern int do_sync_mapping_range(struct address_space *mapping, loff_t offset, + loff_t endbyte, unsigned int flags); +static inline int do_sync_file_range(struct file *file, loff_t offset, + loff_t endbyte, unsigned int flags) +{ + return do_sync_mapping_range(file->f_mapping, offset, endbyte, flags); +} /* fs/locks.c */ extern void locks_init_lock(struct file_lock *); diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h new file mode 100644 index 00000000000..3d967b6b120 --- /dev/null +++ b/include/linux/mtd/ubi.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +#ifndef __LINUX_UBI_H__ +#define __LINUX_UBI_H__ + +#include <asm/ioctl.h> +#include <linux/types.h> +#include <mtd/ubi-user.h> + +/* + * UBI data type hint constants. + * + * UBI_LONGTERM: long-term data + * UBI_SHORTTERM: short-term data + * UBI_UNKNOWN: data persistence is unknown + * + * These constants are used when data is written to UBI volumes in order to + * help the UBI wear-leveling unit to find more appropriate physical + * eraseblocks. + */ +enum { + UBI_LONGTERM = 1, + UBI_SHORTTERM, + UBI_UNKNOWN +}; + +/* + * enum ubi_open_mode - UBI volume open mode constants. + * + * UBI_READONLY: read-only mode + * UBI_READWRITE: read-write mode + * UBI_EXCLUSIVE: exclusive mode + */ +enum { + UBI_READONLY = 1, + UBI_READWRITE, + UBI_EXCLUSIVE +}; + +/** + * struct ubi_volume_info - UBI volume description data structure. 
+ * @vol_id: volume ID + * @ubi_num: UBI device number this volume belongs to + * @size: how many physical eraseblocks are reserved for this volume + * @used_bytes: how many bytes of data this volume contains + * @used_ebs: how many physical eraseblocks of this volume actually contain any + * data + * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) + * @corrupted: non-zero if the volume is corrupted (static volumes only) + * @upd_marker: non-zero if the volume has update marker set + * @alignment: volume alignment + * @usable_leb_size: how many bytes are available in logical eraseblocks of + * this volume + * @name_len: volume name length + * @name: volume name + * @cdev: UBI volume character device major and minor numbers + * + * The @corrupted flag is only relevant to static volumes and is always zero + * for dynamic ones. This is because UBI does not care about dynamic volume + * data protection and only cares about protecting static volume data. + * + * The @upd_marker flag is set if the volume update operation was interrupted. + * Before touching the volume data during the update operation, UBI first sets + * the update marker flag for this volume. If the volume update operation is + * then interrupted, the update marker indicates this. If the update marker + * is set, the contents of the volume are certainly damaged and a new volume + * update operation has to be started. + * + * To put it differently, the @corrupted and @upd_marker fields have different + * semantics: + * o the @corrupted flag means that this static volume is corrupted for some + * reason, but not because of an interrupted volume update + * o the @upd_marker field means that the volume is damaged because of an + * interrupted update operation. + * + * I.e., the @corrupted flag is never set if the @upd_marker flag is set. + * + * The @used_bytes and @used_ebs fields are only really needed for static + * volumes and contain the number of bytes stored in this static volume and how + * many eraseblocks this data occupies. In case of dynamic volumes, the + * @used_bytes field is equivalent to @size*@usable_leb_size, and the @used_ebs + * field is equivalent to @size. + * + * In general, logical eraseblock size is a property of the UBI device, not + * of the UBI volume. Indeed, the logical eraseblock size depends on the + * physical eraseblock size and on how many bytes UBI headers consume. But + * because of the volume alignment (@alignment), the usable size of logical + * eraseblocks of a volume may be less. The following equation is true: + * @usable_leb_size = LEB size - (LEB size mod @alignment), + * where LEB size is the logical eraseblock size defined by the UBI device. + * + * The alignment is a multiple of the minimal flash input/output unit size, or + * %1 if all the available space is used. + * + * To put this differently, alignment may be considered as a way to change + * volume logical eraseblock sizes. + */ +struct ubi_volume_info { + int ubi_num; + int vol_id; + int size; + long long used_bytes; + int used_ebs; + int vol_type; + int corrupted; + int upd_marker; + int alignment; + int usable_leb_size; + int name_len; + const char *name; + dev_t cdev; +}; + +/** + * struct ubi_device_info - UBI device description data structure. 
+ * @ubi_num: ubi device number + * @leb_size: logical eraseblock size on this UBI device + * @min_io_size: minimal I/O unit size + * @ro_mode: if this device is in read-only mode + * @cdev: UBI character device major and minor numbers + * + * Note, @leb_size is the logical eraseblock size offered by the UBI device. + * Volumes of this UBI device may have smaller logical eraseblock size if their + * alignment is not equivalent to %1. + */ +struct ubi_device_info { + int ubi_num; + int leb_size; + int min_io_size; + int ro_mode; + dev_t cdev; +}; + +/* UBI descriptor given to users when they open UBI volumes */ +struct ubi_volume_desc; + +int ubi_get_device_info(int ubi_num, struct ubi_device_info *di); +void ubi_get_volume_info(struct ubi_volume_desc *desc, + struct ubi_volume_info *vi); +struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode); +struct ubi_volume_desc *ubi_open_volume_nm(int ubi_num, const char *name, + int mode); +void ubi_close_volume(struct ubi_volume_desc *desc); +int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, + int len, int check); +int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, + int offset, int len, int dtype); +int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, + int len, int dtype); +int ubi_leb_erase(struct ubi_volume_desc *desc, int lnum); +int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum); +int ubi_is_mapped(struct ubi_volume_desc *desc, int lnum); + +/* + * This function is the same as the 'ubi_leb_read()' function, but it does not + * provide the checking capability. + */ +static inline int ubi_read(struct ubi_volume_desc *desc, int lnum, char *buf, + int offset, int len) +{ + return ubi_leb_read(desc, lnum, buf, offset, len, 0); +} + +/* + * This function is the same as the 'ubi_leb_write()' functions, but it does + * not have the data type argument. + */ +static inline int ubi_write(struct ubi_volume_desc *desc, int lnum, + const void *buf, int offset, int len) +{ + return ubi_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN); +} + +/* + * This function is the same as the 'ubi_leb_change()' functions, but it does + * not have the data type argument. + */ +static inline int ubi_change(struct ubi_volume_desc *desc, int lnum, + const void *buf, int len) +{ + return ubi_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); +} + +#endif /* !__LINUX_UBI_H__ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 49fe2997a01..a1707583de4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -196,13 +196,13 @@ extern void init_idle(struct task_struct *idle, int cpu); extern cpumask_t nohz_cpu_mask; /* - * Only dump TASK_* tasks. (-1 for all tasks) + * Only dump TASK_* tasks. 
(0 for all tasks) */ extern void show_state_filter(unsigned long state_filter); static inline void show_state(void) { - show_state_filter(-1); + show_state_filter(0); } extern void show_regs(struct pt_regs *); diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 52c9eb9b6df..26e4925bc35 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -61,10 +61,10 @@ static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); ++sl->sequence; - smp_wmb(); -} + smp_wmb(); +} -static inline void write_sequnlock(seqlock_t *sl) +static inline void write_sequnlock(seqlock_t *sl) { smp_wmb(); sl->sequence++; @@ -77,7 +77,7 @@ static inline int write_tryseqlock(seqlock_t *sl) if (ret) { ++sl->sequence; - smp_wmb(); + smp_wmb(); } return ret; } diff --git a/include/mtd/Kbuild b/include/mtd/Kbuild index e0fe92b03a4..4d46b3bdebd 100644 --- a/include/mtd/Kbuild +++ b/include/mtd/Kbuild @@ -3,3 +3,5 @@ header-y += jffs2-user.h header-y += mtd-abi.h header-y += mtd-user.h header-y += nftl-user.h +header-y += ubi-header.h +header-y += ubi-user.h diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h index 8e501a75a76..f71dac42039 100644 --- a/include/mtd/mtd-abi.h +++ b/include/mtd/mtd-abi.h @@ -24,6 +24,7 @@ struct mtd_oob_buf { #define MTD_NORFLASH 3 #define MTD_NANDFLASH 4 #define MTD_DATAFLASH 6 +#define MTD_UBIVOLUME 7 #define MTD_WRITEABLE 0x400 /* Device is writeable */ #define MTD_BIT_WRITEABLE 0x800 /* Single bits can be flipped */ diff --git a/include/mtd/ubi-header.h b/include/mtd/ubi-header.h new file mode 100644 index 00000000000..fa479c71aa3 --- /dev/null +++ b/include/mtd/ubi-header.h @@ -0,0 +1,360 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Thomas Gleixner + * Frank Haverkamp + * Oliver Lohmann + * Andreas Arnez + */ + +/* + * This file defines the layout of UBI headers and all the other UBI on-flash + * data structures. May be included by user-space. + */ + +#ifndef __UBI_HEADER_H__ +#define __UBI_HEADER_H__ + +#include <asm/byteorder.h> + +/* The version of UBI images supported by this implementation */ +#define UBI_VERSION 1 + +/* The highest erase counter value supported by this implementation */ +#define UBI_MAX_ERASECOUNTER 0x7FFFFFFF + +/* The initial CRC32 value used when calculating CRC checksums */ +#define UBI_CRC32_INIT 0xFFFFFFFFU + +/* Erase counter header magic number (ASCII "UBI#") */ +#define UBI_EC_HDR_MAGIC 0x55424923 +/* Volume identifier header magic number (ASCII "UBI!") */ +#define UBI_VID_HDR_MAGIC 0x55424921 + +/* + * Volume type constants used in the volume identifier header. 
+ * + * @UBI_VID_DYNAMIC: dynamic volume + * @UBI_VID_STATIC: static volume + */ +enum { + UBI_VID_DYNAMIC = 1, + UBI_VID_STATIC = 2 +}; + +/* + * Compatibility constants used by internal volumes. + * + * @UBI_COMPAT_DELETE: delete this internal volume before anything is written + * to the flash + * @UBI_COMPAT_RO: attach this device in read-only mode + * @UBI_COMPAT_PRESERVE: preserve this internal volume - do not touch its + * physical eraseblocks, don't allow the wear-leveling unit to move them + * @UBI_COMPAT_REJECT: reject this UBI image + */ +enum { + UBI_COMPAT_DELETE = 1, + UBI_COMPAT_RO = 2, + UBI_COMPAT_PRESERVE = 4, + UBI_COMPAT_REJECT = 5 +}; + +/* + * ubi16_t/ubi32_t/ubi64_t - 16, 32, and 64-bit integers used in UBI on-flash + * data structures. + */ +typedef struct { + uint16_t int16; +} __attribute__ ((packed)) ubi16_t; + +typedef struct { + uint32_t int32; +} __attribute__ ((packed)) ubi32_t; + +typedef struct { + uint64_t int64; +} __attribute__ ((packed)) ubi64_t; + +/* + * This implementation of UBI uses the big-endian format for on-flash + * integers. The below are the corresponding conversion macros. + */ +#define cpu_to_ubi16(x) ((ubi16_t){__cpu_to_be16(x)}) +#define ubi16_to_cpu(x) ((uint16_t)__be16_to_cpu((x).int16)) + +#define cpu_to_ubi32(x) ((ubi32_t){__cpu_to_be32(x)}) +#define ubi32_to_cpu(x) ((uint32_t)__be32_to_cpu((x).int32)) + +#define cpu_to_ubi64(x) ((ubi64_t){__cpu_to_be64(x)}) +#define ubi64_to_cpu(x) ((uint64_t)__be64_to_cpu((x).int64)) + +/* Sizes of UBI headers */ +#define UBI_EC_HDR_SIZE sizeof(struct ubi_ec_hdr) +#define UBI_VID_HDR_SIZE sizeof(struct ubi_vid_hdr) + +/* Sizes of UBI headers without the ending CRC */ +#define UBI_EC_HDR_SIZE_CRC (UBI_EC_HDR_SIZE - sizeof(ubi32_t)) +#define UBI_VID_HDR_SIZE_CRC (UBI_VID_HDR_SIZE - sizeof(ubi32_t)) + +/** + * struct ubi_ec_hdr - UBI erase counter header. + * @magic: erase counter header magic number (%UBI_EC_HDR_MAGIC) + * @version: version of UBI implementation which is supposed to accept this + * UBI image + * @padding1: reserved for future, zeroes + * @ec: the erase counter + * @vid_hdr_offset: where the VID header starts + * @data_offset: where the user data start + * @padding2: reserved for future, zeroes + * @hdr_crc: erase counter header CRC checksum + * + * The erase counter header takes 64 bytes and has plenty of unused space for + * future usage. The unused fields are zeroed. The @version field is used to + * indicate the version of UBI implementation which is supposed to be able to + * work with this UBI image. If @version is greater than the current UBI + * version, the image is rejected. This may be useful in future if something + * is changed radically. This field is duplicated in the volume identifier + * header. + * + * The @vid_hdr_offset and @data_offset fields contain the offset of the + * volume identifier header and user data, relative to the beginning of the + * physical eraseblock. These values have to be the same for all physical + * eraseblocks. + */ +struct ubi_ec_hdr { + ubi32_t magic; + uint8_t version; + uint8_t padding1[3]; + ubi64_t ec; /* Warning: the current limit is 31-bit anyway! */ + ubi32_t vid_hdr_offset; + ubi32_t data_offset; + uint8_t padding2[36]; + ubi32_t hdr_crc; +} __attribute__ ((packed)); + +/** + * struct ubi_vid_hdr - on-flash UBI volume identifier header. 
+ * @magic: volume identifier header magic number (%UBI_VID_HDR_MAGIC) + * @version: UBI implementation version which is supposed to accept this UBI + * image (%UBI_VERSION) + * @vol_type: volume type (%UBI_VID_DYNAMIC or %UBI_VID_STATIC) + * @copy_flag: if this logical eraseblock was copied from another physical + * eraseblock (for wear-leveling reasons) + * @compat: compatibility of this volume (%0, %UBI_COMPAT_DELETE, + * %UBI_COMPAT_RO, %UBI_COMPAT_PRESERVE, or %UBI_COMPAT_REJECT) + * @vol_id: ID of this volume + * @lnum: logical eraseblock number + * @leb_ver: version of this logical eraseblock (IMPORTANT: obsolete, to be + * removed, kept only for not breaking older UBI users) + * @data_size: how many bytes of data this logical eraseblock contains + * @used_ebs: total number of used logical eraseblocks in this volume + * @data_pad: how many bytes at the end of this physical eraseblock are not + * used + * @data_crc: CRC checksum of the data stored in this logical eraseblock + * @padding1: reserved for future, zeroes + * @sqnum: sequence number + * @padding2: reserved for future, zeroes + * @hdr_crc: volume identifier header CRC checksum + * + * The @sqnum is the value of the global sequence counter at the time when this + * VID header was created. The global sequence counter is incremented each time + * UBI writes a new VID header to the flash, i.e. when it maps a logical + * eraseblock to a new physical eraseblock. The global sequence counter is an + * unsigned 64-bit integer and we assume it never overflows. The @sqnum + * (sequence number) is used to distinguish between older and newer versions of + * logical eraseblocks. + * + * There are 2 situations when there may be more than one physical eraseblock + * corresponding to the same logical eraseblock, i.e., having the same @vol_id + * and @lnum values in the volume identifier header. Suppose we have a logical + * eraseblock L and it is mapped to the physical eraseblock P. + * + * 1. Because UBI may erase physical eraseblocks asynchronously, the following + * situation is possible: L is asynchronously erased, so P is scheduled for + * erasure, then L is written to, i.e. mapped to another physical eraseblock P1, + * so P1 is written to, then an unclean reboot happens. Result - there are 2 + * physical eraseblocks P and P1 corresponding to the same logical eraseblock + * L. But P1 has a greater sequence number, so UBI picks P1 when it attaches the + * flash. + * + * 2. From time to time UBI moves logical eraseblocks to other physical + * eraseblocks for wear-leveling reasons. If, for example, UBI moves L from P + * to P1, and an unclean reboot happens before P is physically erased, there + * are two physical eraseblocks P and P1 corresponding to L and UBI has to + * select one of them when the flash is attached. The @sqnum field says which + * PEB is the original (obviously P will have the lower @sqnum) and which is the + * copy. But it is not enough to select the physical eraseblock with the higher + * sequence number, because the unclean reboot could have happened in the middle + * of the copying process, so the data in P1 is corrupted. It is also not enough + * to just select the physical eraseblock with the lower sequence number, because + * the data there may be old (consider a case where more data was added to P1 + * after the copying). Moreover, the unclean reboot may happen when the erasure + * of P was just started, so it may result in an unstable P, which is "mostly" + * OK, but still has unstable bits.
+ * + * UBI uses the @copy_flag field to indicate that this logical eraseblock is a + * copy. UBI also calculates data CRC when the data is moved and stores it in + * the @data_crc field of the copy (P1). So when UBI needs to pick one physical + * eraseblock of two (P or P1), the @copy_flag of the newer one (P1) is + * examined. If it is cleared, the situation is simple and the newer one is + * picked. If it is set, the data CRC of the copy (P1) is examined. If the CRC + * checksum is correct, this physical eraseblock is selected (P1). Otherwise + * the older one (P) is selected. + * + * Note, there is an obsolete @leb_ver field which was used instead of @sqnum + * in the past. But it is not used anymore and we keep it in order to be able + * to deal with old UBI images. It will be removed at some point. + * + * There are 2 sorts of volumes in UBI: user volumes and internal volumes. + * Internal volumes are not seen from outside and are used for various internal + * UBI purposes. In this implementation there is only one internal volume - the + * layout volume. Internal volumes are the main mechanism of UBI extensions. + * For example, in the future one may introduce a journal internal volume. Internal + * volumes have their own reserved range of IDs. + * + * The @compat field is only used for internal volumes and contains the "degree + * of their compatibility". It is always zero for user volumes. This field + * provides a mechanism to introduce UBI extensions and to be still compatible + * with older UBI binaries. For example, if someone introduced a journal in + * the future, he would probably use %UBI_COMPAT_DELETE compatibility for the + * journal volume. And in this case, older UBI binaries, which know nothing + * about the journal volume, would just delete this volume and work perfectly + * fine. This is similar to what Ext2fs does when it is fed by an Ext3fs image + * - it just ignores the Ext3fs journal. + * + * The @data_crc field contains the CRC checksum of the contents of the logical + * eraseblock if this is a static volume. In the case of dynamic volumes, it does + * not contain the CRC checksum as a rule. The only exception is when the + * data of the physical eraseblock was moved by the wear-leveling unit, then + * the wear-leveling unit calculates the data CRC and stores it in the + * @data_crc field. And of course, the @copy_flag is %1 in this case. + * + * The @data_size field is used only for static volumes because UBI has to know + * how many bytes of data are stored in this eraseblock. For dynamic volumes, + * this field usually contains zero. The only exception is when the data of the + * physical eraseblock was moved to another physical eraseblock for + * wear-leveling reasons. In this case, UBI calculates the CRC checksum of the + * contents and uses both the @data_crc and @data_size fields; the @data_size + * field then contains the data size. + * + * The @used_ebs field is used only for static volumes and indicates how many + * eraseblocks the data of the volume takes. For dynamic volumes this field is + * not used and always contains zero. + * + * The @data_pad field is calculated when volumes are created using the alignment + * parameter. So, effectively, the @data_pad field reduces the size of logical + * eraseblocks of this volume. This is very handy when one uses block-oriented + * software (say, cramfs) on top of the UBI volume.
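Editorial aside: the P-versus-P1 disambiguation rule described in the two paragraphs above can be condensed into a few lines of C. This sketch is not part of the patch; the function name and the pre-computed p1_data_crc_ok flag are hypothetical, and P1 is assumed to be the header with the higher @sqnum.

/* Returns 1 if P1 (the newer PEB / the copy) should be used, 0 for the original P. */
static int pick_peb(const struct ubi_vid_hdr *vid_p1, int p1_data_crc_ok)
{
	/* P1 is not a wear-leveling copy: the newer eraseblock simply wins */
	if (!vid_p1->copy_flag)
		return 1;
	/* P1 is a copy and its data CRC checks out: the copy completed */
	if (p1_data_crc_ok)
		return 1;
	/* the copy was interrupted: fall back to the original P */
	return 0;
}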
+ */ +struct ubi_vid_hdr { + ubi32_t magic; + uint8_t version; + uint8_t vol_type; + uint8_t copy_flag; + uint8_t compat; + ubi32_t vol_id; + ubi32_t lnum; + ubi32_t leb_ver; /* obsolete, to be removed, don't use */ + ubi32_t data_size; + ubi32_t used_ebs; + ubi32_t data_pad; + ubi32_t data_crc; + uint8_t padding1[4]; + ubi64_t sqnum; + uint8_t padding2[12]; + ubi32_t hdr_crc; +} __attribute__ ((packed)); + +/* Internal UBI volumes count */ +#define UBI_INT_VOL_COUNT 1 + +/* + * Starting ID of internal volumes. There is reserved room for 4096 internal + * volumes. + */ +#define UBI_INTERNAL_VOL_START (0x7FFFFFFF - 4096) + +/* The layout volume contains the volume table */ + +#define UBI_LAYOUT_VOL_ID UBI_INTERNAL_VOL_START +#define UBI_LAYOUT_VOLUME_EBS 2 +#define UBI_LAYOUT_VOLUME_NAME "layout volume" +#define UBI_LAYOUT_VOLUME_COMPAT UBI_COMPAT_REJECT + +/* The maximum number of volumes per one UBI device */ +#define UBI_MAX_VOLUMES 128 + +/* The maximum volume name length */ +#define UBI_VOL_NAME_MAX 127 + +/* Size of the volume table record */ +#define UBI_VTBL_RECORD_SIZE sizeof(struct ubi_vtbl_record) + +/* Size of the volume table record without the ending CRC */ +#define UBI_VTBL_RECORD_SIZE_CRC (UBI_VTBL_RECORD_SIZE - sizeof(ubi32_t)) + +/** + * struct ubi_vtbl_record - a record in the volume table. + * @reserved_pebs: how many physical eraseblocks are reserved for this volume + * @alignment: volume alignment + * @data_pad: how many bytes are unused at the end of each physical + * eraseblock to satisfy the requested alignment + * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) + * @upd_marker: if volume update was started but not finished + * @name_len: volume name length + * @name: the volume name + * @padding2: reserved, zeroes + * @crc: a CRC32 checksum of the record + * + * The volume table records are stored in the volume table, which is stored in + * the layout volume. The layout volume consists of 2 logical eraseblocks, each + * of which contains a copy of the volume table (i.e., the volume table is + * duplicated). The volume table is an array of &struct ubi_vtbl_record + * objects indexed by the volume ID. + * + * If the size of the logical eraseblock is large enough to fit + * %UBI_MAX_VOLUMES records, the volume table contains %UBI_MAX_VOLUMES + * records. Otherwise, it contains as many records as it can fit (i.e., size of + * logical eraseblock divided by sizeof(struct ubi_vtbl_record)). + * + * The @upd_marker flag is used to implement volume update. It is set to %1 + * before update and set to %0 after the update. So if the update operation was + * interrupted, UBI knows that the volume is corrupted. + * + * The @alignment field is specified when the volume is created and cannot be + * changed later. It may be useful, for example, when a block-oriented file + * system works on top of UBI. The @data_pad field is calculated using the + * logical eraseblock size and @alignment. The alignment must be a multiple of + * the minimal flash I/O unit. If @alignment is 1, all the available space of + * the physical eraseblocks is used. + * + * Empty records contain all zeroes and the CRC checksum of those zeroes.
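Editorial aside illustrating the record-count rule stated above (not part of the patch; the helper name is made up). With the fields defined just below, a packed struct ubi_vtbl_record is 172 bytes (4+4+4+1+1+2+128+24+4), so the number of usable volume table slots in one copy of the layout volume is simply the logical eraseblock size divided by the record size, capped at UBI_MAX_VOLUMES.

/* Hypothetical helper: how many volume table slots fit in one logical eraseblock. */
static inline int ubi_vtbl_slots(int leb_size)
{
	int slots = leb_size / UBI_VTBL_RECORD_SIZE;	/* record is 172 bytes packed */

	return slots < UBI_MAX_VOLUMES ? slots : UBI_MAX_VOLUMES;
}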
+ */ +struct ubi_vtbl_record { + ubi32_t reserved_pebs; + ubi32_t alignment; + ubi32_t data_pad; + uint8_t vol_type; + uint8_t upd_marker; + ubi16_t name_len; + uint8_t name[UBI_VOL_NAME_MAX+1]; + uint8_t padding2[24]; + ubi32_t crc; +} __attribute__ ((packed)); + +#endif /* !__UBI_HEADER_H__ */ diff --git a/include/mtd/ubi-user.h b/include/mtd/ubi-user.h new file mode 100644 index 00000000000..fe06ded0e6b --- /dev/null +++ b/include/mtd/ubi-user.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Artem Bityutskiy (Битюцкий Артём) + */ + +#ifndef __UBI_USER_H__ +#define __UBI_USER_H__ + +/* + * UBI volume creation + * ~~~~~~~~~~~~~~~~~~~ + * + * UBI volumes are created via the %UBI_IOCMKVOL IOCTL command of the UBI character + * device. A &struct ubi_mkvol_req object has to be properly filled and a + * pointer to it has to be passed to the IOCTL. + * + * UBI volume deletion + * ~~~~~~~~~~~~~~~~~~~ + * + * To delete a volume, the %UBI_IOCRMVOL IOCTL command of the UBI character + * device should be used. A pointer to the 32-bit volume ID has to be passed + * to the IOCTL. + * + * UBI volume re-size + * ~~~~~~~~~~~~~~~~~~ + * + * To re-size a volume, the %UBI_IOCRSVOL IOCTL command of the UBI character + * device should be used. A &struct ubi_rsvol_req object has to be properly + * filled and a pointer to it has to be passed to the IOCTL. + * + * UBI volume update + * ~~~~~~~~~~~~~~~~~ + * + * Volume update should be done via the %UBI_IOCVOLUP IOCTL command of the + * corresponding UBI volume character device. A pointer to a 64-bit update + * size should be passed to the IOCTL. After that, UBI expects the user to write + * this number of bytes to the volume character device. The update is finished + * when the claimed number of bytes is passed. So, the volume update sequence + * is something like: + * + * fd = open("/dev/my_volume"); + * ioctl(fd, UBI_IOCVOLUP, &image_size); + * write(fd, buf, image_size); + * close(fd); + */ + +/* + * When a new volume is created, users may either specify the volume number they + * want to create or let UBI automatically assign a volume number using this + * constant.
+ */ +#define UBI_VOL_NUM_AUTO (-1) + +/* Maximum volume name length */ +#define UBI_MAX_VOLUME_NAME 127 + +/* IOCTL commands of UBI character devices */ + +#define UBI_IOC_MAGIC 'o' + +/* Create an UBI volume */ +#define UBI_IOCMKVOL _IOW(UBI_IOC_MAGIC, 0, struct ubi_mkvol_req) +/* Remove an UBI volume */ +#define UBI_IOCRMVOL _IOW(UBI_IOC_MAGIC, 1, int32_t) +/* Re-size an UBI volume */ +#define UBI_IOCRSVOL _IOW(UBI_IOC_MAGIC, 2, struct ubi_rsvol_req) + +/* IOCTL commands of UBI volume character devices */ + +#define UBI_VOL_IOC_MAGIC 'O' + +/* Start UBI volume update */ +#define UBI_IOCVOLUP _IOW(UBI_VOL_IOC_MAGIC, 0, int64_t) +/* An eraseblock erasure command, used for debugging, disabled by default */ +#define UBI_IOCEBER _IOW(UBI_VOL_IOC_MAGIC, 1, int32_t) + +/* + * UBI volume type constants. + * + * @UBI_DYNAMIC_VOLUME: dynamic volume + * @UBI_STATIC_VOLUME: static volume + */ +enum { + UBI_DYNAMIC_VOLUME = 3, + UBI_STATIC_VOLUME = 4 +}; + +/** + * struct ubi_mkvol_req - volume description data structure used in + * volume creation requests. + * @vol_id: volume number + * @alignment: volume alignment + * @bytes: volume size in bytes + * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) + * @padding1: reserved for future, not used + * @name_len: volume name length + * @padding2: reserved for future, not used + * @name: volume name + * + * This structure is used by userspace programs when creating new volumes. The + * @used_bytes field is only necessary when creating static volumes. + * + * The @alignment field specifies the required alignment of the volume logical + * eraseblock. This means that the size of logical eraseblocks will be aligned + * to this number, i.e., + * (UBI device logical eraseblock size) mod (@alignment) = 0. + * + * To put it differently, the logical eraseblock of this volume may be slightly + * shortened in order to make it properly aligned. The alignment has to be a + * multiple of the flash minimal input/output unit, or %1 to utilize the entire + * available space of logical eraseblocks. + * + * The @alignment field may be useful, for example, when one wants to maintain + * a block device on top of an UBI volume. In this case, it is desirable to fit + * an integer number of blocks in logical eraseblocks of this UBI volume. With + * alignment it is possible to update this volume using plain UBI volume image + * BLOBs, without caring about how to properly align them. + */ +struct ubi_mkvol_req { + int32_t vol_id; + int32_t alignment; + int64_t bytes; + int8_t vol_type; + int8_t padding1; + int16_t name_len; + int8_t padding2[4]; + char name[UBI_MAX_VOLUME_NAME+1]; +} __attribute__ ((packed)); + +/** + * struct ubi_rsvol_req - a data structure used in volume re-size requests. + * @vol_id: ID of the volume to re-size + * @bytes: new size of the volume in bytes + * + * Re-sizing is possible for both dynamic and static volumes. But while dynamic + * volumes may be re-sized arbitrarily, static volumes cannot be made to be + * smaller than the number of bytes they bear. To arbitrarily shrink a static + * volume, it must be wiped out first (by means of a volume update operation with + * zero bytes).
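Editorial aside: the open/ioctl/write/close update sequence sketched in the ubi-user.h comment above, expanded with basic error checking. This is not part of the patch; the device node path is hypothetical, the exported <mtd/ubi-user.h> header is assumed to be installed, and a single write() is assumed to transfer the whole image for brevity.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <mtd/ubi-user.h>

/* Write image_size bytes of new volume contents to the given UBI volume node. */
static int update_volume(const char *node, const void *image, int64_t image_size)
{
	int fd = open(node, O_RDWR);

	if (fd < 0)
		return -1;
	/* tell UBI how many bytes the update will carry, then stream them */
	if (ioctl(fd, UBI_IOCVOLUP, &image_size) < 0 ||
	    write(fd, image, image_size) != image_size) {
		close(fd);
		return -1;
	}
	return close(fd);
}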
+ */ +struct ubi_rsvol_req { + int64_t bytes; + int32_t vol_id; +} __attribute__ ((packed)); + +#endif /* __UBI_USER_H__ */ diff --git a/kernel/sched.c b/kernel/sched.c index b9a68373014..960d7c5fca3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4746,7 +4746,7 @@ void show_state_filter(unsigned long state_filter) * console might take alot of time: */ touch_nmi_watchdog(); - if (p->state & state_filter) + if (!state_filter || (p->state & state_filter)) show_task(p); } while_each_thread(g, p); diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 11a3404d65a..e1f18489db1 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -92,6 +92,33 @@ int cipso_v4_rbm_optfmt = 0; int cipso_v4_rbm_strictvalid = 1; /* + * Protocol Constants + */ + +/* Maximum size of the CIPSO IP option, derived from the fact that the maximum + * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */ +#define CIPSO_V4_OPT_LEN_MAX 40 + +/* Length of the base CIPSO option, this includes the option type (1 byte), the + * option length (1 byte), and the DOI (4 bytes). */ +#define CIPSO_V4_HDR_LEN 6 + +/* Base length of the restrictive category bitmap tag (tag #1). */ +#define CIPSO_V4_TAG_RBM_BLEN 4 + +/* Base length of the enumerated category tag (tag #2). */ +#define CIPSO_V4_TAG_ENUM_BLEN 4 + +/* Base length of the ranged categories bitmap tag (tag #5). */ +#define CIPSO_V4_TAG_RNG_BLEN 4 +/* The maximum number of category ranges permitted in the ranged category tag + * (tag #5). You may note that the IETF draft states that the maximum number + * of category ranges is 7, but if the low end of the last category range is + * zero then it is possible to fit 8 category ranges because the zero should + * be omitted. */ +#define CIPSO_V4_TAG_RNG_CAT_MAX 8 + +/* * Helper Functions */ @@ -1109,16 +1136,15 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, unsigned char *net_cat, u32 net_cat_len) { - /* The constant '16' is not random, it is the maximum number of - * high/low category range pairs as permitted by the CIPSO draft based - * on a maximum IPv4 header length of 60 bytes - the BUG_ON() assertion - * does a sanity check to make sure we don't overflow the array.
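Editorial aside, not part of the patch: the byte budget behind CIPSO_V4_TAG_RNG_CAT_MAX and the new length check that replaces the old BUG_ON() just below.

/*
 *   60 - 20                 = 40 bytes of IPv4 option space (CIPSO_V4_OPT_LEN_MAX)
 *   40 - 6 - 4              = 30 bytes left after the CIPSO header and the
 *                             tag #5 base length (the limit checked below)
 *   7 ranges * 4 bytes + 2  = 30 bytes, since the final range may omit a zero
 *                             low value, allowing 8 ranges in total
 *   8 ranges * 2 categories = 16 entries in the u16 array[] used below
 */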
*/ int iter = -1; - u16 array[16]; + u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2]; u32 array_cnt = 0; u32 cat_size = 0; - BUG_ON(net_cat_len > 30); + /* make sure we don't overflow the 'array[]' variable */ + if (net_cat_len > + (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN)) + return -ENOSPC; for (;;) { iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); @@ -1196,9 +1222,6 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, * Protocol Handling Functions */ -#define CIPSO_V4_OPT_LEN_MAX 40 -#define CIPSO_V4_HDR_LEN 6 - /** * cipso_v4_gentag_hdr - Generate a CIPSO option header * @doi_def: the DOI definition diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index e03a3282c55..f2535e7f286 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -263,9 +263,6 @@ int netlbl_socket_setattr(const struct socket *sock, int ret_val = -ENOENT; struct netlbl_dom_map *dom_entry; - if ((secattr->flags & NETLBL_SECATTR_DOMAIN) == 0) - return -ENOENT; - rcu_read_lock(); dom_entry = netlbl_domhsh_getentry(secattr->domain); if (dom_entry == NULL) diff --git a/security/selinux/Makefile b/security/selinux/Makefile index faf2e02e441..dc3502e30b1 100644 --- a/security/selinux/Makefile +++ b/security/selinux/Makefile @@ -8,5 +8,7 @@ selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o +selinux-$(CONFIG_NETLABEL) += netlabel.o + EXTRA_CFLAGS += -Isecurity/selinux/include diff --git a/security/selinux/avc.c b/security/selinux/avc.c index da8caf10ef9..e4396a89edc 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -217,6 +217,8 @@ static void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tcla audit_log_format(ab, " tcontext=%s", scontext); kfree(scontext); } + + BUG_ON(tclass >= ARRAY_SIZE(class_to_string) || !class_to_string[tclass]); audit_log_format(ab, " tclass=%s", class_to_string[tclass]); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 5f02b4be191..885a9a958b8 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -77,7 +77,7 @@ #include "objsec.h" #include "netif.h" #include "xfrm.h" -#include "selinux_netlabel.h" +#include "netlabel.h" #define XATTR_SELINUX_SUFFIX "selinux" #define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX @@ -3123,6 +3123,34 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, return ret; } +/** + * selinux_skb_extlbl_sid - Determine the external label of a packet + * @skb: the packet + * @base_sid: the SELinux SID to use as a context for MLS only external labels + * @sid: the packet's SID + * + * Description: + * Check the various different forms of external packet labeling and determine + * the external SID for the packet. + * + */ +static void selinux_skb_extlbl_sid(struct sk_buff *skb, + u32 base_sid, + u32 *sid) +{ + u32 xfrm_sid; + u32 nlbl_sid; + + selinux_skb_xfrm_sid(skb, &xfrm_sid); + if (selinux_netlbl_skbuff_getsid(skb, + (xfrm_sid == SECSID_NULL ? + base_sid : xfrm_sid), + &nlbl_sid) != 0) + nlbl_sid = SECSID_NULL; + + *sid = (nlbl_sid == SECSID_NULL ? 
xfrm_sid : nlbl_sid); +} + /* socket security operations */ static int socket_has_perm(struct task_struct *task, struct socket *sock, u32 perms) @@ -3664,9 +3692,7 @@ static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff * if (sock && sock->sk->sk_family == PF_UNIX) selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); else if (skb) - security_skb_extlbl_sid(skb, - SECINITSID_UNLABELED, - &peer_secid); + selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peer_secid); if (peer_secid == SECSID_NULL) err = -EINVAL; @@ -3727,7 +3753,7 @@ static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb, u32 newsid; u32 peersid; - security_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peersid); + selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &peersid); if (peersid == SECSID_NULL) { req->secid = sksec->sid; req->peer_secid = SECSID_NULL; @@ -3765,7 +3791,7 @@ static void selinux_inet_conn_established(struct sock *sk, { struct sk_security_struct *sksec = sk->sk_security; - security_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &sksec->peer_sid); + selinux_skb_extlbl_sid(skb, SECINITSID_UNLABELED, &sksec->peer_sid); } static void selinux_req_classify_flow(const struct request_sock *req, diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h index ad9fb2d69b5..b83e74012a9 100644 --- a/security/selinux/include/av_perm_to_string.h +++ b/security/selinux/include/av_perm_to_string.h @@ -128,96 +128,6 @@ S_(SECCLASS_CAPABILITY, CAPABILITY__LEASE, "lease") S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_WRITE, "audit_write") S_(SECCLASS_CAPABILITY, CAPABILITY__AUDIT_CONTROL, "audit_control") - S_(SECCLASS_PASSWD, PASSWD__PASSWD, "passwd") - S_(SECCLASS_PASSWD, PASSWD__CHFN, "chfn") - S_(SECCLASS_PASSWD, PASSWD__CHSH, "chsh") - S_(SECCLASS_PASSWD, PASSWD__ROOTOK, "rootok") - S_(SECCLASS_PASSWD, PASSWD__CRONTAB, "crontab") - S_(SECCLASS_DRAWABLE, DRAWABLE__CREATE, "create") - S_(SECCLASS_DRAWABLE, DRAWABLE__DESTROY, "destroy") - S_(SECCLASS_DRAWABLE, DRAWABLE__DRAW, "draw") - S_(SECCLASS_DRAWABLE, DRAWABLE__COPY, "copy") - S_(SECCLASS_DRAWABLE, DRAWABLE__GETATTR, "getattr") - S_(SECCLASS_GC, GC__CREATE, "create") - S_(SECCLASS_GC, GC__FREE, "free") - S_(SECCLASS_GC, GC__GETATTR, "getattr") - S_(SECCLASS_GC, GC__SETATTR, "setattr") - S_(SECCLASS_WINDOW, WINDOW__ADDCHILD, "addchild") - S_(SECCLASS_WINDOW, WINDOW__CREATE, "create") - S_(SECCLASS_WINDOW, WINDOW__DESTROY, "destroy") - S_(SECCLASS_WINDOW, WINDOW__MAP, "map") - S_(SECCLASS_WINDOW, WINDOW__UNMAP, "unmap") - S_(SECCLASS_WINDOW, WINDOW__CHSTACK, "chstack") - S_(SECCLASS_WINDOW, WINDOW__CHPROPLIST, "chproplist") - S_(SECCLASS_WINDOW, WINDOW__CHPROP, "chprop") - S_(SECCLASS_WINDOW, WINDOW__LISTPROP, "listprop") - S_(SECCLASS_WINDOW, WINDOW__GETATTR, "getattr") - S_(SECCLASS_WINDOW, WINDOW__SETATTR, "setattr") - S_(SECCLASS_WINDOW, WINDOW__SETFOCUS, "setfocus") - S_(SECCLASS_WINDOW, WINDOW__MOVE, "move") - S_(SECCLASS_WINDOW, WINDOW__CHSELECTION, "chselection") - S_(SECCLASS_WINDOW, WINDOW__CHPARENT, "chparent") - S_(SECCLASS_WINDOW, WINDOW__CTRLLIFE, "ctrllife") - S_(SECCLASS_WINDOW, WINDOW__ENUMERATE, "enumerate") - S_(SECCLASS_WINDOW, WINDOW__TRANSPARENT, "transparent") - S_(SECCLASS_WINDOW, WINDOW__MOUSEMOTION, "mousemotion") - S_(SECCLASS_WINDOW, WINDOW__CLIENTCOMEVENT, "clientcomevent") - S_(SECCLASS_WINDOW, WINDOW__INPUTEVENT, "inputevent") - S_(SECCLASS_WINDOW, WINDOW__DRAWEVENT, "drawevent") - S_(SECCLASS_WINDOW, WINDOW__WINDOWCHANGEEVENT, "windowchangeevent") - 
S_(SECCLASS_WINDOW, WINDOW__WINDOWCHANGEREQUEST, "windowchangerequest") - S_(SECCLASS_WINDOW, WINDOW__SERVERCHANGEEVENT, "serverchangeevent") - S_(SECCLASS_WINDOW, WINDOW__EXTENSIONEVENT, "extensionevent") - S_(SECCLASS_FONT, FONT__LOAD, "load") - S_(SECCLASS_FONT, FONT__FREE, "free") - S_(SECCLASS_FONT, FONT__GETATTR, "getattr") - S_(SECCLASS_FONT, FONT__USE, "use") - S_(SECCLASS_COLORMAP, COLORMAP__CREATE, "create") - S_(SECCLASS_COLORMAP, COLORMAP__FREE, "free") - S_(SECCLASS_COLORMAP, COLORMAP__INSTALL, "install") - S_(SECCLASS_COLORMAP, COLORMAP__UNINSTALL, "uninstall") - S_(SECCLASS_COLORMAP, COLORMAP__LIST, "list") - S_(SECCLASS_COLORMAP, COLORMAP__READ, "read") - S_(SECCLASS_COLORMAP, COLORMAP__STORE, "store") - S_(SECCLASS_COLORMAP, COLORMAP__GETATTR, "getattr") - S_(SECCLASS_COLORMAP, COLORMAP__SETATTR, "setattr") - S_(SECCLASS_PROPERTY, PROPERTY__CREATE, "create") - S_(SECCLASS_PROPERTY, PROPERTY__FREE, "free") - S_(SECCLASS_PROPERTY, PROPERTY__READ, "read") - S_(SECCLASS_PROPERTY, PROPERTY__WRITE, "write") - S_(SECCLASS_CURSOR, CURSOR__CREATE, "create") - S_(SECCLASS_CURSOR, CURSOR__CREATEGLYPH, "createglyph") - S_(SECCLASS_CURSOR, CURSOR__FREE, "free") - S_(SECCLASS_CURSOR, CURSOR__ASSIGN, "assign") - S_(SECCLASS_CURSOR, CURSOR__SETATTR, "setattr") - S_(SECCLASS_XCLIENT, XCLIENT__KILL, "kill") - S_(SECCLASS_XINPUT, XINPUT__LOOKUP, "lookup") - S_(SECCLASS_XINPUT, XINPUT__GETATTR, "getattr") - S_(SECCLASS_XINPUT, XINPUT__SETATTR, "setattr") - S_(SECCLASS_XINPUT, XINPUT__SETFOCUS, "setfocus") - S_(SECCLASS_XINPUT, XINPUT__WARPPOINTER, "warppointer") - S_(SECCLASS_XINPUT, XINPUT__ACTIVEGRAB, "activegrab") - S_(SECCLASS_XINPUT, XINPUT__PASSIVEGRAB, "passivegrab") - S_(SECCLASS_XINPUT, XINPUT__UNGRAB, "ungrab") - S_(SECCLASS_XINPUT, XINPUT__BELL, "bell") - S_(SECCLASS_XINPUT, XINPUT__MOUSEMOTION, "mousemotion") - S_(SECCLASS_XINPUT, XINPUT__RELABELINPUT, "relabelinput") - S_(SECCLASS_XSERVER, XSERVER__SCREENSAVER, "screensaver") - S_(SECCLASS_XSERVER, XSERVER__GETHOSTLIST, "gethostlist") - S_(SECCLASS_XSERVER, XSERVER__SETHOSTLIST, "sethostlist") - S_(SECCLASS_XSERVER, XSERVER__GETFONTPATH, "getfontpath") - S_(SECCLASS_XSERVER, XSERVER__SETFONTPATH, "setfontpath") - S_(SECCLASS_XSERVER, XSERVER__GETATTR, "getattr") - S_(SECCLASS_XSERVER, XSERVER__GRAB, "grab") - S_(SECCLASS_XSERVER, XSERVER__UNGRAB, "ungrab") - S_(SECCLASS_XEXTENSION, XEXTENSION__QUERY, "query") - S_(SECCLASS_XEXTENSION, XEXTENSION__USE, "use") - S_(SECCLASS_PAX, PAX__PAGEEXEC, "pageexec") - S_(SECCLASS_PAX, PAX__EMUTRAMP, "emutramp") - S_(SECCLASS_PAX, PAX__MPROTECT, "mprotect") - S_(SECCLASS_PAX, PAX__RANDMMAP, "randmmap") - S_(SECCLASS_PAX, PAX__RANDEXEC, "randexec") - S_(SECCLASS_PAX, PAX__SEGMEXEC, "segmexec") S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ, "nlmsg_read") S_(SECCLASS_NETLINK_ROUTE_SOCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE, "nlmsg_write") S_(SECCLASS_NETLINK_FIREWALL_SOCKET, NETLINK_FIREWALL_SOCKET__NLMSG_READ, "nlmsg_read") @@ -232,16 +142,6 @@ S_(SECCLASS_NETLINK_AUDIT_SOCKET, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV, "nlmsg_readpriv") S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_READ, "nlmsg_read") S_(SECCLASS_NETLINK_IP6FW_SOCKET, NETLINK_IP6FW_SOCKET__NLMSG_WRITE, "nlmsg_write") - S_(SECCLASS_DBUS, DBUS__ACQUIRE_SVC, "acquire_svc") - S_(SECCLASS_DBUS, DBUS__SEND_MSG, "send_msg") - S_(SECCLASS_NSCD, NSCD__GETPWD, "getpwd") - S_(SECCLASS_NSCD, NSCD__GETGRP, "getgrp") - S_(SECCLASS_NSCD, NSCD__GETHOST, "gethost") - S_(SECCLASS_NSCD, NSCD__GETSTAT, "getstat") - 
S_(SECCLASS_NSCD, NSCD__ADMIN, "admin") - S_(SECCLASS_NSCD, NSCD__SHMEMPWD, "shmempwd") - S_(SECCLASS_NSCD, NSCD__SHMEMGRP, "shmemgrp") - S_(SECCLASS_NSCD, NSCD__SHMEMHOST, "shmemhost") S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto") S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom") S_(SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, "setcontext") @@ -256,7 +156,5 @@ S_(SECCLASS_KEY, KEY__LINK, "link") S_(SECCLASS_KEY, KEY__SETATTR, "setattr") S_(SECCLASS_KEY, KEY__CREATE, "create") - S_(SECCLASS_CONTEXT, CONTEXT__TRANSLATE, "translate") - S_(SECCLASS_CONTEXT, CONTEXT__CONTAINS, "contains") S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h index 2de4b5fe3aa..5fee1735bff 100644 --- a/security/selinux/include/av_permissions.h +++ b/security/selinux/include/av_permissions.h @@ -16,7 +16,6 @@ #define COMMON_FILE__SWAPON 0x00004000UL #define COMMON_FILE__QUOTAON 0x00008000UL #define COMMON_FILE__MOUNTON 0x00010000UL - #define COMMON_SOCKET__IOCTL 0x00000001UL #define COMMON_SOCKET__READ 0x00000002UL #define COMMON_SOCKET__WRITE 0x00000004UL @@ -39,7 +38,6 @@ #define COMMON_SOCKET__RECV_MSG 0x00080000UL #define COMMON_SOCKET__SEND_MSG 0x00100000UL #define COMMON_SOCKET__NAME_BIND 0x00200000UL - #define COMMON_IPC__CREATE 0x00000001UL #define COMMON_IPC__DESTROY 0x00000002UL #define COMMON_IPC__GETATTR 0x00000004UL @@ -49,7 +47,6 @@ #define COMMON_IPC__ASSOCIATE 0x00000040UL #define COMMON_IPC__UNIX_READ 0x00000080UL #define COMMON_IPC__UNIX_WRITE 0x00000100UL - #define FILESYSTEM__MOUNT 0x00000001UL #define FILESYSTEM__REMOUNT 0x00000002UL #define FILESYSTEM__UNMOUNT 0x00000004UL @@ -60,7 +57,6 @@ #define FILESYSTEM__ASSOCIATE 0x00000080UL #define FILESYSTEM__QUOTAMOD 0x00000100UL #define FILESYSTEM__QUOTAGET 0x00000200UL - #define DIR__IOCTL 0x00000001UL #define DIR__READ 0x00000002UL #define DIR__WRITE 0x00000004UL @@ -78,13 +74,11 @@ #define DIR__SWAPON 0x00004000UL #define DIR__QUOTAON 0x00008000UL #define DIR__MOUNTON 0x00010000UL - #define DIR__ADD_NAME 0x00020000UL #define DIR__REMOVE_NAME 0x00040000UL #define DIR__REPARENT 0x00080000UL #define DIR__SEARCH 0x00100000UL #define DIR__RMDIR 0x00200000UL - #define FILE__IOCTL 0x00000001UL #define FILE__READ 0x00000002UL #define FILE__WRITE 0x00000004UL @@ -102,11 +96,9 @@ #define FILE__SWAPON 0x00004000UL #define FILE__QUOTAON 0x00008000UL #define FILE__MOUNTON 0x00010000UL - #define FILE__EXECUTE_NO_TRANS 0x00020000UL #define FILE__ENTRYPOINT 0x00040000UL #define FILE__EXECMOD 0x00080000UL - #define LNK_FILE__IOCTL 0x00000001UL #define LNK_FILE__READ 0x00000002UL #define LNK_FILE__WRITE 0x00000004UL @@ -124,7 +116,6 @@ #define LNK_FILE__SWAPON 0x00004000UL #define LNK_FILE__QUOTAON 0x00008000UL #define LNK_FILE__MOUNTON 0x00010000UL - #define CHR_FILE__IOCTL 0x00000001UL #define CHR_FILE__READ 0x00000002UL #define CHR_FILE__WRITE 0x00000004UL @@ -142,11 +133,9 @@ #define CHR_FILE__SWAPON 0x00004000UL #define CHR_FILE__QUOTAON 0x00008000UL #define CHR_FILE__MOUNTON 0x00010000UL - #define CHR_FILE__EXECUTE_NO_TRANS 0x00020000UL #define CHR_FILE__ENTRYPOINT 0x00040000UL #define CHR_FILE__EXECMOD 0x00080000UL - #define BLK_FILE__IOCTL 0x00000001UL #define BLK_FILE__READ 0x00000002UL #define BLK_FILE__WRITE 0x00000004UL @@ -164,7 +153,6 @@ #define BLK_FILE__SWAPON 0x00004000UL #define BLK_FILE__QUOTAON 0x00008000UL #define BLK_FILE__MOUNTON 0x00010000UL - 
#define SOCK_FILE__IOCTL 0x00000001UL #define SOCK_FILE__READ 0x00000002UL #define SOCK_FILE__WRITE 0x00000004UL @@ -182,7 +170,6 @@ #define SOCK_FILE__SWAPON 0x00004000UL #define SOCK_FILE__QUOTAON 0x00008000UL #define SOCK_FILE__MOUNTON 0x00010000UL - #define FIFO_FILE__IOCTL 0x00000001UL #define FIFO_FILE__READ 0x00000002UL #define FIFO_FILE__WRITE 0x00000004UL @@ -200,9 +187,7 @@ #define FIFO_FILE__SWAPON 0x00004000UL #define FIFO_FILE__QUOTAON 0x00008000UL #define FIFO_FILE__MOUNTON 0x00010000UL - #define FD__USE 0x00000001UL - #define SOCKET__IOCTL 0x00000001UL #define SOCKET__READ 0x00000002UL #define SOCKET__WRITE 0x00000004UL @@ -225,7 +210,6 @@ #define SOCKET__RECV_MSG 0x00080000UL #define SOCKET__SEND_MSG 0x00100000UL #define SOCKET__NAME_BIND 0x00200000UL - #define TCP_SOCKET__IOCTL 0x00000001UL #define TCP_SOCKET__READ 0x00000002UL #define TCP_SOCKET__WRITE 0x00000004UL @@ -248,13 +232,11 @@ #define TCP_SOCKET__RECV_MSG 0x00080000UL #define TCP_SOCKET__SEND_MSG 0x00100000UL #define TCP_SOCKET__NAME_BIND 0x00200000UL - #define TCP_SOCKET__CONNECTTO 0x00400000UL #define TCP_SOCKET__NEWCONN 0x00800000UL #define TCP_SOCKET__ACCEPTFROM 0x01000000UL #define TCP_SOCKET__NODE_BIND 0x02000000UL #define TCP_SOCKET__NAME_CONNECT 0x04000000UL - #define UDP_SOCKET__IOCTL 0x00000001UL #define UDP_SOCKET__READ 0x00000002UL #define UDP_SOCKET__WRITE 0x00000004UL @@ -277,9 +259,7 @@ #define UDP_SOCKET__RECV_MSG 0x00080000UL #define UDP_SOCKET__SEND_MSG 0x00100000UL #define UDP_SOCKET__NAME_BIND 0x00200000UL - #define UDP_SOCKET__NODE_BIND 0x00400000UL - #define RAWIP_SOCKET__IOCTL 0x00000001UL #define RAWIP_SOCKET__READ 0x00000002UL #define RAWIP_SOCKET__WRITE 0x00000004UL @@ -302,9 +282,7 @@ #define RAWIP_SOCKET__RECV_MSG 0x00080000UL #define RAWIP_SOCKET__SEND_MSG 0x00100000UL #define RAWIP_SOCKET__NAME_BIND 0x00200000UL - #define RAWIP_SOCKET__NODE_BIND 0x00400000UL - #define NODE__TCP_RECV 0x00000001UL #define NODE__TCP_SEND 0x00000002UL #define NODE__UDP_RECV 0x00000004UL @@ -314,7 +292,6 @@ #define NODE__ENFORCE_DEST 0x00000040UL #define NODE__DCCP_RECV 0x00000080UL #define NODE__DCCP_SEND 0x00000100UL - #define NETIF__TCP_RECV 0x00000001UL #define NETIF__TCP_SEND 0x00000002UL #define NETIF__UDP_RECV 0x00000004UL @@ -323,7 +300,6 @@ #define NETIF__RAWIP_SEND 0x00000020UL #define NETIF__DCCP_RECV 0x00000040UL #define NETIF__DCCP_SEND 0x00000080UL - #define NETLINK_SOCKET__IOCTL 0x00000001UL #define NETLINK_SOCKET__READ 0x00000002UL #define NETLINK_SOCKET__WRITE 0x00000004UL @@ -346,7 +322,6 @@ #define NETLINK_SOCKET__RECV_MSG 0x00080000UL #define NETLINK_SOCKET__SEND_MSG 0x00100000UL #define NETLINK_SOCKET__NAME_BIND 0x00200000UL - #define PACKET_SOCKET__IOCTL 0x00000001UL #define PACKET_SOCKET__READ 0x00000002UL #define PACKET_SOCKET__WRITE 0x00000004UL @@ -369,7 +344,6 @@ #define PACKET_SOCKET__RECV_MSG 0x00080000UL #define PACKET_SOCKET__SEND_MSG 0x00100000UL #define PACKET_SOCKET__NAME_BIND 0x00200000UL - #define KEY_SOCKET__IOCTL 0x00000001UL #define KEY_SOCKET__READ 0x00000002UL #define KEY_SOCKET__WRITE 0x00000004UL @@ -392,7 +366,6 @@ #define KEY_SOCKET__RECV_MSG 0x00080000UL #define KEY_SOCKET__SEND_MSG 0x00100000UL #define KEY_SOCKET__NAME_BIND 0x00200000UL - #define UNIX_STREAM_SOCKET__IOCTL 0x00000001UL #define UNIX_STREAM_SOCKET__READ 0x00000002UL #define UNIX_STREAM_SOCKET__WRITE 0x00000004UL @@ -415,11 +388,9 @@ #define UNIX_STREAM_SOCKET__RECV_MSG 0x00080000UL #define UNIX_STREAM_SOCKET__SEND_MSG 0x00100000UL #define UNIX_STREAM_SOCKET__NAME_BIND 0x00200000UL - #define 
UNIX_STREAM_SOCKET__CONNECTTO 0x00400000UL #define UNIX_STREAM_SOCKET__NEWCONN 0x00800000UL #define UNIX_STREAM_SOCKET__ACCEPTFROM 0x01000000UL - #define UNIX_DGRAM_SOCKET__IOCTL 0x00000001UL #define UNIX_DGRAM_SOCKET__READ 0x00000002UL #define UNIX_DGRAM_SOCKET__WRITE 0x00000004UL @@ -442,7 +413,6 @@ #define UNIX_DGRAM_SOCKET__RECV_MSG 0x00080000UL #define UNIX_DGRAM_SOCKET__SEND_MSG 0x00100000UL #define UNIX_DGRAM_SOCKET__NAME_BIND 0x00200000UL - #define PROCESS__FORK 0x00000001UL #define PROCESS__TRANSITION 0x00000002UL #define PROCESS__SIGCHLD 0x00000004UL @@ -473,7 +443,6 @@ #define PROCESS__EXECHEAP 0x08000000UL #define PROCESS__SETKEYCREATE 0x10000000UL #define PROCESS__SETSOCKCREATE 0x20000000UL - #define IPC__CREATE 0x00000001UL #define IPC__DESTROY 0x00000002UL #define IPC__GETATTR 0x00000004UL @@ -483,7 +452,6 @@ #define IPC__ASSOCIATE 0x00000040UL #define IPC__UNIX_READ 0x00000080UL #define IPC__UNIX_WRITE 0x00000100UL - #define SEM__CREATE 0x00000001UL #define SEM__DESTROY 0x00000002UL #define SEM__GETATTR 0x00000004UL @@ -493,7 +461,6 @@ #define SEM__ASSOCIATE 0x00000040UL #define SEM__UNIX_READ 0x00000080UL #define SEM__UNIX_WRITE 0x00000100UL - #define MSGQ__CREATE 0x00000001UL #define MSGQ__DESTROY 0x00000002UL #define MSGQ__GETATTR 0x00000004UL @@ -503,12 +470,9 @@ #define MSGQ__ASSOCIATE 0x00000040UL #define MSGQ__UNIX_READ 0x00000080UL #define MSGQ__UNIX_WRITE 0x00000100UL - #define MSGQ__ENQUEUE 0x00000200UL - #define MSG__SEND 0x00000001UL #define MSG__RECEIVE 0x00000002UL - #define SHM__CREATE 0x00000001UL #define SHM__DESTROY 0x00000002UL #define SHM__GETATTR 0x00000004UL @@ -518,9 +482,7 @@ #define SHM__ASSOCIATE 0x00000040UL #define SHM__UNIX_READ 0x00000080UL #define SHM__UNIX_WRITE 0x00000100UL - #define SHM__LOCK 0x00000200UL - #define SECURITY__COMPUTE_AV 0x00000001UL #define SECURITY__COMPUTE_CREATE 0x00000002UL #define SECURITY__COMPUTE_MEMBER 0x00000004UL @@ -532,12 +494,10 @@ #define SECURITY__SETBOOL 0x00000100UL #define SECURITY__SETSECPARAM 0x00000200UL #define SECURITY__SETCHECKREQPROT 0x00000400UL - #define SYSTEM__IPC_INFO 0x00000001UL #define SYSTEM__SYSLOG_READ 0x00000002UL #define SYSTEM__SYSLOG_MOD 0x00000004UL #define SYSTEM__SYSLOG_CONSOLE 0x00000008UL - #define CAPABILITY__CHOWN 0x00000001UL #define CAPABILITY__DAC_OVERRIDE 0x00000002UL #define CAPABILITY__DAC_READ_SEARCH 0x00000004UL @@ -569,110 +529,6 @@ #define CAPABILITY__LEASE 0x10000000UL #define CAPABILITY__AUDIT_WRITE 0x20000000UL #define CAPABILITY__AUDIT_CONTROL 0x40000000UL - -#define PASSWD__PASSWD 0x00000001UL -#define PASSWD__CHFN 0x00000002UL -#define PASSWD__CHSH 0x00000004UL -#define PASSWD__ROOTOK 0x00000008UL -#define PASSWD__CRONTAB 0x00000010UL - -#define DRAWABLE__CREATE 0x00000001UL -#define DRAWABLE__DESTROY 0x00000002UL -#define DRAWABLE__DRAW 0x00000004UL -#define DRAWABLE__COPY 0x00000008UL -#define DRAWABLE__GETATTR 0x00000010UL - -#define GC__CREATE 0x00000001UL -#define GC__FREE 0x00000002UL -#define GC__GETATTR 0x00000004UL -#define GC__SETATTR 0x00000008UL - -#define WINDOW__ADDCHILD 0x00000001UL -#define WINDOW__CREATE 0x00000002UL -#define WINDOW__DESTROY 0x00000004UL -#define WINDOW__MAP 0x00000008UL -#define WINDOW__UNMAP 0x00000010UL -#define WINDOW__CHSTACK 0x00000020UL -#define WINDOW__CHPROPLIST 0x00000040UL -#define WINDOW__CHPROP 0x00000080UL -#define WINDOW__LISTPROP 0x00000100UL -#define WINDOW__GETATTR 0x00000200UL -#define WINDOW__SETATTR 0x00000400UL -#define WINDOW__SETFOCUS 0x00000800UL -#define WINDOW__MOVE 0x00001000UL -#define 
WINDOW__CHSELECTION 0x00002000UL -#define WINDOW__CHPARENT 0x00004000UL -#define WINDOW__CTRLLIFE 0x00008000UL -#define WINDOW__ENUMERATE 0x00010000UL -#define WINDOW__TRANSPARENT 0x00020000UL -#define WINDOW__MOUSEMOTION 0x00040000UL -#define WINDOW__CLIENTCOMEVENT 0x00080000UL -#define WINDOW__INPUTEVENT 0x00100000UL -#define WINDOW__DRAWEVENT 0x00200000UL -#define WINDOW__WINDOWCHANGEEVENT 0x00400000UL -#define WINDOW__WINDOWCHANGEREQUEST 0x00800000UL -#define WINDOW__SERVERCHANGEEVENT 0x01000000UL -#define WINDOW__EXTENSIONEVENT 0x02000000UL - -#define FONT__LOAD 0x00000001UL -#define FONT__FREE 0x00000002UL -#define FONT__GETATTR 0x00000004UL -#define FONT__USE 0x00000008UL - -#define COLORMAP__CREATE 0x00000001UL -#define COLORMAP__FREE 0x00000002UL -#define COLORMAP__INSTALL 0x00000004UL -#define COLORMAP__UNINSTALL 0x00000008UL -#define COLORMAP__LIST 0x00000010UL -#define COLORMAP__READ 0x00000020UL -#define COLORMAP__STORE 0x00000040UL -#define COLORMAP__GETATTR 0x00000080UL -#define COLORMAP__SETATTR 0x00000100UL - -#define PROPERTY__CREATE 0x00000001UL -#define PROPERTY__FREE 0x00000002UL -#define PROPERTY__READ 0x00000004UL -#define PROPERTY__WRITE 0x00000008UL - -#define CURSOR__CREATE 0x00000001UL -#define CURSOR__CREATEGLYPH 0x00000002UL -#define CURSOR__FREE 0x00000004UL -#define CURSOR__ASSIGN 0x00000008UL -#define CURSOR__SETATTR 0x00000010UL - -#define XCLIENT__KILL 0x00000001UL - -#define XINPUT__LOOKUP 0x00000001UL -#define XINPUT__GETATTR 0x00000002UL -#define XINPUT__SETATTR 0x00000004UL -#define XINPUT__SETFOCUS 0x00000008UL -#define XINPUT__WARPPOINTER 0x00000010UL -#define XINPUT__ACTIVEGRAB 0x00000020UL -#define XINPUT__PASSIVEGRAB 0x00000040UL -#define XINPUT__UNGRAB 0x00000080UL -#define XINPUT__BELL 0x00000100UL -#define XINPUT__MOUSEMOTION 0x00000200UL -#define XINPUT__RELABELINPUT 0x00000400UL - -#define XSERVER__SCREENSAVER 0x00000001UL -#define XSERVER__GETHOSTLIST 0x00000002UL -#define XSERVER__SETHOSTLIST 0x00000004UL -#define XSERVER__GETFONTPATH 0x00000008UL -#define XSERVER__SETFONTPATH 0x00000010UL -#define XSERVER__GETATTR 0x00000020UL -#define XSERVER__GRAB 0x00000040UL -#define XSERVER__UNGRAB 0x00000080UL - -#define XEXTENSION__QUERY 0x00000001UL -#define XEXTENSION__USE 0x00000002UL - -#define PAX__PAGEEXEC 0x00000001UL -#define PAX__EMUTRAMP 0x00000002UL -#define PAX__MPROTECT 0x00000004UL -#define PAX__RANDMMAP 0x00000008UL -#define PAX__RANDEXEC 0x00000010UL -#define PAX__SEGMEXEC 0x00000020UL - #define NETLINK_ROUTE_SOCKET__IOCTL 0x00000001UL #define NETLINK_ROUTE_SOCKET__READ 0x00000002UL #define NETLINK_ROUTE_SOCKET__WRITE 0x00000004UL @@ -695,10 +551,8 @@ #define NETLINK_ROUTE_SOCKET__RECV_MSG 0x00080000UL #define NETLINK_ROUTE_SOCKET__SEND_MSG 0x00100000UL #define NETLINK_ROUTE_SOCKET__NAME_BIND 0x00200000UL - #define NETLINK_ROUTE_SOCKET__NLMSG_READ 0x00400000UL #define NETLINK_ROUTE_SOCKET__NLMSG_WRITE 0x00800000UL - #define NETLINK_FIREWALL_SOCKET__IOCTL 0x00000001UL #define NETLINK_FIREWALL_SOCKET__READ 0x00000002UL #define NETLINK_FIREWALL_SOCKET__WRITE 0x00000004UL @@ -721,10 +575,8 @@ #define NETLINK_FIREWALL_SOCKET__RECV_MSG 0x00080000UL #define NETLINK_FIREWALL_SOCKET__SEND_MSG 0x00100000UL #define NETLINK_FIREWALL_SOCKET__NAME_BIND 0x00200000UL - #define NETLINK_FIREWALL_SOCKET__NLMSG_READ 0x00400000UL #define NETLINK_FIREWALL_SOCKET__NLMSG_WRITE 0x00800000UL - #define NETLINK_TCPDIAG_SOCKET__IOCTL 0x00000001UL #define NETLINK_TCPDIAG_SOCKET__READ 0x00000002UL #define NETLINK_TCPDIAG_SOCKET__WRITE 0x00000004UL @@ -747,10 
+599,8 @@ #define NETLINK_TCPDIAG_SOCKET__RECV_MSG 0x00080000UL #define NETLINK_TCPDIAG_SOCKET__SEND_MSG 0x00100000UL #define NETLINK_TCPDIAG_SOCKET__NAME_BIND 0x00200000UL - #define NETLINK_TCPDIAG_SOCKET__NLMSG_READ 0x00400000UL #define NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE 0x00800000UL - #define NETLINK_NFLOG_SOCKET__IOCTL 0x00000001UL #define NETLINK_NFLOG_SOCKET__READ 0x00000002UL #define NETLINK_NFLOG_SOCKET__WRITE 0x00000004UL @@ -773,7 +623,6 @@ #define NETLINK_NFLOG_SOCKET__RECV_MSG 0x00080000UL #define NETLINK_NFLOG_SOCKET__SEND_MSG 0x00100000UL #define NETLINK_NFLOG_SOCKET__NAME_BIND 0x00200000UL - #define NETLINK_XFRM_SOCKET__IOCTL 0x00000001UL #define NETLINK_XFRM_SOCKET__READ 0x00000002UL #define NETLINK_XFRM_SOCKET__WRITE 0x00000004UL @@ -796,10 +645,8 @@ #define NETLINK_XFRM_SOCKET__RECV_MSG 0x00080000UL #define NETLINK_XFRM_SOCKET__SEND_MSG 0x00100000UL #define NETLINK_XFRM_SOCKET__NAME_BIND 0x00200000UL - #define NETLINK_XFRM_SOCKET__NLMSG_READ 0x00400000UL #define NETLINK_XFRM_SOCKET__NLMSG_WRITE 0x00800000UL - #define NETLINK_SELINUX_SOCKET__IOCTL 0x00000001UL #define NETLINK_SELINUX_SOCKET__READ 0x00000002UL #define NETLINK_SELINUX_SOCKET__WRITE 0x00000004UL @@ -822,7 +669,6 @@ #define NETLINK_SELINUX_SOCKET__RECV_MSG 0x00080000UL #define NETLINK_SELINUX_SOCKET__SEND_MSG 0x00100000UL #define NETLINK_SELINUX_SOCKET__NAME_BIND 0x00200000UL - #define NETLINK_AUDIT_SOCKET__IOCTL 0x00000001UL #define NETLINK_AUDIT_SOCKET__READ 0x00000002UL #define NETLINK_AUDIT_SOCKET__WRITE 0x00000004UL @@ -845,12 +691,10 @@ #define NETLINK_AUDIT_SOCKET__RECV_MSG 0x00080000UL #define NETLINK_AUDIT_SOCKET__SEND_MSG 0x00100000UL #define NETLINK_AUDIT_SOCKET__NAME_BIND 0x00200000UL - #define NETLINK_AUDIT_SOCKET__NLMSG_READ 0x00400000UL #define NETLINK_AUDIT_SOCKET__NLMSG_WRITE 0x00800000UL #define NETLINK_AUDIT_SOCKET__NLMSG_RELAY 0x01000000UL #define NETLINK_AUDIT_SOCKET__NLMSG_READPRIV 0x02000000UL - #define NETLINK_IP6FW_SOCKET__IOCTL 0x00000001UL #define NETLINK_IP6FW_SOCKET__READ 0x00000002UL #define NETLINK_IP6FW_SOCKET__WRITE 0x00000004UL @@ -873,10 +717,8 @@ #define NETLINK_IP6FW_SOCKET__RECV_MSG 0x00080000UL #define NETLINK_IP6FW_SOCKET__SEND_MSG 0x00100000UL #define NETLINK_IP6FW_SOCKET__NAME_BIND 0x00200000UL - #define NETLINK_IP6FW_SOCKET__NLMSG_READ 0x00400000UL #define NETLINK_IP6FW_SOCKET__NLMSG_WRITE 0x00800000UL - #define NETLINK_DNRT_SOCKET__IOCTL 0x00000001UL #define NETLINK_DNRT_SOCKET__READ 0x00000002UL #define NETLINK_DNRT_SOCKET__WRITE 0x00000004UL @@ -899,24 +741,10 @@ #define NETLINK_DNRT_SOCKET__RECV_MSG 0x00080000UL #define NETLINK_DNRT_SOCKET__SEND_MSG 0x00100000UL #define NETLINK_DNRT_SOCKET__NAME_BIND 0x00200000UL - -#define DBUS__ACQUIRE_SVC 0x00000001UL -#define DBUS__SEND_MSG 0x00000002UL - -#define NSCD__GETPWD 0x00000001UL -#define NSCD__GETGRP 0x00000002UL -#define NSCD__GETHOST 0x00000004UL -#define NSCD__GETSTAT 0x00000008UL -#define NSCD__ADMIN 0x00000010UL -#define NSCD__SHMEMPWD 0x00000020UL -#define NSCD__SHMEMGRP 0x00000040UL -#define NSCD__SHMEMHOST 0x00000080UL - #define ASSOCIATION__SENDTO 0x00000001UL #define ASSOCIATION__RECVFROM 0x00000002UL #define ASSOCIATION__SETCONTEXT 0x00000004UL #define ASSOCIATION__POLMATCH 0x00000008UL - #define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL #define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL #define NETLINK_KOBJECT_UEVENT_SOCKET__WRITE 0x00000004UL @@ -939,7 +767,6 @@ #define NETLINK_KOBJECT_UEVENT_SOCKET__RECV_MSG 0x00080000UL #define NETLINK_KOBJECT_UEVENT_SOCKET__SEND_MSG 0x00100000UL 
#define NETLINK_KOBJECT_UEVENT_SOCKET__NAME_BIND 0x00200000UL - #define APPLETALK_SOCKET__IOCTL 0x00000001UL #define APPLETALK_SOCKET__READ 0x00000002UL #define APPLETALK_SOCKET__WRITE 0x00000004UL @@ -962,11 +789,9 @@ #define APPLETALK_SOCKET__RECV_MSG 0x00080000UL #define APPLETALK_SOCKET__SEND_MSG 0x00100000UL #define APPLETALK_SOCKET__NAME_BIND 0x00200000UL - #define PACKET__SEND 0x00000001UL #define PACKET__RECV 0x00000002UL #define PACKET__RELABELTO 0x00000004UL - #define KEY__VIEW 0x00000001UL #define KEY__READ 0x00000002UL #define KEY__WRITE 0x00000004UL @@ -974,10 +799,6 @@ #define KEY__LINK 0x00000010UL #define KEY__SETATTR 0x00000020UL #define KEY__CREATE 0x00000040UL - -#define CONTEXT__TRANSLATE 0x00000001UL -#define CONTEXT__CONTAINS 0x00000002UL - #define DCCP_SOCKET__IOCTL 0x00000001UL #define DCCP_SOCKET__READ 0x00000002UL #define DCCP_SOCKET__WRITE 0x00000004UL diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h index 9f3ebb1bfae..37879906844 100644 --- a/security/selinux/include/class_to_string.h +++ b/security/selinux/include/class_to_string.h @@ -2,7 +2,7 @@ /* * Security object class definitions */ - S_("null") + S_(NULL) S_("security") S_("process") S_("system") @@ -32,19 +32,19 @@ S_("msgq") S_("shm") S_("ipc") - S_("passwd") - S_("drawable") - S_("window") - S_("gc") - S_("font") - S_("colormap") - S_("property") - S_("cursor") - S_("xclient") - S_("xinput") - S_("xserver") - S_("xextension") - S_("pax") + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) + S_(NULL) S_("netlink_route_socket") S_("netlink_firewall_socket") S_("netlink_tcpdiag_socket") @@ -54,12 +54,12 @@ S_("netlink_audit_socket") S_("netlink_ip6fw_socket") S_("netlink_dnrt_socket") - S_("dbus") - S_("nscd") + S_(NULL) + S_(NULL) S_("association") S_("netlink_kobject_uevent_socket") S_("appletalk_socket") S_("packet") S_("key") - S_("context") + S_(NULL) S_("dccp_socket") diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h index 67cef371ee0..35f309f4787 100644 --- a/security/selinux/include/flask.h +++ b/security/selinux/include/flask.h @@ -34,19 +34,6 @@ #define SECCLASS_MSGQ 27 #define SECCLASS_SHM 28 #define SECCLASS_IPC 29 -#define SECCLASS_PASSWD 30 -#define SECCLASS_DRAWABLE 31 -#define SECCLASS_WINDOW 32 -#define SECCLASS_GC 33 -#define SECCLASS_FONT 34 -#define SECCLASS_COLORMAP 35 -#define SECCLASS_PROPERTY 36 -#define SECCLASS_CURSOR 37 -#define SECCLASS_XCLIENT 38 -#define SECCLASS_XINPUT 39 -#define SECCLASS_XSERVER 40 -#define SECCLASS_XEXTENSION 41 -#define SECCLASS_PAX 42 #define SECCLASS_NETLINK_ROUTE_SOCKET 43 #define SECCLASS_NETLINK_FIREWALL_SOCKET 44 #define SECCLASS_NETLINK_TCPDIAG_SOCKET 45 @@ -56,14 +43,11 @@ #define SECCLASS_NETLINK_AUDIT_SOCKET 49 #define SECCLASS_NETLINK_IP6FW_SOCKET 50 #define SECCLASS_NETLINK_DNRT_SOCKET 51 -#define SECCLASS_DBUS 52 -#define SECCLASS_NSCD 53 #define SECCLASS_ASSOCIATION 54 #define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET 55 #define SECCLASS_APPLETALK_SOCKET 56 #define SECCLASS_PACKET 57 #define SECCLASS_KEY 58 -#define SECCLASS_CONTEXT 59 #define SECCLASS_DCCP_SOCKET 60 /* diff --git a/security/selinux/include/selinux_netlabel.h b/security/selinux/include/netlabel.h index 2a732c9033e..218e3f77c35 100644 --- a/security/selinux/include/selinux_netlabel.h +++ b/security/selinux/include/netlabel.h @@ -38,19 +38,22 @@ #ifdef CONFIG_NETLABEL void selinux_netlbl_cache_invalidate(void); -int 
selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); -int selinux_netlbl_socket_post_create(struct socket *sock); -void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock); -int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, - struct sk_buff *skb, - struct avc_audit_data *ad); + void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec, int family); void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, int family); void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, struct sk_security_struct *newssec); + +int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); + +void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock); +int selinux_netlbl_socket_post_create(struct socket *sock); int selinux_netlbl_inode_permission(struct inode *inode, int mask); +int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, + struct sk_buff *skb, + struct avc_audit_data *ad); int selinux_netlbl_socket_setsockopt(struct socket *sock, int level, int optname); @@ -60,59 +63,53 @@ static inline void selinux_netlbl_cache_invalidate(void) return; } -static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, - u32 base_sid, - u32 *sid) +static inline void selinux_netlbl_sk_security_reset( + struct sk_security_struct *ssec, + int family) { - *sid = SECSID_NULL; - return 0; + return; } - -static inline int selinux_netlbl_socket_post_create(struct socket *sock) +static inline void selinux_netlbl_sk_security_init( + struct sk_security_struct *ssec, + int family) { - return 0; + return; } - -static inline void selinux_netlbl_sock_graft(struct sock *sk, - struct socket *sock) +static inline void selinux_netlbl_sk_security_clone( + struct sk_security_struct *ssec, + struct sk_security_struct *newssec) { return; } -static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, - struct sk_buff *skb, - struct avc_audit_data *ad) +static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, + u32 base_sid, + u32 *sid) { + *sid = SECSID_NULL; return 0; } -static inline void selinux_netlbl_sk_security_reset( - struct sk_security_struct *ssec, - int family) -{ - return; -} - -static inline void selinux_netlbl_sk_security_init( - struct sk_security_struct *ssec, - int family) +static inline void selinux_netlbl_sock_graft(struct sock *sk, + struct socket *sock) { return; } - -static inline void selinux_netlbl_sk_security_clone( - struct sk_security_struct *ssec, - struct sk_security_struct *newssec) +static inline int selinux_netlbl_socket_post_create(struct socket *sock) { - return; + return 0; } - static inline int selinux_netlbl_inode_permission(struct inode *inode, int mask) { return 0; } - +static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, + struct sk_buff *skb, + struct avc_audit_data *ad) +{ + return 0; +} static inline int selinux_netlbl_socket_setsockopt(struct socket *sock, int level, int optname) diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 210eec77e7f..b94378afea2 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -34,7 +34,7 @@ #define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS #endif -struct sk_buff; +struct netlbl_lsm_secattr; extern int selinux_enabled; extern int selinux_mls_enabled; @@ -82,8 +82,6 @@ int security_netif_sid(char *name, u32 *if_sid, int security_node_sid(u16 domain, void *addr, u32 addrlen, u32 *out_sid); 
-void security_skb_extlbl_sid(struct sk_buff *skb, u32 base_sid, u32 *sid); - int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, u16 tclass); @@ -102,5 +100,30 @@ int security_fs_use(const char *fstype, unsigned int *behavior, int security_genfs_sid(const char *fstype, char *name, u16 sclass, u32 *sid); +#ifdef CONFIG_NETLABEL +int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, + u32 base_sid, + u32 *sid); + +int security_netlbl_sid_to_secattr(u32 sid, + struct netlbl_lsm_secattr *secattr); +#else +static inline int security_netlbl_secattr_to_sid( + struct netlbl_lsm_secattr *secattr, + u32 base_sid, + u32 *sid) +{ + return -EIDRM; +} + +static inline int security_netlbl_sid_to_secattr(u32 sid, + struct netlbl_lsm_secattr *secattr) +{ + return -ENOENT; +} +#endif /* CONFIG_NETLABEL */ + +const char *security_get_initial_sid_context(u32 sid); + #endif /* _SELINUX_SECURITY_H_ */ diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c new file mode 100644 index 00000000000..bf8750791dd --- /dev/null +++ b/security/selinux/netlabel.c @@ -0,0 +1,363 @@ +/* + * SELinux NetLabel Support + * + * This file provides the necessary glue to tie NetLabel into the SELinux + * subsystem. + * + * Author: Paul Moore <paul.moore@hp.com> + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <linux/spinlock.h> +#include <linux/rcupdate.h> +#include <net/sock.h> +#include <net/netlabel.h> + +#include "objsec.h" +#include "security.h" + +/** + * selinux_netlbl_socket_setsid - Label a socket using the NetLabel mechanism + * @sock: the socket to label + * @sid: the SID to use + * + * Description: + * Attempt to label a socket using the NetLabel mechanism using the given + * SID. Returns zero values on success, negative values on failure. The + * caller is responsible for calling rcu_read_lock() before calling this + * function and rcu_read_unlock() after this function returns. + * + */ +static int selinux_netlbl_socket_setsid(struct socket *sock, u32 sid) +{ + int rc; + struct sk_security_struct *sksec = sock->sk->sk_security; + struct netlbl_lsm_secattr secattr; + + rc = security_netlbl_sid_to_secattr(sid, &secattr); + if (rc != 0) + return rc; + + rc = netlbl_socket_setattr(sock, &secattr); + if (rc == 0) { + spin_lock_bh(&sksec->nlbl_lock); + sksec->nlbl_state = NLBL_LABELED; + spin_unlock_bh(&sksec->nlbl_lock); + } + + return rc; +} + +/** + * selinux_netlbl_cache_invalidate - Invalidate the NetLabel cache + * + * Description: + * Invalidate the NetLabel security attribute mapping cache.
+ * + */ +void selinux_netlbl_cache_invalidate(void) +{ + netlbl_cache_invalidate(); +} + +/** + * selinux_netlbl_sk_security_reset - Reset the NetLabel fields + * @ssec: the sk_security_struct + * @family: the socket family + * + * Description: + * Called when the NetLabel state of a sk_security_struct needs to be reset. + * The caller is responsibile for all the NetLabel sk_security_struct locking. + * + */ +void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec, + int family) +{ + if (family == PF_INET) + ssec->nlbl_state = NLBL_REQUIRE; + else + ssec->nlbl_state = NLBL_UNSET; +} + +/** + * selinux_netlbl_sk_security_init - Setup the NetLabel fields + * @ssec: the sk_security_struct + * @family: the socket family + * + * Description: + * Called when a new sk_security_struct is allocated to initialize the NetLabel + * fields. + * + */ +void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, + int family) +{ + /* No locking needed, we are the only one who has access to ssec */ + selinux_netlbl_sk_security_reset(ssec, family); + spin_lock_init(&ssec->nlbl_lock); +} + +/** + * selinux_netlbl_sk_security_clone - Copy the NetLabel fields + * @ssec: the original sk_security_struct + * @newssec: the cloned sk_security_struct + * + * Description: + * Clone the NetLabel specific sk_security_struct fields from @ssec to + * @newssec. + * + */ +void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, + struct sk_security_struct *newssec) +{ + /* We don't need to take newssec->nlbl_lock because we are the only + * thread with access to newssec, but we do need to take the RCU read + * lock as other threads could have access to ssec */ + rcu_read_lock(); + selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family); + newssec->sclass = ssec->sclass; + rcu_read_unlock(); +} + +/** + * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel + * @skb: the packet + * @base_sid: the SELinux SID to use as a context for MLS only attributes + * @sid: the SID + * + * Description: + * Call the NetLabel mechanism to get the security attributes of the given + * packet and use those attributes to determine the correct context/SID to + * assign to the packet. Returns zero on success, negative values on failure. + * + */ +int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) +{ + int rc; + struct netlbl_lsm_secattr secattr; + + netlbl_secattr_init(&secattr); + rc = netlbl_skbuff_getattr(skb, &secattr); + if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) + rc = security_netlbl_secattr_to_sid(&secattr, + base_sid, + sid); + else + *sid = SECSID_NULL; + netlbl_secattr_destroy(&secattr); + + return rc; +} + +/** + * selinux_netlbl_sock_graft - Netlabel the new socket + * @sk: the new connection + * @sock: the new socket + * + * Description: + * The connection represented by @sk is being grafted onto @sock so set the + * socket's NetLabel to match the SID of @sk. 
+ * + */ +void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) +{ + struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; + struct sk_security_struct *sksec = sk->sk_security; + struct netlbl_lsm_secattr secattr; + u32 nlbl_peer_sid; + + sksec->sclass = isec->sclass; + + rcu_read_lock(); + + if (sksec->nlbl_state != NLBL_REQUIRE) { + rcu_read_unlock(); + return; + } + + netlbl_secattr_init(&secattr); + if (netlbl_sock_getattr(sk, &secattr) == 0 && + secattr.flags != NETLBL_SECATTR_NONE && + security_netlbl_secattr_to_sid(&secattr, + SECINITSID_UNLABELED, + &nlbl_peer_sid) == 0) + sksec->peer_sid = nlbl_peer_sid; + netlbl_secattr_destroy(&secattr); + + /* Try to set the NetLabel on the socket to save time later, if we fail + * here we will pick up the pieces in later calls to + * selinux_netlbl_inode_permission(). */ + selinux_netlbl_socket_setsid(sock, sksec->sid); + + rcu_read_unlock(); +} + +/** + * selinux_netlbl_socket_post_create - Label a socket using NetLabel + * @sock: the socket to label + * + * Description: + * Attempt to label a socket using the NetLabel mechanism using the given + * SID. Returns zero values on success, negative values on failure. + * + */ +int selinux_netlbl_socket_post_create(struct socket *sock) +{ + int rc = 0; + struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; + struct sk_security_struct *sksec = sock->sk->sk_security; + + sksec->sclass = isec->sclass; + + rcu_read_lock(); + if (sksec->nlbl_state == NLBL_REQUIRE) + rc = selinux_netlbl_socket_setsid(sock, sksec->sid); + rcu_read_unlock(); + + return rc; +} + +/** + * selinux_netlbl_inode_permission - Verify the socket is NetLabel labeled + * @inode: the file descriptor's inode + * @mask: the permission mask + * + * Description: + * Looks at a file's inode and if it is marked as a socket protected by + * NetLabel then verify that the socket has been labeled, if not try to label + * the socket now with the inode's SID. Returns zero on success, negative + * values on failure. + * + */ +int selinux_netlbl_inode_permission(struct inode *inode, int mask) +{ + int rc; + struct sk_security_struct *sksec; + struct socket *sock; + + if (!S_ISSOCK(inode->i_mode) || + ((mask & (MAY_WRITE | MAY_APPEND)) == 0)) + return 0; + sock = SOCKET_I(inode); + sksec = sock->sk->sk_security; + + rcu_read_lock(); + if (sksec->nlbl_state != NLBL_REQUIRE) { + rcu_read_unlock(); + return 0; + } + local_bh_disable(); + bh_lock_sock_nested(sock->sk); + rc = selinux_netlbl_socket_setsid(sock, sksec->sid); + bh_unlock_sock(sock->sk); + local_bh_enable(); + rcu_read_unlock(); + + return rc; +} + +/** + * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel + * @sksec: the sock's sk_security_struct + * @skb: the packet + * @ad: the audit data + * + * Description: + * Fetch the NetLabel security attributes from @skb and perform an access check + * against the receiving socket. Returns zero on success, negative values on + * error. 
+ * + */ +int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, + struct sk_buff *skb, + struct avc_audit_data *ad) +{ + int rc; + u32 netlbl_sid; + u32 recv_perm; + + rc = selinux_netlbl_skbuff_getsid(skb, + SECINITSID_UNLABELED, + &netlbl_sid); + if (rc != 0) + return rc; + + if (netlbl_sid == SECSID_NULL) + return 0; + + switch (sksec->sclass) { + case SECCLASS_UDP_SOCKET: + recv_perm = UDP_SOCKET__RECVFROM; + break; + case SECCLASS_TCP_SOCKET: + recv_perm = TCP_SOCKET__RECVFROM; + break; + default: + recv_perm = RAWIP_SOCKET__RECVFROM; + } + + rc = avc_has_perm(sksec->sid, + netlbl_sid, + sksec->sclass, + recv_perm, + ad); + if (rc == 0) + return 0; + + netlbl_skbuff_err(skb, rc); + return rc; +} + +/** + * selinux_netlbl_socket_setsockopt - Do not allow users to remove a NetLabel + * @sock: the socket + * @level: the socket level or protocol + * @optname: the socket option name + * + * Description: + * Check the setsockopt() call and if the user is trying to replace the IP + * options on a socket and a NetLabel is in place for the socket deny the + * access; otherwise allow the access. Returns zero when the access is + * allowed, -EACCES when denied, and other negative values on error. + * + */ +int selinux_netlbl_socket_setsockopt(struct socket *sock, + int level, + int optname) +{ + int rc = 0; + struct sk_security_struct *sksec = sock->sk->sk_security; + struct netlbl_lsm_secattr secattr; + + rcu_read_lock(); + if (level == IPPROTO_IP && optname == IP_OPTIONS && + sksec->nlbl_state == NLBL_LABELED) { + netlbl_secattr_init(&secattr); + rc = netlbl_socket_getattr(sock, &secattr); + if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) + rc = -EACCES; + netlbl_secattr_destroy(&secattr); + } + rcu_read_unlock(); + + return rc; +} diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 93b3177c758..aca099aa2ed 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -96,12 +96,18 @@ enum sel_inos { SEL_COMMIT_BOOLS, /* commit new boolean values */ SEL_MLS, /* return if MLS policy is enabled */ SEL_DISABLE, /* disable SELinux until next reboot */ - SEL_AVC, /* AVC management directory */ SEL_MEMBER, /* compute polyinstantiation membership decision */ SEL_CHECKREQPROT, /* check requested protection, not kernel-applied one */ SEL_COMPAT_NET, /* whether to use old compat network packet controls */ + SEL_INO_NEXT, /* The next inode number to use */ }; +static unsigned long sel_last_ino = SEL_INO_NEXT - 1; + +#define SEL_INITCON_INO_OFFSET 0x01000000 +#define SEL_BOOL_INO_OFFSET 0x02000000 +#define SEL_INO_MASK 0x00ffffff + #define TMPBUFLEN 12 static ssize_t sel_read_enforce(struct file *filp, char __user *buf, size_t count, loff_t *ppos) @@ -777,8 +783,6 @@ static struct inode *sel_make_inode(struct super_block *sb, int mode) return ret; } -#define BOOL_INO_OFFSET 30 - static ssize_t sel_read_bool(struct file *filep, char __user *buf, size_t count, loff_t *ppos) { @@ -806,14 +810,14 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, } inode = filep->f_path.dentry->d_inode; - cur_enforcing = security_get_bool_value(inode->i_ino - BOOL_INO_OFFSET); + cur_enforcing = security_get_bool_value(inode->i_ino&SEL_INO_MASK); if (cur_enforcing < 0) { ret = cur_enforcing; goto out; } length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, - bool_pending_values[inode->i_ino - BOOL_INO_OFFSET]); + bool_pending_values[inode->i_ino&SEL_INO_MASK]); ret = simple_read_from_buffer(buf, count, ppos, page, length); out: 
mutex_unlock(&sel_mutex); @@ -865,7 +869,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf, new_value = 1; inode = filep->f_path.dentry->d_inode; - bool_pending_values[inode->i_ino - BOOL_INO_OFFSET] = new_value; + bool_pending_values[inode->i_ino&SEL_INO_MASK] = new_value; length = count; out: @@ -1029,7 +1033,7 @@ static int sel_make_bools(void) isec->sid = sid; isec->initialized = 1; inode->i_fop = &sel_bool_ops; - inode->i_ino = i + BOOL_INO_OFFSET; + inode->i_ino = i|SEL_BOOL_INO_OFFSET; d_add(dentry, inode); } bool_num = num; @@ -1234,6 +1238,56 @@ static int sel_make_avc_files(struct dentry *dir) goto out; } inode->i_fop = files[i].ops; + inode->i_ino = ++sel_last_ino; + d_add(dentry, inode); + } +out: + return ret; +} + +static ssize_t sel_read_initcon(struct file * file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode; + char *con; + u32 sid, len; + ssize_t ret; + + inode = file->f_path.dentry->d_inode; + sid = inode->i_ino&SEL_INO_MASK; + ret = security_sid_to_context(sid, &con, &len); + if (ret < 0) + return ret; + + ret = simple_read_from_buffer(buf, count, ppos, con, len); + kfree(con); + return ret; +} + +static const struct file_operations sel_initcon_ops = { + .read = sel_read_initcon, +}; + +static int sel_make_initcon_files(struct dentry *dir) +{ + int i, ret = 0; + + for (i = 1; i <= SECINITSID_NUM; i++) { + struct inode *inode; + struct dentry *dentry; + dentry = d_alloc_name(dir, security_get_initial_sid_context(i)); + if (!dentry) { + ret = -ENOMEM; + goto out; + } + + inode = sel_make_inode(dir->d_sb, S_IFREG|S_IRUGO); + if (!inode) { + ret = -ENOMEM; + goto out; + } + inode->i_fop = &sel_initcon_ops; + inode->i_ino = i|SEL_INITCON_INO_OFFSET; d_add(dentry, inode); } out: @@ -1252,6 +1306,7 @@ static int sel_make_dir(struct inode *dir, struct dentry *dentry) } inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; + inode->i_ino = ++sel_last_ino; /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); d_add(dentry, inode); @@ -1314,6 +1369,7 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) ret = -ENOMEM; goto err; } + inode->i_ino = ++sel_last_ino; isec = (struct inode_security_struct*)inode->i_security; isec->sid = SECINITSID_DEVNULL; isec->sclass = SECCLASS_CHR_FILE; @@ -1336,6 +1392,21 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) ret = sel_make_avc_files(dentry); if (ret) goto err; + + dentry = d_alloc_name(sb->s_root, "initial_contexts"); + if (!dentry) { + ret = -ENOMEM; + goto err; + } + + ret = sel_make_dir(root_inode, dentry); + if (ret) + goto err; + + ret = sel_make_initcon_files(dentry); + if (ret) + goto err; + out: return ret; err: diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 1e52356664d..40660ffd49b 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -39,7 +39,6 @@ #include <linux/sched.h> #include <linux/audit.h> #include <linux/mutex.h> -#include <net/sock.h> #include <net/netlabel.h> #include "flask.h" @@ -53,7 +52,7 @@ #include "conditional.h" #include "mls.h" #include "objsec.h" -#include "selinux_netlabel.h" +#include "netlabel.h" #include "xfrm.h" #include "ebitmap.h" @@ -594,6 +593,13 @@ static int context_struct_to_string(struct context *context, char **scontext, u3 #include "initial_sid_to_string.h" +const char *security_get_initial_sid_context(u32 sid) +{ + if (unlikely(sid > SECINITSID_NUM)) + return NULL; + return initial_sid_to_string[sid]; +} + /** * security_sid_to_context - Obtain a context for a given SID. * @sid: security identifier, SID @@ -1050,6 +1056,8 @@ static int validate_classes(struct policydb *p) for (i = 1; i < kdefs->cts_len; i++) { def_class = kdefs->class_to_string[i]; + if (!def_class) + continue; if (i > p->p_classes.nprim) { printk(KERN_INFO "security: class %s not defined in policy\n", @@ -1249,6 +1257,7 @@ bad: } extern void selinux_complete_init(void); +static int security_preserve_bools(struct policydb *p); /** * security_load_policy - Load a security policy configuration. @@ -1325,6 +1334,12 @@ int security_load_policy(void *data, size_t len) goto err; } + rc = security_preserve_bools(&newpolicydb); + if (rc) { + printk(KERN_ERR "security: unable to preserve booleans\n"); + goto err; + } + /* Clone the SID table. */ sidtab_shutdown(&sidtab); if (sidtab_map(&sidtab, clone_sid, &newsidtab)) { @@ -1882,6 +1897,37 @@ out: return rc; } +static int security_preserve_bools(struct policydb *p) +{ + int rc, nbools = 0, *bvalues = NULL, i; + char **bnames = NULL; + struct cond_bool_datum *booldatum; + struct cond_node *cur; + + rc = security_get_bools(&nbools, &bnames, &bvalues); + if (rc) + goto out; + for (i = 0; i < nbools; i++) { + booldatum = hashtab_search(p->p_bools.table, bnames[i]); + if (booldatum) + booldatum->state = bvalues[i]; + } + for (cur = p->cond_list; cur != NULL; cur = cur->next) { + rc = evaluate_cond_node(p, cur); + if (rc) + goto out; + } + +out: + if (bnames) { + for (i = 0; i < nbools; i++) + kfree(bnames[i]); + } + kfree(bnames); + kfree(bvalues); + return rc; +} + /* * security_sid_mls_copy() - computes a new sid based on the given * sid and the mls portion of mls_sid. 
@@ -2198,41 +2244,15 @@ void selinux_audit_set_callback(int (*callback)(void)) aurule_callback = callback; } -/** - * security_skb_extlbl_sid - Determine the external label of a packet - * @skb: the packet - * @base_sid: the SELinux SID to use as a context for MLS only external labels - * @sid: the packet's SID - * - * Description: - * Check the various different forms of external packet labeling and determine - * the external SID for the packet. - * - */ -void security_skb_extlbl_sid(struct sk_buff *skb, u32 base_sid, u32 *sid) -{ - u32 xfrm_sid; - u32 nlbl_sid; - - selinux_skb_xfrm_sid(skb, &xfrm_sid); - if (selinux_netlbl_skbuff_getsid(skb, - (xfrm_sid == SECSID_NULL ? - base_sid : xfrm_sid), - &nlbl_sid) != 0) - nlbl_sid = SECSID_NULL; - - *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid); -} - #ifdef CONFIG_NETLABEL /* - * This is the structure we store inside the NetLabel cache block. + * NetLabel cache structure */ -#define NETLBL_CACHE(x) ((struct netlbl_cache *)(x)) +#define NETLBL_CACHE(x) ((struct selinux_netlbl_cache *)(x)) #define NETLBL_CACHE_T_NONE 0 #define NETLBL_CACHE_T_SID 1 #define NETLBL_CACHE_T_MLS 2 -struct netlbl_cache { +struct selinux_netlbl_cache { u32 type; union { u32 sid; @@ -2241,7 +2261,7 @@ struct netlbl_cache { }; /** - * selinux_netlbl_cache_free - Free the NetLabel cached data + * security_netlbl_cache_free - Free the NetLabel cached data * @data: the data to free * * Description: @@ -2249,9 +2269,9 @@ struct netlbl_cache { * netlbl_lsm_cache structure. * */ -static void selinux_netlbl_cache_free(const void *data) +static void security_netlbl_cache_free(const void *data) { - struct netlbl_cache *cache; + struct selinux_netlbl_cache *cache; if (data == NULL) return; @@ -2266,33 +2286,33 @@ static void selinux_netlbl_cache_free(const void *data) } /** - * selinux_netlbl_cache_add - Add an entry to the NetLabel cache - * @skb: the packet + * security_netlbl_cache_add - Add an entry to the NetLabel cache + * @secattr: the NetLabel packet security attributes * @ctx: the SELinux context * * Description: * Attempt to cache the context in @ctx, which was derived from the packet in - * @skb, in the NetLabel subsystem cache. + * @skb, in the NetLabel subsystem cache. This function assumes @secattr has + * already been initialized. 
* */ -static void selinux_netlbl_cache_add(struct sk_buff *skb, struct context *ctx) +static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, + struct context *ctx) { - struct netlbl_cache *cache = NULL; - struct netlbl_lsm_secattr secattr; + struct selinux_netlbl_cache *cache = NULL; - netlbl_secattr_init(&secattr); - secattr.cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); - if (secattr.cache == NULL) - goto netlbl_cache_add_return; + secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); + if (secattr->cache == NULL) + return; cache = kzalloc(sizeof(*cache), GFP_ATOMIC); if (cache == NULL) - goto netlbl_cache_add_return; + return; cache->type = NETLBL_CACHE_T_MLS; if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, &ctx->range.level[0].cat) != 0) - goto netlbl_cache_add_return; + return; cache->data.mls_label.level[1].cat.highbit = cache->data.mls_label.level[0].cat.highbit; cache->data.mls_label.level[1].cat.node = @@ -2300,52 +2320,40 @@ static void selinux_netlbl_cache_add(struct sk_buff *skb, struct context *ctx) cache->data.mls_label.level[0].sens = ctx->range.level[0].sens; cache->data.mls_label.level[1].sens = ctx->range.level[0].sens; - secattr.cache->free = selinux_netlbl_cache_free; - secattr.cache->data = (void *)cache; - secattr.flags = NETLBL_SECATTR_CACHE; - - netlbl_cache_add(skb, &secattr); - -netlbl_cache_add_return: - netlbl_secattr_destroy(&secattr); + secattr->cache->free = security_netlbl_cache_free; + secattr->cache->data = (void *)cache; + secattr->flags |= NETLBL_SECATTR_CACHE; } /** - * selinux_netlbl_cache_invalidate - Invalidate the NetLabel cache - * - * Description: - * Invalidate the NetLabel security attribute mapping cache. - * - */ -void selinux_netlbl_cache_invalidate(void) -{ - netlbl_cache_invalidate(); -} - -/** - * selinux_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID - * @skb: the network packet + * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID * @secattr: the NetLabel packet security attributes * @base_sid: the SELinux SID to use as a context for MLS only attributes * @sid: the SELinux SID * * Description: - * Convert the given NetLabel packet security attributes in @secattr into a + * Convert the given NetLabel security attributes in @secattr into a * SELinux SID. If the @secattr field does not contain a full SELinux - * SID/context then use the context in @base_sid as the foundation. If @skb - * is not NULL attempt to cache as much data as possibile. Returns zero on - * success, negative values on failure. + * SID/context then use the context in @base_sid as the foundation. If + * possibile the 'cache' field of @secattr is set and the CACHE flag is set; + * this is to allow the @secattr to be used by NetLabel to cache the secattr to + * SID conversion for future lookups. Returns zero on success, negative + * values on failure. 
* */ -static int selinux_netlbl_secattr_to_sid(struct sk_buff *skb, - struct netlbl_lsm_secattr *secattr, - u32 base_sid, - u32 *sid) +int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, + u32 base_sid, + u32 *sid) { int rc = -EIDRM; struct context *ctx; struct context ctx_new; - struct netlbl_cache *cache; + struct selinux_netlbl_cache *cache; + + if (!ss_initialized) { + *sid = SECSID_NULL; + return 0; + } POLICY_RDLOCK; @@ -2410,8 +2418,8 @@ static int selinux_netlbl_secattr_to_sid(struct sk_buff *skb, if (rc != 0) goto netlbl_secattr_to_sid_return_cleanup; - if (skb != NULL) - selinux_netlbl_cache_add(skb, &ctx_new); + security_netlbl_cache_add(secattr, &ctx_new); + ebitmap_destroy(&ctx_new.range.level[0].cat); } else { *sid = SECSID_NULL; @@ -2427,338 +2435,43 @@ netlbl_secattr_to_sid_return_cleanup: } /** - * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel - * @skb: the packet - * @base_sid: the SELinux SID to use as a context for MLS only attributes - * @sid: the SID - * - * Description: - * Call the NetLabel mechanism to get the security attributes of the given - * packet and use those attributes to determine the correct context/SID to - * assign to the packet. Returns zero on success, negative values on failure. - * - */ -int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) -{ - int rc; - struct netlbl_lsm_secattr secattr; - - netlbl_secattr_init(&secattr); - rc = netlbl_skbuff_getattr(skb, &secattr); - if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) - rc = selinux_netlbl_secattr_to_sid(skb, - &secattr, - base_sid, - sid); - else - *sid = SECSID_NULL; - netlbl_secattr_destroy(&secattr); - - return rc; -} - -/** - * selinux_netlbl_socket_setsid - Label a socket using the NetLabel mechanism - * @sock: the socket to label - * @sid: the SID to use + * security_netlbl_sid_to_secattr - Convert a SELinux SID to a NetLabel secattr + * @sid: the SELinux SID + * @secattr: the NetLabel packet security attributes * * Description: - * Attempt to label a socket using the NetLabel mechanism using the given - * SID. Returns zero values on success, negative values on failure. The - * caller is responsibile for calling rcu_read_lock() before calling this - * this function and rcu_read_unlock() after this function returns. + * Convert the given SELinux SID in @sid into a NetLabel security attribute. + * Returns zero on success, negative values on failure. 
* */ -static int selinux_netlbl_socket_setsid(struct socket *sock, u32 sid) +int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr) { int rc = -ENOENT; - struct sk_security_struct *sksec = sock->sk->sk_security; - struct netlbl_lsm_secattr secattr; struct context *ctx; + netlbl_secattr_init(secattr); + if (!ss_initialized) return 0; - netlbl_secattr_init(&secattr); - POLICY_RDLOCK; - ctx = sidtab_search(&sidtab, sid); if (ctx == NULL) - goto netlbl_socket_setsid_return; - - secattr.domain = kstrdup(policydb.p_type_val_to_name[ctx->type - 1], - GFP_ATOMIC); - secattr.flags |= NETLBL_SECATTR_DOMAIN; - mls_export_netlbl_lvl(ctx, &secattr); - rc = mls_export_netlbl_cat(ctx, &secattr); + goto netlbl_sid_to_secattr_failure; + secattr->domain = kstrdup(policydb.p_type_val_to_name[ctx->type - 1], + GFP_ATOMIC); + secattr->flags |= NETLBL_SECATTR_DOMAIN; + mls_export_netlbl_lvl(ctx, secattr); + rc = mls_export_netlbl_cat(ctx, secattr); if (rc != 0) - goto netlbl_socket_setsid_return; - - rc = netlbl_socket_setattr(sock, &secattr); - if (rc == 0) { - spin_lock_bh(&sksec->nlbl_lock); - sksec->nlbl_state = NLBL_LABELED; - spin_unlock_bh(&sksec->nlbl_lock); - } - -netlbl_socket_setsid_return: + goto netlbl_sid_to_secattr_failure; POLICY_RDUNLOCK; - netlbl_secattr_destroy(&secattr); - return rc; -} - -/** - * selinux_netlbl_sk_security_reset - Reset the NetLabel fields - * @ssec: the sk_security_struct - * @family: the socket family - * - * Description: - * Called when the NetLabel state of a sk_security_struct needs to be reset. - * The caller is responsibile for all the NetLabel sk_security_struct locking. - * - */ -void selinux_netlbl_sk_security_reset(struct sk_security_struct *ssec, - int family) -{ - if (family == PF_INET) - ssec->nlbl_state = NLBL_REQUIRE; - else - ssec->nlbl_state = NLBL_UNSET; -} -/** - * selinux_netlbl_sk_security_init - Setup the NetLabel fields - * @ssec: the sk_security_struct - * @family: the socket family - * - * Description: - * Called when a new sk_security_struct is allocated to initialize the NetLabel - * fields. - * - */ -void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, - int family) -{ - /* No locking needed, we are the only one who has access to ssec */ - selinux_netlbl_sk_security_reset(ssec, family); - spin_lock_init(&ssec->nlbl_lock); -} - -/** - * selinux_netlbl_sk_security_clone - Copy the NetLabel fields - * @ssec: the original sk_security_struct - * @newssec: the cloned sk_security_struct - * - * Description: - * Clone the NetLabel specific sk_security_struct fields from @ssec to - * @newssec. - * - */ -void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, - struct sk_security_struct *newssec) -{ - /* We don't need to take newssec->nlbl_lock because we are the only - * thread with access to newssec, but we do need to take the RCU read - * lock as other threads could have access to ssec */ - rcu_read_lock(); - selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family); - newssec->sclass = ssec->sclass; - rcu_read_unlock(); -} - -/** - * selinux_netlbl_socket_post_create - Label a socket using NetLabel - * @sock: the socket to label - * - * Description: - * Attempt to label a socket using the NetLabel mechanism using the given - * SID. Returns zero values on success, negative values on failure. 
- * - */ -int selinux_netlbl_socket_post_create(struct socket *sock) -{ - int rc = 0; - struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; - struct sk_security_struct *sksec = sock->sk->sk_security; - - sksec->sclass = isec->sclass; - - rcu_read_lock(); - if (sksec->nlbl_state == NLBL_REQUIRE) - rc = selinux_netlbl_socket_setsid(sock, sksec->sid); - rcu_read_unlock(); - - return rc; -} - -/** - * selinux_netlbl_sock_graft - Netlabel the new socket - * @sk: the new connection - * @sock: the new socket - * - * Description: - * The connection represented by @sk is being grafted onto @sock so set the - * socket's NetLabel to match the SID of @sk. - * - */ -void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) -{ - struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; - struct sk_security_struct *sksec = sk->sk_security; - struct netlbl_lsm_secattr secattr; - u32 nlbl_peer_sid; - - sksec->sclass = isec->sclass; - - rcu_read_lock(); - - if (sksec->nlbl_state != NLBL_REQUIRE) { - rcu_read_unlock(); - return; - } - - netlbl_secattr_init(&secattr); - if (netlbl_sock_getattr(sk, &secattr) == 0 && - secattr.flags != NETLBL_SECATTR_NONE && - selinux_netlbl_secattr_to_sid(NULL, - &secattr, - SECINITSID_UNLABELED, - &nlbl_peer_sid) == 0) - sksec->peer_sid = nlbl_peer_sid; - netlbl_secattr_destroy(&secattr); - - /* Try to set the NetLabel on the socket to save time later, if we fail - * here we will pick up the pieces in later calls to - * selinux_netlbl_inode_permission(). */ - selinux_netlbl_socket_setsid(sock, sksec->sid); - - rcu_read_unlock(); -} - -/** - * selinux_netlbl_inode_permission - Verify the socket is NetLabel labeled - * @inode: the file descriptor's inode - * @mask: the permission mask - * - * Description: - * Looks at a file's inode and if it is marked as a socket protected by - * NetLabel then verify that the socket has been labeled, if not try to label - * the socket now with the inode's SID. Returns zero on success, negative - * values on failure. - * - */ -int selinux_netlbl_inode_permission(struct inode *inode, int mask) -{ - int rc; - struct sk_security_struct *sksec; - struct socket *sock; - - if (!S_ISSOCK(inode->i_mode) || - ((mask & (MAY_WRITE | MAY_APPEND)) == 0)) - return 0; - sock = SOCKET_I(inode); - sksec = sock->sk->sk_security; - - rcu_read_lock(); - if (sksec->nlbl_state != NLBL_REQUIRE) { - rcu_read_unlock(); - return 0; - } - local_bh_disable(); - bh_lock_sock_nested(sock->sk); - rc = selinux_netlbl_socket_setsid(sock, sksec->sid); - bh_unlock_sock(sock->sk); - local_bh_enable(); - rcu_read_unlock(); - - return rc; -} - -/** - * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel - * @sksec: the sock's sk_security_struct - * @skb: the packet - * @ad: the audit data - * - * Description: - * Fetch the NetLabel security attributes from @skb and perform an access check - * against the receiving socket. Returns zero on success, negative values on - * error. 
- * - */ -int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, - struct sk_buff *skb, - struct avc_audit_data *ad) -{ - int rc; - u32 netlbl_sid; - u32 recv_perm; - - rc = selinux_netlbl_skbuff_getsid(skb, - SECINITSID_UNLABELED, - &netlbl_sid); - if (rc != 0) - return rc; - - if (netlbl_sid == SECSID_NULL) - return 0; - - switch (sksec->sclass) { - case SECCLASS_UDP_SOCKET: - recv_perm = UDP_SOCKET__RECVFROM; - break; - case SECCLASS_TCP_SOCKET: - recv_perm = TCP_SOCKET__RECVFROM; - break; - default: - recv_perm = RAWIP_SOCKET__RECVFROM; - } - - rc = avc_has_perm(sksec->sid, - netlbl_sid, - sksec->sclass, - recv_perm, - ad); - if (rc == 0) - return 0; - - netlbl_skbuff_err(skb, rc); - return rc; -} - -/** - * selinux_netlbl_socket_setsockopt - Do not allow users to remove a NetLabel - * @sock: the socket - * @level: the socket level or protocol - * @optname: the socket option name - * - * Description: - * Check the setsockopt() call and if the user is trying to replace the IP - * options on a socket and a NetLabel is in place for the socket deny the - * access; otherwise allow the access. Returns zero when the access is - * allowed, -EACCES when denied, and other negative values on error. - * - */ -int selinux_netlbl_socket_setsockopt(struct socket *sock, - int level, - int optname) -{ - int rc = 0; - struct sk_security_struct *sksec = sock->sk->sk_security; - struct netlbl_lsm_secattr secattr; - - rcu_read_lock(); - if (level == IPPROTO_IP && optname == IP_OPTIONS && - sksec->nlbl_state == NLBL_LABELED) { - netlbl_secattr_init(&secattr); - rc = netlbl_socket_getattr(sock, &secattr); - if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) - rc = -EACCES; - netlbl_secattr_destroy(&secattr); - } - rcu_read_unlock(); + return 0; +netlbl_sid_to_secattr_failure: + POLICY_RDUNLOCK; + netlbl_secattr_destroy(secattr); return rc; } #endif /* CONFIG_NETLABEL */ |
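The NetLabel rework that closes the patch splits the old skb-centric helpers out of services.c: the SID conversions stay in the security server as security_netlbl_secattr_to_sid() and security_netlbl_sid_to_secattr(), while the socket and packet glue moves to the new security/selinux/netlabel.c. Below is a minimal sketch of how a caller strings the two halves together; the security_netlbl_*() and netlbl_*() calls are taken directly from the patch, the example_*() wrappers are hypothetical, and the code is kernel-internal illustration only, not buildable outside the tree.

```c
#include <net/sock.h>
#include <net/netlabel.h>
#include "security.h"

/* Outbound path (sketch): SID -> NetLabel secattr -> socket label.
 * security_netlbl_sid_to_secattr() initializes the secattr itself and
 * destroys it on failure (see services.c above), so the caller only
 * cleans up after a successful conversion. */
static int example_setsid(struct socket *sock, u32 sid)
{
	struct netlbl_lsm_secattr secattr;
	int rc;

	rc = security_netlbl_sid_to_secattr(sid, &secattr);
	if (rc != 0)
		return rc;
	rc = netlbl_socket_setattr(sock, &secattr);
	netlbl_secattr_destroy(&secattr);
	return rc;
}

/* Inbound path (sketch): packet -> NetLabel secattr -> SID, mirroring
 * selinux_netlbl_skbuff_getsid() from the patch.  An unlabeled packet
 * yields SECSID_NULL rather than an error. */
static int example_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
{
	struct netlbl_lsm_secattr secattr;
	int rc;

	netlbl_secattr_init(&secattr);
	rc = netlbl_skbuff_getattr(skb, &secattr);
	if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
		rc = security_netlbl_secattr_to_sid(&secattr, base_sid, sid);
	else
		*sid = SECSID_NULL;
	netlbl_secattr_destroy(&secattr);
	return rc;
}
```

When CONFIG_NETLABEL is disabled, the static inline stubs added to security.h make the same pattern degrade gracefully: security_netlbl_secattr_to_sid() returns -EIDRM and security_netlbl_sid_to_secattr() returns -ENOENT, so the glue layer never attempts to label or interpret packets.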
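Earlier in the patch, the selinuxfs changes replace the old BOOL_INO_OFFSET arithmetic with a partitioned inode-number space: plain files and directories take ++sel_last_ino, boolean files take index | SEL_BOOL_INO_OFFSET, the new initial_contexts entries take SID | SEL_INITCON_INO_OFFSET, and readers recover the index or SID with i_ino & SEL_INO_MASK. The following standalone sketch uses only the constants from the patch; the helper names are invented for illustration.

```c
#include <assert.h>
#include <stdio.h>

/* Constants from security/selinux/selinuxfs.c in this patch. */
#define SEL_INITCON_INO_OFFSET 0x01000000
#define SEL_BOOL_INO_OFFSET    0x02000000
#define SEL_INO_MASK           0x00ffffff

/* Hypothetical helpers: encode an index into an inode number and back. */
static unsigned long bool_ino(unsigned long index)
{
	return index | SEL_BOOL_INO_OFFSET;
}

static unsigned long initcon_ino(unsigned long sid)
{
	return sid | SEL_INITCON_INO_OFFSET;
}

static unsigned long ino_to_index(unsigned long ino)
{
	return ino & SEL_INO_MASK;
}

int main(void)
{
	/* A boolean with index 7 and an initial SID of 3 land in disjoint
	 * inode ranges, yet both decode back to their original index. */
	assert(ino_to_index(bool_ino(7)) == 7);
	assert(ino_to_index(initcon_ino(3)) == 3);
	assert(bool_ino(7) != initcon_ino(7));
	printf("bool ino: 0x%08lx, initcon ino: 0x%08lx\n",
	       bool_ino(7), initcon_ino(3));
	return 0;
}
```

The upshot is that sel_read_bool(), sel_write_bool() and sel_read_initcon() can recover the boolean index or initial SID from i_ino alone, while files numbered from sel_last_ino can never collide with either range.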