diff -Naur linux-2.4.22-dm-1/drivers/md/Config.in linux-2.4.22-new-snapshot/drivers/md/Config.in --- linux-2.4.22-dm-1/drivers/md/Config.in 2003-09-30 11:59:23.000000000 -0500 +++ linux-2.4.22-new-snapshot/drivers/md/Config.in 2003-09-30 12:02:51.000000000 -0500 @@ -16,5 +16,8 @@ dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + dep_tristate ' Bad-Block Device Target (NEW)' CONFIG_BLK_DEV_DM_BADBLOCK $CONFIG_BLK_DEV_DM +fi endmenu diff -Naur linux-2.4.22-dm-1/drivers/md/Makefile linux-2.4.22-new-snapshot/drivers/md/Makefile --- linux-2.4.22-dm-1/drivers/md/Makefile 2003-09-30 11:59:22.000000000 -0500 +++ linux-2.4.22-new-snapshot/drivers/md/Makefile 2003-09-30 12:03:07.000000000 -0500 @@ -5,13 +5,13 @@ O_TARGET := mddev.o export-objs := md.o xor.o dm-table.o dm-target.o kcopyd.o dm-daemon.o \ - dm-log.o dm-io.o dm.o + dm-log.o dm-io.o dm.o dm-exception.o list-multi := lvm-mod.o dm-mod.o dm-mirror-mod.o lvm-mod-objs := lvm.o lvm-snap.o lvm-fs.o dm-mod-objs := dm.o dm-table.o dm-target.o dm-ioctl.o \ - dm-linear.o dm-stripe.o dm-snapshot.o dm-exception-store.o \ - kcopyd.o dm-daemon.o dm-io.o + dm-linear.o dm-stripe.o dm-snapshot.o dm-exception.o \ + dm-exception-store.o kcopyd.o dm-daemon.o dm-io.o dm-mirror-mod-objs := dm-raid1.o dm-log.o # Note: link order is important. All raid personalities @@ -30,6 +30,7 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o +obj-$(CONFIG_BLK_DEV_DM_BADBLOCK) += dm-badblock.o include $(TOPDIR)/Rules.make diff -Naur linux-2.4.22-dm-1/drivers/md/dm-badblock.c linux-2.4.22-new-snapshot/drivers/md/dm-badblock.c --- linux-2.4.22-dm-1/drivers/md/dm-badblock.c 1969-12-31 18:00:00.000000000 -0600 +++ linux-2.4.22-new-snapshot/drivers/md/dm-badblock.c 2003-09-30 12:01:28.000000000 -0500 @@ -0,0 +1,346 @@ +/* + * Copyright (c) International Business Machines Corp., 2003 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Bad-Block-Relocation target for Device-Mapper. + */ + +#include +#include +#include +#include +#include + +#include "dm.h" +#include "dm-daemon.h" +#include "dm-exception.h" + +/** + * struct dm_badblock + * + * Private data for each badblock instance. + **/ +struct dm_badblock { + struct exception_table *etable; +}; + +/** + * struct badblock_io + * + * Extra info to attach to each buffer-head. + **/ +struct badblock_io { + struct dm_badblock *bb; + struct buffer_head *bh; + struct list_head list; + sector_t b_rsector; +}; + +/* Memory pool of badblock_io structures. 
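+ * These are global, not per-device: global_init() below allocates
+ * them when the first badblock device is created, and global_exit()
+ * frees them when the last one goes away.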
*/ +static kmem_cache_t *badblock_io_cache; +static mempool_t *badblock_io_pool; + +/* Daemon for processing I/O errors, and + * a list for passing I/Os to the daemon. + */ +static struct dm_daemon badblock_daemon; +static LIST_HEAD(io_list); +static spinlock_t io_list_lock = SPIN_LOCK_UNLOCKED; + +static void process_io_list(void); + +/* + * When the first badblock device is created, initialize the mempools + * and start the badblock-daemon. When the last device is deleted, tear + * everything down. + */ + +static DECLARE_MUTEX(global_init_lock); +static int bbr_devices = 0; + +static int global_init(void) +{ + int rc = 0; + + down(&global_init_lock); + + if (bbr_devices == 0) { + badblock_io_cache = kmem_cache_create("dm_badblock_io", + sizeof(struct badblock_io), + __alignof__(struct badblock_io), + 0, NULL, NULL); + if (!badblock_io_cache) { + DMERR("cannot create bad-block I/O cache."); + rc = -ENOMEM; + goto out; + } + + badblock_io_pool = mempool_create(256, mempool_alloc_slab, + mempool_free_slab, + badblock_io_cache); + if (!badblock_io_pool) { + DMERR("cannot create bad-block I/O mempool."); + kmem_cache_destroy(badblock_io_cache); + rc = -ENOMEM; + goto out; + } + + rc = dm_daemon_start(&badblock_daemon, + "dm_badblock", process_io_list); + if (rc) { + DMERR("cannot start bad-block daemon."); + mempool_destroy(badblock_io_pool); + kmem_cache_destroy(badblock_io_cache); + goto out; + } + } + bbr_devices++; + +out: + up(&global_init_lock); + return rc; +} + +static void global_exit(void) +{ + down(&global_init_lock); + + bbr_devices--; + if (bbr_devices == 0) { + dm_daemon_stop(&badblock_daemon); + mempool_destroy(badblock_io_pool); + kmem_cache_destroy(badblock_io_cache); + } + + up(&global_init_lock); +} + +/* + * Process one I/O that has triggered an error. + */ +static int process_io(struct badblock_io *bb_io) +{ + /* KMC: More to do here. :) */ + return 0; +} + +/* + * Take entries off the I/O list and process each individually. + */ +static void process_io_list(void) +{ + struct badblock_io *bb_io; + unsigned long flags; + int rc; + + while (1) { + spin_lock_irqsave(&io_list_lock, flags); + if (list_empty(&io_list)) { + spin_unlock_irqrestore(&io_list_lock, flags); + break; + } + + bb_io = list_entry(io_list.next, struct badblock_io, list); + list_del_init(&bb_io->list); + spin_unlock_irqrestore(&io_list_lock, flags); + + rc = process_io(bb_io); + } +} + +/* + * Put an I/O request on the daemon's list for further processing. + */ +static void schedule_io(struct badblock_io *bb_io) +{ + unsigned long flags; + spin_lock_irqsave(&io_list_lock, flags); + list_add_tail(&bb_io->list , &io_list); + spin_unlock_irqrestore(&io_list_lock, flags); + dm_daemon_wake(&badblock_daemon); +} + +/** + * badblock_ctr + * + * Build a new bad-block mapping. 
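+ * Takes three arguments: the data device, the replacement device
+ * and the chunk size in sectors (see the argument parsing below).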
+ * arg format: + **/ +static int badblock_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dm_badblock *bb; + char *data_path, *repl_path; + unsigned long chunk_size; + int rc; + + if (argc < 3) { + ti->error = "dm-badblock requires exactly 3 arguments"; + rc = -EINVAL; + goto bad1; + } + + data_path = argv[0]; + repl_path = argv[1]; + chunk_size = simple_strtoul(argv[2], NULL, 10); + + rc = global_init(); + if (rc) { + ti->error = "cannot initialize bad-block global pools and daemon"; + goto bad1; + } + + bb = kmalloc(sizeof(*bb), GFP_KERNEL); + if (!bb) { + ti->error = "cannot allocate bad-block private structure"; + rc = -ENOMEM; + goto bad2; + } + + bb->etable = etable_create(ti, data_path, repl_path, chunk_size, 'N'); + if (!bb->etable) { + ti->error = "cannot create bad-block exception table"; + rc = -EINVAL; + goto bad3; + } + + ti->private = bb; + return 0; + +bad3: + kfree(bb); +bad2: + global_exit(); +bad1: + return rc; +} + +/** + * badblock_dtr + * + * Delete a bad-block mapping. + **/ +static void badblock_dtr(struct dm_target *ti) +{ + struct dm_badblock *bb = ti->private; + etable_delete(bb->etable); + kfree(bb); + global_exit(); +} + +/** + * badblock_map + * + * Process an I/O request for a bad-block device. + **/ +static int badblock_map(struct dm_target *ti, struct buffer_head *bh, + int rw, union map_info *map_context) +{ + struct dm_badblock *bb = ti->private; + return -ENOSYS; +} + +/** + * badblock_end_io + * + * Return >0 if we're going to do more processing on this bh. + * Return <0 if there's an error we can't handle. + **/ +static int badblock_end_io(struct dm_target *ti, struct buffer_head *bh, + int rw, int error, union map_info *map_context) +{ + return -ENOSYS; +} + +static void badblock_resume(struct dm_target *ti) +{ + struct dm_badblock *bb = ti->private; + int rc; + + rc = etable_read_metadata(bb->etable); + /* KMC: What do we do if there's an error??? */ +} + +static int badblock_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct dm_badblock *bb = ti->private; + struct exception_table *et = bb->etable; + struct dm_dev *dev; + chunk_t numerator, denominator; + char data[16], repl[16]; + int rc; + + switch (type) { + case STATUSTYPE_INFO: + rc = etable_fraction_full(et, &numerator, &denominator); + if (rc) { + snprintf(result, maxlen, "Unknown"); + } else { + snprintf(result, maxlen, + SECTOR_FORMAT "/" SECTOR_FORMAT, + numerator, denominator); + } + break; + + case STATUSTYPE_TABLE: + /* + * dm_kdevname returns a static pointer so we need to + * make private copies if the output is to make sense. 
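+		 * Note that strncpy() will not NUL-terminate the copies
+		 * if a device name fills the 16-byte buffers.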
+	 */
+	dev = etable_src_dev(et);
+	strncpy(data, dm_kdevname(dev->dev), sizeof(data));
+	dev = etable_dest_dev(et);
+	strncpy(repl, dm_kdevname(dev->dev), sizeof(repl));
+	snprintf(result, maxlen, "%s %s %ld", data, repl,
+		 etable_chunk_size(et));
+	break;
+	}
+
+	return 0;
+}
+
+static struct target_type badblock_target = {
+	name:	"badblock",
+	module:	THIS_MODULE,
+	ctr:	badblock_ctr,
+	dtr:	badblock_dtr,
+	map:	badblock_map,
+	end_io:	badblock_end_io,
+	resume:	badblock_resume,
+	status:	badblock_status,
+};
+
+int __init dm_badblock_init(void)
+{
+	int rc = dm_register_target(&badblock_target);
+	if (rc) {
+		DMERR("failed to register badblock target: %d", rc);
+	}
+	return rc;
+}
+
+void __exit dm_badblock_exit(void)
+{
+	int rc = dm_unregister_target(&badblock_target);
+	if (rc) {
+		DMERR("failed to unregister badblock target: %d", rc);
+	}
+}
+
+module_init(dm_badblock_init);
+module_exit(dm_badblock_exit);
+MODULE_LICENSE("GPL");
+
diff -Naur linux-2.4.22-dm-1/drivers/md/dm-exception-store.c linux-2.4.22-new-snapshot/drivers/md/dm-exception-store.c
--- linux-2.4.22-dm-1/drivers/md/dm-exception-store.c	2003-09-30 11:59:23.000000000 -0500
+++ linux-2.4.22-new-snapshot/drivers/md/dm-exception-store.c	2003-09-30 12:01:35.000000000 -0500
@@ -1,42 +1,43 @@
 /*
- * dm-snapshot.c
+ * dm-exception-store.c
  *
  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
  *
  * This file is released under the GPL.
  */
 
-#include "dm-snapshot.h"
-#include "dm-io.h"
-#include "kcopyd.h"
-
 #include
 #include
 #include
 #include
+#include
 
-/*-----------------------------------------------------------------
- * Persistent snapshots, by persistent we mean that the snapshot
- * will survive a reboot.
- *---------------------------------------------------------------*/
+#include "dm-io.h"
+#include "kcopyd.h"
+#include "dm-exception-table.h"
+
+/*-----------------------------------------------------------------------
+ * Persistent exception store.  By persistent we mean that the destination
+ * device will survive a reboot.
+ *---------------------------------------------------------------------*/
 
 /*
- * We need to store a record of which parts of the origin have
- * been copied to the snapshot device.  The snapshot code
- * requires that we copy exception chunks to chunk aligned areas
- * of the COW store.  It makes sense therefore, to store the
- * metadata in chunk size blocks.
+ * We need to keep a record of which parts of the source device
+ * have been copied to the destination device.  The exception code
+ * requires that we copy exception-chunks to chunk-aligned areas
+ * of the destination device.  It makes sense therefore, to store the
+ * metadata in chunk-size blocks.
  *
- * There is no backward or forward compatibility implemented,
- * snapshots with different disk versions than the kernel will
- * not be usable.  It is expected that "lvcreate" will blank out
- * the start of a fresh COW device before calling the snapshot
+ * There is no backward or forward compatibility implemented.
+ * Exception-stores with different disk versions than the kernel
+ * will not be usable.  It is expected that "lvcreate" will blank out
+ * the start of a fresh destination device before calling the exception
  * constructor.
  *
- * The first chunk of the COW device just contains the header.
+ * The first chunk of the destination device just contains the header.
  * After this there is a chunk filled with exception metadata,
- * followed by as many exception chunks as can fit in the
- * metadata areas.
+ * followed by as many exception chunks as can fit in the metadata + * areas. * * All on disk structures are in little-endian format. The end * of the exceptions info is indicated by an exception with a @@ -45,21 +46,21 @@ */ /* - * Magic for persistent snapshots: "SnAp" - Feeble isn't it. + * Magic for persistent exception-stores: "SnAp" - Feeble isn't it. */ -#define SNAP_MAGIC 0x70416e53 +#define PSTORE_MAGIC 0x70416e53 /* * The on-disk version of the metadata. */ -#define SNAPSHOT_DISK_VERSION 1 +#define PSTORE_DISK_VERSION 1 struct disk_header { uint32_t magic; /* - * Is this snapshot valid. There is no way of recovering - * an invalid snapshot. + * Is this exception-store valid. There is no way of recovering + * once it's invalidated. */ uint32_t valid; @@ -87,10 +88,10 @@ * The top level structure for a persistent exception store. */ struct pstore { - struct dm_snapshot *snap; /* up pointer to my snapshot */ + struct exception_table *etable; /* up pointer to my table */ int version; - int valid; - uint32_t chunk_size; + int valid; /* KMC: Do we need this? Should be available through "etable" */ + uint32_t chunk_size; /* KMC: Do we need this? Should be available through "etable" */ uint32_t exceptions_per_area; /* @@ -184,7 +185,7 @@ struct io_region where; unsigned int bits; - where.dev = ps->snap->cow->dev; + where.dev = ps->etable->dest->dev; where.sector = ps->chunk_size * chunk; where.count = ps->chunk_size; @@ -217,7 +218,7 @@ return area_io(ps, area, WRITE); } -static int read_header(struct pstore *ps, int *new_snapshot) +static int read_header(struct pstore *ps, int *new_store) { int r; struct disk_header *dh; @@ -229,16 +230,16 @@ dh = (struct disk_header *) ps->area; if (le32_to_cpu(dh->magic) == 0) { - *new_snapshot = 1; + *new_store = 1; - } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { - *new_snapshot = 0; + } else if (le32_to_cpu(dh->magic) == PSTORE_MAGIC) { + *new_store = 0; ps->valid = le32_to_cpu(dh->valid); ps->version = le32_to_cpu(dh->version); ps->chunk_size = le32_to_cpu(dh->chunk_size); } else { - DMWARN("Invalid/corrupt snapshot"); + DMWARN("Invalid/corrupt exception store"); r = -ENXIO; } @@ -252,7 +253,7 @@ memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); dh = (struct disk_header *) ps->area; - dh->magic = cpu_to_le32(SNAP_MAGIC); + dh->magic = cpu_to_le32(PSTORE_MAGIC); dh->valid = cpu_to_le32(ps->valid); dh->version = cpu_to_le32(ps->version); dh->chunk_size = cpu_to_le32(ps->chunk_size); @@ -325,8 +326,8 @@ /* * If the new_chunk is pointing at the start of - * the COW device, where the first metadata area - * is we know that we've hit the end of the + * the destination device, where the first metadata + * area is we know that we've hit the end of the * exceptions. Therefore the area is not full. */ if (de.new_chunk == 0LL) { @@ -342,9 +343,9 @@ ps->next_free = de.new_chunk + 1; /* - * Otherwise we add the exception to the snapshot. + * Otherwise we add the exception to the table. 
*/ - r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); + r = etable_add_exception(ps->etable, de.old_chunk, de.new_chunk); if (r) return r; } @@ -382,8 +383,8 @@ static void persistent_fraction_full(struct exception_store *store, sector_t *numerator, sector_t *denominator) { - *numerator = get_info(store)->next_free * store->snap->chunk_size; - *denominator = get_dev_size(store->snap->cow->dev); + *numerator = get_info(store)->next_free * store->etable->chunk_size; + *denominator = get_dev_size(store->etable->dest->dev); } static void persistent_destroy(struct exception_store *store) @@ -398,20 +399,20 @@ static int persistent_read_metadata(struct exception_store *store) { - int r, new_snapshot; + int r, new_store; struct pstore *ps = get_info(store); /* - * Read the snapshot header. + * Read the exception header. */ - r = read_header(ps, &new_snapshot); + r = read_header(ps, &new_store); if (r) return r; /* - * Do we need to setup a new snapshot ? + * Do we need to setup a new exception store? */ - if (new_snapshot) { + if (new_store) { r = write_header(ps); if (r) { DMWARN("write_header failed"); @@ -427,14 +428,17 @@ } else { /* * Sanity checks. + * + * KMC: Should we compare chunk_size from the disk-header + * against the value sent to etable_create? */ if (!ps->valid) { - DMWARN("snapshot is marked invalid"); + DMWARN("exception store is marked invalid"); return -EINVAL; } - if (ps->version != SNAPSHOT_DISK_VERSION) { - DMWARN("unable to handle snapshot disk version %d", + if (ps->version != PSTORE_DISK_VERSION) { + DMWARN("unable to handle exception store disk version %d", ps->version); return -EINVAL; } @@ -455,10 +459,10 @@ { struct pstore *ps = get_info(store); uint32_t stride; - sector_t size = get_dev_size(store->snap->cow->dev); + sector_t size = get_dev_size(store->etable->dest->dev); /* Is there enough room ? */ - if (size < ((ps->next_free + 1) * store->snap->chunk_size)) + if (size < ((ps->next_free + 1) * store->etable->chunk_size)) return -ENOSPC; e->new_chunk = ps->next_free; @@ -530,7 +534,7 @@ } } -static void persistent_drop(struct exception_store *store) +static void persistent_invalidate(struct exception_store *store) { struct pstore *ps = get_info(store); @@ -539,7 +543,7 @@ DMWARN("write header failed"); } -int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) +int estore_create_persistent(struct exception_store *store, uint32_t chunk_size) { int r; struct pstore *ps; @@ -555,12 +559,12 @@ goto bad; } - ps->snap = store->snap; + ps->etable = store->etable; ps->valid = 1; - ps->version = SNAPSHOT_DISK_VERSION; + ps->version = PSTORE_DISK_VERSION; ps->chunk_size = chunk_size; ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / - sizeof(struct disk_exception); + sizeof(struct disk_exception); ps->next_free = 2; /* skipping the header and first area */ ps->current_committed = 0; @@ -585,7 +589,7 @@ store->read_metadata = persistent_read_metadata; store->prepare_exception = persistent_prepare; store->commit_exception = persistent_commit; - store->drop_snapshot = persistent_drop; + store->invalidate = persistent_invalidate; store->fraction_full = persistent_fraction_full; store->context = ps; @@ -603,40 +607,40 @@ } /*----------------------------------------------------------------- - * Implementation of the store for non-persistent snapshots. + * Implementation of the non-persistent exception-store. 
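+ * Exceptions are tracked purely in memory: read_metadata is a no-op
+ * and commit_exception reports success immediately.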
*---------------------------------------------------------------*/ struct transient_c { sector_t next_free; }; -void transient_destroy(struct exception_store *store) +static void transient_destroy(struct exception_store *store) { kfree(store->context); } -int transient_read_metadata(struct exception_store *store) +static int transient_read_metadata(struct exception_store *store) { return 0; } -int transient_prepare(struct exception_store *store, struct exception *e) +static int transient_prepare(struct exception_store *store, struct exception *e) { struct transient_c *tc = (struct transient_c *) store->context; - sector_t size = get_dev_size(store->snap->cow->dev); + sector_t size = get_dev_size(store->etable->dest->dev); - if (size < (tc->next_free + store->snap->chunk_size)) + if (size < (tc->next_free + store->etable->chunk_size)) return -1; - e->new_chunk = sector_to_chunk(store->snap, tc->next_free); - tc->next_free += store->snap->chunk_size; + e->new_chunk = sector_to_chunk(store->etable, tc->next_free); + tc->next_free += store->etable->chunk_size; return 0; } -void transient_commit(struct exception_store *store, - struct exception *e, - void (*callback) (void *, int success), - void *callback_context) +static void transient_commit(struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context) { /* Just succeed */ callback(callback_context, 1); @@ -646,11 +650,11 @@ sector_t *numerator, sector_t *denominator) { *numerator = ((struct transient_c *) store->context)->next_free; - *denominator = get_dev_size(store->snap->cow->dev); + *denominator = get_dev_size(store->etable->dest->dev); } -int dm_create_transient(struct exception_store *store, - struct dm_snapshot *s, int blocksize) +int estore_create_transient(struct exception_store *store, + struct exception_table *etable) { struct transient_c *tc; @@ -660,7 +664,7 @@ store->prepare_exception = transient_prepare; store->commit_exception = transient_commit; store->fraction_full = transient_fraction_full; - store->snap = s; + store->etable = etable; tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); if (!tc) @@ -671,3 +675,4 @@ return 0; } + diff -Naur linux-2.4.22-dm-1/drivers/md/dm-exception-table.h linux-2.4.22-new-snapshot/drivers/md/dm-exception-table.h --- linux-2.4.22-dm-1/drivers/md/dm-exception-table.h 1969-12-31 18:00:00.000000000 -0600 +++ linux-2.4.22-new-snapshot/drivers/md/dm-exception-table.h 2003-09-30 12:01:35.000000000 -0500 @@ -0,0 +1,179 @@ +/* + * dm-exception-table.h + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#ifndef DM_EXCEPTION_TABLE_H +#define DM_EXCEPTION_TABLE_H + +#include "dm-exception.h" + +/** + * struct exception + * + * An exception is used where an old chunk of data has been + * replaced by a new one. + **/ +struct exception { + struct list_head hash_list; + chunk_t old_chunk; + chunk_t new_chunk; +}; + +/** + * struct pending_exception + * + * An exception that is in the process of being copied from the + * source device to the destination device. + * + * KMC: We may need to add a ref-count or a lock to this structure. + **/ +struct pending_exception { + struct exception e; + + /* + * I/O buffers waiting for this copy to complete are held + * in a list (using b_reqnext). + * + * KMC: Do we need a third queue for reads to the destination? 
+ */ + struct buffer_head *src_bhs; + struct buffer_head *dest_bhs; + + /* + * Other pending_exceptions that are processing this chunk + * on the source device. When this list is empty, we know + * we can complete the source queue. + */ + struct list_head siblings; + + /* Pointer back to the exception-table. */ + struct exception_table *etable; + + /* 1 indicates the exception has already been sent to kcopyd. */ + int started; + + /* Count of threads accessing this structure. */ + atomic_t count; +}; + +/** + * struct exception_hash + * + * A hash table for fast storage and lookups of exceptions. + * Used for both completed and pending exceptions. + **/ +struct exception_hash { + uint32_t hash_mask; + struct list_head *hash_table; +}; + +/* + * Abstraction to handle the meta/layout of exception stores (the + * destination device). + */ +struct exception_store { + + /* + * Deletes this store from memory when you've finished with it. + */ + void (*destroy) (struct exception_store *store); + + /* + * Read metadata from the exception device and load all existing + * exceptions into memory. Don't perform I/O to the destination + * device until this has been called. + */ + int (*read_metadata) (struct exception_store *store); + + /* + * Find somewhere to store the next exception. + */ + int (*prepare_exception) (struct exception_store *store, + struct exception *e); + + /* + * Update the metadata with this exception. + */ + void (*commit_exception) (struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context); + + /* + * The destination device is invalid; note this in the metadata. + */ + void (*invalidate) (struct exception_store *store); + + /* + * Return how full the destination device is. + */ + void (*fraction_full) (struct exception_store *store, + sector_t *numerator, + sector_t *denominator); + + struct exception_table *etable; + void *context; +}; + +/** + * struct exception_table + * + * Describe one exception-handler instance. + **/ +struct exception_table { + struct rw_semaphore lock; + + struct dm_target *dm_target; + struct dm_dev *src; + struct dm_dev *dest; + + /* Size of data blocks saved - must be a power of 2 */ + chunk_t chunk_size; + chunk_t chunk_mask; + chunk_t chunk_shift; + + /* You can't use the destination device if this is 0 (e.g. if full) */ + int valid; + + /* 1 if metadata has already been read from disk. */ + int have_metadata; + + /* Used for display of table. 'P' or 'N'. */ + char type; + + /* The last percentage we notified */ + int last_percent; + + /* Two hash tables. One for in-progress copies, and one + * for completed copies. + */ + struct exception_hash pending; + struct exception_hash complete; + + /* The on disk metadata handler */ + struct exception_store store; + + struct kcopyd_client *kcopyd_client; +}; + +/* + * Constructors for the persistent and transient stores. + */ +int estore_create_persistent(struct exception_store *store, uint32_t chunk_size); +int estore_create_transient(struct exception_store *store, + struct exception_table *etable); + +/* + * Used by the exception stores to load exceptions when + * initialising. 
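+ * The persistent store, for example, calls etable_add_exception()
+ * once for each on-disk entry it finds while scanning the metadata
+ * areas.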
+ */ +int etable_add_exception(struct exception_table *et, chunk_t old, chunk_t new); +chunk_t sector_to_chunk(struct exception_table *etable, sector_t sector); +sector_t chunk_to_sector(struct exception_table *etable, chunk_t chunk); + +#endif + diff -Naur linux-2.4.22-dm-1/drivers/md/dm-exception.c linux-2.4.22-new-snapshot/drivers/md/dm-exception.c --- linux-2.4.22-dm-1/drivers/md/dm-exception.c 1969-12-31 18:00:00.000000000 -0600 +++ linux-2.4.22-new-snapshot/drivers/md/dm-exception.c 2003-09-30 12:01:35.000000000 -0500 @@ -0,0 +1,932 @@ +/* + * dm-exception-table.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +/* KMC: Need to designate which APIs require read and/or write locks. */ + +/* KMC: Need to figure out which APIs are EXPORT'ed. */ + +#include +#include +#include + +#include "dm.h" +#include "dm-exception-table.h" +#include "kcopyd.h" + +/* + * Each exception-table reserves this many pages for io + * FIXME: calculate this + */ +#define EXCEPTION_TABLE_PAGES 256 + +/* + * The percentage increment we will wake up users at + */ +#define WAKE_UP_PERCENT 5 + +/* + * Global caches and pools for exceptions. + */ +static kmem_cache_t *exception_cache; +static kmem_cache_t *pending_cache; +static mempool_t *pending_pool; + +/* + * Simple APIs to return basic info about the exception table. + */ + +struct dm_dev *etable_src_dev(struct exception_table *etable) +{ + return etable->src; +} + +struct dm_dev *etable_dest_dev(struct exception_table *etable) +{ + return etable->dest; +} + +chunk_t etable_chunk_size(struct exception_table *etable) +{ + return etable->chunk_size; +} + +int etable_valid(struct exception_table *etable) +{ + return etable->valid; +} + +char etable_type(struct exception_table *etable) +{ + return etable->type; +} + +int etable_fraction_full(struct exception_table *etable, + sector_t *numerator, sector_t *denominator) +{ + int r = -EINVAL; + + if (etable->store.fraction_full) { + etable->store.fraction_full(&etable->store, + numerator, denominator); + r = 0; + } + + return r; +} + +/* + * Must lock the exception-table before accessing exception info. + */ + +void etable_write_lock(struct exception_table *etable) +{ + down_write(&etable->lock); +} + +void etable_write_unlock(struct exception_table *etable) +{ + up_write(&etable->lock); +} + +void etable_read_lock(struct exception_table *etable) +{ + down_read(&etable->lock); +} + +void etable_read_unlock(struct exception_table *etable) +{ + up_read(&etable->lock); +} + +/* + * Hard coded magic. The number of list_head's that will fit into + * 1/50th of the machine's physical memory. + */ +static int calc_max_buckets(void) +{ + unsigned long mem; + + mem = num_physpages << PAGE_SHIFT; + mem /= 50; + mem /= sizeof(struct list_head); + + return mem; +} + +/* + * Rounds a number down to a power of 2. + * KMC: Should this be in dm.h like dm_round_up()? 
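+ * e.g. round_down(24) == 16 and round_down(64) == 64.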
+ */ +static inline uint32_t round_down(uint32_t n) +{ + while (n & (n - 1)) + n &= (n - 1); + return n; +} + +static int init_exception_hash(struct exception_hash *eh, uint32_t size) +{ + unsigned int i; + + eh->hash_mask = size - 1; + eh->hash_table = vcalloc(size, sizeof(struct list_head)); + if (!eh->hash_table) + return -ENOMEM; + + for (i = 0; i < size; i++) + INIT_LIST_HEAD(eh->hash_table + i); + + return 0; +} + +static void exit_exception_hash(struct exception_hash *eh, kmem_cache_t *mem) +{ + struct list_head *slot, *entry, *temp; + struct exception *ex; + int i, size; + + size = eh->hash_mask + 1; + for (i = 0; i < size; i++) { + slot = eh->hash_table + i; + + list_for_each_safe(entry, temp, slot) { + ex = list_entry(entry, struct exception, hash_list); + kmem_cache_free(mem, ex); + } + } + + vfree(eh->hash_table); +} + +/* + * Allocate room for a suitable hash table. + */ +static int init_hash_tables(struct exception_table *et) +{ + sector_t hash_size, dest_dev_size, src_dev_size, max_buckets; + + /* + * Calculate based on the size of the source or destination devices. + */ + dest_dev_size = get_dev_size(et->dest->dev); + src_dev_size = get_dev_size(et->src->dev); + max_buckets = calc_max_buckets(); + + hash_size = min(src_dev_size, dest_dev_size) / et->chunk_size; + hash_size = min(hash_size, max_buckets); + + /* Round it down to a power of 2 */ + hash_size = round_down(hash_size); + if (init_exception_hash(&et->complete, hash_size)) + return -ENOMEM; + + /* + * Allocate hash table for in-flight exceptions + * Make this smaller than the real hash table + */ + hash_size >>= 3; + if (!hash_size) + hash_size = 64; + + if (init_exception_hash(&et->pending, hash_size)) { + exit_exception_hash(&et->complete, exception_cache); + return -ENOMEM; + } + + return 0; +} + +/** + * etable_create + * + * Create a new exception table. + * chunk_size: Must be a non-zero power-of-2. + * persistent: Must be 'P' or 'N' + **/ +struct exception_table *etable_create(struct dm_target *ti, + char *src_path, + char *dest_path, + unsigned long chunk_size, + char persistent) +{ + struct exception_table *et = NULL; + int blocksize, r; + + if (persistent != 'P' && persistent != 'N') { + DMERR("Persistent flag is not P or N"); + goto out; + } + + if (chunk_size == 0) { + DMERR("Chunk size must be non-zero"); + goto out; + } + + /* + * Chunk size must be multiple of page size. Silently + * round up if it's not. + */ + chunk_size = dm_round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE); + + /* Check the sizes are small enough to fit in one kiovec */ + if (chunk_size > KIO_MAX_SECTORS) { + DMERR("Chunk size is too big"); + goto out; + } + + /* Check chunk_size is a power of 2 */ + /* KMC: Since we rounded-up to PAGE_SIZE, do we need to bother with this? */ + if (chunk_size & (chunk_size - 1)) { + DMERR("Chunk size is not a power of 2"); + goto out; + } + + et = kmalloc(sizeof(*et), GFP_KERNEL); + if (!et) { + DMERR("Cannot allocate new exception table."); + goto out; + } + + r = dm_get_device(ti, src_path, 0, ti->len, FMODE_READ, &et->src); + if (r) { + DMERR("Cannot get source device"); + goto bad1; + } + + r = dm_get_device(ti, dest_path, 0, 0, + FMODE_READ | FMODE_WRITE, &et->dest); + if (r) { + dm_put_device(ti, et->src); + DMERR("Cannot get destination device"); + goto bad1; + } + + /* Validate the chunk size against the dest. 
device's block size */ + blocksize = get_hardsect_size(et->dest->dev); + if (chunk_size % (blocksize / SECTOR_SIZE)) { + DMERR("Chunk size is not a multiple of device blocksize"); + goto bad2; + } + + init_rwsem(&et->lock); + et->dm_target = ti; + et->chunk_size = chunk_size; + et->chunk_mask = chunk_size - 1; + for (et->chunk_shift = 0; chunk_size; + et->chunk_shift++, chunk_size >>= 1) + ; + et->chunk_shift--; + + et->valid = 1; + et->have_metadata = 0; + et->type = persistent; + et->last_percent = 0; + + /* Allocate hash tables. */ + if (init_hash_tables(et)) { + DMERR("Unable to allocate hash table space"); + goto bad2; + } + + /* + * Check the persistent flag - done here because we need the iobuf + * to check the destination-dev header. + */ + et->store.etable = et; + + if (persistent == 'P') + r = estore_create_persistent(&et->store, et->chunk_size); + else + r = estore_create_transient(&et->store, et); + + if (r) { + DMERR("Couldn't create exception store"); + goto bad3; + } + + r = kcopyd_client_create(EXCEPTION_TABLE_PAGES, &et->kcopyd_client); + if (r) { + DMERR("Could not create kcopyd client"); + goto bad4; + } + +out: + return et; + +bad4: + et->store.destroy(&et->store); +bad3: + exit_exception_hash(&et->complete, exception_cache); + exit_exception_hash(&et->pending, pending_cache); +bad2: + dm_put_device(ti, et->dest); + dm_put_device(ti, et->src); +bad1: + kfree(et); + return NULL; +} + +/** + * etable_delete + * + * Delete an exception-table. + **/ +void etable_delete(struct exception_table *etable) +{ + dm_table_event(etable->dm_target->table); + kcopyd_client_destroy(etable->kcopyd_client); + etable->store.destroy(&etable->store); + exit_exception_hash(&etable->complete, exception_cache); + /* KMC: We're free'ing the remaining pending exceptions directly + * back to the pending-cache instead of the pending-mempool. + * Does this matter? Should the pending-hash-table be empty + * at this point anyway? We shouldn't be deleting devices + * unless they're closed (open-count == 0)...and we can't + * close the device until all outstanding I/O's are complete... + */ + exit_exception_hash(&etable->pending, pending_cache); + dm_put_device(etable->dm_target, etable->src); + dm_put_device(etable->dm_target, etable->dest); + kfree(etable); +} + +/** + * etable_read_metadata + * + * If we haven't read the metadata yet, call the exception-store to read the + * metadata and setup the initial exception hash-table. + **/ +int etable_read_metadata(struct exception_table *etable) +{ + int r = 0; + + if (!etable->have_metadata) { + r = etable->store.read_metadata(&etable->store); + if (!r) { + etable->have_metadata = 1; + } + } + + return r; +} + +/* + * Convert LBAs to chunks and vice-versa. + */ + +chunk_t sector_to_chunk(struct exception_table *etable, sector_t sector) +{ + return (sector & ~etable->chunk_mask) >> etable->chunk_shift; +} + +sector_t chunk_to_sector(struct exception_table *etable, chunk_t chunk) +{ + return chunk << etable->chunk_shift; +} + +/* + * Basic hash-table operations. + * FIXME: check how this hash fn is performing. + */ +static inline uint32_t exception_hash(struct exception_hash *eh, chunk_t chunk) +{ + return chunk & eh->hash_mask; +} + +static void insert_exception(struct exception_hash *eh, struct exception *e) +{ + struct list_head *l = &eh->hash_table[exception_hash(eh, e->old_chunk)]; + list_add(&e->hash_list, l); +} + +static inline void remove_exception(struct exception *e) +{ + list_del(&e->hash_list); +} + +/* + * Hash-table lookups. 
Return the exception data + * for a chunk, or NULL if not remapped. + */ +static struct exception *lookup_exception(struct exception_hash *eh, + chunk_t chunk) +{ + struct list_head *slot, *el; + struct exception *e; + + slot = &eh->hash_table[exception_hash(eh, chunk)]; + list_for_each(el, slot) { + e = list_entry(el, struct exception, hash_list); + if (e->old_chunk == chunk) + return e; + } + + return NULL; +} + +/** + * _etable_lookup_exception + * + * Look for a complete exception. Only requires a read-lock, since once + * an exception is added to the "complete" table, it will never be removed. + **/ +struct exception *_etable_lookup_exception(struct exception_table *etable, + struct buffer_head *bh) +{ + return lookup_exception(&etable->complete, + sector_to_chunk(etable, bh->b_rsector)); +} + +/** + * __etable_lookup_pending_exception + * + * Look for a pending exception. Requires a write-lock, since at some point + * the pending exception will be removed from the table and free'd, and no + * other ref-counting is done. This may have to change!!! + * + * KMC: If we add ref-counting to the pending-exceptions, can we call this + * function with just the read-lock? + **/ +struct pending_exception * +__etable_lookup_pending_exception(struct exception_table *etable, + struct buffer_head *bh) +{ + struct exception *e; + struct pending_exception *pe = NULL; + + e = lookup_exception(&etable->pending, + sector_to_chunk(etable, bh->b_rsector)); + if (e) { + /* cast the exception to a pending exception */ + pe = list_entry(e, struct pending_exception, e); + atomic_inc(&pe->count); + } + + return pe; +} + +/** + * etable_remap_exception + * + * Update the buffer-head to redirect it to a remapped chunk on the + * destination device. No lock is required. + **/ +void etable_remap_exception(struct exception_table *et, + struct exception *e, + struct buffer_head *bh) +{ + bh->b_rdev = et->dest->dev; + bh->b_rsector = chunk_to_sector(et, e->new_chunk) + + (bh->b_rsector & et->chunk_mask); +} + +/* + * The pending-exceptions hold lists of buffer_heads, + * using the b_reqnext field. + */ +static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh) +{ + bh->b_reqnext = *queue; + *queue = bh; +} + +/** + * __etable_queue_src_buffer + * + * Queue a buffer-head destined for the source device on the + * pending-exception. Requires a write-lock on the exception table. + **/ +void __etable_queue_src_buffer(struct pending_exception *pe, + struct buffer_head *bh) +{ + queue_buffer(&pe->src_bhs, bh); +} + +/** + * __etable_queue_dest_buffer + * + * Remap the buffer-head to the destination device and queue it on the + * pending-exception. Requires a write-lock on the exception-table. 
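+ * The remap is done up front so that, once the copy completes, the
+ * queued buffers can be submitted to the destination unchanged (see
+ * pending_complete() below).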
+ **/ +void __etable_queue_dest_buffer(struct pending_exception *pe, + struct buffer_head *bh) +{ + etable_remap_exception(pe->etable, &pe->e, bh); + queue_buffer(&pe->dest_bhs, bh); +} + +static inline struct exception *alloc_exception(void) +{ + struct exception *e; + + e = kmem_cache_alloc(exception_cache, GFP_NOIO); + if (!e) + e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); + + return e; +} + +static inline void free_exception(struct exception *e) +{ + kmem_cache_free(exception_cache, e); +} + +static inline struct pending_exception *alloc_pending_exception(void) +{ + return mempool_alloc(pending_pool, GFP_NOIO); +} + +static inline void free_pending_exception(struct pending_exception *pe) +{ + mempool_free(pe, pending_pool); +} + +/** + * __etable_create_pending_exception + * + * Create and prepare new pending exception and add it to the pending + * hash table. Requires a write-lock since we're modifying the table. + **/ +struct pending_exception * +__etable_create_pending_exception(struct exception_table *et, + struct buffer_head *bh) +{ + struct pending_exception *pe; + chunk_t chunk = sector_to_chunk(et, bh->b_rsector); + + pe = alloc_pending_exception(); + pe->e.old_chunk = chunk; + pe->src_bhs = pe->dest_bhs = NULL; + INIT_LIST_HEAD(&pe->siblings); + pe->etable = et; + pe->started = 0; + atomic_set(&pe->count, 2); + + if (et->store.prepare_exception(&et->store, &pe->e)) { + free_pending_exception(pe); + et->valid = 0; + return NULL; + } + + insert_exception(&et->pending, &pe->e); + + return pe; +} + +static void list_merge(struct list_head *l1, struct list_head *l2) +{ + struct list_head *l1_n, *l2_p; + + l1_n = l1->next; + l2_p = l2->prev; + + l1->next = l2; + l2->prev = l1; + + l2_p->next = l1_n; + l1_n->prev = l2_p; +} + +/** + * __etable_merge_pending_exceptions + * + * Join these two pending-exceptions together through their "sibling" fields. + * Requires a write-lock. + * + * KMC: Needs better locking! + **/ +void __etable_merge_pending_exceptions(struct pending_exception *pe1, + struct pending_exception *pe2) +{ + list_merge(&pe1->siblings, &pe2->siblings); +} + +/** + * etable_add_exception + * + * Allocate and initialize an exception and add it to the + * complete-exceptions hash-table. + **/ +int etable_add_exception(struct exception_table *et, chunk_t old, chunk_t new) +{ + struct exception *e; + + e = alloc_exception(); + if (!e) + return -ENOMEM; + + e->old_chunk = old; + e->new_chunk = new; + insert_exception(&et->complete, e); + return 0; +} + +static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs) +{ + while (*queue) + queue = &((*queue)->b_reqnext); + + *queue = bhs; +} + +/* + * Flush a list of buffers. + */ +static void flush_buffers(struct buffer_head *bh) +{ + struct buffer_head *n; + + while (bh) { + n = bh->b_reqnext; + bh->b_reqnext = NULL; + generic_make_request(WRITE, bh); + bh = n; + } + + run_task_queue(&tq_disk); +} + +/* + * Error a list of buffers. + */ +static void error_buffers(struct buffer_head *bh) +{ + struct buffer_head *n; + + while (bh) { + n = bh->b_reqnext; + bh->b_reqnext = NULL; + buffer_IO_error(bh); + bh = n; + } +} + +/* KMC: I think we need a big spinlock to protect the "sibling" lists. + * Otherwise we need some "anchor" structure for each sibling list + * where we can put a lock to control the list. 
+ */ +static struct buffer_head *__flush_src_bhs(struct pending_exception *pe) +{ + struct pending_exception *sibling; + + if (list_empty(&pe->siblings)) + return pe->src_bhs; + + sibling = list_entry(pe->siblings.next, + struct pending_exception, siblings); + + list_del(&pe->siblings); + + /* FIXME: I think there's a race on SMP machines here, add spin lock */ + queue_buffers(&sibling->src_bhs, pe->src_bhs); + + return NULL; +} + +/** + * etable_put_pending_exception + * + * Decrement the reference count in the pending exception. When the count + * reaches zero, we can flush the I/O queues, and free the structure. + * + * KMC: Not quite done yet! + **/ +void etable_put_pending_exception(struct pending_exception *pe) +{ + struct buffer_head *flush; + + if (atomic_dec_and_test(&pe->count)) { + flush = __flush_src_bhs(pe); + if (flush) { + flush_buffers(flush); + } + free_pending_exception(pe); + } +} + +static void pending_complete(struct pending_exception *pe, int success) +{ + struct exception_table *et = pe->etable; + struct exception *e; + + if (success) { + e = alloc_exception(); + if (!e) { + DMWARN("Unable to allocate exception."); + down_write(&et->lock); + /* KMC: Don't want to invalidate here. Need to return + * control to the caller and let them decide + * what action to take. + */ + et->store.invalidate(&et->store); + et->valid = 0; + remove_exception(&pe->e); + up_write(&et->lock); + + error_buffers(pe->dest_bhs); + goto out; + } + + /* Add a complete-exception and remove the pending-exception. */ + down_write(&et->lock); + memcpy(e, &pe->e, sizeof(*e)); + insert_exception(&et->complete, e); + remove_exception(&pe->e); + up_write(&et->lock); + + /* Submit pending I/Os to the destination device. */ + flush_buffers(pe->dest_bhs); + + /* Notify any interested parties */ + /* KMC: Should this be done here, or should we let + * the caller do this? + */ + if (et->store.fraction_full) { + sector_t numerator, denominator; + int pc; + + et->store.fraction_full(&et->store, &numerator, + &denominator); + pc = numerator * 100 / denominator; + + if (pc >= et->last_percent + WAKE_UP_PERCENT) { + dm_table_event(et->dm_target->table); + et->last_percent = pc - pc % WAKE_UP_PERCENT; + } + } + + } else { + /* Read/write error - snapshot is unusable */ + /* KMC: Don't want to invalidate here. Need to return + * control to the caller and let them decide + * what action to take. + */ + down_write(&et->lock); + if (et->valid) + DMERR("Error reading/writing snapshot"); + et->store.invalidate(&et->store); + et->valid = 0; + remove_exception(&pe->e); + up_write(&et->lock); + + error_buffers(pe->dest_bhs); + + dm_table_event(et->dm_target->table); + DMDEBUG("Exception failed."); + } + + out: + etable_put_pending_exception(pe); +} + +static void commit_callback(void *context, int success) +{ + struct pending_exception *pe = (struct pending_exception *) context; + pending_complete(pe, success); +} + +/* + * Called when the copy I/O has finished. kcopyd actually runs + * this code so don't block. + */ +static void copy_callback(int read_err, unsigned int write_err, void *context) +{ + struct pending_exception *pe = (struct pending_exception *) context; + struct exception_table *et = pe->etable; + + if (read_err || write_err) + pending_complete(pe, 0); + + else + /* Update the metadata if we are persistent */ + et->store.commit_exception(&et->store, &pe->e, + commit_callback, pe); +} + +/** + * __etable_start_copy + * + * Dispatches the copy operation to kcopyd. 
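+ * The source count is clipped to the device size so that a partial
+ * final chunk is copied safely.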
+ * + * KMC: Should we add a "callback" argument to this to notify the caller + * when the copy is complete? Might make pending_complete() a bit + * simpler. + **/ +void __etable_start_copy(struct pending_exception *pe) +{ + struct exception_table *et = pe->etable; + struct io_region src, dest; + kdev_t dev = et->src->dev; + sector_t dev_size; + + if (pe->started) + return; + + pe->started = 1; + + dev_size = get_dev_size(dev); + if (!dev_size) + dev_size = (sector_t) -1; + + src.dev = dev; + src.sector = chunk_to_sector(et, pe->e.old_chunk); + src.count = min(et->chunk_size, dev_size - src.sector); + + dest.dev = et->dest->dev; + dest.sector = chunk_to_sector(et, pe->e.new_chunk); + dest.count = src.count; + + /* Hand over to kcopyd */ + kcopyd_copy(et->kcopyd_client, + &src, 1, &dest, 0, copy_callback, pe); +} + +/** + * etable_start_copies + * + * Start copies on a list of pending-exceptions. Queue the buffer-head + * on the first pending-exception. + */ +void etable_start_copies(struct pending_exception *list) +{ + struct pending_exception *next, *pe = list; + + do { + down_write(&pe->etable->lock); + __etable_start_copy(pe); + up_write(&pe->etable->lock); + + next = list_entry(pe->siblings.next, + struct pending_exception, siblings); + etable_put_pending_exception(pe); + pe = next; + } while (pe != list); +} + +/** + * __etable_invalidate + * + * Mark the exception table invalid (in memory and on disk). Requires + * a write-lock. + **/ +void __etable_invalidate(struct exception_table *etable) +{ + etable->valid = 0; + etable->store.invalidate(&etable->store); +} + +int dm_exception_init(void) +{ + int r; + + exception_cache = kmem_cache_create("dm-exception", + sizeof(struct exception), + __alignof__(struct exception), + 0, NULL, NULL); + if (!exception_cache) { + DMERR("Couldn't create exception cache."); + r = -ENOMEM; + goto bad1; + } + + pending_cache = + kmem_cache_create("dm-pend-exception", + sizeof(struct pending_exception), + __alignof__(struct pending_exception), + 0, NULL, NULL); + if (!pending_cache) { + DMERR("Couldn't create pending cache."); + r = -ENOMEM; + goto bad2; + } + + pending_pool = mempool_create(128, mempool_alloc_slab, + mempool_free_slab, pending_cache); + if (!pending_pool) { + DMERR("Couldn't create pending pool."); + r = -ENOMEM; + goto bad3; + } + + return 0; + +bad3: + kmem_cache_destroy(pending_cache); +bad2: + kmem_cache_destroy(exception_cache); +bad1: + return r; +} + +void dm_exception_exit(void) +{ + mempool_destroy(pending_pool); + kmem_cache_destroy(pending_cache); + kmem_cache_destroy(exception_cache); +} + diff -Naur linux-2.4.22-dm-1/drivers/md/dm-exception.h linux-2.4.22-new-snapshot/drivers/md/dm-exception.h --- linux-2.4.22-dm-1/drivers/md/dm-exception.h 1969-12-31 18:00:00.000000000 -0600 +++ linux-2.4.22-new-snapshot/drivers/md/dm-exception.h 2003-09-30 12:01:35.000000000 -0500 @@ -0,0 +1,81 @@ +/* + * dm-exception.h + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#ifndef DM_EXCEPTION_H +#define DM_EXCEPTION_H + +struct exception; +struct pending_exception; +struct exception_table; + +/* + * The exception code deals with largish chunks of the disk at a + * time. Typically 64k - 256k. + */ +typedef sector_t chunk_t; + +/* + * Creating and deleting exception tables. 
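+ * A typical user (the badblock target, for instance) does roughly:
+ *
+ *	et = etable_create(ti, src_path, dest_path, chunk_size, 'N');
+ *	if (!et)
+ *		return -EINVAL;
+ *	...
+ *	etable_delete(et);
+ *
+ * where 'P' selects a persistent store and 'N' a transient one.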
+ */ +struct exception_table *etable_create(struct dm_target *ti, + char *src_path, + char *dest_path, + unsigned long chunk_size, + char persistent); +void etable_delete(struct exception_table *etable); +int etable_read_metadata(struct exception_table *etable); +void __etable_invalidate(struct exception_table *etable); + +/* + * Lock an exception table. + */ +void etable_write_lock(struct exception_table *etable); +void etable_write_unlock(struct exception_table *etable); +void etable_read_lock(struct exception_table *etable); +void etable_read_unlock(struct exception_table *etable); + +/* + * Get basic info about an exception table. + */ +struct dm_dev *etable_src_dev(struct exception_table *etable); +struct dm_dev *etable_dest_dev(struct exception_table *etable); +chunk_t etable_chunk_size(struct exception_table *etable); +int etable_valid(struct exception_table *etable); +char etable_type(struct exception_table *etable); +int etable_fraction_full(struct exception_table *etable, + sector_t *numerator, sector_t *denominator); + +/* + * Access complete-exception information from the tables. + */ +struct exception *_etable_lookup_exception(struct exception_table *etable, + struct buffer_head *bh); +void etable_remap_exception(struct exception_table *et, + struct exception *e, struct buffer_head *bh); + +/* + * Access pending-exception info. + */ +struct pending_exception * +__etable_lookup_pending_exception(struct exception_table *etable, + struct buffer_head *bh); +struct pending_exception * +__etable_create_pending_exception(struct exception_table *et, + struct buffer_head *bh); +void etable_put_pending_exception(struct pending_exception *pe); +void __etable_queue_src_buffer(struct pending_exception *pe, + struct buffer_head *bh); +void __etable_queue_dest_buffer(struct pending_exception *pe, + struct buffer_head *bh); +void __etable_start_copy(struct pending_exception *pe); +void etable_start_copies(struct pending_exception *list); +void __etable_merge_pending_exceptions(struct pending_exception *pe1, + struct pending_exception *pe2); + +#endif + diff -Naur linux-2.4.22-dm-1/drivers/md/dm-snapshot.c linux-2.4.22-new-snapshot/drivers/md/dm-snapshot.c --- linux-2.4.22-dm-1/drivers/md/dm-snapshot.c 2003-09-30 11:59:23.000000000 -0500 +++ linux-2.4.22-new-snapshot/drivers/md/dm-snapshot.c 2003-09-30 12:01:41.000000000 -0500 @@ -6,142 +6,70 @@ * This file is released under the GPL. */ -#include -#include #include -#include -#include -#include +#include #include -#include -#include +#include #include -#include -#include "dm-snapshot.h" -#include "kcopyd.h" +#include "dm.h" +#include "dm-exception.h" -/* - * FIXME: Remove this before release. - */ -#if 0 -#define DMDEBUG(x...) DMWARN( ## x) -#else -#define DMDEBUG(x...) -#endif +/** + * struct snapshot + * + * Describe one snapshot device. + * + * KMC: We may need to add a pointer to a "struct origin". + **/ +struct snapshot { + struct exception_table *etable; -/* - * The percentage increment we will wake up users at - */ -#define WAKE_UP_PERCENT 5 - -/* - * kcopyd priority of snapshot operations - */ -#define SNAPSHOT_COPY_PRIORITY 2 - -/* - * Each snapshot reserves this many pages for io - * FIXME: calculate this - */ -#define SNAPSHOT_PAGES 256 - -struct pending_exception { - struct exception e; - - /* - * Origin buffers waiting for this to complete are held - * in a list (using b_reqnext). - */ - struct buffer_head *origin_bhs; - struct buffer_head *snapshot_bhs; - - /* - * Other pending_exceptions that are processing this - * chunk. 
When this list is empty, we know we can - * complete the origins. - */ - struct list_head siblings; - - /* Pointer back to snapshot context */ - struct dm_snapshot *snap; - - /* - * 1 indicates the exception has already been sent to - * kcopyd. - */ - int started; + /* List of snapshots per origin. */ + struct list_head list; }; -/* - * Hash table mapping origin volumes to lists of snapshots and - * a lock to protect it - */ -static kmem_cache_t *exception_cache; -static kmem_cache_t *pending_cache; -static mempool_t *pending_pool; - -/* - * One of these per registered origin, held in the snapshot_origins hash - */ +/** + * struct origin + * + * One of these per registered origin, held in the _origins list. + * + * KMC: We may need to add a lock to this structure. + **/ struct origin { /* The origin device */ - kdev_t dev; + struct dm_dev *dev; - struct list_head hash_list; + /* List of all known origins */ + struct list_head origins; /* List of snapshots for this origin */ struct list_head snapshots; + + /* Count of snapshots and origins referrencing this structure. */ + unsigned int count; }; /* - * Size of the hash table for origin volumes. If we make this - * the size of the minors list then it should be nearly perfect + * Global list of all origin devices, and a lock to protect it. + * + * Need to read-lock any time we're examining the _origins list or any + * of the snaphots lists within each origin. Need to write-lock any time + * we're adding a new origin or snapshot structure. */ -#define ORIGIN_HASH_SIZE 256 -#define ORIGIN_MASK 0xFF -static struct list_head *_origins; -static struct rw_semaphore _origins_lock; - -static int init_origin_hash(void) -{ - int i; - - _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), - GFP_KERNEL); - if (!_origins) { - DMERR("Device mapper: Snapshot: unable to allocate memory"); - return -ENOMEM; - } - - for (i = 0; i < ORIGIN_HASH_SIZE; i++) - INIT_LIST_HEAD(_origins + i); - init_rwsem(&_origins_lock); - return 0; -} +static LIST_HEAD(_origins); +static DECLARE_RWSEM(_origins_lock); -static void exit_origin_hash(void) +static struct origin *__lookup_origin(struct dm_dev *dev) { - kfree(_origins); -} - -static inline unsigned int origin_hash(kdev_t dev) -{ - return MINOR(dev) & ORIGIN_MASK; -} - -static struct origin *__lookup_origin(kdev_t origin) -{ - struct list_head *slist; - struct list_head *ol; + struct list_head *lh; struct origin *o; - ol = &_origins[origin_hash(origin)]; - list_for_each(slist, ol) { - o = list_entry(slist, struct origin, hash_list); + list_for_each(lh, &_origins) { + o = list_entry(lh, struct origin, origins); - if (o->dev == origin) + if (o->dev == dev) return o; } @@ -150,264 +78,138 @@ static void __insert_origin(struct origin *o) { - struct list_head *sl = &_origins[origin_hash(o->dev)]; - list_add_tail(&o->hash_list, sl); -} - -/* - * Make a note of the snapshot and its origin so we can look it - * up when the origin has a write on it. 
- */ -static int register_snapshot(struct dm_snapshot *snap) -{ - struct origin *o; - kdev_t dev = snap->origin->dev; - - down_write(&_origins_lock); - o = __lookup_origin(dev); - - if (!o) { - /* New origin */ - o = kmalloc(sizeof(*o), GFP_KERNEL); - if (!o) { - up_write(&_origins_lock); - return -ENOMEM; - } - - /* Initialise the struct */ - INIT_LIST_HEAD(&o->snapshots); - o->dev = dev; - - __insert_origin(o); - } - - list_add_tail(&snap->list, &o->snapshots); - - up_write(&_origins_lock); - return 0; + list_add_tail(&o->origins, &_origins); } -static void unregister_snapshot(struct dm_snapshot *s) +static void __remove_origin(struct origin *o) { - struct origin *o; - - down_write(&_origins_lock); - o = __lookup_origin(s->origin->dev); - - list_del(&s->list); - if (list_empty(&o->snapshots)) { - list_del(&o->hash_list); - kfree(o); - } - - up_write(&_origins_lock); + list_del(&o->origins); } -/* - * Implementation of the exception hash tables. - */ -static int init_exception_table(struct exception_table *et, uint32_t size) +static void __insert_snapshot(struct snapshot *s, struct origin *o) { - unsigned int i; - - et->hash_mask = size - 1; - et->table = vcalloc(size, sizeof(struct list_head)); - if (!et->table) - return -ENOMEM; - - for (i = 0; i < size; i++) - INIT_LIST_HEAD(et->table + i); - - return 0; + list_add_tail(&s->list, &o->snapshots); } -static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) +static void __remove_snapshot(struct snapshot *s) { - struct list_head *slot, *entry, *temp; - struct exception *ex; - int i, size; - - size = et->hash_mask + 1; - for (i = 0; i < size; i++) { - slot = et->table + i; - - list_for_each_safe(entry, temp, slot) { - ex = list_entry(entry, struct exception, hash_list); - kmem_cache_free(mem, ex); - } - } - - vfree(et->table); + list_del(&s->list); } /* - * FIXME: check how this hash fn is performing. + * Allocate and initialize an origin structure. */ -static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk) +static struct origin * __alloc_origin(struct dm_dev *dev) { - return chunk & et->hash_mask; + struct origin *o = kmalloc(sizeof(*o), GFP_KERNEL); + if (o) { + o->dev = dev; + o->count = 0; + INIT_LIST_HEAD(&o->origins); + INIT_LIST_HEAD(&o->snapshots); + __insert_origin(o); + } + return o; } -static void insert_exception(struct exception_table *eh, struct exception *e) +static void __get_origin(struct origin *o) { - struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; - list_add(&e->hash_list, l); + o->count++; } -static inline void remove_exception(struct exception *e) +static void __put_origin(struct origin *o) { - list_del(&e->hash_list); + o->count--; + if (o->count == 0) { + __remove_origin(o); + kfree(o); + } } /* - * Return the exception data for a sector, or NULL if not - * remapped. + * Make a note of the snapshot and its origin so we can look it + * up when the origin has a write on it. 
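+ * Origins are ref-counted: the first registration allocates the
+ * struct origin, and __put_origin() frees it once the last snapshot
+ * or registered origin is gone.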
*/ -static struct exception *lookup_exception(struct exception_table *et, - chunk_t chunk) -{ - struct list_head *slot, *el; - struct exception *e; - - slot = &et->table[exception_hash(et, chunk)]; - list_for_each(el, slot) { - e = list_entry(el, struct exception, hash_list); - if (e->old_chunk == chunk) - return e; - } - - return NULL; -} - -static inline struct exception *alloc_exception(void) -{ - struct exception *e; - - e = kmem_cache_alloc(exception_cache, GFP_NOIO); - if (!e) - e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); - - return e; -} - -static inline void free_exception(struct exception *e) +static int register_snapshot(struct snapshot *snap) { - kmem_cache_free(exception_cache, e); -} + struct origin *o; + struct dm_dev *dev = etable_src_dev(snap->etable); -static inline struct pending_exception *alloc_pending_exception(void) -{ - return mempool_alloc(pending_pool, GFP_NOIO); -} + down_write(&_origins_lock); -static inline void free_pending_exception(struct pending_exception *pe) -{ - mempool_free(pe, pending_pool); -} + o = __lookup_origin(dev); + if (!o) { + /* New origin */ + o = __alloc_origin(dev); + if (!o) { + up_write(&_origins_lock); + return -ENOMEM; + } + } -int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) -{ - struct exception *e; + __get_origin(o); + __insert_snapshot(snap, o); - e = alloc_exception(); - if (!e) - return -ENOMEM; + up_write(&_origins_lock); - e->old_chunk = old; - e->new_chunk = new; - insert_exception(&s->complete, e); return 0; } -/* - * Hard coded magic. - */ -static int calc_max_buckets(void) +static void unregister_snapshot(struct snapshot *s) { - unsigned long mem; - - mem = num_physpages << PAGE_SHIFT; - mem /= 50; - mem /= sizeof(struct list_head); + struct origin *o; - return mem; + down_write(&_origins_lock); + o = __lookup_origin(etable_src_dev(s->etable)); + __remove_snapshot(s); + __put_origin(o); + up_write(&_origins_lock); } -/* - * Rounds a number down to a power of 2. - */ -static inline uint32_t round_down(uint32_t n) +static struct origin *register_origin(struct dm_dev *dev) { - while (n & (n - 1)) - n &= (n - 1); - return n; -} + struct origin *o; -/* - * Allocate room for a suitable hash table. - */ -static int init_hash_tables(struct dm_snapshot *s) -{ - sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; + down_write(&_origins_lock); - /* - * Calculate based on the size of the original volume or - * the COW volume... - */ - cow_dev_size = get_dev_size(s->cow->dev); - origin_dev_size = get_dev_size(s->origin->dev); - max_buckets = calc_max_buckets(); - - hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size; - hash_size = min(hash_size, max_buckets); - - /* Round it down to a power of 2 */ - hash_size = round_down(hash_size); - if (init_exception_table(&s->complete, hash_size)) - return -ENOMEM; + o = __lookup_origin(dev); + if (!o) { + /* New origin */ + o = __alloc_origin(dev); + if (!o) { + up_write(&_origins_lock); + return NULL; + } + } - /* - * Allocate hash table for in-flight exceptions - * Make this smaller than the real hash table - */ - hash_size >>= 3; - if (!hash_size) - hash_size = 64; + __get_origin(o); - if (init_exception_table(&s->pending, hash_size)) { - exit_exception_table(&s->complete, exception_cache); - return -ENOMEM; - } + up_write(&_origins_lock); - return 0; + return o; } -/* - * Round a number up to the nearest 'size' boundary. size must - * be a power of 2. 
- */ -static inline ulong round_up(ulong n, ulong size) +static void unregister_origin(struct origin *o) { - size--; - return (n + size) & ~size; + down_write(&_origins_lock); + __put_origin(o); + up_write(&_origins_lock); } /* - * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> + * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
*/ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) { - struct dm_snapshot *s; + struct snapshot *s; unsigned long chunk_size; int r = -EINVAL; char persistent; char *origin_path; char *cow_path; - char *value; - int blocksize; if (argc < 4) { - ti->error = "dm-snapshot: requires exactly 4 arguments"; + ti->error = "dm-snapshot requires exactly 4 arguments"; r = -EINVAL; goto bad1; } @@ -415,434 +217,63 @@ origin_path = argv[0]; cow_path = argv[1]; persistent = toupper(*argv[2]); - - if (persistent != 'P' && persistent != 'N') { - ti->error = "Persistent flag is not P or N"; - r = -EINVAL; - goto bad1; - } - - chunk_size = simple_strtoul(argv[3], &value, 10); - if (chunk_size == 0 || value == NULL) { - ti->error = "Invalid chunk size"; - r = -EINVAL; - goto bad1; - } + chunk_size = simple_strtoul(argv[3], NULL, 10); s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) { - ti->error = "Cannot allocate snapshot context private " - "structure"; + if (!s) { + ti->error = "cannot allocate snapshot private structure"; r = -ENOMEM; goto bad1; } - r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); - if (r) { - ti->error = "Cannot get origin device"; - goto bad2; - } - - /* FIXME: get cow length */ - r = dm_get_device(ti, cow_path, 0, 0, - FMODE_READ | FMODE_WRITE, &s->cow); - if (r) { - dm_put_device(ti, s->origin); - ti->error = "Cannot get COW device"; - goto bad2; - } - - /* - * Chunk size must be multiple of page size. Silently - * round up if it's not. - */ - chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE); - - /* Validate the chunk size against the device block size */ - blocksize = get_hardsect_size(s->cow->dev); - if (chunk_size % (blocksize / SECTOR_SIZE)) { - ti->error = "Chunk size is not a multiple of device blocksize"; + s->etable = etable_create(ti, origin_path, cow_path, + chunk_size, persistent); + if (!s->etable) { + ti->error = "cannot create exception table"; r = -EINVAL; - goto bad3; - } - - /* Check the sizes are small enough to fit in one kiovec */ - if (chunk_size > KIO_MAX_SECTORS) { - ti->error = "Chunk size is too big"; - r = -EINVAL; - goto bad3; - } - - /* Check chunk_size is a power of 2 */ - if (chunk_size & (chunk_size - 1)) { - ti->error = "Chunk size is not a power of 2"; - r = -EINVAL; - goto bad3; - } - - s->chunk_size = chunk_size; - s->chunk_mask = chunk_size - 1; - s->type = persistent; - for (s->chunk_shift = 0; chunk_size; - s->chunk_shift++, chunk_size >>= 1) - ; - s->chunk_shift--; - - s->valid = 1; - s->have_metadata = 0; - s->last_percent = 0; - init_rwsem(&s->lock); - s->table = ti->table; - - /* Allocate hash table for COW data */ - if (init_hash_tables(s)) { - ti->error = "Unable to allocate hash table space"; - r = -ENOMEM; - goto bad3; - } - - /* - * Check the persistent flag - done here because we need the iobuf - * to check the LV header - */ - s->store.snap = s; - - if (persistent == 'P') - r = dm_create_persistent(&s->store, s->chunk_size); - else - r = dm_create_transient(&s->store, s, blocksize); - - if (r) { - ti->error = "Couldn't create exception store"; - r = -EINVAL; - goto bad4; - } - - r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); - if (r) { - ti->error = "Could not create kcopyd client"; - goto bad5; + goto bad2; } /* Add snapshot to the list of snapshots for this origin */ if (register_snapshot(s)) { r = -EINVAL; - ti->error = "Cannot register snapshot origin"; - goto bad6; + ti->error = "cannot register snapshot device"; + goto bad3; } ti->private = s; 
return 0; - bad6: - kcopyd_client_destroy(s->kcopyd_client); - - bad5: - s->store.destroy(&s->store); - - bad4: - exit_exception_table(&s->pending, pending_cache); - exit_exception_table(&s->complete, exception_cache); - bad3: - dm_put_device(ti, s->cow); - dm_put_device(ti, s->origin); - + etable_delete(s->etable); bad2: kfree(s); - bad1: return r; } static void snapshot_dtr(struct dm_target *ti) { - struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - - dm_table_event(ti->table); - + struct snapshot *s = (struct snapshot *) ti->private; unregister_snapshot(s); - - exit_exception_table(&s->pending, pending_cache); - exit_exception_table(&s->complete, exception_cache); - - /* Deallocate memory used */ - s->store.destroy(&s->store); - - dm_put_device(ti, s->origin); - dm_put_device(ti, s->cow); - kcopyd_client_destroy(s->kcopyd_client); + etable_delete(s->etable); kfree(s); } -/* - * We hold lists of buffer_heads, using the b_reqnext field. - */ -static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh) -{ - bh->b_reqnext = *queue; - *queue = bh; -} - -/* - * FIXME: inefficient. - */ -static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs) -{ - while (*queue) - queue = &((*queue)->b_reqnext); - - *queue = bhs; -} - -/* - * Flush a list of buffers. - */ -static void flush_buffers(struct buffer_head *bh) -{ - struct buffer_head *n; - - DMDEBUG("begin flush"); - while (bh) { - n = bh->b_reqnext; - bh->b_reqnext = NULL; - DMDEBUG("flushing %p", bh); - generic_make_request(WRITE, bh); - bh = n; - } - - run_task_queue(&tq_disk); -} - -/* - * Error a list of buffers. - */ -static void error_buffers(struct buffer_head *bh) -{ - struct buffer_head *n; - - while (bh) { - n = bh->b_reqnext; - bh->b_reqnext = NULL; - buffer_IO_error(bh); - bh = n; - } -} - -static struct buffer_head *__flush_bhs(struct pending_exception *pe) -{ - struct pending_exception *sibling; - - if (list_empty(&pe->siblings)) - return pe->origin_bhs; - - sibling = list_entry(pe->siblings.next, - struct pending_exception, siblings); - - list_del(&pe->siblings); - - /* FIXME: I think there's a race on SMP machines here, add spin lock */ - queue_buffers(&sibling->origin_bhs, pe->origin_bhs); - - return NULL; -} - -static void pending_complete(struct pending_exception *pe, int success) -{ - struct exception *e; - struct dm_snapshot *s = pe->snap; - struct buffer_head *flush = NULL; - - if (success) { - e = alloc_exception(); - if (!e) { - DMWARN("Unable to allocate exception."); - down_write(&s->lock); - s->store.drop_snapshot(&s->store); - s->valid = 0; - flush = __flush_bhs(pe); - up_write(&s->lock); - - error_buffers(pe->snapshot_bhs); - goto out; - } - - /* - * Add a proper exception, and remove the - * in-flight exception from the list. 
- */ - down_write(&s->lock); - - memcpy(e, &pe->e, sizeof(*e)); - insert_exception(&s->complete, e); - remove_exception(&pe->e); - flush = __flush_bhs(pe); - - /* Submit any pending write BHs */ - up_write(&s->lock); - - flush_buffers(pe->snapshot_bhs); - DMDEBUG("Exception completed successfully."); - - /* Notify any interested parties */ - if (s->store.fraction_full) { - sector_t numerator, denominator; - int pc; - - s->store.fraction_full(&s->store, &numerator, - &denominator); - pc = numerator * 100 / denominator; - - if (pc >= s->last_percent + WAKE_UP_PERCENT) { - dm_table_event(s->table); - s->last_percent = pc - pc % WAKE_UP_PERCENT; - } - } - - } else { - /* Read/write error - snapshot is unusable */ - down_write(&s->lock); - if (s->valid) - DMERR("Error reading/writing snapshot"); - s->store.drop_snapshot(&s->store); - s->valid = 0; - remove_exception(&pe->e); - flush = __flush_bhs(pe); - up_write(&s->lock); - - error_buffers(pe->snapshot_bhs); - - dm_table_event(s->table); - DMDEBUG("Exception failed."); - } - - out: - if (flush) - flush_buffers(flush); - - free_pending_exception(pe); -} - -static void commit_callback(void *context, int success) +static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, + int rw, union map_info *map_context) { - struct pending_exception *pe = (struct pending_exception *) context; - pending_complete(pe, success); -} - -/* - * Called when the copy I/O has finished. kcopyd actually runs - * this code so don't block. - */ -static void copy_callback(int read_err, unsigned int write_err, void *context) -{ - struct pending_exception *pe = (struct pending_exception *) context; - struct dm_snapshot *s = pe->snap; - - if (read_err || write_err) - pending_complete(pe, 0); - - else - /* Update the metadata if we are persistent */ - s->store.commit_exception(&s->store, &pe->e, commit_callback, - pe); -} - -/* - * Dispatches the copy operation to kcopyd. - */ -static inline void start_copy(struct pending_exception *pe) -{ - struct dm_snapshot *s = pe->snap; - struct io_region src, dest; - kdev_t dev = s->origin->dev; - int *sizes = blk_size[major(dev)]; - sector_t dev_size = (sector_t) -1; - - if (pe->started) - return; - - /* this is protected by snap->lock */ - pe->started = 1; - - if (sizes && sizes[minor(dev)]) - dev_size = sizes[minor(dev)] << 1; - - src.dev = dev; - src.sector = chunk_to_sector(s, pe->e.old_chunk); - src.count = min(s->chunk_size, dev_size - src.sector); - - dest.dev = s->cow->dev; - dest.sector = chunk_to_sector(s, pe->e.new_chunk); - dest.count = src.count; - - /* Hand over to kcopyd */ - kcopyd_copy(s->kcopyd_client, - &src, 1, &dest, 0, copy_callback, pe); -} - -/* - * Looks to see if this snapshot already has a pending exception - * for this chunk, otherwise it allocates a new one and inserts - * it into the pending table. - */ -static struct pending_exception *find_pending_exception(struct dm_snapshot *s, - struct buffer_head *bh) -{ - struct exception *e; + struct snapshot *s = (struct snapshot *) ti->private; + struct exception_table *et = s->etable; struct pending_exception *pe; - chunk_t chunk = sector_to_chunk(s, bh->b_rsector); - - /* - * Is there a pending exception for this already ? 
- */ - e = lookup_exception(&s->pending, chunk); - if (e) { - /* cast the exception to a pending exception */ - pe = list_entry(e, struct pending_exception, e); - - } else { - /* Create a new pending exception */ - pe = alloc_pending_exception(); - pe->e.old_chunk = chunk; - pe->origin_bhs = pe->snapshot_bhs = NULL; - INIT_LIST_HEAD(&pe->siblings); - pe->snap = s; - pe->started = 0; - - if (s->store.prepare_exception(&s->store, &pe->e)) { - free_pending_exception(pe); - s->valid = 0; - return NULL; - } - - insert_exception(&s->pending, &pe->e); - } - - return pe; -} - -static inline void remap_exception(struct dm_snapshot *s, struct exception *e, - struct buffer_head *bh) -{ - bh->b_rdev = s->cow->dev; - bh->b_rsector = chunk_to_sector(s, e->new_chunk) + - (bh->b_rsector & s->chunk_mask); -} - -static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw, - union map_info *map_context) -{ struct exception *e; - struct dm_snapshot *s = (struct dm_snapshot *) ti->private; int r = 1; - chunk_t chunk; - struct pending_exception *pe; - - chunk = sector_to_chunk(s, bh->b_rsector); /* Full snapshots are not usable */ - if (!s->valid) + /* KMC: Should this check be inside the lock? */ + if (!etable_valid(et)) { return -1; + } /* * Write to snapshot - higher level takes care of RW/RO @@ -851,29 +282,49 @@ */ if (rw == WRITE) { - down_write(&s->lock); + /* Check for a complete-exception first. If one + * exists we can go ahead with the remap. + */ + etable_read_lock(et); + e = _etable_lookup_exception(et, bh); + if (e) { + etable_read_unlock(et); + etable_remap_exception(et, e, bh); + goto out; + } - /* If the block is already remapped - use that, else remap it */ - e = lookup_exception(&s->complete, chunk); - if (e) - remap_exception(s, e, bh); + etable_read_unlock(et); + etable_write_lock(et); - else { - pe = find_pending_exception(s, bh); + /* Need to check for a complete-exception again since we + * dropped and re-took the lock. If we still don't find + * a complete-exception, we need to create a new remap. + */ + e = _etable_lookup_exception(et, bh); + if (e) { + etable_write_unlock(et); + etable_remap_exception(et, e, bh); + goto out; + } + pe = __etable_lookup_pending_exception(et, bh); + if (!pe) { + pe = __etable_create_pending_exception(et, bh); if (!pe) { - s->store.drop_snapshot(&s->store); - s->valid = 0; + __etable_invalidate(et); + etable_write_unlock(et); r = -EIO; - } else { - remap_exception(s, &pe->e, bh); - queue_buffer(&pe->snapshot_bhs, bh); - start_copy(pe); - r = 0; + goto out; } + + __etable_start_copy(pe); } - up_write(&s->lock); + __etable_queue_dest_buffer(pe, bh); + etable_put_pending_exception(pe); + r = 0; + + etable_write_unlock(et); } else { /* @@ -883,74 +334,82 @@ * situation where this is wrong - ejt. */ + /* KMC: Two issues to consider here. First, what happens when + * we get a snapshot read for a chunk that has a + * pending exception? Second, what happens when we need to + * start a new pending exception for a chunk with + * outstanding snapshot reads? 
+ */ + /* Do reads */ - down_read(&s->lock); + etable_read_lock(et); - /* See if it it has been remapped */ - e = lookup_exception(&s->complete, chunk); - if (e) - remap_exception(s, e, bh); - else - bh->b_rdev = s->origin->dev; + /* See if it has been remapped */ + e = _etable_lookup_exception(et, bh); + if (e) { + etable_remap_exception(et, e, bh); + } else { + struct dm_dev *dev = etable_src_dev(et); + bh->b_rdev = dev->dev; + } - up_read(&s->lock); + etable_read_unlock(et); } +out: return r; } -void snapshot_resume(struct dm_target *ti) +static void snapshot_resume(struct dm_target *ti) { - struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - - if (s->have_metadata) - return; + struct snapshot *s = (struct snapshot *) ti->private; + int r; - if (s->store.read_metadata(&s->store)) { - down_write(&s->lock); - s->valid = 0; - up_write(&s->lock); + r = etable_read_metadata(s->etable); + if (r) { + etable_write_lock(s->etable); + __etable_invalidate(s->etable); + etable_write_unlock(s->etable); } - - s->have_metadata = 1; } static int snapshot_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { - struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; - char cow[16]; - char org[16]; + struct snapshot *snap = (struct snapshot *) ti->private; + struct exception_table *et = snap->etable; + sector_t numerator, denominator; + struct dm_dev *dev; + char cow[16], org[16]; + int r; switch (type) { case STATUSTYPE_INFO: - if (!snap->valid) + if (!etable_valid(et)) { snprintf(result, maxlen, "Invalid"); - else { - if (snap->store.fraction_full) { - sector_t numerator, denominator; - snap->store.fraction_full(&snap->store, - &numerator, - &denominator); + } else { + r = etable_fraction_full(et, &numerator, &denominator); + if (r) { + snprintf(result, maxlen, "Unknown"); + } else { snprintf(result, maxlen, SECTOR_FORMAT "/" SECTOR_FORMAT, numerator, denominator); } - else - snprintf(result, maxlen, "Unknown"); } break; case STATUSTYPE_TABLE: /* - * kdevname returns a static pointer so we need - * to make private copies if the output is to - * make sense. + * dm_kdevname returns a static pointer so we need to + * make private copies if the output is to make sense. 
*/ - strncpy(cow, dm_kdevname(snap->cow->dev), sizeof(cow)); - strncpy(org, dm_kdevname(snap->origin->dev), sizeof(org)); + dev = etable_src_dev(et); + strncpy(org, dm_kdevname(dev->dev), sizeof(org)); + dev = etable_dest_dev(et); + strncpy(cow, dm_kdevname(dev->dev), sizeof(cow)); snprintf(result, maxlen, "%s %s %c %ld", org, cow, - snap->type, snap->chunk_size); + etable_type(et), etable_chunk_size(et)); break; } @@ -960,123 +419,109 @@ /*----------------------------------------------------------------- * Origin methods *---------------------------------------------------------------*/ -static void list_merge(struct list_head *l1, struct list_head *l2) -{ - struct list_head *l1_n, *l2_p; - - l1_n = l1->next; - l2_p = l2->prev; - - l1->next = l2; - l2->prev = l1; - - l2_p->next = l1_n; - l1_n->prev = l2_p; -} - static int __origin_write(struct list_head *snapshots, struct buffer_head *bh) { - int r = 1, first = 1; + int r = 1; struct list_head *sl; - struct dm_snapshot *snap; + struct snapshot *snap; + struct exception_table *et; struct exception *e; - struct pending_exception *pe, *last = NULL; - chunk_t chunk; + struct pending_exception *pe, *list = NULL; /* Do all the snapshots on this origin */ list_for_each(sl, snapshots) { - snap = list_entry(sl, struct dm_snapshot, list); + snap = list_entry(sl, struct snapshot, list); + et = snap->etable; /* Only deal with valid snapshots */ - if (!snap->valid) + /* KMC: Should this check be inside the lock? */ + if (!etable_valid(et)) continue; - down_write(&snap->lock); - - /* - * Remember, different snapshots can have - * different chunk sizes. + /* Check for a complete-exception first. If one + * exists we can just go on to the next snapshot. */ - chunk = sector_to_chunk(snap, bh->b_rsector); + etable_read_lock(et); + e = _etable_lookup_exception(et, bh); + if (e) { + etable_read_unlock(et); + continue; + } - /* - * Check exception table to see if block - * is already remapped in this snapshot - * and trigger an exception if not. + etable_read_unlock(et); + etable_write_lock(et); + + /* Need to check for a complete-exception again since we + * dropped and re-took the lock. If we still don't find + * a complete-exception, we need to remap this chunk. */ - e = lookup_exception(&snap->complete, chunk); + e = _etable_lookup_exception(et, bh); if (!e) { - pe = find_pending_exception(snap, bh); + pe = __etable_lookup_pending_exception(et, bh); if (!pe) { - snap->store.drop_snapshot(&snap->store); - snap->valid = 0; + pe = __etable_create_pending_exception(et, bh); + if (!pe) { + __etable_invalidate(et); + etable_write_unlock(et); + continue; + } + } + if (list) { + /* KMC: This looks racy!!! + * Two threads could be merging PEs into the + * same list at the same time. The locks in + * the etables aren't sufficient to protect + * this call. + */ + __etable_merge_pending_exceptions(pe, list); } else { - if (last) - list_merge(&pe->siblings, - &last->siblings); - - last = pe; - r = 0; + /* KMC: Can we queue the buffer on this pe, + * since we know this is the first in the list? + * If we do this, we must make an appropriate + * change to etable_start_copies. + */ + __etable_queue_src_buffer(pe, bh); } + + list = pe; + r = 0; } - up_write(&snap->lock); + etable_write_unlock(et); } + /* KMC: This concerns me. We've locked each individual exception + * table as we've gathered the list of pending-exceptions. But + * now we've dropped all of the locks, but we're still holding + * a list of pending-exceptions. 
What if one/all of those + * complete before we get a chance to call the next function? + * My guess is we need some kind of ref-counting for pending + * exceptions. + */ + /* * Now that we have a complete pe list we can start the copying. */ - if (last) { - pe = last; - do { - down_write(&pe->snap->lock); - if (first) - queue_buffer(&pe->origin_bhs, bh); - start_copy(pe); - up_write(&pe->snap->lock); - first = 0; - pe = list_entry(pe->siblings.next, - struct pending_exception, siblings); - - } while (pe != last); + if (list) { + etable_start_copies(list); } return r; } /* - * Called on a write from the origin driver. - */ -int do_origin(struct dm_dev *origin, struct buffer_head *bh) -{ - struct origin *o; - int r; - - down_read(&_origins_lock); - o = __lookup_origin(origin->dev); - if (!o) - BUG(); - - r = __origin_write(&o->snapshots, bh); - up_read(&_origins_lock); - - return r; -} - -/* * Origin: maps a linear range of a device, with hooks for snapshotting. */ /* * Construct an origin mapping: <dev_path> - * The context for an origin is merely a 'struct dm_dev *' - * pointing to the real device. */ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) { - int r; struct dm_dev *dev; + struct origin *o; + int r; if (argc != 1) { ti->error = "dm-origin: incorrect number of arguments"; @@ -1086,34 +531,51 @@ r = dm_get_device(ti, argv[0], 0, ti->len, dm_table_get_mode(ti->table), &dev); if (r) { - ti->error = "Cannot get target device"; + ti->error = "cannot get origin device"; return r; } - ti->private = dev; + o = register_origin(dev); + if (!o) { + ti->error = "cannot register origin"; + dm_put_device(ti, dev); + return -ENOMEM; + } + + ti->private = o; return 0; } static void origin_dtr(struct dm_target *ti) { - struct dm_dev *dev = (struct dm_dev *) ti->private; + struct origin *o = (struct origin *) ti->private; + struct dm_dev *dev = o->dev; + unregister_origin(o); dm_put_device(ti, dev); } -static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw, - union map_info *map_context) +static int origin_map(struct dm_target *ti, struct buffer_head *bh, + int rw, union map_info *map_context) { - struct dm_dev *dev = (struct dm_dev *) ti->private; - bh->b_rdev = dev->dev; + struct origin *o = (struct origin *) ti->private; + int r = 1; + + bh->b_rdev = o->dev->dev; /* Only tell snapshots if this is a write */ - return (rw == WRITE) ?
do_origin(dev, bh) : 1; + if (rw == WRITE) { + down_read(&_origins_lock); + r = __origin_write(&o->snapshots, bh); + up_read(&_origins_lock); + } + + return r; } -static int origin_status(struct dm_target *ti, status_type_t type, char *result, - unsigned int maxlen) +static int origin_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) { - struct dm_dev *dev = (struct dm_dev *) ti->private; + struct origin *o = (struct origin *) ti->private; switch (type) { case STATUSTYPE_INFO: @@ -1121,7 +583,7 @@ break; case STATUSTYPE_TABLE: - snprintf(result, maxlen, "%s", dm_kdevname(dev->dev)); + snprintf(result, maxlen, "%s", dm_kdevname(o->dev->dev)); break; } @@ -1153,64 +615,18 @@ r = dm_register_target(&snapshot_target); if (r) { - DMERR("snapshot target register failed %d", r); + DMERR("failed to register snapshot target: %d", r); return r; } r = dm_register_target(&origin_target); - if (r < 0) { - DMERR("Device mapper: Origin: register failed %d\n", r); - goto bad1; - } - - r = init_origin_hash(); if (r) { - DMERR("init_origin_hash failed."); - goto bad2; - } - - exception_cache = kmem_cache_create("dm-snapshot-ex", - sizeof(struct exception), - __alignof__(struct exception), - 0, NULL, NULL); - if (!exception_cache) { - DMERR("Couldn't create exception cache."); - r = -ENOMEM; - goto bad3; - } - - pending_cache = - kmem_cache_create("dm-snapshot-in", - sizeof(struct pending_exception), - __alignof__(struct pending_exception), - 0, NULL, NULL); - if (!pending_cache) { - DMERR("Couldn't create pending cache."); - r = -ENOMEM; - goto bad4; - } - - pending_pool = mempool_create(128, mempool_alloc_slab, - mempool_free_slab, pending_cache); - if (!pending_pool) { - DMERR("Couldn't create pending pool."); - r = -ENOMEM; - goto bad5; + DMERR("failed to register origin target: %d\n", r); + dm_unregister_target(&snapshot_target); + return r; } return 0; - - bad5: - kmem_cache_destroy(pending_cache); - bad4: - kmem_cache_destroy(exception_cache); - bad3: - exit_origin_hash(); - bad2: - dm_unregister_target(&origin_target); - bad1: - dm_unregister_target(&snapshot_target); - return r; } void dm_snapshot_exit(void) @@ -1219,14 +635,10 @@ r = dm_unregister_target(&snapshot_target); if (r) - DMERR("snapshot unregister failed %d", r); + DMERR("failed to unregister snapshot target: %d", r); r = dm_unregister_target(&origin_target); if (r) - DMERR("origin unregister failed %d", r); - - exit_origin_hash(); - mempool_destroy(pending_pool); - kmem_cache_destroy(pending_cache); - kmem_cache_destroy(exception_cache); + DMERR("failed to unregister origin target: %d", r); } + diff -Naur linux-2.4.22-dm-1/drivers/md/dm-snapshot.h linux-2.4.22-new-snapshot/drivers/md/dm-snapshot.h --- linux-2.4.22-dm-1/drivers/md/dm-snapshot.h 2003-09-30 11:59:23.000000000 -0500 +++ linux-2.4.22-new-snapshot/drivers/md/dm-snapshot.h 1969-12-31 18:00:00.000000000 -0600 @@ -1,158 +0,0 @@ -/* - * dm-snapshot.c - * - * Copyright (C) 2001-2002 Sistina Software (UK) Limited. - * - * This file is released under the GPL. - */ - -#ifndef DM_SNAPSHOT_H -#define DM_SNAPSHOT_H - -#include "dm.h" -#include - -struct exception_table { - uint32_t hash_mask; - struct list_head *table; -}; - -/* - * The snapshot code deals with largish chunks of the disk at a - * time. Typically 64k - 256k. - */ -/* FIXME: can we get away with limiting these to a uint32_t ? */ -typedef sector_t chunk_t; - -/* - * An exception is used where an old chunk of data has been - * replaced by a new one. 
- */ -struct exception { - struct list_head hash_list; - - chunk_t old_chunk; - chunk_t new_chunk; -}; - -/* - * Abstraction to handle the meta/layout of exception stores (the - * COW device). - */ -struct exception_store { - - /* - * Destroys this object when you've finished with it. - */ - void (*destroy) (struct exception_store *store); - - /* - * The target shouldn't read the COW device until this is - * called. - */ - int (*read_metadata) (struct exception_store *store); - - /* - * Find somewhere to store the next exception. - */ - int (*prepare_exception) (struct exception_store *store, - struct exception *e); - - /* - * Update the metadata with this exception. - */ - void (*commit_exception) (struct exception_store *store, - struct exception *e, - void (*callback) (void *, int success), - void *callback_context); - - /* - * The snapshot is invalid, note this in the metadata. - */ - void (*drop_snapshot) (struct exception_store *store); - - /* - * Return how full the snapshot is. - */ - void (*fraction_full) (struct exception_store *store, - sector_t *numerator, - sector_t *denominator); - - struct dm_snapshot *snap; - void *context; -}; - -struct dm_snapshot { - struct rw_semaphore lock; - struct dm_table *table; - - struct dm_dev *origin; - struct dm_dev *cow; - - /* List of snapshots per Origin */ - struct list_head list; - - /* Size of data blocks saved - must be a power of 2 */ - chunk_t chunk_size; - chunk_t chunk_mask; - chunk_t chunk_shift; - - /* You can't use a snapshot if this is 0 (e.g. if full) */ - int valid; - int have_metadata; - - /* Used for display of table */ - char type; - - /* The last percentage we notified */ - int last_percent; - - struct exception_table pending; - struct exception_table complete; - - /* The on disk metadata handler */ - struct exception_store store; - - struct kcopyd_client *kcopyd_client; -}; - -/* - * Used by the exception stores to load exceptions hen - * initialising. - */ -int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); - -/* - * Constructor and destructor for the default persistent - * store. - */ -int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); - -int dm_create_transient(struct exception_store *store, - struct dm_snapshot *s, int blocksize); - -/* - * Return the number of sectors in the device. 
- */ -static inline sector_t get_dev_size(kdev_t dev) -{ - int *sizes; - - sizes = blk_size[MAJOR(dev)]; - if (sizes) - return sizes[MINOR(dev)] << 1; - - return 0; -} - -static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) -{ - return (sector & ~s->chunk_mask) >> s->chunk_shift; -} - -static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) -{ - return chunk << s->chunk_shift; -} - -#endif diff -Naur linux-2.4.22-dm-1/drivers/md/dm-table.c linux-2.4.22-new-snapshot/drivers/md/dm-table.c --- linux-2.4.22-dm-1/drivers/md/dm-table.c 2003-09-30 11:59:23.000000000 -0500 +++ linux-2.4.22-new-snapshot/drivers/md/dm-table.c 2003-09-30 12:01:56.000000000 -0500 @@ -319,16 +319,14 @@ */ static int check_device_area(kdev_t dev, sector_t start, sector_t len) { - int *sizes; sector_t dev_size; - if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)])) + dev_size = get_dev_size(dev); + if (!dev_size) { /* we don't know the device details, * so give the benefit of the doubt */ return 1; - - /* convert to 512-byte sectors */ - dev_size <<= 1; + } return ((start < dev_size) && (len <= (dev_size - start))); } diff -Naur linux-2.4.22-dm-1/drivers/md/dm.c linux-2.4.22-new-snapshot/drivers/md/dm.c --- linux-2.4.22-dm-1/drivers/md/dm.c 2003-09-30 11:59:23.000000000 -0500 +++ linux-2.4.22-new-snapshot/drivers/md/dm.c 2003-09-30 12:01:52.000000000 -0500 @@ -369,6 +369,7 @@ xx(dm_linear) xx(dm_stripe) xx(dm_snapshot) + xx(dm_exception) xx(dm_interface) #undef xx }; @@ -456,7 +457,7 @@ unsigned int command, unsigned long a) { kdev_t dev = inode->i_rdev; - long size; + sector_t size; switch (command) { case BLKROSET: diff -Naur linux-2.4.22-dm-1/drivers/md/dm.h linux-2.4.22-new-snapshot/drivers/md/dm.h --- linux-2.4.22-dm-1/drivers/md/dm.h 2003-09-30 11:59:23.000000000 -0500 +++ linux-2.4.22-new-snapshot/drivers/md/dm.h 2003-09-30 12:01:53.000000000 -0500 @@ -19,6 +19,12 @@ #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) +#if 0 +#define DMDEBUG(x...) DMWARN( ## x) +#else +#define DMDEBUG(x...) +#endif + /* * FIXME: I think this should be with the definition of sector_t * in types.h. @@ -128,6 +134,21 @@ /*----------------------------------------------------------------- * Useful inlines. *---------------------------------------------------------------*/ + +/* + * Return the number of sectors in the device. + */ +static inline sector_t get_dev_size(kdev_t dev) +{ + int *sizes; + + sizes = blk_size[MAJOR(dev)]; + if (sizes) + return sizes[MINOR(dev)] << 1; + + return 0; +} + static inline int array_too_big(unsigned long fixed, unsigned long obj, unsigned long num) { @@ -172,4 +193,7 @@ int dm_snapshot_init(void); void dm_snapshot_exit(void); +int dm_exception_init(void); +void dm_exception_exit(void); + #endif
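
One pattern in the patch worth spelling out: snapshot_map() and __origin_write() both look up a completed exception under the read lock, drop it, retake the table lock for writing, and then repeat the lookup before touching the pending table. The second lookup is not redundant; another writer may complete the exception in the window where no lock is held. Reduced to its skeleton (a sketch built from the etable calls this patch introduces; error paths omitted):

	etable_read_lock(et);
	e = _etable_lookup_exception(et, bh);
	etable_read_unlock(et);
	if (e) {
		/* Fast path: the chunk is already remapped. */
		etable_remap_exception(et, e, bh);
		return 0;
	}

	etable_write_lock(et);
	/*
	 * Re-check under the write lock: the exception may have been
	 * completed while we held no lock at all.
	 */
	e = _etable_lookup_exception(et, bh);
	if (!e) {
		/* Still not remapped - safe to create/reuse a pending
		 * exception now, since we hold the write lock. */
		pe = __etable_lookup_pending_exception(et, bh);
		if (!pe)
			pe = __etable_create_pending_exception(et, bh);
	}
	etable_write_unlock(et);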
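
The KMC comment in __origin_write() about pending exceptions completing while the caller still holds a private list of them points at reference counting, and the etable_put_pending_exception() call the patch already makes in snapshot_map() suggests the etable layer is headed that way. A minimal sketch of how the get/put pair could work; the 'count' field, the get helper, and free_pending_exception() are assumptions for illustration, not part of this patch:

	struct pending_exception {
		struct exception e;
		atomic_t count;		/* one ref per table/list holding us */
		/* ... */
	};

	static inline void etable_get_pending_exception(struct pending_exception *pe)
	{
		atomic_inc(&pe->count);
	}

	void etable_put_pending_exception(struct pending_exception *pe)
	{
		/*
		 * Only the last holder frees the pe, so a copy that
		 * completes while __origin_write() still holds its
		 * private list cannot free it out from under the caller.
		 */
		if (atomic_dec_and_test(&pe->count))
			free_pending_exception(pe);
	}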
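
The "KMC: This looks racy!!!" comment is about two origin writers, each holding the write lock of a *different* etable, calling __etable_merge_pending_exceptions() against the same pending exceptions concurrently. One way to close that race, sketched under the assumption that a single global lock is acceptable (pe_merge_lock is hypothetical; the merge function is from this patch):

	static spinlock_t pe_merge_lock = SPIN_LOCK_UNLOCKED;

	static void merge_pending_exceptions(struct pending_exception *pe,
					     struct pending_exception *list)
	{
		unsigned long flags;

		/* Serialize all sibling-list merges behind one lock. */
		spin_lock_irqsave(&pe_merge_lock, flags);
		__etable_merge_pending_exceptions(pe, list);
		spin_unlock_irqrestore(&pe_merge_lock, flags);
	}

A coarse global lock is the simplest fix; a per-origin lock (the lock the KMC comment on struct origin already anticipates) would scale better, since merges only ever involve snapshots of the same origin.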