From 34ce4e7c23e3da578e459b05c6fb17edecb19e6b Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 15 Dec 2009 19:34:17 +0200 Subject: [PATCH 01/11] exofs: debug print even less * Last debug trimming left in some stupid print, remove them. Fixup some other prints * Shift printing from inode.c to ios.c * Add couple of prints when memory allocation fails. Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 23 +++++++++++++---------- fs/exofs/ios.c | 38 ++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 2afbcebeda71..c88a0c5250cb 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -193,7 +193,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock) else good_bytes = pcol->length - resid; - EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx" + EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", pcol->inode->i_ino, _LLU(good_bytes), pcol->length, pcol->nr_pages); @@ -222,7 +222,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock) } pcol_free(pcol); - EXOFS_DBGMSG("readpages_done END\n"); + EXOFS_DBGMSG2("readpages_done END\n"); return ret; } @@ -290,7 +290,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync) atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", + EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", ios->obj.id, _LLU(ios->offset), pcol->length); /* pages ownership was passed to pcol_copy */ @@ -462,7 +462,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p) else good_bytes = pcol->length - resid; - EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx" + EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", pcol->inode->i_ino, _LLU(good_bytes), pcol->length, pcol->nr_pages); @@ -490,7 +490,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p) pcol_free(pcol); kfree(pcol); - EXOFS_DBGMSG("writepages_done END\n"); + EXOFS_DBGMSG2("writepages_done END\n"); } static int write_exec(struct page_collect *pcol) @@ -527,7 +527,7 @@ static int write_exec(struct page_collect *pcol) } atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", + EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), pcol->length); /* pages ownership was passed to pcol_copy */ @@ -616,7 +616,7 @@ try_again: ret = pcol_add_page(pcol, page, len); if (unlikely(ret)) { - EXOFS_DBGMSG("Failed pcol_add_page " + EXOFS_DBGMSG2("Failed pcol_add_page " "nr_pages=%u total_length=0x%lx\n", pcol->nr_pages, pcol->length); @@ -663,7 +663,7 @@ static int exofs_writepages(struct address_space *mapping, if (expected_pages < 32L) expected_pages = 32L; - EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " + EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", mapping->host->i_ino, wbc->range_start, wbc->range_end, mapping->nrpages, start, end, expected_pages); @@ -1170,8 +1170,10 @@ static int exofs_update_inode(struct inode *inode, int do_sync) int ret; args = kzalloc(sizeof(*args), GFP_KERNEL); - if (!args) + if (!args) { + EXOFS_DBGMSG("Faild kzalloc of args\n"); return -ENOMEM; + } fcb = &args->fcb; @@ -1234,7 +1236,8 @@ static int exofs_update_inode(struct inode *inode, int do_sync) free_args: kfree(args); out: - EXOFS_DBGMSG("ret=>%d\n", ret); + EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n", + inode->i_ino, do_sync, ret); return ret; } diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 5bad01fa1f9f..3cc0dd3f0eb2 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -26,6 +26,9 @@ #include "exofs.h" +#define EXOFS_DBGMSG2(M...) do {} while (0) +/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */ + void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) { osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); @@ -73,6 +76,8 @@ int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios) */ ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL); if (unlikely(!ios)) { + EXOFS_DBGMSG("Faild kzalloc bytes=%d\n", + exofs_io_state_size(sbi->s_numdevs)); *pios = NULL; return -ENOMEM; } @@ -276,6 +281,9 @@ int exofs_sbi_write(struct exofs_io_state *ios) bio = bio_kmalloc(GFP_KERNEL, ios->bio->bi_max_vecs); if (unlikely(!bio)) { + EXOFS_DBGMSG( + "Faild to allocate BIO size=%u\n", + ios->bio->bi_max_vecs); ret = -ENOMEM; goto out; } @@ -290,14 +298,21 @@ int exofs_sbi_write(struct exofs_io_state *ios) osd_req_write(or, &ios->obj, ios->offset, bio, ios->length); -/* EXOFS_DBGMSG("write sync=%d\n", sync);*/ + EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(ios->obj.id), _LLU(ios->offset), + _LLU(ios->length), i); } else if (ios->kern_buff) { osd_req_write_kern(or, &ios->obj, ios->offset, ios->kern_buff, ios->length); -/* EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/ + EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(ios->obj.id), _LLU(ios->offset), + _LLU(ios->length), i); } else { osd_req_set_attributes(or, &ios->obj); -/* EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/ + EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", + _LLU(ios->obj.id), ios->out_attr_len, i); } if (ios->out_attr) @@ -335,14 +350,25 @@ int exofs_sbi_read(struct exofs_io_state *ios) if (ios->bio) { osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length); -/* EXOFS_DBGMSG("read sync=%d\n", sync);*/ + EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" + " dev=%d\n", _LLU(ios->obj.id), + _LLU(ios->offset), + _LLU(ios->length), + first_dev); } else if (ios->kern_buff) { osd_req_read_kern(or, &ios->obj, ios->offset, ios->kern_buff, ios->length); -/* EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/ + EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(ios->obj.id), + _LLU(ios->offset), + _LLU(ios->length), + first_dev); } else { osd_req_get_attributes(or, &ios->obj); -/* EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/ + EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", + _LLU(ios->obj.id), ios->in_attr_len, + first_dev); } if (ios->out_attr) From 518f167a37b3c53f3cf44d27800455ca24e920f6 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 21 Jan 2010 20:00:02 +0200 Subject: [PATCH 02/11] exofs: Micro-optimize exofs_i_info optimize the exofs_i_info struct usage by moving the embedded vfs_inode to be first. A compiler might optimize away an "add" operation with constant zero. (Which it cannot with other constants) Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index c35fd4623986..13663da2b119 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -78,13 +78,13 @@ struct exofs_sb_info { * our extension to the in-memory inode */ struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ unsigned long i_flags; /* various atomic flags */ uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ uint32_t i_dir_start_lookup; /* which page to start lookup */ - wait_queue_head_t i_wq; /* wait queue for inode */ uint64_t i_commit_size; /* the object's written length */ uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ - struct inode vfs_inode; /* normal in-memory inode */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) From 22ddc556380cf5645c52292b6d980766646eb864 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 19 Jan 2010 19:24:45 +0200 Subject: [PATCH 03/11] exofs: Recover in the case of read-passed-end-of-file In check_io, implement the case of reading passed end of file, by clearing the pages and recover with no error. In a raid arrangement this can become a legitimate situation in case of holes in the file. Signed-off-by: Boaz Harrosh --- fs/exofs/ios.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 3cc0dd3f0eb2..439c5d097b27 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -173,6 +173,21 @@ static int exofs_io_execute(struct exofs_io_state *ios) return ret; } +static void _clear_bio(struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + __bio_for_each_segment(bv, bio, i, 0) { + unsigned this_count = bv->bv_len; + + if (likely(PAGE_SIZE == this_count)) + clear_highpage(bv->bv_page); + else + zero_user(bv->bv_page, bv->bv_offset, this_count); + } +} + int exofs_check_io(struct exofs_io_state *ios, u64 *resid) { enum osd_err_priority acumulated_osd_err = 0; @@ -181,16 +196,25 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) for (i = 0; i < ios->numdevs; i++) { struct osd_sense_info osi; - int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi); + struct osd_request *or = ios->per_dev[i].or; + int ret; + if (unlikely(!or)) + continue; + + ret = osd_req_decode_sense(or, &osi); if (likely(!ret)) continue; - if (unlikely(ret == -EFAULT)) { - EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__); - /*FIXME: All the pages in this device range should: - * clear_highpage(page); - */ + if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { + /* start read offset passed endof file */ + _clear_bio(ios->per_dev[i].bio); + EXOFS_DBGMSG("start read offset passed end of file " + "offset=0x%llx, length=0x%llx\n", + _LLU(ios->offset), + _LLU(ios->length)); + + continue; /* we recovered */ } if (osi.osd_err_pri >= acumulated_osd_err) { From 45d3abcb1a7388b2b97582e13bf9dd21784dcaa5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 28 Jan 2010 11:46:16 +0200 Subject: [PATCH 04/11] exofs: Move layout related members to a layout structure * Abstract away those members in exofs_sb_info that are related/needed by a layout into a new exofs_layout structure. Embed it in exofs_sb_info. * At exofs_io_state receive/keep a pointer to an exofs_layout. No need for an exofs_sb_info pointer, all we need is at exofs_layout. * Change any usage of above exofs_sb_info members to their new name. Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 24 +++++++++++++++++------ fs/exofs/inode.c | 14 ++++++------- fs/exofs/ios.c | 36 ++++++++++++++++++---------------- fs/exofs/super.c | 51 +++++++++++++++++++++++++----------------------- 4 files changed, 71 insertions(+), 54 deletions(-) diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 13663da2b119..33c68568b338 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -55,12 +55,18 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) +struct exofs_layout { + osd_id s_pid; /* partition ID of file system*/ + + unsigned s_numdevs; /* Num of devices in array */ + struct osd_dev *s_ods[0]; /* Variable length */ +}; + /* * our extension to the in-memory superblock */ struct exofs_sb_info { struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ - osd_id s_pid; /* partition ID of file system*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ uint32_t s_numfiles; /* number of files on fs */ @@ -69,9 +75,14 @@ struct exofs_sb_info { atomic_t s_curr_pending; /* number of pending commands */ uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ - struct pnfs_osd_data_map data_map; /* Default raid to use */ - unsigned s_numdevs; /* Num of devices in array */ - struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */ + struct pnfs_osd_data_map data_map; /* Default raid to use + * FIXME: Needed ? + */ +/* struct exofs_layout dir_layout;*/ /* Default dir layout */ + struct exofs_layout layout; /* Default files layout, + * contains the variable osd_dev + * array. Keep last */ + struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; /* @@ -101,7 +112,7 @@ struct exofs_io_state { void *private; exofs_io_done_fn done; - struct exofs_sb_info *sbi; + struct exofs_layout *layout; struct osd_obj_id obj; u8 *cred; @@ -189,7 +200,8 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, u64 offset, void *p, unsigned length); -int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios); +int exofs_get_io_state(struct exofs_layout *layout, + struct exofs_io_state **ios); void exofs_put_io_state(struct exofs_io_state *ios); int exofs_check_io(struct exofs_io_state *ios, u64 *resid); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index c88a0c5250cb..03189a958b33 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -65,7 +65,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, /* Create master bios on first Q, later on cloning, each clone will be * allocated on it's destination Q */ - pcol->req_q = osd_request_queue(sbi->s_ods[0]); + pcol->req_q = osd_request_queue(sbi->layout.s_ods[0]); pcol->inode = inode; pcol->expected_pages = expected_pages; @@ -99,7 +99,7 @@ static int pcol_try_alloc(struct page_collect *pcol) BIO_MAX_PAGES_KMALLOC); if (!pcol->ios) { /* First time allocate io_state */ - int ret = exofs_get_io_state(pcol->sbi, &pcol->ios); + int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); if (ret) return ret; @@ -872,7 +872,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, int ret; *obj_size = ~0; - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); return ret; @@ -1043,7 +1043,7 @@ static void create_done(struct exofs_io_state *ios, void *p) if (unlikely(ret)) { EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", - _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid)); + _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); /*TODO: When FS is corrupted creation can fail, object already * exist. Get rid of this asynchronous creation, if exist * increment the obj counter and try the next object. Until we @@ -1104,7 +1104,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) mark_inode_dirty(inode); - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); return ERR_PTR(ret); @@ -1202,7 +1202,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); goto free_args; @@ -1286,7 +1286,7 @@ void exofs_delete_inode(struct inode *inode) clear_inode(inode); - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); return; diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 439c5d097b27..83e54a77b992 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -67,23 +67,24 @@ out: return ret; } -int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios) +int exofs_get_io_state(struct exofs_layout *layout, + struct exofs_io_state **pios) { struct exofs_io_state *ios; /*TODO: Maybe use kmem_cach per sbi of size - * exofs_io_state_size(sbi->s_numdevs) + * exofs_io_state_size(layout->s_numdevs) */ - ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL); + ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); if (unlikely(!ios)) { EXOFS_DBGMSG("Faild kzalloc bytes=%d\n", - exofs_io_state_size(sbi->s_numdevs)); + exofs_io_state_size(layout->s_numdevs)); *pios = NULL; return -ENOMEM; } - ios->sbi = sbi; - ios->obj.partition = sbi->s_pid; + ios->layout = layout; + ios->obj.partition = layout->s_pid; *pios = ios; return 0; } @@ -238,10 +239,10 @@ int exofs_sbi_create(struct exofs_io_state *ios) { int i, ret; - for (i = 0; i < ios->sbi->s_numdevs; i++) { + for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -262,10 +263,10 @@ int exofs_sbi_remove(struct exofs_io_state *ios) { int i, ret; - for (i = 0; i < ios->sbi->s_numdevs; i++) { + for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -286,10 +287,10 @@ int exofs_sbi_write(struct exofs_io_state *ios) { int i, ret; - for (i = 0; i < ios->sbi->s_numdevs; i++) { + for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -361,8 +362,9 @@ int exofs_sbi_read(struct exofs_io_state *ios) struct osd_request *or; unsigned first_dev = (unsigned)ios->obj.id; - first_dev %= ios->sbi->s_numdevs; - or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL); + first_dev %= ios->layout->s_numdevs; + or = osd_start_request(ios->layout->s_ods[first_dev], + GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -438,7 +440,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) __be64 newsize; int i, ret; - if (exofs_get_io_state(sbi, &ios)) + if (exofs_get_io_state(&sbi->layout, &ios)) return -ENOMEM; ios->obj.id = exofs_oi_objno(oi); @@ -448,10 +450,10 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) attr = g_attr_logical_length; attr.val_ptr = &newsize; - for (i = 0; i < sbi->s_numdevs; i++) { + for (i = 0; i < sbi->layout.s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(sbi->s_ods[i], GFP_KERNEL); + or = osd_start_request(sbi->layout.s_ods[i], GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index a1d1e77b12eb..fc8875186ae8 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -210,7 +210,7 @@ int exofs_sync_fs(struct super_block *sb, int wait) sbi = sb->s_fs_info; fscb = &sbi->s_fscb; - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (ret) goto out; @@ -264,12 +264,12 @@ static void _exofs_print_device(const char *msg, const char *dev_path, void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->s_numdevs) { - int i = --sbi->s_numdevs; - struct osd_dev *od = sbi->s_ods[i]; + while (sbi->layout.s_numdevs) { + int i = --sbi->layout.s_numdevs; + struct osd_dev *od = sbi->layout.s_ods[i]; if (od) { - sbi->s_ods[i] = NULL; + sbi->layout.s_ods[i] = NULL; osduld_put_device(od); } } @@ -298,7 +298,8 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid); + _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], + sbi->layout.s_pid); exofs_free_sbi(sbi); sb->s_fs_info = NULL; @@ -361,7 +362,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, { struct exofs_sb_info *sbi = *psbi; struct osd_dev *fscb_od; - struct osd_obj_id obj = {.partition = sbi->s_pid, + struct osd_obj_id obj = {.partition = sbi->layout.s_pid, .id = EXOFS_DEVTABLE_ID}; struct exofs_device_table *dt; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + @@ -376,9 +377,9 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, return -ENOMEM; } - fscb_od = sbi->s_ods[0]; - sbi->s_ods[0] = NULL; - sbi->s_numdevs = 0; + fscb_od = sbi->layout.s_ods[0]; + sbi->layout.s_ods[0] = NULL; + sbi->layout.s_numdevs = 0; ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); if (unlikely(ret)) { EXOFS_ERR("ERROR: reading device table\n"); @@ -397,14 +398,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, goto out; if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->s_ods[0]); + unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); if (unlikely(!sbi)) { ret = -ENOMEM; goto out; } - memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0])); + memset(&sbi->layout.s_ods[1], 0, + size - sizeof(sbi->layout.s_ods[0])); *psbi = sbi; } @@ -427,8 +429,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->s_ods[i] = fscb_od; - ++sbi->s_numdevs; + sbi->layout.s_ods[i] = fscb_od; + ++sbi->layout.s_numdevs; fscb_od = NULL; continue; } @@ -441,8 +443,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, goto out; } - sbi->s_ods[i] = od; - ++sbi->s_numdevs; + sbi->layout.s_ods[i] = od; + ++sbi->layout.s_numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. @@ -499,9 +501,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - sbi->s_ods[0] = od; - sbi->s_numdevs = 1; - sbi->s_pid = opts->pid; + /* Default layout in case we do not have a device-table */ + sbi->layout.s_ods[0] = od; + sbi->layout.s_numdevs = 1; + sbi->layout.s_pid = opts->pid; sbi->s_timeout = opts->timeout; /* fill in some other data by hand */ @@ -514,7 +517,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sb->s_bdev = NULL; sb->s_dev = 0; - obj.partition = sbi->s_pid; + obj.partition = sbi->layout.s_pid; obj.id = EXOFS_SUPER_ID; exofs_make_credential(sbi->s_cred, &obj); @@ -578,13 +581,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0], - sbi->s_pid); + _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], + sbi->layout.s_pid); return 0; free_sbi: EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", - opts->dev_name, sbi->s_pid, ret); + opts->dev_name, sbi->layout.s_pid, ret); exofs_free_sbi(sbi); return ret; } @@ -627,7 +630,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) uint8_t cred_a[OSD_CAP_LEN]; int ret; - ret = exofs_get_io_state(sbi, &ios); + ret = exofs_get_io_state(&sbi->layout, &ios); if (ret) { EXOFS_DBGMSG("exofs_get_io_state failed.\n"); return ret; From 46f4d973f6874c06b7a41a3bf8f4c1717d90f97a Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 1 Feb 2010 11:37:30 +0200 Subject: [PATCH 05/11] exofs: unindent exofs_sbi_read The original idea was that a mirror read can be sub-divided to multiple devices. But this has very little gain and only at very large IOes so it's not going to be implemented soon. Signed-off-by: Boaz Harrosh --- fs/exofs/ios.c | 89 ++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 50 deletions(-) diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 83e54a77b992..4f679317ca54 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -356,59 +356,48 @@ out: int exofs_sbi_read(struct exofs_io_state *ios) { - int i, ret; + struct osd_request *or; + struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + unsigned first_dev = (unsigned)ios->obj.id; - for (i = 0; i < 1; i++) { - struct osd_request *or; - unsigned first_dev = (unsigned)ios->obj.id; - - first_dev %= ios->layout->s_numdevs; - or = osd_start_request(ios->layout->s_ods[first_dev], - GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; - - if (ios->bio) { - osd_req_read(or, &ios->obj, ios->offset, ios->bio, - ios->length); - EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d\n", _LLU(ios->obj.id), - _LLU(ios->offset), - _LLU(ios->length), - first_dev); - } else if (ios->kern_buff) { - osd_req_read_kern(or, &ios->obj, ios->offset, - ios->kern_buff, ios->length); - EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), - _LLU(ios->offset), - _LLU(ios->length), - first_dev); - } else { - osd_req_get_attributes(or, &ios->obj); - EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", - _LLU(ios->obj.id), ios->in_attr_len, - first_dev); - } - - if (ios->out_attr) - osd_req_add_set_attr_list(or, ios->out_attr, - ios->out_attr_len); - - if (ios->in_attr) - osd_req_add_get_attr_list(or, ios->in_attr, - ios->in_attr_len); + first_dev %= ios->layout->s_numdevs; + or = osd_start_request(ios->layout->s_ods[first_dev], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; } - ret = exofs_io_execute(ios); + per_dev->or = or; + ios->numdevs++; -out: - return ret; + if (ios->bio) { + osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length); + EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" + " dev=%d\n", _LLU(ios->obj.id), + _LLU(ios->offset), _LLU(ios->length), + first_dev); + } else if (ios->kern_buff) { + int ret = osd_req_read_kern(or, &ios->obj, ios->offset, + ios->kern_buff, ios->length); + + EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d ret=>%d\n", + _LLU(ios->obj.id), _LLU(ios->offset), + _LLU(ios->length), first_dev, ret); + if (unlikely(ret)) + return ret; + } else { + osd_req_get_attributes(or, &ios->obj); + EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", + _LLU(ios->obj.id), ios->in_attr_len, first_dev); + } + + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); + + return exofs_io_execute(ios); } int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) From d9c740d2253e75db8cef8f87a3125c450f3ebd82 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 28 Jan 2010 11:58:08 +0200 Subject: [PATCH 06/11] exofs: Define on-disk per-inode optional layout attribute * Layouts describe the way a file is spread on multiple devices. The layout information is stored in the objects attribute introduced in this patch. * There can be multiple generating function for the layout. Currently defined: - No attribute present - use below moving-window on global device table, all devices. (This is the only one currently used in exofs) - an obj_id generated moving window - the obj_id is a randomizing factor in the otherwise global map layout. - An explicit layout stored, including a data_map and a device index list. - More might be defined in future ... * There are two attributes defined of the same structure: A-data-files-layout - This layout is used by data-files. If present at a directory, all files of that directory will be created with this layout. A-meta-data-layout - This layout is used by a directory and other meta-data information. Also inherited at creation of subdirectories. * At creation time inodes are created with the layout specified above. A usermode utility may change the creation layout on a give directory or file. Which in the case of directories, will also apply to newly created files/subdirectories, children of that directory. In the simple unaltered case of a newly created exofs, no layout attributes are present, and all layouts adhere to the layout specified at the device-table. * In case of a future file system loaded in an old exofs-driver. At iget(), the generating_function is inspected and if not supported will return an IO error to the application and the inode will not be loaded. So not to damage any data. Note: After this patch we do not yet support any type of layout only the RAID0 patch that enables striping at the super-block level will add support for RAID0 layouts above. This way we are past and future compatible and fully bisectable. * Access to the device table is done by an accessor since it will change according to above information. Signed-off-by: Boaz Harrosh --- fs/exofs/common.h | 39 +++++++++++++++++++++++++++++++++ fs/exofs/exofs.h | 6 +++++ fs/exofs/inode.c | 56 ++++++++++++++++++++++++++++++++++++++++++----- fs/exofs/ios.c | 23 ++++++++++++++----- 4 files changed, 114 insertions(+), 10 deletions(-) diff --git a/fs/exofs/common.h b/fs/exofs/common.h index b1b178e61718..f0d520312d8b 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -55,6 +55,8 @@ /* exofs Application specific page/attribute */ # define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) # define EXOFS_ATTR_INODE_DATA 1 +# define EXOFS_ATTR_INODE_FILE_LAYOUT 2 +# define EXOFS_ATTR_INODE_DIR_LAYOUT 3 /* * The maximum number of files we can have is limited by the size of the @@ -206,4 +208,41 @@ enum { (((name_len) + offsetof(struct exofs_dir_entry, name) + \ EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) +/* + * The on-disk (optional) layout structure. + * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT + * attribute, attached to any inode, usually to a directory. + */ + +enum exofs_inode_layout_gen_functions { + LAYOUT_MOVING_WINDOW = 0, + LAYOUT_IMPLICT = 1, +}; + +struct exofs_on_disk_inode_layout { + __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */ + __le16 pad; + union { + /* gen_func == LAYOUT_MOVING_WINDOW (default) */ + struct exofs_layout_sliding_window { + __le32 num_devices; /* first n devices in global-table*/ + } sliding_window __packed; + + /* gen_func == LAYOUT_IMPLICT */ + struct exofs_layout_implict_list { + struct exofs_dt_data_map data_map; + /* Variable array of size data_map.cb_num_comps. These + * are device indexes of the devices in the global table + */ + __le32 dev_indexes[]; + } implict __packed; + }; +} __packed; + +static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs) +{ + return sizeof(struct exofs_on_disk_inode_layout) + + max_devs * sizeof(__le32); +} + #endif /*ifndef __EXOFS_COM_H__*/ diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 33c68568b338..09e331935514 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -185,6 +185,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode) return container_of(inode, struct exofs_i_info, vfs_inode); } +/* + * Given a layout, object_number and stripe_index return the associated global + * dev_index + */ +unsigned exofs_layout_od_id(struct exofs_layout *layout, + osd_id obj_no, unsigned layout_index); /* * Maximum count of links to a file */ diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 03189a958b33..0163546ba05a 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -859,6 +859,15 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) return error; } +static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_FILE_LAYOUT, + 0); +static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( + EXOFS_APAGE_FS_DATA, + EXOFS_ATTR_INODE_DIR_LAYOUT, + 0); + /* * Read an inode from the OSD, and return it as is. We also return the size * attribute in the 'obj_size' argument. @@ -867,11 +876,16 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, struct exofs_fcb *inode, uint64_t *obj_size) { struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_attr attrs[2]; + struct osd_attr attrs[] = { + [0] = g_attr_inode_data, + [1] = g_attr_inode_file_layout, + [2] = g_attr_inode_dir_layout, + [3] = g_attr_logical_length, + }; struct exofs_io_state *ios; + struct exofs_on_disk_inode_layout *layout; int ret; - *obj_size = ~0; ret = exofs_get_io_state(&sbi->layout, &ios); if (unlikely(ret)) { EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); @@ -882,8 +896,9 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, exofs_make_credential(oi->i_cred, &ios->obj); ios->cred = oi->i_cred; - attrs[0] = g_attr_inode_data; - attrs[1] = g_attr_logical_length; + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); @@ -900,12 +915,43 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); ret = extract_attr_from_ios(ios, &attrs[1]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + if (attrs[1].len) { + layout = attrs[1].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported files layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + + ret = extract_attr_from_ios(ios, &attrs[2]); + if (ret) { + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + goto out; + } + if (attrs[2].len) { + layout = attrs[2].val_ptr; + if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { + EXOFS_ERR("%s: unsupported meta-data layout %d\n", + __func__, layout->gen_func); + ret = -ENOTSUPP; + goto out; + } + } + + *obj_size = ~0; + ret = extract_attr_from_ios(ios, &attrs[3]); if (ret) { EXOFS_ERR("%s: extract_attr of logical_length failed\n", __func__); goto out; } - *obj_size = get_unaligned_be64(attrs[1].val_ptr); + *obj_size = get_unaligned_be64(attrs[3].val_ptr); out: exofs_put_io_state(ios); diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 4f679317ca54..2b81f99fd62c 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -107,6 +107,19 @@ void exofs_put_io_state(struct exofs_io_state *ios) } } +unsigned exofs_layout_od_id(struct exofs_layout *layout, + osd_id obj_no, unsigned layout_index) +{ + return layout_index; +} + +static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, + unsigned layout_index) +{ + return ios->layout->s_ods[ + exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)]; +} + static void _sync_done(struct exofs_io_state *ios, void *p) { struct completion *waiting = p; @@ -242,7 +255,7 @@ int exofs_sbi_create(struct exofs_io_state *ios) for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -266,7 +279,7 @@ int exofs_sbi_remove(struct exofs_io_state *ios) for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -290,7 +303,7 @@ int exofs_sbi_write(struct exofs_io_state *ios) for (i = 0; i < ios->layout->s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -361,7 +374,7 @@ int exofs_sbi_read(struct exofs_io_state *ios) unsigned first_dev = (unsigned)ios->obj.id; first_dev %= ios->layout->s_numdevs; - or = osd_start_request(ios->layout->s_ods[first_dev], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; @@ -442,7 +455,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) for (i = 0; i < sbi->layout.s_numdevs; i++) { struct osd_request *or; - or = osd_start_request(sbi->layout.s_ods[i], GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; From 5d952b8391692553c31e620a92d6e09262a9a307 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 1 Feb 2010 13:35:51 +0200 Subject: [PATCH 07/11] exofs: RAID0 support We now support striping over mirror devices. Including variable sized stripe_unit. Some limits: * stripe_unit must be a multiple of PAGE_SIZE * stripe_unit * stripe_count is maximum upto 32-bit (4Gb) Tested RAID0 over mirrors, RAID0 only, mirrors only. All check. Design notes: * I'm not using a vectored raid-engine mechanism yet. Following the pnfs-objects-layout data-map structure, "Mirror" is just a private case of "group_width" == 1, and RAID0 is a private case of "Mirrors" == 1. The performance lose of the general case over the particular special case optimization is totally negligible, also considering the extra code size. * In general I added a prepare_stripes() stage that divides the to-be-io pages to the participating devices, the previous exofs_ios_write/read, now becomes _write/read_mirrors and a new write/read upper layer loops on all devices calling _write/read_mirrors. Effectively the prepare_stripes stage is the all secret. Also truncate need fixing to accommodate for striping. * In a RAID0 arrangement, in a regular usage scenario, if all inode layouts will start at the same device, the small files fill up the first device and the later devices stay empty, the farther the device the emptier it is. To fix that, each inode will start at a different stripe_unit, according to it's obj_id modulus number-of-stripe-units. And will then span all stripe-units in the same incrementing order wrapping back to the beginning of the device table. We call it a stripe-units moving window. Special consideration was taken to keep all devices in a mirror arrangement identical. So a broken osd-device could just be cloned from one of the mirrors and no FS scrubbing is needed. (We do that by rotating stripe-unit at a time and not a single device at a time.) TODO: We no longer verify object_length == inode->i_size in exofs_iget. (since i_size is stripped on multiple objects now). I should introduce a multiple-device attribute reading, and use it in exofs_iget. Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 11 ++ fs/exofs/inode.c | 26 +--- fs/exofs/ios.c | 327 +++++++++++++++++++++++++++++++++++++++-------- fs/exofs/super.c | 52 +++++++- 4 files changed, 333 insertions(+), 83 deletions(-) diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 09e331935514..0d8a34b21ae1 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -58,6 +58,14 @@ struct exofs_layout { osd_id s_pid; /* partition ID of file system*/ + /* Our way of looking at the data_map */ + unsigned stripe_unit; + unsigned mirrors_p1; + + unsigned group_width; + + enum exofs_inode_layout_gen_functions lay_func; + unsigned s_numdevs; /* Num of devices in array */ struct osd_dev *s_ods[0]; /* Variable length */ }; @@ -133,6 +141,9 @@ struct exofs_io_state { struct exofs_per_dev_state { struct osd_request *or; struct bio *bio; + loff_t offset; + unsigned length; + unsigned dev; } per_dev[]; }; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 0163546ba05a..2b3163ea56eb 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -869,18 +869,17 @@ static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( 0); /* - * Read an inode from the OSD, and return it as is. We also return the size - * attribute in the 'obj_size' argument. + * Read the Linux inode info from the OSD, and return it as is. In exofs the + * inode info is in an application specific page/attribute of the osd-object. */ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, - struct exofs_fcb *inode, uint64_t *obj_size) + struct exofs_fcb *inode) { struct exofs_sb_info *sbi = sb->s_fs_info; struct osd_attr attrs[] = { [0] = g_attr_inode_data, [1] = g_attr_inode_file_layout, [2] = g_attr_inode_dir_layout, - [3] = g_attr_logical_length, }; struct exofs_io_state *ios; struct exofs_on_disk_inode_layout *layout; @@ -944,15 +943,6 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, } } - *obj_size = ~0; - ret = extract_attr_from_ios(ios, &attrs[3]); - if (ret) { - EXOFS_ERR("%s: extract_attr of logical_length failed\n", - __func__); - goto out; - } - *obj_size = get_unaligned_be64(attrs[3].val_ptr); - out: exofs_put_io_state(ios); return ret; @@ -971,7 +961,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) struct exofs_i_info *oi; struct exofs_fcb fcb; struct inode *inode; - uint64_t obj_size; int ret; inode = iget_locked(sb, ino); @@ -983,7 +972,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) __oi_init(oi); /* read the inode from the osd */ - ret = exofs_get_inode(sb, oi, &fcb, &obj_size); + ret = exofs_get_inode(sb, oi, &fcb); if (ret) goto bad_inode; @@ -1004,13 +993,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_blkbits = EXOFS_BLKSHIFT; inode->i_generation = le32_to_cpu(fcb.i_generation); - if ((inode->i_size != obj_size) && - (!exofs_inode_is_fast_symlink(inode))) { - EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n", - inode->i_size, _LLU(obj_size)); - /* FIXME: call exofs_inode_recovery() */ - } - oi->i_dir_start_lookup = 0; if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 2b81f99fd62c..6e446b2670b9 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -23,6 +23,7 @@ */ #include +#include #include "exofs.h" @@ -110,7 +111,17 @@ void exofs_put_io_state(struct exofs_io_state *ios) unsigned exofs_layout_od_id(struct exofs_layout *layout, osd_id obj_no, unsigned layout_index) { - return layout_index; +/* switch (layout->lay_func) { + case LAYOUT_MOVING_WINDOW: + {*/ + unsigned dev_mod = obj_no; + + return (layout_index + dev_mod * layout->mirrors_p1) % + layout->s_numdevs; +/* } + case LAYOUT_FUNC_IMPLICT: + return layout->devs[layout_index]; + }*/ } static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, @@ -225,8 +236,8 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) _clear_bio(ios->per_dev[i].bio); EXOFS_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", - _LLU(ios->offset), - _LLU(ios->length)); + _LLU(ios->per_dev[i].offset), + _LLU(ios->per_dev[i].length)); continue; /* we recovered */ } @@ -248,6 +259,127 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) return acumulated_lin_err; } +/* REMOVEME: After review + Some quoteing from the standard + + L = logical offset into the file + W = number of data components in a stripe + S = W * stripe_unit (S is Stripe length) + N = L / S (N is the stripe Number) + C = (L-(N*S)) / stripe_unit (C is the component) + O = (N*stripe_unit)+(L%stripe_unit) (O is the object's offset) +*/ + +static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset, + u64 *obj_offset, unsigned *dev, unsigned *unit_off) +{ + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned stripe_length = stripe_unit * ios->layout->group_width; + u64 stripe_no = file_offset; + unsigned stripe_mod = do_div(stripe_no, stripe_length); + + *unit_off = stripe_mod % stripe_unit; + *obj_offset = stripe_no * stripe_unit + *unit_off; + *dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1; +} + +static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, + struct exofs_per_dev_state *per_dev, int cur_len) +{ + unsigned bv = *cur_bvec; + struct request_queue *q = + osd_request_queue(exofs_ios_od(ios, per_dev->dev)); + + per_dev->length += cur_len; + + if (per_dev->bio == NULL) { + unsigned pages_in_stripe = ios->layout->group_width * + (ios->layout->stripe_unit / PAGE_SIZE); + unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) / + ios->layout->group_width; + + per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); + if (unlikely(!per_dev->bio)) { + EXOFS_DBGMSG("Faild to allocate BIO size=%u\n", + bio_size); + return -ENOMEM; + } + } + + while (cur_len > 0) { + int added_len; + struct bio_vec *bvec = &ios->bio->bi_io_vec[bv]; + + BUG_ON(ios->bio->bi_vcnt <= bv); + cur_len -= bvec->bv_len; + + added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page, + bvec->bv_len, bvec->bv_offset); + if (unlikely(bvec->bv_len != added_len)) + return -ENOMEM; + ++bv; + } + BUG_ON(cur_len); + + *cur_bvec = bv; + return 0; +} + +static int _prepare_for_striping(struct exofs_io_state *ios) +{ + u64 length = ios->length; + u64 offset = ios->offset; + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned comp = 0; + unsigned stripes = 0; + unsigned cur_bvec = 0; + int ret; + + if (!ios->bio) { + if (ios->kern_buff) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + unsigned unit_off; + + _offset_dev_unit_off(ios, offset, &per_dev->offset, + &per_dev->dev, &unit_off); + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (unit_off + length > stripe_unit)); + } + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + while (length) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; + unsigned cur_len; + + if (!per_dev->length) { + unsigned unit_off; + + _offset_dev_unit_off(ios, offset, &per_dev->offset, + &per_dev->dev, &unit_off); + stripes++; + cur_len = min_t(u64, stripe_unit - unit_off, length); + offset += cur_len; + } else { + cur_len = min_t(u64, stripe_unit, length); + } + + ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len); + if (unlikely(ret)) + goto out; + + comp += ios->layout->mirrors_p1; + comp %= ios->layout->s_numdevs; + + length -= cur_len; + } +out: + ios->numdevs = stripes * ios->layout->mirrors_p1; + return ret; +} + int exofs_sbi_create(struct exofs_io_state *ios) { int i, ret; @@ -296,61 +428,71 @@ out: return ret; } -int exofs_sbi_write(struct exofs_io_state *ios) +static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) { - int i, ret; + struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret = 0; - for (i = 0; i < ios->layout->s_numdevs; i++) { + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; goto out; } - ios->per_dev[i].or = or; - ios->numdevs++; + per_dev->or = or; + per_dev->offset = master_dev->offset; if (ios->bio) { struct bio *bio; - if (i != 0) { + if (per_dev != master_dev) { bio = bio_kmalloc(GFP_KERNEL, - ios->bio->bi_max_vecs); + master_dev->bio->bi_max_vecs); if (unlikely(!bio)) { EXOFS_DBGMSG( "Faild to allocate BIO size=%u\n", - ios->bio->bi_max_vecs); + master_dev->bio->bi_max_vecs); ret = -ENOMEM; goto out; } - __bio_clone(bio, ios->bio); + __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; - ios->per_dev[i].bio = bio; + per_dev->length = master_dev->length; + per_dev->bio = bio; + per_dev->dev = dev; } else { - bio = ios->bio; + bio = master_dev->bio; + /* FIXME: bio_set_dir() */ + bio->bi_rw |= (1 << BIO_RW); } - osd_req_write(or, &ios->obj, ios->offset, bio, - ios->length); + osd_req_write(or, &ios->obj, per_dev->offset, bio, + per_dev->length); EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(ios->offset), - _LLU(ios->length), i); + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(per_dev->length), dev); } else if (ios->kern_buff) { - osd_req_write_kern(or, &ios->obj, ios->offset, + ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, ios->kern_buff, ios->length); + if (unlikely(ret)) + goto out; EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(ios->offset), - _LLU(ios->length), i); + _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(ios->length), dev); } else { osd_req_set_attributes(or, &ios->obj); EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", - _LLU(ios->obj.id), ios->out_attr_len, i); + _LLU(ios->obj.id), ios->out_attr_len, dev); } if (ios->out_attr) @@ -361,40 +503,57 @@ int exofs_sbi_write(struct exofs_io_state *ios) osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); } - ret = exofs_io_execute(ios); out: return ret; } -int exofs_sbi_read(struct exofs_io_state *ios) +int exofs_sbi_write(struct exofs_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _sbi_write_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = exofs_io_execute(ios); + return ret; +} + +static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) { struct osd_request *or; - struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; unsigned first_dev = (unsigned)ios->obj.id; - first_dev %= ios->layout->s_numdevs; + first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); if (unlikely(!or)) { EXOFS_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; } per_dev->or = or; - ios->numdevs++; if (ios->bio) { - osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length); + osd_req_read(or, &ios->obj, per_dev->offset, + per_dev->bio, per_dev->length); EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" " dev=%d\n", _LLU(ios->obj.id), - _LLU(ios->offset), _LLU(ios->length), + _LLU(per_dev->offset), _LLU(per_dev->length), first_dev); } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, &ios->obj, ios->offset, + int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, ios->kern_buff, ios->length); - EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " "length=0x%llx dev=%d ret=>%d\n", - _LLU(ios->obj.id), _LLU(ios->offset), + _LLU(ios->obj.id), _LLU(per_dev->offset), _LLU(ios->length), first_dev, ret); if (unlikely(ret)) return ret; @@ -403,14 +562,32 @@ int exofs_sbi_read(struct exofs_io_state *ios) EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", _LLU(ios->obj.id), ios->in_attr_len, first_dev); } - if (ios->out_attr) osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); if (ios->in_attr) osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); - return exofs_io_execute(ios); + return 0; +} + +int exofs_sbi_read(struct exofs_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _sbi_read_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = exofs_io_execute(ios); + return ret; } int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) @@ -434,42 +611,84 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) return -EIO; } +static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, + struct osd_attr *attr) +{ + int last_comp = cur_comp + ios->layout->mirrors_p1; + + for (; cur_comp < last_comp; ++cur_comp) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + osd_req_set_attributes(or, &ios->obj); + osd_req_add_set_attr_list(or, attr, 1); + } + + return 0; +} + int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) { struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; struct exofs_io_state *ios; - struct osd_attr attr; - __be64 newsize; + struct exofs_trunc_attr { + struct osd_attr attr; + __be64 newsize; + } *size_attrs; + u64 this_obj_size; + unsigned dev; + unsigned unit_off; int i, ret; - if (exofs_get_io_state(&sbi->layout, &ios)) - return -ENOMEM; + ret = exofs_get_io_state(&sbi->layout, &ios); + if (unlikely(ret)) + return ret; + + size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), + GFP_KERNEL); + if (unlikely(!size_attrs)) { + ret = -ENOMEM; + goto out; + } ios->obj.id = exofs_oi_objno(oi); ios->cred = oi->i_cred; - newsize = cpu_to_be64(size); - attr = g_attr_logical_length; - attr.val_ptr = &newsize; + ios->numdevs = ios->layout->s_numdevs; + _offset_dev_unit_off(ios, size, &this_obj_size, &dev, &unit_off); - for (i = 0; i < sbi->layout.s_numdevs; i++) { - struct osd_request *or; + for (i = 0; i < ios->layout->group_width; ++i) { + struct exofs_trunc_attr *size_attr = &size_attrs[i]; + u64 obj_size; - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; + if (i < dev) + obj_size = this_obj_size + + ios->layout->stripe_unit - unit_off; + else if (i == dev) + obj_size = this_obj_size; + else /* i > dev */ + obj_size = this_obj_size - unit_off; + + size_attr->newsize = cpu_to_be64(obj_size); + size_attr->attr = g_attr_logical_length; + size_attr->attr.val_ptr = &size_attr->newsize; + + ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, + &size_attr->attr); + if (unlikely(ret)) goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; - - osd_req_set_attributes(or, &ios->obj); - osd_req_add_set_attr_list(or, &attr, 1); } ret = exofs_io_execute(ios); out: + kfree(size_attrs); exofs_put_io_state(ios); return ret; } diff --git a/fs/exofs/super.c b/fs/exofs/super.c index fc8875186ae8..8f4e4b37a578 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -308,6 +308,8 @@ static void exofs_put_super(struct super_block *sb) static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_device_table *dt) { + u64 stripe_length; + sbi->data_map.odm_num_comps = le32_to_cpu(dt->dt_data_map.cb_num_comps); sbi->data_map.odm_stripe_unit = @@ -321,14 +323,47 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->data_map.odm_raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Hard coded mirror only for now. if not so do not mount */ - if ((sbi->data_map.odm_num_comps != numdevs) || - (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) || - (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) || - (sbi->data_map.odm_mirror_cnt != (numdevs - 1))) +/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */ + if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) { + EXOFS_ERR("Group width/depth not supported\n"); return -EINVAL; - else - return 0; + } + if (sbi->data_map.odm_num_comps != numdevs) { + EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", + sbi->data_map.odm_num_comps, numdevs); + return -EINVAL; + } + if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { + EXOFS_ERR("Only RAID_0 for now\n"); + return -EINVAL; + } + if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { + EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", + numdevs, sbi->data_map.odm_mirror_cnt); + return -EINVAL; + } + + stripe_length = sbi->data_map.odm_stripe_unit * + (numdevs / (sbi->data_map.odm_mirror_cnt + 1)); + if (stripe_length >= (1ULL << 32)) { + EXOFS_ERR("Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -EINVAL; + } + + if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { + EXOFS_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); + return -EINVAL; + } + + sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; + sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; + sbi->layout.group_width = sbi->data_map.odm_num_comps / + sbi->layout.mirrors_p1; + + return 0; } /* @odi is valid only as long as @fscb_dev is valid */ @@ -502,6 +537,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) } /* Default layout in case we do not have a device-table */ + sbi->layout.stripe_unit = PAGE_SIZE; + sbi->layout.mirrors_p1 = 1; + sbi->layout.group_width = 1; sbi->layout.s_ods[0] = od; sbi->layout.s_numdevs = 1; sbi->layout.s_pid = opts->pid; From 86093aaff5be5b214613eb60553e236bdb389c84 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 28 Jan 2010 18:24:06 +0200 Subject: [PATCH 08/11] exofs: convert io_state to use pages array instead of bio at input * inode.c operations are full-pages based, and not actually true scatter-gather * Lets us use more pages at once upto 512 (from 249) in 64 bit * Brings us much much closer to be able to use exofs's io_state engine from objlayout driver. (Once I decide where to put the common code) After RAID0 patch the outer (input) bio was never used as a bio, but was simply a page carrier into the raid engine. Even in the simple mirror/single-dev arrangement pages info was copied into a second bio. It is now easer to just pass a pages array into the io_state and prepare bio(s) once. Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 5 ++- fs/exofs/inode.c | 81 ++++++++++++++++++++++++------------------------ fs/exofs/ios.c | 46 +++++++++++++++------------ 3 files changed, 71 insertions(+), 61 deletions(-) diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 0d8a34b21ae1..acfebd36de83 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -128,7 +128,10 @@ struct exofs_io_state { loff_t offset; unsigned long length; void *kern_buff; - struct bio *bio; + + struct page **pages; + unsigned nr_pages; + unsigned pgbase; /* Attributes */ unsigned in_attr_len; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 2b3163ea56eb..6ca0b0117f04 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -41,16 +41,18 @@ enum { BIO_MAX_PAGES_KMALLOC = (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), + MAX_PAGES_KMALLOC = + PAGE_SIZE / sizeof(struct page *), }; struct page_collect { struct exofs_sb_info *sbi; - struct request_queue *req_q; struct inode *inode; unsigned expected_pages; struct exofs_io_state *ios; - struct bio *bio; + struct page **pages; + unsigned alloc_pages; unsigned nr_pages; unsigned long length; loff_t pg_first; /* keep 64bit also in 32-arches */ @@ -62,15 +64,12 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; pcol->sbi = sbi; - /* Create master bios on first Q, later on cloning, each clone will be - * allocated on it's destination Q - */ - pcol->req_q = osd_request_queue(sbi->layout.s_ods[0]); pcol->inode = inode; pcol->expected_pages = expected_pages; pcol->ios = NULL; - pcol->bio = NULL; + pcol->pages = NULL; + pcol->alloc_pages = 0; pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; @@ -80,7 +79,8 @@ static void _pcol_reset(struct page_collect *pcol) { pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); - pcol->bio = NULL; + pcol->pages = NULL; + pcol->alloc_pages = 0; pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; @@ -90,13 +90,13 @@ static void _pcol_reset(struct page_collect *pcol) * it might not end here. don't be left with nothing */ if (!pcol->expected_pages) - pcol->expected_pages = BIO_MAX_PAGES_KMALLOC; + pcol->expected_pages = MAX_PAGES_KMALLOC; } static int pcol_try_alloc(struct page_collect *pcol) { - int pages = min_t(unsigned, pcol->expected_pages, - BIO_MAX_PAGES_KMALLOC); + unsigned pages = min_t(unsigned, pcol->expected_pages, + MAX_PAGES_KMALLOC); if (!pcol->ios) { /* First time allocate io_state */ int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); @@ -105,23 +105,28 @@ static int pcol_try_alloc(struct page_collect *pcol) return ret; } + /* TODO: easily support bio chaining */ + pages = min_t(unsigned, pages, + pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC); + for (; pages; pages >>= 1) { - pcol->bio = bio_kmalloc(GFP_KERNEL, pages); - if (likely(pcol->bio)) + pcol->pages = kmalloc(pages * sizeof(struct page *), + GFP_KERNEL); + if (likely(pcol->pages)) { + pcol->alloc_pages = pages; return 0; + } } - EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n", + EXOFS_ERR("Failed to kmalloc expected_pages=%u\n", pcol->expected_pages); return -ENOMEM; } static void pcol_free(struct page_collect *pcol) { - if (pcol->bio) { - bio_put(pcol->bio); - pcol->bio = NULL; - } + kfree(pcol->pages); + pcol->pages = NULL; if (pcol->ios) { exofs_put_io_state(pcol->ios); @@ -132,11 +137,10 @@ static void pcol_free(struct page_collect *pcol) static int pcol_add_page(struct page_collect *pcol, struct page *page, unsigned len) { - int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0); - if (unlikely(len != added_len)) + if (unlikely(pcol->nr_pages >= pcol->alloc_pages)) return -ENOMEM; - ++pcol->nr_pages; + pcol->pages[pcol->nr_pages++] = page; pcol->length += len; return 0; } @@ -181,7 +185,6 @@ static void update_write_page(struct page *page, int ret) */ static int __readpages_done(struct page_collect *pcol, bool do_unlock) { - struct bio_vec *bvec; int i; u64 resid; u64 good_bytes; @@ -198,8 +201,8 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock) pcol->inode->i_ino, _LLU(good_bytes), pcol->length, pcol->nr_pages); - __bio_for_each_segment(bvec, pcol->bio, i, 0) { - struct page *page = bvec->bv_page; + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; struct inode *inode = page->mapping->host; int page_stat; @@ -218,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock) ret = update_read_page(page, page_stat); if (do_unlock) unlock_page(page); - length += bvec->bv_len; + length += PAGE_SIZE; } pcol_free(pcol); @@ -238,11 +241,10 @@ static void readpages_done(struct exofs_io_state *ios, void *p) static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) { - struct bio_vec *bvec; int i; - __bio_for_each_segment(bvec, pcol->bio, i, 0) { - struct page *page = bvec->bv_page; + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; if (rw == READ) update_read_page(page, ret); @@ -260,13 +262,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync) struct page_collect *pcol_copy = NULL; int ret; - if (!pcol->bio) + if (!pcol->pages) return 0; /* see comment in _readpage() about sync reads */ WARN_ON(is_sync && (pcol->nr_pages != 1)); - ios->bio = pcol->bio; + ios->pages = pcol->pages; + ios->nr_pages = pcol->nr_pages; ios->length = pcol->length; ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; @@ -366,7 +369,7 @@ try_again: goto try_again; } - if (!pcol->bio) { + if (!pcol->pages) { ret = pcol_try_alloc(pcol); if (unlikely(ret)) goto fail; @@ -448,7 +451,6 @@ static int exofs_readpage(struct file *file, struct page *page) static void writepages_done(struct exofs_io_state *ios, void *p) { struct page_collect *pcol = p; - struct bio_vec *bvec; int i; u64 resid; u64 good_bytes; @@ -467,8 +469,8 @@ static void writepages_done(struct exofs_io_state *ios, void *p) pcol->inode->i_ino, _LLU(good_bytes), pcol->length, pcol->nr_pages); - __bio_for_each_segment(bvec, pcol->bio, i, 0) { - struct page *page = bvec->bv_page; + for (i = 0; i < pcol->nr_pages; i++) { + struct page *page = pcol->pages[i]; struct inode *inode = page->mapping->host; int page_stat; @@ -485,7 +487,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p) EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", inode->i_ino, page->index, page_stat); - length += bvec->bv_len; + length += PAGE_SIZE; } pcol_free(pcol); @@ -500,7 +502,7 @@ static int write_exec(struct page_collect *pcol) struct page_collect *pcol_copy = NULL; int ret; - if (!pcol->bio) + if (!pcol->pages) return 0; pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); @@ -512,9 +514,8 @@ static int write_exec(struct page_collect *pcol) *pcol_copy = *pcol; - pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ - - ios->bio = pcol_copy->bio; + ios->pages = pcol_copy->pages; + ios->nr_pages = pcol_copy->nr_pages; ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; ios->length = pcol_copy->length; ios->done = writepages_done; @@ -605,7 +606,7 @@ try_again: goto try_again; } - if (!pcol->bio) { + if (!pcol->pages) { ret = pcol_try_alloc(pcol); if (unlikely(ret)) goto fail; diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 6e446b2670b9..263052c77f41 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -283,10 +283,11 @@ static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset, *dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1; } -static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, - struct exofs_per_dev_state *per_dev, int cur_len) +static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct exofs_per_dev_state *per_dev, + int cur_len) { - unsigned bv = *cur_bvec; + unsigned pg = *cur_pg; struct request_queue *q = osd_request_queue(exofs_ios_od(ios, per_dev->dev)); @@ -295,7 +296,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) / + unsigned bio_size = (ios->nr_pages + pages_in_stripe) / ios->layout->group_width; per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); @@ -307,21 +308,22 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_bvec, } while (cur_len > 0) { - int added_len; - struct bio_vec *bvec = &ios->bio->bi_io_vec[bv]; + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; - BUG_ON(ios->bio->bi_vcnt <= bv); - cur_len -= bvec->bv_len; + BUG_ON(ios->nr_pages <= pg); + cur_len -= pglen; - added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page, - bvec->bv_len, bvec->bv_offset); - if (unlikely(bvec->bv_len != added_len)) + added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], + pglen, pgbase); + if (unlikely(pglen != added_len)) return -ENOMEM; - ++bv; + pgbase = 0; + ++pg; } BUG_ON(cur_len); - *cur_bvec = bv; + *cur_pg = pg; return 0; } @@ -332,10 +334,10 @@ static int _prepare_for_striping(struct exofs_io_state *ios) unsigned stripe_unit = ios->layout->stripe_unit; unsigned comp = 0; unsigned stripes = 0; - unsigned cur_bvec = 0; - int ret; + unsigned cur_pg = 0; + int ret = 0; - if (!ios->bio) { + if (!ios->pages) { if (ios->kern_buff) { struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; unsigned unit_off; @@ -352,7 +354,7 @@ static int _prepare_for_striping(struct exofs_io_state *ios) while (length) { struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; - unsigned cur_len; + unsigned cur_len, page_off; if (!per_dev->length) { unsigned unit_off; @@ -362,11 +364,15 @@ static int _prepare_for_striping(struct exofs_io_state *ios) stripes++; cur_len = min_t(u64, stripe_unit - unit_off, length); offset += cur_len; + page_off = unit_off & ~PAGE_MASK; + BUG_ON(page_off != ios->pgbase); } else { cur_len = min_t(u64, stripe_unit, length); + page_off = 0; } - ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len); + ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, + cur_len); if (unlikely(ret)) goto out; @@ -448,7 +454,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) per_dev->or = or; per_dev->offset = master_dev->offset; - if (ios->bio) { + if (ios->pages) { struct bio *bio; if (per_dev != master_dev) { @@ -541,7 +547,7 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) } per_dev->or = or; - if (ios->bio) { + if (ios->pages) { osd_req_read(or, &ios->obj, per_dev->offset, per_dev->bio, per_dev->length); EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" From 96391e2bae0f8882b6f44809202a68be66e91dce Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 9 Feb 2010 11:43:21 +0200 Subject: [PATCH 09/11] exofs: Error recovery if object is missing from storage If an object is referenced by a directory but does not exist on a target, it is a very serious corruption that means: 1. Either a power failure with very slim chance of it happening. Because the directory update is always submitted much after object creation, but if a directory is written to one device and the object creation to another it might theoretically happen. 2. It only ever happened to me while developing with BUGs causing file corruption. Crashes could also cause it but they are more like case 1. In any way the object does not exist, so data is surely lost. If there is a mix-up in the obj-id or data-map, then lost objects can be salvaged by off-line fsck. The only recoverable information is the directory name. By letting it appear as a regular empty file, with date==0 (1970 Jan 1st) ownership to root, we enable recovery of the only useful information. And also enable deletion or over-write. I can see how this can hurt. Signed-off-by: Boaz Harrosh --- fs/exofs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 6ca0b0117f04..5514f3c2c2f4 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -903,8 +903,18 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ios->in_attr_len = ARRAY_SIZE(attrs); ret = exofs_sbi_read(ios); - if (ret) + if (unlikely(ret)) { + EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", + _LLU(ios->obj.id), ret); + memset(inode, 0, sizeof(*inode)); + inode->i_mode = 0040000 | (0777 & ~022); + /* If object is lost on target we might as well enable it's + * delete. + */ + if ((ret == -ENOENT) || (ret == -EINVAL)) + ret = 0; goto out; + } ret = extract_attr_from_ios(ios, &attrs[0]); if (ret) { From b367e78bd1c7af4c018ce98b1f6d3e001aba895a Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 7 Feb 2010 19:18:58 +0200 Subject: [PATCH 10/11] exofs: Prepare for groups * Rename _offset_dev_unit_off() to _calc_stripe_info() and recieve a struct for the output params * In _prepare_for_striping we only need to call _calc_stripe_info() once. The other componets are easy to calculate from that. This code was inspired by what's done in truncate. * Some code shifts that make sense now but will make more sense when group support is added. Signed-off-by: Boaz Harrosh --- fs/exofs/ios.c | 155 +++++++++++++++++++++++++++++++------------------ 1 file changed, 97 insertions(+), 58 deletions(-) diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 263052c77f41..d28febdf54ab 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -259,28 +259,46 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) return acumulated_lin_err; } -/* REMOVEME: After review - Some quoteing from the standard +/* + * L - logical offset into the file + * + * U - The number of bytes in a full stripe + * + * U = stripe_unit * group_width + * + * N - The stripe number + * + * N = L / U + * + * C - The component index coresponding to L + * + * C = (L - (N*U)) / stripe_unit + * + * O - The component offset coresponding to L + * + * (N*stripe_unit)+(L%stripe_unit) + */ - L = logical offset into the file - W = number of data components in a stripe - S = W * stripe_unit (S is Stripe length) - N = L / S (N is the stripe Number) - C = (L-(N*S)) / stripe_unit (C is the component) - O = (N*stripe_unit)+(L%stripe_unit) (O is the object's offset) -*/ +struct _striping_info { + u64 obj_offset; + unsigned dev; + unsigned unit_off; +}; -static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset, - u64 *obj_offset, unsigned *dev, unsigned *unit_off) +static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, + struct _striping_info *si) { - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned stripe_length = stripe_unit * ios->layout->group_width; - u64 stripe_no = file_offset; - unsigned stripe_mod = do_div(stripe_no, stripe_length); + u32 stripe_unit = ios->layout->stripe_unit; + u32 group_width = ios->layout->group_width; + u32 U = stripe_unit * group_width; - *unit_off = stripe_mod % stripe_unit; - *obj_offset = stripe_no * stripe_unit + *unit_off; - *dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1; + u32 LmodU; + u64 N = div_u64_rem(file_offset, U, &LmodU); + + si->unit_off = LmodU % stripe_unit; + si->obj_offset = N * stripe_unit + si->unit_off; + si->dev = LmodU / stripe_unit; + si->dev *= ios->layout->mirrors_p1; } static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, @@ -327,65 +345,88 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, return 0; } -static int _prepare_for_striping(struct exofs_io_state *ios) +static int _prepare_pages(struct exofs_io_state *ios, + struct _striping_info *si) { u64 length = ios->length; - u64 offset = ios->offset; unsigned stripe_unit = ios->layout->stripe_unit; + unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned dev = si->dev; unsigned comp = 0; unsigned stripes = 0; unsigned cur_pg = 0; int ret = 0; - if (!ios->pages) { - if (ios->kern_buff) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; - unsigned unit_off; - - _offset_dev_unit_off(ios, offset, &per_dev->offset, - &per_dev->dev, &unit_off); - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (unit_off + length > stripe_unit)); - } - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - while (length) { struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; - unsigned cur_len, page_off; + unsigned cur_len, page_off = 0; if (!per_dev->length) { - unsigned unit_off; + per_dev->dev = dev; + if (dev < si->dev) { + per_dev->offset = si->obj_offset + stripe_unit - + si->unit_off; + cur_len = stripe_unit; + } else if (dev == si->dev) { + per_dev->offset = si->obj_offset; + cur_len = stripe_unit - si->unit_off; + page_off = si->unit_off & ~PAGE_MASK; + BUG_ON(page_off && (page_off != ios->pgbase)); + } else { /* dev > si->dev */ + per_dev->offset = si->obj_offset - si->unit_off; + cur_len = stripe_unit; + } - _offset_dev_unit_off(ios, offset, &per_dev->offset, - &per_dev->dev, &unit_off); stripes++; - cur_len = min_t(u64, stripe_unit - unit_off, length); - offset += cur_len; - page_off = unit_off & ~PAGE_MASK; - BUG_ON(page_off != ios->pgbase); + + dev += mirrors_p1; + dev %= ios->layout->s_numdevs; } else { - cur_len = min_t(u64, stripe_unit, length); - page_off = 0; + cur_len = stripe_unit; } + if (cur_len >= length) + cur_len = length; ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, cur_len); if (unlikely(ret)) goto out; - comp += ios->layout->mirrors_p1; + comp += mirrors_p1; comp %= ios->layout->s_numdevs; length -= cur_len; } out: - ios->numdevs = stripes * ios->layout->mirrors_p1; + ios->numdevs = stripes * mirrors_p1; return ret; } +static int _prepare_for_striping(struct exofs_io_state *ios) +{ + struct _striping_info si; + + _calc_stripe_info(ios, ios->offset, &si); + + if (!ios->pages) { + if (ios->kern_buff) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + + per_dev->offset = si.obj_offset; + per_dev->dev = si.dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (si.unit_off + ios->length > + ios->layout->stripe_unit)); + } + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + return _prepare_pages(ios, &si); +} + int exofs_sbi_create(struct exofs_io_state *ios) { int i, ret; @@ -648,9 +689,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) struct osd_attr attr; __be64 newsize; } *size_attrs; - u64 this_obj_size; - unsigned dev; - unsigned unit_off; + struct _striping_info si; int i, ret; ret = exofs_get_io_state(&sbi->layout, &ios); @@ -668,19 +707,19 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) ios->cred = oi->i_cred; ios->numdevs = ios->layout->s_numdevs; - _offset_dev_unit_off(ios, size, &this_obj_size, &dev, &unit_off); + _calc_stripe_info(ios, size, &si); for (i = 0; i < ios->layout->group_width; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; u64 obj_size; - if (i < dev) - obj_size = this_obj_size + - ios->layout->stripe_unit - unit_off; - else if (i == dev) - obj_size = this_obj_size; - else /* i > dev */ - obj_size = this_obj_size - unit_off; + if (i < si.dev) + obj_size = si.obj_offset + + ios->layout->stripe_unit - si.unit_off; + else if (i == si.dev) + obj_size = si.obj_offset; + else /* i > si.dev */ + obj_size = si.obj_offset - si.unit_off; size_attr->newsize = cpu_to_be64(obj_size); size_attr->attr = g_attr_logical_length; From 50a76fd3c352ed2740eba01512efcfceee0703be Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 11 Feb 2010 13:01:39 +0200 Subject: [PATCH 11/11] exofs: groups support * _calc_stripe_info() changes to accommodate for grouping calculations. Returns additional information * old _prepare_pages() becomes _prepare_one_group() which stores pages belonging to one device group. * New _prepare_for_striping iterates on all groups calling _prepare_one_group(). * Enable mounting of groups data_maps (group_width != 0) [QUESTION] what is faster A or B; A. x += stride; x = x % width + first_x; B x += stride if (x < last_x) x = first_x; Signed-off-by: Boaz Harrosh --- fs/exofs/exofs.h | 3 ++ fs/exofs/ios.c | 127 +++++++++++++++++++++++++++++++++++++++-------- fs/exofs/super.c | 46 +++++++++++------ 3 files changed, 140 insertions(+), 36 deletions(-) diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index acfebd36de83..59b8bf2825c7 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -63,6 +63,8 @@ struct exofs_layout { unsigned mirrors_p1; unsigned group_width; + u64 group_depth; + unsigned group_count; enum exofs_inode_layout_gen_functions lay_func; @@ -132,6 +134,7 @@ struct exofs_io_state { struct page **pages; unsigned nr_pages; unsigned pgbase; + unsigned pages_consumed; /* Attributes */ unsigned in_attr_len; diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index d28febdf54ab..5293bc411d17 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ios.c @@ -262,25 +262,50 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) /* * L - logical offset into the file * - * U - The number of bytes in a full stripe + * U - The number of bytes in a stripe within a group * * U = stripe_unit * group_width * - * N - The stripe number + * T - The number of bytes striped within a group of component objects + * (before advancing to the next group) * - * N = L / U + * T = stripe_unit * group_width * group_depth + * + * S - The number of bytes striped across all component objects + * before the pattern repeats + * + * S = stripe_unit * group_width * group_depth * group_count + * + * M - The "major" (i.e., across all components) stripe number + * + * M = L / S + * + * G - Counts the groups from the beginning of the major stripe + * + * G = (L - (M * S)) / T [or (L % S) / T] + * + * H - The byte offset within the group + * + * H = (L - (M * S)) % T [or (L % S) % T] + * + * N - The "minor" (i.e., across the group) stripe number + * + * N = H / U * * C - The component index coresponding to L * - * C = (L - (N*U)) / stripe_unit + * C = (H - (N * U)) / stripe_unit + G * group_width + * [or (L % U) / stripe_unit + G * group_width] * * O - The component offset coresponding to L * - * (N*stripe_unit)+(L%stripe_unit) + * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit */ - struct _striping_info { u64 obj_offset; + u64 group_length; + u64 total_group_length; + u64 Major; unsigned dev; unsigned unit_off; }; @@ -290,15 +315,35 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, { u32 stripe_unit = ios->layout->stripe_unit; u32 group_width = ios->layout->group_width; + u64 group_depth = ios->layout->group_depth; + u32 U = stripe_unit * group_width; + u64 T = U * group_depth; + u64 S = T * ios->layout->group_count; + u64 M = div64_u64(file_offset, S); - u32 LmodU; - u64 N = div_u64_rem(file_offset, U, &LmodU); + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodS = file_offset - M * S; + u32 G = div64_u64(LmodS, T); + u64 H = LmodS - G * T; - si->unit_off = LmodU % stripe_unit; - si->obj_offset = N * stripe_unit + si->unit_off; - si->dev = LmodU / stripe_unit; + u32 N = div_u64(H, U); + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; si->dev *= ios->layout->mirrors_p1; + + div_u64_rem(file_offset, stripe_unit, &si->unit_off); + + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + + si->group_length = T - H; + si->total_group_length = T; + si->Major = M; } static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, @@ -345,16 +390,17 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, return 0; } -static int _prepare_pages(struct exofs_io_state *ios, - struct _striping_info *si) +static int _prepare_one_group(struct exofs_io_state *ios, u64 length, + struct _striping_info *si, unsigned first_comp) { - u64 length = ios->length; unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned devs_in_group = ios->layout->group_width * mirrors_p1; unsigned dev = si->dev; - unsigned comp = 0; - unsigned stripes = 0; - unsigned cur_pg = 0; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned comp = first_comp + (dev - first_dev); + unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; + unsigned cur_pg = ios->pages_consumed; int ret = 0; while (length) { @@ -377,10 +423,11 @@ static int _prepare_pages(struct exofs_io_state *ios, cur_len = stripe_unit; } - stripes++; + if (max_comp < comp) + max_comp = comp; dev += mirrors_p1; - dev %= ios->layout->s_numdevs; + dev = (dev % devs_in_group) + first_dev; } else { cur_len = stripe_unit; } @@ -393,18 +440,24 @@ static int _prepare_pages(struct exofs_io_state *ios, goto out; comp += mirrors_p1; - comp %= ios->layout->s_numdevs; + comp = (comp % devs_in_group) + first_comp; length -= cur_len; } out: - ios->numdevs = stripes * mirrors_p1; + ios->numdevs = max_comp + mirrors_p1; + ios->pages_consumed = cur_pg; return ret; } static int _prepare_for_striping(struct exofs_io_state *ios) { + u64 length = ios->length; struct _striping_info si; + unsigned devs_in_group = ios->layout->group_width * + ios->layout->mirrors_p1; + unsigned first_comp = 0; + int ret = 0; _calc_stripe_info(ios, ios->offset, &si); @@ -424,7 +477,31 @@ static int _prepare_for_striping(struct exofs_io_state *ios) return 0; } - return _prepare_pages(ios, &si); + while (length) { + if (length < si.group_length) + si.group_length = length; + + ret = _prepare_one_group(ios, si.group_length, &si, first_comp); + if (unlikely(ret)) + goto out; + + length -= si.group_length; + + si.group_length = si.total_group_length; + si.unit_off = 0; + ++si.Major; + si.obj_offset = si.Major * ios->layout->stripe_unit * + ios->layout->group_depth; + + si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; + si.dev %= ios->layout->s_numdevs; + + first_comp += devs_in_group; + first_comp %= ios->layout->s_numdevs; + } + +out: + return ret; } int exofs_sbi_create(struct exofs_io_state *ios) @@ -482,6 +559,9 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) unsigned last_comp = cur_comp + ios->layout->mirrors_p1; int ret = 0; + if (ios->pages && !master_dev->length) + return 0; /* Just an empty slot */ + for (; cur_comp < last_comp; ++cur_comp, ++dev) { struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; @@ -580,6 +660,9 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; unsigned first_dev = (unsigned)ios->obj.id; + if (ios->pages && !per_dev->length) + return 0; /* Just an empty slot */ + first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); if (unlikely(!or)) { diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 8f4e4b37a578..6cf5e4e84d61 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -323,11 +323,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->data_map.odm_raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */ - if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) { - EXOFS_ERR("Group width/depth not supported\n"); - return -EINVAL; - } +/* FIXME: Only raid0 for now. if not so, do not mount */ if (sbi->data_map.odm_num_comps != numdevs) { EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", sbi->data_map.odm_num_comps, numdevs); @@ -343,14 +339,6 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, return -EINVAL; } - stripe_length = sbi->data_map.odm_stripe_unit * - (numdevs / (sbi->data_map.odm_mirror_cnt + 1)); - if (stripe_length >= (1ULL << 32)) { - EXOFS_ERR("Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -EINVAL; - } - if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { EXOFS_ERR("Stripe Unit(0x%llx)" " must be Multples of PAGE_SIZE(0x%lx)\n", @@ -360,8 +348,36 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; - sbi->layout.group_width = sbi->data_map.odm_num_comps / + + if (sbi->data_map.odm_group_width) { + sbi->layout.group_width = sbi->data_map.odm_group_width; + sbi->layout.group_depth = sbi->data_map.odm_group_depth; + if (!sbi->layout.group_depth) { + EXOFS_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + sbi->layout.group_count = sbi->data_map.odm_num_comps / + sbi->layout.mirrors_p1 / + sbi->data_map.odm_group_width; + } else { + if (sbi->data_map.odm_group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %d\n", + sbi->data_map.odm_group_depth); + sbi->data_map.odm_group_depth = 0; + } + sbi->layout.group_width = sbi->data_map.odm_num_comps / sbi->layout.mirrors_p1; + sbi->layout.group_depth = -1; + sbi->layout.group_count = 1; + } + + stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; + if (stripe_length >= (1ULL << 32)) { + EXOFS_ERR("Total Stripe length(0x%llx)" + " >= 32bit is not supported\n", _LLU(stripe_length)); + return -EINVAL; + } return 0; } @@ -540,6 +556,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->layout.stripe_unit = PAGE_SIZE; sbi->layout.mirrors_p1 = 1; sbi->layout.group_width = 1; + sbi->layout.group_depth = -1; + sbi->layout.group_count = 1; sbi->layout.s_ods[0] = od; sbi->layout.s_numdevs = 1; sbi->layout.s_pid = opts->pid;