From 2baba25019ec564cd247af74013873d69a0b8190 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh
Date: Fri, 18 Dec 2009 13:51:57 -0800
Subject: ceph: writeback congestion control

Set bdi congestion bit when amount of write data in flight exceeds
adjustable threshold.

Signed-off-by: Yehuda Sadeh
Signed-off-by: Sage Weil
---
Review notes (below the "---" marker, so not part of the commit message):

 * client->writeback_count is incremented once per page immediately before
   set_page_writeback() in writepage_nounlock() and in the get_more_pages
   loop, and decremented once per page in writepages_finish().
 * CONGESTION_ON_THRESH(kb) is kb >> (PAGE_SHIFT-10); presumably this turns
   a kilobyte value into a page count, going by the name congestion_kb --
   confirm against the definition of mount_args->congestion_kb.
 * CONGESTION_OFF_THRESH(kb) is the "on" value minus a quarter of it, so the
   congested bit is cleared only once the count falls below 3/4 of the "on"
   threshold.  It expands its argument twice; the call sites here pass a
   plain field read, so that is harmless.
 * Changed versus the patch as posted: CONGESTION_ON_THRESH() now
   parenthesizes its argument so that an expression argument cannot
   re-associate with the shift.  The line count is unchanged, so the hunk
   headers and diffstat still hold; the blob id on the "index" line and the
   commit id on the "From" line still describe the patch as posted.
 * NOTE(review): none of the hunks below decrements writeback_count on the
   writepage_nounlock() path -- confirm that path is balanced elsewhere,
   otherwise the counter only grows there.

 fs/ceph/addr.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

(limited to 'fs/ceph/addr.c')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d0cdceb0b90..a6850a14038 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -47,6 +47,12 @@
  * accounting is preserved.
  */
 
+#define CONGESTION_ON_THRESH(congestion_kb) ((congestion_kb) >> (PAGE_SHIFT-10))
+#define CONGESTION_OFF_THRESH(congestion_kb) \
+	(CONGESTION_ON_THRESH(congestion_kb) - \
+	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
+
+
 
 /*
  * Dirty a page. Optimistically adjust accounting, on the assumption
@@ -377,6 +383,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
+	struct ceph_client *client;
 	struct ceph_osd_client *osdc;
 	loff_t page_off = page->index << PAGE_CACHE_SHIFT;
 	int len = PAGE_CACHE_SIZE;
@@ -384,6 +391,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	int err = 0;
 	struct ceph_snap_context *snapc;
 	u64 snap_size = 0;
+	long writeback_stat;
 
 	dout("writepage %p idx %lu\n", page, page->index);
 
@@ -393,7 +401,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	}
 	inode = page->mapping->host;
 	ci = ceph_inode(inode);
-	osdc = &ceph_inode_to_client(inode)->osdc;
+	client = ceph_inode_to_client(inode);
+	osdc = &client->osdc;
 
 	/* verify this is a writeable snap context */
 	snapc = (void *)page->private;
@@ -420,6 +429,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	dout("writepage %p page %p index %lu on %llu~%u\n",
 	     inode, page, page->index, page_off, len);
 
+	writeback_stat = atomic_long_inc_return(&client->writeback_count);
+	if (writeback_stat >
+	    CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
+		set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+
 	set_page_writeback(page);
 	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
 				   &ci->i_layout, snapc,
@@ -499,6 +513,8 @@ static void writepages_finish(struct ceph_osd_request *req,
 	struct writeback_control *wbc = req->r_wbc;
 	__s32 rc = -EIO;
 	u64 bytes = 0;
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	long writeback_stat;
 
 	/* parse reply */
 	replyhead = msg->front.iov_base;
@@ -524,6 +540,13 @@ static void writepages_finish(struct ceph_osd_request *req,
 		BUG_ON(!page);
 		WARN_ON(!PageUptodate(page));
 
+		writeback_stat =
+			atomic_long_dec_return(&client->writeback_count);
+		if (writeback_stat <
+		    CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
+			clear_bdi_congested(&client->backing_dev_info,
+					    BLK_RW_ASYNC);
+
 		if (i >= wrote) {
 			dout("inode %p skipping page %p\n", inode, page);
 			wbc->pages_skipped++;
@@ -666,6 +689,7 @@ retry:
 		u64 offset, len;
 		struct ceph_osd_request_head *reqhead;
 		struct ceph_osd_op *op;
+		long writeback_stat;
 
 		next = 0;
 		locked_pages = 0;
@@ -773,6 +797,12 @@ get_more_pages:
 				first = i;
 			dout("%p will write page %p idx %lu\n",
 			     inode, page, page->index);
+
+			writeback_stat = atomic_long_inc_return(&client->writeback_count);
+			if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
+				set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+			}
+
 			set_page_writeback(page);
 			req->r_pages[locked_pages] = page;
 			locked_pages++;
@@ -998,7 +1028,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = &client->mdsc;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	int check_cap = 0;
 
-- 
cgit v1.2.3-70-g09d2