diff options
Diffstat (limited to 'net')
33 files changed, 780 insertions, 321 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ae610f046de..e34ea9e5e28 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -720,6 +720,7 @@ static int vlan_dev_init(struct net_device *dev) dev->fcoe_ddp_xid = real_dev->fcoe_ddp_xid; #endif + dev->needed_headroom = real_dev->needed_headroom; if (real_dev->features & NETIF_F_HW_VLAN_TX) { dev->header_ops = real_dev->header_ops; dev->hard_header_len = real_dev->hard_header_len; diff --git a/net/9p/client.c b/net/9p/client.c index 347ec0cd271..2ccbf04d37d 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -223,7 +223,7 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) req = &c->reqs[row][col]; if (!req->tc) { - req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); + req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS); if (!req->wq) { printk(KERN_ERR "Couldn't grow tag array\n"); return ERR_PTR(-ENOMEM); @@ -233,17 +233,17 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) P9_TRANS_PREF_PAYLOAD_SEP) { int alloc_msize = min(c->msize, 4096); req->tc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_KERNEL); + GFP_NOFS); req->tc->capacity = alloc_msize; req->rc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_KERNEL); + GFP_NOFS); req->rc->capacity = alloc_msize; } else { req->tc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_KERNEL); + GFP_NOFS); req->tc->capacity = c->msize; req->rc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_KERNEL); + GFP_NOFS); req->rc->capacity = c->msize; } if ((!req->tc) || (!req->rc)) { diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 2ce515b859b..8a4084fa8b5 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -205,7 +205,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, if (errcode) break; - *sptr = kmalloc(len + 1, GFP_KERNEL); + *sptr = kmalloc(len + 1, GFP_NOFS); if (*sptr == NULL) { errcode = -EFAULT; break; @@ -273,7 +273,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, if (!errcode) { *wnames = kmalloc(sizeof(char *) * *nwname, - GFP_KERNEL); + GFP_NOFS); if (!*wnames) errcode = -ENOMEM; } @@ -317,7 +317,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, *wqids = kmalloc(*nwqid * sizeof(struct p9_qid), - GFP_KERNEL); + GFP_NOFS); if (*wqids == NULL) errcode = -ENOMEM; } diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index d62b9aa58df..9172ab78fcb 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -41,9 +41,9 @@ EXPORT_SYMBOL(p9_release_req_pages); int p9_nr_pages(struct p9_req_t *req) { - int start_page, end_page; - start_page = (unsigned long long)req->tc->pubuf >> PAGE_SHIFT; - end_page = ((unsigned long long)req->tc->pubuf + req->tc->pbuf_size + + unsigned long start_page, end_page; + start_page = (unsigned long)req->tc->pubuf >> PAGE_SHIFT; + end_page = ((unsigned long)req->tc->pubuf + req->tc->pbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; return end_page - start_page; } @@ -69,8 +69,8 @@ p9_payload_gup(struct p9_req_t *req, size_t *pdata_off, int *pdata_len, *pdata_off = (size_t)req->tc->pubuf & (PAGE_SIZE-1); if (*pdata_off) - first_page_bytes = min((PAGE_SIZE - *pdata_off), - req->tc->pbuf_size); + first_page_bytes = min(((size_t)PAGE_SIZE - *pdata_off), + req->tc->pbuf_size); rpinfo = req->tc->private; pdata_mapped_pages = get_user_pages_fast((unsigned long)req->tc->pubuf, diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index a30471e5174..aa5672b15ea 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -350,7 +350,7 @@ static void p9_read_work(struct work_struct *work) if (m->req->rc == NULL) { m->req->rc = kmalloc(sizeof(struct p9_fcall) + - m->client->msize, GFP_KERNEL); + m->client->msize, GFP_NOFS); if (!m->req->rc) { m->req = NULL; err = -ENOMEM; diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 29a54ccd213..150e0c4bbf4 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -424,7 +424,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) struct p9_rdma_context *rpl_context = NULL; /* Allocate an fcall for the reply */ - rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL); + rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); if (!rpl_context) { err = -ENOMEM; goto err_close; @@ -437,7 +437,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) */ if (!req->rc) { req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize, - GFP_KERNEL); + GFP_NOFS); if (req->rc) { req->rc->sdata = (char *) req->rc + sizeof(struct p9_fcall); @@ -468,7 +468,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) req->rc = NULL; /* Post the request */ - c = kmalloc(sizeof *c, GFP_KERNEL); + c = kmalloc(sizeof *c, GFP_NOFS); if (!c) { err = -ENOMEM; goto err_free1; diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 9b550ed9c71..e8f046b0718 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -43,6 +43,7 @@ #include <net/9p/client.h> #include <net/9p/transport.h> #include <linux/scatterlist.h> +#include <linux/swap.h> #include <linux/virtio.h> #include <linux/virtio_9p.h> #include "trans_common.h" @@ -51,6 +52,8 @@ /* a single mutex to manage channel initialization and attachment */ static DEFINE_MUTEX(virtio_9p_lock); +static DECLARE_WAIT_QUEUE_HEAD(vp_wq); +static atomic_t vp_pinned = ATOMIC_INIT(0); /** * struct virtio_chan - per-instance transport information @@ -78,7 +81,10 @@ struct virtio_chan { struct virtqueue *vq; int ring_bufs_avail; wait_queue_head_t *vc_wq; - + /* This is global limit. Since we don't have a global structure, + * will be placing it in each channel. + */ + int p9_max_pages; /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; @@ -141,34 +147,36 @@ static void req_done(struct virtqueue *vq) P9_DPRINTK(P9_DEBUG_TRANS, ": request done\n"); - do { + while (1) { spin_lock_irqsave(&chan->lock, flags); rc = virtqueue_get_buf(chan->vq, &len); - if (rc != NULL) { - if (!chan->ring_bufs_avail) { - chan->ring_bufs_avail = 1; - wake_up(chan->vc_wq); - } - spin_unlock_irqrestore(&chan->lock, flags); - P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); - P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", - rc->tag); - req = p9_tag_lookup(chan->client, rc->tag); - req->status = REQ_STATUS_RCVD; - if (req->tc->private) { - struct trans_rpage_info *rp = req->tc->private; - /*Release pages */ - p9_release_req_pages(rp); - if (rp->rp_alloc) - kfree(rp); - req->tc->private = NULL; - } - p9_client_cb(chan->client, req); - } else { + if (rc == NULL) { spin_unlock_irqrestore(&chan->lock, flags); + break; + } + + chan->ring_bufs_avail = 1; + spin_unlock_irqrestore(&chan->lock, flags); + /* Wakeup if anyone waiting for VirtIO ring space. */ + wake_up(chan->vc_wq); + P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); + P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); + req = p9_tag_lookup(chan->client, rc->tag); + if (req->tc->private) { + struct trans_rpage_info *rp = req->tc->private; + int p = rp->rp_nr_pages; + /*Release pages */ + p9_release_req_pages(rp); + atomic_sub(p, &vp_pinned); + wake_up(&vp_wq); + if (rp->rp_alloc) + kfree(rp); + req->tc->private = NULL; } - } while (rc != NULL); + req->status = REQ_STATUS_RCVD; + p9_client_cb(chan->client, req); + } } /** @@ -263,7 +271,6 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); -req_retry: req->status = REQ_STATUS_SENT; if (req->tc->pbuf_size && (req->tc->pubuf && P9_IS_USER_CONTEXT)) { @@ -271,6 +278,14 @@ req_retry: int rpinfo_size = sizeof(struct trans_rpage_info) + sizeof(struct page *) * nr_pages; + if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { + err = wait_event_interruptible(vp_wq, + atomic_read(&vp_pinned) < chan->p9_max_pages); + if (err == -ERESTARTSYS) + return err; + P9_DPRINTK(P9_DEBUG_TRANS, "9p: May gup pages now.\n"); + } + if (rpinfo_size <= (req->tc->capacity - req->tc->size)) { /* We can use sdata */ req->tc->private = req->tc->sdata + req->tc->size; @@ -293,9 +308,12 @@ req_retry: if (rpinfo->rp_alloc) kfree(rpinfo); return err; + } else { + atomic_add(rpinfo->rp_nr_pages, &vp_pinned); } } +req_retry_pinned: spin_lock_irqsave(&chan->lock, flags); /* Handle out VirtIO ring buffers */ @@ -356,7 +374,7 @@ req_retry: return err; P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); - goto req_retry; + goto req_retry_pinned; } else { spin_unlock_irqrestore(&chan->lock, flags); P9_DPRINTK(P9_DEBUG_TRANS, @@ -453,6 +471,8 @@ static int p9_virtio_probe(struct virtio_device *vdev) } init_waitqueue_head(chan->vc_wq); chan->ring_bufs_avail = 1; + /* Ceiling limit to avoid denial of service attacks */ + chan->p9_max_pages = nr_free_buffer_pages()/4; mutex_lock(&virtio_9p_lock); list_add_tail(&chan->chan_list, &virtio_chan_list); diff --git a/net/9p/util.c b/net/9p/util.c index e048701a72d..b84619b5ba2 100644 --- a/net/9p/util.c +++ b/net/9p/util.c @@ -92,7 +92,7 @@ int p9_idpool_get(struct p9_idpool *p) unsigned long flags; retry: - if (idr_pre_get(&p->pool, GFP_KERNEL) == 0) + if (idr_pre_get(&p->pool, GFP_NOFS) == 0) return 0; spin_lock_irqsave(&p->lock, flags); diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 3d4f4b04340..206e771e82d 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1051,6 +1051,7 @@ static int atalk_release(struct socket *sock) { struct sock *sk = sock->sk; + sock_hold(sk); lock_sock(sk); if (sk) { sock_orphan(sk); @@ -1058,6 +1059,8 @@ static int atalk_release(struct socket *sock) atalk_destroy_socket(sk); } release_sock(sk); + sock_put(sk); + return 0; } diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index f97af5590ba..008ff6c4eec 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -739,6 +739,9 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb, nf_bridge->mask |= BRNF_PKT_TYPE; } + if (br_parse_ip_options(skb)) + return NF_DROP; + /* The physdev module checks on this */ nf_bridge->mask |= BRNF_BRIDGED; nf_bridge->physoutdev = skb->dev; diff --git a/net/ceph/armor.c b/net/ceph/armor.c index eb2a666b0be..1fc1ee11dfa 100644 --- a/net/ceph/armor.c +++ b/net/ceph/armor.c @@ -78,8 +78,10 @@ int ceph_unarmor(char *dst, const char *src, const char *end) while (src < end) { int a, b, c, d; - if (src < end && src[0] == '\n') + if (src[0] == '\n') { src++; + continue; + } if (src + 4 > end) return -EINVAL; a = decode_bits(src[0]); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index f3e4a13fea0..95f96ab94bb 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -62,6 +62,7 @@ const char *ceph_msg_type_name(int type) case CEPH_MSG_OSD_MAP: return "osd_map"; case CEPH_MSG_OSD_OP: return "osd_op"; case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; + case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; default: return "unknown"; } } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3e20a122ffa..02212ed5085 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -22,10 +22,15 @@ #define OSD_OPREPLY_FRONT_LEN 512 static const struct ceph_connection_operations osd_con_ops; -static int __kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd); -static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static void send_queued(struct ceph_osd_client *osdc); +static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static void __register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +static void __unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +static int __send_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); static int op_needs_trail(int op) { @@ -34,6 +39,7 @@ static int op_needs_trail(int op) case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: case CEPH_OSD_OP_CALL: + case CEPH_OSD_OP_NOTIFY: return 1; default: return 0; @@ -209,6 +215,8 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, init_completion(&req->r_completion); init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); + INIT_LIST_HEAD(&req->r_linger_item); + INIT_LIST_HEAD(&req->r_linger_osd); req->r_flags = flags; WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); @@ -315,6 +323,24 @@ static void osd_req_encode_op(struct ceph_osd_request *req, break; case CEPH_OSD_OP_STARTSYNC: break; + case CEPH_OSD_OP_NOTIFY: + { + __le32 prot_ver = cpu_to_le32(src->watch.prot_ver); + __le32 timeout = cpu_to_le32(src->watch.timeout); + + BUG_ON(!req->r_trail); + + ceph_pagelist_append(req->r_trail, + &prot_ver, sizeof(prot_ver)); + ceph_pagelist_append(req->r_trail, + &timeout, sizeof(timeout)); + } + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + dst->watch.cookie = cpu_to_le64(src->watch.cookie); + dst->watch.ver = cpu_to_le64(src->watch.ver); + dst->watch.flag = src->watch.flag; + break; default: pr_err("unrecognized osd opcode %d\n", dst->op); WARN_ON(1); @@ -529,6 +555,45 @@ __lookup_request_ge(struct ceph_osd_client *osdc, return NULL; } +/* + * Resubmit requests pending on the given osd. + */ +static void __kick_osd_requests(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + struct ceph_osd_request *req, *nreq; + int err; + + dout("__kick_osd_requests osd%d\n", osd->o_osd); + err = __reset_osd(osdc, osd); + if (err == -EAGAIN) + return; + + list_for_each_entry(req, &osd->o_requests, r_osd_item) { + list_move(&req->r_req_lru_item, &osdc->req_unsent); + dout("requeued %p tid %llu osd%d\n", req, req->r_tid, + osd->o_osd); + if (!req->r_linger) + req->r_flags |= CEPH_OSD_FLAG_RETRY; + } + + list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, + r_linger_osd) { + __unregister_linger_request(osdc, req); + __register_request(osdc, req); + list_move(&req->r_req_lru_item, &osdc->req_unsent); + dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, + osd->o_osd); + } +} + +static void kick_osd_requests(struct ceph_osd_client *osdc, + struct ceph_osd *kickosd) +{ + mutex_lock(&osdc->request_mutex); + __kick_osd_requests(osdc, kickosd); + mutex_unlock(&osdc->request_mutex); +} /* * If the osd connection drops, we need to resubmit all requests. @@ -543,7 +608,8 @@ static void osd_reset(struct ceph_connection *con) dout("osd_reset osd%d\n", osd->o_osd); osdc = osd->o_osdc; down_read(&osdc->map_sem); - kick_requests(osdc, osd); + kick_osd_requests(osdc, osd); + send_queued(osdc); up_read(&osdc->map_sem); } @@ -561,6 +627,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) atomic_set(&osd->o_ref, 1); osd->o_osdc = osdc; INIT_LIST_HEAD(&osd->o_requests); + INIT_LIST_HEAD(&osd->o_linger_requests); INIT_LIST_HEAD(&osd->o_osd_lru); osd->o_incarnation = 1; @@ -650,7 +717,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) int ret = 0; dout("__reset_osd %p osd%d\n", osd, osd->o_osd); - if (list_empty(&osd->o_requests)) { + if (list_empty(&osd->o_requests) && + list_empty(&osd->o_linger_requests)) { __remove_osd(osdc, osd); } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], &osd->o_con.peer_addr, @@ -723,10 +791,9 @@ static void __cancel_osd_timeout(struct ceph_osd_client *osdc) * Register request, assign tid. If this is the first request, set up * the timeout event. */ -static void register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static void __register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) { - mutex_lock(&osdc->request_mutex); req->r_tid = ++osdc->last_tid; req->r_request->hdr.tid = cpu_to_le64(req->r_tid); INIT_LIST_HEAD(&req->r_req_lru_item); @@ -740,6 +807,13 @@ static void register_request(struct ceph_osd_client *osdc, dout(" first request, scheduling timeout\n"); __schedule_osd_timeout(osdc); } +} + +static void register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + mutex_lock(&osdc->request_mutex); + __register_request(osdc, req); mutex_unlock(&osdc->request_mutex); } @@ -758,9 +832,14 @@ static void __unregister_request(struct ceph_osd_client *osdc, ceph_con_revoke(&req->r_osd->o_con, req->r_request); list_del_init(&req->r_osd_item); - if (list_empty(&req->r_osd->o_requests)) + if (list_empty(&req->r_osd->o_requests) && + list_empty(&req->r_osd->o_linger_requests)) { + dout("moving osd to %p lru\n", req->r_osd); __move_osd_to_lru(osdc, req->r_osd); - req->r_osd = NULL; + } + if (list_empty(&req->r_osd_item) && + list_empty(&req->r_linger_item)) + req->r_osd = NULL; } ceph_osdc_put_request(req); @@ -781,20 +860,72 @@ static void __cancel_request(struct ceph_osd_request *req) ceph_con_revoke(&req->r_osd->o_con, req->r_request); req->r_sent = 0; } - list_del_init(&req->r_req_lru_item); } +static void __register_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + dout("__register_linger_request %p\n", req); + list_add_tail(&req->r_linger_item, &osdc->req_linger); + list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests); +} + +static void __unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + dout("__unregister_linger_request %p\n", req); + if (req->r_osd) { + list_del_init(&req->r_linger_item); + list_del_init(&req->r_linger_osd); + + if (list_empty(&req->r_osd->o_requests) && + list_empty(&req->r_osd->o_linger_requests)) { + dout("moving osd to %p lru\n", req->r_osd); + __move_osd_to_lru(osdc, req->r_osd); + } + req->r_osd = NULL; + } +} + +void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + mutex_lock(&osdc->request_mutex); + if (req->r_linger) { + __unregister_linger_request(osdc, req); + ceph_osdc_put_request(req); + } + mutex_unlock(&osdc->request_mutex); +} +EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); + +void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + if (!req->r_linger) { + dout("set_request_linger %p\n", req); + req->r_linger = 1; + /* + * caller is now responsible for calling + * unregister_linger_request + */ + ceph_osdc_get_request(req); + } +} +EXPORT_SYMBOL(ceph_osdc_set_request_linger); + /* * Pick an osd (the first 'up' osd in the pg), allocate the osd struct * (as needed), and set the request r_osd appropriately. If there is - * no up osd, set r_osd to NULL. + * no up osd, set r_osd to NULL. Move the request to the appropiate list + * (unsent, homeless) or leave on in-flight lru. * * Return 0 if unchanged, 1 if changed, or negative on error. * * Caller should hold map_sem for read and request_mutex. */ -static int __map_osds(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static int __map_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) { struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; struct ceph_pg pgid; @@ -802,11 +933,13 @@ static int __map_osds(struct ceph_osd_client *osdc, int o = -1, num = 0; int err; - dout("map_osds %p tid %lld\n", req, req->r_tid); + dout("map_request %p tid %lld\n", req, req->r_tid); err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, &req->r_file_layout, osdc->osdmap); - if (err) + if (err) { + list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; + } pgid = reqhead->layout.ol_pgid; req->r_pgid = pgid; @@ -823,7 +956,7 @@ static int __map_osds(struct ceph_osd_client *osdc, (req->r_osd == NULL && o == -1)) return 0; /* no change */ - dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n", + dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n", req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, req->r_osd ? req->r_osd->o_osd : -1); @@ -841,10 +974,12 @@ static int __map_osds(struct ceph_osd_client *osdc, if (!req->r_osd && o >= 0) { err = -ENOMEM; req->r_osd = create_osd(osdc); - if (!req->r_osd) + if (!req->r_osd) { + list_move(&req->r_req_lru_item, &osdc->req_notarget); goto out; + } - dout("map_osds osd %p is osd%d\n", req->r_osd, o); + dout("map_request osd %p is osd%d\n", req->r_osd, o); req->r_osd->o_osd = o; req->r_osd->o_con.peer_name.num = cpu_to_le64(o); __insert_osd(osdc, req->r_osd); @@ -855,6 +990,9 @@ static int __map_osds(struct ceph_osd_client *osdc, if (req->r_osd) { __remove_osd_from_lru(req->r_osd); list_add(&req->r_osd_item, &req->r_osd->o_requests); + list_move(&req->r_req_lru_item, &osdc->req_unsent); + } else { + list_move(&req->r_req_lru_item, &osdc->req_notarget); } err = 1; /* osd or pg changed */ @@ -869,16 +1007,6 @@ static int __send_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { struct ceph_osd_request_head *reqhead; - int err; - - err = __map_osds(osdc, req); - if (err < 0) - return err; - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - return 0; - } dout("send_request %p tid %llu to osd%d flags %d\n", req, req->r_tid, req->r_osd->o_osd, req->r_flags); @@ -898,6 +1026,21 @@ static int __send_request(struct ceph_osd_client *osdc, } /* + * Send any requests in the queue (req_unsent). + */ +static void send_queued(struct ceph_osd_client *osdc) +{ + struct ceph_osd_request *req, *tmp; + + dout("send_queued\n"); + mutex_lock(&osdc->request_mutex); + list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { + __send_request(osdc, req); + } + mutex_unlock(&osdc->request_mutex); +} + +/* * Timeout callback, called every N seconds when 1 or more osd * requests has been active for more than N seconds. When this * happens, we ping all OSDs with requests who have timed out to @@ -916,30 +1059,13 @@ static void handle_timeout(struct work_struct *work) unsigned long keepalive = osdc->client->options->osd_keepalive_timeout * HZ; unsigned long last_stamp = 0; - struct rb_node *p; struct list_head slow_osds; - dout("timeout\n"); down_read(&osdc->map_sem); ceph_monc_request_next_osdmap(&osdc->client->monc); mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - req = rb_entry(p, struct ceph_osd_request, r_node); - - if (req->r_resend) { - int err; - - dout("osdc resending prev failed %lld\n", req->r_tid); - err = __send_request(osdc, req); - if (err) - dout("osdc failed again on %lld\n", req->r_tid); - else - req->r_resend = false; - continue; - } - } /* * reset osds that appear to be _really_ unresponsive. this @@ -963,7 +1089,7 @@ static void handle_timeout(struct work_struct *work) BUG_ON(!osd); pr_warning(" tid %llu timed out on osd%d, will reset osd\n", req->r_tid, osd->o_osd); - __kick_requests(osdc, osd); + __kick_osd_requests(osdc, osd); } /* @@ -991,7 +1117,7 @@ static void handle_timeout(struct work_struct *work) __schedule_osd_timeout(osdc); mutex_unlock(&osdc->request_mutex); - + send_queued(osdc); up_read(&osdc->map_sem); } @@ -1035,7 +1161,6 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, numops * sizeof(struct ceph_osd_op)) goto bad; dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); - /* lookup */ mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); @@ -1079,6 +1204,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, dout("handle_reply tid %llu flags %d\n", tid, flags); + if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) + __register_linger_request(osdc, req); + /* either this is a read, or we got the safe response */ if (result < 0 || (flags & CEPH_OSD_FLAG_ONDISK) || @@ -1099,6 +1227,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, } done: + dout("req=%p req->r_linger=%d\n", req, req->r_linger); ceph_osdc_put_request(req); return; @@ -1109,108 +1238,83 @@ bad: ceph_msg_dump(msg); } - -static int __kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd) +static void reset_changed_osds(struct ceph_osd_client *osdc) { - struct ceph_osd_request *req; struct rb_node *p, *n; - int needmap = 0; - int err; - dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); - if (kickosd) { - err = __reset_osd(osdc, kickosd); - if (err == -EAGAIN) - return 1; - } else { - for (p = rb_first(&osdc->osds); p; p = n) { - struct ceph_osd *osd = - rb_entry(p, struct ceph_osd, o_node); - - n = rb_next(p); - if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || - memcmp(&osd->o_con.peer_addr, - ceph_osd_addr(osdc->osdmap, - osd->o_osd), - sizeof(struct ceph_entity_addr)) != 0) - __reset_osd(osdc, osd); - } + for (p = rb_first(&osdc->osds); p; p = n) { + struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); + + n = rb_next(p); + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || + memcmp(&osd->o_con.peer_addr, + ceph_osd_addr(osdc->osdmap, + osd->o_osd), + sizeof(struct ceph_entity_addr)) != 0) + __reset_osd(osdc, osd); } +} + +/* + * Requeue requests whose mapping to an OSD has changed. If requests map to + * no osd, request a new map. + * + * Caller should hold map_sem for read and request_mutex. + */ +static void kick_requests(struct ceph_osd_client *osdc) +{ + struct ceph_osd_request *req, *nreq; + struct rb_node *p; + int needmap = 0; + int err; + dout("kick_requests\n"); + mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { req = rb_entry(p, struct ceph_osd_request, r_node); - - if (req->r_resend) { - dout(" r_resend set on tid %llu\n", req->r_tid); - __cancel_request(req); - goto kick; - } - if (req->r_osd && kickosd == req->r_osd) { - __cancel_request(req); - goto kick; + err = __map_request(osdc, req); + if (err < 0) + continue; /* error */ + if (req->r_osd == NULL) { + dout("%p tid %llu maps to no osd\n", req, req->r_tid); + needmap++; /* request a newer map */ + } else if (err > 0) { + dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + if (!req->r_linger) + req->r_flags |= CEPH_OSD_FLAG_RETRY; } + } + + list_for_each_entry_safe(req, nreq, &osdc->req_linger, + r_linger_item) { + dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); - err = __map_osds(osdc, req); + err = __map_request(osdc, req); if (err == 0) - continue; /* no change */ - if (err < 0) { - /* - * FIXME: really, we should set the request - * error and fail if this isn't a 'nofail' - * request, but that's a fair bit more - * complicated to do. So retry! - */ - dout(" setting r_resend on %llu\n", req->r_tid); - req->r_resend = true; - continue; - } + continue; /* no change and no osd was specified */ + if (err < 0) + continue; /* hrm! */ if (req->r_osd == NULL) { dout("tid %llu maps to no valid osd\n", req->r_tid); needmap++; /* request a newer map */ continue; } -kick: - dout("kicking %p tid %llu osd%d\n", req, req->r_tid, + dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); - req->r_flags |= CEPH_OSD_FLAG_RETRY; - err = __send_request(osdc, req); - if (err) { - dout(" setting r_resend on %llu\n", req->r_tid); - req->r_resend = true; - } + __unregister_linger_request(osdc, req); + __register_request(osdc, req); } - - return needmap; -} - -/* - * Resubmit osd requests whose osd or osd address has changed. Request - * a new osd map if osds are down, or we are otherwise unable to determine - * how to direct a request. - * - * Close connections to down osds. - * - * If @who is specified, resubmit requests for that specific osd. - * - * Caller should hold map_sem for read and request_mutex. - */ -static void kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd) -{ - int needmap; - - mutex_lock(&osdc->request_mutex); - needmap = __kick_requests(osdc, kickosd); mutex_unlock(&osdc->request_mutex); if (needmap) { dout("%d requests for down osds, need new map\n", needmap); ceph_monc_request_next_osdmap(&osdc->client->monc); } - } + + /* * Process updated osd map. * @@ -1263,6 +1367,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_osdmap_destroy(osdc->osdmap); osdc->osdmap = newmap; } + kick_requests(osdc); + reset_changed_osds(osdc); } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); @@ -1300,6 +1406,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) osdc->osdmap = newmap; if (oldmap) ceph_osdmap_destroy(oldmap); + kick_requests(osdc); } p += maplen; nr_maps--; @@ -1308,8 +1415,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) done: downgrade_write(&osdc->map_sem); ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); - if (newmap) - kick_requests(osdc, NULL); + send_queued(osdc); up_read(&osdc->map_sem); wake_up_all(&osdc->client->auth_wq); return; @@ -1322,6 +1428,223 @@ bad: } /* + * watch/notify callback event infrastructure + * + * These callbacks are used both for watch and notify operations. + */ +static void __release_event(struct kref *kref) +{ + struct ceph_osd_event *event = + container_of(kref, struct ceph_osd_event, kref); + + dout("__release_event %p\n", event); + kfree(event); +} + +static void get_event(struct ceph_osd_event *event) +{ + kref_get(&event->kref); +} + +void ceph_osdc_put_event(struct ceph_osd_event *event) +{ + kref_put(&event->kref, __release_event); +} +EXPORT_SYMBOL(ceph_osdc_put_event); + +static void __insert_event(struct ceph_osd_client *osdc, + struct ceph_osd_event *new) +{ + struct rb_node **p = &osdc->event_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_event *event = NULL; + + while (*p) { + parent = *p; + event = rb_entry(parent, struct ceph_osd_event, node); + if (new->cookie < event->cookie) + p = &(*p)->rb_left; + else if (new->cookie > event->cookie) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &osdc->event_tree); +} + +static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, + u64 cookie) +{ + struct rb_node **p = &osdc->event_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_event *event = NULL; + + while (*p) { + parent = *p; + event = rb_entry(parent, struct ceph_osd_event, node); + if (cookie < event->cookie) + p = &(*p)->rb_left; + else if (cookie > event->cookie) + p = &(*p)->rb_right; + else + return event; + } + return NULL; +} + +static void __remove_event(struct ceph_osd_event *event) +{ + struct ceph_osd_client *osdc = event->osdc; + + if (!RB_EMPTY_NODE(&event->node)) { + dout("__remove_event removed %p\n", event); + rb_erase(&event->node, &osdc->event_tree); + ceph_osdc_put_event(event); + } else { + dout("__remove_event didn't remove %p\n", event); + } +} + +int ceph_osdc_create_event(struct ceph_osd_client *osdc, + void (*event_cb)(u64, u64, u8, void *), + int one_shot, void *data, + struct ceph_osd_event **pevent) +{ + struct ceph_osd_event *event; + + event = kmalloc(sizeof(*event), GFP_NOIO); + if (!event) + return -ENOMEM; + + dout("create_event %p\n", event); + event->cb = event_cb; + event->one_shot = one_shot; + event->data = data; + event->osdc = osdc; + INIT_LIST_HEAD(&event->osd_node); + kref_init(&event->kref); /* one ref for us */ + kref_get(&event->kref); /* one ref for the caller */ + init_completion(&event->completion); + + spin_lock(&osdc->event_lock); + event->cookie = ++osdc->event_count; + __insert_event(osdc, event); + spin_unlock(&osdc->event_lock); + + *pevent = event; + return 0; +} +EXPORT_SYMBOL(ceph_osdc_create_event); + +void ceph_osdc_cancel_event(struct ceph_osd_event *event) +{ + struct ceph_osd_client *osdc = event->osdc; + + dout("cancel_event %p\n", event); + spin_lock(&osdc->event_lock); + __remove_event(event); + spin_unlock(&osdc->event_lock); + ceph_osdc_put_event(event); /* caller's */ +} +EXPORT_SYMBOL(ceph_osdc_cancel_event); + + +static void do_event_work(struct work_struct *work) +{ + struct ceph_osd_event_work *event_work = + container_of(work, struct ceph_osd_event_work, work); + struct ceph_osd_event *event = event_work->event; + u64 ver = event_work->ver; + u64 notify_id = event_work->notify_id; + u8 opcode = event_work->opcode; + + dout("do_event_work completing %p\n", event); + event->cb(ver, notify_id, opcode, event->data); + complete(&event->completion); + dout("do_event_work completed %p\n", event); + ceph_osdc_put_event(event); + kfree(event_work); +} + + +/* + * Process osd watch notifications + */ +void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) +{ + void *p, *end; + u8 proto_ver; + u64 cookie, ver, notify_id; + u8 opcode; + struct ceph_osd_event *event; + struct ceph_osd_event_work *event_work; + + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + ceph_decode_8_safe(&p, end, proto_ver, bad); + ceph_decode_8_safe(&p, end, opcode, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + ceph_decode_64_safe(&p, end, ver, bad); + ceph_decode_64_safe(&p, end, notify_id, bad); + + spin_lock(&osdc->event_lock); + event = __find_event(osdc, cookie); + if (event) { + get_event(event); + if (event->one_shot) + __remove_event(event); + } + spin_unlock(&osdc->event_lock); + dout("handle_watch_notify cookie %lld ver %lld event %p\n", + cookie, ver, event); + if (event) { + event_work = kmalloc(sizeof(*event_work), GFP_NOIO); + INIT_WORK(&event_work->work, do_event_work); + if (!event_work) { + dout("ERROR: could not allocate event_work\n"); + goto done_err; + } + event_work->event = event; + event_work->ver = ver; + event_work->notify_id = notify_id; + event_work->opcode = opcode; + if (!queue_work(osdc->notify_wq, &event_work->work)) { + dout("WARNING: failed to queue notify event work\n"); + goto done_err; + } + } + + return; + +done_err: + complete(&event->completion); + ceph_osdc_put_event(event); + return; + +bad: + pr_err("osdc handle_watch_notify corrupt msg\n"); + return; +} + +int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout) +{ + int err; + + dout("wait_event %p\n", event); + err = wait_for_completion_interruptible_timeout(&event->completion, + timeout * HZ); + ceph_osdc_put_event(event); + if (err > 0) + err = 0; + dout("wait_event %p returns %d\n", event, err); + return err; +} +EXPORT_SYMBOL(ceph_osdc_wait_event); + +/* * Register request, send initial attempt. */ int ceph_osdc_start_request(struct ceph_osd_client *osdc, @@ -1347,15 +1670,22 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, * the request still han't been touched yet. */ if (req->r_sent == 0) { - rc = __send_request(osdc, req); - if (rc) { - if (nofail) { - dout("osdc_start_request failed send, " - " marking %lld\n", req->r_tid); - req->r_resend = true; - rc = 0; - } else { - __unregister_request(osdc, req); + rc = __map_request(osdc, req); + if (rc < 0) + return rc; + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + rc = __send_request(osdc, req); + if (rc) { + if (nofail) { + dout("osdc_start_request failed send, " + " will retry %lld\n", req->r_tid); + rc = 0; + } else { + __unregister_request(osdc, req); + } } } } @@ -1441,9 +1771,15 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) INIT_LIST_HEAD(&osdc->osd_lru); osdc->requests = RB_ROOT; INIT_LIST_HEAD(&osdc->req_lru); + INIT_LIST_HEAD(&osdc->req_unsent); + INIT_LIST_HEAD(&osdc->req_notarget); + INIT_LIST_HEAD(&osdc->req_linger); osdc->num_requests = 0; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); + spin_lock_init(&osdc->event_lock); + osdc->event_tree = RB_ROOT; + osdc->event_count = 0; schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); @@ -1463,6 +1799,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) "osd_op_reply"); if (err < 0) goto out_msgpool; + + osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); + if (IS_ERR(osdc->notify_wq)) { + err = PTR_ERR(osdc->notify_wq); + osdc->notify_wq = NULL; + goto out_msgpool; + } return 0; out_msgpool: @@ -1476,6 +1819,8 @@ EXPORT_SYMBOL(ceph_osdc_init); void ceph_osdc_stop(struct ceph_osd_client *osdc) { + flush_workqueue(osdc->notify_wq); + destroy_workqueue(osdc->notify_wq); cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); if (osdc->osdmap) { @@ -1483,6 +1828,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) osdc->osdmap = NULL; } remove_old_osds(osdc, 1); + WARN_ON(!RB_EMPTY_ROOT(&osdc->osds)); mempool_destroy(osdc->req_mempool); ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); @@ -1591,6 +1937,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) case CEPH_MSG_OSD_OPREPLY: handle_reply(osdc, msg, con); break; + case CEPH_MSG_WATCH_NOTIFY: + handle_watch_notify(osdc, msg); + break; default: pr_err("received unknown message type %d %s\n", type, @@ -1684,6 +2033,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_OSD_MAP: + case CEPH_MSG_WATCH_NOTIFY: return ceph_msg_new(type, front, GFP_NOFS); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 36e603c78ce..706502ff64a 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -350,7 +350,7 @@ static int __init init_net_drop_monitor(void) struct per_cpu_dm_data *data; int cpu, rc; - printk(KERN_INFO "Initalizing network drop monitor service\n"); + printk(KERN_INFO "Initializing network drop monitor service\n"); if (sizeof(void *) > 8) { printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c1a71bb738d..a1086fb0c0c 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1457,6 +1457,9 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data) { int err; + if (!dev->ethtool_ops->set_sg) + return -EOPNOTSUPP; + if (data && !(dev->features & NETIF_F_ALL_CSUM)) return -EINVAL; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 0c55eaa70e3..aeeece72b72 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3761,7 +3761,10 @@ static int __init pktgen_create_thread(int cpu) list_add_tail(&t->th_list, &pktgen_threads); init_completion(&t->start_done); - p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu); + p = kthread_create_on_node(pktgen_thread_worker, + t, + cpu_to_node(cpu), + "kpktgend_%d", cpu); if (IS_ERR(p)) { pr_err("kernel_thread() failed for cpu %d\n", t->cpu); list_del(&t->th_list); diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 0c282633791..116d3fd3d66 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -435,10 +435,10 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, udpdest.sin_addr.s_addr = htonl(network | addr.station); } + memset(&ah, 0, sizeof(ah)); ah.port = port; ah.cb = cb & 0x7f; ah.code = 2; /* magic */ - ah.pad = 0; /* tack our header on the front of the iovec */ size = sizeof(struct aunhdr); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b09ed0d080f..ffcea0d1678 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -387,7 +387,7 @@ ipt_do_table(struct sk_buff *skb, verdict = (unsigned)(-v) - 1; break; } - if (*stackptr == 0) { + if (*stackptr <= origptr) { e = get_entry(table_base, private->underflow[hook]); pr_debug("Underflow (this is normal) " @@ -427,10 +427,10 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - xt_info_rdunlock_bh(); pr_debug("Exiting %s; resetting sp from %u to %u\n", __func__, *stackptr, origptr); *stackptr = origptr; + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; #else diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 403ca57f601..d609ac3cb9a 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -664,8 +664,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, char buffer[PROC_WRITELEN+1]; unsigned long nodenum; - if (copy_from_user(buffer, input, PROC_WRITELEN)) + if (size > PROC_WRITELEN) + return -EIO; + if (copy_from_user(buffer, input, size)) return -EFAULT; + buffer[size] = 0; if (*buffer == '+') { nodenum = simple_strtoul(buffer+1, NULL, 10); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c9598a9067d..0b2af9b85ce 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -410,7 +410,7 @@ ip6t_do_table(struct sk_buff *skb, verdict = (unsigned)(-v) - 1; break; } - if (*stackptr == 0) + if (*stackptr <= origptr) e = get_entry(table_base, private->underflow[hook]); else @@ -441,8 +441,8 @@ ip6t_do_table(struct sk_buff *skb, break; } while (!acpar.hotdrop); - xt_info_rdunlock_bh(); *stackptr = origptr; + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 7cb65ef79f9..6dcf5e7d661 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -17,6 +17,16 @@ static struct ctl_table empty[1]; +static ctl_table ipv6_static_skeleton[] = { + { + .procname = "neigh", + .maxlen = 0, + .mode = 0555, + .child = empty, + }, + { } +}; + static ctl_table ipv6_table_template[] = { { .procname = "route", @@ -37,12 +47,6 @@ static ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "neigh", - .maxlen = 0, - .mode = 0555, - .child = empty, - }, { } }; @@ -160,7 +164,7 @@ static struct ctl_table_header *ip6_base; int ipv6_static_sysctl_register(void) { - ip6_base = register_sysctl_paths(net_ipv6_ctl_path, empty); + ip6_base = register_sysctl_paths(net_ipv6_ctl_path, ipv6_static_skeleton); if (ip6_base == NULL) return -ENOMEM; return 0; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 2731b51923d..9680226640e 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -148,7 +148,6 @@ static void ipx_destroy_socket(struct sock *sk) ipx_remove_socket(sk); skb_queue_purge(&sk->sk_receive_queue); sk_refcnt_debug_dec(sk); - sock_put(sk); } /* @@ -1404,6 +1403,7 @@ static int ipx_release(struct socket *sock) sk_refcnt_debug_release(sk); ipx_destroy_socket(sk); release_sock(sk); + sock_put(sk); out: return 0; } diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 8d9ce0accc9..a8193f52c13 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -283,7 +283,7 @@ static __net_init int l2tp_eth_init_net(struct net *net) return 0; } -static __net_initdata struct pernet_operations l2tp_eth_net_ops = { +static struct pernet_operations l2tp_eth_net_ops = { .init = l2tp_eth_init_net, .id = &l2tp_eth_net_id, .size = sizeof(struct l2tp_eth_net), diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 618a615acc9..d6b48230a54 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -94,16 +94,28 @@ static int find_set_type_get(const char *name, u8 family, u8 revision, struct ip_set_type **found) { + struct ip_set_type *type; + int err; + rcu_read_lock(); *found = find_set_type(name, family, revision); if (*found) { - int err = !try_module_get((*found)->me); - rcu_read_unlock(); - return err ? -EFAULT : 0; + err = !try_module_get((*found)->me) ? -EFAULT : 0; + goto unlock; } + /* Make sure the type is loaded but we don't support the revision */ + list_for_each_entry_rcu(type, &ip_set_type_list, list) + if (STREQ(type->name, name)) { + err = -IPSET_ERR_FIND_TYPE; + goto unlock; + } rcu_read_unlock(); return try_to_load_type(name); + +unlock: + rcu_read_unlock(); + return err; } /* Find a given set type by name and family. @@ -116,7 +128,7 @@ find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max) struct ip_set_type *type; bool found = false; - *min = *max = 0; + *min = 255; *max = 0; rcu_read_lock(); list_for_each_entry_rcu(type, &ip_set_type_list, list) if (STREQ(type->name, name) && @@ -124,7 +136,7 @@ find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max) found = true; if (type->revision < *min) *min = type->revision; - else if (type->revision > *max) + if (type->revision > *max) *max = type->revision; } rcu_read_unlock(); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index adbe787ea5d..b9214145d35 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -150,6 +150,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipport4_elem data = { }; u32 ip, ip_to, p, port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || @@ -172,21 +173,15 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMP: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMP)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -195,7 +190,6 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], } if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &data, timeout); @@ -219,13 +213,12 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], } else ip_to = ip; - port = ntohs(data.port); - if (tb[IPSET_ATTR_PORT_TO]) { + port_to = port = ntohs(data.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); - } else - port_to = port; + } for (; !before(ip_to, ip); ip++) for (p = port; p <= port_to; p++) { @@ -361,6 +354,7 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipport6_elem data = { }; u32 port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || @@ -385,21 +379,15 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMPV6: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -407,9 +395,7 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } - if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || - !tb[IPSET_ATTR_PORT_TO]) { + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &data, timeout); return ip_set_eexist(ret, flags) ? 0 : ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 22e23abb86c..4642872df6e 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -154,6 +154,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipportip4_elem data = { }; u32 ip, ip_to, p, port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || @@ -180,21 +181,15 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMP: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMP)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -203,7 +198,6 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], } if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &data, timeout); @@ -227,13 +221,12 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], } else ip_to = ip; - port = ntohs(data.port); - if (tb[IPSET_ATTR_PORT_TO]) { + port_to = port = ntohs(data.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); - } else - port_to = port; + } for (; !before(ip_to, ip); ip++) for (p = port; p <= port_to; p++) { @@ -375,6 +368,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipportip6_elem data = { }; u32 port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || @@ -403,21 +397,15 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMPV6: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -425,9 +413,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } - if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || - !tb[IPSET_ATTR_PORT_TO]) { + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &data, timeout); return ip_set_eexist(ret, flags) ? 0 : ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 6033e8b54bb..2cb84a54b7a 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -174,6 +174,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipportnet4_elem data = { .cidr = HOST_MASK }; u32 ip, ip_to, p, port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || @@ -208,21 +209,15 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMP: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMP)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -231,7 +226,6 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &data, timeout); @@ -255,13 +249,12 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else ip_to = ip; - port = ntohs(data.port); - if (tb[IPSET_ATTR_PORT_TO]) { + port_to = port = ntohs(data.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); - } else - port_to = port; + } for (; !before(ip_to, ip); ip++) for (p = port; p <= port_to; p++) { @@ -429,6 +422,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_ipportnet6_elem data = { .cidr = HOST_MASK }; u32 port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || @@ -465,21 +459,15 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMPV6: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -487,9 +475,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } - if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || - !tb[IPSET_ATTR_PORT_TO]) { + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &data, timeout); return ip_set_eexist(ret, flags) ? 0 : ret; } diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 34a165626ee..8598676f2a0 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -170,6 +170,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netport4_elem data = { .cidr = HOST_MASK }; u32 port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || @@ -198,21 +199,15 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMP: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMP)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -220,9 +215,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } - if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || - !tb[IPSET_ATTR_PORT_TO]) { + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &data, timeout); return ip_set_eexist(ret, flags) ? 0 : ret; } @@ -390,6 +383,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netport6_elem data = { .cidr = HOST_MASK }; u32 port, port_to; u32 timeout = h->timeout; + bool with_ports = false; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || @@ -418,21 +412,15 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_PROTO]) { data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); if (data.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else return -IPSET_ERR_MISSING_PROTO; - switch (data.proto) { - case IPPROTO_UDP: - case IPPROTO_TCP: - case IPPROTO_ICMPV6: - break; - default: + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) data.port = 0; - break; - } if (tb[IPSET_ATTR_TIMEOUT]) { if (!with_timeout(h->timeout)) @@ -440,9 +428,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } - if (adt == IPSET_TEST || - !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) || - !tb[IPSET_ATTR_PORT_TO]) { + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &data, timeout); return ip_set_eexist(ret, flags) ? 0 : ret; } diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 5c48ffb60c2..2dc6de13ac1 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -43,6 +43,8 @@ EXPORT_SYMBOL(register_ip_vs_app); EXPORT_SYMBOL(unregister_ip_vs_app); EXPORT_SYMBOL(register_ip_vs_app_inc); +static DEFINE_MUTEX(__ip_vs_app_mutex); + /* * Get an ip_vs_app object */ @@ -167,14 +169,13 @@ int register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, __u16 port) { - struct netns_ipvs *ipvs = net_ipvs(net); int result; - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); result = ip_vs_app_inc_new(net, app, proto, port); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); return result; } @@ -189,11 +190,11 @@ int register_ip_vs_app(struct net *net, struct ip_vs_app *app) /* increase the module use count */ ip_vs_use_count_inc(); - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); list_add(&app->a_list, &ipvs->app_list); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); return 0; } @@ -205,10 +206,9 @@ int register_ip_vs_app(struct net *net, struct ip_vs_app *app) */ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *inc, *nxt; - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { ip_vs_app_inc_release(net, inc); @@ -216,7 +216,7 @@ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) list_del(&app->a_list); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -501,7 +501,7 @@ static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; } @@ -535,9 +535,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) { - struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq)); - - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); } static int ip_vs_app_seq_show(struct seq_file *seq, void *v) @@ -583,7 +581,6 @@ static int __net_init __ip_vs_app_init(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); INIT_LIST_HEAD(&ipvs->app_list); - __mutex_init(&ipvs->app_mutex, "ipvs->app_mutex", &ipvs->app_key); proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b799cea31f9..33733c8872e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3605,7 +3605,7 @@ int __net_init __ip_vs_control_init(struct net *net) /* procfs stats */ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (ipvs->tot_stats.cpustats) { + if (!ipvs->tot_stats.cpustats) { pr_err("%s(): alloc_percpu.\n", __func__); return -ENOMEM; } diff --git a/net/socket.c b/net/socket.c index 937d0fcf74b..5212447c86e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2588,23 +2588,123 @@ static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) { + struct compat_ethtool_rxnfc __user *compat_rxnfc; + bool convert_in = false, convert_out = false; + size_t buf_size = ALIGN(sizeof(struct ifreq), 8); + struct ethtool_rxnfc __user *rxnfc; struct ifreq __user *ifr; + u32 rule_cnt = 0, actual_rule_cnt; + u32 ethcmd; u32 data; - void __user *datap; + int ret; + + if (get_user(data, &ifr32->ifr_ifru.ifru_data)) + return -EFAULT; - ifr = compat_alloc_user_space(sizeof(*ifr)); + compat_rxnfc = compat_ptr(data); - if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ)) + if (get_user(ethcmd, &compat_rxnfc->cmd)) return -EFAULT; - if (get_user(data, &ifr32->ifr_ifru.ifru_data)) + /* Most ethtool structures are defined without padding. + * Unfortunately struct ethtool_rxnfc is an exception. + */ + switch (ethcmd) { + default: + break; + case ETHTOOL_GRXCLSRLALL: + /* Buffer size is variable */ + if (get_user(rule_cnt, &compat_rxnfc->rule_cnt)) + return -EFAULT; + if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32)) + return -ENOMEM; + buf_size += rule_cnt * sizeof(u32); + /* fall through */ + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + convert_out = true; + /* fall through */ + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + buf_size += sizeof(struct ethtool_rxnfc); + convert_in = true; + break; + } + + ifr = compat_alloc_user_space(buf_size); + rxnfc = (void *)ifr + ALIGN(sizeof(struct ifreq), 8); + + if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ)) return -EFAULT; - datap = compat_ptr(data); - if (put_user(datap, &ifr->ifr_ifru.ifru_data)) + if (put_user(convert_in ? rxnfc : compat_ptr(data), + &ifr->ifr_ifru.ifru_data)) return -EFAULT; - return dev_ioctl(net, SIOCETHTOOL, ifr); + if (convert_in) { + /* We expect there to be holes between fs.m_u and + * fs.ring_cookie and at the end of fs, but nowhere else. + */ + BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_u) + + sizeof(compat_rxnfc->fs.m_u) != + offsetof(struct ethtool_rxnfc, fs.m_u) + + sizeof(rxnfc->fs.m_u)); + BUILD_BUG_ON( + offsetof(struct compat_ethtool_rxnfc, fs.location) - + offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != + offsetof(struct ethtool_rxnfc, fs.location) - + offsetof(struct ethtool_rxnfc, fs.ring_cookie)); + + if (copy_in_user(rxnfc, compat_rxnfc, + (void *)(&rxnfc->fs.m_u + 1) - + (void *)rxnfc) || + copy_in_user(&rxnfc->fs.ring_cookie, + &compat_rxnfc->fs.ring_cookie, + (void *)(&rxnfc->fs.location + 1) - + (void *)&rxnfc->fs.ring_cookie) || + copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt, + sizeof(rxnfc->rule_cnt))) + return -EFAULT; + } + + ret = dev_ioctl(net, SIOCETHTOOL, ifr); + if (ret) + return ret; + + if (convert_out) { + if (copy_in_user(compat_rxnfc, rxnfc, + (const void *)(&rxnfc->fs.m_u + 1) - + (const void *)rxnfc) || + copy_in_user(&compat_rxnfc->fs.ring_cookie, + &rxnfc->fs.ring_cookie, + (const void *)(&rxnfc->fs.location + 1) - + (const void *)&rxnfc->fs.ring_cookie) || + copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt, + sizeof(rxnfc->rule_cnt))) + return -EFAULT; + + if (ethcmd == ETHTOOL_GRXCLSRLALL) { + /* As an optimisation, we only copy the actual + * number of rules that the underlying + * function returned. Since Mallory might + * change the rule count in user memory, we + * check that it is less than the rule count + * originally given (as the user buffer size), + * which has been range-checked. + */ + if (get_user(actual_rule_cnt, &rxnfc->rule_cnt)) + return -EFAULT; + if (actual_rule_cnt < rule_cnt) + rule_cnt = actual_rule_cnt; + if (copy_in_user(&compat_rxnfc->rule_locs[0], + &rxnfc->rule_locs[0], + rule_cnt * sizeof(u32))) + return -EFAULT; + } + } + + return 0; } static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index d575f053486..f83a3d1da81 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1907,7 +1907,7 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu) return res; } -int xfrm_init_state(struct xfrm_state *x) +int __xfrm_init_state(struct xfrm_state *x, bool init_replay) { struct xfrm_state_afinfo *afinfo; struct xfrm_mode *inner_mode; @@ -1980,12 +1980,25 @@ int xfrm_init_state(struct xfrm_state *x) if (x->outer_mode == NULL) goto error; + if (init_replay) { + err = xfrm_init_replay(x); + if (err) + goto error; + } + x->km.state = XFRM_STATE_VALID; error: return err; } +EXPORT_SYMBOL(__xfrm_init_state); + +int xfrm_init_state(struct xfrm_state *x) +{ + return __xfrm_init_state(x, true); +} + EXPORT_SYMBOL(xfrm_init_state); int __net_init xfrm_state_init(struct net *net) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 706385ae3e4..fc152d28753 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -511,7 +511,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); - err = xfrm_init_state(x); + err = __xfrm_init_state(x, false); if (err) goto error; |