diff options
Diffstat (limited to 'net/netfilter/ipvs')
-rw-r--r-- | net/netfilter/ipvs/ip_vs_app.c | 2 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_conn.c | 104 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_core.c | 43 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ctl.c | 270 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_dh.c | 4 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ftp.c | 6 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_lblc.c | 14 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_lblcr.c | 14 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto.c | 56 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto_sctp.c | 5 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto_tcp.c | 5 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_proto_udp.c | 5 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_sh.c | 4 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_sync.c | 668 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_wrr.c | 2 |
15 files changed, 785 insertions, 417 deletions
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 52856178c9d..64f9e8f1320 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -313,7 +313,7 @@ vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) * Assumes already checked proto==IPPROTO_TCP and diff!=0. */ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, - unsigned flag, __u32 seq, int diff) + unsigned int flag, __u32 seq, int diff) { /* spinlock is to keep updating cp->flags atomic */ spin_lock(&cp->lock); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 29fa5badde7..1548df9a752 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -86,42 +86,42 @@ struct ip_vs_aligned_lock static struct ip_vs_aligned_lock __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; -static inline void ct_read_lock(unsigned key) +static inline void ct_read_lock(unsigned int key) { read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static inline void ct_read_unlock(unsigned key) +static inline void ct_read_unlock(unsigned int key) { read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static inline void ct_write_lock(unsigned key) +static inline void ct_write_lock(unsigned int key) { write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static inline void ct_write_unlock(unsigned key) +static inline void ct_write_unlock(unsigned int key) { write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static inline void ct_read_lock_bh(unsigned key) +static inline void ct_read_lock_bh(unsigned int key) { read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static inline void ct_read_unlock_bh(unsigned key) +static inline void ct_read_unlock_bh(unsigned int key) { read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static inline void ct_write_lock_bh(unsigned key) +static inline void ct_write_lock_bh(unsigned int key) { write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } -static inline void ct_write_unlock_bh(unsigned key) +static inline void ct_write_unlock_bh(unsigned int key) { write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } @@ -130,7 +130,7 @@ static inline void ct_write_unlock_bh(unsigned key) /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto, +static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { @@ -188,7 +188,7 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) */ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) { - unsigned hash; + unsigned int hash; int ret; if (cp->flags & IP_VS_CONN_F_ONE_PACKET) @@ -224,7 +224,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) */ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) { - unsigned hash; + unsigned int hash; int ret; /* unhash it and decrease its reference counter */ @@ -257,7 +257,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) static inline struct ip_vs_conn * __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp; struct hlist_node *n; @@ -344,7 +344,7 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); /* Get reference to connection template */ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp; struct hlist_node *n; @@ -394,7 +394,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) * p->vaddr, p->vport: pkt dest address (foreign host) */ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { - unsigned hash; + unsigned int hash; struct ip_vs_conn *cp, *ret=NULL; struct hlist_node *n; @@ -548,6 +548,7 @@ static inline void ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) { unsigned int conn_flags; + __u32 flags; /* if dest is NULL, then return directly */ if (!dest) @@ -559,17 +560,19 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) conn_flags = atomic_read(&dest->conn_flags); if (cp->protocol != IPPROTO_UDP) conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; + flags = cp->flags; /* Bind with the destination and its corresponding transmitter */ - if (cp->flags & IP_VS_CONN_F_SYNC) { + if (flags & IP_VS_CONN_F_SYNC) { /* if the connection is not template and is created * by sync, preserve the activity flag. */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) + if (!(flags & IP_VS_CONN_F_TEMPLATE)) conn_flags &= ~IP_VS_CONN_F_INACTIVE; /* connections inherit forwarding method from dest */ - cp->flags &= ~IP_VS_CONN_F_FWD_MASK; + flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); } - cp->flags |= conn_flags; + flags |= conn_flags; + cp->flags = flags; cp->dest = dest; IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " @@ -584,12 +587,12 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) atomic_read(&dest->refcnt)); /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so increase the inactive - connection counter because it is in TCP SYNRECV - state (inactive) or other protocol inacive state */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so modify the counters + * according to the flags, later the protocol can + * update them on state change + */ + if (!(flags & IP_VS_CONN_F_INACTIVE)) atomic_inc(&dest->activeconns); else atomic_inc(&dest->inactconns); @@ -613,14 +616,40 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) { struct ip_vs_dest *dest; - if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, - cp->dport, &cp->vaddr, cp->vport, - cp->protocol, cp->fwmark, cp->flags); + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + cp->dport, &cp->vaddr, cp->vport, + cp->protocol, cp->fwmark, cp->flags); + if (dest) { + struct ip_vs_proto_data *pd; + + spin_lock(&cp->lock); + if (cp->dest) { + spin_unlock(&cp->lock); + return dest; + } + + /* Applications work depending on the forwarding method + * but better to reassign them always when binding dest */ + if (cp->app) + ip_vs_unbind_app(cp); + ip_vs_bind_dest(cp, dest); - return dest; - } else - return NULL; + spin_unlock(&cp->lock); + + /* Update its packet transmitter */ + cp->packet_xmit = NULL; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol); + if (pd && atomic_read(&pd->appcnt)) + ip_vs_bind_app(cp, pd->pp); + } + return dest; } @@ -743,7 +772,8 @@ int ip_vs_check_template(struct ip_vs_conn *ct) static void ip_vs_conn_expire(unsigned long data) { struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + struct net *net = ip_vs_conn_net(cp); + struct netns_ipvs *ipvs = net_ipvs(net); cp->timeout = 60*HZ; @@ -808,6 +838,9 @@ static void ip_vs_conn_expire(unsigned long data) atomic_read(&cp->refcnt)-1, atomic_read(&cp->n_control)); + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs)); + ip_vs_conn_put(cp); } @@ -824,7 +857,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) */ struct ip_vs_conn * ip_vs_conn_new(const struct ip_vs_conn_param *p, - const union nf_inet_addr *daddr, __be16 dport, unsigned flags, + const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest *dest, __u32 fwmark) { struct ip_vs_conn *cp; @@ -881,6 +914,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, /* Set its state and timeout */ cp->state = 0; cp->timeout = 3*HZ; + cp->sync_endtime = jiffies & ~3UL; /* Bind its packet transmitter */ #ifdef CONFIG_IP_VS_IPV6 @@ -1057,7 +1091,7 @@ static const struct file_operations ip_vs_conn_fops = { .release = seq_release_net, }; -static const char *ip_vs_origin_name(unsigned flags) +static const char *ip_vs_origin_name(unsigned int flags) { if (flags & IP_VS_CONN_F_SYNC) return "SYNC"; @@ -1169,7 +1203,7 @@ void ip_vs_random_dropentry(struct net *net) * Randomly scan 1/32 of the whole table every second */ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { - unsigned hash = net_random() & ip_vs_conn_tab_mask; + unsigned int hash = net_random() & ip_vs_conn_tab_mask; struct hlist_node *n; /* diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 2555816e778..a54b018c6ee 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -80,7 +80,7 @@ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); #define icmp_id(icmph) (((icmph)->un).echo.id) #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) -const char *ip_vs_proto_name(unsigned proto) +const char *ip_vs_proto_name(unsigned int proto) { static char buf[20]; @@ -1613,34 +1613,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) else pkts = atomic_add_return(1, &cp->in_pkts); - if ((ipvs->sync_state & IP_VS_STATE_MASTER) && - cp->protocol == IPPROTO_SCTP) { - if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && - (pkts % sysctl_sync_period(ipvs) - == sysctl_sync_threshold(ipvs))) || - (cp->old_state != cp->state && - ((cp->state == IP_VS_SCTP_S_CLOSED) || - (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || - (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { - ip_vs_sync_conn(net, cp); - goto out; - } - } - - /* Keep this block last: TCP and others with pp->num_states <= 1 */ - else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && - (((cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (pkts % sysctl_sync_period(ipvs) - == sysctl_sync_threshold(ipvs))) || - ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && - ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE) || - (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || - (cp->state == IP_VS_TCP_S_TIME_WAIT))))) - ip_vs_sync_conn(net, cp); -out: - cp->old_state = cp->state; + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(net, cp, pkts); ip_vs_conn_put(cp); return ret; @@ -1924,6 +1898,7 @@ protocol_fail: control_fail: ip_vs_estimator_net_cleanup(net); estimator_fail: + net->ipvs = NULL; return -ENOMEM; } @@ -1936,6 +1911,7 @@ static void __net_exit __ip_vs_cleanup(struct net *net) ip_vs_control_net_cleanup(net); ip_vs_estimator_net_cleanup(net); IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); + net->ipvs = NULL; } static void __net_exit __ip_vs_dev_cleanup(struct net *net) @@ -1993,10 +1969,18 @@ static int __init ip_vs_init(void) goto cleanup_dev; } + ret = ip_vs_register_nl_ioctl(); + if (ret < 0) { + pr_err("can't register netlink/ioctl.\n"); + goto cleanup_hooks; + } + pr_info("ipvs loaded.\n"); return ret; +cleanup_hooks: + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); cleanup_dev: unregister_pernet_device(&ipvs_core_dev_ops); cleanup_sub: @@ -2012,6 +1996,7 @@ exit: static void __exit ip_vs_cleanup(void) { + ip_vs_unregister_nl_ioctl(); nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); unregister_pernet_device(&ipvs_core_dev_ops); unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b3afe189af6..dd811b8dd97 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -265,11 +265,11 @@ static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; /* * Returns hash value for virtual service */ -static inline unsigned -ip_vs_svc_hashkey(struct net *net, int af, unsigned proto, +static inline unsigned int +ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { - register unsigned porth = ntohs(port); + register unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 @@ -286,7 +286,7 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned proto, /* * Returns hash value of fwmark for virtual service lookup */ -static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) +static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) { return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } @@ -298,7 +298,7 @@ static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) */ static int ip_vs_svc_hash(struct ip_vs_service *svc) { - unsigned hash; + unsigned int hash; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pF\n", @@ -361,7 +361,7 @@ static inline struct ip_vs_service * __ip_vs_service_find(struct net *net, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { - unsigned hash; + unsigned int hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ @@ -388,7 +388,7 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol, static inline struct ip_vs_service * __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) { - unsigned hash; + unsigned int hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ @@ -489,11 +489,11 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest) /* * Returns hash value for real service */ -static inline unsigned ip_vs_rs_hashkey(int af, +static inline unsigned int ip_vs_rs_hashkey(int af, const union nf_inet_addr *addr, __be16 port) { - register unsigned porth = ntohs(port); + register unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 @@ -512,7 +512,7 @@ static inline unsigned ip_vs_rs_hashkey(int af, */ static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { - unsigned hash; + unsigned int hash; if (!list_empty(&dest->d_list)) { return 0; @@ -555,7 +555,7 @@ ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, __be16 dport) { struct netns_ipvs *ipvs = net_ipvs(net); - unsigned hash; + unsigned int hash; struct ip_vs_dest *dest; /* @@ -842,7 +842,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, struct ip_vs_dest **dest_p) { struct ip_vs_dest *dest; - unsigned atype; + unsigned int atype; EnterFunction(2); @@ -1599,6 +1599,10 @@ static int ip_vs_zero_all(struct net *net) } #ifdef CONFIG_SYSCTL + +static int zero; +static int three = 3; + static int proc_do_defense_mode(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1632,7 +1636,8 @@ proc_do_sync_threshold(ctl_table *table, int write, memcpy(val, valp, sizeof(val)); rc = proc_dointvec(table, write, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + if (write && (valp[0] < 0 || valp[1] < 0 || + (valp[0] >= valp[1] && valp[1]))) { /* Restore the correct value */ memcpy(valp, val, sizeof(val)); } @@ -1652,9 +1657,24 @@ proc_do_sync_mode(ctl_table *table, int write, if ((*valp < 0) || (*valp > 1)) { /* Restore the correct value */ *valp = val; - } else { - struct net *net = current->nsproxy->net_ns; - ip_vs_sync_switch_mode(net, val); + } + } + return rc; +} + +static int +proc_do_sync_ports(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if (*valp < 1 || !is_power_of_2(*valp)) { + /* Restore the correct value */ + *valp = val; } } return rc; @@ -1718,6 +1738,24 @@ static struct ctl_table vs_vars[] = { .proc_handler = &proc_do_sync_mode, }, { + .procname = "sync_ports", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_sync_ports, + }, + { + .procname = "sync_qlen_max", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_sock_size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "cache_bypass", .maxlen = sizeof(int), .mode = 0644, @@ -1743,6 +1781,20 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_do_sync_threshold, }, { + .procname = "sync_refresh_period", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "sync_retries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &three, + }, + { .procname = "nat_icmp_send", .maxlen = sizeof(int), .mode = 0644, @@ -1846,13 +1898,6 @@ static struct ctl_table vs_vars[] = { { } }; -const struct ctl_path net_vs_ctl_path[] = { - { .procname = "net", }, - { .procname = "ipv4", }, - { .procname = "vs", }, - { } -}; -EXPORT_SYMBOL_GPL(net_vs_ctl_path); #endif #ifdef CONFIG_PROC_FS @@ -1867,7 +1912,7 @@ struct ip_vs_iter { * Write the contents of the VS rule table to a PROCfs file. * (It is kept just for backward compatibility) */ -static inline const char *ip_vs_fwd_name(unsigned flags) +static inline const char *ip_vs_fwd_name(unsigned int flags) { switch (flags & IP_VS_CONN_F_FWD_MASK) { case IP_VS_CONN_F_LOCALNODE: @@ -2816,17 +2861,17 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, ip_vs_copy_stats(&ustats, stats); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, ustats.cps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps); - + if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) || + nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps)) + goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; @@ -2847,23 +2892,25 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, if (!nl_service) return -EMSGSIZE; - NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); - + if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) + goto nla_put_failure; if (svc->fwmark) { - NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); + if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) + goto nla_put_failure; } else { - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); - NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); + if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || + nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || + nla_put_u16(skb, IPVS_SVC_ATTR_PORT, svc->port)) + goto nla_put_failure; } - NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); - if (svc->pe) - NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name); - NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); - + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) || + (svc->pe && + nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) || + nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || + nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || + nla_put_u32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) + goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) goto nla_put_failure; @@ -3038,21 +3085,22 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) if (!nl_dest) return -EMSGSIZE; - NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); - NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); - - NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, - atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, - atomic_read(&dest->activeconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, - atomic_read(&dest->inactconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns)); - + if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || + nla_put_u16(skb, IPVS_DEST_ATTR_PORT, dest->port) || + nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, + (atomic_read(&dest->conn_flags) & + IP_VS_CONN_F_FWD_MASK)) || + nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, + atomic_read(&dest->weight)) || + nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns))) + goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) goto nla_put_failure; @@ -3181,10 +3229,10 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, if (!nl_daemon) return -EMSGSIZE; - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); - NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); - + if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) + goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; @@ -3473,21 +3521,26 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) __ip_vs_get_timeouts(net, &t); #ifdef CONFIG_IP_VS_PROTO_TCP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, - t.tcp_fin_timeout); + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, + t.tcp_timeout) || + nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout)) + goto nla_put_failure; #endif #ifdef CONFIG_IP_VS_PROTO_UDP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) + goto nla_put_failure; #endif break; } case IPVS_CMD_GET_INFO: - NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); - NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - ip_vs_conn_tab_size); + if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, + IP_VS_VERSION_CODE) || + nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + ip_vs_conn_tab_size)) + goto nla_put_failure; break; } @@ -3654,6 +3707,12 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net) tbl[idx++].data = &ipvs->sysctl_snat_reroute; ipvs->sysctl_sync_ver = 1; tbl[idx++].data = &ipvs->sysctl_sync_ver; + ipvs->sysctl_sync_ports = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ports; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + tbl[idx++].data = &ipvs->sysctl_sync_sock_size; tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; @@ -3661,11 +3720,14 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net) ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); + ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; + tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; + ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); + tbl[idx++].data = &ipvs->sysctl_sync_retries; tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; - ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path, - tbl); + ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { if (!net_eq(net, &init_net)) kfree(tbl); @@ -3680,7 +3742,7 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net) return 0; } -void __net_init ip_vs_control_net_cleanup_sysctl(struct net *net) +void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -3692,7 +3754,7 @@ void __net_init ip_vs_control_net_cleanup_sysctl(struct net *net) #else int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; } -void __net_init ip_vs_control_net_cleanup_sysctl(struct net *net) { } +void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { } #endif @@ -3750,21 +3812,10 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net) free_percpu(ipvs->tot_stats.cpustats); } -int __init ip_vs_control_init(void) +int __init ip_vs_register_nl_ioctl(void) { - int idx; int ret; - EnterFunction(2); - - /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - - smp_wmb(); /* Do we really need it now ? */ - ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); @@ -3776,28 +3827,47 @@ int __init ip_vs_control_init(void) pr_err("cannot register Generic Netlink interface.\n"); goto err_genl; } - - ret = register_netdevice_notifier(&ip_vs_dst_notifier); - if (ret < 0) - goto err_notf; - - LeaveFunction(2); return 0; -err_notf: - ip_vs_genl_unregister(); err_genl: nf_unregister_sockopt(&ip_vs_sockopts); err_sock: return ret; } +void ip_vs_unregister_nl_ioctl(void) +{ + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); +} + +int __init ip_vs_control_init(void) +{ + int idx; + int ret; + + EnterFunction(2); + + /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + + smp_wmb(); /* Do we really need it now ? */ + + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + return ret; + + LeaveFunction(2); + return 0; +} + void ip_vs_control_cleanup(void) { EnterFunction(2); unregister_netdevice_notifier(&ip_vs_dst_notifier); - ip_vs_genl_unregister(); - nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); } diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 1c269e56200..8b7dca9ea42 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -68,7 +68,7 @@ struct ip_vs_dh_bucket { /* * Returns hash value for IPVS DH entry */ -static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -149,7 +149,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc) /* allocate the DH table for this service */ tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, - GFP_ATOMIC); + GFP_KERNEL); if (tbl == NULL) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 538d74ee4f6..b20b29c903e 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -177,7 +177,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, __be16 port; struct ip_vs_conn *n_cp; char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ - unsigned buf_len; + unsigned int buf_len; int ret = 0; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -439,6 +439,8 @@ static int __net_init __ip_vs_ftp_init(struct net *net) struct ip_vs_app *app; struct netns_ipvs *ipvs = net_ipvs(net); + if (!ipvs) + return -ENOENT; app = kmemdup(&ip_vs_ftp, sizeof(struct ip_vs_app), GFP_KERNEL); if (!app) return -ENOMEM; @@ -483,7 +485,7 @@ static struct pernet_operations ip_vs_ftp_ops = { .exit = __ip_vs_ftp_exit, }; -int __init ip_vs_ftp_init(void) +static int __init ip_vs_ftp_init(void) { int rv; diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 0f16283fd05..df646ccf08a 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -142,7 +142,7 @@ static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) /* * Returns hash value for IPVS LBLC entry */ -static inline unsigned +static inline unsigned int ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -163,7 +163,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) static void ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) { - unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr); + unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); list_add(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); @@ -178,7 +178,7 @@ static inline struct ip_vs_lblc_entry * ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, const union nf_inet_addr *addr) { - unsigned hash = ip_vs_lblc_hashkey(af, addr); + unsigned int hash = ip_vs_lblc_hashkey(af, addr); struct ip_vs_lblc_entry *en; list_for_each_entry(en, &tbl->bucket[hash], list) @@ -342,7 +342,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblc_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; @@ -551,6 +551,9 @@ static int __net_init __ip_vs_lblc_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + if (!ipvs) + return -ENOENT; + if (!net_eq(net, &init_net)) { ipvs->lblc_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), @@ -563,8 +566,7 @@ static int __net_init __ip_vs_lblc_init(struct net *net) ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; ipvs->lblc_ctl_header = - register_net_sysctl_table(net, net_vs_ctl_path, - ipvs->lblc_ctl_table); + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); if (!ipvs->lblc_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblc_ctl_table); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index eec797f8cce..570e31ea427 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -311,7 +311,7 @@ static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) /* * Returns hash value for IPVS LBLCR entry */ -static inline unsigned +static inline unsigned int ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -332,7 +332,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) static void ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) { - unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr); + unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr); list_add(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); @@ -347,7 +347,7 @@ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *addr) { - unsigned hash = ip_vs_lblcr_hashkey(af, addr); + unsigned int hash = ip_vs_lblcr_hashkey(af, addr); struct ip_vs_lblcr_entry *en; list_for_each_entry(en, &tbl->bucket[hash], list) @@ -511,7 +511,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblcr_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; @@ -745,6 +745,9 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + if (!ipvs) + return -ENOENT; + if (!net_eq(net, &init_net)) { ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), @@ -757,8 +760,7 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; ipvs->lblcr_ctl_header = - register_net_sysctl_table(net, net_vs_ctl_path, - ipvs->lblcr_ctl_table); + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table); if (!ipvs->lblcr_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblcr_ctl_table); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index f843a883325..50d82186da8 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -48,7 +48,7 @@ static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; */ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) { - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp->next = ip_vs_proto_table[hash]; ip_vs_proto_table[hash] = pp; @@ -59,9 +59,6 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) return 0; } -#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) || \ - defined(CONFIG_IP_VS_PROTO_SCTP) || defined(CONFIG_IP_VS_PROTO_AH) || \ - defined(CONFIG_IP_VS_PROTO_ESP) /* * register an ipvs protocols netns related data */ @@ -69,9 +66,9 @@ static int register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) { struct netns_ipvs *ipvs = net_ipvs(net); - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = - kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC); + kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); if (!pd) return -ENOMEM; @@ -81,12 +78,18 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) ipvs->proto_data_table[hash] = pd; atomic_set(&pd->appcnt, 0); /* Init app counter */ - if (pp->init_netns != NULL) - pp->init_netns(net, pd); + if (pp->init_netns != NULL) { + int ret = pp->init_netns(net, pd); + if (ret) { + /* unlink an free proto data */ + ipvs->proto_data_table[hash] = pd->next; + kfree(pd); + return ret; + } + } return 0; } -#endif /* * unregister an ipvs protocol @@ -94,7 +97,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) { struct ip_vs_protocol **pp_p; - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp_p = &ip_vs_proto_table[hash]; for (; *pp_p; pp_p = &(*pp_p)->next) { @@ -117,7 +120,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data **pd_p; - unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol); + unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); pd_p = &ipvs->proto_data_table[hash]; for (; *pd_p; pd_p = &(*pd_p)->next) { @@ -139,7 +142,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) { struct ip_vs_protocol *pp; - unsigned hash = IP_VS_PROTO_HASH(proto); + unsigned int hash = IP_VS_PROTO_HASH(proto); for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { if (pp->protocol == proto) @@ -153,11 +156,11 @@ EXPORT_SYMBOL(ip_vs_proto_get); /* * get ip_vs_protocol object data by netns and proto */ -struct ip_vs_proto_data * +static struct ip_vs_proto_data * __ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { struct ip_vs_proto_data *pd; - unsigned hash = IP_VS_PROTO_HASH(proto); + unsigned int hash = IP_VS_PROTO_HASH(proto); for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { if (pd->pp->protocol == proto) @@ -196,7 +199,7 @@ void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) int * ip_vs_create_timeout_table(int *table, int size) { - return kmemdup(table, size, GFP_ATOMIC); + return kmemdup(table, size, GFP_KERNEL); } @@ -316,22 +319,35 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, */ int __net_init ip_vs_protocol_net_init(struct net *net) { + int i, ret; + static struct ip_vs_protocol *protos[] = { #ifdef CONFIG_IP_VS_PROTO_TCP - register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); + &ip_vs_protocol_tcp, #endif #ifdef CONFIG_IP_VS_PROTO_UDP - register_ip_vs_proto_netns(net, &ip_vs_protocol_udp); + &ip_vs_protocol_udp, #endif #ifdef CONFIG_IP_VS_PROTO_SCTP - register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp); + &ip_vs_protocol_sctp, #endif #ifdef CONFIG_IP_VS_PROTO_AH - register_ip_vs_proto_netns(net, &ip_vs_protocol_ah); + &ip_vs_protocol_ah, #endif #ifdef CONFIG_IP_VS_PROTO_ESP - register_ip_vs_proto_netns(net, &ip_vs_protocol_esp); + &ip_vs_protocol_esp, #endif + }; + + for (i = 0; i < ARRAY_SIZE(protos); i++) { + ret = register_ip_vs_proto_netns(net, protos[i]); + if (ret < 0) + goto cleanup; + } return 0; + +cleanup: + ip_vs_protocol_net_cleanup(net); + return ret; } void __net_exit ip_vs_protocol_net_cleanup(struct net *net) diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 1fbf7a2816f..9f3fb751c49 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -1090,7 +1090,7 @@ out: * timeouts is netns related now. * --------------------------------------------- */ -static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -1098,6 +1098,9 @@ static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) spin_lock_init(&ipvs->sctp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, sizeof(sctp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; } static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index ef8641f7af8..cd609cc6272 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -677,7 +677,7 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) * timeouts is netns related now. * --------------------------------------------- */ -static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -685,7 +685,10 @@ static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) spin_lock_init(&ipvs->tcp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; pd->tcp_state_table = tcp_states; + return 0; } static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index f4b7262896b..2fedb2dcb3d 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -467,7 +467,7 @@ udp_state_transition(struct ip_vs_conn *cp, int direction, cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; } -static void __udp_init(struct net *net, struct ip_vs_proto_data *pd) +static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -475,6 +475,9 @@ static void __udp_init(struct net *net, struct ip_vs_proto_data *pd) spin_lock_init(&ipvs->udp_app_lock); pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, sizeof(udp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; } static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 069e8d4d5c0..05126521743 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -70,7 +70,7 @@ struct ip_vs_sh_bucket { /* * Returns hash value for IPVS SH entry */ -static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; @@ -162,7 +162,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc) /* allocate the SH table for this service */ tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_ATOMIC); + GFP_KERNEL); if (tbl == NULL) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 8a0d6d6889f..effa10c9e4e 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -196,6 +196,7 @@ struct ip_vs_sync_thread_data { struct net *net; struct socket *sock; char *buf; + int id; }; /* Version 0 definition of packet sizes */ @@ -271,13 +272,6 @@ struct ip_vs_sync_buff { unsigned char *end; }; -/* multicast addr */ -static struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), -}; - /* * Copy of struct ip_vs_seq * From unaligned network order to aligned host order @@ -300,18 +294,22 @@ static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) put_unaligned_be32(ho->previous_delta, &no->previous_delta); } -static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs) +static inline struct ip_vs_sync_buff * +sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) { struct ip_vs_sync_buff *sb; spin_lock_bh(&ipvs->sync_lock); - if (list_empty(&ipvs->sync_queue)) { + if (list_empty(&ms->sync_queue)) { sb = NULL; + __set_current_state(TASK_INTERRUPTIBLE); } else { - sb = list_entry(ipvs->sync_queue.next, - struct ip_vs_sync_buff, + sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, list); list_del(&sb->list); + ms->sync_queue_len--; + if (!ms->sync_queue_len) + ms->sync_queue_delay = 0; } spin_unlock_bh(&ipvs->sync_lock); @@ -334,7 +332,7 @@ ip_vs_sync_buff_create(struct netns_ipvs *ipvs) kfree(sb); return NULL; } - sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */ + sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ sb->mesg->version = SYNC_PROTO_VER; sb->mesg->syncid = ipvs->master_syncid; sb->mesg->size = sizeof(struct ip_vs_sync_mesg); @@ -353,14 +351,22 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) kfree(sb); } -static inline void sb_queue_tail(struct netns_ipvs *ipvs) +static inline void sb_queue_tail(struct netns_ipvs *ipvs, + struct ipvs_master_sync_state *ms) { - struct ip_vs_sync_buff *sb = ipvs->sync_buff; + struct ip_vs_sync_buff *sb = ms->sync_buff; spin_lock(&ipvs->sync_lock); - if (ipvs->sync_state & IP_VS_STATE_MASTER) - list_add_tail(&sb->list, &ipvs->sync_queue); - else + if (ipvs->sync_state & IP_VS_STATE_MASTER && + ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { + if (!ms->sync_queue_len) + schedule_delayed_work(&ms->master_wakeup_work, + max(IPVS_SYNC_SEND_DELAY, 1)); + ms->sync_queue_len++; + list_add_tail(&sb->list, &ms->sync_queue); + if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) + wake_up_process(ms->master_thread); + } else ip_vs_sync_buff_release(sb); spin_unlock(&ipvs->sync_lock); } @@ -370,49 +376,26 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs) * than the specified time or the specified time is zero. */ static inline struct ip_vs_sync_buff * -get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time) +get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, + unsigned long time) { struct ip_vs_sync_buff *sb; spin_lock_bh(&ipvs->sync_buff_lock); - if (ipvs->sync_buff && - time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) { - sb = ipvs->sync_buff; - ipvs->sync_buff = NULL; + sb = ms->sync_buff; + if (sb && time_after_eq(jiffies - sb->firstuse, time)) { + ms->sync_buff = NULL; + __set_current_state(TASK_RUNNING); } else sb = NULL; spin_unlock_bh(&ipvs->sync_buff_lock); return sb; } -/* - * Switch mode from sending version 0 or 1 - * - must handle sync_buf - */ -void ip_vs_sync_switch_mode(struct net *net, int mode) +static inline int +select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) { - struct netns_ipvs *ipvs = net_ipvs(net); - - if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) - return; - if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff) - return; - - spin_lock_bh(&ipvs->sync_buff_lock); - /* Buffer empty ? then let buf_create do the job */ - if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) { - kfree(ipvs->sync_buff); - ipvs->sync_buff = NULL; - } else { - spin_lock_bh(&ipvs->sync_lock); - if (ipvs->sync_state & IP_VS_STATE_MASTER) - list_add_tail(&ipvs->sync_buff->list, - &ipvs->sync_queue); - else - ip_vs_sync_buff_release(ipvs->sync_buff); - spin_unlock_bh(&ipvs->sync_lock); - } - spin_unlock_bh(&ipvs->sync_buff_lock); + return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; } /* @@ -442,15 +425,101 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) return sb; } +/* Check if conn should be synced. + * pkts: conn packets, use sysctl_sync_threshold to avoid packet check + * - (1) sync_refresh_period: reduce sync rate. Additionally, retry + * sync_retries times with period of sync_refresh_period/8 + * - (2) if both sync_refresh_period and sync_period are 0 send sync only + * for state changes or only once when pkts matches sync_threshold + * - (3) templates: rate can be reduced only with sync_refresh_period or + * with (2) + */ +static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, + struct ip_vs_conn *cp, int pkts) +{ + unsigned long orig = ACCESS_ONCE(cp->sync_endtime); + unsigned long now = jiffies; + unsigned long n = (now + cp->timeout) & ~3UL; + unsigned int sync_refresh_period; + int sync_period; + int force; + + /* Check if we sync in current state */ + if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) + force = 0; + else if (likely(cp->protocol == IPPROTO_TCP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_TCP_S_ESTABLISHED) | + (1 << IP_VS_TCP_S_FIN_WAIT) | + (1 << IP_VS_TCP_S_CLOSE) | + (1 << IP_VS_TCP_S_CLOSE_WAIT) | + (1 << IP_VS_TCP_S_TIME_WAIT)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) + goto set; + } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_SCTP_S_ESTABLISHED) | + (1 << IP_VS_SCTP_S_CLOSED) | + (1 << IP_VS_SCTP_S_SHUT_ACK_CLI) | + (1 << IP_VS_SCTP_S_SHUT_ACK_SER)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) + goto set; + } else { + /* UDP or another protocol with single state */ + force = 0; + } + + sync_refresh_period = sysctl_sync_refresh_period(ipvs); + if (sync_refresh_period > 0) { + long diff = n - orig; + long min_diff = max(cp->timeout >> 1, 10UL * HZ); + + /* Avoid sync if difference is below sync_refresh_period + * and below the half timeout. + */ + if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { + int retries = orig & 3; + + if (retries >= sysctl_sync_retries(ipvs)) + return 0; + if (time_before(now, orig - cp->timeout + + (sync_refresh_period >> 3))) + return 0; + n |= retries + 1; + } + } + sync_period = sysctl_sync_period(ipvs); + if (sync_period > 0) { + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && + pkts % sync_period != sysctl_sync_threshold(ipvs)) + return 0; + } else if (sync_refresh_period <= 0 && + pkts != sysctl_sync_threshold(ipvs)) + return 0; + +set: + cp->old_state = cp->state; + n = cmpxchg(&cp->sync_endtime, orig, n); + return n == orig || force; +} + /* * Version 0 , could be switched in by sys_ctl. * Add an ip_vs_conn information into the current sync_buff. */ -void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) +static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, + int pkts) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg_v0 *m; struct ip_vs_sync_conn_v0 *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; int len; if (unlikely(cp->af != AF_INET)) @@ -459,21 +528,41 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return; + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + return; + spin_lock(&ipvs->sync_buff_lock); - if (!ipvs->sync_buff) { - ipvs->sync_buff = - ip_vs_sync_buff_create_v0(ipvs); - if (!ipvs->sync_buff) { + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; + buff = ms->sync_buff; + if (buff) { + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + /* Send buffer if it is for v1 */ + if (!m->nr_conns) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; + } + } + if (!buff) { + buff = ip_vs_sync_buff_create_v0(ipvs); + if (!buff) { spin_unlock(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } + ms->sync_buff = buff; } len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : SIMPLE_CONN_SIZE; - m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg; - s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head; + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + s = (struct ip_vs_sync_conn_v0 *) buff->head; /* copy members */ s->reserved = 0; @@ -494,18 +583,24 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) m->nr_conns++; m->size += len; - ipvs->sync_buff->head += len; + buff->head += len; /* check if there is a space for next one */ - if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) { - sb_queue_tail(ipvs); - ipvs->sync_buff = NULL; + if (buff->head + FULL_CONN_SIZE > buff->end) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; } spin_unlock(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ - if (cp->control) - ip_vs_sync_conn(net, cp->control); + cp = cp->control; + if (cp) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_add_return(1, &cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); + ip_vs_sync_conn(net, cp->control, pkts); + } } /* @@ -513,23 +608,29 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) * Called by ip_vs_in. * Sending Version 1 messages */ -void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) +void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_sync_mesg *m; union ip_vs_sync_conn *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; __u8 *p; unsigned int len, pe_name_len, pad; /* Handle old version of the protocol */ if (sysctl_sync_ver(ipvs) == 0) { - ip_vs_sync_conn_v0(net, cp); + ip_vs_sync_conn_v0(net, cp, pkts); return; } /* Do not sync ONE PACKET */ if (cp->flags & IP_VS_CONN_F_ONE_PACKET) goto control; sloop: + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + goto control; + /* Sanity checks */ pe_name_len = 0; if (cp->pe_data_len) { @@ -541,6 +642,13 @@ sloop: } spin_lock(&ipvs->sync_buff_lock); + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -559,27 +667,32 @@ sloop: /* check if there is a space for this one */ pad = 0; - if (ipvs->sync_buff) { - pad = (4 - (size_t)ipvs->sync_buff->head) & 3; - if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) { - sb_queue_tail(ipvs); - ipvs->sync_buff = NULL; + buff = ms->sync_buff; + if (buff) { + m = buff->mesg; + pad = (4 - (size_t) buff->head) & 3; + /* Send buffer if it is for v0 */ + if (buff->head + len + pad > buff->end || m->reserved) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; pad = 0; } } - if (!ipvs->sync_buff) { - ipvs->sync_buff = ip_vs_sync_buff_create(ipvs); - if (!ipvs->sync_buff) { + if (!buff) { + buff = ip_vs_sync_buff_create(ipvs); + if (!buff) { spin_unlock(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } + ms->sync_buff = buff; + m = buff->mesg; } - m = ipvs->sync_buff->mesg; - p = ipvs->sync_buff->head; - ipvs->sync_buff->head += pad + len; + p = buff->head; + buff->head += pad + len; m->size += pad + len; /* Add ev. padding from prev. sync_conn */ while (pad--) @@ -644,16 +757,10 @@ control: cp = cp->control; if (!cp) return; - /* - * Reduce sync rate for templates - * i.e only increment in_pkts for Templates. - */ - if (cp->flags & IP_VS_CONN_F_TEMPLATE) { - int pkts = atomic_add_return(1, &cp->in_pkts); - - if (pkts % sysctl_sync_period(ipvs) != 1) - return; - } + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_add_return(1, &cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); goto sloop; } @@ -731,9 +838,32 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, else cp = ip_vs_ct_in_get(param); - if (cp && param->pe_data) /* Free pe_data */ + if (cp) { + /* Free pe_data */ kfree(param->pe_data); - if (!cp) { + + dest = cp->dest; + spin_lock(&cp->lock); + if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && + !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { + if (flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + } else { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + } + } + flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; + flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; + cp->flags = flags; + spin_unlock(&cp->lock); + if (!dest) { + dest = ip_vs_try_bind_dest(cp); + if (dest) + atomic_dec(&dest->refcnt); + } + } else { /* * Find the appropriate destination for the connection. * If it is not found the connection will remain unbound @@ -742,18 +872,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, param->vport, protocol, fwmark, flags); - /* Set the approprite ativity flag */ - if (protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } else if (protocol == IPPROTO_SCTP) { - if (state != IP_VS_SCTP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); if (dest) atomic_dec(&dest->refcnt); @@ -763,34 +881,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); return; } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && - (cp->state != state)) { - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_SCTP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } } if (opt) @@ -839,7 +929,7 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer, p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); for (i=0; i<m->nr_conns; i++) { - unsigned flags, state; + unsigned int flags, state; if (p + SIMPLE_CONN_SIZE > buffer+buflen) { IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); @@ -1109,7 +1199,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, for (i=0; i<nr_conns; i++) { union ip_vs_sync_conn *s; - unsigned size; + unsigned int size; int retc; p = msg_end; @@ -1149,6 +1239,28 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, /* + * Setup sndbuf (mode=1) or rcvbuf (mode=0) + */ +static void set_sock_size(struct sock *sk, int mode, int val) +{ + /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ + /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ + lock_sock(sk); + if (mode) { + val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, + sysctl_wmem_max); + sk->sk_sndbuf = val * 2; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } else { + val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, + sysctl_rmem_max); + sk->sk_rcvbuf = val * 2; + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } + release_sock(sk); +} + +/* * Setup loopback of outgoing multicasts on a sending socket */ static void set_mcast_loop(struct sock *sk, u_char loop) @@ -1298,9 +1410,15 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) /* * Set up sending multicast socket over UDP */ -static struct socket *make_send_sock(struct net *net) +static struct socket *make_send_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; struct socket *sock; int result; @@ -1324,6 +1442,9 @@ static struct socket *make_send_sock(struct net *net) set_mcast_loop(sock->sk, 0); set_mcast_ttl(sock->sk, 1); + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 1, result); result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); if (result < 0) { @@ -1349,9 +1470,15 @@ error: /* * Set up receiving multicast socket over UDP */ -static struct socket *make_receive_sock(struct net *net) +static struct socket *make_receive_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); + /* multicast addr */ + struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), + }; struct socket *sock; int result; @@ -1368,7 +1495,10 @@ static struct socket *make_receive_sock(struct net *net) */ sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ - sock->sk->sk_reuse = 1; + sock->sk->sk_reuse = SK_CAN_REUSE; + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 0, result); result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, sizeof(struct sockaddr)); @@ -1411,18 +1541,22 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) return len; } -static void +static int ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) { int msize; + int ret; msize = msg->size; /* Put size in network byte order */ msg->size = htons(msg->size); - if (ip_vs_send_async(sock, (char *)msg, msize) != msize) - pr_err("ip_vs_send_async error\n"); + ret = ip_vs_send_async(sock, (char *)msg, msize); + if (ret >= 0 || ret == -EAGAIN) + return ret; + pr_err("ip_vs_send_async error %d\n", ret); + return 0; } static int @@ -1438,48 +1572,90 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) iov.iov_base = buffer; iov.iov_len = (size_t)buflen; - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT); if (len < 0) - return -1; + return len; LeaveFunction(7); return len; } +/* Wakeup the master thread for sending */ +static void master_wakeup_work_handler(struct work_struct *work) +{ + struct ipvs_master_sync_state *ms = + container_of(work, struct ipvs_master_sync_state, + master_wakeup_work.work); + struct netns_ipvs *ipvs = ms->ipvs; + + spin_lock_bh(&ipvs->sync_lock); + if (ms->sync_queue_len && + ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { + ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; + wake_up_process(ms->master_thread); + } + spin_unlock_bh(&ipvs->sync_lock); +} + +/* Get next buffer to send */ +static inline struct ip_vs_sync_buff * +next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) +{ + struct ip_vs_sync_buff *sb; + + sb = sb_dequeue(ipvs, ms); + if (sb) + return sb; + /* Do not delay entries in buffer for more than 2 seconds */ + return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); +} static int sync_thread_master(void *data) { struct ip_vs_sync_thread_data *tinfo = data; struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; + struct sock *sk = tinfo->sock->sk; struct ip_vs_sync_buff *sb; pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " - "syncid = %d\n", - ipvs->master_mcast_ifn, ipvs->master_syncid); + "syncid = %d, id = %d\n", + ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); - while (!kthread_should_stop()) { - while ((sb = sb_dequeue(ipvs))) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); + for (;;) { + sb = next_sync_buff(ipvs, ms); + if (unlikely(kthread_should_stop())) + break; + if (!sb) { + schedule_timeout(IPVS_SYNC_CHECK_PERIOD); + continue; } - - /* check if entries stay in ipvs->sync_buff for 2 seconds */ - sb = get_curr_sync_buff(ipvs, 2 * HZ); - if (sb) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); + while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { + int ret = 0; + + __wait_event_interruptible(*sk_sleep(sk), + sock_writeable(sk) || + kthread_should_stop(), + ret); + if (unlikely(kthread_should_stop())) + goto done; } - - schedule_timeout_interruptible(HZ); + ip_vs_sync_buff_release(sb); } +done: + __set_current_state(TASK_RUNNING); + if (sb) + ip_vs_sync_buff_release(sb); + /* clean up the sync_buff queue */ - while ((sb = sb_dequeue(ipvs))) + while ((sb = sb_dequeue(ipvs, ms))) ip_vs_sync_buff_release(sb); + __set_current_state(TASK_RUNNING); /* clean up the current sync_buff */ - sb = get_curr_sync_buff(ipvs, 0); + sb = get_curr_sync_buff(ipvs, ms, 0); if (sb) ip_vs_sync_buff_release(sb); @@ -1498,8 +1674,8 @@ static int sync_thread_backup(void *data) int len; pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " - "syncid = %d\n", - ipvs->backup_mcast_ifn, ipvs->backup_syncid); + "syncid = %d, id = %d\n", + ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); while (!kthread_should_stop()) { wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -1511,7 +1687,8 @@ static int sync_thread_backup(void *data) len = ip_vs_receive(tinfo->sock, tinfo->buf, ipvs->recv_mesg_maxlen); if (len <= 0) { - pr_err("receiving message error\n"); + if (len != -EAGAIN) + pr_err("receiving message error\n"); break; } @@ -1535,86 +1712,140 @@ static int sync_thread_backup(void *data) int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) { struct ip_vs_sync_thread_data *tinfo; - struct task_struct **realtask, *task; + struct task_struct **array = NULL, *task; struct socket *sock; struct netns_ipvs *ipvs = net_ipvs(net); - char *name, *buf = NULL; + char *name; int (*threadfn)(void *data); + int id, count; int result = -ENOMEM; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", sizeof(struct ip_vs_sync_conn_v0)); + if (!ipvs->sync_state) { + count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); + ipvs->threads_mask = count - 1; + } else + count = ipvs->threads_mask + 1; if (state == IP_VS_STATE_MASTER) { - if (ipvs->master_thread) + if (ipvs->ms) return -EEXIST; strlcpy(ipvs->master_mcast_ifn, mcast_ifn, sizeof(ipvs->master_mcast_ifn)); ipvs->master_syncid = syncid; - realtask = &ipvs->master_thread; - name = "ipvs_master:%d"; + name = "ipvs-m:%d:%d"; threadfn = sync_thread_master; - sock = make_send_sock(net); } else if (state == IP_VS_STATE_BACKUP) { - if (ipvs->backup_thread) + if (ipvs->backup_threads) return -EEXIST; strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, sizeof(ipvs->backup_mcast_ifn)); ipvs->backup_syncid = syncid; - realtask = &ipvs->backup_thread; - name = "ipvs_backup:%d"; + name = "ipvs-b:%d:%d"; threadfn = sync_thread_backup; - sock = make_receive_sock(net); } else { return -EINVAL; } - if (IS_ERR(sock)) { - result = PTR_ERR(sock); - goto out; - } + if (state == IP_VS_STATE_MASTER) { + struct ipvs_master_sync_state *ms; - set_sync_mesg_maxlen(net, state); - if (state == IP_VS_STATE_BACKUP) { - buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL); - if (!buf) - goto outsocket; + ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL); + if (!ipvs->ms) + goto out; + ms = ipvs->ms; + for (id = 0; id < count; id++, ms++) { + INIT_LIST_HEAD(&ms->sync_queue); + ms->sync_queue_len = 0; + ms->sync_queue_delay = 0; + INIT_DELAYED_WORK(&ms->master_wakeup_work, + master_wakeup_work_handler); + ms->ipvs = ipvs; + } + } else { + array = kzalloc(count * sizeof(struct task_struct *), + GFP_KERNEL); + if (!array) + goto out; } + set_sync_mesg_maxlen(net, state); - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - goto outbuf; - - tinfo->net = net; - tinfo->sock = sock; - tinfo->buf = buf; + tinfo = NULL; + for (id = 0; id < count; id++) { + if (state == IP_VS_STATE_MASTER) + sock = make_send_sock(net, id); + else + sock = make_receive_sock(net, id); + if (IS_ERR(sock)) { + result = PTR_ERR(sock); + goto outtinfo; + } + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + goto outsocket; + tinfo->net = net; + tinfo->sock = sock; + if (state == IP_VS_STATE_BACKUP) { + tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, + GFP_KERNEL); + if (!tinfo->buf) + goto outtinfo; + } + tinfo->id = id; - task = kthread_run(threadfn, tinfo, name, ipvs->gen); - if (IS_ERR(task)) { - result = PTR_ERR(task); - goto outtinfo; + task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } + tinfo = NULL; + if (state == IP_VS_STATE_MASTER) + ipvs->ms[id].master_thread = task; + else + array[id] = task; } /* mark as active */ - *realtask = task; + + if (state == IP_VS_STATE_BACKUP) + ipvs->backup_threads = array; + spin_lock_bh(&ipvs->sync_buff_lock); ipvs->sync_state |= state; + spin_unlock_bh(&ipvs->sync_buff_lock); /* increase the module use count */ ip_vs_use_count_inc(); return 0; -outtinfo: - kfree(tinfo); -outbuf: - kfree(buf); outsocket: sk_release_kernel(sock->sk); + +outtinfo: + if (tinfo) { + sk_release_kernel(tinfo->sock->sk); + kfree(tinfo->buf); + kfree(tinfo); + } + count = id; + while (count-- > 0) { + if (state == IP_VS_STATE_MASTER) + kthread_stop(ipvs->ms[count].master_thread); + else + kthread_stop(array[count]); + } + kfree(array); + out: + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + kfree(ipvs->ms); + ipvs->ms = NULL; + } return result; } @@ -1622,38 +1853,60 @@ out: int stop_sync_thread(struct net *net, int state) { struct netns_ipvs *ipvs = net_ipvs(net); + struct task_struct **array; + int id; int retc = -EINVAL; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); if (state == IP_VS_STATE_MASTER) { - if (!ipvs->master_thread) + if (!ipvs->ms) return -ESRCH; - pr_info("stopping master sync thread %d ...\n", - task_pid_nr(ipvs->master_thread)); - /* * The lock synchronizes with sb_queue_tail(), so that we don't * add sync buffers to the queue, when we are already in * progress of stopping the master sync daemon. */ - spin_lock_bh(&ipvs->sync_lock); + spin_lock_bh(&ipvs->sync_buff_lock); + spin_lock(&ipvs->sync_lock); ipvs->sync_state &= ~IP_VS_STATE_MASTER; - spin_unlock_bh(&ipvs->sync_lock); - retc = kthread_stop(ipvs->master_thread); - ipvs->master_thread = NULL; + spin_unlock(&ipvs->sync_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); + + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + struct ipvs_master_sync_state *ms = &ipvs->ms[id]; + int ret; + + pr_info("stopping master sync thread %d ...\n", + task_pid_nr(ms->master_thread)); + cancel_delayed_work_sync(&ms->master_wakeup_work); + ret = kthread_stop(ms->master_thread); + if (retc >= 0) + retc = ret; + } + kfree(ipvs->ms); + ipvs->ms = NULL; } else if (state == IP_VS_STATE_BACKUP) { - if (!ipvs->backup_thread) + if (!ipvs->backup_threads) return -ESRCH; - pr_info("stopping backup sync thread %d ...\n", - task_pid_nr(ipvs->backup_thread)); - ipvs->sync_state &= ~IP_VS_STATE_BACKUP; - retc = kthread_stop(ipvs->backup_thread); - ipvs->backup_thread = NULL; + array = ipvs->backup_threads; + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + int ret; + + pr_info("stopping backup sync thread %d ...\n", + task_pid_nr(array[id])); + ret = kthread_stop(array[id]); + if (retc >= 0) + retc = ret; + } + kfree(array); + ipvs->backup_threads = NULL; } /* decrease the module use count */ @@ -1670,13 +1923,8 @@ int __net_init ip_vs_sync_net_init(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); - INIT_LIST_HEAD(&ipvs->sync_queue); spin_lock_init(&ipvs->sync_lock); spin_lock_init(&ipvs->sync_buff_lock); - - ipvs->sync_mcast_addr.sin_family = AF_INET; - ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT); - ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index fd0d4e09876..231be7dd547 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -84,7 +84,7 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) /* * Allocate the mark variable for WRR scheduling */ - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); if (mark == NULL) return -ENOMEM; |