From 9a812198ae49967f239789164c55ec3e72b7e0dd Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Thu, 14 Aug 2008 14:08:44 +0200 Subject: IPVS: Add genetlink interface implementation Add the implementation of the new Generic Netlink interface to IPVS and keep the old set/getsockopt interface for userspace backwards compatibility. Signed-off-by: Julius Volz Acked-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 875 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 875 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 6379705a8dc..d1dbd8b311b 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -2320,6 +2321,872 @@ static struct nf_sockopt_ops ip_vs_sockopts = { .owner = THIS_MODULE, }; +/* + * Generic Netlink interface + */ + +/* IPVS genetlink family */ +static struct genl_family ip_vs_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = IPVS_GENL_NAME, + .version = IPVS_GENL_VERSION, + .maxattr = IPVS_CMD_MAX, +}; + +/* Policy used for first-level command attributes */ +static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { + [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ +static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { + [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, + .len = IP_VS_IFNAME_MAXLEN }, + [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ +static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { + [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_SCHEDNAME_MAXLEN }, + [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, + .len = sizeof(struct ip_vs_flags) }, + [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ +static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { + [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, + struct ip_vs_stats *stats) +{ + struct nlattr *nl_stats = nla_nest_start(skb, container_type); + if (!nl_stats) + return -EMSGSIZE; + + spin_lock_bh(&stats->lock); + + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->conns); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->inpkts); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->outpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->inbytes); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->outbytes); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->cps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->inpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->outpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->inbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->outbps); + + spin_unlock_bh(&stats->lock); + + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + spin_unlock_bh(&stats->lock); + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_service(struct sk_buff *skb, + struct ip_vs_service *svc) +{ + struct nlattr *nl_service; + struct ip_vs_flags flags = { .flags = svc->flags, + .mask = ~0 }; + + nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); + if (!nl_service) + return -EMSGSIZE; + + NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, AF_INET); + + if (svc->fwmark) { + NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); + } else { + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); + NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); + } + + NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); + NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); + + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_service); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_service); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_service(struct sk_buff *skb, + struct ip_vs_service *svc, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_SERVICE); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_service(skb, svc) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_services(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0, i; + int start = cb->args[0]; + struct ip_vs_service *svc; + + mutex_lock(&__ip_vs_mutex); + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + cb->args[0] = idx; + + return skb->len; +} + +static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; + struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; + + /* Parse mandatory identifying service fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) + return -EINVAL; + + nla_af = attrs[IPVS_SVC_ATTR_AF]; + nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; + nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; + nla_port = attrs[IPVS_SVC_ATTR_PORT]; + nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; + + if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) + return -EINVAL; + + /* For now, only support IPv4 */ + if (nla_get_u16(nla_af) != AF_INET) + return -EAFNOSUPPORT; + + if (nla_fwmark) { + usvc->protocol = IPPROTO_TCP; + usvc->fwmark = nla_get_u32(nla_fwmark); + } else { + usvc->protocol = nla_get_u16(nla_protocol); + nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); + usvc->port = nla_get_u16(nla_port); + usvc->fwmark = 0; + } + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_sched, *nla_flags, *nla_timeout, + *nla_netmask; + struct ip_vs_flags flags; + struct ip_vs_service *svc; + + nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; + nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; + nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; + + if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) + return -EINVAL; + + nla_memcpy(&flags, nla_flags, sizeof(flags)); + + /* prefill flags from service if it already exists */ + if (usvc->fwmark) + svc = __ip_vs_svc_fwm_get(usvc->fwmark); + else + svc = __ip_vs_service_get(usvc->protocol, usvc->addr, + usvc->port); + if (svc) { + usvc->flags = svc->flags; + ip_vs_service_put(svc); + } else + usvc->flags = 0; + + /* set new flags from userland */ + usvc->flags = (usvc->flags & ~flags.mask) | + (flags.flags & flags.mask); + + strlcpy(usvc->sched_name, nla_data(nla_sched), + sizeof(usvc->sched_name)); + usvc->timeout = nla_get_u32(nla_timeout); + usvc->netmask = nla_get_u32(nla_netmask); + } + + return 0; +} + +static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) +{ + struct ip_vs_service_user usvc; + int ret; + + ret = ip_vs_genl_parse_service(&usvc, nla, 0); + if (ret) + return ERR_PTR(ret); + + if (usvc.fwmark) + return __ip_vs_svc_fwm_get(usvc.fwmark); + else + return __ip_vs_service_get(usvc.protocol, usvc.addr, + usvc.port); +} + +static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) +{ + struct nlattr *nl_dest; + + nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); + if (!nl_dest) + return -EMSGSIZE; + + NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); + NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); + + NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, + atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns)); + + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_dest); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_dest); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DEST); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_dest(skb, dest) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dests(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + + mutex_lock(&__ip_vs_mutex); + + /* Try to find the service for which to dump destinations */ + if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, + IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) + goto out_err; + + svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc) || svc == NULL) + goto out_err; + + /* Dump the destinations */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + +nla_put_failure: + cb->args[0] = idx; + ip_vs_service_put(svc); + +out_err: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_parse_dest(struct ip_vs_dest_user *udest, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; + struct nlattr *nla_addr, *nla_port; + + /* Parse mandatory identifying destination fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) + return -EINVAL; + + nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; + nla_port = attrs[IPVS_DEST_ATTR_PORT]; + + if (!(nla_addr && nla_port)) + return -EINVAL; + + nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); + udest->port = nla_get_u16(nla_port); + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, + *nla_l_thresh; + + nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; + nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; + nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; + nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; + + if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) + return -EINVAL; + + udest->conn_flags = nla_get_u32(nla_fwd) + & IP_VS_CONN_F_FWD_MASK; + udest->weight = nla_get_u32(nla_weight); + udest->u_threshold = nla_get_u32(nla_u_thresh); + udest->l_threshold = nla_get_u32(nla_l_thresh); + } + + return 0; +} + +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid) +{ + struct nlattr *nl_daemon; + + nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); + if (!nl_daemon) + return -EMSGSIZE; + + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); + NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); + + nla_nest_end(skb, nl_daemon); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_daemon); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid, + struct netlink_callback *cb) +{ + void *hdr; + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DAEMON); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemons(struct sk_buff *skb, + struct netlink_callback *cb) +{ + mutex_lock(&__ip_vs_mutex); + if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, + ip_vs_master_mcast_ifn, + ip_vs_master_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[0] = 1; + } + + if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, + ip_vs_backup_mcast_ifn, + ip_vs_backup_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[1] = 1; + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_new_daemon(struct nlattr **attrs) +{ + if (!(attrs[IPVS_DAEMON_ATTR_STATE] && + attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && + attrs[IPVS_DAEMON_ATTR_SYNC_ID])) + return -EINVAL; + + return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); +} + +static int ip_vs_genl_del_daemon(struct nlattr **attrs) +{ + if (!attrs[IPVS_DAEMON_ATTR_STATE]) + return -EINVAL; + + return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); +} + +static int ip_vs_genl_set_config(struct nlattr **attrs) +{ + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) + t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) + t.tcp_fin_timeout = + nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) + t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); + + return ip_vs_set_timeout(&t); +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct ip_vs_service *svc = NULL; + struct ip_vs_service_user usvc; + struct ip_vs_dest_user udest; + int ret = 0, cmd; + int need_full_svc = 0, need_full_dest = 0; + + cmd = info->genlhdr->cmd; + + mutex_lock(&__ip_vs_mutex); + + if (cmd == IPVS_CMD_FLUSH) { + ret = ip_vs_flush(); + goto out; + } else if (cmd == IPVS_CMD_SET_CONFIG) { + ret = ip_vs_genl_set_config(info->attrs); + goto out; + } else if (cmd == IPVS_CMD_NEW_DAEMON || + cmd == IPVS_CMD_DEL_DAEMON) { + + struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; + + if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || + nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, + info->attrs[IPVS_CMD_ATTR_DAEMON], + ip_vs_daemon_policy)) { + ret = -EINVAL; + goto out; + } + + if (cmd == IPVS_CMD_NEW_DAEMON) + ret = ip_vs_genl_new_daemon(daemon_attrs); + else + ret = ip_vs_genl_del_daemon(daemon_attrs); + goto out; + } else if (cmd == IPVS_CMD_ZERO && + !info->attrs[IPVS_CMD_ATTR_SERVICE]) { + ret = ip_vs_zero_all(); + goto out; + } + + /* All following commands require a service argument, so check if we + * received a valid one. We need a full service specification when + * adding / editing a service. Only identifying members otherwise. */ + if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) + need_full_svc = 1; + + ret = ip_vs_genl_parse_service(&usvc, + info->attrs[IPVS_CMD_ATTR_SERVICE], + need_full_svc); + if (ret) + goto out; + + /* Lookup the exact service by or fwmark */ + if (usvc.fwmark == 0) + svc = __ip_vs_service_get(usvc.protocol, usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_get(usvc.fwmark); + + /* Unless we're adding a new service, the service must already exist */ + if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { + ret = -ESRCH; + goto out; + } + + /* Destination commands require a valid destination argument. For + * adding / editing a destination, we need a full destination + * specification. */ + if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || + cmd == IPVS_CMD_DEL_DEST) { + if (cmd != IPVS_CMD_DEL_DEST) + need_full_dest = 1; + + ret = ip_vs_genl_parse_dest(&udest, + info->attrs[IPVS_CMD_ATTR_DEST], + need_full_dest); + if (ret) + goto out; + } + + switch (cmd) { + case IPVS_CMD_NEW_SERVICE: + if (svc == NULL) + ret = ip_vs_add_service(&usvc, &svc); + else + ret = -EEXIST; + break; + case IPVS_CMD_SET_SERVICE: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IPVS_CMD_DEL_SERVICE: + ret = ip_vs_del_service(svc); + break; + case IPVS_CMD_NEW_DEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IPVS_CMD_SET_DEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IPVS_CMD_DEL_DEST: + ret = ip_vs_del_dest(svc, &udest); + break; + case IPVS_CMD_ZERO: + ret = ip_vs_zero_service(svc); + break; + default: + ret = -EINVAL; + } + +out: + if (svc) + ip_vs_service_put(svc); + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + +static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *reply; + int ret, cmd, reply_cmd; + + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_GET_SERVICE) + reply_cmd = IPVS_CMD_NEW_SERVICE; + else if (cmd == IPVS_CMD_GET_INFO) + reply_cmd = IPVS_CMD_SET_INFO; + else if (cmd == IPVS_CMD_GET_CONFIG) + reply_cmd = IPVS_CMD_SET_CONFIG; + else { + IP_VS_ERR("unknown Generic Netlink command\n"); + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&__ip_vs_mutex); + + reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); + if (reply == NULL) + goto nla_put_failure; + + switch (cmd) { + case IPVS_CMD_GET_SERVICE: + { + struct ip_vs_service *svc; + + svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc)) { + ret = PTR_ERR(svc); + goto out_err; + } else if (svc) { + ret = ip_vs_genl_fill_service(msg, svc); + ip_vs_service_put(svc); + if (ret) + goto nla_put_failure; + } else { + ret = -ESRCH; + goto out_err; + } + + break; + } + + case IPVS_CMD_GET_CONFIG: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); +#ifdef CONFIG_IP_VS_PROTO_TCP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); +#endif + + break; + } + + case IPVS_CMD_GET_INFO: + NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); + NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + IP_VS_CONN_TAB_SIZE); + break; + } + + genlmsg_end(msg, reply); + ret = genlmsg_unicast(msg, info->snd_pid); + goto out; + +nla_put_failure: + IP_VS_ERR("not enough space in Netlink message\n"); + ret = -EMSGSIZE; + +out_err: + nlmsg_free(msg); +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + + +static struct genl_ops ip_vs_genl_ops[] __read_mostly = { + { + .cmd = IPVS_CMD_NEW_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_SERVICE, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + .dumpit = ip_vs_genl_dump_services, + .policy = ip_vs_cmd_policy, + }, + { + .cmd = IPVS_CMD_NEW_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .dumpit = ip_vs_genl_dump_dests, + }, + { + .cmd = IPVS_CMD_NEW_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DAEMON, + .flags = GENL_ADMIN_PERM, + .dumpit = ip_vs_genl_dump_daemons, + }, + { + .cmd = IPVS_CMD_SET_CONFIG, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_CONFIG, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_GET_INFO, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_ZERO, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_FLUSH, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, +}; + +static int __init ip_vs_genl_register(void) +{ + int ret, i; + + ret = genl_register_family(&ip_vs_genl_family); + if (ret) + return ret; + + for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) { + ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]); + if (ret) + goto err_out; + } + return 0; + +err_out: + genl_unregister_family(&ip_vs_genl_family); + return ret; +} + +static void ip_vs_genl_unregister(void) +{ + genl_unregister_family(&ip_vs_genl_family); +} + +/* End of Generic Netlink interface definitions */ + int __init ip_vs_control_init(void) { @@ -2334,6 +3201,13 @@ int __init ip_vs_control_init(void) return ret; } + ret = ip_vs_genl_register(); + if (ret) { + IP_VS_ERR("cannot register Generic Netlink interface.\n"); + nf_unregister_sockopt(&ip_vs_sockopts); + return ret; + } + proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); @@ -2368,6 +3242,7 @@ void ip_vs_control_cleanup(void) unregister_sysctl_table(sysctl_header); proc_net_remove(&init_net, "ip_vs_stats"); proc_net_remove(&init_net, "ip_vs"); + ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); } -- cgit v1.2.3-70-g09d2 From 82dfb6f32219d8e6cf6b979a520cb2b11d977d4e Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Mon, 11 Aug 2008 19:36:06 +0000 Subject: ipvs: Only call init_service, update_service and done_service for schedulers if defined There are schedulers that only schedule based on data available in the service or destination structures and they don't need any persistent storage or initialization routine. These schedulers currently provide dummy functions for the init_service, update_service and/or done_service functions. For the init_service and done_service cases we already have code that only calls these functions, if the scheduler provides them. Do the same for the update_service case and remove the dummy functions from all schedulers. Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 21 ++++++++++++--------- net/ipv4/ipvs/ip_vs_lblc.c | 7 ------- net/ipv4/ipvs/ip_vs_lblcr.c | 7 ------- net/ipv4/ipvs/ip_vs_lc.c | 21 --------------------- net/ipv4/ipvs/ip_vs_nq.c | 24 ------------------------ net/ipv4/ipvs/ip_vs_rr.c | 7 ------- net/ipv4/ipvs/ip_vs_sed.c | 24 ------------------------ net/ipv4/ipvs/ip_vs_wlc.c | 24 ------------------------ 8 files changed, 12 insertions(+), 123 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index d1dbd8b311b..ede101eeec1 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -869,7 +869,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) svc->num_dests++; /* call the update_service function of its scheduler */ - svc->scheduler->update_service(svc); + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); write_unlock_bh(&__ip_vs_svc_lock); return 0; @@ -899,7 +900,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) svc->num_dests++; /* call the update_service function of its scheduler */ - svc->scheduler->update_service(svc); + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); write_unlock_bh(&__ip_vs_svc_lock); @@ -949,7 +951,8 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); /* call the update_service, because server weight may be changed */ - svc->scheduler->update_service(svc); + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); write_unlock_bh(&__ip_vs_svc_lock); @@ -1012,12 +1015,12 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, */ list_del(&dest->n_list); svc->num_dests--; - if (svcupd) { - /* - * Call the update_service function of its scheduler - */ - svc->scheduler->update_service(svc); - } + + /* + * Call the update_service function of its scheduler + */ + if (svcupd && svc->scheduler->update_service) + svc->scheduler->update_service(svc); } diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 7a6a319f544..4a14d069f8a 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -388,12 +388,6 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) } -static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline struct ip_vs_dest * __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) { @@ -542,7 +536,6 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), .init_service = ip_vs_lblc_init_svc, .done_service = ip_vs_lblc_done_svc, - .update_service = ip_vs_lblc_update_svc, .schedule = ip_vs_lblc_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index c234e73968a..46b870385b8 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -572,12 +572,6 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) } -static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline struct ip_vs_dest * __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) { @@ -731,7 +725,6 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), .init_service = ip_vs_lblcr_init_svc, .done_service = ip_vs_lblcr_done_svc, - .update_service = ip_vs_lblcr_update_svc, .schedule = ip_vs_lblcr_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c index ebcdbf75ac6..2c3de1b6351 100644 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ b/net/ipv4/ipvs/ip_vs_lc.c @@ -20,24 +20,6 @@ #include -static int ip_vs_lc_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int ip_vs_lc_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int ip_vs_lc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline unsigned int ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) { @@ -99,9 +81,6 @@ static struct ip_vs_scheduler ip_vs_lc_scheduler = { .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), - .init_service = ip_vs_lc_init_svc, - .done_service = ip_vs_lc_done_svc, - .update_service = ip_vs_lc_update_svc, .schedule = ip_vs_lc_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c index 92f3a677003..5330d5a2de1 100644 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ b/net/ipv4/ipvs/ip_vs_nq.c @@ -37,27 +37,6 @@ #include -static int -ip_vs_nq_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_nq_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_nq_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline unsigned int ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) { @@ -137,9 +116,6 @@ static struct ip_vs_scheduler ip_vs_nq_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), - .init_service = ip_vs_nq_init_svc, - .done_service = ip_vs_nq_done_svc, - .update_service = ip_vs_nq_update_svc, .schedule = ip_vs_nq_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c index 358110d17e5..f7492911753 100644 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ b/net/ipv4/ipvs/ip_vs_rr.c @@ -32,12 +32,6 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc) } -static int ip_vs_rr_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static int ip_vs_rr_update_svc(struct ip_vs_service *svc) { svc->sched_data = &svc->destinations; @@ -96,7 +90,6 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = { .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), .init_service = ip_vs_rr_init_svc, - .done_service = ip_vs_rr_done_svc, .update_service = ip_vs_rr_update_svc, .schedule = ip_vs_rr_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c index 77663d84cbd..53f73bea66c 100644 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ b/net/ipv4/ipvs/ip_vs_sed.c @@ -41,27 +41,6 @@ #include -static int -ip_vs_sed_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_sed_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_sed_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline unsigned int ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) { @@ -139,9 +118,6 @@ static struct ip_vs_scheduler ip_vs_sed_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), - .init_service = ip_vs_sed_init_svc, - .done_service = ip_vs_sed_done_svc, - .update_service = ip_vs_sed_update_svc, .schedule = ip_vs_sed_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c index 9b0ef86bb1f..df7ad8d7476 100644 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ b/net/ipv4/ipvs/ip_vs_wlc.c @@ -25,27 +25,6 @@ #include -static int -ip_vs_wlc_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_wlc_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_wlc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - static inline unsigned int ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) { @@ -127,9 +106,6 @@ static struct ip_vs_scheduler ip_vs_wlc_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), - .init_service = ip_vs_wlc_init_svc, - .done_service = ip_vs_wlc_done_svc, - .update_service = ip_vs_wlc_update_svc, .schedule = ip_vs_wlc_schedule, }; -- cgit v1.2.3-70-g09d2 From a919cf4b6b499416b6e2247dbc79196c4325f2e6 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Thu, 14 Aug 2008 00:47:16 +0200 Subject: ipvs: Create init functions for estimator code Commit 8ab19ea36c5c5340ff598e4d15fc084eb65671dc ("ipvs: Fix possible deadlock in estimator code") fixed a deadlock condition, but that condition can only happen during unload of IPVS, because during normal operation there is at least our global stats structure in the estimator list. The mod_timer() and del_timer_sync() calls are actually initialization and cleanup code in disguise. Let's make it explicit and move them to their own init and cleanup function. Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 ++ net/ipv4/ipvs/ip_vs_core.c | 8 ++++++-- net/ipv4/ipvs/ip_vs_est.c | 18 +++++++++++------- 3 files changed, 19 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 7312c3dd309..a25ad243031 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -683,6 +683,8 @@ extern void ip_vs_sync_conn(struct ip_vs_conn *cp); /* * IPVS rate estimator prototypes (from ip_vs_est.c) */ +extern int ip_vs_estimator_init(void); +extern void ip_vs_estimator_cleanup(void); extern void ip_vs_new_estimator(struct ip_vs_stats *stats); extern void ip_vs_kill_estimator(struct ip_vs_stats *stats); extern void ip_vs_zero_estimator(struct ip_vs_stats *stats); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index a7879eafc3b..9fbf0a6d739 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1070,10 +1070,12 @@ static int __init ip_vs_init(void) { int ret; + ip_vs_estimator_init(); + ret = ip_vs_control_init(); if (ret < 0) { IP_VS_ERR("can't setup control.\n"); - goto cleanup_nothing; + goto cleanup_estimator; } ip_vs_protocol_init(); @@ -1106,7 +1108,8 @@ static int __init ip_vs_init(void) cleanup_protocol: ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); - cleanup_nothing: + cleanup_estimator: + ip_vs_estimator_cleanup(); return ret; } @@ -1117,6 +1120,7 @@ static void __exit ip_vs_cleanup(void) ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); + ip_vs_estimator_cleanup(); IP_VS_INFO("ipvs unloaded.\n"); } diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 5a20f93bd7f..4fb620ec208 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -124,8 +124,6 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats) est->outbps = stats->outbps<<5; spin_lock_bh(&est_lock); - if (list_empty(&est_list)) - mod_timer(&est_timer, jiffies + 2 * HZ); list_add(&est->list, &est_list); spin_unlock_bh(&est_lock); } @@ -136,11 +134,6 @@ void ip_vs_kill_estimator(struct ip_vs_stats *stats) spin_lock_bh(&est_lock); list_del(&est->list); - while (list_empty(&est_list) && try_to_del_timer_sync(&est_timer) < 0) { - spin_unlock_bh(&est_lock); - cpu_relax(); - spin_lock_bh(&est_lock); - } spin_unlock_bh(&est_lock); } @@ -160,3 +153,14 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats) est->inbps = 0; est->outbps = 0; } + +int __init ip_vs_estimator_init(void) +{ + mod_timer(&est_timer, jiffies + 2 * HZ); + return 0; +} + +void ip_vs_estimator_cleanup(void) +{ + del_timer_sync(&est_timer); +} -- cgit v1.2.3-70-g09d2 From 4a031b0e6acd8a8c23725ceb5db6a0aa5c4e231f Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 15 Aug 2008 09:26:15 +1000 Subject: ipvs: rename __ip_vs_wlc_schedule in lblc and lblcr schedulers For the sake of clarity, rename __ip_vs_wlc_schedule() in lblc.c to __ip_vs_lblc_schedule() and the version in lblcr.c to __ip_vs_lblc_schedule(). I guess the original name stuck from a copy and paste. Cc: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_lblc.c | 6 +++--- net/ipv4/ipvs/ip_vs_lblcr.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 4a14d069f8a..b9b334cccf3 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -389,7 +389,7 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) static inline struct ip_vs_dest * -__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) { struct ip_vs_dest *dest, *least; int loh, doh; @@ -488,7 +488,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) tbl = (struct ip_vs_lblc_table *)svc->sched_data; en = ip_vs_lblc_get(tbl, iph->daddr); if (en == NULL) { - dest = __ip_vs_wlc_schedule(svc, iph); + dest = __ip_vs_lblc_schedule(svc, iph); if (dest == NULL) { IP_VS_DBG(1, "no destination available\n"); return NULL; @@ -503,7 +503,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 || is_overloaded(dest, svc)) { - dest = __ip_vs_wlc_schedule(svc, iph); + dest = __ip_vs_lblc_schedule(svc, iph); if (dest == NULL) { IP_VS_DBG(1, "no destination available\n"); return NULL; diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 46b870385b8..f1c84503689 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -573,7 +573,7 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) static inline struct ip_vs_dest * -__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) { struct ip_vs_dest *dest, *least; int loh, doh; @@ -673,7 +673,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) tbl = (struct ip_vs_lblcr_table *)svc->sched_data; en = ip_vs_lblcr_get(tbl, iph->daddr); if (en == NULL) { - dest = __ip_vs_wlc_schedule(svc, iph); + dest = __ip_vs_lblcr_schedule(svc, iph); if (dest == NULL) { IP_VS_DBG(1, "no destination available\n"); return NULL; @@ -687,7 +687,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } else { dest = ip_vs_dest_set_min(&en->set); if (!dest || is_overloaded(dest, svc)) { - dest = __ip_vs_wlc_schedule(svc, iph); + dest = __ip_vs_lblcr_schedule(svc, iph); if (dest == NULL) { IP_VS_DBG(1, "no destination available\n"); return NULL; -- cgit v1.2.3-70-g09d2 From 39ac50d0c79747b186c1268d9a488f8c1d256be7 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Mon, 18 Aug 2008 00:52:08 +0200 Subject: ipvs: Fix race conditions in lblc scheduler We can't access the cache entry outside of our critical read-locked region, because someone may free that entry. And we also need to check in the critical region wether the destination is still available, i.e. it's not in the trash. If we drop our reference counter, the destination can be purged from the trash at any time. Our caller only guarantees that no destination is moved to the trash, while we are scheduling. Also there is no need for our own rwlock, there is already one in the service structure for use in the schedulers. Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_lblc.c | 204 +++++++++++++++++++++------------------------ 1 file changed, 96 insertions(+), 108 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index b9b334cccf3..d2a43aa3fe4 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -96,7 +96,6 @@ struct ip_vs_lblc_entry { * IPVS lblc hash table */ struct ip_vs_lblc_table { - rwlock_t lock; /* lock for this table */ struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ @@ -123,31 +122,6 @@ static ctl_table vs_vars_table[] = { static struct ctl_table_header * sysctl_header; -/* - * new/free a ip_vs_lblc_entry, which is a mapping of a destionation - * IP address to a server. - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest) -{ - struct ip_vs_lblc_entry *en; - - en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); - if (en == NULL) { - IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); - return NULL; - } - - INIT_LIST_HEAD(&en->list); - en->addr = daddr; - - atomic_inc(&dest->refcnt); - en->dest = dest; - - return en; -} - - static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) { list_del(&en->list); @@ -173,55 +147,66 @@ static inline unsigned ip_vs_lblc_hashkey(__be32 addr) * Hash an entry in the ip_vs_lblc_table. * returns bool success. */ -static int +static void ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) { - unsigned hash; - - if (!list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } + unsigned hash = ip_vs_lblc_hashkey(en->addr); - /* - * Hash by destination IP address - */ - hash = ip_vs_lblc_hashkey(en->addr); - - write_lock(&tbl->lock); list_add(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); - write_unlock(&tbl->lock); - - return 1; } /* - * Get ip_vs_lblc_entry associated with supplied parameters. + * Get ip_vs_lblc_entry associated with supplied parameters. Called under read + * lock */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) { - unsigned hash; + unsigned hash = ip_vs_lblc_hashkey(addr); struct ip_vs_lblc_entry *en; - hash = ip_vs_lblc_hashkey(addr); + list_for_each_entry(en, &tbl->bucket[hash], list) + if (en->addr == addr) + return en; - read_lock(&tbl->lock); + return NULL; +} - list_for_each_entry(en, &tbl->bucket[hash], list) { - if (en->addr == addr) { - /* HIT */ - read_unlock(&tbl->lock); - return en; + +/* + * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP + * address to a server. Called under write lock. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = ip_vs_lblc_get(tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); + return NULL; } - } - read_unlock(&tbl->lock); + en->addr = daddr; + en->lastuse = jiffies; - return NULL; + atomic_inc(&dest->refcnt); + en->dest = dest; + + ip_vs_lblc_hash(tbl, en); + } else if (en->dest != dest) { + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + + return en; } @@ -230,30 +215,29 @@ ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) */ static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) { - int i; struct ip_vs_lblc_entry *en, *nxt; + int i; for (i=0; ilock); list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { ip_vs_lblc_free(en); atomic_dec(&tbl->entries); } - write_unlock(&tbl->lock); } } -static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) +static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) { + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en, *nxt; unsigned long now = jiffies; int i, j; - struct ip_vs_lblc_entry *en, *nxt; for (i=0, j=tbl->rover; ilock); + write_lock(&svc->sched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + sysctl_ip_vs_lblc_expiration)) @@ -262,7 +246,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) ip_vs_lblc_free(en); atomic_dec(&tbl->entries); } - write_unlock(&tbl->lock); + write_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -281,17 +265,16 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) */ static void ip_vs_lblc_check_expire(unsigned long data) { - struct ip_vs_lblc_table *tbl; + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblc_table *tbl = svc->sched_data; unsigned long now = jiffies; int goal; int i, j; struct ip_vs_lblc_entry *en, *nxt; - tbl = (struct ip_vs_lblc_table *)data; - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ - ip_vs_lblc_full_check(tbl); + ip_vs_lblc_full_check(svc); tbl->counter = 1; goto out; } @@ -308,7 +291,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) for (i=0, j=tbl->rover; ilock); + write_lock(&svc->sched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) continue; @@ -317,7 +300,7 @@ static void ip_vs_lblc_check_expire(unsigned long data) atomic_dec(&tbl->entries); goal--; } - write_unlock(&tbl->lock); + write_unlock(&svc->sched_lock); if (goal <= 0) break; } @@ -336,15 +319,14 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblc_table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); if (tbl == NULL) { IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); return -ENOMEM; } svc->sched_data = tbl; IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_lblc_table)); + "current service\n", sizeof(*tbl)); /* * Initialize the hash buckets @@ -352,7 +334,6 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) for (i=0; ibucket[i]); } - rwlock_init(&tbl->lock); tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; @@ -361,9 +342,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) * Hook periodic timer for garbage collection */ setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, - (unsigned long)tbl); - tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; - add_timer(&tbl->periodic_timer); + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); return 0; } @@ -380,9 +360,9 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) ip_vs_lblc_flush(tbl); /* release the table itself */ - kfree(svc->sched_data); + kfree(tbl); IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_lblc_table)); + sizeof(*tbl)); return 0; } @@ -478,46 +458,54 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) static struct ip_vs_dest * ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { - struct ip_vs_dest *dest; - struct ip_vs_lblc_table *tbl; - struct ip_vs_lblc_entry *en; + struct ip_vs_lblc_table *tbl = svc->sched_data; struct iphdr *iph = ip_hdr(skb); + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblc_entry *en; IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); - tbl = (struct ip_vs_lblc_table *)svc->sched_data; + /* First look in our cache */ + read_lock(&svc->sched_lock); en = ip_vs_lblc_get(tbl, iph->daddr); - if (en == NULL) { - dest = __ip_vs_lblc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - en = ip_vs_lblc_new(iph->daddr, dest); - if (en == NULL) { - return NULL; - } - ip_vs_lblc_hash(tbl, en); - } else { - dest = en->dest; - if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest, svc)) { - dest = __ip_vs_lblc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; - } + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* + * If the destination is not available, i.e. it's in the trash, + * we must ignore it, as it may be removed from under our feet, + * if someone drops our reference count. Our caller only makes + * sure that destinations, that are not in the trash, are not + * moved to the trash, while we are scheduling. But anyone can + * free up entries from the trash at any time. + */ + + if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) + dest = en->dest; + } + read_unlock(&svc->sched_lock); + + /* If the destination has a weight and is not overloaded, use it */ + if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; + + /* No cache entry or it is invalid, time to schedule */ + dest = __ip_vs_lblc_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; } - en->lastuse = jiffies; + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblc_new(tbl, iph->daddr, dest); + write_unlock(&svc->sched_lock); + +out: IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(en->addr), + NIPQUAD(iph->daddr), NIPQUAD(dest->addr), ntohs(dest->port)); -- cgit v1.2.3-70-g09d2 From f728bafb5698076dd35bca35ee6cfe52ea1b8ab2 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Tue, 19 Aug 2008 08:16:19 +0200 Subject: ipvs: Fix race conditions in lblcr scheduler We can't access the cache entry outside of our critical read-locked region, because someone may free that entry. Also getting an entry under read lock, then locking for write and trying to delete that entry looks fishy, but should be no problem here, because we're only comparing a pointer. Also there is no need for our own rwlock, there is already one in the service structure for use in the schedulers. Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_lblcr.c | 229 ++++++++++++++++++++++---------------------- 1 file changed, 114 insertions(+), 115 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index f1c84503689..375a1ffb6b6 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) return NULL; } - e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); + e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e == NULL) { IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); return NULL; @@ -116,11 +116,9 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) e->dest = dest; /* link it to the list */ - write_lock(&set->lock); e->next = set->list; set->list = e; atomic_inc(&set->size); - write_unlock(&set->lock); set->lastmod = jiffies; return e; @@ -131,7 +129,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) { struct ip_vs_dest_list *e, **ep; - write_lock(&set->lock); for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { if (e->dest == dest) { /* HIT */ @@ -144,7 +141,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) } ep = &e->next; } - write_unlock(&set->lock); } static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) @@ -174,7 +170,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) if (set == NULL) return NULL; - read_lock(&set->lock); /* select the first destination server, whose weight > 0 */ for (e=set->list; e!=NULL; e=e->next) { least = e->dest; @@ -188,7 +183,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) goto nextstage; } } - read_unlock(&set->lock); return NULL; /* find the destination with the weighted least load */ @@ -207,7 +201,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) loh = doh; } } - read_unlock(&set->lock); IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", @@ -229,7 +222,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) if (set == NULL) return NULL; - read_lock(&set->lock); /* select the first destination server, whose weight > 0 */ for (e=set->list; e!=NULL; e=e->next) { most = e->dest; @@ -239,7 +231,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) goto nextstage; } } - read_unlock(&set->lock); return NULL; /* find the destination with the weighted most load */ @@ -256,7 +247,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) moh = doh; } } - read_unlock(&set->lock); IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", @@ -284,7 +274,6 @@ struct ip_vs_lblcr_entry { * IPVS lblcr hash table */ struct ip_vs_lblcr_table { - rwlock_t lock; /* lock for this table */ struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ @@ -311,32 +300,6 @@ static ctl_table vs_vars_table[] = { static struct ctl_table_header * sysctl_header; -/* - * new/free a ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. - */ -static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr) -{ - struct ip_vs_lblcr_entry *en; - - en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); - if (en == NULL) { - IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); - return NULL; - } - - INIT_LIST_HEAD(&en->list); - en->addr = daddr; - - /* initilize its dest set */ - atomic_set(&(en->set.size), 0); - en->set.list = NULL; - rwlock_init(&en->set.lock); - - return en; -} - - static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { list_del(&en->list); @@ -358,55 +321,68 @@ static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) * Hash an entry in the ip_vs_lblcr_table. * returns bool success. */ -static int +static void ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) { - unsigned hash; - - if (!list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } + unsigned hash = ip_vs_lblcr_hashkey(en->addr); - /* - * Hash by destination IP address - */ - hash = ip_vs_lblcr_hashkey(en->addr); - - write_lock(&tbl->lock); list_add(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); - write_unlock(&tbl->lock); - - return 1; } /* - * Get ip_vs_lblcr_entry associated with supplied parameters. + * Get ip_vs_lblcr_entry associated with supplied parameters. Called under + * read lock. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) { - unsigned hash; + unsigned hash = ip_vs_lblcr_hashkey(addr); struct ip_vs_lblcr_entry *en; - hash = ip_vs_lblcr_hashkey(addr); + list_for_each_entry(en, &tbl->bucket[hash], list) + if (en->addr == addr) + return en; + + return NULL; +} - read_lock(&tbl->lock); - list_for_each_entry(en, &tbl->bucket[hash], list) { - if (en->addr == addr) { - /* HIT */ - read_unlock(&tbl->lock); - return en; +/* + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. Called under write lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblcr_entry *en; + + en = ip_vs_lblcr_get(tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); + return NULL; } + + en->addr = daddr; + en->lastuse = jiffies; + + /* initilize its dest set */ + atomic_set(&(en->set.size), 0); + en->set.list = NULL; + rwlock_init(&en->set.lock); + + ip_vs_lblcr_hash(tbl, en); } - read_unlock(&tbl->lock); + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); - return NULL; + return en; } @@ -418,19 +394,18 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) int i; struct ip_vs_lblcr_entry *en, *nxt; + /* No locking required, only called during cleanup. */ for (i=0; ilock); list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); } - write_unlock(&tbl->lock); } } -static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) { + struct ip_vs_lblcr_table *tbl = svc->sched_data; unsigned long now = jiffies; int i, j; struct ip_vs_lblcr_entry *en, *nxt; @@ -438,7 +413,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) for (i=0, j=tbl->rover; ilock); + write_lock(&svc->sched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, now)) @@ -447,7 +422,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) ip_vs_lblcr_free(en); atomic_dec(&tbl->entries); } - write_unlock(&tbl->lock); + write_unlock(&svc->sched_lock); } tbl->rover = j; } @@ -466,17 +441,16 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) */ static void ip_vs_lblcr_check_expire(unsigned long data) { - struct ip_vs_lblcr_table *tbl; + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblcr_table *tbl = svc->sched_data; unsigned long now = jiffies; int goal; int i, j; struct ip_vs_lblcr_entry *en, *nxt; - tbl = (struct ip_vs_lblcr_table *)data; - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ - ip_vs_lblcr_full_check(tbl); + ip_vs_lblcr_full_check(svc); tbl->counter = 1; goto out; } @@ -493,7 +467,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data) for (i=0, j=tbl->rover; ilock); + write_lock(&svc->sched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) continue; @@ -502,7 +476,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data) atomic_dec(&tbl->entries); goal--; } - write_unlock(&tbl->lock); + write_unlock(&svc->sched_lock); if (goal <= 0) break; } @@ -520,15 +494,14 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblcr_table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); if (tbl == NULL) { IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); return -ENOMEM; } svc->sched_data = tbl; IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_lblcr_table)); + "current service\n", sizeof(*tbl)); /* * Initialize the hash buckets @@ -536,7 +509,6 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) for (i=0; ibucket[i]); } - rwlock_init(&tbl->lock); tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; @@ -545,9 +517,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) * Hook periodic timer for garbage collection */ setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, - (unsigned long)tbl); - tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; - add_timer(&tbl->periodic_timer); + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); return 0; } @@ -564,9 +535,9 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) ip_vs_lblcr_flush(tbl); /* release the table itself */ - kfree(svc->sched_data); + kfree(tbl); IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_lblcr_table)); + sizeof(*tbl)); return 0; } @@ -663,50 +634,78 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) static struct ip_vs_dest * ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { - struct ip_vs_dest *dest; - struct ip_vs_lblcr_table *tbl; - struct ip_vs_lblcr_entry *en; + struct ip_vs_lblcr_table *tbl = svc->sched_data; struct iphdr *iph = ip_hdr(skb); + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblcr_entry *en; IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); - tbl = (struct ip_vs_lblcr_table *)svc->sched_data; + /* First look in our cache */ + read_lock(&svc->sched_lock); en = ip_vs_lblcr_get(tbl, iph->daddr); - if (en == NULL) { - dest = __ip_vs_lblcr_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - en = ip_vs_lblcr_new(iph->daddr); - if (en == NULL) { - return NULL; - } - ip_vs_dest_set_insert(&en->set, dest); - ip_vs_lblcr_hash(tbl, en); - } else { + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* Get the least loaded destination */ + read_lock(&en->set.lock); dest = ip_vs_dest_set_min(&en->set); - if (!dest || is_overloaded(dest, svc)) { - dest = __ip_vs_lblcr_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - ip_vs_dest_set_insert(&en->set, dest); - } + read_unlock(&en->set.lock); + + /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && - jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { + time_after(jiffies, en->set.lastmod + + sysctl_ip_vs_lblcr_expiration)) { struct ip_vs_dest *m; + + write_lock(&en->set.lock); m = ip_vs_dest_set_max(&en->set); if (m) ip_vs_dest_set_erase(&en->set, m); + write_unlock(&en->set.lock); + } + + /* If the destination is not overloaded, use it */ + if (dest && !is_overloaded(dest, svc)) { + read_unlock(&svc->sched_lock); + goto out; + } + + /* The cache entry is invalid, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + read_unlock(&svc->sched_lock); + return NULL; } + + /* Update our cache entry */ + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + } + read_unlock(&svc->sched_lock); + + if (dest) + goto out; + + /* No cache entry, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; } - en->lastuse = jiffies; + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblcr_new(tbl, iph->daddr, dest); + write_unlock(&svc->sched_lock); + +out: IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(en->addr), + NIPQUAD(iph->daddr), NIPQUAD(dest->addr), ntohs(dest->port)); -- cgit v1.2.3-70-g09d2 From cbe2d128a01315fb4bd55b96cf8b963f5df28ea2 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 23 Aug 2008 05:10:12 -0700 Subject: tcp: Add tcp_validate_incoming & put duplicated code there MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Large block of code duplication removed. Sadly, the return value thing is a bit tricky here but it seems the most sensible way to return positive from validator on success rather than negative. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 147 ++++++++++++++++++++++++--------------------------- 1 file changed, 69 insertions(+), 78 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 67ccce2a96b..e1b15d4e617 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4691,6 +4691,67 @@ out: } #endif /* CONFIG_NET_DMA */ +/* Does PAWS and seqno based validation of an incoming segment, flags will + * play significant role here. + */ +static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, int syn_inerr) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* RFC1323: H1. Apply PAWS check first. */ + if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && + tcp_paws_discard(sk, skb)) { + if (!th->rst) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + tcp_send_dupack(sk, skb); + goto discard; + } + /* Reset is accepted even if it did not pass PAWS. */ + } + + /* Step 1: check sequence number */ + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + /* RFC793, page 37: "In all states except SYN-SENT, all reset + * (RST) segments are validated by checking their SEQ-fields." + * And page 69: "If an incoming segment is not acceptable, + * an acknowledgment should be sent in reply (unless the RST + * bit is set, if so drop the segment and return)". + */ + if (!th->rst) + tcp_send_dupack(sk, skb); + goto discard; + } + + /* Step 2: check RST bit */ + if (th->rst) { + tcp_reset(sk); + goto discard; + } + + /* ts_recent update must be made after we are sure that the packet + * is in window. + */ + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + + /* step 3: check security and precedence [ignored] */ + + /* step 4: Check for a SYN in window. */ + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + if (syn_inerr) + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); + tcp_reset(sk); + return -1; + } + + return 1; + +discard: + __kfree_skb(skb); + return 0; +} + /* * TCP receive function for the ESTABLISHED state. * @@ -4718,6 +4779,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + int res; /* * Header prediction. @@ -4898,52 +4960,13 @@ slow_path: if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) goto csum_error; - /* - * RFC1323: H1. Apply PAWS check first. - */ - if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(sk, skb)) { - if (!th->rst) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); - tcp_send_dupack(sk, skb); - goto discard; - } - /* Resets are accepted even if PAWS failed. - - ts_recent update must be made after we are sure - that the packet is in window. - */ - } - /* * Standard slow path. */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { - /* RFC793, page 37: "In all states except SYN-SENT, all reset - * (RST) segments are validated by checking their SEQ-fields." - * And page 69: "If an incoming segment is not acceptable, - * an acknowledgment should be sent in reply (unless the RST bit - * is set, if so drop the segment and return)". - */ - if (!th->rst) - tcp_send_dupack(sk, skb); - goto discard; - } - - if (th->rst) { - tcp_reset(sk); - goto discard; - } - - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); - tcp_reset(sk); - return 1; - } + res = tcp_validate_incoming(sk, skb, th, 1); + if (res <= 0) + return -res; step5: if (th->ack) @@ -5225,6 +5248,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; + int res; tp->rx_opt.saw_tstamp = 0; @@ -5277,42 +5301,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; } - if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(sk, skb)) { - if (!th->rst) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); - tcp_send_dupack(sk, skb); - goto discard; - } - /* Reset is accepted even if it did not pass PAWS. */ - } - - /* step 1: check sequence number */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { - if (!th->rst) - tcp_send_dupack(sk, skb); - goto discard; - } - - /* step 2: check RST bit */ - if (th->rst) { - tcp_reset(sk); - goto discard; - } - - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - - /* step 3: check security and precedence [ignored] */ - - /* step 4: - * - * Check for a SYN in window. - */ - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); - tcp_reset(sk); - return 1; - } + res = tcp_validate_incoming(sk, skb, th, 0); + if (res <= 0) + return -res; /* step 5: check the ACK field */ if (th->ack) { -- cgit v1.2.3-70-g09d2 From 2cf46637b501794d7fe9e365f0a3046f5d1f5dfb Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 23 Aug 2008 05:11:41 -0700 Subject: tcp: Add tcp_collapse_one to eliminate duplicated code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e1b15d4e617..580f9547ddf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4161,6 +4161,18 @@ add_sack: } } +static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, + struct sk_buff_head *list) +{ + struct sk_buff *next = skb->next; + + __skb_unlink(skb, list); + __kfree_skb(skb); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); + + return next; +} + /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * Segments with FIN/SYN are not collapsed (only because this @@ -4178,11 +4190,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, for (skb = head; skb != tail;) { /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - struct sk_buff *next = skb->next; - __skb_unlink(skb, list); - __kfree_skb(skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); - skb = next; + skb = tcp_collapse_one(sk, skb, list); continue; } @@ -4246,11 +4254,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - struct sk_buff *next = skb->next; - __skb_unlink(skb, list); - __kfree_skb(skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); - skb = next; + skb = tcp_collapse_one(sk, skb, list); if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) -- cgit v1.2.3-70-g09d2 From a4356b2920fd4861dd6c75f558749fa5c38a00e8 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 23 Aug 2008 05:12:29 -0700 Subject: tcp: Add tcp_parse_aligned_timestamp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some duplicated code lying around. Located with my suffix tree tool. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 580f9547ddf..f79a5160729 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3442,6 +3442,22 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } } +static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) +{ + __be32 *ptr = (__be32 *)(th + 1); + + if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { + tp->rx_opt.saw_tstamp = 1; + ++ptr; + tp->rx_opt.rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rx_opt.rcv_tsecr = ntohl(*ptr); + return 1; + } + return 0; +} + /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ @@ -3453,16 +3469,8 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, return 0; } else if (tp->rx_opt.tstamp_ok && th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { - __be32 *ptr = (__be32 *)(th + 1); - if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { - tp->rx_opt.saw_tstamp = 1; - ++ptr; - tp->rx_opt.rcv_tsval = ntohl(*ptr); - ++ptr; - tp->rx_opt.rcv_tsecr = ntohl(*ptr); + if (tcp_parse_aligned_timestamp(tp, th)) return 1; - } } tcp_parse_options(skb, &tp->rx_opt, 1); return 1; @@ -4822,19 +4830,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* Check timestamp */ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { - __be32 *ptr = (__be32 *)(th + 1); - /* No? Slow path! */ - if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) + if (!tcp_parse_aligned_timestamp(tp, th)) goto slow_path; - tp->rx_opt.saw_tstamp = 1; - ++ptr; - tp->rx_opt.rcv_tsval = ntohl(*ptr); - ++ptr; - tp->rx_opt.rcv_tsecr = ntohl(*ptr); - /* If PAWS failed, check it more carefully in slow path */ if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) goto slow_path; -- cgit v1.2.3-70-g09d2 From 409a19669e4cd8d1bab7dff31d3b6aa493ff60f0 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Fri, 22 Aug 2008 14:06:12 +0200 Subject: IPVS: Integrate ESP protocol into ip_vs_proto_ah.c Rename all ah_* functions to ah_esp_* (and adjust comments). Move ESP protocol definition into ip_vs_proto_ah.c and remove all usage of ip_vs_proto_esp.c. Make the compilation of ip_vs_proto_ah.c dependent on a new config variable, IP_VS_PROTO_AH_ESP, which is selected either by IP_VS_PROTO_ESP or IP_VS_PROTO_AH. Only compile the selected protocols' structures within this file. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/Kconfig | 6 ++++ net/ipv4/ipvs/Makefile | 3 +- net/ipv4/ipvs/ip_vs_proto_ah.c | 69 ++++++++++++++++++++++++++++-------------- 3 files changed, 54 insertions(+), 24 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig index 09d0c3f3566..2e48a7e2722 100644 --- a/net/ipv4/ipvs/Kconfig +++ b/net/ipv4/ipvs/Kconfig @@ -71,14 +71,20 @@ config IP_VS_PROTO_UDP This option enables support for load balancing UDP transport protocol. Say Y if unsure. +config IP_VS_PROTO_AH_ESP + bool + depends on UNDEFINED + config IP_VS_PROTO_ESP bool "ESP load balancing support" + select IP_VS_PROTO_AH_ESP ---help--- This option enables support for load balancing ESP (Encapsulation Security Payload) transport protocol. Say Y if unsure. config IP_VS_PROTO_AH bool "AH load balancing support" + select IP_VS_PROTO_AH_ESP ---help--- This option enables support for load balancing AH (Authentication Header) transport protocol. Say Y if unsure. diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile index 30e85de9fff..cda3e0860d6 100644 --- a/net/ipv4/ipvs/Makefile +++ b/net/ipv4/ipvs/Makefile @@ -6,8 +6,7 @@ ip_vs_proto-objs-y := ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah.o ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 73e0ea87c1f..3f9ebd7639a 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -1,5 +1,5 @@ /* - * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS + * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS * * Authors: Julian Anastasov , February 2002 * Wensong Zhang @@ -39,11 +39,11 @@ struct isakmp_hdr { static struct ip_vs_conn * -ah_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, - int inverse) +ah_esp_conn_in_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) { struct ip_vs_conn *cp; @@ -79,8 +79,8 @@ ah_conn_in_get(const struct sk_buff *skb, static struct ip_vs_conn * -ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) { struct ip_vs_conn *cp; @@ -112,12 +112,12 @@ ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, static int -ah_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) +ah_esp_conn_schedule(struct sk_buff *skb, + struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) { /* - * AH is only related traffic. Pass the packet to IP stack. + * AH/ESP is only related traffic. Pass the packet to IP stack. */ *verdict = NF_ACCEPT; return 0; @@ -125,8 +125,8 @@ ah_conn_schedule(struct sk_buff *skb, static void -ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) +ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) { char buf[256]; struct iphdr _iph, *ih; @@ -143,28 +143,29 @@ ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, } -static void ah_init(struct ip_vs_protocol *pp) +static void ah_esp_init(struct ip_vs_protocol *pp) { /* nothing to do now */ } -static void ah_exit(struct ip_vs_protocol *pp) +static void ah_esp_exit(struct ip_vs_protocol *pp) { /* nothing to do now */ } +#ifdef CONFIG_IP_VS_PROTO_AH struct ip_vs_protocol ip_vs_protocol_ah = { .name = "AH", .protocol = IPPROTO_AH, .num_states = 1, .dont_defrag = 1, - .init = ah_init, - .exit = ah_exit, - .conn_schedule = ah_conn_schedule, - .conn_in_get = ah_conn_in_get, - .conn_out_get = ah_conn_out_get, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, .snat_handler = NULL, .dnat_handler = NULL, .csum_check = NULL, @@ -172,7 +173,31 @@ struct ip_vs_protocol ip_vs_protocol_ah = { .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, - .debug_packet = ah_debug_packet, + .debug_packet = ah_esp_debug_packet, .timeout_change = NULL, /* ISAKMP */ .set_state_timeout = NULL, }; +#endif + +#ifdef CONFIG_IP_VS_PROTO_ESP +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif -- cgit v1.2.3-70-g09d2 From e3c2ced8d21410e8bc897480081e2ffc516c0f70 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Fri, 22 Aug 2008 14:06:13 +0200 Subject: IPVS: Rename ip_vs_proto_ah.c to ip_vs_proto_ah_esp.c After integrating ESP into ip_vs_proto_ah, rename it (and the references to it) to ip_vs_proto_ah_esp.c and delete the old ip_vs_proto_esp.c. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/Makefile | 2 +- net/ipv4/ipvs/ip_vs_proto_ah.c | 203 ------------------------------------- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 203 +++++++++++++++++++++++++++++++++++++ net/ipv4/ipvs/ip_vs_proto_esp.c | 176 -------------------------------- 4 files changed, 204 insertions(+), 380 deletions(-) delete mode 100644 net/ipv4/ipvs/ip_vs_proto_ah.c create mode 100644 net/ipv4/ipvs/ip_vs_proto_ah_esp.c delete mode 100644 net/ipv4/ipvs/ip_vs_proto_esp.c (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile index cda3e0860d6..73a46fe1fe4 100644 --- a/net/ipv4/ipvs/Makefile +++ b/net/ipv4/ipvs/Makefile @@ -6,7 +6,7 @@ ip_vs_proto-objs-y := ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c deleted file mode 100644 index 3f9ebd7639a..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS - * - * Authors: Julian Anastasov , February 2002 - * Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation; - * - */ - -#include -#include -#include -#include -#include -#include - -#include - - -/* TODO: - -struct isakmp_hdr { - __u8 icookie[8]; - __u8 rcookie[8]; - __u8 np; - __u8 version; - __u8 xchgtype; - __u8 flags; - __u32 msgid; - __u32 length; -}; - -*/ - -#define PORT_ISAKMP 500 - - -static struct ip_vs_conn * -ah_esp_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - /* - * We are not sure if the packet is from our - * service, so our conn_schedule hook should return NF_ACCEPT - */ - IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static struct ip_vs_conn * -ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static int -ah_esp_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - /* - * AH/ESP is only related traffic. Pass the packet to IP stack. - */ - *verdict = NF_ACCEPT; - return 0; -} - - -static void -ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - - -static void ah_esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void ah_esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -#ifdef CONFIG_IP_VS_PROTO_AH -struct ip_vs_protocol ip_vs_protocol_ah = { - .name = "AH", - .protocol = IPPROTO_AH, - .num_states = 1, - .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, - .conn_schedule = ah_esp_conn_schedule, - .conn_in_get = ah_esp_conn_in_get, - .conn_out_get = ah_esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ - .set_state_timeout = NULL, -}; -#endif - -#ifdef CONFIG_IP_VS_PROTO_ESP -struct ip_vs_protocol ip_vs_protocol_esp = { - .name = "ESP", - .protocol = IPPROTO_ESP, - .num_states = 1, - .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, - .conn_schedule = ah_esp_conn_schedule, - .conn_in_get = ah_esp_conn_in_get, - .conn_out_get = ah_esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ -}; -#endif diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c new file mode 100644 index 00000000000..3f9ebd7639a --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c @@ -0,0 +1,203 @@ +/* + * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS + * + * Authors: Julian Anastasov , February 2002 + * Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include +#include +#include +#include +#include +#include + +#include + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + + +static struct ip_vs_conn * +ah_esp_conn_in_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->saddr, + htons(PORT_ISAKMP), + iph->daddr, + htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->daddr, + htons(PORT_ISAKMP), + iph->saddr, + htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? "ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->saddr, + htons(PORT_ISAKMP), + iph->daddr, + htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->daddr, + htons(PORT_ISAKMP), + iph->saddr, + htons(PORT_ISAKMP)); + } + + if (!cp) { + IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? "ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static int +ah_esp_conn_schedule(struct sk_buff *skb, + struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * AH/ESP is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + + +static void +ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + + +static void ah_esp_init(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +static void ah_esp_exit(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +#ifdef CONFIG_IP_VS_PROTO_AH +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ + .set_state_timeout = NULL, +}; +#endif + +#ifdef CONFIG_IP_VS_PROTO_ESP +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c deleted file mode 100644 index 21d70c8ffa5..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS - * - * Authors: Julian Anastasov , February 2002 - * Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation; - * - */ - -#include -#include -#include -#include -#include -#include - -#include - - -/* TODO: - -struct isakmp_hdr { - __u8 icookie[8]; - __u8 rcookie[8]; - __u8 np; - __u8 version; - __u8 xchgtype; - __u8 flags; - __u32 msgid; - __u32 length; -}; - -*/ - -#define PORT_ISAKMP 500 - - -static struct ip_vs_conn * -esp_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - /* - * We are not sure if the packet is from our - * service, so our conn_schedule hook should return NF_ACCEPT - */ - IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static struct ip_vs_conn * -esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static int -esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - /* - * ESP is only related traffic. Pass the packet to IP stack. - */ - *verdict = NF_ACCEPT; - return 0; -} - - -static void -esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - - -static void esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -struct ip_vs_protocol ip_vs_protocol_esp = { - .name = "ESP", - .protocol = IPPROTO_ESP, - .num_states = 1, - .dont_defrag = 1, - .init = esp_init, - .exit = esp_exit, - .conn_schedule = esp_conn_schedule, - .conn_in_get = esp_conn_in_get, - .conn_out_get = esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ -}; -- cgit v1.2.3-70-g09d2 From 6eac56040787c3ff604fe7d48bbbb7897cd1387c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 28 Aug 2008 01:08:02 -0700 Subject: tcp: Skip empty hash buckets faster in /proc/net/tcp On most systems most of the TCP established/time-wait hash buckets are empty. When walking the hash table for /proc/net/tcp their read locks would always be aquired just to find out they're empty. This patch changes the code to check first if the buckets have any entries before taking the lock, which is much cheaper than taking a lock. Since the hash tables are large this makes a measurable difference on processing /proc/net/tcp, especially on architectures with slow read_lock (e.g. PPC) On a 2GB Core2 system time cat /proc/net/tcp > /dev/null (with a mostly empty hash table) goes from 0.046s to 0.005s. On systems with slower atomics (like P4 or POWER4) or larger hash tables (more RAM) the difference is much higher. This can be noticeable because there are some daemons around who regularly scan /proc/net/tcp. Original idea for this patch from Marcus Meissner, but redone by me. Signed-off-by: Andi Kleen Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 44c1e934824..37ca3843c40 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1946,6 +1946,12 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) return rc; } +static inline int empty_bucket(struct tcp_iter_state *st) +{ + return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) && + hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain); +} + static void *established_get_first(struct seq_file *seq) { struct tcp_iter_state* st = seq->private; @@ -1958,6 +1964,10 @@ static void *established_get_first(struct seq_file *seq) struct inet_timewait_sock *tw; rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); + /* Lockless fast path for the common case of empty buckets */ + if (empty_bucket(st)) + continue; + read_lock_bh(lock); sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { if (sk->sk_family != st->family || @@ -2008,13 +2018,15 @@ get_tw: read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); st->state = TCP_SEQ_STATE_ESTABLISHED; - if (++st->bucket < tcp_hashinfo.ehash_size) { - read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); - sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); - } else { - cur = NULL; - goto out; - } + /* Look for next non empty bucket */ + while (++st->bucket < tcp_hashinfo.ehash_size && + empty_bucket(st)) + ; + if (st->bucket >= tcp_hashinfo.ehash_size) + return NULL; + + read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); } else sk = sk_next(sk); -- cgit v1.2.3-70-g09d2 From 6be547a61d6220199826070cda792297c3d15994 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 28 Aug 2008 01:09:54 -0700 Subject: inet_diag: Add empty bucket optimization to inet_diag too Skip quickly over empty buckets in inet_diag. Signed-off-by: Andi Kleen Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c10036e7a46..89cb047ab31 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -782,11 +782,15 @@ skip_listen_ht: struct sock *sk; struct hlist_node *node; + num = 0; + + if (hlist_empty(&head->chain) && hlist_empty(&head->twchain)) + continue; + if (i > s_i) s_num = 0; read_lock_bh(lock); - num = 0; sk_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); -- cgit v1.2.3-70-g09d2 From a627266570605a98c5fda5b8234d9e92015e4d14 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2008 01:11:25 -0700 Subject: ip: speedup /proc/net/rt_cache handling When scanning route cache hash table, we can avoid taking locks for empty buckets. Both /proc/net/rt_cache and NETLINK RTM_GETROUTE interface are taken into account. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cca921ea855..71598f64c11 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -282,6 +282,8 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) struct rtable *r = NULL; for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { + if (!rt_hash_table[st->bucket].chain) + continue; rcu_read_lock_bh(); r = rcu_dereference(rt_hash_table[st->bucket].chain); while (r) { @@ -299,11 +301,14 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, struct rtable *r) { struct rt_cache_iter_state *st = seq->private; + r = r->u.dst.rt_next; while (!r) { rcu_read_unlock_bh(); - if (--st->bucket < 0) - break; + do { + if (--st->bucket < 0) + return NULL; + } while (!rt_hash_table[st->bucket].chain); rcu_read_lock_bh(); r = rt_hash_table[st->bucket].chain; } @@ -2840,7 +2845,9 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) if (s_h < 0) s_h = 0; s_idx = idx = cb->args[1]; - for (h = s_h; h <= rt_hash_mask; h++) { + for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { + if (!rt_hash_table[h].chain) + continue; rcu_read_lock_bh(); for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; rt = rcu_dereference(rt->u.dst.rt_next), idx++) { @@ -2859,7 +2866,6 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) dst_release(xchg(&skb->dst, NULL)); } rcu_read_unlock_bh(); - s_idx = 0; } done: -- cgit v1.2.3-70-g09d2 From 6224877b2ca4be5de96270a8ae490fe2ba11b0e0 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: tcp/dccp: Consolidate common code for RFC 3390 conversion This patch consolidates the code common to TCP and CCID-2: * TCP uses RFC 3390 in a packet-oriented manner (tcp_input.c) and * CCID-2 uses RFC 3390 in packet-oriented manner (RFC 4341). Signed-off-by: Gerrit Renker --- include/net/tcp.h | 15 +++++++++++++++ net/dccp/ccids/ccid2.c | 8 ++------ net/ipv4/tcp_input.c | 17 ++--------------- 3 files changed, 19 insertions(+), 21 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8983386356a..6bc4b8148ca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -782,6 +782,21 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) +/* + * Convert RFC3390 larger initial windows into an equivalent number of packets. + * + * John Heffner states: + * + * The RFC specifies a window of no more than 4380 bytes + * unless 2*MSS > 4380. Reading the pseudocode in the RFC + * is a bit misleading because they use a clamp at 4380 bytes + * rather than a multiplier in the relevant range. + */ +static inline u32 rfc3390_bytes_to_packets(const u32 bytes) +{ + return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3); +} + extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index c539f79ab8e..fa713227c66 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -596,12 +596,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ hctx->ssthresh = ~0U; - /* - * RFC 4341, 5: "The cwnd parameter is initialized to at most four - * packets for new connections, following the rules from [RFC3390]". - * We need to convert the bytes of RFC3390 into the packets of RFC 4341. - */ - hctx->cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); + /* Use larger initial windows (RFC 3390, rfc2581bis) */ + hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); /* Make sure that Ack Ratio is enabled and within bounds. */ max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 67ccce2a96b..16d0040de34 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -811,25 +811,12 @@ void tcp_update_metrics(struct sock *sk) } } -/* Numbers are taken from RFC3390. - * - * John Heffner states: - * - * The RFC specifies a window of no more than 4380 bytes - * unless 2*MSS > 4380. Reading the pseudocode in the RFC - * is a bit misleading because they use a clamp at 4380 bytes - * rather than use a multiplier in the relevant range. - */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); - if (!cwnd) { - if (tp->mss_cache > 1460) - cwnd = 2; - else - cwnd = (tp->mss_cache > 1095) ? 3 : 4; - } + if (!cwnd) + cwnd = rfc3390_bytes_to_packets(tp->mss_cache); return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -- cgit v1.2.3-70-g09d2 From fab0de02fb0da83b90cec7fce4294747d86d5c6f Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:32 +0200 Subject: IPVS: Add CONFIG_IP_VS_IPV6 option for IPv6 support Add boolean config option CONFIG_IP_VS_IPV6 for enabling experimental IPv6 support in IPVS. Only visible if IPv6 support is set to 'y' or both IPv6 and IPVS are modules. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig index 2e48a7e2722..794cecb249a 100644 --- a/net/ipv4/ipvs/Kconfig +++ b/net/ipv4/ipvs/Kconfig @@ -24,6 +24,14 @@ menuconfig IP_VS if IP_VS +config IP_VS_IPV6 + bool "IPv6 support for IPVS (DANGEROUS)" + depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) + ---help--- + Add IPv6 support to IPVS. This is incomplete and might be dangerous. + + Say N if unsure. + config IP_VS_DEBUG bool "IP virtual server debugging" ---help--- -- cgit v1.2.3-70-g09d2 From e7ade46a53055c19a01c8becbe7807f9075d6fee Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:33 +0200 Subject: IPVS: Change IPVS data structures to support IPv6 addresses Introduce new 'af' fields into IPVS data structures for specifying an entry's address family. Convert IP addresses to be of type union nf_inet_addr. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 18 ++++++++----- net/ipv4/ipvs/ip_vs_conn.c | 60 ++++++++++++++++++++--------------------- net/ipv4/ipvs/ip_vs_core.c | 28 +++++++++---------- net/ipv4/ipvs/ip_vs_ctl.c | 37 ++++++++++++------------- net/ipv4/ipvs/ip_vs_dh.c | 2 +- net/ipv4/ipvs/ip_vs_ftp.c | 18 ++++++------- net/ipv4/ipvs/ip_vs_lblc.c | 4 +-- net/ipv4/ipvs/ip_vs_lblcr.c | 8 +++--- net/ipv4/ipvs/ip_vs_lc.c | 2 +- net/ipv4/ipvs/ip_vs_nq.c | 2 +- net/ipv4/ipvs/ip_vs_proto_tcp.c | 16 +++++------ net/ipv4/ipvs/ip_vs_proto_udp.c | 12 ++++----- net/ipv4/ipvs/ip_vs_rr.c | 2 +- net/ipv4/ipvs/ip_vs_sed.c | 2 +- net/ipv4/ipvs/ip_vs_sh.c | 2 +- net/ipv4/ipvs/ip_vs_sync.c | 6 ++--- net/ipv4/ipvs/ip_vs_wlc.c | 2 +- net/ipv4/ipvs/ip_vs_wrr.c | 2 +- net/ipv4/ipvs/ip_vs_xmit.c | 12 ++++----- 19 files changed, 121 insertions(+), 114 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a25ad243031..d3282558550 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -21,6 +21,9 @@ #include #include +#include /* for union nf_inet_addr */ +#include /* for struct ipv6hdr */ +#include /* for ipv6_addr_copy */ #ifdef CONFIG_IP_VS_DEBUG #include @@ -259,9 +262,10 @@ struct ip_vs_conn { struct list_head c_list; /* hashed list heads */ /* Protocol, addresses and port numbers */ - __be32 caddr; /* client address */ - __be32 vaddr; /* virtual address */ - __be32 daddr; /* destination address */ + u16 af; /* address family */ + union nf_inet_addr caddr; /* client address */ + union nf_inet_addr vaddr; /* virtual address */ + union nf_inet_addr daddr; /* destination address */ __be16 cport; __be16 vport; __be16 dport; @@ -314,8 +318,9 @@ struct ip_vs_service { atomic_t refcnt; /* reference counter */ atomic_t usecnt; /* use counter */ + u16 af; /* address family */ __u16 protocol; /* which protocol (TCP/UDP) */ - __be32 addr; /* IP address for virtual service */ + union nf_inet_addr addr; /* IP address for virtual service */ __be16 port; /* port number for the service */ __u32 fwmark; /* firewall mark of the service */ unsigned flags; /* service status flags */ @@ -342,7 +347,8 @@ struct ip_vs_dest { struct list_head n_list; /* for the dests in the service */ struct list_head d_list; /* for table with all the dests */ - __be32 addr; /* IP address of the server */ + u16 af; /* address family */ + union nf_inet_addr addr; /* IP address of the server */ __be16 port; /* port number of the server */ volatile unsigned flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ @@ -366,7 +372,7 @@ struct ip_vs_dest { /* for virtual service */ struct ip_vs_service *svc; /* service it belongs to */ __u16 protocol; /* which protocol (TCP/UDP) */ - __be32 vaddr; /* virtual IP address */ + union nf_inet_addr vaddr; /* virtual IP address */ __be16 vport; /* virtual port number */ __u32 vfwmark; /* firewall mark of service */ }; diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 44a6872dc24..639d4bc7fc1 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -131,7 +131,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) int ret; /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); + hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport); ct_write_lock(hash); @@ -162,7 +162,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) int ret; /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); + hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport); ct_write_lock(hash); @@ -197,10 +197,10 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr==cp->caddr && s_port==cp->cport && - d_port==cp->vport && d_addr==cp->vaddr && + if (s_addr == cp->caddr.ip && s_port == cp->cport && + d_port == cp->vport && d_addr == cp->vaddr.ip && ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - protocol==cp->protocol) { + protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); ct_read_unlock(hash); @@ -243,10 +243,10 @@ struct ip_vs_conn *ip_vs_ct_in_get ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr==cp->caddr && s_port==cp->cport && - d_port==cp->vport && d_addr==cp->vaddr && + if (s_addr == cp->caddr.ip && s_port == cp->cport && + d_port == cp->vport && d_addr == cp->vaddr.ip && cp->flags & IP_VS_CONN_F_TEMPLATE && - protocol==cp->protocol) { + protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); goto out; @@ -286,8 +286,8 @@ struct ip_vs_conn *ip_vs_conn_out_get ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (d_addr == cp->caddr && d_port == cp->cport && - s_port == cp->dport && s_addr == cp->daddr && + if (d_addr == cp->caddr.ip && d_port == cp->cport && + s_port == cp->dport && s_addr == cp->daddr.ip && protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); @@ -406,9 +406,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), + NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), + NIPQUAD(cp->daddr.ip), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, cp->flags, atomic_read(&cp->refcnt), atomic_read(&dest->refcnt)); @@ -444,8 +444,8 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) struct ip_vs_dest *dest; if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->daddr, cp->dport, - cp->vaddr, cp->vport, cp->protocol); + dest = ip_vs_find_dest(cp->daddr.ip, cp->dport, + cp->vaddr.ip, cp->vport, cp->protocol); ip_vs_bind_dest(cp, dest); return dest; } else @@ -468,9 +468,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), + NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), + NIPQUAD(cp->daddr.ip), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, cp->flags, atomic_read(&cp->refcnt), atomic_read(&dest->refcnt)); @@ -530,9 +530,9 @@ int ip_vs_check_template(struct ip_vs_conn *ct) "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " "-> d:%u.%u.%u.%u:%d\n", ip_vs_proto_name(ct->protocol), - NIPQUAD(ct->caddr), ntohs(ct->cport), - NIPQUAD(ct->vaddr), ntohs(ct->vport), - NIPQUAD(ct->daddr), ntohs(ct->dport)); + NIPQUAD(ct->caddr.ip), ntohs(ct->cport), + NIPQUAD(ct->vaddr.ip), ntohs(ct->vport), + NIPQUAD(ct->daddr.ip), ntohs(ct->dport)); /* * Invalidate the connection template @@ -641,11 +641,11 @@ ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport INIT_LIST_HEAD(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); cp->protocol = proto; - cp->caddr = caddr; + cp->caddr.ip = caddr; cp->cport = cport; - cp->vaddr = vaddr; + cp->vaddr.ip = vaddr; cp->vport = vport; - cp->daddr = daddr; + cp->daddr.ip = daddr; cp->dport = dport; cp->flags = flags; spin_lock_init(&cp->lock); @@ -763,9 +763,9 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr), ntohs(cp->cport), - ntohl(cp->vaddr), ntohs(cp->vport), - ntohl(cp->daddr), ntohs(cp->dport), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), (cp->timer.expires-jiffies)/HZ); } @@ -812,9 +812,9 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n", ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr), ntohs(cp->cport), - ntohl(cp->vaddr), ntohs(cp->vport), - ntohl(cp->daddr), ntohs(cp->dport), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 9fbf0a6d739..4a54f33b60d 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -232,14 +232,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, snet, 0, iph->daddr, ports[1], - dest->addr, dest->port, + dest->addr.ip, dest->port, IP_VS_CONN_F_TEMPLATE, dest); else ct = ip_vs_conn_new(iph->protocol, snet, 0, iph->daddr, 0, - dest->addr, 0, + dest->addr.ip, 0, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) @@ -286,14 +286,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, ct = ip_vs_conn_new(IPPROTO_IP, snet, 0, htonl(svc->fwmark), 0, - dest->addr, 0, + dest->addr.ip, 0, IP_VS_CONN_F_TEMPLATE, dest); else ct = ip_vs_conn_new(iph->protocol, snet, 0, iph->daddr, 0, - dest->addr, 0, + dest->addr.ip, 0, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) @@ -313,7 +313,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, cp = ip_vs_conn_new(iph->protocol, iph->saddr, ports[0], iph->daddr, ports[1], - dest->addr, dport, + dest->addr.ip, dport, 0, dest); if (cp == NULL) { @@ -380,7 +380,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) cp = ip_vs_conn_new(iph->protocol, iph->saddr, pptr[0], iph->daddr, pptr[1], - dest->addr, dest->port?dest->port:pptr[1], + dest->addr.ip, dest->port ? dest->port : pptr[1], 0, dest); if (cp == NULL) @@ -389,9 +389,9 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), + NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), + NIPQUAD(cp->daddr.ip), ntohs(cp->dport), cp->flags, atomic_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); @@ -526,14 +526,14 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct iphdr *ciph = (struct iphdr *)(icmph + 1); if (inout) { - iph->saddr = cp->vaddr; + iph->saddr = cp->vaddr.ip; ip_send_check(iph); - ciph->daddr = cp->vaddr; + ciph->daddr = cp->vaddr.ip; ip_send_check(ciph); } else { - iph->daddr = cp->daddr; + iph->daddr = cp->daddr.ip; ip_send_check(iph); - ciph->saddr = cp->daddr; + ciph->saddr = cp->daddr.ip; ip_send_check(ciph); } @@ -762,7 +762,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, /* mangle the packet */ if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) goto drop; - ip_hdr(skb)->saddr = cp->vaddr; + ip_hdr(skb)->saddr = cp->vaddr.ip; ip_send_check(ip_hdr(skb)); /* For policy routing, packets originating from this diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index ede101eeec1..3f2277b847d 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -317,7 +317,8 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* * Hash it by in ip_vs_svc_table */ - hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); + hash = ip_vs_svc_hashkey(svc->protocol, svc->addr.ip, + svc->port); list_add(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* @@ -373,7 +374,7 @@ __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport) hash = ip_vs_svc_hashkey(protocol, vaddr, vport); list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ - if ((svc->addr == vaddr) + if ((svc->addr.ip == vaddr) && (svc->port == vport) && (svc->protocol == protocol)) { /* HIT */ @@ -503,7 +504,7 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest) * Hash by proto,addr,port, * which are the parameters of the real service. */ - hash = ip_vs_rs_hashkey(dest->addr, dest->port); + hash = ip_vs_rs_hashkey(dest->addr.ip, dest->port); list_add(&dest->d_list, &ip_vs_rtable[hash]); return 1; @@ -543,7 +544,7 @@ ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) read_lock(&__ip_vs_rs_lock); list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { - if ((dest->addr == daddr) + if ((dest->addr.ip == daddr) && (dest->port == dport) && ((dest->protocol == protocol) || dest->vfwmark)) { @@ -569,7 +570,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) * Find the destination for the given service */ list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->addr == daddr) && (dest->port == dport)) { + if ((dest->addr.ip == daddr) && (dest->port == dport)) { /* HIT */ return dest; } @@ -626,14 +627,14 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, - NIPQUAD(dest->addr), ntohs(dest->port), + NIPQUAD(dest->addr.ip), ntohs(dest->port), atomic_read(&dest->refcnt)); - if (dest->addr == daddr && + if (dest->addr.ip == daddr && dest->port == dport && dest->vfwmark == svc->fwmark && dest->protocol == svc->protocol && (svc->fwmark || - (dest->vaddr == svc->addr && + (dest->vaddr.ip == svc->addr.ip && dest->vport == svc->port))) { /* HIT */ return dest; @@ -646,7 +647,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " "from trash\n", dest->vfwmark, - NIPQUAD(dest->addr), ntohs(dest->port)); + NIPQUAD(dest->addr.ip), ntohs(dest->port)); list_del(&dest->n_list); ip_vs_dst_reset(dest); __ip_vs_unbind_svc(dest); @@ -779,10 +780,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, } dest->protocol = svc->protocol; - dest->vaddr = svc->addr; + dest->vaddr.ip = svc->addr.ip; dest->vport = svc->port; dest->vfwmark = svc->fwmark; - dest->addr = udest->addr; + dest->addr.ip = udest->addr; dest->port = udest->port; atomic_set(&dest->activeconns, 0); @@ -847,7 +848,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) NIPQUAD(daddr), ntohs(dport), atomic_read(&dest->refcnt), dest->vfwmark, - NIPQUAD(dest->vaddr), + NIPQUAD(dest->vaddr.ip), ntohs(dest->vport)); __ip_vs_update_dest(svc, dest, udest); @@ -993,7 +994,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) } else { IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " "dest->refcnt=%d\n", - NIPQUAD(dest->addr), ntohs(dest->port), + NIPQUAD(dest->addr.ip), ntohs(dest->port), atomic_read(&dest->refcnt)); list_add(&dest->n_list, &ip_vs_dest_trash); atomic_inc(&dest->refcnt); @@ -1101,7 +1102,7 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) atomic_set(&svc->refcnt, 0); svc->protocol = u->protocol; - svc->addr = u->addr; + svc->addr.ip = u->addr; svc->port = u->port; svc->fwmark = u->fwmark; svc->flags = u->flags; @@ -1751,7 +1752,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) if (iter->table == ip_vs_svc_table) seq_printf(seq, "%s %08X:%04X %s ", ip_vs_proto_name(svc->protocol), - ntohl(svc->addr), + ntohl(svc->addr.ip), ntohs(svc->port), svc->scheduler->name); else @@ -1768,7 +1769,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) list_for_each_entry(dest, &svc->destinations, n_list) { seq_printf(seq, " -> %08X:%04X %-7s %-6d %-10d %-10d\n", - ntohl(dest->addr), ntohs(dest->port), + ntohl(dest->addr.ip), ntohs(dest->port), ip_vs_fwd_name(atomic_read(&dest->conn_flags)), atomic_read(&dest->weight), atomic_read(&dest->activeconns), @@ -2040,7 +2041,7 @@ static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { dst->protocol = src->protocol; - dst->addr = src->addr; + dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); @@ -2114,7 +2115,7 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, if (count >= get->num_dests) break; - entry.addr = dest->addr; + entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); entry.weight = atomic_read(&dest->weight); diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index fa66824d264..9f9d795dbd7 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -218,7 +218,7 @@ ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", NIPQUAD(iph->daddr), - NIPQUAD(dest->addr), + NIPQUAD(dest->addr.ip), ntohs(dest->port)); return dest; diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c index c1c758e4f73..bfe5d7050a5 100644 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -172,17 +172,17 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " "%u.%u.%u.%u:%d detected\n", - NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0); + NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr.ip), 0); /* * Now update or create an connection entry for it */ n_cp = ip_vs_conn_out_get(iph->protocol, from, port, - cp->caddr, 0); + cp->caddr.ip, 0); if (!n_cp) { n_cp = ip_vs_conn_new(IPPROTO_TCP, - cp->caddr, 0, - cp->vaddr, port, + cp->caddr.ip, 0, + cp->vaddr.ip, port, from, port, IP_VS_CONN_F_NO_CPORT, cp->dest); @@ -196,7 +196,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Replace the old passive address with the new one */ - from = n_cp->vaddr; + from = n_cp->vaddr.ip; port = n_cp->vport; sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), (ntohs(port)>>8)&255, ntohs(port)&255); @@ -306,16 +306,16 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, */ IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", ip_vs_proto_name(iph->protocol), - NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0); + NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); n_cp = ip_vs_conn_in_get(iph->protocol, to, port, - cp->vaddr, htons(ntohs(cp->vport)-1)); + cp->vaddr.ip, htons(ntohs(cp->vport)-1)); if (!n_cp) { n_cp = ip_vs_conn_new(IPPROTO_TCP, to, port, - cp->vaddr, htons(ntohs(cp->vport)-1), - cp->daddr, htons(ntohs(cp->dport)-1), + cp->vaddr.ip, htons(ntohs(cp->vport)-1), + cp->daddr.ip, htons(ntohs(cp->dport)-1), 0, cp->dest); if (!n_cp) diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index d2a43aa3fe4..69309edc0c4 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -422,7 +422,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); @@ -506,7 +506,7 @@ out: IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", NIPQUAD(iph->daddr), - NIPQUAD(dest->addr), + NIPQUAD(dest->addr.ip), ntohs(dest->port)); return dest; diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 375a1ffb6b6..51c746e2083 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -204,7 +204,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); @@ -250,7 +250,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(most->addr), ntohs(most->port), + NIPQUAD(most->addr.ip), ntohs(most->port), atomic_read(&most->activeconns), atomic_read(&most->refcnt), atomic_read(&most->weight), moh); @@ -598,7 +598,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); @@ -706,7 +706,7 @@ out: IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", NIPQUAD(iph->daddr), - NIPQUAD(dest->addr), + NIPQUAD(dest->addr.ip), ntohs(dest->port)); return dest; diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c index 2c3de1b6351..551d293347f 100644 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ b/net/ipv4/ipvs/ip_vs_lc.c @@ -68,7 +68,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) if (least) IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->inactconns)); diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c index 5330d5a2de1..aa0e32ad348 100644 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ b/net/ipv4/ipvs/ip_vs_nq.c @@ -101,7 +101,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) out: IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index d0ea467986a..15860e1441b 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -147,7 +147,7 @@ tcp_snat_handler(struct sk_buff *skb, /* Adjust TCP checksums */ if (!cp->app) { /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr, + tcp_fast_csum_update(tcph, cp->daddr.ip, cp->vaddr.ip, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -155,7 +155,7 @@ tcp_snat_handler(struct sk_buff *skb, /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, + tcph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", @@ -198,7 +198,7 @@ tcp_dnat_handler(struct sk_buff *skb, */ if (!cp->app) { /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr, + tcp_fast_csum_update(tcph, cp->vaddr.ip, cp->daddr.ip, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -206,7 +206,7 @@ tcp_dnat_handler(struct sk_buff *skb, /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, + tcph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -427,8 +427,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, th->fin? 'F' : '.', th->ack? 'A' : '.', th->rst? 'R' : '.', - NIPQUAD(cp->daddr), ntohs(cp->dport), - NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->daddr.ip), ntohs(cp->dport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), tcp_state_name(cp->state), tcp_state_name(new_state), atomic_read(&cp->refcnt)); @@ -549,8 +549,8 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" "%u.%u.%u.%u:%u to app %s on port %u\n", __func__, - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), + NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index c6be5d56823..8dfad5db829 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -158,7 +158,7 @@ udp_snat_handler(struct sk_buff *skb, */ if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->daddr, cp->vaddr, + udp_fast_csum_update(udph, cp->daddr.ip, cp->vaddr.ip, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -166,7 +166,7 @@ udp_snat_handler(struct sk_buff *skb, /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, + udph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, skb->len - udphoff, cp->protocol, skb->csum); if (udph->check == 0) @@ -211,7 +211,7 @@ udp_dnat_handler(struct sk_buff *skb, */ if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->vaddr, cp->daddr, + udp_fast_csum_update(udph, cp->vaddr.ip, cp->daddr.ip, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -219,7 +219,7 @@ udp_dnat_handler(struct sk_buff *skb, /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, + udph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, skb->len - udphoff, cp->protocol, skb->csum); if (udph->check == 0) @@ -343,8 +343,8 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" "%u.%u.%u.%u:%u to app %s on port %u\n", __func__, - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->caddr.ip), ntohs(cp->cport), + NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c index f7492911753..27f0b624283 100644 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ b/net/ipv4/ipvs/ip_vs_rr.c @@ -76,7 +76,7 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) write_unlock(&svc->sched_lock); IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr), ntohs(dest->port), + NIPQUAD(dest->addr.ip), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c index 53f73bea66c..38b574b2fdf 100644 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ b/net/ipv4/ipvs/ip_vs_sed.c @@ -103,7 +103,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 7b979e22805..c9e54e2ec29 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -215,7 +215,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " "--> server %u.%u.%u.%u:%d\n", NIPQUAD(iph->saddr), - NIPQUAD(dest->addr), + NIPQUAD(dest->addr.ip), ntohs(dest->port)); return dest; diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index a652da2c320..2cf47b2e166 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -256,9 +256,9 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) s->cport = cp->cport; s->vport = cp->vport; s->dport = cp->dport; - s->caddr = cp->caddr; - s->vaddr = cp->vaddr; - s->daddr = cp->daddr; + s->caddr = cp->caddr.ip; + s->vaddr = cp->vaddr.ip; + s->daddr = cp->daddr.ip; s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); s->state = htons(cp->state); if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c index df7ad8d7476..09fd993040f 100644 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ b/net/ipv4/ipvs/ip_vs_wlc.c @@ -91,7 +91,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), + NIPQUAD(least->addr.ip), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c index 0d86a79b87b..19c49b234f3 100644 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ b/net/ipv4/ipvs/ip_vs_wrr.c @@ -197,7 +197,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr), ntohs(dest->port), + NIPQUAD(dest->addr.ip), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index 9892d4aca42..88199c9f2d3 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -71,7 +71,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .oif = 0, .nl_u = { .ip4_u = { - .daddr = dest->addr, + .daddr = dest->addr.ip, .saddr = 0, .tos = rtos, } }, }; @@ -80,12 +80,12 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) spin_unlock(&dest->dst_lock); IP_VS_DBG_RL("ip_route_output error, " "dest: %u.%u.%u.%u\n", - NIPQUAD(dest->addr)); + NIPQUAD(dest->addr.ip)); return NULL; } __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", - NIPQUAD(dest->addr), + NIPQUAD(dest->addr.ip), atomic_read(&rt->u.dst.__refcnt), rtos); } spin_unlock(&dest->dst_lock); @@ -94,14 +94,14 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .oif = 0, .nl_u = { .ip4_u = { - .daddr = cp->daddr, + .daddr = cp->daddr.ip, .saddr = 0, .tos = rtos, } }, }; if (ip_route_output_key(&init_net, &rt, &fl)) { IP_VS_DBG_RL("ip_route_output error, dest: " - "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); + "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip)); return NULL; } } @@ -264,7 +264,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) goto tx_error; - ip_hdr(skb)->daddr = cp->daddr; + ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); -- cgit v1.2.3-70-g09d2 From c860c6b1479992440e4962e9c95d258bfdce4fca Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:36 +0200 Subject: IPVS: Add internal versions of sockopt interface structs Add extended internal versions of struct ip_vs_service_user and struct ip_vs_dest_user (the originals can't be modified as they are part of the old sockopt interface). Adjust ip_vs_ctl.c to work with the new data structures and add some minor AF-awareness. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 39 +++++++++++++ net/ipv4/ipvs/ip_vs_ctl.c | 138 ++++++++++++++++++++++++++++++---------------- 2 files changed, 129 insertions(+), 48 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 0400e590b6a..1adf8a026e4 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -399,6 +399,45 @@ struct ip_vs_conn { }; +/* + * Extended internal versions of struct ip_vs_service_user and + * ip_vs_dest_user for IPv6 support. + * + * We need these to conveniently pass around service and destination + * options, but unfortunately, we also need to keep the old definitions to + * maintain userspace backwards compatibility for the setsockopt interface. + */ +struct ip_vs_service_user_kern { + /* virtual service addresses */ + u16 af; + u16 protocol; + union nf_inet_addr addr; /* virtual ip address */ + u16 port; + u32 fwmark; /* firwall mark of service */ + + /* virtual service options */ + char *sched_name; + unsigned flags; /* virtual service flags */ + unsigned timeout; /* persistent timeout in sec */ + u32 netmask; /* persistent netmask */ +}; + + +struct ip_vs_dest_user_kern { + /* destination server address */ + union nf_inet_addr addr; + u16 port; + + /* real server options */ + unsigned conn_flags; /* connection flags */ + int weight; /* destination weight */ + + /* thresholds for active connections */ + u32 u_threshold; /* upper threshold */ + u32 l_threshold; /* lower threshold */ +}; + + /* * The information about the virtual service offered to the net * and the forwarding entries diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 3f2277b847d..a0c8b7bb553 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -708,7 +708,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) */ static void __ip_vs_update_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, struct ip_vs_dest_user *udest) + struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) { int conn_flags; @@ -717,7 +717,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; /* check if local node and update the flags */ - if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) { + if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) | IP_VS_CONN_F_LOCALNODE; } @@ -761,7 +761,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, * Create a destination for the given service */ static int -ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, struct ip_vs_dest **dest_p) { struct ip_vs_dest *dest; @@ -769,7 +769,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, EnterFunction(2); - atype = inet_addr_type(&init_net, udest->addr); + atype = inet_addr_type(&init_net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; @@ -779,11 +779,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, return -ENOMEM; } + dest->af = svc->af; dest->protocol = svc->protocol; - dest->vaddr.ip = svc->addr.ip; + dest->vaddr = svc->addr; dest->vport = svc->port; dest->vfwmark = svc->fwmark; - dest->addr.ip = udest->addr; + ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); dest->port = udest->port; atomic_set(&dest->activeconns, 0); @@ -808,10 +809,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, * Add a destination into an existing service */ static int -ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - __be32 daddr = udest->addr; + union nf_inet_addr daddr; __be16 dport = udest->port; int ret; @@ -828,10 +829,12 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) return -ERANGE; } + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + /* * Check if the dest already exists in the list */ - dest = ip_vs_lookup_dest(svc, daddr, dport); + dest = ip_vs_lookup_dest(svc, daddr.ip, dport); if (dest != NULL) { IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); return -EEXIST; @@ -841,7 +844,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) * Check if the dest already exists in the trash and * is from the same service */ - dest = ip_vs_trash_get_dest(svc, daddr, dport); + dest = ip_vs_trash_get_dest(svc, daddr.ip, dport); if (dest != NULL) { IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", @@ -916,10 +919,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) * Edit a destination in the given service */ static int -ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - __be32 daddr = udest->addr; + union nf_inet_addr daddr; __be16 dport = udest->port; EnterFunction(2); @@ -935,10 +938,12 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) return -ERANGE; } + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + /* * Lookup the destination list */ - dest = ip_vs_lookup_dest(svc, daddr, dport); + dest = ip_vs_lookup_dest(svc, daddr.ip, dport); if (dest == NULL) { IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); return -ENOENT; @@ -1029,15 +1034,15 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, * Delete a destination server in the given service */ static int -ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) +ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - __be32 daddr = udest->addr; __be16 dport = udest->port; EnterFunction(2); - dest = ip_vs_lookup_dest(svc, daddr, dport); + dest = ip_vs_lookup_dest(svc, udest->addr.ip, dport); + if (dest == NULL) { IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); return -ENOENT; @@ -1072,7 +1077,8 @@ ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) * Add a service into the service hash table */ static int -ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) +ip_vs_add_service(struct ip_vs_service_user_kern *u, + struct ip_vs_service **svc_p) { int ret = 0; struct ip_vs_scheduler *sched = NULL; @@ -1101,8 +1107,9 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) atomic_set(&svc->usecnt, 1); atomic_set(&svc->refcnt, 0); + svc->af = u->af; svc->protocol = u->protocol; - svc->addr.ip = u->addr; + ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); svc->port = u->port; svc->fwmark = u->fwmark; svc->flags = u->flags; @@ -1161,7 +1168,7 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) * Edit a service and bind it with a new scheduler */ static int -ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u) +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { struct ip_vs_scheduler *sched, *old_sched; int ret = 0; @@ -1905,14 +1912,44 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, }; +static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, + struct ip_vs_service_user *usvc_compat) +{ + usvc->af = AF_INET; + usvc->protocol = usvc_compat->protocol; + usvc->addr.ip = usvc_compat->addr; + usvc->port = usvc_compat->port; + usvc->fwmark = usvc_compat->fwmark; + + /* Deep copy of sched_name is not needed here */ + usvc->sched_name = usvc_compat->sched_name; + + usvc->flags = usvc_compat->flags; + usvc->timeout = usvc_compat->timeout; + usvc->netmask = usvc_compat->netmask; +} + +static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest_user *udest_compat) +{ + udest->addr.ip = udest_compat->addr; + udest->port = udest_compat->port; + udest->conn_flags = udest_compat->conn_flags; + udest->weight = udest_compat->weight; + udest->u_threshold = udest_compat->u_threshold; + udest->l_threshold = udest_compat->l_threshold; +} + static int do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; unsigned char arg[MAX_ARG_LEN]; - struct ip_vs_service_user *usvc; + struct ip_vs_service_user *usvc_compat; + struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; - struct ip_vs_dest_user *udest; + struct ip_vs_dest_user *udest_compat; + struct ip_vs_dest_user_kern udest; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1952,35 +1989,40 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) goto out_unlock; } - usvc = (struct ip_vs_service_user *)arg; - udest = (struct ip_vs_dest_user *)(usvc + 1); + usvc_compat = (struct ip_vs_service_user *)arg; + udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); + + /* We only use the new structs internally, so copy userspace compat + * structs to extended internal versions */ + ip_vs_copy_usvc_compat(&usvc, usvc_compat); + ip_vs_copy_udest_compat(&udest, udest_compat); if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ - if (!usvc->fwmark && !usvc->addr && !usvc->port) { + if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { ret = ip_vs_zero_all(); goto out_unlock; } } /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ - if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) { + if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) { IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", - usvc->protocol, NIPQUAD(usvc->addr), - ntohs(usvc->port), usvc->sched_name); + usvc.protocol, NIPQUAD(usvc.addr.ip), + ntohs(usvc.port), usvc.sched_name); ret = -EFAULT; goto out_unlock; } /* Lookup the exact service by or fwmark */ - if (usvc->fwmark == 0) - svc = __ip_vs_service_get(usvc->protocol, - usvc->addr, usvc->port); + if (usvc.fwmark == 0) + svc = __ip_vs_service_get(usvc.protocol, + usvc.addr.ip, usvc.port); else - svc = __ip_vs_svc_fwm_get(usvc->fwmark); + svc = __ip_vs_svc_fwm_get(usvc.fwmark); if (cmd != IP_VS_SO_SET_ADD - && (svc == NULL || svc->protocol != usvc->protocol)) { + && (svc == NULL || svc->protocol != usvc.protocol)) { ret = -ESRCH; goto out_unlock; } @@ -1990,10 +2032,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (svc != NULL) ret = -EEXIST; else - ret = ip_vs_add_service(usvc, &svc); + ret = ip_vs_add_service(&usvc, &svc); break; case IP_VS_SO_SET_EDIT: - ret = ip_vs_edit_service(svc, usvc); + ret = ip_vs_edit_service(svc, &usvc); break; case IP_VS_SO_SET_DEL: ret = ip_vs_del_service(svc); @@ -2004,13 +2046,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = ip_vs_zero_service(svc); break; case IP_VS_SO_SET_ADDDEST: - ret = ip_vs_add_dest(svc, udest); + ret = ip_vs_add_dest(svc, &udest); break; case IP_VS_SO_SET_EDITDEST: - ret = ip_vs_edit_dest(svc, udest); + ret = ip_vs_edit_dest(svc, &udest); break; case IP_VS_SO_SET_DELDEST: - ret = ip_vs_del_dest(svc, udest); + ret = ip_vs_del_dest(svc, &udest); break; default: ret = -EINVAL; @@ -2517,7 +2559,7 @@ nla_put_failure: return skb->len; } -static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, +static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry) { struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; @@ -2537,6 +2579,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) return -EINVAL; + usvc->af = nla_get_u16(nla_af); /* For now, only support IPv4 */ if (nla_get_u16(nla_af) != AF_INET) return -EAFNOSUPPORT; @@ -2572,7 +2615,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, if (usvc->fwmark) svc = __ip_vs_svc_fwm_get(usvc->fwmark); else - svc = __ip_vs_service_get(usvc->protocol, usvc->addr, + svc = __ip_vs_service_get(usvc->protocol, usvc->addr.ip, usvc->port); if (svc) { usvc->flags = svc->flags; @@ -2583,9 +2626,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, /* set new flags from userland */ usvc->flags = (usvc->flags & ~flags.mask) | (flags.flags & flags.mask); - - strlcpy(usvc->sched_name, nla_data(nla_sched), - sizeof(usvc->sched_name)); + usvc->sched_name = nla_data(nla_sched); usvc->timeout = nla_get_u32(nla_timeout); usvc->netmask = nla_get_u32(nla_netmask); } @@ -2595,7 +2636,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc, static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) { - struct ip_vs_service_user usvc; + struct ip_vs_service_user_kern usvc; int ret; ret = ip_vs_genl_parse_service(&usvc, nla, 0); @@ -2605,7 +2646,7 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) if (usvc.fwmark) return __ip_vs_svc_fwm_get(usvc.fwmark); else - return __ip_vs_service_get(usvc.protocol, usvc.addr, + return __ip_vs_service_get(usvc.protocol, usvc.addr.ip, usvc.port); } @@ -2705,7 +2746,7 @@ out_err: return skb->len; } -static int ip_vs_genl_parse_dest(struct ip_vs_dest_user *udest, +static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, struct nlattr *nla, int full_entry) { struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; @@ -2861,8 +2902,8 @@ static int ip_vs_genl_set_config(struct nlattr **attrs) static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_service *svc = NULL; - struct ip_vs_service_user usvc; - struct ip_vs_dest_user udest; + struct ip_vs_service_user_kern usvc; + struct ip_vs_dest_user_kern udest; int ret = 0, cmd; int need_full_svc = 0, need_full_dest = 0; @@ -2914,7 +2955,8 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) /* Lookup the exact service by or fwmark */ if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.protocol, usvc.addr, usvc.port); + svc = __ip_vs_service_get(usvc.protocol, usvc.addr.ip, + usvc.port); else svc = __ip_vs_svc_fwm_get(usvc.fwmark); -- cgit v1.2.3-70-g09d2 From b18610de9ec2728159f723a9b864ca78a5774193 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:37 +0200 Subject: IPVS: Convert __ip_vs_svc_get() and __ip_vs_fwm_get() Add support for getting services based on their address family to __ip_vs_service_get(), __ip_vs_fwm_get() and the helper hash function ip_vs_svc_hashkey(). Adjust the callers. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 79 ++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 32 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index a0c8b7bb553..a2d69b2ce6a 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -282,11 +282,19 @@ static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); * Returns hash value for virtual service */ static __inline__ unsigned -ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port) +ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, + __be16 port) { register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; - return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) & IP_VS_SVC_TAB_MASK; } @@ -317,7 +325,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* * Hash it by in ip_vs_svc_table */ - hash = ip_vs_svc_hashkey(svc->protocol, svc->addr.ip, + hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, svc->port); list_add(&svc->s_list, &ip_vs_svc_table[hash]); } else { @@ -364,17 +372,19 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) /* * Get service by {proto,addr,port} in the service table. */ -static __inline__ struct ip_vs_service * -__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport) +static inline struct ip_vs_service * +__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, + __be16 vport) { unsigned hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(protocol, vaddr, vport); + hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ - if ((svc->addr.ip == vaddr) + if ((svc->af == af) + && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) && (svc->protocol == protocol)) { /* HIT */ @@ -390,7 +400,8 @@ __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport) /* * Get service by {fwmark} in the service table. */ -static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) +static inline struct ip_vs_service * +__ip_vs_svc_fwm_get(int af, __u32 fwmark) { unsigned hash; struct ip_vs_service *svc; @@ -399,7 +410,7 @@ static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) hash = ip_vs_svc_fwm_hashkey(fwmark); list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark) { + if (svc->fwmark == fwmark && svc->af == af) { /* HIT */ atomic_inc(&svc->usecnt); return svc; @@ -413,20 +424,20 @@ struct ip_vs_service * ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) { struct ip_vs_service *svc; - + union nf_inet_addr _vaddr = { .ip = vaddr }; read_lock(&__ip_vs_svc_lock); /* * Check the table hashed by fwmark first */ - if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark))) + if (fwmark && (svc = __ip_vs_svc_fwm_get(AF_INET, fwmark))) goto out; /* * Check the table hashed by * for "full" addressed entries */ - svc = __ip_vs_service_get(protocol, vaddr, vport); + svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, vport); if (svc == NULL && protocol == IPPROTO_TCP @@ -436,7 +447,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ - svc = __ip_vs_service_get(protocol, vaddr, FTPPORT); + svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, FTPPORT); } if (svc == NULL @@ -444,7 +455,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) /* * Check if the catch-all port (port zero) exists */ - svc = __ip_vs_service_get(protocol, vaddr, 0); + svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, 0); } out: @@ -2016,10 +2027,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* Lookup the exact service by or fwmark */ if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.protocol, - usvc.addr.ip, usvc.port); + svc = __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); else - svc = __ip_vs_svc_fwm_get(usvc.fwmark); + svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { @@ -2141,13 +2152,15 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; + union nf_inet_addr addr = { .ip = get->addr }; int ret = 0; if (get->fwmark) - svc = __ip_vs_svc_fwm_get(get->fwmark); + svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); else - svc = __ip_vs_service_get(get->protocol, - get->addr, get->port); + svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, + get->port); + if (svc) { int count = 0; struct ip_vs_dest *dest; @@ -2282,13 +2295,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_service_entry *entry; struct ip_vs_service *svc; + union nf_inet_addr addr; entry = (struct ip_vs_service_entry *)arg; + addr.ip = entry->addr; if (entry->fwmark) - svc = __ip_vs_svc_fwm_get(entry->fwmark); + svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); else - svc = __ip_vs_service_get(entry->protocol, - entry->addr, entry->port); + svc = __ip_vs_service_get(AF_INET, entry->protocol, + &addr, entry->port); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) @@ -2613,10 +2628,10 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, /* prefill flags from service if it already exists */ if (usvc->fwmark) - svc = __ip_vs_svc_fwm_get(usvc->fwmark); + svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark); else - svc = __ip_vs_service_get(usvc->protocol, usvc->addr.ip, - usvc->port); + svc = __ip_vs_service_get(usvc->af, usvc->protocol, + &usvc->addr, usvc->port); if (svc) { usvc->flags = svc->flags; ip_vs_service_put(svc); @@ -2644,10 +2659,10 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) return ERR_PTR(ret); if (usvc.fwmark) - return __ip_vs_svc_fwm_get(usvc.fwmark); + return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); else - return __ip_vs_service_get(usvc.protocol, usvc.addr.ip, - usvc.port); + return __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); } static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) @@ -2955,10 +2970,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) /* Lookup the exact service by or fwmark */ if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.protocol, usvc.addr.ip, - usvc.port); + svc = __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); else - svc = __ip_vs_svc_fwm_get(usvc.fwmark); + svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); /* Unless we're adding a new service, the service must already exist */ if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { -- cgit v1.2.3-70-g09d2 From 3c2e0505d25cdc9425336f167fd4ff5f505aecff Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:38 +0200 Subject: IPVS: Add v6 support to ip_vs_service_get() Add support for selecting services based on their address family to ip_vs_service_get() and adjust the callers. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 ++- net/ipv4/ipvs/ip_vs_ctl.c | 24 +++++++++++++----------- net/ipv4/ipvs/ip_vs_proto_tcp.c | 9 ++++++--- net/ipv4/ipvs/ip_vs_proto_udp.c | 11 +++++++---- 4 files changed, 28 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 1adf8a026e4..30baed0e696 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -783,7 +783,8 @@ extern struct ip_vs_stats ip_vs_stats; extern const struct ctl_path net_vs_ctl_path[]; extern struct ip_vs_service * -ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport); +ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport); static inline void ip_vs_service_put(struct ip_vs_service *svc) { diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index a2d69b2ce6a..1f3fc66e694 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -421,23 +421,24 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark) } struct ip_vs_service * -ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) +ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; - union nf_inet_addr _vaddr = { .ip = vaddr }; + read_lock(&__ip_vs_svc_lock); /* * Check the table hashed by fwmark first */ - if (fwmark && (svc = __ip_vs_svc_fwm_get(AF_INET, fwmark))) + if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) goto out; /* * Check the table hashed by * for "full" addressed entries */ - svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, vport); + svc = __ip_vs_service_get(af, protocol, vaddr, vport); if (svc == NULL && protocol == IPPROTO_TCP @@ -447,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ - svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, FTPPORT); + svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); } if (svc == NULL @@ -455,16 +456,16 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) /* * Check if the catch-all port (port zero) exists */ - svc = __ip_vs_service_get(AF_INET, protocol, &_vaddr, 0); + svc = __ip_vs_service_get(af, protocol, vaddr, 0); } out: read_unlock(&__ip_vs_svc_lock); - IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", - fwmark, ip_vs_proto_name(protocol), - NIPQUAD(vaddr), ntohs(vport), - svc?"hit":"not hit"); + IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), + svc ? "hit" : "not hit"); return svc; } @@ -605,8 +606,9 @@ struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, { struct ip_vs_dest *dest; struct ip_vs_service *svc; + union nf_inet_addr _vaddr = { .ip = vaddr }; - svc = ip_vs_service_get(0, protocol, vaddr, vport); + svc = ip_vs_service_get(AF_INET, 0, protocol, &_vaddr, vport); if (!svc) return NULL; dest = ip_vs_lookup_dest(svc, daddr, dport); diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 15860e1441b..fe93c9e6ff6 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -74,16 +74,19 @@ tcp_conn_schedule(struct sk_buff *skb, { struct ip_vs_service *svc; struct tcphdr _tcph, *th; + struct ip_vs_iphdr iph; - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + + th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); if (th == NULL) { *verdict = NF_DROP; return 0; } if (th->syn && - (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, - ip_hdr(skb)->daddr, th->dest))) { + (svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol, + &iph.daddr, th->dest))) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 8dfad5db829..d208ed6eb9f 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -80,16 +80,19 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, { struct ip_vs_service *svc; struct udphdr _udph, *uh; + struct ip_vs_iphdr iph; - uh = skb_header_pointer(skb, ip_hdrlen(skb), - sizeof(_udph), &_udph); + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + + uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); if (uh == NULL) { *verdict = NF_DROP; return 0; } - if ((svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, - ip_hdr(skb)->daddr, uh->dest))) { + svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol, + &iph.daddr, uh->dest); + if (svc) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. -- cgit v1.2.3-70-g09d2 From b14198f6c1bea1687d20723db35d8effecd9d899 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:39 +0200 Subject: IPVS: Add IPv6 support flag to schedulers Add 'supports_ipv6' flag to struct ip_vs_scheduler to indicate whether a scheduler supports IPv6. Set the flag to 1 in schedulers that work with IPv6, 0 otherwise. This flag is checked in a later patch while trying to add a service with a specific scheduler. Adjust debug in v6-supporting schedulers to work with both address families. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 +++ net/ipv4/ipvs/ip_vs_dh.c | 3 +++ net/ipv4/ipvs/ip_vs_lblc.c | 3 +++ net/ipv4/ipvs/ip_vs_lblcr.c | 3 +++ net/ipv4/ipvs/ip_vs_lc.c | 11 +++++++---- net/ipv4/ipvs/ip_vs_nq.c | 15 +++++++++------ net/ipv4/ipvs/ip_vs_rr.c | 13 ++++++++----- net/ipv4/ipvs/ip_vs_sed.c | 15 +++++++++------ net/ipv4/ipvs/ip_vs_sh.c | 3 +++ net/ipv4/ipvs/ip_vs_wlc.c | 15 +++++++++------ net/ipv4/ipvs/ip_vs_wrr.c | 15 +++++++++------ 11 files changed, 66 insertions(+), 33 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 30baed0e696..3d3b699dc02 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -516,6 +516,9 @@ struct ip_vs_scheduler { char *name; /* scheduler name */ atomic_t refcnt; /* reference counter */ struct module *module; /* THIS_MODULE/NULL */ +#ifdef CONFIG_IP_VS_IPV6 + int supports_ipv6; /* scheduler has IPv6 support */ +#endif /* scheduler initializing service */ int (*init_service)(struct ip_vs_service *svc); diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index 9f9d795dbd7..a16943fd72f 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -234,6 +234,9 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif .init_service = ip_vs_dh_init_svc, .done_service = ip_vs_dh_done_svc, .update_service = ip_vs_dh_update_svc, diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 69309edc0c4..6ecef3518ca 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -522,6 +522,9 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif .init_service = ip_vs_lblc_init_svc, .done_service = ip_vs_lblc_done_svc, .schedule = ip_vs_lblc_schedule, diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 51c746e2083..1f75ea83bcf 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -722,6 +722,9 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif .init_service = ip_vs_lblcr_init_svc, .done_service = ip_vs_lblcr_done_svc, .schedule = ip_vs_lblcr_schedule, diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c index 551d293347f..b69f808ac46 100644 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ b/net/ipv4/ipvs/ip_vs_lc.c @@ -67,10 +67,10 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } if (least) - IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->inactconns)); + IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); return least; } @@ -81,6 +81,9 @@ static struct ip_vs_scheduler ip_vs_lc_scheduler = { .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .schedule = ip_vs_lc_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c index aa0e32ad348..9a2d8033f08 100644 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ b/net/ipv4/ipvs/ip_vs_nq.c @@ -99,12 +99,12 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) return NULL; out: - IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); + IP_VS_DBG_BUF(6, "NQ: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); return least; } @@ -116,6 +116,9 @@ static struct ip_vs_scheduler ip_vs_nq_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .schedule = ip_vs_nq_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c index 27f0b624283..a22195f68ac 100644 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ b/net/ipv4/ipvs/ip_vs_rr.c @@ -74,11 +74,11 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) out: svc->sched_data = q; write_unlock(&svc->sched_lock); - IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr.ip), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + IP_VS_DBG_BUF(6, "RR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); return dest; } @@ -89,6 +89,9 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = { .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .init_service = ip_vs_rr_init_svc, .update_service = ip_vs_rr_update_svc, .schedule = ip_vs_rr_schedule, diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c index 38b574b2fdf..7d2f22f04b8 100644 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ b/net/ipv4/ipvs/ip_vs_sed.c @@ -101,12 +101,12 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } } - IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); + IP_VS_DBG_BUF(6, "SED: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); return least; } @@ -118,6 +118,9 @@ static struct ip_vs_scheduler ip_vs_sed_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .schedule = ip_vs_sed_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index c9e54e2ec29..1d96de27fef 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -231,6 +231,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif .init_service = ip_vs_sh_init_svc, .done_service = ip_vs_sh_done_svc, .update_service = ip_vs_sh_update_svc, diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c index 09fd993040f..8c596e71259 100644 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ b/net/ipv4/ipvs/ip_vs_wlc.c @@ -89,12 +89,12 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } } - IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); + IP_VS_DBG_BUF(6, "WLC: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); return least; } @@ -106,6 +106,9 @@ static struct ip_vs_scheduler ip_vs_wlc_scheduler = .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .schedule = ip_vs_wlc_schedule, }; diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c index 19c49b234f3..7ea92fed50b 100644 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ b/net/ipv4/ipvs/ip_vs_wrr.c @@ -195,12 +195,12 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } } - IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr.ip), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), - atomic_read(&dest->weight)); + IP_VS_DBG_BUF(6, "WRR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); out: write_unlock(&svc->sched_lock); @@ -213,6 +213,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = { .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif .init_service = ip_vs_wrr_init_svc, .done_service = ip_vs_wrr_done_svc, .update_service = ip_vs_wrr_update_svc, -- cgit v1.2.3-70-g09d2 From 51ef348b14183789e4cb3444d05ce83b1b69d8fb Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:40 +0200 Subject: IPVS: Add 'af' args to protocol handler functions Add 'af' arguments to conn_schedule(), conn_in_get(), conn_out_get() and csum_check() function pointers in struct ip_vs_protocol. Extend the respective functions for TCP, UDP, AH and ESP and adjust the callers. The changes in the callers need to be somewhat extensive, since they now need to pass a filled out struct ip_vs_iphdr * to the modified functions instead of a struct iphdr *. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 15 ++++--- net/ipv4/ipvs/ip_vs_core.c | 64 +++++++++++++++--------------- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 56 +++++++++++++------------- net/ipv4/ipvs/ip_vs_proto_tcp.c | 79 ++++++++++++++++++++++++------------- net/ipv4/ipvs/ip_vs_proto_udp.c | 81 ++++++++++++++++++++++++-------------- 5 files changed, 171 insertions(+), 124 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 3d3b699dc02..68f004f9bcc 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -296,21 +296,23 @@ struct ip_vs_protocol { void (*exit)(struct ip_vs_protocol *pp); - int (*conn_schedule)(struct sk_buff *skb, + int (*conn_schedule)(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp); struct ip_vs_conn * - (*conn_in_get)(const struct sk_buff *skb, + (*conn_in_get)(int af, + const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, + const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse); struct ip_vs_conn * - (*conn_out_get)(const struct sk_buff *skb, + (*conn_out_get)(int af, + const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, + const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse); @@ -320,7 +322,8 @@ struct ip_vs_protocol { int (*dnat_handler)(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp); - int (*csum_check)(struct sk_buff *skb, struct ip_vs_protocol *pp); + int (*csum_check)(int af, struct sk_buff *skb, + struct ip_vs_protocol *pp); const char *(*state_name)(int state); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 4a54f33b60d..34aaa1480d9 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -572,6 +572,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl, verdict; @@ -627,8 +628,9 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) offset += cih->ihl * 4; + ip_vs_fill_iphdr(AF_INET, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(skb, pp, cih, offset, 1); + cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); if (!cp) return NF_ACCEPT; @@ -686,43 +688,41 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct iphdr *iph; + struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; - int ihl; EnterFunction(11); if (skb->ipvs_property) return NF_ACCEPT; - iph = ip_hdr(skb); - if (unlikely(iph->protocol == IPPROTO_ICMP)) { + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related, verdict = ip_vs_out_icmp(skb, &related); if (related) return verdict; - iph = ip_hdr(skb); + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); } - pp = ip_vs_proto_get(iph->protocol); + pp = ip_vs_proto_get(iph.protocol); if (unlikely(!pp)) return NF_ACCEPT; /* reassemble IP fragments */ - if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) && + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && !pp->dont_defrag)) { if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) return NF_STOLEN; - iph = ip_hdr(skb); - } - ihl = iph->ihl << 2; + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + } /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(skb, pp, iph, ihl, 0); + cp = pp->conn_out_get(AF_INET, skb, pp, &iph, iph.len, 0); if (unlikely(!cp)) { if (sysctl_ip_vs_nat_icmp_send && @@ -730,18 +730,18 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, pp->protocol == IPPROTO_UDP)) { __be16 _ports[2], *pptr; - pptr = skb_header_pointer(skb, ihl, + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(iph->protocol, - iph->saddr, pptr[0])) { + if (ip_vs_lookup_real_service(iph.protocol, + iph.saddr.ip, pptr[0])) { /* * Notify the real server: there is no * existing entry if it is not RST * packet or not TCP packet. */ - if (iph->protocol != IPPROTO_TCP + if (iph.protocol != IPPROTO_TCP || !is_tcp_reset(skb)) { icmp_send(skb,ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); @@ -756,7 +756,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); - if (!skb_make_writable(skb, ihl)) + if (!skb_make_writable(skb, iph.len)) goto drop; /* mangle the packet */ @@ -804,6 +804,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl, verdict; @@ -860,8 +861,9 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) offset += cih->ihl * 4; + ip_vs_fill_iphdr(AF_INET, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(skb, pp, cih, offset, 1); + cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); if (!cp) return NF_ACCEPT; @@ -897,11 +899,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct iphdr *iph; + struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; int ret, restart; - int ihl; + + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); /* * Big tappo: only PACKET_HOST (neither loopback nor mcasts) @@ -909,38 +912,35 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, */ if (unlikely(skb->pkt_type != PACKET_HOST || skb->dev->flags & IFF_LOOPBACK || skb->sk)) { - IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", - skb->pkt_type, - ip_hdr(skb)->protocol, - NIPQUAD(ip_hdr(skb)->daddr)); + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", + skb->pkt_type, + iph.protocol, + IP_VS_DBG_ADDR(AF_INET, &iph.daddr)); return NF_ACCEPT; } - iph = ip_hdr(skb); - if (unlikely(iph->protocol == IPPROTO_ICMP)) { + if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); if (related) return verdict; - iph = ip_hdr(skb); + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); } /* Protocol supported? */ - pp = ip_vs_proto_get(iph->protocol); + pp = ip_vs_proto_get(iph.protocol); if (unlikely(!pp)) return NF_ACCEPT; - ihl = iph->ihl << 2; - /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(skb, pp, iph, ihl, 0); + cp = pp->conn_in_get(AF_INET, skb, pp, &iph, iph.len, 0); if (unlikely(!cp)) { int v; - if (!pp->conn_schedule(skb, pp, &v, &cp)) + if (!pp->conn_schedule(AF_INET, skb, pp, &v, &cp)) return v; } diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c index 3f9ebd7639a..2a361a99174 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c @@ -39,25 +39,23 @@ struct isakmp_hdr { static struct ip_vs_conn * -ah_esp_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, +ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { struct ip_vs_conn *cp; if (likely(!inverse)) { cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr, + iph->saddr.ip, htons(PORT_ISAKMP), - iph->daddr, + iph->daddr.ip, htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr, + iph->daddr.ip, htons(PORT_ISAKMP), - iph->saddr, + iph->saddr.ip, htons(PORT_ISAKMP)); } @@ -66,12 +64,12 @@ ah_esp_conn_in_get(const struct sk_buff *skb, * We are not sure if the packet is from our * service, so our conn_schedule hook should return NF_ACCEPT */ - IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + pp->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); } return cp; @@ -79,32 +77,35 @@ ah_esp_conn_in_get(const struct sk_buff *skb, static struct ip_vs_conn * -ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +ah_esp_conn_out_get(int af, const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, + int inverse) { struct ip_vs_conn *cp; if (likely(!inverse)) { cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr, + iph->saddr.ip, htons(PORT_ISAKMP), - iph->daddr, + iph->daddr.ip, htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr, + iph->daddr.ip, htons(PORT_ISAKMP), - iph->saddr, + iph->saddr.ip, htons(PORT_ISAKMP)); } if (!cp) { - IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + pp->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); } return cp; @@ -112,8 +113,7 @@ ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, static int -ah_esp_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) { /* diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index fe93c9e6ff6..9211afa8f30 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -25,8 +25,9 @@ static struct ip_vs_conn * -tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) { __be16 _ports[2], *pptr; @@ -36,18 +37,19 @@ tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, if (likely(!inverse)) { return ip_vs_conn_in_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); + iph->saddr.ip, pptr[0], + iph->daddr.ip, pptr[1]); } else { return ip_vs_conn_in_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); + iph->daddr.ip, pptr[1], + iph->saddr.ip, pptr[0]); } } static struct ip_vs_conn * -tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) { __be16 _ports[2], *pptr; @@ -57,26 +59,25 @@ tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, if (likely(!inverse)) { return ip_vs_conn_out_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); + iph->saddr.ip, pptr[0], + iph->daddr.ip, pptr[1]); } else { return ip_vs_conn_out_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); + iph->daddr.ip, pptr[1], + iph->saddr.ip, pptr[0]); } } static int -tcp_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) { struct ip_vs_service *svc; struct tcphdr _tcph, *th; struct ip_vs_iphdr iph; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); if (th == NULL) { @@ -85,8 +86,8 @@ tcp_conn_schedule(struct sk_buff *skb, } if (th->syn && - (svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol, - &iph.daddr, th->dest))) { + (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, + th->dest))) { if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -136,7 +137,7 @@ tcp_snat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) + if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) return 0; /* Call application helper if needed */ @@ -182,7 +183,7 @@ tcp_dnat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) + if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) return 0; /* @@ -219,21 +220,43 @@ tcp_dnat_handler(struct sk_buff *skb, static int -tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { - const unsigned int tcphoff = ip_hdrlen(skb); + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - skb->len - tcphoff, - ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - tcphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - tcphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } break; default: /* No need to checksum. */ diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index d208ed6eb9f..d3a1b1f2d10 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -24,8 +24,9 @@ #include static struct ip_vs_conn * -udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) { struct ip_vs_conn *cp; __be16 _ports[2], *pptr; @@ -36,12 +37,12 @@ udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, if (likely(!inverse)) { cp = ip_vs_conn_in_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); + iph->saddr.ip, pptr[0], + iph->daddr.ip, pptr[1]); } else { cp = ip_vs_conn_in_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); + iph->daddr.ip, pptr[1], + iph->saddr.ip, pptr[0]); } return cp; @@ -49,25 +50,25 @@ udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, static struct ip_vs_conn * -udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) +udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) { struct ip_vs_conn *cp; __be16 _ports[2], *pptr; - pptr = skb_header_pointer(skb, ip_hdrlen(skb), - sizeof(_ports), _ports); + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); if (pptr == NULL) return NULL; if (likely(!inverse)) { cp = ip_vs_conn_out_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); + iph->saddr.ip, pptr[0], + iph->daddr.ip, pptr[1]); } else { cp = ip_vs_conn_out_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); + iph->daddr.ip, pptr[1], + iph->saddr.ip, pptr[0]); } return cp; @@ -75,14 +76,14 @@ udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, static int -udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) { struct ip_vs_service *svc; struct udphdr _udph, *uh; struct ip_vs_iphdr iph; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); if (uh == NULL) { @@ -90,7 +91,7 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; } - svc = ip_vs_service_get(AF_INET, skb->mark, iph.protocol, + svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, uh->dest); if (svc) { if (ip_vs_todrop()) { @@ -143,7 +144,7 @@ udp_snat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) + if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) return 0; /* @@ -195,7 +196,7 @@ udp_dnat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) + if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) return 0; /* @@ -234,10 +235,17 @@ udp_dnat_handler(struct sk_buff *skb, static int -udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { struct udphdr _udph, *uh; - const unsigned int udphoff = ip_hdrlen(skb); + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); if (uh == NULL) @@ -249,15 +257,28 @@ udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, - skb->len - udphoff, - ip_hdr(skb)->protocol, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - udphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - udphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } break; default: /* No need to checksum. */ -- cgit v1.2.3-70-g09d2 From 3b047d9d0407e78a52f009835a0e26cb62edb8c7 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:41 +0200 Subject: IPVS: Add protocol debug functions for IPv6 Add protocol (TCP, UDP, AH, ESP) debug functions for IPv6 packet debug output. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_proto.c | 63 +++++++++++++++++++++++++++++++++++--- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 36 ++++++++++++++++++++-- 2 files changed, 93 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index 6099a88fc20..50f6215beda 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -152,10 +152,10 @@ const char * ip_vs_state_name(__u16 proto, int state) void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) +ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) { char buf[128]; struct iphdr _iph, *ih; @@ -189,6 +189,61 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); } +#ifdef CONFIG_IP_VS_IPV6 +void +ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[192]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->nexthdr == IPPROTO_FRAGMENT) + sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag", + pp->name, NIP6(ih->saddr), + NIP6(ih->daddr)); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT, + pp->name, + NIP6(ih->saddr), + NIP6(ih->daddr)); + else + sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u", + pp->name, + NIP6(ih->saddr), + ntohs(pptr[0]), + NIP6(ih->daddr), + ntohs(pptr[1])); + } + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} +#endif + + +void +ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb->protocol == __constant_htons(ETH_P_IPV6)) + ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); +} + int __init ip_vs_protocol_init(void) { diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c index 2a361a99174..4b0b8f268d1 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c @@ -125,8 +125,8 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, static void -ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) +ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) { char buf[256]; struct iphdr _iph, *ih; @@ -142,6 +142,38 @@ ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); } +#ifdef CONFIG_IP_VS_IPV6 +static void +ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT, + pp->name, NIP6(ih->saddr), + NIP6(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} +#endif + +static void +ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb->protocol == __constant_htons(ETH_P_IPV6)) + ah_esp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ah_esp_debug_packet_v4(pp, skb, offset, msg); +} + static void ah_esp_init(struct ip_vs_protocol *pp) { -- cgit v1.2.3-70-g09d2 From 0bbdd42b7efa66685b6d74701bcde3a596a3a59d Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:42 +0200 Subject: IPVS: Extend protocol DNAT/SNAT and state handlers Extend protocol DNAT/SNAT and state handlers to work with IPv6. Also change/introduce new checksumming helper functions for this. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 11 ++++++ net/ipv4/ipvs/ip_vs_proto_tcp.c | 85 ++++++++++++++++++++++++++++++++--------- net/ipv4/ipvs/ip_vs_proto_udp.c | 82 ++++++++++++++++++++++++++++++--------- 3 files changed, 142 insertions(+), 36 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 68f004f9bcc..c173f1a7de2 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -904,6 +904,17 @@ static inline __wsum ip_vs_check_diff4(__be32 old, __be32 new, __wsum oldsum) return csum_partial((char *) diff, sizeof(diff), oldsum); } +#ifdef CONFIG_IP_VS_IPV6 +static inline __wsum ip_vs_check_diff16(const __be32 *old, const __be32 *new, + __wsum oldsum) +{ + __be32 diff[8] = { ~old[3], ~old[2], ~old[1], ~old[0], + new[3], new[2], new[1], new[0] }; + + return csum_partial((char *) diff, sizeof(diff), oldsum); +} +#endif + static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum) { __be16 diff[2] = { ~old, new }; diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 9211afa8f30..3daae43ae44 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -114,11 +114,21 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, static inline void -tcp_fast_csum_update(struct tcphdr *tcph, __be32 oldip, __be32 newip, +tcp_fast_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, __be16 oldport, __be16 newport) { +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); + else +#endif tcph->check = - csum_fold(ip_vs_check_diff4(oldip, newip, + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); } @@ -129,7 +139,14 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct tcphdr *tcph; - const unsigned int tcphoff = ip_hdrlen(skb); + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) @@ -137,7 +154,7 @@ tcp_snat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ @@ -145,13 +162,13 @@ tcp_snat_handler(struct sk_buff *skb, return 0; } - tcph = (void *)ip_hdr(skb) + tcphoff; + tcph = (void *)skb_network_header(skb) + tcphoff; tcph->source = cp->vport; /* Adjust TCP checksums */ if (!cp->app) { /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->daddr.ip, cp->vaddr.ip, + tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -159,9 +176,20 @@ tcp_snat_handler(struct sk_buff *skb, /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, - skb->len - tcphoff, - cp->protocol, skb->csum); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, tcph->check, (char*)&(tcph->check) - (char*)tcph); @@ -175,7 +203,14 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct tcphdr *tcph; - const unsigned int tcphoff = ip_hdrlen(skb); + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) @@ -183,7 +218,7 @@ tcp_dnat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* @@ -194,7 +229,7 @@ tcp_dnat_handler(struct sk_buff *skb, return 0; } - tcph = (void *)ip_hdr(skb) + tcphoff; + tcph = (void *)skb_network_header(skb) + tcphoff; tcph->dest = cp->dport; /* @@ -202,7 +237,7 @@ tcp_dnat_handler(struct sk_buff *skb, */ if (!cp->app) { /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->vaddr.ip, cp->daddr.ip, + tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -210,9 +245,19 @@ tcp_dnat_handler(struct sk_buff *skb, /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, - skb->len - tcphoff, - cp->protocol, skb->csum); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; @@ -487,7 +532,13 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, { struct tcphdr _tcph, *th; - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); +#ifdef CONFIG_IP_VS_IPV6 + int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + int ihl = ip_hdrlen(skb); +#endif + + th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); if (th == NULL) return 0; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index d3a1b1f2d10..6cca0ad8e32 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -120,13 +120,23 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, static inline void -udp_fast_csum_update(struct udphdr *uhdr, __be32 oldip, __be32 newip, +udp_fast_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, __be16 oldport, __be16 newport) { - uhdr->check = - csum_fold(ip_vs_check_diff4(oldip, newip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(uhdr->check)))); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); if (!uhdr->check) uhdr->check = CSUM_MANGLED_0; } @@ -136,7 +146,14 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct udphdr *udph; - const unsigned int udphoff = ip_hdrlen(skb); + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, udphoff+sizeof(*udph))) @@ -144,7 +161,7 @@ udp_snat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* @@ -154,7 +171,7 @@ udp_snat_handler(struct sk_buff *skb, return 0; } - udph = (void *)ip_hdr(skb) + udphoff; + udph = (void *)skb_network_header(skb) + udphoff; udph->source = cp->vport; /* @@ -162,7 +179,7 @@ udp_snat_handler(struct sk_buff *skb, */ if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->daddr.ip, cp->vaddr.ip, + udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -170,9 +187,19 @@ udp_snat_handler(struct sk_buff *skb, /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, - skb->len - udphoff, - cp->protocol, skb->csum); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", @@ -188,7 +215,14 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct udphdr *udph; - unsigned int udphoff = ip_hdrlen(skb); + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); /* csum_check requires unshared skb */ if (!skb_make_writable(skb, udphoff+sizeof(*udph))) @@ -196,7 +230,7 @@ udp_dnat_handler(struct sk_buff *skb, if (unlikely(cp->app != NULL)) { /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(AF_INET, skb, pp)) + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* @@ -207,7 +241,7 @@ udp_dnat_handler(struct sk_buff *skb, return 0; } - udph = (void *)ip_hdr(skb) + udphoff; + udph = (void *)skb_network_header(skb) + udphoff; udph->dest = cp->dport; /* @@ -215,7 +249,7 @@ udp_dnat_handler(struct sk_buff *skb, */ if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->vaddr.ip, cp->daddr.ip, + udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; @@ -223,9 +257,19 @@ udp_dnat_handler(struct sk_buff *skb, /* full checksum calculation */ udph->check = 0; skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, - skb->len - udphoff, - cp->protocol, skb->csum); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb->ip_summed = CHECKSUM_UNNECESSARY; -- cgit v1.2.3-70-g09d2 From 28364a59f3dfe7fed3560ec7aff9b7aeb02824fb Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:43 +0200 Subject: IPVS: Extend functions for getting/creating connections Extend functions for getting/creating connections and connection templates for IPv6 support and fix the callers. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 16 ++++-- net/ipv4/ipvs/ip_vs_conn.c | 100 ++++++++++++++++++++------------- net/ipv4/ipvs/ip_vs_core.c | 112 ++++++++++++++++++++----------------- net/ipv4/ipvs/ip_vs_ftp.c | 45 +++++++-------- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 24 ++++---- net/ipv4/ipvs/ip_vs_proto_tcp.c | 24 ++++---- net/ipv4/ipvs/ip_vs_proto_udp.c | 24 ++++---- net/ipv4/ipvs/ip_vs_sync.c | 27 +++++---- 8 files changed, 209 insertions(+), 163 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index c173f1a7de2..26893499eb6 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -642,11 +642,16 @@ enum { }; extern struct ip_vs_conn *ip_vs_conn_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port); +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port); + extern struct ip_vs_conn *ip_vs_ct_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port); +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port); + extern struct ip_vs_conn *ip_vs_conn_out_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port); +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port); /* put back the conn without restarting its timer */ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) @@ -657,8 +662,9 @@ extern void ip_vs_conn_put(struct ip_vs_conn *cp); extern void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); extern struct ip_vs_conn * -ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport, - __be32 daddr, __be16 dport, unsigned flags, +ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + const union nf_inet_addr *daddr, __be16 dport, unsigned flags, struct ip_vs_dest *dest); extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp); diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 639d4bc7fc1..15eec282b17 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -114,9 +114,18 @@ static inline void ct_write_unlock_bh(unsigned key) /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port) +static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, + const union nf_inet_addr *addr, + __be16 port) { - return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd) +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), + (__force u32)port, proto, ip_vs_conn_rnd) + & IP_VS_CONN_TAB_MASK; +#endif + return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, + ip_vs_conn_rnd) & IP_VS_CONN_TAB_MASK; } @@ -131,7 +140,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) int ret; /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport); + hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); ct_write_lock(hash); @@ -162,7 +171,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) int ret; /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr.ip, cp->cport); + hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); ct_write_lock(hash); @@ -187,18 +196,21 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) * d_addr, d_port: pkt dest address (load balancer) */ static inline struct ip_vs_conn *__ip_vs_conn_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) { unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); + hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr == cp->caddr.ip && s_port == cp->cport && - d_port == cp->vport && d_addr == cp->vaddr.ip && + if (cp->af == af && + ip_vs_addr_equal(af, s_addr, &cp->caddr) && + ip_vs_addr_equal(af, d_addr, &cp->vaddr) && + s_port == cp->cport && d_port == cp->vport && ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && protocol == cp->protocol) { /* HIT */ @@ -214,37 +226,42 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get } struct ip_vs_conn *ip_vs_conn_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) { struct ip_vs_conn *cp; - cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); + cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) - cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); + cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, + d_port); - IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - cp?"hit":"not hit"); + IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + cp ? "hit" : "not hit"); return cp; } /* Get reference to connection template */ struct ip_vs_conn *ip_vs_ct_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) { unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); + hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr == cp->caddr.ip && s_port == cp->cport && - d_port == cp->vport && d_addr == cp->vaddr.ip && + if (cp->af == af && + ip_vs_addr_equal(af, s_addr, &cp->caddr) && + ip_vs_addr_equal(af, d_addr, &cp->vaddr) && + s_port == cp->cport && d_port == cp->vport && cp->flags & IP_VS_CONN_F_TEMPLATE && protocol == cp->protocol) { /* HIT */ @@ -257,11 +274,11 @@ struct ip_vs_conn *ip_vs_ct_in_get out: ct_read_unlock(hash); - IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - cp?"hit":"not hit"); + IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + cp ? "hit" : "not hit"); return cp; } @@ -273,7 +290,8 @@ struct ip_vs_conn *ip_vs_ct_in_get * d_addr, d_port: pkt dest address (foreign host) */ struct ip_vs_conn *ip_vs_conn_out_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) { unsigned hash; struct ip_vs_conn *cp, *ret=NULL; @@ -281,13 +299,15 @@ struct ip_vs_conn *ip_vs_conn_out_get /* * Check for "full" addressed entries */ - hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); + hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (d_addr == cp->caddr.ip && d_port == cp->cport && - s_port == cp->dport && s_addr == cp->daddr.ip && + if (cp->af == af && + ip_vs_addr_equal(af, d_addr, &cp->caddr) && + ip_vs_addr_equal(af, s_addr, &cp->daddr) && + d_port == cp->cport && s_port == cp->dport && protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); @@ -298,11 +318,11 @@ struct ip_vs_conn *ip_vs_conn_out_get ct_read_unlock(hash); - IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - ret?"hit":"not hit"); + IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ret ? "hit" : "not hit"); return ret; } @@ -625,8 +645,9 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) * Create a new connection entry and hash it into the ip_vs_conn_tab */ struct ip_vs_conn * -ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport, - __be32 daddr, __be16 dport, unsigned flags, +ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + const union nf_inet_addr *daddr, __be16 dport, unsigned flags, struct ip_vs_dest *dest) { struct ip_vs_conn *cp; @@ -640,12 +661,13 @@ ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport INIT_LIST_HEAD(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + cp->af = af; cp->protocol = proto; - cp->caddr.ip = caddr; + ip_vs_addr_copy(af, &cp->caddr, caddr); cp->cport = cport; - cp->vaddr.ip = vaddr; + ip_vs_addr_copy(af, &cp->vaddr, vaddr); cp->vport = vport; - cp->daddr.ip = daddr; + ip_vs_addr_copy(af, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; spin_lock_init(&cp->lock); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 34aaa1480d9..2d5a4331709 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -173,19 +173,21 @@ ip_vs_sched_persist(struct ip_vs_service *svc, __be16 ports[2]) { struct ip_vs_conn *cp = NULL; - struct iphdr *iph = ip_hdr(skb); + struct ip_vs_iphdr iph; struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport; /* destination port to forward */ - __be32 snet; /* source network of the client, after masking */ + union nf_inet_addr snet; /* source network of the client, + after masking */ + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); /* Mask saddr with the netmask to adjust template granularity */ - snet = iph->saddr & svc->netmask; + snet.ip = iph.saddr.ip & svc->netmask; IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " "mnet %u.%u.%u.%u\n", - NIPQUAD(iph->saddr), ntohs(ports[0]), - NIPQUAD(iph->daddr), ntohs(ports[1]), + NIPQUAD(iph.saddr.ip), ntohs(ports[0]), + NIPQUAD(iph.daddr.ip), ntohs(ports[1]), NIPQUAD(snet)); /* @@ -204,11 +206,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc, if (ports[1] == svc->port) { /* Check if a template already exists */ if (svc->port != FTPPORT) - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, ports[1]); + ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + &iph.daddr, ports[1]); else - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, 0); + ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + &iph.daddr, 0); if (!ct || !ip_vs_check_template(ct)) { /* @@ -228,18 +230,18 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * for ftp service. */ if (svc->port != FTPPORT) - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, + ct = ip_vs_conn_new(AF_INET, iph.protocol, + &snet, 0, + &iph.daddr, ports[1], - dest->addr.ip, dest->port, + &dest->addr, dest->port, IP_VS_CONN_F_TEMPLATE, dest); else - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, 0, - dest->addr.ip, 0, + ct = ip_vs_conn_new(AF_INET, iph.protocol, + &snet, 0, + &iph.daddr, 0, + &dest->addr, 0, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) @@ -258,12 +260,16 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * fwmark template: * port zero template: */ - if (svc->fwmark) - ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0, - htonl(svc->fwmark), 0); - else - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, 0); + if (svc->fwmark) { + union nf_inet_addr fwmark = { + .all = { 0, 0, 0, htonl(svc->fwmark) } + }; + + ct = ip_vs_ct_in_get(AF_INET, IPPROTO_IP, &snet, 0, + &fwmark, 0); + } else + ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + &iph.daddr, 0); if (!ct || !ip_vs_check_template(ct)) { /* @@ -282,18 +288,22 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a template according to the service */ - if (svc->fwmark) - ct = ip_vs_conn_new(IPPROTO_IP, - snet, 0, - htonl(svc->fwmark), 0, - dest->addr.ip, 0, + if (svc->fwmark) { + union nf_inet_addr fwmark = { + .all = { 0, 0, 0, htonl(svc->fwmark) } + }; + + ct = ip_vs_conn_new(AF_INET, IPPROTO_IP, + &snet, 0, + &fwmark, 0, + &dest->addr, 0, IP_VS_CONN_F_TEMPLATE, dest); - else - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, 0, - dest->addr.ip, 0, + } else + ct = ip_vs_conn_new(AF_INET, iph.protocol, + &snet, 0, + &iph.daddr, 0, + &dest->addr, 0, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) @@ -310,10 +320,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, ports[0], - iph->daddr, ports[1], - dest->addr.ip, dport, + cp = ip_vs_conn_new(AF_INET, iph.protocol, + &iph.saddr, ports[0], + &iph.daddr, ports[1], + &dest->addr, dport, 0, dest); if (cp == NULL) { @@ -342,12 +352,12 @@ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) { struct ip_vs_conn *cp = NULL; - struct iphdr *iph = ip_hdr(skb); + struct ip_vs_iphdr iph; struct ip_vs_dest *dest; __be16 _ports[2], *pptr; - pptr = skb_header_pointer(skb, iph->ihl*4, - sizeof(_ports), _ports); + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) return NULL; @@ -377,10 +387,10 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* * Create a connection entry. */ - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1], - dest->addr.ip, dest->port ? dest->port : pptr[1], + cp = ip_vs_conn_new(AF_INET, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], + &dest->addr, dest->port ? dest->port : pptr[1], 0, dest); if (cp == NULL) @@ -408,10 +418,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_protocol *pp) { __be16 _ports[2], *pptr; - struct iphdr *iph = ip_hdr(skb); + struct ip_vs_iphdr iph; + ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); - pptr = skb_header_pointer(skb, iph->ihl*4, - sizeof(_ports), _ports); + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) { ip_vs_service_put(svc); return NF_DROP; @@ -421,7 +431,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, and the destination is RTN_UNICAST (and not local), then create a cache_bypass connection entry */ if (sysctl_ip_vs_cache_bypass && svc->fwmark - && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) { + && (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST)) { int ret, cs; struct ip_vs_conn *cp; @@ -429,9 +439,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, /* create a new connection entry */ IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1], + cp = ip_vs_conn_new(AF_INET, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], 0, 0, IP_VS_CONN_F_BYPASS, NULL); diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c index bfe5d7050a5..0c3fbe0de5f 100644 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -140,7 +140,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, struct tcphdr *th; char *data, *data_limit; char *start, *end; - __be32 from; + union nf_inet_addr from; __be16 port; struct ip_vs_conn *n_cp; char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ @@ -166,24 +166,25 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, if (ip_vs_ftp_get_addrport(data, data_limit, SERVER_STRING, sizeof(SERVER_STRING)-1, ')', - &from, &port, + &from.ip, &port, &start, &end) != 1) return 1; IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " "%u.%u.%u.%u:%d detected\n", - NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr.ip), 0); + NIPQUAD(from.ip), ntohs(port), + NIPQUAD(cp->caddr.ip), 0); /* * Now update or create an connection entry for it */ - n_cp = ip_vs_conn_out_get(iph->protocol, from, port, - cp->caddr.ip, 0); + n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, + &cp->caddr, 0); if (!n_cp) { - n_cp = ip_vs_conn_new(IPPROTO_TCP, - cp->caddr.ip, 0, - cp->vaddr.ip, port, - from, port, + n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, + &cp->caddr, 0, + &cp->vaddr, port, + &from, port, IP_VS_CONN_F_NO_CPORT, cp->dest); if (!n_cp) @@ -196,9 +197,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Replace the old passive address with the new one */ - from = n_cp->vaddr.ip; + from.ip = n_cp->vaddr.ip; port = n_cp->vport; - sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), + sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip), (ntohs(port)>>8)&255, ntohs(port)&255); buf_len = strlen(buf); @@ -243,7 +244,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, struct tcphdr *th; char *data, *data_start, *data_limit; char *start, *end; - __be32 to; + union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; @@ -291,12 +292,12 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, */ if (ip_vs_ftp_get_addrport(data_start, data_limit, CLIENT_STRING, sizeof(CLIENT_STRING)-1, - '\r', &to, &port, + '\r', &to.ip, &port, &start, &end) != 1) return 1; IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", - NIPQUAD(to), ntohs(port)); + NIPQUAD(to.ip), ntohs(port)); /* Passive mode off */ cp->app_data = NULL; @@ -306,16 +307,16 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, */ IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", ip_vs_proto_name(iph->protocol), - NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); + NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); - n_cp = ip_vs_conn_in_get(iph->protocol, - to, port, - cp->vaddr.ip, htons(ntohs(cp->vport)-1)); + n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, + &to, port, + &cp->vaddr, htons(ntohs(cp->vport)-1)); if (!n_cp) { - n_cp = ip_vs_conn_new(IPPROTO_TCP, - to, port, - cp->vaddr.ip, htons(ntohs(cp->vport)-1), - cp->daddr.ip, htons(ntohs(cp->dport)-1), + n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, + &to, port, + &cp->vaddr, htons(ntohs(cp->vport)-1), + &cp->daddr, htons(ntohs(cp->dport)-1), 0, cp->dest); if (!n_cp) diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c index 4b0b8f268d1..2b18a78d039 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c @@ -46,16 +46,16 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp; if (likely(!inverse)) { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr.ip, + cp = ip_vs_conn_in_get(af, IPPROTO_UDP, + &iph->saddr, htons(PORT_ISAKMP), - iph->daddr.ip, + &iph->daddr, htons(PORT_ISAKMP)); } else { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr.ip, + cp = ip_vs_conn_in_get(af, IPPROTO_UDP, + &iph->daddr, htons(PORT_ISAKMP), - iph->saddr.ip, + &iph->saddr, htons(PORT_ISAKMP)); } @@ -86,16 +86,16 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_conn *cp; if (likely(!inverse)) { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr.ip, + cp = ip_vs_conn_out_get(af, IPPROTO_UDP, + &iph->saddr, htons(PORT_ISAKMP), - iph->daddr.ip, + &iph->daddr, htons(PORT_ISAKMP)); } else { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr.ip, + cp = ip_vs_conn_out_get(af, IPPROTO_UDP, + &iph->daddr, htons(PORT_ISAKMP), - iph->saddr.ip, + &iph->saddr, htons(PORT_ISAKMP)); } diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 3daae43ae44..3da2bb05ee7 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -36,13 +36,13 @@ tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, return NULL; if (likely(!inverse)) { - return ip_vs_conn_in_get(iph->protocol, - iph->saddr.ip, pptr[0], - iph->daddr.ip, pptr[1]); + return ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); } else { - return ip_vs_conn_in_get(iph->protocol, - iph->daddr.ip, pptr[1], - iph->saddr.ip, pptr[0]); + return ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); } } @@ -58,13 +58,13 @@ tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, return NULL; if (likely(!inverse)) { - return ip_vs_conn_out_get(iph->protocol, - iph->saddr.ip, pptr[0], - iph->daddr.ip, pptr[1]); + return ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); } else { - return ip_vs_conn_out_get(iph->protocol, - iph->daddr.ip, pptr[1], - iph->saddr.ip, pptr[0]); + return ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); } } diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 6cca0ad8e32..fd8bd934cc0 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -36,13 +36,13 @@ udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, return NULL; if (likely(!inverse)) { - cp = ip_vs_conn_in_get(iph->protocol, - iph->saddr.ip, pptr[0], - iph->daddr.ip, pptr[1]); + cp = ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); } else { - cp = ip_vs_conn_in_get(iph->protocol, - iph->daddr.ip, pptr[1], - iph->saddr.ip, pptr[0]); + cp = ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); } return cp; @@ -62,13 +62,13 @@ udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, return NULL; if (likely(!inverse)) { - cp = ip_vs_conn_out_get(iph->protocol, - iph->saddr.ip, pptr[0], - iph->daddr.ip, pptr[1]); + cp = ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); } else { - cp = ip_vs_conn_out_get(iph->protocol, - iph->daddr.ip, pptr[1], - iph->saddr.ip, pptr[0]); + cp = ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); } return cp; diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 2cf47b2e166..3ce1093e067 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -366,13 +366,17 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) } if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport); + cp = ip_vs_conn_in_get(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport); else - cp = ip_vs_ct_in_get(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport); + cp = ip_vs_ct_in_get(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport); if (!cp) { /* * Find the appropriate destination for the connection. @@ -389,10 +393,13 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) else flags &= ~IP_VS_CONN_F_INACTIVE; } - cp = ip_vs_conn_new(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport, - s->daddr, s->dport, + cp = ip_vs_conn_new(AF_INET, s->protocol, + (union nf_inet_addr *)s->caddr, + s->cport, + (union nf_inet_addr *)s->vaddr, + s->vport, + (union nf_inet_addr *)s->daddr, + s->dport, flags, dest); if (dest) atomic_dec(&dest->refcnt); -- cgit v1.2.3-70-g09d2 From 38cdcc9a039b92a9972dca3c954fb3d8b3ef13bf Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:44 +0200 Subject: IPVS: Add IPv6 support to xmit() support functions Add IPv6 support to IP_VS_XMIT() and to the xmit routing cache, introducing a new function __ip_vs_get_out_rt_v6(). Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_xmit.c | 82 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index 88199c9f2d3..fd8342ec7e1 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -20,6 +20,9 @@ #include #include /* for icmp_send */ #include /* for ip_route_output */ +#include +#include +#include #include #include @@ -47,7 +50,8 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) if (!dst) return NULL; - if ((dst->obsolete || rtos != dest->dst_rtos) && + if ((dst->obsolete + || (dest->af == AF_INET && rtos != dest->dst_rtos)) && dst->ops->check(dst, cookie) == NULL) { dest->dst_cache = NULL; dst_release(dst); @@ -109,6 +113,70 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) return rt; } +#ifdef CONFIG_IP_VS_IPV6 +static struct rt6_info * +__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); + if (!rt) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = dest->addr.in6, + .saddr = { + .s6_addr32 = + { 0, 0, 0, 0 }, + }, + }, + }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, + NULL, &fl); + if (!rt) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip6_route_output error, " + "dest: " NIP6_FMT "\n", + NIP6(dest->addr.in6)); + return NULL; + } + __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n", + NIP6(dest->addr.in6), + atomic_read(&rt->u.dst.__refcnt)); + } + spin_unlock(&dest->dst_lock); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = cp->daddr.in6, + .saddr = { + .s6_addr32 = { 0, 0, 0, 0 }, + }, + }, + }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (!rt) { + IP_VS_DBG_RL("ip6_route_output error, dest: " + NIP6_FMT "\n", NIP6(cp->daddr.in6)); + return NULL; + } + } + + return rt; +} +#endif + /* * Release dest->dst_cache before a dest is removed @@ -123,11 +191,11 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) dst_release(old_dst); } -#define IP_VS_XMIT(skb, rt) \ +#define IP_VS_XMIT(pf, skb, rt) \ do { \ (skb)->ipvs_property = 1; \ skb_forward_csum(skb); \ - NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL, \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ (rt)->u.dst.dev, dst_output); \ } while (0) @@ -200,7 +268,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(skb, rt); + IP_VS_XMIT(PF_INET, skb, rt); LeaveFunction(10); return NF_STOLEN; @@ -276,7 +344,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(skb, rt); + IP_VS_XMIT(PF_INET, skb, rt); LeaveFunction(10); return NF_STOLEN; @@ -467,7 +535,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(skb, rt); + IP_VS_XMIT(PF_INET, skb, rt); LeaveFunction(10); return NF_STOLEN; @@ -540,7 +608,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(skb, rt); + IP_VS_XMIT(PF_INET, skb, rt); rc = NF_STOLEN; goto out; -- cgit v1.2.3-70-g09d2 From b3cdd2a73867d309dca288b8e820c09e3b7f1da1 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:45 +0200 Subject: IPVS: Add and bind IPv6 xmit functions Add xmit functions for IPv6. Also add the already needed __ip_vs_get_out_rt_v6() to ip_vs_core.c. Bind the new xmit functions to v6 connections. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 20 ++- net/ipv4/ipvs/ip_vs_conn.c | 34 +++- net/ipv4/ipvs/ip_vs_core.c | 43 ++++++ net/ipv4/ipvs/ip_vs_xmit.c | 377 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 472 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 26893499eb6..ac709fa5a79 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -855,6 +855,19 @@ extern int ip_vs_icmp_xmit (struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset); extern void ip_vs_dst_reset(struct ip_vs_dest *dest); +#ifdef CONFIG_IP_VS_IPV6 +extern int ip_vs_bypass_xmit_v6 +(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); +extern int ip_vs_nat_xmit_v6 +(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); +extern int ip_vs_tunnel_xmit_v6 +(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); +extern int ip_vs_dr_xmit_v6 +(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); +extern int ip_vs_icmp_xmit_v6 +(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, + int offset); +#endif /* * This is a simple mechanism to ignore packets when @@ -899,7 +912,12 @@ static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp) } extern void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int dir); + struct ip_vs_conn *cp, int dir); + +#ifdef CONFIG_IP_VS_IPV6 +extern void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int dir); +#endif extern __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset); diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 15eec282b17..f5dddad6d5e 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -389,6 +389,33 @@ static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) } } +#ifdef CONFIG_IP_VS_IPV6 +static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit_v6; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit_v6; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit_v6; + break; + } +} +#endif + static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) { @@ -694,7 +721,12 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, cp->timeout = 3*HZ; /* Bind its packet transmitter */ - ip_vs_bind_xmit(cp); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); if (unlikely(pp && atomic_read(&pp->appcnt))) ip_vs_bind_app(cp, pp); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 2d5a4331709..d6f5bf9049a 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -570,6 +570,49 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, "Forwarding altered incoming ICMP"); } +#ifdef CONFIG_IP_VS_IPV6 +void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int icmp_offset = sizeof(struct ipv6hdr); + struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + + icmp_offset); + struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.in6; + ciph->daddr = cp->vaddr.in6; + } else { + iph->daddr = cp->daddr.in6; + ciph->saddr = cp->daddr.in6; + } + + /* the TCP/UDP port */ + if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) { + __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->icmp6_cksum = 0; + /* TODO IPv6: is this correct for ICMPv6? */ + ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); + else + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); +} +#endif + /* * Handle ICMP messages in the inside-to-outside direction (outgoing). * Find any that might be relevant, check against existing connections, diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index fd8342ec7e1..02ddc2b3ce2 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -281,6 +281,70 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, return NF_STOLEN; } +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ipv6hdr *iph = ipv6_hdr(skb); + int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = iph->daddr, + .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, + }; + + EnterFunction(10); + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (!rt) { + IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, " + "dest: " NIP6_FMT "\n", NIP6(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->u.dst); + return NF_STOLEN; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif /* * NAT transmitter (only for outside-to-inside nat forwarding) @@ -360,6 +424,83 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, sizeof(struct ipv6hdr), + sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "ip_vs_nat_xmit_v6(): frag needed for"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto tx_error; + ipv6_hdr(skb)->daddr = cp->daddr.in6; + + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; +tx_error_put: + dst_release(&rt->u.dst); + goto tx_error; +} +#endif + /* * IP Tunneling transmitter @@ -491,6 +632,112 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, return NF_STOLEN; } +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct ipv6hdr *old_iph = ipv6_hdr(skb); + sk_buff_data_t old_transport_header = skb->transport_header; + struct ipv6hdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != htons(ETH_P_IPV6)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, " + "ETH_P_IPV6: %d, skb protocol: %d\n", + htons(ETH_P_IPV6), skb->protocol); + goto tx_error; + } + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr); + /* TODO IPv6: do we need this check in IPv6? */ + if (mtu < 1280) { + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n"); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + dst_release(&rt->u.dst); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n"); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = ipv6_hdr(skb); + } + + skb->transport_header = old_transport_header; + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + iph = ipv6_hdr(skb); + iph->version = 6; + iph->nexthdr = IPPROTO_IPV6; + iph->payload_len = old_iph->payload_len + sizeof(old_iph); + iph->priority = old_iph->priority; + memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); + iph->daddr = rt->rt6i_dst.addr; + iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ + iph->hop_limit = old_iph->hop_limit; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ip6_local_out(skb); + + LeaveFunction(10); + + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + /* * Direct Routing transmitter @@ -548,6 +795,60 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, return NF_STOLEN; } +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->u.dst); + return NF_STOLEN; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + /* * ICMP packet transmitter @@ -625,3 +926,79 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_rt_put(rt); goto tx_error; } + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + int rc; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop the old route when skb is not shared */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + ip_vs_nat_icmp_v6(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + rc = NF_STOLEN; + goto out; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; +out: + LeaveFunction(10); + return rc; +tx_error_put: + dst_release(&rt->u.dst); + goto tx_error; +} +#endif -- cgit v1.2.3-70-g09d2 From cd17f9ed099ed27e9b0d298253e5c05e335ac656 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:46 +0200 Subject: IPVS: Extend scheduling functions for IPv6 support Convert ip_vs_schedule() and ip_vs_sched_persist() to support scheduling of IPv6 connections. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 56 +++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 25 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index d6f5bf9049a..8bfd7c2f0eb 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -176,19 +176,25 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct ip_vs_iphdr iph; struct ip_vs_dest *dest; struct ip_vs_conn *ct; - __be16 dport; /* destination port to forward */ + __be16 dport; /* destination port to forward */ union nf_inet_addr snet; /* source network of the client, after masking */ - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); /* Mask saddr with the netmask to adjust template granularity */ - snet.ip = iph.saddr.ip & svc->netmask; +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); + else +#endif + snet.ip = iph.saddr.ip & svc->netmask; - IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " - "mnet %u.%u.%u.%u\n", - NIPQUAD(iph.saddr.ip), ntohs(ports[0]), - NIPQUAD(iph.daddr.ip), ntohs(ports[1]), - NIPQUAD(snet)); + IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " + "mnet %s\n", + IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), + IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), + IP_VS_DBG_ADDR(svc->af, &snet)); /* * As far as we know, FTP is a very complicated network protocol, and @@ -206,10 +212,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc, if (ports[1] == svc->port) { /* Check if a template already exists */ if (svc->port != FTPPORT) - ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, &iph.daddr, ports[1]); else - ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, &iph.daddr, 0); if (!ct || !ip_vs_check_template(ct)) { @@ -230,7 +236,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * for ftp service. */ if (svc->port != FTPPORT) - ct = ip_vs_conn_new(AF_INET, iph.protocol, + ct = ip_vs_conn_new(svc->af, iph.protocol, &snet, 0, &iph.daddr, ports[1], @@ -238,7 +244,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, IP_VS_CONN_F_TEMPLATE, dest); else - ct = ip_vs_conn_new(AF_INET, iph.protocol, + ct = ip_vs_conn_new(svc->af, iph.protocol, &snet, 0, &iph.daddr, 0, &dest->addr, 0, @@ -265,10 +271,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc, .all = { 0, 0, 0, htonl(svc->fwmark) } }; - ct = ip_vs_ct_in_get(AF_INET, IPPROTO_IP, &snet, 0, + ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0, &fwmark, 0); } else - ct = ip_vs_ct_in_get(AF_INET, iph.protocol, &snet, 0, + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, &iph.daddr, 0); if (!ct || !ip_vs_check_template(ct)) { @@ -293,14 +299,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, .all = { 0, 0, 0, htonl(svc->fwmark) } }; - ct = ip_vs_conn_new(AF_INET, IPPROTO_IP, + ct = ip_vs_conn_new(svc->af, IPPROTO_IP, &snet, 0, &fwmark, 0, &dest->addr, 0, IP_VS_CONN_F_TEMPLATE, dest); } else - ct = ip_vs_conn_new(AF_INET, iph.protocol, + ct = ip_vs_conn_new(svc->af, iph.protocol, &snet, 0, &iph.daddr, 0, &dest->addr, 0, @@ -320,7 +326,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - cp = ip_vs_conn_new(AF_INET, iph.protocol, + cp = ip_vs_conn_new(svc->af, iph.protocol, &iph.saddr, ports[0], &iph.daddr, ports[1], &dest->addr, dport, @@ -387,7 +393,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* * Create a connection entry. */ - cp = ip_vs_conn_new(AF_INET, iph.protocol, + cp = ip_vs_conn_new(svc->af, iph.protocol, &iph.saddr, pptr[0], &iph.daddr, pptr[1], &dest->addr, dest->port ? dest->port : pptr[1], @@ -396,13 +402,13 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) if (cp == NULL) return NULL; - IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " - "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", - ip_vs_fwd_tag(cp), - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), - NIPQUAD(cp->daddr.ip), ntohs(cp->dport), - cp->flags, atomic_read(&cp->refcnt)); + IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); return cp; -- cgit v1.2.3-70-g09d2 From 2a3b791e6e1169f374224d164738e9f7be703d77 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:47 +0200 Subject: IPVS: Add/adjust Netfilter hook functions and helpers for v6 Add Netfilter hook functions or modify existing ones, if possible, to process IPv6 packets. Some support functions are also added/modified for this. ip_vs_nat_icmp_v6() was already added in the patch that added the v6 xmit functions, as it is called from one of them. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 365 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 329 insertions(+), 36 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 8bfd7c2f0eb..035a511e12f 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -39,6 +39,11 @@ #include #include +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#endif + #include @@ -60,6 +65,7 @@ EXPORT_SYMBOL(ip_vs_get_debug_level); /* ID used in ICMP lookups */ #define icmp_id(icmph) (((icmph)->un).echo.id) +#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) const char *ip_vs_proto_name(unsigned proto) { @@ -74,6 +80,10 @@ const char *ip_vs_proto_name(unsigned proto) return "TCP"; case IPPROTO_ICMP: return "ICMP"; +#ifdef CONFIG_IP_VS_IPV6 + case IPPROTO_ICMPV6: + return "ICMPv6"; +#endif default: sprintf(buf, "IP_%d", proto); return buf; @@ -425,7 +435,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, { __be16 _ports[2], *pptr; struct ip_vs_iphdr iph; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + int unicast; + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) { @@ -433,11 +444,17 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, return NF_DROP; } +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; + else +#endif + unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); + /* if it is fwmark-based service, the cache_bypass sysctl is up - and the destination is RTN_UNICAST (and not local), then create + and the destination is a non-local unicast, then create a cache_bypass connection entry */ - if (sysctl_ip_vs_cache_bypass && svc->fwmark - && (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST)) { + if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { int ret, cs; struct ip_vs_conn *cp; @@ -445,7 +462,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, /* create a new connection entry */ IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); - cp = ip_vs_conn_new(AF_INET, iph.protocol, + cp = ip_vs_conn_new(svc->af, iph.protocol, &iph.saddr, pptr[0], &iph.daddr, pptr[1], 0, 0, @@ -489,7 +506,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * created, the TCP RST packet cannot be sent, instead that * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ */ - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, + skb->dev); + else +#endif + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + return NF_DROP; } @@ -528,6 +552,14 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) return err; } +#ifdef CONFIG_IP_VS_IPV6 +static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) +{ + /* TODO IPv6: Find out what to do here for IPv6 */ + return 0; +} +#endif + /* * Packet has been made sufficiently writable in caller * - inout: 1=in->out, 0=out->in @@ -727,11 +759,117 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) return verdict; } -static inline int is_tcp_reset(const struct sk_buff *skb) +#ifdef CONFIG_IP_VS_IPV6 +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + NIP6(iph->saddr), NIP6(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->nexthdr); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + IP_VS_ERR("shouldn't reach here, because the box is on the " + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) + && ip_vs_checksum_complete(skb, sizeof(struct ipv6hdr))) { + /* Failed checksum! */ + IP_VS_DBG(1, "Forward ICMPv6: failed checksum from " + NIP6_FMT "!\n", + NIP6(iph->saddr)); + goto out; + } + + if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) + offset += 2 * sizeof(__u16); + if (!skb_make_writable(skb, offset)) + goto out; + + ip_vs_nat_icmp_v6(skb, pp, cp, 1); + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->ipvs_property = 1; + verdict = NF_ACCEPT; + +out: + __ip_vs_conn_put(cp); + + return verdict; +} +#endif + +static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) { struct tcphdr _tcph, *th; - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); if (th == NULL) return 0; return th->rst; @@ -750,38 +888,64 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; + int af; EnterFunction(11); + af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6; + if (skb->ipvs_property) return NF_ACCEPT; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); - if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_out_icmp(skb, &related); + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related, verdict = ip_vs_out_icmp_v6(skb, &related); - if (related) - return verdict; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); - } + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_out_icmp(skb, &related); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } pp = ip_vs_proto_get(iph.protocol); if (unlikely(!pp)) return NF_ACCEPT; /* reassemble IP fragments */ - if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && - !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related, verdict = ip_vs_out_icmp_v6(skb, &related); - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); - } + if (related) + return verdict; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && + !pp->dont_defrag)) { + if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(AF_INET, skb, pp, &iph, iph.len, 0); + cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); if (unlikely(!cp)) { if (sysctl_ip_vs_nat_icmp_send && @@ -794,16 +958,26 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, if (pptr == NULL) return NF_ACCEPT; /* Not for me */ if (ip_vs_lookup_real_service(iph.protocol, - iph.saddr.ip, pptr[0])) { + iph.saddr.ip, + pptr[0])) { /* * Notify the real server: there is no * existing entry if it is not RST * packet or not TCP packet. */ if (iph.protocol != IPPROTO_TCP - || !is_tcp_reset(skb)) { - icmp_send(skb,ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); + || !is_tcp_reset(skb, iph.len)) { +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0, skb->dev); + else +#endif + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); return NF_DROP; } } @@ -821,8 +995,16 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, /* mangle the packet */ if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) goto drop; - ip_hdr(skb)->saddr = cp->vaddr.ip; - ip_send_check(ip_hdr(skb)); + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ipv6_hdr(skb)->saddr = cp->vaddr.in6; + else +#endif + { + ip_hdr(skb)->saddr = cp->vaddr.ip; + ip_send_check(ip_hdr(skb)); + } /* For policy routing, packets originating from this * machine itself may be routed differently to packets @@ -830,8 +1012,14 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, * if it came from this machine itself. So re-compute * the routing information. */ - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (ip6_route_me_harder(skb) != 0) + goto drop; + } else +#endif + if (ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); @@ -949,6 +1137,94 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) return verdict; } +#ifdef CONFIG_IP_VS_IPV6 +static int +ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? + IP_DEFRAG_VS_IN : + IP_DEFRAG_VS_FWD)) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + NIP6(iph->saddr), NIP6(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->nexthdr); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + verdict = NF_DROP; + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); + /* do not touch skb anymore */ + + __ip_vs_conn_put(cp); + + return verdict; +} +#endif + + /* * Check if it's for virtual services, look it up, * and send it on its way... @@ -961,9 +1237,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; - int ret, restart; + int ret, restart, af; + + af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); /* * Big tappo: only PACKET_HOST (neither loopback nor mcasts) @@ -974,7 +1252,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", skb->pkt_type, iph.protocol, - IP_VS_DBG_ADDR(AF_INET, &iph.daddr)); + IP_VS_DBG_ADDR(af, &iph.daddr)); return NF_ACCEPT; } @@ -983,7 +1261,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, if (related) return verdict; - ip_vs_fill_iphdr(AF_INET, skb_network_header(skb), &iph); + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } /* Protocol supported? */ @@ -994,12 +1272,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(AF_INET, skb, pp, &iph, iph.len, 0); + cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); if (unlikely(!cp)) { int v; - if (!pp->conn_schedule(AF_INET, skb, pp, &v, &cp)) + if (!pp->conn_schedule(af, skb, pp, &v, &cp)) return v; } @@ -1082,6 +1360,21 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, return ip_vs_in_icmp(skb, &r, hooknum); } +#ifdef CONFIG_IP_VS_IPV6 +static unsigned int +ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + + if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + return NF_ACCEPT; + + return ip_vs_in_icmp_v6(skb, &r, hooknum); +} +#endif + static struct nf_hook_ops ip_vs_ops[] __read_mostly = { /* After packet filtering, forward packet through VS/DR, VS/TUN, -- cgit v1.2.3-70-g09d2 From 7937df1564783806c285d34a1c6fd63d8da29d7a Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:48 +0200 Subject: IPVS: Convert real server lookup functions Convert functions for looking up destinations (real servers) to support IPv6 services/dests. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 8 +++-- net/ipv4/ipvs/ip_vs_conn.c | 5 +-- net/ipv4/ipvs/ip_vs_core.c | 4 +-- net/ipv4/ipvs/ip_vs_ctl.c | 80 ++++++++++++++++++++++++++++++---------------- net/ipv4/ipvs/ip_vs_sync.c | 7 ++-- 5 files changed, 67 insertions(+), 37 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ac709fa5a79..a719c0ef99e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -804,14 +804,16 @@ static inline void ip_vs_service_put(struct ip_vs_service *svc) } extern struct ip_vs_dest * -ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport); +ip_vs_lookup_real_service(int af, __u16 protocol, + const union nf_inet_addr *daddr, __be16 dport); + extern int ip_vs_use_count_inc(void); extern void ip_vs_use_count_dec(void); extern int ip_vs_control_init(void); extern void ip_vs_control_cleanup(void); extern struct ip_vs_dest * -ip_vs_find_dest(__be32 daddr, __be16 dport, - __be32 vaddr, __be16 vport, __u16 protocol); +ip_vs_find_dest(int af, const union nf_inet_addr *daddr, __be16 dport, + const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol); extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp); diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index f5dddad6d5e..c2a42a62433 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -491,8 +491,9 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) struct ip_vs_dest *dest; if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->daddr.ip, cp->dport, - cp->vaddr.ip, cp->vport, cp->protocol); + dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, + &cp->vaddr, cp->vport, + cp->protocol); ip_vs_bind_dest(cp, dest); return dest; } else diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 035a511e12f..27bef1d67aa 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -957,8 +957,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, sizeof(_ports), _ports); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(iph.protocol, - iph.saddr.ip, + if (ip_vs_lookup_real_service(af, iph.protocol, + &iph.saddr, pptr[0])) { /* * Notify the real server: there is no diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 1f3fc66e694..bb0e1e3c885 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -492,11 +492,20 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest) /* * Returns hash value for real service */ -static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port) +static inline unsigned ip_vs_rs_hashkey(int af, + const union nf_inet_addr *addr, + __be16 port) { register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif - return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) + return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) & IP_VS_RTAB_MASK; } @@ -516,7 +525,8 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest) * Hash by proto,addr,port, * which are the parameters of the real service. */ - hash = ip_vs_rs_hashkey(dest->addr.ip, dest->port); + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + list_add(&dest->d_list, &ip_vs_rtable[hash]); return 1; @@ -543,7 +553,9 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest) * Lookup real service by in the real service table. */ struct ip_vs_dest * -ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) +ip_vs_lookup_real_service(int af, __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) { unsigned hash; struct ip_vs_dest *dest; @@ -552,11 +564,12 @@ ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) * Check for "full" addressed entries * Return the first found entry */ - hash = ip_vs_rs_hashkey(daddr, dport); + hash = ip_vs_rs_hashkey(af, daddr, dport); read_lock(&__ip_vs_rs_lock); list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { - if ((dest->addr.ip == daddr) + if ((dest->af == af) + && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->port == dport) && ((dest->protocol == protocol) || dest->vfwmark)) { @@ -574,7 +587,8 @@ ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) * Lookup destination by {addr,port} in the given service */ static struct ip_vs_dest * -ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) +ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) { struct ip_vs_dest *dest; @@ -582,7 +596,9 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) * Find the destination for the given service */ list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->addr.ip == daddr) && (dest->port == dport)) { + if ((dest->af == svc->af) + && ip_vs_addr_equal(svc->af, &dest->addr, daddr) + && (dest->port == dport)) { /* HIT */ return dest; } @@ -601,14 +617,15 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) * ip_vs_lookup_real_service() looked promissing, but * seems not working as expected. */ -struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, - __be32 vaddr, __be16 vport, __u16 protocol) +struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, + __be16 dport, + const union nf_inet_addr *vaddr, + __be16 vport, __u16 protocol) { struct ip_vs_dest *dest; struct ip_vs_service *svc; - union nf_inet_addr _vaddr = { .ip = vaddr }; - svc = ip_vs_service_get(AF_INET, 0, protocol, &_vaddr, vport); + svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); if (!svc) return NULL; dest = ip_vs_lookup_dest(svc, daddr, dport); @@ -629,7 +646,8 @@ struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, * scheduling. */ static struct ip_vs_dest * -ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) +ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) { struct ip_vs_dest *dest, *nxt; @@ -637,17 +655,19 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) * Find the destination in trash */ list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " - "dest->refcnt=%d\n", - dest->vfwmark, - NIPQUAD(dest->addr.ip), ntohs(dest->port), - atomic_read(&dest->refcnt)); - if (dest->addr.ip == daddr && + IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " + "dest->refcnt=%d\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->af == svc->af && + ip_vs_addr_equal(svc->af, &dest->addr, daddr) && dest->port == dport && dest->vfwmark == svc->fwmark && dest->protocol == svc->protocol && (svc->fwmark || - (dest->vaddr.ip == svc->addr.ip && + (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && dest->vport == svc->port))) { /* HIT */ return dest; @@ -657,10 +677,11 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) * Try to purge the destination from trash if not referenced */ if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " - "from trash\n", - dest->vfwmark, - NIPQUAD(dest->addr.ip), ntohs(dest->port)); + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " + "from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); list_del(&dest->n_list); ip_vs_dst_reset(dest); __ip_vs_unbind_svc(dest); @@ -847,7 +868,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Check if the dest already exists in the list */ - dest = ip_vs_lookup_dest(svc, daddr.ip, dport); + dest = ip_vs_lookup_dest(svc, &daddr, dport); + if (dest != NULL) { IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); return -EEXIST; @@ -857,7 +879,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) * Check if the dest already exists in the trash and * is from the same service */ - dest = ip_vs_trash_get_dest(svc, daddr.ip, dport); + dest = ip_vs_trash_get_dest(svc, &daddr, dport); + if (dest != NULL) { IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", @@ -956,7 +979,8 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Lookup the destination list */ - dest = ip_vs_lookup_dest(svc, daddr.ip, dport); + dest = ip_vs_lookup_dest(svc, &daddr, dport); + if (dest == NULL) { IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); return -ENOENT; @@ -1054,7 +1078,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) EnterFunction(2); - dest = ip_vs_lookup_dest(svc, udest->addr.ip, dport); + dest = ip_vs_lookup_dest(svc, &udest->addr, dport); if (dest == NULL) { IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 3ce1093e067..40647edf102 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -383,8 +383,11 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) * If it is not found the connection will remain unbound * but still handled. */ - dest = ip_vs_find_dest(s->daddr, s->dport, - s->vaddr, s->vport, + dest = ip_vs_find_dest(AF_INET, + (union nf_inet_addr *)&s->daddr, + s->dport, + (union nf_inet_addr *)&s->vaddr, + s->vport, s->protocol); /* Set the approprite ativity flag */ if (s->protocol == IPPROTO_TCP) { -- cgit v1.2.3-70-g09d2 From 667a5f18162e803e30722af46ade1737e3b93198 Mon Sep 17 00:00:00 2001 From: Vince Busam Date: Tue, 2 Sep 2008 15:55:49 +0200 Subject: IPVS: Convert procfs files for IPv6 entry output Correctly output IPv6 connection/service/dest entries in procfs files. Signed-off-by: Vince Busam Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_conn.c | 37 +++++++++++++++++++++++++++---- net/ipv4/ipvs/ip_vs_ctl.c | 54 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 18 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index c2a42a62433..e7603d749c0 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -815,8 +815,22 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) else { const struct ip_vs_conn *cp = v; - seq_printf(seq, - "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, + "%-3s " NIP6_FMT " %04X " NIP6_FMT + " %04X " NIP6_FMT " %04X %-11s %7lu\n", + ip_vs_proto_name(cp->protocol), + NIP6(cp->caddr.in6), ntohs(cp->cport), + NIP6(cp->vaddr.in6), ntohs(cp->vport), + NIP6(cp->daddr.in6), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X" + " %08X %04X %-11s %7lu\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), @@ -864,8 +878,23 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) else { const struct ip_vs_conn *cp = v; - seq_printf(seq, - "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n", +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, + "%-3s " NIP6_FMT " %04X " NIP6_FMT + " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + NIP6(cp->caddr.in6), ntohs(cp->cport), + NIP6(cp->vaddr.in6), ntohs(cp->vport), + NIP6(cp->daddr.in6), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X " + "%08X %04X %-11s %-6s %7lu\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index bb0e1e3c885..25d9e98e31f 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1793,15 +1793,25 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; - if (iter->table == ip_vs_svc_table) - seq_printf(seq, "%s %08X:%04X %s ", - ip_vs_proto_name(svc->protocol), - ntohl(svc->addr.ip), - ntohs(svc->port), - svc->scheduler->name); - else + if (iter->table == ip_vs_svc_table) { +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ", + ip_vs_proto_name(svc->protocol), + NIP6(svc->addr.in6), + ntohs(svc->port), + svc->scheduler->name); + else +#endif + seq_printf(seq, "%s %08X:%04X %s ", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr.ip), + ntohs(svc->port), + svc->scheduler->name); + } else { seq_printf(seq, "FWM %08X %s ", svc->fwmark, svc->scheduler->name); + } if (svc->flags & IP_VS_SVC_F_PERSISTENT) seq_printf(seq, "persistent %d %08X\n", @@ -1811,13 +1821,29 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) seq_putc(seq, '\n'); list_for_each_entry(dest, &svc->destinations, n_list) { - seq_printf(seq, - " -> %08X:%04X %-7s %-6d %-10d %-10d\n", - ntohl(dest->addr.ip), ntohs(dest->port), - ip_vs_fwd_name(atomic_read(&dest->conn_flags)), - atomic_read(&dest->weight), - atomic_read(&dest->activeconns), - atomic_read(&dest->inactconns)); +#ifdef CONFIG_IP_VS_IPV6 + if (dest->af == AF_INET6) + seq_printf(seq, + " -> [" NIP6_FMT "]:%04X" + " %-7s %-6d %-10d %-10d\n", + NIP6(dest->addr.in6), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + else +#endif + seq_printf(seq, + " -> %08X:%04X " + "%-7s %-6d %-10d %-10d\n", + ntohl(dest->addr.ip), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + } } return 0; -- cgit v1.2.3-70-g09d2 From c6883f587341a3ed113856de8769d0992b4bbd85 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:50 +0200 Subject: IVPS: Disable sync daemon for IPv6 connections Disable the sync daemon for IPv6 connections, works only with IPv4 for now. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 27bef1d67aa..5a7a81778b0 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1321,7 +1321,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, * encorage the standby servers to update the connections timeout */ atomic_inc(&cp->in_pkts); - if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && + if (af == AF_INET && + (ip_vs_sync_state & IP_VS_STATE_MASTER) && (((cp->protocol != IPPROTO_TCP || cp->state == IP_VS_TCP_S_ESTABLISHED) && (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] -- cgit v1.2.3-70-g09d2 From a0eb662f9ec8962928d937a185ad128db12c4637 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:51 +0200 Subject: IPVS: Turn off FTP application helper for IPv6 Immediately return from FTP application helper and do nothing when dealing with IPv6 packets. IPv6 is not supported by this helper yet. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ftp.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c index 0c3fbe0de5f..2e7dbd8b73a 100644 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -147,6 +147,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, unsigned buf_len; int ret; +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + *diff = 0; /* Only useful for established sessions */ @@ -248,6 +256,14 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, __be16 port; struct ip_vs_conn *n_cp; +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + /* no diff required for incoming packets */ *diff = 0; -- cgit v1.2.3-70-g09d2 From 09571c7ae30865adfa79dccd12a822a65d2c4b5a Mon Sep 17 00:00:00 2001 From: Vince Busam Date: Tue, 2 Sep 2008 15:55:52 +0200 Subject: IPVS: Add function to determine if IPv6 address is local Add __ip_vs_addr_is_local_v6() to find out if an IPv6 address belongs to a local interface. Use this function to decide whether to set the IP_VS_CONN_F_LOCALNODE flag for IPv6 destinations. Signed-off-by: Vince Busam Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 56 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 25d9e98e31f..640203a153c 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -35,6 +35,10 @@ #include #include +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#endif #include #include #include @@ -91,6 +95,26 @@ int ip_vs_get_debug_level(void) } #endif +#ifdef CONFIG_IP_VS_IPV6 +/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ +static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) +{ + struct rt6_info *rt; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = *addr, + .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) + return 1; + + return 0; +} +#endif /* * update_defense_level is called from keventd and from sysctl, * so it needs to protect itself from softirqs @@ -751,10 +775,18 @@ __ip_vs_update_dest(struct ip_vs_service *svc, conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; /* check if local node and update the flags */ - if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + } else +#endif + if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { @@ -803,9 +835,19 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, EnterFunction(2); - atype = inet_addr_type(&init_net, udest->addr.ip); - if (atype != RTN_LOCAL && atype != RTN_UNICAST) - return -EINVAL; +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + atype = ipv6_addr_type(&udest->addr.in6); + if (!(atype & IPV6_ADDR_UNICAST) && + !__ip_vs_addr_is_local_v6(&udest->addr.in6)) + return -EINVAL; + } else +#endif + { + atype = inet_addr_type(&init_net, udest->addr.ip); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + } dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); if (dest == NULL) { -- cgit v1.2.3-70-g09d2 From cfc78c5a09241a3a9561466834996a7fb90c4228 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:53 +0200 Subject: IPVS: Adjust various debug outputs to use new macros Adjust various debug outputs to use the new *_BUF macro variants for correct output of v4/v6 addresses. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 53 +++++++++++++++++++++++--------------- net/ipv4/ipvs/ip_vs_conn.c | 57 ++++++++++++++++++++++------------------- net/ipv4/ipvs/ip_vs_ctl.c | 24 +++++++++-------- net/ipv4/ipvs/ip_vs_proto_tcp.c | 45 ++++++++++++++++++-------------- net/ipv4/ipvs/ip_vs_proto_udp.c | 15 ++++++----- 5 files changed, 111 insertions(+), 83 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a719c0ef99e..1b13cef4b54 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -680,24 +680,32 @@ static inline void ip_vs_control_del(struct ip_vs_conn *cp) { struct ip_vs_conn *ctl_cp = cp->control; if (!ctl_cp) { - IP_VS_ERR("request control DEL for uncontrolled: " - "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", - NIPQUAD(cp->caddr),ntohs(cp->cport), - NIPQUAD(cp->vaddr),ntohs(cp->vport)); + IP_VS_ERR_BUF("request control DEL for uncontrolled: " + "%s:%d to %s:%d\n", + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport)); + return; } - IP_VS_DBG(7, "DELeting control for: " - "cp.dst=%d.%d.%d.%d:%d ctl_cp.dst=%d.%d.%d.%d:%d\n", - NIPQUAD(cp->caddr),ntohs(cp->cport), - NIPQUAD(ctl_cp->caddr),ntohs(ctl_cp->cport)); + IP_VS_DBG_BUF(7, "DELeting control for: " + "cp.dst=%s:%d ctl_cp.dst=%s:%d\n", + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &ctl_cp->caddr), + ntohs(ctl_cp->cport)); cp->control = NULL; if (atomic_read(&ctl_cp->n_control) == 0) { - IP_VS_ERR("BUG control DEL with n=0 : " - "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", - NIPQUAD(cp->caddr),ntohs(cp->cport), - NIPQUAD(cp->vaddr),ntohs(cp->vport)); + IP_VS_ERR_BUF("BUG control DEL with n=0 : " + "%s:%d to %s:%d\n", + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport)); + return; } atomic_dec(&ctl_cp->n_control); @@ -707,17 +715,22 @@ static inline void ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) { if (cp->control) { - IP_VS_ERR("request control ADD for already controlled: " - "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", - NIPQUAD(cp->caddr),ntohs(cp->cport), - NIPQUAD(cp->vaddr),ntohs(cp->vport)); + IP_VS_ERR_BUF("request control ADD for already controlled: " + "%s:%d to %s:%d\n", + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport)); + ip_vs_control_del(cp); } - IP_VS_DBG(7, "ADDing control for: " - "cp.dst=%d.%d.%d.%d:%d ctl_cp.dst=%d.%d.%d.%d:%d\n", - NIPQUAD(cp->caddr),ntohs(cp->cport), - NIPQUAD(ctl_cp->caddr),ntohs(ctl_cp->cport)); + IP_VS_DBG_BUF(7, "ADDing control for: " + "cp.dst=%s:%d ctl_cp.dst=%s:%d\n", + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &ctl_cp->caddr), + ntohs(ctl_cp->cport)); cp->control = ctl_cp; atomic_inc(&ctl_cp->n_control); diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index e7603d749c0..9a24332fbed 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -449,16 +449,16 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) cp->flags |= atomic_read(&dest->conn_flags); cp->dest = dest; - IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), - NIPQUAD(cp->daddr.ip), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); + IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); /* Update the connection counters */ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { @@ -512,16 +512,16 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) if (!dest) return; - IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), - NIPQUAD(cp->daddr.ip), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); + IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); /* Update the connection counters */ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { @@ -574,13 +574,16 @@ int ip_vs_check_template(struct ip_vs_conn *ct) !(dest->flags & IP_VS_DEST_F_AVAILABLE) || (sysctl_ip_vs_expire_quiescent_template && (atomic_read(&dest->weight) == 0))) { - IP_VS_DBG(9, "check_template: dest not available for " - "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "-> d:%u.%u.%u.%u:%d\n", - ip_vs_proto_name(ct->protocol), - NIPQUAD(ct->caddr.ip), ntohs(ct->cport), - NIPQUAD(ct->vaddr.ip), ntohs(ct->vport), - NIPQUAD(ct->daddr.ip), ntohs(ct->dport)); + IP_VS_DBG_BUF(9, "check_template: dest not available for " + "protocol %s s:%s:%d v:%s:%d " + "-> d:%s:%d\n", + ip_vs_proto_name(ct->protocol), + IP_VS_DBG_ADDR(ct->af, &ct->caddr), + ntohs(ct->cport), + IP_VS_DBG_ADDR(ct->af, &ct->vaddr), + ntohs(ct->vport), + IP_VS_DBG_ADDR(ct->af, &ct->daddr), + ntohs(ct->dport)); /* * Invalidate the connection template diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 640203a153c..6dbc527285f 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -924,13 +924,14 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) dest = ip_vs_trash_get_dest(svc, &daddr, dport); if (dest != NULL) { - IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " - "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", - NIPQUAD(daddr), ntohs(dport), - atomic_read(&dest->refcnt), - dest->vfwmark, - NIPQUAD(dest->vaddr.ip), - ntohs(dest->vport)); + IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " + "dest->refcnt=%d, service %u/%s:%u\n", + IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->vaddr), + ntohs(dest->vport)); + __ip_vs_update_dest(svc, dest, udest); /* @@ -1076,10 +1077,11 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) atomic_dec(&dest->svc->refcnt); kfree(dest); } else { - IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " - "dest->refcnt=%d\n", - NIPQUAD(dest->addr.ip), ntohs(dest->port), - atomic_read(&dest->refcnt)); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " + "dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); list_add(&dest->n_list, &ip_vs_dest_trash); atomic_inc(&dest->refcnt); } diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 3da2bb05ee7..de8ed73997c 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -490,19 +490,23 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, if (new_state != cp->state) { struct ip_vs_dest *dest = cp->dest; - IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" - "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n", - pp->name, - (state_off==TCP_DIR_OUTPUT)?"output ":"input ", - th->syn? 'S' : '.', - th->fin? 'F' : '.', - th->ack? 'A' : '.', - th->rst? 'R' : '.', - NIPQUAD(cp->daddr.ip), ntohs(cp->dport), - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - tcp_state_name(cp->state), - tcp_state_name(new_state), - atomic_read(&cp->refcnt)); + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pp->name, + ((state_off == TCP_DIR_OUTPUT) ? + "output " : "input "), + th->syn ? 'S' : '.', + th->fin ? 'F' : '.', + th->ack ? 'A' : '.', + th->rst ? 'R' : '.', + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + atomic_read(&cp->refcnt)); + if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && (new_state != IP_VS_TCP_S_ESTABLISHED)) { @@ -623,12 +627,15 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) break; spin_unlock(&tcp_app_lock); - IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" - "%u.%u.%u.%u:%u to app %s on port %u\n", - __func__, - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), - inc->name, ntohs(inc->port)); + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index fd8bd934cc0..5f2073e41cf 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -408,12 +408,15 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) break; spin_unlock(&udp_app_lock); - IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" - "%u.%u.%u.%u:%u to app %s on port %u\n", - __func__, - NIPQUAD(cp->caddr.ip), ntohs(cp->cport), - NIPQUAD(cp->vaddr.ip), ntohs(cp->vport), - inc->name, ntohs(inc->port)); + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); -- cgit v1.2.3-70-g09d2 From 473b23d37b697c66ac0bfcfdcc9badf718e25d2a Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:54 +0200 Subject: IPVS: Activate IPv6 Netfilter hooks Register the previously defined or adapted netfilter hook functions for IPv6 as PF_INET6 hooks. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 5a7a81778b0..7d3de9db5ac 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1413,6 +1413,43 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC-1, }, +#ifdef CONFIG_IP_VS_IPV6 + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_IN, + .priority = 100, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp_v6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_NAT_SRC-1, + }, +#endif }; -- cgit v1.2.3-70-g09d2 From f94fd041402e4e70d2b4ed00008b9bb857e6ae87 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Tue, 2 Sep 2008 15:55:55 +0200 Subject: IPVS: Allow adding IPv6 services from userspace Allow adding IPv6 services through the genetlink interface and add checks to see if the chosen scheduler is supported with IPv6 and whether the supplied prefix length is sane. Make sure the service count exported via the sockopt interface only counts IPv4 services. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 53 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 6dbc527285f..7f89c588e58 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1177,6 +1177,19 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, goto out_mod_dec; } +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + if (!sched->supports_ipv6) { + ret = -EAFNOSUPPORT; + goto out_err; + } + if ((u->netmask < 1) || (u->netmask > 128)) { + ret = -EINVAL; + goto out_err; + } + } +#endif + svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); if (svc == NULL) { IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); @@ -1214,7 +1227,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, atomic_inc(&ip_vs_nullsvc_counter); ip_vs_new_estimator(&svc->stats); - ip_vs_num_services++; + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ip_vs_num_services++; /* Hash the service into the service table */ write_lock_bh(&__ip_vs_svc_lock); @@ -1265,6 +1281,19 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) } old_sched = sched; +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + if (!sched->supports_ipv6) { + ret = EAFNOSUPPORT; + goto out; + } + if ((u->netmask < 1) || (u->netmask > 128)) { + ret = EINVAL; + goto out; + } + } +#endif + write_lock_bh(&__ip_vs_svc_lock); /* @@ -1329,7 +1358,10 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; - ip_vs_num_services--; + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ip_vs_num_services--; + ip_vs_kill_estimator(&svc->stats); /* Unbind scheduler */ @@ -2212,6 +2244,10 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET) + continue; + if (count >= get->num_services) goto out; memset(&entry, 0, sizeof(entry)); @@ -2227,6 +2263,10 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET) + continue; + if (count >= get->num_services) goto out; memset(&entry, 0, sizeof(entry)); @@ -2584,7 +2624,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, if (!nl_service) return -EMSGSIZE; - NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, AF_INET); + NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); if (svc->fwmark) { NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); @@ -2691,8 +2731,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, return -EINVAL; usvc->af = nla_get_u16(nla_af); - /* For now, only support IPv4 */ - if (nla_get_u16(nla_af) != AF_INET) +#ifdef CONFIG_IP_VS_IPV6 + if (usvc->af != AF_INET && usvc->af != AF_INET6) +#else + if (usvc->af != AF_INET) +#endif return -EAFNOSUPPORT; if (nla_fwmark) { -- cgit v1.2.3-70-g09d2 From 4856c84c1358b79852743ac64e50c1e9d5118f05 Mon Sep 17 00:00:00 2001 From: Malcolm Turnbull Date: Fri, 5 Sep 2008 11:17:13 +1000 Subject: ipvs: load balance IPv4 connections from a local process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows IPVS to load balance connections made by a local process. For example a proxy server running locally. External client --> pound:443 -> Local:443 --> IPVS:80 --> RealServer Signed-off-by: Siim Põder Signed-off-by: Malcolm Turnbull Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 224 +++++++++++++++++++++++----------------- net/ipv4/ipvs/ip_vs_proto_tcp.c | 4 +- 2 files changed, 134 insertions(+), 94 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 7d3de9db5ac..26e3d99bbee 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -651,12 +651,53 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, } #endif +/* Handle relevant response ICMP messages - forward to the right + * destination host. Used for NAT and local client. + */ +static int handle_response_icmp(struct sk_buff *skb, struct iphdr *iph, + struct iphdr *cih, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, + unsigned int offset, unsigned int ihl) +{ + unsigned int verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + IP_VS_ERR("shouldn't reach here, because the box is on the " + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, + "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + goto out; + } + + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + offset += 2 * sizeof(__u16); + if (!skb_make_writable(skb, offset)) + goto out; + + ip_vs_nat_icmp(skb, pp, cp, 1); + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->ipvs_property = 1; + verdict = NF_ACCEPT; + +out: + __ip_vs_conn_put(cp); + + return verdict; +} + /* * Handle ICMP messages in the inside-to-outside direction (outgoing). - * Find any that might be relevant, check against existing connections, - * forward to the right destination host if relevant. + * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. - * (Only used in VS/NAT) */ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) { @@ -666,7 +707,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; - unsigned int offset, ihl, verdict; + unsigned int offset, ihl; *related = 1; @@ -725,38 +766,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) if (!cp) return NF_ACCEPT; - verdict = NF_DROP; - - if (IP_VS_FWD_METHOD(cp) != 0) { - IP_VS_ERR("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); - goto out; - } - - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) - offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) - goto out; - - ip_vs_nat_icmp(skb, pp, cp, 1); - - /* do the statistics and put it back */ - ip_vs_out_stats(cp, skb); - - skb->ipvs_property = 1; - verdict = NF_ACCEPT; - - out: - __ip_vs_conn_put(cp); - - return verdict; + return handle_response_icmp(skb, iph, cih, cp, pp, offset, ihl); } #ifdef CONFIG_IP_VS_IPV6 @@ -875,10 +885,76 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) return th->rst; } +/* Handle response packets: rewrite addresses and send away... + * Used for NAT and local client. + */ +static unsigned int +handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int ihl) +{ + IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + + if (!skb_make_writable(skb, ihl)) + goto drop; + + /* mangle the packet */ + if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) + goto drop; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ipv6_hdr(skb)->saddr = cp->vaddr.in6; + else +#endif + { + ip_hdr(skb)->saddr = cp->vaddr.ip; + ip_send_check(ip_hdr(skb)); + } + + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (ip6_route_me_harder(skb) != 0) + goto drop; + } else +#endif + if (ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; + + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ + if (ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; + + IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + ip_vs_conn_put(cp); + + skb->ipvs_property = 1; + + LeaveFunction(11); + return NF_ACCEPT; + +drop: + ip_vs_conn_put(cp); + kfree_skb(skb); + return NF_STOLEN; +} + /* * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. - * Check if outgoing packet belongs to the established ip_vs_conn, - * rewrite addresses of the packet and send it on its way... + * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int ip_vs_out(unsigned int hooknum, struct sk_buff *skb, @@ -987,55 +1063,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, return NF_ACCEPT; } - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); - - if (!skb_make_writable(skb, iph.len)) - goto drop; - - /* mangle the packet */ - if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) - goto drop; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - ipv6_hdr(skb)->saddr = cp->vaddr.in6; - else -#endif - { - ip_hdr(skb)->saddr = cp->vaddr.ip; - ip_send_check(ip_hdr(skb)); - } - - /* For policy routing, packets originating from this - * machine itself may be routed differently to packets - * passing through. We want this packet to be routed as - * if it came from this machine itself. So re-compute - * the routing information. - */ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ip6_route_me_harder(skb) != 0) - goto drop; - } else -#endif - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); - - ip_vs_out_stats(cp, skb); - ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); - ip_vs_conn_put(cp); - - skb->ipvs_property = 1; - - LeaveFunction(11); - return NF_ACCEPT; - - drop: - ip_vs_conn_put(cp); - kfree_skb(skb); - return NF_STOLEN; + return handle_response(af, skb, pp, cp, iph.len); } @@ -1111,8 +1139,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) ip_vs_fill_iphdr(AF_INET, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); - if (!cp) + if (!cp) { + /* The packet could also belong to a local client */ + cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + if (cp) + return handle_response_icmp(skb, iph, cih, cp, pp, + offset, ihl); return NF_ACCEPT; + } verdict = NF_DROP; @@ -1244,11 +1278,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); /* - * Big tappo: only PACKET_HOST (neither loopback nor mcasts) - * ... don't know why 1st test DOES NOT include 2nd (?) + * Big tappo: only PACKET_HOST, including loopback for local client + * Don't handle local packets on IPv6 for now */ - if (unlikely(skb->pkt_type != PACKET_HOST - || skb->dev->flags & IFF_LOOPBACK || skb->sk)) { + if (unlikely(skb->pkt_type != PACKET_HOST || + (af == AF_INET6 || (skb->dev->flags & IFF_LOOPBACK || + skb->sk)))) { IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", skb->pkt_type, iph.protocol, @@ -1277,6 +1312,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, if (unlikely(!cp)) { int v; + /* For local client packets, it could be a response */ + cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); + if (cp) + return handle_response(af, skb, pp, cp, iph.len); + if (!pp->conn_schedule(af, skb, pp, &v, &cp)) return v; } diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index de8ed73997c..808e8be0280 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -166,7 +166,7 @@ tcp_snat_handler(struct sk_buff *skb, tcph->source = cp->vport; /* Adjust TCP checksums */ - if (!cp->app) { + if (!cp->app && (tcph->check != 0)) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); @@ -235,7 +235,7 @@ tcp_dnat_handler(struct sk_buff *skb, /* * Adjust TCP checksums */ - if (!cp->app) { + if (!cp->app && (tcph->check != 0)) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); -- cgit v1.2.3-70-g09d2 From f2428ed5e7bc89c7716ead22748cb5d076e204f0 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 5 Sep 2008 11:17:14 +1000 Subject: ipvs: load balance ipv6 connections from a local process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows IPVS to load balance IPv6 connections made by a local process. For example a proxy server running locally. External client --> pound:443 -> Local:443 --> IPVS:80 --> RealServer This is an extenstion to the IPv4 work done in this area by Siim Põder and Malcolm Turnbull. Cc: Siim Põder Cc: Malcolm Turnbull Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 91 +++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 50 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 26e3d99bbee..05797a5ce15 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -654,8 +654,9 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, /* Handle relevant response ICMP messages - forward to the right * destination host. Used for NAT and local client. */ -static int handle_response_icmp(struct sk_buff *skb, struct iphdr *iph, - struct iphdr *cih, struct ip_vs_conn *cp, +static int handle_response_icmp(int af, struct sk_buff *skb, + union nf_inet_addr *snet, + __u8 protocol, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, unsigned int offset, unsigned int ihl) { @@ -669,18 +670,22 @@ static int handle_response_icmp(struct sk_buff *skb, struct iphdr *iph, /* Ensure the checksum is correct */ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { /* Failed checksum! */ - IP_VS_DBG(1, - "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); + IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", + IP_VS_DBG_ADDR(af, snet)); goto out; } - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol) offset += 2 * sizeof(__u16); if (!skb_make_writable(skb, offset)) goto out; - ip_vs_nat_icmp(skb, pp, cp, 1); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_nat_icmp_v6(skb, pp, cp, 1); + else +#endif + ip_vs_nat_icmp(skb, pp, cp, 1); /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); @@ -708,6 +713,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl; + union nf_inet_addr snet; *related = 1; @@ -766,7 +772,9 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) if (!cp) return NF_ACCEPT; - return handle_response_icmp(skb, iph, cih, cp, pp, offset, ihl); + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, + pp, offset, ihl); } #ifdef CONFIG_IP_VS_IPV6 @@ -779,7 +787,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; - unsigned int offset, verdict; + unsigned int offset; + union nf_inet_addr snet; *related = 1; @@ -838,40 +847,9 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) if (!cp) return NF_ACCEPT; - verdict = NF_DROP; - - if (IP_VS_FWD_METHOD(cp) != 0) { - IP_VS_ERR("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) - && ip_vs_checksum_complete(skb, sizeof(struct ipv6hdr))) { - /* Failed checksum! */ - IP_VS_DBG(1, "Forward ICMPv6: failed checksum from " - NIP6_FMT "!\n", - NIP6(iph->saddr)); - goto out; - } - - if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) - offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) - goto out; - - ip_vs_nat_icmp_v6(skb, pp, cp, 1); - - /* do the statistics and put it back */ - ip_vs_out_stats(cp, skb); - - skb->ipvs_property = 1; - verdict = NF_ACCEPT; - -out: - __ip_vs_conn_put(cp); - - return verdict; + snet.in6 = iph->saddr; + return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, + pp, offset, sizeof(struct ipv6hdr)); } #endif @@ -1055,7 +1033,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP; - } + } } } IP_VS_DBG_PKT(12, pp, skb, 0, @@ -1083,6 +1061,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, ihl, verdict; + union nf_inet_addr snet; *related = 1; @@ -1142,9 +1121,12 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (!cp) { /* The packet could also belong to a local client */ cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); - if (cp) - return handle_response_icmp(skb, iph, cih, cp, pp, + if (cp) { + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, + cih->protocol, cp, pp, offset, ihl); + } return NF_ACCEPT; } @@ -1183,6 +1165,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_conn *cp; struct ip_vs_protocol *pp; unsigned int offset, verdict; + union nf_inet_addr snet; *related = 1; @@ -1240,8 +1223,18 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) ip_vs_fill_iphdr(AF_INET6, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (!cp) + if (!cp) { + /* The packet could also belong to a local client */ + cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (cp) { + snet.in6 = iph->saddr; + return handle_response_icmp(AF_INET6, skb, &snet, + cih->nexthdr, + cp, pp, offset, + sizeof(struct ipv6hdr)); + } return NF_ACCEPT; + } verdict = NF_DROP; @@ -1281,9 +1274,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, * Big tappo: only PACKET_HOST, including loopback for local client * Don't handle local packets on IPv6 for now */ - if (unlikely(skb->pkt_type != PACKET_HOST || - (af == AF_INET6 || (skb->dev->flags & IFF_LOOPBACK || - skb->sk)))) { + if (unlikely(skb->pkt_type != PACKET_HOST)) { IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", skb->pkt_type, iph.protocol, -- cgit v1.2.3-70-g09d2 From cd9fe6c4f0afe334862c871bf5d32770daa748ec Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Fri, 5 Sep 2008 13:46:00 +0200 Subject: ipvs: Use pointer to address from sync message We want a pointer to it, not the value casted to a pointer. Signed-off-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_sync.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 40647edf102..28237a5f62e 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -397,11 +397,11 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) flags &= ~IP_VS_CONN_F_INACTIVE; } cp = ip_vs_conn_new(AF_INET, s->protocol, - (union nf_inet_addr *)s->caddr, + (union nf_inet_addr *)&s->caddr, s->cport, - (union nf_inet_addr *)s->vaddr, + (union nf_inet_addr *)&s->vaddr, s->vport, - (union nf_inet_addr *)s->daddr, + (union nf_inet_addr *)&s->daddr, s->dport, flags, dest); if (dest) -- cgit v1.2.3-70-g09d2 From a5ba4bf2732c85d8c95e0432966f79aa2b159478 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Fri, 5 Sep 2008 13:47:37 +0200 Subject: ipvs: Return negative error values from ip_vs_edit_service() Like the other code in this function does. Signed-off-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 7f89c588e58..d2dc05a843f 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1284,11 +1284,11 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6) { if (!sched->supports_ipv6) { - ret = EAFNOSUPPORT; + ret = -EAFNOSUPPORT; goto out; } if ((u->netmask < 1) || (u->netmask > 128)) { - ret = EINVAL; + ret = -EINVAL; goto out; } } -- cgit v1.2.3-70-g09d2 From 77eb851630bba8ea9962a1b2f01b23bd5d57c58e Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Fri, 5 Sep 2008 14:43:00 +0200 Subject: ipvs: Mark tcp/udp v4 and v6 debug functions static They are only used in this file, so they should be static Signed-off-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_proto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index 50f6215beda..b06da1c3445 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -151,7 +151,7 @@ const char * ip_vs_state_name(__u16 proto, int state) } -void +static void ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, @@ -190,7 +190,7 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, } #ifdef CONFIG_IP_VS_IPV6 -void +static void ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, -- cgit v1.2.3-70-g09d2 From 3bfb92f4073aa829f8e67e459d54c79306ddbd73 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Fri, 5 Sep 2008 16:53:49 +0200 Subject: ipvs: Reject ipv6 link-local addresses for destinations We can't use non-local link-local addresses for destinations, without knowing the interface on which we can reach the address. Reject them for now. Signed-off-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index d2dc05a843f..e53efe41f01 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -838,7 +838,8 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); - if (!(atype & IPV6_ADDR_UNICAST) && + if ((!(atype & IPV6_ADDR_UNICAST) || + atype & IPV6_ADDR_LINKLOCAL) && !__ip_vs_addr_is_local_v6(&udest->addr.in6)) return -EINVAL; } else -- cgit v1.2.3-70-g09d2 From 5af149cc34143c4e24abcc6355b29b3161eff3b8 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 8 Sep 2008 09:34:45 +1000 Subject: IPVS: fix bogus indentation Sorry, this was my error. Thanks to Julius Volz for pointing it out. Signed-off-by: Simon Horman Acked-by: Julius Volz --- net/ipv4/ipvs/ip_vs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 05797a5ce15..1f4f3b94359 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1033,7 +1033,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); return NF_DROP; - } + } } } IP_VS_DBG_PKT(12, pp, skb, 0, -- cgit v1.2.3-70-g09d2 From 178f5e494e3c0252d06a9b1473016addff71e01e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 8 Sep 2008 09:34:46 +1000 Subject: IPVS: use ipv6_addr_copy() It is standard to use ipv6_addr_copy() to fill in the in6 element of a union nf_inet_addr snet. Thanks to Julius Volz for pointing this out. Cc: Brian Haley Signed-off-by: Simon Horman Acked-by: Julius Volz --- net/ipv4/ipvs/ip_vs_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 1f4f3b94359..f5180ac56be 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -847,7 +847,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) if (!cp) return NF_ACCEPT; - snet.in6 = iph->saddr; + ipv6_addr_copy(&snet.in6, &iph->saddr); return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, pp, offset, sizeof(struct ipv6hdr)); } @@ -1227,7 +1227,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) /* The packet could also belong to a local client */ cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); if (cp) { - snet.in6 = iph->saddr; + ipv6_addr_copy(&snet.in6, &iph->saddr); return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, pp, offset, -- cgit v1.2.3-70-g09d2 From 503e81f65adac596a0275ea0230f2ae1fd64c301 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 8 Sep 2008 12:04:21 +1000 Subject: ipvs: handle PARTIAL_CHECKSUM Now that LVS can load balance locally generated traffic, packets may come from the loopback device and thus may have a partial checksum. The existing code allows for the case where there is no checksum at all for TCP, however Herbert Xu has confirmed that this is not legal. Signed-off-by: Simon Horman Acked-by: Julius Volz --- net/ipv4/ipvs/ip_vs_proto_tcp.c | 37 +++++++++++++++++++++++++++++++++++-- net/ipv4/ipvs/ip_vs_proto_udp.c | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 808e8be0280..537f616776d 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -134,12 +134,34 @@ tcp_fast_csum_update(int af, struct tcphdr *tcph, } +static inline void +tcp_partial_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(tcph->check)))); +} + + static int tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct tcphdr *tcph; unsigned int tcphoff; + int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -147,6 +169,7 @@ tcp_snat_handler(struct sk_buff *skb, else #endif tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) @@ -166,7 +189,11 @@ tcp_snat_handler(struct sk_buff *skb, tcph->source = cp->vport; /* Adjust TCP checksums */ - if (!cp->app && (tcph->check != 0)) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - tcphoff)); + } else if (!cp->app) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); @@ -204,6 +231,7 @@ tcp_dnat_handler(struct sk_buff *skb, { struct tcphdr *tcph; unsigned int tcphoff; + int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -211,6 +239,7 @@ tcp_dnat_handler(struct sk_buff *skb, else #endif tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) @@ -235,7 +264,11 @@ tcp_dnat_handler(struct sk_buff *skb, /* * Adjust TCP checksums */ - if (!cp->app && (tcph->check != 0)) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - tcphoff)); + } else if (!cp->app) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 5f2073e41cf..e3ee26bd1de 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -141,12 +141,34 @@ udp_fast_csum_update(int af, struct udphdr *uhdr, uhdr->check = CSUM_MANGLED_0; } +static inline void +udp_partial_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(uhdr->check)))); +} + + static int udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { struct udphdr *udph; unsigned int udphoff; + int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -154,6 +176,7 @@ udp_snat_handler(struct sk_buff *skb, else #endif udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ if (!skb_make_writable(skb, udphoff+sizeof(*udph))) @@ -177,7 +200,11 @@ udp_snat_handler(struct sk_buff *skb, /* * Adjust UDP checksums */ - if (!cp->app && (udph->check != 0)) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - udphoff)); + } else if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); @@ -216,6 +243,7 @@ udp_dnat_handler(struct sk_buff *skb, { struct udphdr *udph; unsigned int udphoff; + int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -223,6 +251,7 @@ udp_dnat_handler(struct sk_buff *skb, else #endif udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ if (!skb_make_writable(skb, udphoff+sizeof(*udph))) @@ -247,7 +276,11 @@ udp_dnat_handler(struct sk_buff *skb, /* * Adjust UDP checksums */ - if (!cp->app && (udph->check != 0)) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - udphoff)); + } else if (!cp->app && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); -- cgit v1.2.3-70-g09d2 From 9d7f2a2b1aa9e55537a053c68bdbd119fc479dd3 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Mon, 8 Sep 2008 14:55:42 +0200 Subject: IPVS: Remove incorrect ip_route_me_harder(), fix IPv6 Remove an incorrect ip_route_me_harder() that was probably a result of merging my IPv6 patches with the local client patches. With this, IPv6+NAT are working again. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index f5180ac56be..bdc92d73fbe 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -904,15 +904,6 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, if (ip_route_me_harder(skb, RTN_LOCAL) != 0) goto drop; - /* For policy routing, packets originating from this - * machine itself may be routed differently to packets - * passing through. We want this packet to be routed as - * if it came from this machine itself. So re-compute - * the routing information. - */ - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); -- cgit v1.2.3-70-g09d2 From 2206a3f5b75be5dadf11541961bd7c924857eb5d Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Mon, 8 Sep 2008 13:38:11 +0200 Subject: ipvs: Restrict connection table size via Kconfig Instead of checking the value in include/net/ip_vs.h, we can just restrict the range in our Kconfig file. This will prevent values outside of the range early. Signed-off-by: Sven Wegener Reviewed-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 10 +--------- net/ipv4/ipvs/Kconfig | 3 ++- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 1b13cef4b54..38f4f690b18 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -621,16 +621,8 @@ extern void ip_vs_init_hash_table(struct list_head *table, int rows); #ifndef CONFIG_IP_VS_TAB_BITS #define CONFIG_IP_VS_TAB_BITS 12 #endif -/* make sure that IP_VS_CONN_TAB_BITS is located in [8, 20] */ -#if CONFIG_IP_VS_TAB_BITS < 8 -#define IP_VS_CONN_TAB_BITS 8 -#endif -#if CONFIG_IP_VS_TAB_BITS > 20 -#define IP_VS_CONN_TAB_BITS 20 -#endif -#if 8 <= CONFIG_IP_VS_TAB_BITS && CONFIG_IP_VS_TAB_BITS <= 20 + #define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS -#endif #define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS) #define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1) diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig index 794cecb249a..de6004de80b 100644 --- a/net/ipv4/ipvs/Kconfig +++ b/net/ipv4/ipvs/Kconfig @@ -41,7 +41,8 @@ config IP_VS_DEBUG config IP_VS_TAB_BITS int "IPVS connection table size (the Nth power of 2)" - default "12" + range 8 20 + default 12 ---help--- The IPVS connection hash table uses the chaining scheme to handle hash collisions. Using a big IPVS connection hash table will greatly -- cgit v1.2.3-70-g09d2 From e9c0ce232e7a36daae1ca08282609d7f0c57c567 Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Mon, 8 Sep 2008 13:39:04 +0200 Subject: ipvs: Embed user stats structure into kernel stats structure Instead of duplicating the fields, integrate a user stats structure into the kernel stats structure. This is more robust when the members are changed, because they are now automatically kept in sync. Signed-off-by: Sven Wegener Reviewed-by: Julius Volz Signed-off-by: Simon Horman --- include/net/ip_vs.h | 21 ++---------------- net/ipv4/ipvs/ip_vs_core.c | 30 +++++++++++++------------- net/ipv4/ipvs/ip_vs_ctl.c | 53 ++++++++++++++++++---------------------------- net/ipv4/ipvs/ip_vs_est.c | 40 +++++++++++++++++----------------- 4 files changed, 58 insertions(+), 86 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 38f4f690b18..33e2ac6ceb3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -254,27 +254,10 @@ struct ip_vs_estimator { struct ip_vs_stats { - __u32 conns; /* connections scheduled */ - __u32 inpkts; /* incoming packets */ - __u32 outpkts; /* outgoing packets */ - __u64 inbytes; /* incoming bytes */ - __u64 outbytes; /* outgoing bytes */ - - __u32 cps; /* current connection rate */ - __u32 inpps; /* current in packet rate */ - __u32 outpps; /* current out packet rate */ - __u32 inbps; /* current in byte rate */ - __u32 outbps; /* current out byte rate */ - - /* - * Don't add anything before the lock, because we use memcpy() to copy - * the members before the lock to struct ip_vs_stats_user in - * ip_vs_ctl.c. - */ + struct ip_vs_stats_user ustats; /* statistics */ + struct ip_vs_estimator est; /* estimator */ spinlock_t lock; /* spin lock */ - - struct ip_vs_estimator est; /* estimator */ }; struct dst_entry; diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index bdc92d73fbe..80a4fcf33a5 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -102,18 +102,18 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_dest *dest = cp->dest; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { spin_lock(&dest->stats.lock); - dest->stats.inpkts++; - dest->stats.inbytes += skb->len; + dest->stats.ustats.inpkts++; + dest->stats.ustats.inbytes += skb->len; spin_unlock(&dest->stats.lock); spin_lock(&dest->svc->stats.lock); - dest->svc->stats.inpkts++; - dest->svc->stats.inbytes += skb->len; + dest->svc->stats.ustats.inpkts++; + dest->svc->stats.ustats.inbytes += skb->len; spin_unlock(&dest->svc->stats.lock); spin_lock(&ip_vs_stats.lock); - ip_vs_stats.inpkts++; - ip_vs_stats.inbytes += skb->len; + ip_vs_stats.ustats.inpkts++; + ip_vs_stats.ustats.inbytes += skb->len; spin_unlock(&ip_vs_stats.lock); } } @@ -125,18 +125,18 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_dest *dest = cp->dest; if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { spin_lock(&dest->stats.lock); - dest->stats.outpkts++; - dest->stats.outbytes += skb->len; + dest->stats.ustats.outpkts++; + dest->stats.ustats.outbytes += skb->len; spin_unlock(&dest->stats.lock); spin_lock(&dest->svc->stats.lock); - dest->svc->stats.outpkts++; - dest->svc->stats.outbytes += skb->len; + dest->svc->stats.ustats.outpkts++; + dest->svc->stats.ustats.outbytes += skb->len; spin_unlock(&dest->svc->stats.lock); spin_lock(&ip_vs_stats.lock); - ip_vs_stats.outpkts++; - ip_vs_stats.outbytes += skb->len; + ip_vs_stats.ustats.outpkts++; + ip_vs_stats.ustats.outbytes += skb->len; spin_unlock(&ip_vs_stats.lock); } } @@ -146,15 +146,15 @@ static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) { spin_lock(&cp->dest->stats.lock); - cp->dest->stats.conns++; + cp->dest->stats.ustats.conns++; spin_unlock(&cp->dest->stats.lock); spin_lock(&svc->stats.lock); - svc->stats.conns++; + svc->stats.ustats.conns++; spin_unlock(&svc->stats.lock); spin_lock(&ip_vs_stats.lock); - ip_vs_stats.conns++; + ip_vs_stats.ustats.conns++; spin_unlock(&ip_vs_stats.lock); } diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index e53efe41f01..993a83fb0d5 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -744,18 +744,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) { spin_lock_bh(&stats->lock); - stats->conns = 0; - stats->inpkts = 0; - stats->outpkts = 0; - stats->inbytes = 0; - stats->outbytes = 0; - - stats->cps = 0; - stats->inpps = 0; - stats->outpps = 0; - stats->inbps = 0; - stats->outbps = 0; - + memset(&stats->ustats, 0, sizeof(stats->ustats)); ip_vs_zero_estimator(stats); spin_unlock_bh(&stats->lock); @@ -1964,20 +1953,20 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) " Conns Packets Packets Bytes Bytes\n"); spin_lock_bh(&ip_vs_stats.lock); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns, - ip_vs_stats.inpkts, ip_vs_stats.outpkts, - (unsigned long long) ip_vs_stats.inbytes, - (unsigned long long) ip_vs_stats.outbytes); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, + ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, + (unsigned long long) ip_vs_stats.ustats.inbytes, + (unsigned long long) ip_vs_stats.ustats.outbytes); /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq,"%8X %8X %8X %16X %16X\n", - ip_vs_stats.cps, - ip_vs_stats.inpps, - ip_vs_stats.outpps, - ip_vs_stats.inbps, - ip_vs_stats.outbps); + ip_vs_stats.ustats.cps, + ip_vs_stats.ustats.inpps, + ip_vs_stats.ustats.outpps, + ip_vs_stats.ustats.inbps, + ip_vs_stats.ustats.outbps); spin_unlock_bh(&ip_vs_stats.lock); return 0; @@ -2215,7 +2204,7 @@ static void ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) { spin_lock_bh(&src->lock); - memcpy(dst, src, (char*)&src->lock - (char*)src); + memcpy(dst, &src->ustats, sizeof(*dst)); spin_unlock_bh(&src->lock); } @@ -2591,16 +2580,16 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, spin_lock_bh(&stats->lock); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->conns); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->inpkts); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->outpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->inbytes); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->outbytes); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->cps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->inpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->outpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->inbps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->outbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); spin_unlock_bh(&stats->lock); diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 4fb620ec208..2eb2860dabb 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -65,37 +65,37 @@ static void estimation_timer(unsigned long arg) s = container_of(e, struct ip_vs_stats, est); spin_lock(&s->lock); - n_conns = s->conns; - n_inpkts = s->inpkts; - n_outpkts = s->outpkts; - n_inbytes = s->inbytes; - n_outbytes = s->outbytes; + n_conns = s->ustats.conns; + n_inpkts = s->ustats.inpkts; + n_outpkts = s->ustats.outpkts; + n_inbytes = s->ustats.inbytes; + n_outbytes = s->ustats.outbytes; /* scaled by 2^10, but divided 2 seconds */ rate = (n_conns - e->last_conns)<<9; e->last_conns = n_conns; e->cps += ((long)rate - (long)e->cps)>>2; - s->cps = (e->cps+0x1FF)>>10; + s->ustats.cps = (e->cps+0x1FF)>>10; rate = (n_inpkts - e->last_inpkts)<<9; e->last_inpkts = n_inpkts; e->inpps += ((long)rate - (long)e->inpps)>>2; - s->inpps = (e->inpps+0x1FF)>>10; + s->ustats.inpps = (e->inpps+0x1FF)>>10; rate = (n_outpkts - e->last_outpkts)<<9; e->last_outpkts = n_outpkts; e->outpps += ((long)rate - (long)e->outpps)>>2; - s->outpps = (e->outpps+0x1FF)>>10; + s->ustats.outpps = (e->outpps+0x1FF)>>10; rate = (n_inbytes - e->last_inbytes)<<4; e->last_inbytes = n_inbytes; e->inbps += ((long)rate - (long)e->inbps)>>2; - s->inbps = (e->inbps+0xF)>>5; + s->ustats.inbps = (e->inbps+0xF)>>5; rate = (n_outbytes - e->last_outbytes)<<4; e->last_outbytes = n_outbytes; e->outbps += ((long)rate - (long)e->outbps)>>2; - s->outbps = (e->outbps+0xF)>>5; + s->ustats.outbps = (e->outbps+0xF)>>5; spin_unlock(&s->lock); } spin_unlock(&est_lock); @@ -108,20 +108,20 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats) INIT_LIST_HEAD(&est->list); - est->last_conns = stats->conns; - est->cps = stats->cps<<10; + est->last_conns = stats->ustats.conns; + est->cps = stats->ustats.cps<<10; - est->last_inpkts = stats->inpkts; - est->inpps = stats->inpps<<10; + est->last_inpkts = stats->ustats.inpkts; + est->inpps = stats->ustats.inpps<<10; - est->last_outpkts = stats->outpkts; - est->outpps = stats->outpps<<10; + est->last_outpkts = stats->ustats.outpkts; + est->outpps = stats->ustats.outpps<<10; - est->last_inbytes = stats->inbytes; - est->inbps = stats->inbps<<5; + est->last_inbytes = stats->ustats.inbytes; + est->inbps = stats->ustats.inbps<<5; - est->last_outbytes = stats->outbytes; - est->outbps = stats->outbps<<5; + est->last_outbytes = stats->ustats.outbytes; + est->outbps = stats->ustats.outbps<<5; spin_lock_bh(&est_lock); list_add(&est->list, &est_list); -- cgit v1.2.3-70-g09d2 From 410e27a49bb98bc7fa3ff5fc05cc313817b9f253 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 9 Sep 2008 13:27:22 +0200 Subject: This reverts "Merge branch 'dccp' of git://eden-feed.erg.abdn.ac.uk/dccp_exp" as it accentally contained the wrong set of patches. These will be submitted separately. Signed-off-by: Gerrit Renker --- Documentation/networking/dccp.txt | 54 +- include/linux/dccp.h | 122 ++- include/net/tcp.h | 15 - net/dccp/Kconfig | 3 + net/dccp/Makefile | 5 +- net/dccp/ackvec.c | 619 ++++++------ net/dccp/ackvec.h | 204 ++-- net/dccp/ccid.c | 101 +- net/dccp/ccid.h | 113 +-- net/dccp/ccids/Kconfig | 30 +- net/dccp/ccids/ccid2.c | 622 +++++++----- net/dccp/ccids/ccid2.h | 63 +- net/dccp/ccids/ccid3.c | 762 +++++++++------ net/dccp/ccids/ccid3.h | 153 +-- net/dccp/ccids/lib/loss_interval.c | 30 +- net/dccp/ccids/lib/loss_interval.h | 4 +- net/dccp/ccids/lib/packet_history.c | 282 +++--- net/dccp/ccids/lib/packet_history.h | 78 +- net/dccp/ccids/lib/tfrc.h | 16 - net/dccp/ccids/lib/tfrc_equation.c | 29 +- net/dccp/dccp.h | 104 +- net/dccp/diag.c | 2 +- net/dccp/feat.c | 1805 +++++++++-------------------------- net/dccp/feat.h | 144 +-- net/dccp/input.c | 164 ++-- net/dccp/ipv4.c | 4 +- net/dccp/ipv6.c | 4 +- net/dccp/minisocks.c | 87 +- net/dccp/options.c | 341 ++++--- net/dccp/output.c | 279 ++---- net/dccp/probe.c | 75 +- net/dccp/proto.c | 281 ++---- net/dccp/qpolicy.c | 137 --- net/dccp/sysctl.c | 64 +- net/dccp/timer.c | 42 +- net/ipv4/tcp_input.c | 17 +- 36 files changed, 2884 insertions(+), 3971 deletions(-) delete mode 100644 net/dccp/qpolicy.c (limited to 'net/ipv4') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index fcfc1253442..39131a3c78f 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -45,25 +45,6 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree Socket options ============== -DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes -a policy ID as argument and can only be set before the connection (i.e. changes -during an established connection are not supported). Currently, two policies are -defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, -and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an -u32 priority value as ancillary data to sendmsg(), where higher numbers indicate -a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to -be formatted using a cmsg(3) message header filled in as follows: - cmsg->cmsg_level = SOL_DCCP; - cmsg->cmsg_type = DCCP_SCM_PRIORITY; - cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ - -DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero -value is always interpreted as unbounded queue length. If different from zero, -the interpretation of this parameter depends on the current dequeuing policy -(see above): the "simple" policy will enforce a fixed queue size by returning -EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the -lowest-priority packet first. The default value for this parameter is -initialised from /proc/sys/net/dccp/default/tx_qlen. DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, @@ -76,24 +57,6 @@ can be set before calling bind(). DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet size (application payload size) in bytes, see RFC 4340, section 14. -DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs -supported by the endpoint (see include/linux/dccp.h for symbolic constants). -The caller needs to provide a sufficiently large (> 2) array of type uint8_t. - -DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same -time, combining the operation of the next two socket options. This option is -preferrable over the latter two, since often applications will use the same -type of CCID for both directions; and mixed use of CCIDs is not currently well -understood. This socket option takes as argument at least one uint8_t value, or -an array of uint8_t values, which must match available CCIDS (see above). CCIDs -must be registered on the socket before calling connect() or listen(). - -DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets -the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. -Please note that the getsockopt argument type here is `int', not uint8_t. - -DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. - DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait @@ -152,16 +115,23 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. +send_ndp = 1 + Whether or not to send NDP count options (sec. 7.7.2). + +send_ackvec = 1 + Whether or not to send Ack Vector options (sec. 11.5). + +ack_ratio = 2 + The default Ack Ratio (sec. 11.3) to use. + tx_ccid = 2 - Default CCID for the sender-receiver half-connection. Depending on the - choice of CCID, the Send Ack Vector feature is enabled automatically. + Default CCID for the sender-receiver half-connection. rx_ccid = 2 - Default CCID for the receiver-sender half-connection; see tx_ccid. + Default CCID for the receiver-sender half-connection. seq_window = 100 - The initial sequence window (sec. 7.5.2) of the sender. This influences - the local ackno validity and the remote seqno validity windows (7.5.1). + The initial sequence window (sec. 7.5.2). tx_qlen = 5 The size of the transmit buffer in packets. A value of 0 corresponds diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 010e2d87ed7..6080449fbec 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -165,13 +165,9 @@ enum { DCCPO_TIMESTAMP_ECHO = 42, DCCPO_ELAPSED_TIME = 43, DCCPO_MAX = 45, - DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */ - DCCPO_MAX_RX_CCID_SPECIFIC = 191, - DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */ - DCCPO_MAX_TX_CCID_SPECIFIC = 255, + DCCPO_MIN_CCID_SPECIFIC = 128, + DCCPO_MAX_CCID_SPECIFIC = 255, }; -/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ -#define DCCP_SINGLE_OPT_MAXLEN 253 /* DCCP CCIDS */ enum { @@ -180,36 +176,27 @@ enum { }; /* DCCP features (RFC 4340 section 6.4) */ -enum dccp_feature_numbers { +enum { DCCPF_RESERVED = 0, DCCPF_CCID = 1, - DCCPF_SHORT_SEQNOS = 2, + DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ DCCPF_SEQUENCE_WINDOW = 3, - DCCPF_ECN_INCAPABLE = 4, + DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ DCCPF_ACK_RATIO = 5, DCCPF_SEND_ACK_VECTOR = 6, DCCPF_SEND_NDP_COUNT = 7, DCCPF_MIN_CSUM_COVER = 8, - DCCPF_DATA_CHECKSUM = 9, + DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ /* 10-127 reserved */ DCCPF_MIN_CCID_SPECIFIC = 128, - DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 8.4 */ DCCPF_MAX_CCID_SPECIFIC = 255, }; -/* DCCP socket control message types for cmsg */ -enum dccp_cmsg_type { - DCCP_SCM_PRIORITY = 1, - DCCP_SCM_QPOLICY_MAX = 0xFFFF, - /* ^-- Up to here reserved exclusively for qpolicy parameters */ - DCCP_SCM_MAX -}; - -/* DCCP priorities for outgoing/queued packets */ -enum dccp_packet_dequeueing_policy { - DCCPQ_POLICY_SIMPLE, - DCCPQ_POLICY_PRIO, - DCCPQ_POLICY_MAX +/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ +struct dccp_so_feat { + __u8 dccpsf_feat; + __u8 __user *dccpsf_val; + __u8 dccpsf_len; }; /* DCCP socket options */ @@ -221,12 +208,6 @@ enum dccp_packet_dequeueing_policy { #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 -#define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 -#define DCCP_SOCKOPT_CCID 13 -#define DCCP_SOCKOPT_TX_CCID 14 -#define DCCP_SOCKOPT_RX_CCID 15 -#define DCCP_SOCKOPT_QPOLICY_ID 16 -#define DCCP_SOCKOPT_QPOLICY_TXQLEN 17 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 @@ -374,13 +355,62 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) return __dccp_hdr_len(dccp_hdr(skb)); } + +/* initial values for each feature */ +#define DCCPF_INITIAL_SEQUENCE_WINDOW 100 +#define DCCPF_INITIAL_ACK_RATIO 2 +#define DCCPF_INITIAL_CCID DCCPC_CCID2 +#define DCCPF_INITIAL_SEND_ACK_VECTOR 1 +/* FIXME: for now we're default to 1 but it should really be 0 */ +#define DCCPF_INITIAL_SEND_NDP_COUNT 1 + +/** + * struct dccp_minisock - Minimal DCCP connection representation + * + * Will be used to pass the state from dccp_request_sock to dccp_sock. + * + * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) + * @dccpms_ccid - Congestion Control Id (CCID) (section 10) + * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) + * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) + * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) + * @dccpms_pending - List of features being negotiated + * @dccpms_conf - + */ +struct dccp_minisock { + __u64 dccpms_sequence_window; + __u8 dccpms_rx_ccid; + __u8 dccpms_tx_ccid; + __u8 dccpms_send_ack_vector; + __u8 dccpms_send_ndp_count; + __u8 dccpms_ack_ratio; + struct list_head dccpms_pending; + struct list_head dccpms_conf; +}; + +struct dccp_opt_conf { + __u8 *dccpoc_val; + __u8 dccpoc_len; +}; + +struct dccp_opt_pend { + struct list_head dccpop_node; + __u8 dccpop_type; + __u8 dccpop_feat; + __u8 *dccpop_val; + __u8 dccpop_len; + int dccpop_conf; + struct dccp_opt_conf *dccpop_sc; +}; + +extern void dccp_minisock_init(struct dccp_minisock *dmsk); + /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) * @dreq_isr: initial sequence number received on the Request * @dreq_service: service code present on the Request (there is just one) - * @dreq_featneg: feature negotiation options for this connection * The following two fields are analogous to the ones in dccp_sock: * @dreq_timestamp_echo: last received timestamp to echo (13.1) * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo @@ -390,7 +420,6 @@ struct dccp_request_sock { __u64 dreq_iss; __u64 dreq_isr; __be32 dreq_service; - struct list_head dreq_featneg; __u32 dreq_timestamp_echo; __u32 dreq_timestamp_time; }; @@ -462,28 +491,21 @@ struct dccp_ackvec; * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo * @dccps_l_ack_ratio - feature-local Ack Ratio * @dccps_r_ack_ratio - feature-remote Ack Ratio - * @dccps_l_seq_win - local Sequence Window (influences ack number validity) - * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) - * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) + * @dccps_minisock - associated minisock (accessed via dccp_msk) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) * @dccps_options_received - parsed set of retrieved options - * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy - * @dccps_tx_qlen - maximum length of the TX queue * @dccps_role - role of this sock, one of %dccp_role * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) - * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" - * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets - * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) + * @dccps_xmit_timer - timer for when CCID is not ready to send * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) */ struct dccp_sock { @@ -507,26 +529,19 @@ struct dccp_sock { __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; - __u64 dccps_l_seq_win:48; - __u64 dccps_r_seq_win:48; - __u8 dccps_pcslen:4; - __u8 dccps_pcrlen:4; - __u8 dccps_send_ndp_count:1; + __u16 dccps_pcslen; + __u16 dccps_pcrlen; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; - struct list_head dccps_featneg; + struct dccp_minisock dccps_minisock; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; struct dccp_options_received dccps_options_received; - __u8 dccps_qpolicy; - __u32 dccps_tx_qlen; enum dccp_role dccps_role:2; __u8 dccps_hc_rx_insert_options:1; __u8 dccps_hc_tx_insert_options:1; __u8 dccps_server_timewait:1; - __u8 dccps_sync_scheduled:1; - struct tasklet_struct dccps_xmitlet; struct timer_list dccps_xmit_timer; }; @@ -535,6 +550,11 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk) return (struct dccp_sock *)sk; } +static inline struct dccp_minisock *dccp_msk(const struct sock *sk) +{ + return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; +} + static inline const char *dccp_role(const struct sock *sk) { switch (dccp_sk(sk)->dccps_role) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 6bc4b8148ca..8983386356a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -782,21 +782,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) -/* - * Convert RFC3390 larger initial windows into an equivalent number of packets. - * - * John Heffner states: - * - * The RFC specifies a window of no more than 4380 bytes - * unless 2*MSS > 4380. Reading the pseudocode in the RFC - * is a bit misleading because they use a clamp at 4380 bytes - * rather than a multiplier in the relevant range. - */ -static inline u32 rfc3390_bytes_to_packets(const u32 bytes) -{ - return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3); -} - extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 206c16ad9c3..7aa2a7acc7e 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -25,6 +25,9 @@ config INET_DCCP_DIAG def_tristate y if (IP_DCCP = y && INET_DIAG = y) def_tristate m +config IP_DCCP_ACKVEC + bool + source "net/dccp/ccids/Kconfig" menu "DCCP Kernel Hacking" diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 0c1c9af2bf7..f4f8793aaff 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,7 +1,6 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o -dccp-y := ccid.o feat.o input.o minisocks.o options.o \ - qpolicy.o output.o proto.o timer.o ackvec.o +dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o dccp_ipv4-y := ipv4.o @@ -9,6 +8,8 @@ dccp_ipv4-y := ipv4.o obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o dccp_ipv6-y := ipv6.o +dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o + obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 41819848bdd..1e8be246ad1 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -1,375 +1,445 @@ /* * net/dccp/ackvec.c * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK + * An implementation of the DCCP protocol * Copyright (c) 2005 Arnaldo Carvalho de Melo * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; version 2 of the License; */ + +#include "ackvec.h" #include "dccp.h" + +#include +#include +#include #include +#include #include +#include + static struct kmem_cache *dccp_ackvec_slab; static struct kmem_cache *dccp_ackvec_record_slab; -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) +static struct dccp_ackvec_record *dccp_ackvec_record_new(void) { - struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); + struct dccp_ackvec_record *avr = + kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); - if (av != NULL) { - av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; - INIT_LIST_HEAD(&av->av_records); - } - return av; + if (avr != NULL) + INIT_LIST_HEAD(&avr->avr_node); + + return avr; } -static void dccp_ackvec_purge_records(struct dccp_ackvec *av) +static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) { - struct dccp_ackvec_record *cur, *next; - - list_for_each_entry_safe(cur, next, &av->av_records, avr_node) - kmem_cache_free(dccp_ackvec_record_slab, cur); - INIT_LIST_HEAD(&av->av_records); + if (unlikely(avr == NULL)) + return; + /* Check if deleting a linked record */ + WARN_ON(!list_empty(&avr->avr_node)); + kmem_cache_free(dccp_ackvec_record_slab, avr); } -void dccp_ackvec_free(struct dccp_ackvec *av) +static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, + struct dccp_ackvec_record *avr) { - if (likely(av != NULL)) { - dccp_ackvec_purge_records(av); - kmem_cache_free(dccp_ackvec_slab, av); + /* + * AVRs are sorted by seqno. Since we are sending them in order, we + * just add the AVR at the head of the list. + * -sorbo. + */ + if (!list_empty(&av->av_records)) { + const struct dccp_ackvec_record *head = + list_entry(av->av_records.next, + struct dccp_ackvec_record, + avr_node); + BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno)); } + + list_add(&avr->avr_node, &av->av_records); } -/** - * dccp_ackvec_update_records - Record information about sent Ack Vectors - * @av: Ack Vector records to update - * @seqno: Sequence number of the packet carrying the Ack Vector just sent - * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector - */ -int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) +int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) { + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; + /* Figure out how many options do we need to represent the ackvec */ + const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); + u16 len = av->av_vec_len + 2 * nr_opts, i; + u32 elapsed_time; + const unsigned char *tail, *from; + unsigned char *to; struct dccp_ackvec_record *avr; + suseconds_t delta; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + return -1; + + delta = ktime_us_delta(ktime_get_real(), av->av_time); + elapsed_time = delta / 10; - avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); + if (elapsed_time != 0 && + dccp_insert_option_elapsed_time(sk, skb, elapsed_time)) + return -1; + + avr = dccp_ackvec_record_new(); if (avr == NULL) - return -ENOBUFS; + return -1; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + len = av->av_vec_len; + from = av->av_buf + av->av_buf_head; + tail = av->av_buf + DCCP_MAX_ACKVEC_LEN; + + for (i = 0; i < nr_opts; ++i) { + int copylen = len; + + if (len > DCCP_MAX_ACKVEC_OPT_LEN) + copylen = DCCP_MAX_ACKVEC_OPT_LEN; + + *to++ = DCCPO_ACK_VECTOR_0; + *to++ = copylen + 2; + + /* Check if buf_head wraps */ + if (from + copylen > tail) { + const u16 tailsize = tail - from; + + memcpy(to, from, tailsize); + to += tailsize; + len -= tailsize; + copylen -= tailsize; + from = av->av_buf; + } + + memcpy(to, from, copylen); + from += copylen; + to += copylen; + len -= copylen; + } - avr->avr_ack_seqno = seqno; - avr->avr_ack_ptr = av->av_buf_head; - avr->avr_ack_ackno = av->av_buf_ackno; - avr->avr_ack_nonce = nonce_sum; - avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); - /* - * When the buffer overflows, we keep no more than one record. This is - * the simplest way of disambiguating sender-Acks dating from before the - * overflow from sender-Acks which refer to after the overflow; a simple - * solution is preferable here since we are handling an exception. - */ - if (av->av_overflow) - dccp_ackvec_purge_records(av); /* - * Since GSS is incremented for each packet, the list is automatically - * arranged in descending order of @ack_seqno. + * From RFC 4340, A.2: + * + * For each acknowledgement it sends, the HC-Receiver will add an + * acknowledgement record. ack_seqno will equal the HC-Receiver + * sequence number it used for the ack packet; ack_ptr will equal + * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will + * equal buf_nonce. */ - list_add(&avr->avr_node, &av->av_records); + avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + avr->avr_ack_ptr = av->av_buf_head; + avr->avr_ack_ackno = av->av_buf_ackno; + avr->avr_ack_nonce = av->av_buf_nonce; + avr->avr_sent_len = av->av_vec_len; - dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", + dccp_ackvec_insert_avr(av, avr); + + dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " + "ack_ackno=%llu\n", + dccp_role(sk), avr->avr_sent_len, (unsigned long long)avr->avr_ack_seqno, - (unsigned long long)avr->avr_ack_ackno, - avr->avr_ack_runlen); + (unsigned long long)avr->avr_ack_ackno); return 0; } -static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, - const u64 ackno) +struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) { - struct dccp_ackvec_record *avr; - /* - * Exploit that records are inserted in descending order of sequence - * number, start with the oldest record first. If @ackno is `before' - * the earliest ack_ackno, the packet is too old to be considered. - */ - list_for_each_entry_reverse(avr, av_list, avr_node) { - if (avr->avr_ack_seqno == ackno) - return avr; - if (before48(ackno, avr->avr_ack_seqno)) - break; + struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); + + if (av != NULL) { + av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; + av->av_buf_ackno = UINT48_MAX + 1; + av->av_buf_nonce = 0; + av->av_time = ktime_set(0, 0); + av->av_vec_len = 0; + INIT_LIST_HEAD(&av->av_records); } - return NULL; + + return av; } -/* - * Buffer index and length computation using modulo-buffersize arithmetic. - * Note that, as pointers move from right to left, head is `before' tail. - */ -static inline u16 __ackvec_idx_add(const u16 a, const u16 b) +void dccp_ackvec_free(struct dccp_ackvec *av) { - return (a + b) % DCCPAV_MAX_ACKVEC_LEN; + if (unlikely(av == NULL)) + return; + + if (!list_empty(&av->av_records)) { + struct dccp_ackvec_record *avr, *next; + + list_for_each_entry_safe(avr, next, &av->av_records, avr_node) { + list_del_init(&avr->avr_node); + dccp_ackvec_record_delete(avr); + } + } + + kmem_cache_free(dccp_ackvec_slab, av); } -static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) +static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, + const u32 index) { - return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); + return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; } -u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) +static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, + const u32 index) { - if (unlikely(av->av_overflow)) - return DCCPAV_MAX_ACKVEC_LEN; - return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); + return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; } -/** - * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 - * @av: non-empty buffer to update - * @distance: negative or zero distance of @seqno from buf_ackno downward - * @seqno: the (old) sequence number whose record is to be updated - * @state: state in which packet carrying @seqno was received +/* + * If several packets are missing, the HC-Receiver may prefer to enter multiple + * bytes with run length 0, rather than a single byte with a larger run length; + * this simplifies table updates if one of the missing packets arrives. */ -static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, - u64 seqno, enum dccp_ackvec_states state) +static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, + const unsigned int packets, + const unsigned char state) { - u16 ptr = av->av_buf_head; + unsigned int gap; + long new_head; - BUG_ON(distance > 0); - if (unlikely(dccp_ackvec_is_empty(av))) - return; + if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) + return -ENOBUFS; - do { - u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); + gap = packets - 1; + new_head = av->av_buf_head - packets; - if (distance + runlen >= 0) { - /* - * Only update the state if packet has not been received - * yet. This is OK as per the second table in RFC 4340, - * 11.4.1; i.e. here we are using the following table: - * RECEIVED - * 0 1 3 - * S +---+---+---+ - * T 0 | 0 | 0 | 0 | - * O +---+---+---+ - * R 1 | 1 | 1 | 1 | - * E +---+---+---+ - * D 3 | 0 | 1 | 3 | - * +---+---+---+ - * The "Not Received" state was set by reserve_seats(). - */ - if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) - av->av_buf[ptr] = state; - else - dccp_pr_debug("Not changing %llu state to %u\n", - (unsigned long long)seqno, state); - break; + if (new_head < 0) { + if (gap > 0) { + memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, + gap + new_head + 1); + gap = -new_head; } + new_head += DCCP_MAX_ACKVEC_LEN; + } - distance += runlen + 1; - ptr = __ackvec_idx_add(ptr, 1); + av->av_buf_head = new_head; - } while (ptr != av->av_buf_tail); -} + if (gap > 0) + memset(av->av_buf + av->av_buf_head + 1, + DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); -/* Mark @num entries after buf_head as "Not yet received". */ -static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) -{ - u16 start = __ackvec_idx_add(av->av_buf_head, 1), - len = DCCPAV_MAX_ACKVEC_LEN - start; - - /* check for buffer wrap-around */ - if (num > len) { - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); - start = 0; - num -= len; - } - if (num) - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); + av->av_buf[av->av_buf_head] = state; + av->av_vec_len += packets; + return 0; } -/** - * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer - * @av: container of buffer to update (can be empty or non-empty) - * @num_packets: number of packets to register (must be >= 1) - * @seqno: sequence number of the first packet in @num_packets - * @state: state in which packet carrying @seqno was received +/* + * Implements the RFC 4340, Appendix A */ -static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, - u64 seqno, enum dccp_ackvec_states state) +int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, + const u64 ackno, const u8 state) { - u32 num_cells = num_packets; + /* + * Check at the right places if the buffer is full, if it is, tell the + * caller to start dropping packets till the HC-Sender acks our ACK + * vectors, when we will free up space in av_buf. + * + * We may well decide to do buffer compression, etc, but for now lets + * just drop. + * + * From Appendix A.1.1 (`New Packets'): + * + * Of course, the circular buffer may overflow, either when the + * HC-Sender is sending data at a very high rate, when the + * HC-Receiver's acknowledgements are not reaching the HC-Sender, + * or when the HC-Sender is forgetting to acknowledge those acks + * (so the HC-Receiver is unable to clean up old state). In this + * case, the HC-Receiver should either compress the buffer (by + * increasing run lengths when possible), transfer its state to + * a larger buffer, or, as a last resort, drop all received + * packets, without processing them whatsoever, until its buffer + * shrinks again. + */ - if (num_packets > DCCPAV_BURST_THRESH) { - u32 lost_packets = num_packets - 1; + /* See if this is the first ackno being inserted */ + if (av->av_vec_len == 0) { + av->av_buf[av->av_buf_head] = state; + av->av_vec_len = 1; + } else if (after48(ackno, av->av_buf_ackno)) { + const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); - DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); /* - * We received 1 packet and have a loss of size "num_packets-1" - * which we squeeze into num_cells-1 rather than reserving an - * entire byte for each lost packet. - * The reason is that the vector grows in O(burst_length); when - * it grows too large there will no room left for the payload. - * This is a trade-off: if a few packets out of the burst show - * up later, their state will not be changed; it is simply too - * costly to reshuffle/reallocate/copy the buffer each time. - * Should such problems persist, we will need to switch to a - * different underlying data structure. + * Look if the state of this packet is the same as the + * previous ackno and if so if we can bump the head len. */ - for (num_packets = num_cells = 1; lost_packets; ++num_cells) { - u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN); - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); - av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; + if (delta == 1 && + dccp_ackvec_state(av, av->av_buf_head) == state && + dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK) + av->av_buf[av->av_buf_head]++; + else if (dccp_ackvec_set_buf_head_state(av, delta, state)) + return -ENOBUFS; + } else { + /* + * A.1.2. Old Packets + * + * When a packet with Sequence Number S <= buf_ackno + * arrives, the HC-Receiver will scan the table for + * the byte corresponding to S. (Indexing structures + * could reduce the complexity of this scan.) + */ + u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); + u32 index = av->av_buf_head; - lost_packets -= len; + while (1) { + const u8 len = dccp_ackvec_len(av, index); + const u8 av_state = dccp_ackvec_state(av, index); + /* + * valid packets not yet in av_buf have a reserved + * entry, with a len equal to 0. + */ + if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED && + len == 0 && delta == 0) { /* Found our + reserved seat! */ + dccp_pr_debug("Found %llu reserved seat!\n", + (unsigned long long)ackno); + av->av_buf[index] = state; + goto out; + } + /* len == 0 means one packet */ + if (delta < len + 1) + goto out_duplicate; + + delta -= len + 1; + if (++index == DCCP_MAX_ACKVEC_LEN) + index = 0; } } - if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { - DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); - av->av_overflow = true; - } - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); - if (av->av_overflow) - av->av_buf_tail = av->av_buf_head; - - av->av_buf[av->av_buf_head] = state; - av->av_buf_ackno = seqno; + av->av_buf_ackno = ackno; + av->av_time = ktime_get_real(); +out: + return 0; - if (num_packets > 1) - dccp_ackvec_reserve_seats(av, num_packets - 1); +out_duplicate: + /* Duplicate packet */ + dccp_pr_debug("Received a dup or already considered lost " + "packet: %llu\n", (unsigned long long)ackno); + return -EILSEQ; } -/** - * dccp_ackvec_input - Register incoming packet in the buffer - */ -void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) +static void dccp_ackvec_throw_record(struct dccp_ackvec *av, + struct dccp_ackvec_record *avr) { - u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; - enum dccp_ackvec_states state = DCCPAV_RECEIVED; + struct dccp_ackvec_record *next; - if (dccp_ackvec_is_empty(av)) { - dccp_ackvec_add_new(av, 1, seqno, state); - av->av_tail_ackno = seqno; + /* sort out vector length */ + if (av->av_buf_head <= avr->avr_ack_ptr) + av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; + else + av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 - + av->av_buf_head + avr->avr_ack_ptr; - } else { - s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); - u8 *current_head = av->av_buf + av->av_buf_head; - - if (num_packets == 1 && - dccp_ackvec_state(current_head) == state && - dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { + /* free records */ + list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { + list_del_init(&avr->avr_node); + dccp_ackvec_record_delete(avr); + } +} - *current_head += 1; - av->av_buf_ackno = seqno; +void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, + const u64 ackno) +{ + struct dccp_ackvec_record *avr; - } else if (num_packets > 0) { - dccp_ackvec_add_new(av, num_packets, seqno, state); - } else { - dccp_ackvec_update_old(av, num_packets, seqno, state); - } + /* + * If we traverse backwards, it should be faster when we have large + * windows. We will be receiving ACKs for stuff we sent a while back + * -sorbo. + */ + list_for_each_entry_reverse(avr, &av->av_records, avr_node) { + if (ackno == avr->avr_ack_seqno) { + dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " + "ack_ackno=%llu, ACKED!\n", + dccp_role(sk), 1, + (unsigned long long)avr->avr_ack_seqno, + (unsigned long long)avr->avr_ack_ackno); + dccp_ackvec_throw_record(av, avr); + break; + } else if (avr->avr_ack_seqno > ackno) + break; /* old news */ } } -/** - * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection - * This routine is called when the peer acknowledges the receipt of Ack Vectors - * up to and including @ackno. While based on on section A.3 of RFC 4340, here - * are additional precautions to prevent corrupted buffer state. In particular, - * we use tail_ackno to identify outdated records; it always marks the earliest - * packet of group (2) in 11.4.2. - */ -void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) - { - struct dccp_ackvec_record *avr, *next; - u8 runlen_now, eff_runlen; - s64 delta; +static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, + struct sock *sk, u64 *ackno, + const unsigned char len, + const unsigned char *vector) +{ + unsigned char i; + struct dccp_ackvec_record *avr; - avr = dccp_ackvec_lookup(&av->av_records, ackno); - if (avr == NULL) + /* Check if we actually sent an ACK vector */ + if (list_empty(&av->av_records)) return; - /* - * Deal with outdated acknowledgments: this arises when e.g. there are - * several old records and the acks from the peer come in slowly. In - * that case we may still have records that pre-date tail_ackno. - */ - delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); - if (delta < 0) - goto free_records; - /* - * Deal with overlapping Ack Vectors: don't subtract more than the - * number of packets between tail_ackno and ack_ackno. - */ - eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; - runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); + i = len; /* - * The run length of Ack Vector cells does not decrease over time. If - * the run length is the same as at the time the Ack Vector was sent, we - * free the ack_ptr cell. That cell can however not be freed if the run - * length has increased: in this case we need to move the tail pointer - * backwards (towards higher indices), to its next-oldest neighbour. + * XXX + * I think it might be more efficient to work backwards. See comment on + * rcv_ackno. -sorbo. */ - if (runlen_now > eff_runlen) { + avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); + while (i--) { + const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; + u64 ackno_end_rl; - av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; - av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); + dccp_set_seqno(&ackno_end_rl, *ackno - rl); - /* This move may not have cleared the overflow flag. */ - if (av->av_overflow) - av->av_overflow = (av->av_buf_head == av->av_buf_tail); - } else { - av->av_buf_tail = avr->avr_ack_ptr; /* - * We have made sure that avr points to a valid cell within the - * buffer. This cell is either older than head, or equals head - * (empty buffer): in both cases we no longer have any overflow. + * If our AVR sequence number is greater than the ack, go + * forward in the AVR list until it is not so. */ - av->av_overflow = 0; - } - - /* - * The peer has acknowledged up to and including ack_ackno. Hence the - * first packet in group (2) of 11.4.2 is the successor of ack_ackno. - */ - av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); + list_for_each_entry_from(avr, &av->av_records, avr_node) { + if (!after48(avr->avr_ack_seqno, *ackno)) + goto found; + } + /* End of the av_records list, not found, exit */ + break; +found: + if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { + const u8 state = *vector & DCCP_ACKVEC_STATE_MASK; + if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) { + dccp_pr_debug("%s ACK vector 0, len=%d, " + "ack_seqno=%llu, ack_ackno=%llu, " + "ACKED!\n", + dccp_role(sk), len, + (unsigned long long) + avr->avr_ack_seqno, + (unsigned long long) + avr->avr_ack_ackno); + dccp_ackvec_throw_record(av, avr); + break; + } + /* + * If it wasn't received, continue scanning... we might + * find another one. + */ + } -free_records: - list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { - list_del(&avr->avr_node); - kmem_cache_free(dccp_ackvec_record_slab, avr); + dccp_set_seqno(ackno, ackno_end_rl - 1); + ++vector; } } -/* - * Routines to keep track of Ack Vectors received in an skb - */ -int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) +int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, + u64 *ackno, const u8 opt, const u8 *value, const u8 len) { - struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); - - if (new == NULL) - return -ENOBUFS; - new->vec = vec; - new->len = len; - new->nonce = nonce; + if (len > DCCP_MAX_ACKVEC_OPT_LEN) + return -1; - list_add_tail(&new->node, head); + /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ + dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk, + ackno, len, value); return 0; } -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); - -void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) -{ - struct dccp_ackvec_parsed *cur, *next; - - list_for_each_entry_safe(cur, next, parsed_chunks, node) - kfree(cur); - INIT_LIST_HEAD(parsed_chunks); -} -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); int __init dccp_ackvec_init(void) { @@ -379,9 +449,10 @@ int __init dccp_ackvec_init(void) if (dccp_ackvec_slab == NULL) goto out_err; - dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", - sizeof(struct dccp_ackvec_record), - 0, SLAB_HWCACHE_ALIGN, NULL); + dccp_ackvec_record_slab = + kmem_cache_create("dccp_ackvec_record", + sizeof(struct dccp_ackvec_record), + 0, SLAB_HWCACHE_ALIGN, NULL); if (dccp_ackvec_record_slab == NULL) goto out_destroy_slab; diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 6cdca79a99f..bcb64fb4ace 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -3,134 +3,156 @@ /* * net/dccp/ackvec.h * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK + * An implementation of the DCCP protocol * Copyright (c) 2005 Arnaldo Carvalho de Melo + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include #include +#include #include #include -/* - * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, - * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 - * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives - * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. - * The maximum value is bounded by the u16 types for indices and functions. - */ -#define DCCPAV_NUM_ACKVECS 2 -#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) - -/* Estimated minimum average Ack Vector length - used for updating MPS */ -#define DCCPAV_MIN_OPTLEN 16 - -/* Threshold for coping with large bursts of losses */ -#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) - -enum dccp_ackvec_states { - DCCPAV_RECEIVED = 0x00, - DCCPAV_ECN_MARKED = 0x40, - DCCPAV_RESERVED = 0x80, - DCCPAV_NOT_RECEIVED = 0xC0 -}; -#define DCCPAV_MAX_RUNLEN 0x3F +/* Read about the ECN nonce to see why it is 253 */ +#define DCCP_MAX_ACKVEC_OPT_LEN 253 +/* We can spread an ack vector across multiple options */ +#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) -static inline u8 dccp_ackvec_runlen(const u8 *cell) -{ - return *cell & DCCPAV_MAX_RUNLEN; -} +#define DCCP_ACKVEC_STATE_RECEIVED 0 +#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) +#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) -static inline u8 dccp_ackvec_state(const u8 *cell) -{ - return *cell & ~DCCPAV_MAX_RUNLEN; -} +#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */ +#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */ -/** struct dccp_ackvec - Ack Vector main data structure +/** struct dccp_ackvec - ack vector + * + * This data structure is the one defined in RFC 4340, Appendix A. * - * This implements a fixed-size circular buffer within an array and is largely - * based on Appendix A of RFC 4340. + * @av_buf_head - circular buffer head + * @av_buf_tail - circular buffer tail + * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the + * buffer (i.e. %av_buf_head) + * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked + * by the buffer with State 0 * - * @av_buf: circular buffer storage area - * @av_buf_head: head index; begin of live portion in @av_buf - * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf - * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf - * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf - * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to - * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf - * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound - * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) + * Additionally, the HC-Receiver must keep some information about the + * Ack Vectors it has recently sent. For each packet sent carrying an + * Ack Vector, it remembers four variables: + * + * @av_records - list of dccp_ackvec_record + * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. + * + * @av_time - the time in usecs + * @av_buf - circular buffer of acknowledgeable packets */ struct dccp_ackvec { - u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; - u16 av_buf_head; - u16 av_buf_tail; - u64 av_buf_ackno:48; - u64 av_tail_ackno:48; - bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; - u8 av_overflow:1; + u64 av_buf_ackno; struct list_head av_records; + ktime_t av_time; + u16 av_buf_head; + u16 av_vec_len; + u8 av_buf_nonce; + u8 av_ack_nonce; + u8 av_buf[DCCP_MAX_ACKVEC_LEN]; }; -/** struct dccp_ackvec_record - Records information about sent Ack Vectors +/** struct dccp_ackvec_record - ack vector record * - * These list entries define the additional information which the HC-Receiver - * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. + * ACK vector record as defined in Appendix A of spec. * - * @avr_node: the list node in @av_records - * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on - * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to - * @avr_ack_ptr: pointer into @av_buf where this record starts - * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending - * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent + * The list is sorted by avr_ack_seqno * - * The list as a whole is sorted in descending order by @avr_ack_seqno. + * @avr_node - node in av_records + * @avr_ack_seqno - sequence number of the packet this record was sent on + * @avr_ack_ackno - sequence number being acknowledged + * @avr_ack_ptr - pointer into av_buf where this record starts + * @avr_ack_nonce - av_ack_nonce at the time this record was sent + * @avr_sent_len - lenght of the record in av_buf */ struct dccp_ackvec_record { struct list_head avr_node; - u64 avr_ack_seqno:48; - u64 avr_ack_ackno:48; + u64 avr_ack_seqno; + u64 avr_ack_ackno; u16 avr_ack_ptr; - u8 avr_ack_runlen; - u8 avr_ack_nonce:1; + u16 avr_sent_len; + u8 avr_ack_nonce; }; -extern int dccp_ackvec_init(void); +struct sock; +struct sk_buff; + +#ifdef CONFIG_IP_DCCP_ACKVEC +extern int dccp_ackvec_init(void); extern void dccp_ackvec_exit(void); extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); extern void dccp_ackvec_free(struct dccp_ackvec *av); -extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); -extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); -extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); -extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); +extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, + const u64 ackno, const u8 state); + +extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, + struct sock *sk, const u64 ackno); +extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, + u64 *ackno, const u8 opt, + const u8 *value, const u8 len); -static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) +extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb); + +static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) +{ + return av->av_vec_len; +} +#else /* CONFIG_IP_DCCP_ACKVEC */ +static inline int dccp_ackvec_init(void) { - return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; + return 0; } -/** - * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb - * @vec: start of vector (offset into skb) - * @len: length of @vec - * @nonce: whether @vec had an ECN nonce of 0 or 1 - * @node: FIFO - arranged in descending order of ack_ackno - * This structure is used by CCIDs to access Ack Vectors in a received skb. - */ -struct dccp_ackvec_parsed { - u8 *vec, - len, - nonce:1; - struct list_head node; -}; +static inline void dccp_ackvec_exit(void) +{ +} + +static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) +{ + return NULL; +} + +static inline void dccp_ackvec_free(struct dccp_ackvec *av) +{ +} + +static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, + const u64 ackno, const u8 state) +{ + return -1; +} -extern int dccp_ackvec_parsed_add(struct list_head *head, - u8 *vec, u8 len, u8 nonce); -extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); +static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, + struct sock *sk, const u64 ackno) +{ +} + +static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, + const u64 *ackno, const u8 opt, + const u8 *value, const u8 len) +{ + return -1; +} + +static inline int dccp_insert_option_ackvec(const struct sock *sk, + const struct sk_buff *skb) +{ + return -1; +} + +static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) +{ + return 0; +} +#endif /* CONFIG_IP_DCCP_ACKVEC */ #endif /* _ACKVEC_H */ diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index e3fb52b4f5c..4809753d12a 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -13,13 +13,6 @@ #include "ccid.h" -static u8 builtin_ccids[] = { - DCCPC_CCID2, /* CCID2 is supported by default */ -#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE) - DCCPC_CCID3, -#endif -}; - static struct ccid_operations *ccids[CCID_MAX]; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) static atomic_t ccids_lockct = ATOMIC_INIT(0); @@ -93,47 +86,6 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab) } } -/* check that up to @array_len members in @ccid_array are supported */ -bool ccid_support_check(u8 const *ccid_array, u8 array_len) -{ - u8 i, j, found; - - for (i = 0, found = 0; i < array_len; i++, found = 0) { - for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++) - found = (ccid_array[i] == builtin_ccids[j]); - if (!found) - return false; - } - return true; -} - -/** - * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array - * @ccid_array: pointer to copy into - * @array_len: value to return length into - * This function allocates memory - caller must see that it is freed after use. - */ -int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) -{ - *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any()); - if (*ccid_array == NULL) - return -ENOBUFS; - *array_len = ARRAY_SIZE(builtin_ccids); - return 0; -} - -int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *optval, int __user *optlen) -{ - if (len < sizeof(builtin_ccids)) - return -EINVAL; - - if (put_user(sizeof(builtin_ccids), optlen) || - copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids))) - return -EFAULT; - return 0; -} - int ccid_register(struct ccid_operations *ccid_ops) { int err = -ENOBUFS; @@ -196,41 +148,22 @@ int ccid_unregister(struct ccid_operations *ccid_ops) EXPORT_SYMBOL_GPL(ccid_unregister); -/** - * ccid_request_module - Pre-load CCID module for later use - * This should be called only from process context (e.g. during connection - * setup) and is necessary for later calls to ccid_new (typically in software - * interrupt), so that it has the modules available when they are needed. - */ -static int ccid_request_module(u8 id) -{ - if (!in_atomic()) { - ccids_read_lock(); - if (ccids[id] == NULL) { - ccids_read_unlock(); - return request_module("net-dccp-ccid-%d", id); - } - ccids_read_unlock(); - } - return 0; -} - -int ccid_request_modules(u8 const *ccid_array, u8 array_len) -{ -#ifdef CONFIG_KMOD - while (array_len--) - if (ccid_request_module(ccid_array[array_len])) - return -1; -#endif - return 0; -} - struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) { struct ccid_operations *ccid_ops; struct ccid *ccid = NULL; ccids_read_lock(); +#ifdef CONFIG_KMOD + if (ccids[id] == NULL) { + /* We only try to load if in process context */ + ccids_read_unlock(); + if (gfp & GFP_ATOMIC) + goto out; + request_module("net-dccp-ccid-%d", id); + ccids_read_lock(); + } +#endif ccid_ops = ccids[id]; if (ccid_ops == NULL) goto out_unlock; @@ -272,6 +205,20 @@ out_module_put: EXPORT_SYMBOL_GPL(ccid_new); +struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) +{ + return ccid_new(id, sk, 1, gfp); +} + +EXPORT_SYMBOL_GPL(ccid_hc_rx_new); + +struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) +{ + return ccid_new(id, sk, 0, gfp); +} + +EXPORT_SYMBOL_GPL(ccid_hc_tx_new); + static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) { struct ccid_operations *ccid_ops; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index d27054ba215..fdeae7b5731 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -60,18 +60,22 @@ struct ccid_operations { void (*ccid_hc_tx_exit)(struct sock *sk); void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); + int (*ccid_hc_rx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); int (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); + int (*ccid_hc_tx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, - unsigned int len); + int more, unsigned int len); void (*ccid_hc_rx_get_info)(struct sock *sk, struct tcp_info *info); void (*ccid_hc_tx_get_info)(struct sock *sk, @@ -99,78 +103,31 @@ static inline void *ccid_priv(const struct ccid *ccid) return (void *)ccid->ccid_priv; } -extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); -extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); -extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *, int __user *); - -extern int ccid_request_modules(u8 const *ccid_array, u8 array_len); extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp); -static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_rx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} - -static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_tx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} +extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, + gfp_t gfp); +extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, + gfp_t gfp); extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); -/* - * Congestion control of queued data packets via CCID decision. - * - * The TX CCID performs its congestion-control by indicating whether and when a - * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). - * The following modes are supported via the symbolic constants below: - * - timer-based pacing (CCID returns a delay value in milliseconds); - * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). - */ - -enum ccid_dequeueing_decision { - CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ - CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ - CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ - CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ - CCID_PACKET_ERR = 0xF0000, /* error condition */ -}; - -static inline int ccid_packet_dequeue_eval(const int return_code) -{ - if (return_code < 0) - return CCID_PACKET_ERR; - if (return_code == 0) - return CCID_PACKET_SEND_AT_ONCE; - if (return_code <= CCID_PACKET_DELAY_MAX) - return CCID_PACKET_DELAY; - return return_code; -} - static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { + int rc = 0; if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) - return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); - return CCID_PACKET_SEND_AT_ONCE; + rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); + return rc; } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, - unsigned int len) + int more, unsigned int len) { if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); + ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); } static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, @@ -187,31 +144,27 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); } -/** - * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver - * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) - * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) - * @val: value of @opt - * @len: length of @val in bytes - */ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) { - if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) - return 0; - return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); + int rc = 0; + if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) + rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, + value); + return rc; } -/** - * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender - * Arguments are analogous to ccid_hc_tx_parse_options() - */ static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) { - if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) - return 0; - return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); + int rc = 0; + if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) + rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value); + return rc; } static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index fb168be2cb4..12275943eab 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -1,8 +1,10 @@ menu "DCCP CCIDs Configuration (EXPERIMENTAL)" + depends on EXPERIMENTAL config IP_DCCP_CCID2 - tristate "CCID2 (TCP-Like)" + tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" def_tristate IP_DCCP + select IP_DCCP_ACKVEC ---help--- CCID 2, TCP-like Congestion Control, denotes Additive Increase, Multiplicative Decrease (AIMD) congestion control with behavior @@ -34,7 +36,7 @@ config IP_DCCP_CCID2_DEBUG If in doubt, say N. config IP_DCCP_CCID3 - tristate "CCID3 (TCP-Friendly)" + tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" def_tristate IP_DCCP select IP_DCCP_TFRC_LIB ---help--- @@ -62,9 +64,9 @@ config IP_DCCP_CCID3 If in doubt, say M. -if IP_DCCP_CCID3 config IP_DCCP_CCID3_DEBUG bool "CCID3 debugging messages" + depends on IP_DCCP_CCID3 ---help--- Enable CCID3-specific debugging messages. @@ -74,29 +76,10 @@ config IP_DCCP_CCID3_DEBUG If in doubt, say N. -choice - prompt "Select method for measuring the packet size s" - default IP_DCCP_CCID3_MEASURE_S_AS_MPS - -config IP_DCCP_CCID3_MEASURE_S_AS_MPS - bool "Always use MPS in place of s" - ---help--- - This use is recommended as it is consistent with the initialisation - of X and suggested when s varies (rfc3448bis, (1) in section 4.1). -config IP_DCCP_CCID3_MEASURE_S_AS_AVG - bool "Use moving average" - ---help--- - An alternative way of tracking s, also supported by rfc3448bis. - This used to be the default for CCID-3 in previous kernels. -config IP_DCCP_CCID3_MEASURE_S_AS_MAX - bool "Track the maximum payload length" - ---help--- - An experimental method based on tracking the maximum packet size. -endchoice - config IP_DCCP_CCID3_RTO int "Use higher bound for nofeedback timer" default 100 + depends on IP_DCCP_CCID3 && EXPERIMENTAL ---help--- Use higher lower bound for nofeedback timer expiration. @@ -123,7 +106,6 @@ config IP_DCCP_CCID3_RTO The purpose of the nofeedback timer is to slow DCCP down when there is serious network congestion: experimenting with larger values should therefore not be performed on WANs. -endif # IP_DCCP_CCID3 config IP_DCCP_TFRC_LIB tristate diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index fa713227c66..9a430734530 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -25,7 +25,7 @@ /* * This implementation should follow RFC 4341 */ -#include "../feat.h" + #include "../ccid.h" #include "../dccp.h" #include "ccid2.h" @@ -34,8 +34,51 @@ #ifdef CONFIG_IP_DCCP_CCID2_DEBUG static int ccid2_debug; #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) + +static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) +{ + int len = 0; + int pipe = 0; + struct ccid2_seq *seqp = hctx->ccid2hctx_seqh; + + /* there is data in the chain */ + if (seqp != hctx->ccid2hctx_seqt) { + seqp = seqp->ccid2s_prev; + len++; + if (!seqp->ccid2s_acked) + pipe++; + + while (seqp != hctx->ccid2hctx_seqt) { + struct ccid2_seq *prev = seqp->ccid2s_prev; + + len++; + if (!prev->ccid2s_acked) + pipe++; + + /* packets are sent sequentially */ + BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, + prev->ccid2s_seq ) >= 0); + BUG_ON(time_before(seqp->ccid2s_sent, + prev->ccid2s_sent)); + + seqp = prev; + } + } + + BUG_ON(pipe != hctx->ccid2hctx_pipe); + ccid2_pr_debug("len of chain=%d\n", len); + + do { + seqp = seqp->ccid2s_prev; + len++; + } while (seqp != hctx->ccid2hctx_seqh); + + ccid2_pr_debug("total len=%d\n", len); + BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN); +} #else #define ccid2_pr_debug(format, a...) +#define ccid2_hc_tx_check_sanity(hctx) #endif static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) @@ -44,7 +87,8 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) int i; /* check if we have space to preserve the pointer to the buffer */ - if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *)) + if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) / + sizeof(struct ccid2_seq*))) return -ENOMEM; /* allocate buffer and initialize linked list */ @@ -60,35 +104,38 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; /* This is the first allocation. Initiate the head and tail. */ - if (hctx->seqbufc == 0) - hctx->seqh = hctx->seqt = seqp; + if (hctx->ccid2hctx_seqbufc == 0) + hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp; else { /* link the existing list with the one we just created */ - hctx->seqh->ccid2s_next = seqp; - seqp->ccid2s_prev = hctx->seqh; + hctx->ccid2hctx_seqh->ccid2s_next = seqp; + seqp->ccid2s_prev = hctx->ccid2hctx_seqh; - hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt; + hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; + seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt; } /* store the original pointer to the buffer so we can free it */ - hctx->seqbuf[hctx->seqbufc] = seqp; - hctx->seqbufc++; + hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp; + hctx->ccid2hctx_seqbufc++; return 0; } static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { - if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) - return CCID_PACKET_WILL_DEQUEUE_LATER; - return CCID_PACKET_SEND_AT_ONCE; + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) + return 0; + + return 1; /* XXX CCID should dequeue when ready instead of polling */ } static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) { struct dccp_sock *dp = dccp_sk(sk); - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2); + u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); /* * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from @@ -100,8 +147,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); val = max_ratio; } - if (val > DCCPF_ACK_RATIO_MAX) - val = DCCPF_ACK_RATIO_MAX; + if (val > 0xFFFF) /* RFC 4340, 11.3 */ + val = 0xFFFF; if (val == dp->dccps_l_ack_ratio) return; @@ -110,77 +157,99 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } +static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) +{ + ccid2_pr_debug("change SRTT to %ld\n", val); + hctx->ccid2hctx_srtt = val; +} + +static void ccid2_start_rto_timer(struct sock *sk); + static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); + long s; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5); + sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, + jiffies + HZ / 5); goto out; } ccid2_pr_debug("RTO_EXPIRE\n"); + ccid2_hc_tx_check_sanity(hctx); + /* back-off timer */ - hctx->rto <<= 1; - if (hctx->rto > DCCP_RTO_MAX) - hctx->rto = DCCP_RTO_MAX; + hctx->ccid2hctx_rto <<= 1; + + s = hctx->ccid2hctx_rto / HZ; + if (s > 60) + hctx->ccid2hctx_rto = 60 * HZ; + + ccid2_start_rto_timer(sk); /* adjust pipe, cwnd etc */ - hctx->ssthresh = hctx->cwnd / 2; - if (hctx->ssthresh < 2) - hctx->ssthresh = 2; - hctx->cwnd = 1; - hctx->pipe = 0; + hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2; + if (hctx->ccid2hctx_ssthresh < 2) + hctx->ccid2hctx_ssthresh = 2; + hctx->ccid2hctx_cwnd = 1; + hctx->ccid2hctx_pipe = 0; /* clear state about stuff we sent */ - hctx->seqt = hctx->seqh; - hctx->packets_acked = 0; + hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; + hctx->ccid2hctx_packets_acked = 0; /* clear ack ratio state. */ - hctx->rpseq = 0; - hctx->rpdupack = -1; + hctx->ccid2hctx_rpseq = 0; + hctx->ccid2hctx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); - - /* if we were blocked before, we may now send cwnd=1 packet */ - if (sender_was_blocked) - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); - /* restart backed-off timer */ - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); + ccid2_hc_tx_check_sanity(hctx); out: bh_unlock_sock(sk); sock_put(sk); } -static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) +static void ccid2_start_rto_timer(struct sock *sk) +{ + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto); + + BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer)); + sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, + jiffies + hctx->ccid2hctx_rto); +} + +static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); struct ccid2_seq *next; - hctx->pipe++; + hctx->ccid2hctx_pipe++; - hctx->seqh->ccid2s_seq = dp->dccps_gss; - hctx->seqh->ccid2s_acked = 0; - hctx->seqh->ccid2s_sent = jiffies; + hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; + hctx->ccid2hctx_seqh->ccid2s_acked = 0; + hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; - next = hctx->seqh->ccid2s_next; + next = hctx->ccid2hctx_seqh->ccid2s_next; /* check if we need to alloc more space */ - if (next == hctx->seqt) { + if (next == hctx->ccid2hctx_seqt) { if (ccid2_hc_tx_alloc_seq(hctx)) { DCCP_CRIT("packet history - out of memory!"); /* FIXME: find a more graceful way to bail out */ return; } - next = hctx->seqh->ccid2s_next; - BUG_ON(next == hctx->seqt); + next = hctx->ccid2hctx_seqh->ccid2s_next; + BUG_ON(next == hctx->ccid2hctx_seqt); } - hctx->seqh = next; + hctx->ccid2hctx_seqh = next; - ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe); + ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, + hctx->ccid2hctx_pipe); /* * FIXME: The code below is broken and the variables have been removed @@ -203,12 +272,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) */ #if 0 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ - hctx->arsent++; + hctx->ccid2hctx_arsent++; /* We had an ack loss in this window... */ - if (hctx->ackloss) { - if (hctx->arsent >= hctx->cwnd) { - hctx->arsent = 0; - hctx->ackloss = 0; + if (hctx->ccid2hctx_ackloss) { + if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) { + hctx->ccid2hctx_arsent = 0; + hctx->ccid2hctx_ackloss = 0; } } else { /* No acks lost up to now... */ @@ -218,28 +287,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - dp->dccps_l_ack_ratio; - denom = hctx->cwnd * hctx->cwnd / denom; + denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom; - if (hctx->arsent >= denom) { + if (hctx->ccid2hctx_arsent >= denom) { ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); - hctx->arsent = 0; + hctx->ccid2hctx_arsent = 0; } } else { /* we can't increase ack ratio further [1] */ - hctx->arsent = 0; /* or maybe set it to cwnd*/ + hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ } } #endif /* setup RTO timer */ - if (!timer_pending(&hctx->rtotimer)) - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); + if (!timer_pending(&hctx->ccid2hctx_rtotimer)) + ccid2_start_rto_timer(sk); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { - struct ccid2_seq *seqp = hctx->seqt; + struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; - while (seqp != hctx->seqh) { + while (seqp != hctx->ccid2hctx_seqh) { ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", (unsigned long long)seqp->ccid2s_seq, seqp->ccid2s_acked, seqp->ccid2s_sent); @@ -247,158 +316,205 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) } } while (0); ccid2_pr_debug("=========\n"); + ccid2_hc_tx_check_sanity(hctx); #endif } -/** - * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm - * This code is almost identical with TCP's tcp_rtt_estimator(), since - * - it has a higher sampling frequency (recommended by RFC 1323), - * - the RTO does not collapse into RTT due to RTTVAR going towards zero, - * - it is simple (cf. more complex proposals such as Eifel timer or research - * which suggests that the gain should be set according to window size), - * - in tests it was found to work well with CCID2 [gerrit]. +/* XXX Lame code duplication! + * returns -1 if none was found. + * else returns the next offset to use in the function call. */ -static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) +static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, + unsigned char **vec, unsigned char *veclen) { - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - long m = mrtt ? : 1; - - if (hctx->srtt == 0) { - /* First measurement m */ - hctx->srtt = m << 3; - hctx->mdev = m << 1; - - hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev); - hctx->rttvar = hctx->mdev_max; - hctx->rtt_seq = dccp_sk(sk)->dccps_gss; - } else { - /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ - m -= (hctx->srtt >> 3); - hctx->srtt += m; - - /* Similarly, update scaled mdev with regard to |m| */ - if (m < 0) { - m = -m; - m -= (hctx->mdev >> 2); + const struct dccp_hdr *dh = dccp_hdr(skb); + unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); + unsigned char *opt_ptr; + const unsigned char *opt_end = (unsigned char *)dh + + (dh->dccph_doff * 4); + unsigned char opt, len; + unsigned char *value; + + BUG_ON(offset < 0); + options += offset; + opt_ptr = options; + if (opt_ptr >= opt_end) + return -1; + + while (opt_ptr != opt_end) { + opt = *opt_ptr++; + len = 0; + value = NULL; + + /* Check if this isn't a single byte option */ + if (opt > DCCPO_MAX_RESERVED) { + if (opt_ptr == opt_end) + goto out_invalid_option; + + len = *opt_ptr++; + if (len < 3) + goto out_invalid_option; /* - * This neutralises RTO increase when RTT < SRTT - mdev - * (see P. Sarolahti, A. Kuznetsov,"Congestion Control - * in Linux TCP", USENIX 2002, pp. 49-62). + * Remove the type and len fields, leaving + * just the value size */ - if (m > 0) - m >>= 3; - } else { - m -= (hctx->mdev >> 2); - } - hctx->mdev += m; + len -= 2; + value = opt_ptr; + opt_ptr += len; - if (hctx->mdev > hctx->mdev_max) { - hctx->mdev_max = hctx->mdev; - if (hctx->mdev_max > hctx->rttvar) - hctx->rttvar = hctx->mdev_max; + if (opt_ptr > opt_end) + goto out_invalid_option; } - /* - * Decay RTTVAR at most once per flight, exploiting that - * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) - * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) - * GAR is a useful bound for FlightSize = pipe, AWL is probably - * too low as it over-estimates pipe. - */ - if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) { - if (hctx->mdev_max < hctx->rttvar) - hctx->rttvar -= (hctx->rttvar - - hctx->mdev_max) >> 2; - hctx->rtt_seq = dccp_sk(sk)->dccps_gss; - hctx->mdev_max = TCP_RTO_MIN; + switch (opt) { + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + *vec = value; + *veclen = len; + return offset + (opt_ptr - options); } } - /* - * Set RTO from SRTT and RTTVAR - * Clock granularity is ignored since the minimum error for RTTVAR is - * clamped to 50msec (corresponding to HZ=20). This leads to a minimum - * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP - * does not retransmit data, DCCP does not require TCP's recommended - * minimum timeout of one second". - */ - hctx->rto = (hctx->srtt >> 3) + hctx->rttvar; + return -1; - if (hctx->rto > DCCP_RTO_MAX) - hctx->rto = DCCP_RTO_MAX; +out_invalid_option: + DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); + return -1; } -static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, - unsigned int *maxincr) +static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (hctx->cwnd < hctx->ssthresh) { - if (*maxincr > 0 && ++hctx->packets_acked == 2) { - hctx->cwnd += 1; - *maxincr -= 1; - hctx->packets_acked = 0; - } - } else if (++hctx->packets_acked >= hctx->cwnd) { - hctx->cwnd += 1; - hctx->packets_acked = 0; - } - /* - * FIXME: RTT is sampled several times per acknowledgment (for each - * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). - * This causes the RTT to be over-estimated, since the older entries - * in the Ack Vector have earlier sending times. - * The cleanest solution is to not use the ccid2s_sent field at all - * and instead use DCCP timestamps - need to be resolved at some time. - */ - ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); + sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer); + ccid2_pr_debug("deleted RTO timer\n"); } -static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) +static inline void ccid2_new_ack(struct sock *sk, + struct ccid2_seq *seqp, + unsigned int *maxincr) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (time_before(seqp->ccid2s_sent, hctx->last_cong)) { - ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); - return; + if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { + if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { + hctx->ccid2hctx_cwnd += 1; + *maxincr -= 1; + hctx->ccid2hctx_packets_acked = 0; + } + } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { + hctx->ccid2hctx_cwnd += 1; + hctx->ccid2hctx_packets_acked = 0; } - hctx->last_cong = jiffies; + /* update RTO */ + if (hctx->ccid2hctx_srtt == -1 || + time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) { + unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; + int s; + + /* first measurement */ + if (hctx->ccid2hctx_srtt == -1) { + ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", + r, jiffies, + (unsigned long long)seqp->ccid2s_seq); + ccid2_change_srtt(hctx, r); + hctx->ccid2hctx_rttvar = r >> 1; + } else { + /* RTTVAR */ + long tmp = hctx->ccid2hctx_srtt - r; + long srtt; + + if (tmp < 0) + tmp *= -1; + + tmp >>= 2; + hctx->ccid2hctx_rttvar *= 3; + hctx->ccid2hctx_rttvar >>= 2; + hctx->ccid2hctx_rttvar += tmp; + + /* SRTT */ + srtt = hctx->ccid2hctx_srtt; + srtt *= 7; + srtt >>= 3; + tmp = r >> 3; + srtt += tmp; + ccid2_change_srtt(hctx, srtt); + } + s = hctx->ccid2hctx_rttvar << 2; + /* clock granularity is 1 when based on jiffies */ + if (!s) + s = 1; + hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s; + + /* must be at least a second */ + s = hctx->ccid2hctx_rto / HZ; + /* DCCP doesn't require this [but I like it cuz my code sux] */ +#if 1 + if (s < 1) + hctx->ccid2hctx_rto = HZ; +#endif + /* max 60 seconds */ + if (s > 60) + hctx->ccid2hctx_rto = HZ * 60; - hctx->cwnd = hctx->cwnd / 2 ? : 1U; - hctx->ssthresh = max(hctx->cwnd, 2U); + hctx->ccid2hctx_lastrtt = jiffies; - /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ - if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd) - ccid2_change_l_ack_ratio(sk, hctx->cwnd); + ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", + hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, + hctx->ccid2hctx_rto, HZ, r); + } + + /* we got a new ack, so re-start RTO timer */ + ccid2_hc_tx_kill_rto_timer(sk); + ccid2_start_rto_timer(sk); } -static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) +static void ccid2_hc_tx_dec_pipe(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - switch (option) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen, - option - DCCPO_ACK_VECTOR_0); + if (hctx->ccid2hctx_pipe == 0) + DCCP_BUG("pipe == 0"); + else + hctx->ccid2hctx_pipe--; + + if (hctx->ccid2hctx_pipe == 0) + ccid2_hc_tx_kill_rto_timer(sk); +} + +static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) +{ + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { + ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); + return; } - return 0; + + hctx->ccid2hctx_last_cong = jiffies; + + hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; + hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); + + /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ + if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) + ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); } static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); - struct dccp_ackvec_parsed *avp; u64 ackno, seqno; struct ccid2_seq *seqp; + unsigned char *vector; + unsigned char veclen; + int offset = 0; int done = 0; unsigned int maxincr = 0; + ccid2_hc_tx_check_sanity(hctx); /* check reverse path congestion */ seqno = DCCP_SKB_CB(skb)->dccpd_seq; @@ -407,21 +523,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * -sorbo. */ /* need to bootstrap */ - if (hctx->rpdupack == -1) { - hctx->rpdupack = 0; - hctx->rpseq = seqno; + if (hctx->ccid2hctx_rpdupack == -1) { + hctx->ccid2hctx_rpdupack = 0; + hctx->ccid2hctx_rpseq = seqno; } else { /* check if packet is consecutive */ - if (dccp_delta_seqno(hctx->rpseq, seqno) == 1) - hctx->rpseq = seqno; + if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1) + hctx->ccid2hctx_rpseq = seqno; /* it's a later packet */ - else if (after48(seqno, hctx->rpseq)) { - hctx->rpdupack++; + else if (after48(seqno, hctx->ccid2hctx_rpseq)) { + hctx->ccid2hctx_rpdupack++; /* check if we got enough dupacks */ - if (hctx->rpdupack >= NUMDUPACK) { - hctx->rpdupack = -1; /* XXX lame */ - hctx->rpseq = 0; + if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { + hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ + hctx->ccid2hctx_rpseq = 0; ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); } @@ -429,22 +545,27 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* check forward path congestion */ - if (dccp_packet_without_ack(skb)) + /* still didn't send out new data packets */ + if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) return; - /* still didn't send out new data packets */ - if (hctx->seqh == hctx->seqt) - goto done; + switch (DCCP_SKB_CB(skb)->dccpd_type) { + case DCCP_PKT_ACK: + case DCCP_PKT_DATAACK: + break; + default: + return; + } ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - if (after48(ackno, hctx->high_ack)) - hctx->high_ack = ackno; + if (after48(ackno, hctx->ccid2hctx_high_ack)) + hctx->ccid2hctx_high_ack = ackno; - seqp = hctx->seqt; + seqp = hctx->ccid2hctx_seqt; while (before48(seqp->ccid2s_seq, ackno)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->seqh) { - seqp = hctx->seqh->ccid2s_prev; + if (seqp == hctx->ccid2hctx_seqh) { + seqp = hctx->ccid2hctx_seqh->ccid2s_prev; break; } } @@ -454,26 +575,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * packets per acknowledgement. Rounding up avoids that cwnd is not * advanced when Ack Ratio is 1 and gives a slight edge otherwise. */ - if (hctx->cwnd < hctx->ssthresh) + if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); /* go through all ack vectors */ - list_for_each_entry(avp, &hctx->av_chunks, node) { + while ((offset = ccid2_ackvector(sk, skb, offset, + &vector, &veclen)) != -1) { /* go through this ack vector */ - for (; avp->len--; avp->vec++) { - u64 ackno_end_rl = SUB48(ackno, - dccp_ackvec_runlen(avp->vec)); + while (veclen--) { + const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; + u64 ackno_end_rl = SUB48(ackno, rl); - ccid2_pr_debug("ackvec %llu |%u,%u|\n", + ccid2_pr_debug("ackvec start:%llu end:%llu\n", (unsigned long long)ackno, - dccp_ackvec_state(avp->vec) >> 6, - dccp_ackvec_runlen(avp->vec)); + (unsigned long long)ackno_end_rl); /* if the seqno we are analyzing is larger than the * current ackno, then move towards the tail of our * seqnos. */ while (after48(seqp->ccid2s_seq, ackno)) { - if (seqp == hctx->seqt) { + if (seqp == hctx->ccid2hctx_seqt) { done = 1; break; } @@ -486,24 +607,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * run length */ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = dccp_ackvec_state(avp->vec); + const u8 state = *vector & + DCCP_ACKVEC_STATE_MASK; /* new packet received or marked */ - if (state != DCCPAV_NOT_RECEIVED && + if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && !seqp->ccid2s_acked) { - if (state == DCCPAV_ECN_MARKED) + if (state == + DCCP_ACKVEC_STATE_ECN_MARKED) { ccid2_congestion_event(sk, seqp); - else + } else ccid2_new_ack(sk, seqp, &maxincr); seqp->ccid2s_acked = 1; ccid2_pr_debug("Got ack for %llu\n", (unsigned long long)seqp->ccid2s_seq); - hctx->pipe--; + ccid2_hc_tx_dec_pipe(sk); } - if (seqp == hctx->seqt) { + if (seqp == hctx->ccid2hctx_seqt) { done = 1; break; } @@ -513,6 +636,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) break; ackno = SUB48(ackno_end_rl, 1); + vector++; } if (done) break; @@ -521,11 +645,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* The state about what is acked should be correct now * Check for NUMDUPACK */ - seqp = hctx->seqt; - while (before48(seqp->ccid2s_seq, hctx->high_ack)) { + seqp = hctx->ccid2hctx_seqt; + while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->seqh) { - seqp = hctx->seqh->ccid2s_prev; + if (seqp == hctx->ccid2hctx_seqh) { + seqp = hctx->ccid2hctx_seqh->ccid2s_prev; break; } } @@ -536,7 +660,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (done == NUMDUPACK) break; } - if (seqp == hctx->seqt) + if (seqp == hctx->ccid2hctx_seqt) break; seqp = seqp->ccid2s_prev; } @@ -557,34 +681,25 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * one ack vector. */ ccid2_congestion_event(sk, seqp); - hctx->pipe--; + ccid2_hc_tx_dec_pipe(sk); } - if (seqp == hctx->seqt) + if (seqp == hctx->ccid2hctx_seqt) break; seqp = seqp->ccid2s_prev; } - hctx->seqt = last_acked; + hctx->ccid2hctx_seqt = last_acked; } /* trim acked packets in tail */ - while (hctx->seqt != hctx->seqh) { - if (!hctx->seqt->ccid2s_acked) + while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) { + if (!hctx->ccid2hctx_seqt->ccid2s_acked) break; - hctx->seqt = hctx->seqt->ccid2s_next; + hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next; } - /* restart RTO timer if not all outstanding data has been acked */ - if (hctx->pipe == 0) - sk_stop_timer(sk, &hctx->rtotimer); - else - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); -done: - /* check if incoming Acks allow pending packets to be sent */ - if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); - dccp_ackvec_parsed_cleanup(&hctx->av_chunks); + ccid2_hc_tx_check_sanity(hctx); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) @@ -594,13 +709,17 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) u32 max_ratio; /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ - hctx->ssthresh = ~0U; + hctx->ccid2hctx_ssthresh = ~0U; - /* Use larger initial windows (RFC 3390, rfc2581bis) */ - hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); + /* + * RFC 4341, 5: "The cwnd parameter is initialized to at most four + * packets for new connections, following the rules from [RFC3390]". + * We need to convert the bytes of RFC3390 into the packets of RFC 4341. + */ + hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); /* Make sure that Ack Ratio is enabled and within bounds. */ - max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); + max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) dp->dccps_l_ack_ratio = max_ratio; @@ -608,11 +727,15 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) if (ccid2_hc_tx_alloc_seq(hctx)) return -ENOMEM; - hctx->rto = DCCP_TIMEOUT_INIT; - hctx->rpdupack = -1; - hctx->last_cong = jiffies; - setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); - INIT_LIST_HEAD(&hctx->av_chunks); + hctx->ccid2hctx_rto = 3 * HZ; + ccid2_change_srtt(hctx, -1); + hctx->ccid2hctx_rttvar = -1; + hctx->ccid2hctx_rpdupack = -1; + hctx->ccid2hctx_last_cong = jiffies; + setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire, + (unsigned long)sk); + + ccid2_hc_tx_check_sanity(hctx); return 0; } @@ -621,11 +744,11 @@ static void ccid2_hc_tx_exit(struct sock *sk) struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); int i; - sk_stop_timer(sk, &hctx->rtotimer); + ccid2_hc_tx_kill_rto_timer(sk); - for (i = 0; i < hctx->seqbufc; i++) - kfree(hctx->seqbuf[i]); - hctx->seqbufc = 0; + for (i = 0; i < hctx->ccid2hctx_seqbufc; i++) + kfree(hctx->ccid2hctx_seqbuf[i]); + hctx->ccid2hctx_seqbufc = 0; } static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) @@ -636,28 +759,27 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) switch (DCCP_SKB_CB(skb)->dccpd_type) { case DCCP_PKT_DATA: case DCCP_PKT_DATAACK: - hcrx->data++; - if (hcrx->data >= dp->dccps_r_ack_ratio) { + hcrx->ccid2hcrx_data++; + if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) { dccp_send_ack(sk); - hcrx->data = 0; + hcrx->ccid2hcrx_data = 0; } break; } } static struct ccid_operations ccid2 = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_owner = THIS_MODULE, - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, + .ccid_id = DCCPC_CCID2, + .ccid_name = "TCP-like", + .ccid_owner = THIS_MODULE, + .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), + .ccid_hc_tx_init = ccid2_hc_tx_init, + .ccid_hc_tx_exit = ccid2_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, + .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, + .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), + .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, }; #ifdef CONFIG_IP_DCCP_CCID2_DEBUG diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 8b7a2dee2f6..2c94ca02901 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -42,49 +42,34 @@ struct ccid2_seq { /** struct ccid2_hc_tx_sock - CCID2 TX half connection * - * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 - * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @srtt: smoothed RTT estimate, scaled by 2^3 - * @mdev: smoothed RTT variation, scaled by 2^2 - * @mdev_max: maximum of @mdev during one flight - * @rttvar: moving average/maximum of @mdev_max - * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) - * @rtt_seq: to decay RTTVAR at most once per flight - * @rpseq: last consecutive seqno - * @rpdupack: dupacks since rpseq - * @av_chunks: list of Ack Vectors received on current skb - */ + * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 + * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465) + * @ccid2hctx_lastrtt -time RTT was last measured + * @ccid2hctx_rpseq - last consecutive seqno + * @ccid2hctx_rpdupack - dupacks since rpseq +*/ struct ccid2_hc_tx_sock { - u32 cwnd; - u32 ssthresh; - u32 pipe; - u32 packets_acked; - struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX]; - int seqbufc; - struct ccid2_seq *seqh; - struct ccid2_seq *seqt; - /* RTT measurement: variables/principles are the same as in TCP */ - u32 srtt, - mdev, - mdev_max, - rttvar, - rto; - u64 rtt_seq:48; - struct timer_list rtotimer; - u64 rpseq; - int rpdupack; - unsigned long last_cong; - u64 high_ack; - struct list_head av_chunks; + u32 ccid2hctx_cwnd; + u32 ccid2hctx_ssthresh; + u32 ccid2hctx_pipe; + u32 ccid2hctx_packets_acked; + struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; + int ccid2hctx_seqbufc; + struct ccid2_seq *ccid2hctx_seqh; + struct ccid2_seq *ccid2hctx_seqt; + long ccid2hctx_rto; + long ccid2hctx_srtt; + long ccid2hctx_rttvar; + unsigned long ccid2hctx_lastrtt; + struct timer_list ccid2hctx_rtotimer; + u64 ccid2hctx_rpseq; + int ccid2hctx_rpdupack; + unsigned long ccid2hctx_last_cong; + u64 ccid2hctx_high_ack; }; -static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx) -{ - return (hctx->pipe >= hctx->cwnd); -} - struct ccid2_hc_rx_sock { - int data; + int ccid2hcrx_data; }; static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 06cfdad84a6..3b8bd7ca676 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -49,41 +49,75 @@ static int ccid3_debug; /* * Transmitter Half-Connection Routines */ -/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */ -static int do_osc_prev = true; +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) +{ + static char *ccid3_state_names[] = { + [TFRC_SSTATE_NO_SENT] = "NO_SENT", + [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", + [TFRC_SSTATE_FBACK] = "FBACK", + [TFRC_SSTATE_TERM] = "TERM", + }; + + return ccid3_state_names[state]; +} +#endif + +static void ccid3_hc_tx_set_state(struct sock *sk, + enum ccid3_hc_tx_states state) +{ + struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_tx_state_name(oldstate), + ccid3_tx_state_name(state)); + WARN_ON(state == oldstate); + hctx->ccid3hctx_state = state; +} /* * Compute the initial sending rate X_init in the manner of RFC 3390: * - * X_init = min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT + * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT * + * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis + * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. * For consistency with other parts of the code, X_init is scaled by 2^6. */ static inline u64 rfc3390_initial_rate(struct sock *sk) { - const u32 mps = dccp_sk(sk)->dccps_mss_cache, - w_init = clamp(4380U, 2 * mps, 4 * mps); + const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + const __u32 w_init = clamp_t(__u32, 4380U, + 2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s); - return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt); + return scaled_div(w_init << 6, hctx->ccid3hctx_rtt); } -/** - * ccid3_update_send_interval - Calculate new t_ipi = s / X - * This respects the granularity of X (64 * bytes/second) and enforces the - * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342. +/* + * Recalculate t_ipi and delta (should be called whenever X changes) */ static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) { - if (unlikely(hctx->x <= hctx->s)) - hctx->x = hctx->s; - hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); + /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ + hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, + hctx->ccid3hctx_x); + + /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ + hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, + TFRC_OPSYS_HALF_TIME_GRAN); + + ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", + hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta, + hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6)); + } static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) { - u32 delta = ktime_us_delta(now, hctx->t_last_win_count); + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); - return delta / hctx->rtt; + return delta / hctx->ccid3hctx_rtt; } /** @@ -99,8 +133,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - u64 min_rate = 2 * hctx->x_recv; - const u64 old_x = hctx->x; + __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; + const __u64 old_x = hctx->ccid3hctx_x; ktime_t now = stamp ? *stamp : ktime_get_real(); /* @@ -111,44 +145,50 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) */ if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { min_rate = rfc3390_initial_rate(sk); - min_rate = max(min_rate, 2 * hctx->x_recv); + min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); } - if (hctx->p > 0) { + if (hctx->ccid3hctx_p > 0) { - hctx->x = min(((u64)hctx->x_calc) << 6, min_rate); + hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, + min_rate); + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, + (((__u64)hctx->ccid3hctx_s) << 6) / + TFRC_T_MBI); - } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) { + } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld) + - (s64)hctx->ccid3hctx_rtt >= 0) { - hctx->x = min(2 * hctx->x, min_rate); - hctx->x = max(hctx->x, - scaled_div(((u64)hctx->s) << 6, hctx->rtt)); - hctx->t_ld = now; + hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate); + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, + scaled_div(((__u64)hctx->ccid3hctx_s) << 6, + hctx->ccid3hctx_rtt)); + hctx->ccid3hctx_t_ld = now; } - if (hctx->x != old_x) { + if (hctx->ccid3hctx_x != old_x) { ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " "X_recv=%u\n", (unsigned)(old_x >> 6), - (unsigned)(hctx->x >> 6), hctx->x_calc, - (unsigned)(hctx->x_recv >> 6)); + (unsigned)(hctx->ccid3hctx_x >> 6), + hctx->ccid3hctx_x_calc, + (unsigned)(hctx->ccid3hctx_x_recv >> 6)); ccid3_update_send_interval(hctx); } } /* - * ccid3_hc_tx_measure_packet_size - Measuring the packet size `s' (sec 4.1) - * @new_len: DCCP payload size in bytes (not used by all methods) + * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) + * @len: DCCP packet payload size in bytes */ -static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) +static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) { -#if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG) - return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9); -#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX) - return max(ccid3_hc_tx_sk(sk)->s, new_len); -#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */ - return dccp_sk(sk)->dccps_mss_cache; -#endif + const u16 old_s = hctx->ccid3hctx_s; + + hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9); + + if (hctx->ccid3hctx_s != old_s) + ccid3_update_send_interval(hctx); } /* @@ -158,13 +198,13 @@ static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, ktime_t now) { - u32 delta = ktime_us_delta(now, hctx->t_last_win_count), - quarter_rtts = (4 * delta) / hctx->rtt; + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count), + quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt; if (quarter_rtts > 0) { - hctx->t_last_win_count = now; - hctx->last_win_count += min(quarter_rtts, 5U); - hctx->last_win_count &= 0xF; /* mod 16 */ + hctx->ccid3hctx_t_last_win_count = now; + hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U); + hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ } } @@ -181,26 +221,25 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) goto restart_timer; } - ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk, - hctx->feedback ? "" : "out"); + ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state)); - /* Ignore and do not restart after leaving the established state */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) + if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK) + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) goto out; - /* Reset feedback state to "no feedback received" */ - hctx->feedback = false; - /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 - * RTO is 0 if and only if no feedback has been received yet. */ - if (hctx->t_rto == 0 || hctx->p == 0) { + if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */ + hctx->ccid3hctx_p == 0) { /* halve send rate directly */ - hctx->x /= 2; + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, + (((__u64)hctx->ccid3hctx_s) << 6) / + TFRC_T_MBI); ccid3_update_send_interval(hctx); - } else { /* * Modify the cached value of X_recv @@ -212,41 +251,44 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) * * Note that X_recv is scaled by 2^6 while X_calc is not */ - BUG_ON(hctx->p && !hctx->x_calc); + BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); - if (hctx->x_calc > (hctx->x_recv >> 5)) - hctx->x_recv /= 2; + if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5)) + hctx->ccid3hctx_x_recv = + max(hctx->ccid3hctx_x_recv / 2, + (((__u64)hctx->ccid3hctx_s) << 6) / + (2 * TFRC_T_MBI)); else { - hctx->x_recv = hctx->x_calc; - hctx->x_recv <<= 4; + hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; + hctx->ccid3hctx_x_recv <<= 4; } ccid3_hc_tx_update_x(sk, NULL); } ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", - (unsigned long long)hctx->x); + (unsigned long long)hctx->ccid3hctx_x); /* * Set new timeout for the nofeedback timer. * See comments in packet_recv() regarding the value of t_RTO. */ - if (unlikely(hctx->t_rto == 0)) /* no feedback received yet */ + if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */ t_nfb = TFRC_INITIAL_TIMEOUT; else - t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); restart_timer: - sk_reset_timer(sk, &hctx->no_feedback_timer, + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); out: bh_unlock_sock(sk); sock_put(sk); } -/** - * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets - * @skb: next packet candidate to send on @sk - * This function uses the convention of ccid_packet_dequeue_eval() and - * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. +/* + * returns + * > 0: delay (in msecs) that should pass before actually sending + * = 0: can send immediately + * < 0: error condition; do not send packet */ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { @@ -263,14 +305,18 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) if (unlikely(skb->len == 0)) return -EBADMSG; - if (hctx->s == 0) { - sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + (jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); - hctx->last_win_count = 0; - hctx->t_last_win_count = now; + hctx->ccid3hctx_last_win_count = 0; + hctx->ccid3hctx_t_last_win_count = now; /* Set t_0 for initial packet */ - hctx->t_nom = now; + hctx->ccid3hctx_t_nom = now; + + hctx->ccid3hctx_s = skb->len; /* * Use initial RTT sample when available: recommended by erratum @@ -279,9 +325,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) */ if (dp->dccps_syn_rtt) { ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); - hctx->rtt = dp->dccps_syn_rtt; - hctx->x = rfc3390_initial_rate(sk); - hctx->t_ld = now; + hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); + hctx->ccid3hctx_t_ld = now; } else { /* * Sender does not have RTT sample: @@ -289,20 +335,17 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * is needed in several parts (e.g. window counter); * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. */ - hctx->rtt = DCCP_FALLBACK_RTT; - hctx->x = dp->dccps_mss_cache; - hctx->x <<= 6; + hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT; + hctx->ccid3hctx_x = hctx->ccid3hctx_s; + hctx->ccid3hctx_x <<= 6; } - - /* Compute t_ipi = s / X */ - hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len); ccid3_update_send_interval(hctx); - /* Seed value for Oscillation Prevention (sec. 4.5) */ - hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt); - - } else { - delay = ktime_us_delta(hctx->t_nom, now); + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + break; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); /* * Scheduling of packet transmissions [RFC 3448, 4.6] @@ -312,80 +355,99 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * else * // send the packet in (t_nom - t_now) milliseconds. */ - if (delay >= TFRC_T_DELTA) - return (u32)delay / USEC_PER_MSEC; + if (delay - (s64)hctx->ccid3hctx_delta >= 1000) + return (u32)delay / 1000L; ccid3_hc_tx_update_win_count(hctx, now); + break; + case TFRC_SSTATE_TERM: + DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); + return -EINVAL; } /* prepare to send now (add options etc.) */ dp->dccps_hc_tx_insert_options = 1; - DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count; + DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; /* set the nominal send time for the next following packet */ - hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); - return CCID_PACKET_SEND_AT_ONCE; + hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); + return 0; } -static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) +static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, + unsigned int len) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - /* Changes to s will become effective the next time X is computed */ - hctx->s = ccid3_hc_tx_measure_packet_size(sk, len); + ccid3_hc_tx_update_s(hctx, len); - if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) + if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) DCCP_CRIT("packet history - out of memory!"); } static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct tfrc_tx_hist_entry *acked; + struct ccid3_options_received *opt_recv; ktime_t now; unsigned long t_nfb; - u32 r_sample; + u32 pinv, r_sample; /* we are only interested in ACKs */ if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) return; - /* - * Locate the acknowledged packet in the TX history. - * - * Returning "entry not found" here can for instance happen when - * - the host has not sent out anything (e.g. a passive server), - * - the Ack is outdated (packet with higher Ack number was received), - * - it is a bogus Ack (for a packet not sent on this connection). - */ - acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb)); - if (acked == NULL) + /* ... and only in the established state */ + if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && + hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + return; + + opt_recv = &hctx->ccid3hctx_options_received; + now = ktime_get_real(); + + /* Estimate RTT from history if ACK number is valid */ + r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, + DCCP_SKB_CB(skb)->dccpd_ack_seq, now); + if (r_sample == 0) { + DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), + (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); return; - /* For the sake of RTT sampling, ignore/remove all older entries */ - tfrc_tx_hist_purge(&acked->next); + } - /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ - now = ktime_get_real(); - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); - hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); + /* Update receive rate in units of 64 * bytes/second */ + hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; + hctx->ccid3hctx_x_recv <<= 6; + /* Update loss event rate (which is scaled by 1e6) */ + pinv = opt_recv->ccid3or_loss_event_rate; + if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ + hctx->ccid3hctx_p = 0; + else /* can not exceed 100% */ + hctx->ccid3hctx_p = scaled_div(1, pinv); + /* + * Validate new RTT sample and update moving average + */ + r_sample = dccp_sample_rtt(sk, r_sample); + hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9); /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ - if (!hctx->feedback) { - hctx->feedback = true; + if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - if (hctx->t_rto == 0) { + if (hctx->ccid3hctx_t_rto == 0) { /* * Initial feedback packet: Larger Initial Windows (4.2) */ - hctx->x = rfc3390_initial_rate(sk); - hctx->t_ld = now; + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); + hctx->ccid3hctx_t_ld = now; ccid3_update_send_interval(hctx); goto done_computing_x; - } else if (hctx->p == 0) { + } else if (hctx->ccid3hctx_p == 0) { /* * First feedback after nofeedback timer expiry (4.3) */ @@ -394,52 +456,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ - if (hctx->p > 0) - hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p); + if (hctx->ccid3hctx_p > 0) + hctx->ccid3hctx_x_calc = + tfrc_calc_x(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p); ccid3_hc_tx_update_x(sk, &now); done_computing_x: ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " "p=%u, X_calc=%u, X_recv=%u, X=%u\n", - dccp_role(sk), sk, hctx->rtt, r_sample, - hctx->s, hctx->p, hctx->x_calc, - (unsigned)(hctx->x_recv >> 6), - (unsigned)(hctx->x >> 6)); - /* - * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to - * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This - * can be useful if few connections share a link, avoiding that buffer - * fill levels (RTT) oscillate as a result of frequent adjustments to X. - * A useful presentation with background information is in - * Joerg Widmer, "Equation-Based Congestion Control", - * MSc Thesis, University of Mannheim, Germany, 2000 - * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation"). - */ - if (do_osc_prev) { - r_sample = tfrc_scaled_sqrt(r_sample); - /* - * The modulation can work in both ways: increase/decrease t_ipi - * according to long-term increases/decreases of the RTT. The - * former is a useful measure, since it works against queue - * build-up. The latter temporarily increases the sending rate, - * so that buffers fill up more quickly. This in turn causes - * the RTT to increase, so that either later reduction becomes - * necessary or the RTT stays at a very high level. Decreasing - * t_ipi is therefore not supported. - * Furthermore, during the initial slow-start phase the RTT - * naturally increases, where using the algorithm would cause - * delays. Hence it is disabled during the initial slow-start. - */ - if (r_sample > hctx->r_sqmean && hctx->p > 0) - hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample, - hctx->r_sqmean); - hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI); - /* update R_sqmean _after_ computing the modulation factor */ - hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9); - } + dccp_role(sk), + sk, hctx->ccid3hctx_rtt, r_sample, + hctx->ccid3hctx_s, hctx->ccid3hctx_p, + hctx->ccid3hctx_x_calc, + (unsigned)(hctx->ccid3hctx_x_recv >> 6), + (unsigned)(hctx->ccid3hctx_x >> 6)); /* unschedule no feedback timer */ - sk_stop_timer(sk, &hctx->no_feedback_timer); + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); /* * As we have calculated new ipi, delta, t_nom it is possible @@ -453,66 +488,95 @@ done_computing_x: * This can help avoid triggering the nofeedback timer too * often ('spinning') on LANs with small RTTs. */ - hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO * - (USEC_PER_SEC / 1000))); + hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, + (CONFIG_IP_DCCP_CCID3_RTO * + (USEC_PER_SEC / 1000))); /* * Schedule no feedback timer to expire in * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) */ - t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " "expire in %lu jiffies (%luus)\n", - dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); + dccp_role(sk), + sk, usecs_to_jiffies(t_nfb), t_nfb); - sk_reset_timer(sk, &hctx->no_feedback_timer, + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); } -static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) +static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, + unsigned char len, u16 idx, + unsigned char *value) { + int rc = 0; + const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + struct ccid3_options_received *opt_recv; __be32 opt_val; - switch (option) { - case TFRC_OPT_RECEIVE_RATE: - case TFRC_OPT_LOSS_EVENT_RATE: - /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ - if (packet_type == DCCP_PKT_DATA) - break; - if (unlikely(optlen != 4)) { - DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, optlen, option); - return -EINVAL; - } - opt_val = ntohl(get_unaligned((__be32 *)optval)); + opt_recv = &hctx->ccid3hctx_options_received; - if (option == TFRC_OPT_RECEIVE_RATE) { - /* Receive Rate is kept in units of 64 bytes/second */ - hctx->x_recv = opt_val; - hctx->x_recv <<= 6; + if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { + opt_recv->ccid3or_seqno = dp->dccps_gsr; + opt_recv->ccid3or_loss_event_rate = ~0; + opt_recv->ccid3or_loss_intervals_idx = 0; + opt_recv->ccid3or_loss_intervals_len = 0; + opt_recv->ccid3or_receive_rate = 0; + } - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, opt_val); + switch (option) { + case TFRC_OPT_LOSS_EVENT_RATE: + if (unlikely(len != 4)) { + DCCP_WARN("%s(%p), invalid len %d " + "for TFRC_OPT_LOSS_EVENT_RATE\n", + dccp_role(sk), sk, len); + rc = -EINVAL; } else { - /* Update the fixpoint Loss Event Rate fraction */ - hctx->p = tfrc_invert_loss_event_rate(opt_val); - + opt_val = get_unaligned((__be32 *)value); + opt_recv->ccid3or_loss_event_rate = ntohl(opt_val); ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, opt_val); + dccp_role(sk), sk, + opt_recv->ccid3or_loss_event_rate); } + break; + case TFRC_OPT_LOSS_INTERVALS: + opt_recv->ccid3or_loss_intervals_idx = idx; + opt_recv->ccid3or_loss_intervals_len = len; + ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", + dccp_role(sk), sk, + opt_recv->ccid3or_loss_intervals_idx, + opt_recv->ccid3or_loss_intervals_len); + break; + case TFRC_OPT_RECEIVE_RATE: + if (unlikely(len != 4)) { + DCCP_WARN("%s(%p), invalid len %d " + "for TFRC_OPT_RECEIVE_RATE\n", + dccp_role(sk), sk, len); + rc = -EINVAL; + } else { + opt_val = get_unaligned((__be32 *)value); + opt_recv->ccid3or_receive_rate = ntohl(opt_val); + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", + dccp_role(sk), sk, + opt_recv->ccid3or_receive_rate); + } + break; } - return 0; + + return rc; } static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); - hctx->hist = NULL; - setup_timer(&hctx->no_feedback_timer, - ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); + hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; + hctx->ccid3hctx_hist = NULL; + setup_timer(&hctx->ccid3hctx_no_feedback_timer, + ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); + return 0; } @@ -520,36 +584,42 @@ static void ccid3_hc_tx_exit(struct sock *sk) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - sk_stop_timer(sk, &hctx->no_feedback_timer); - tfrc_tx_hist_purge(&hctx->hist); + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + + tfrc_tx_hist_purge(&hctx->ccid3hctx_hist); } static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) { - info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto; - info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt; + struct ccid3_hc_tx_sock *hctx; + + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return; + + hctx = ccid3_hc_tx_sk(sk); + info->tcpi_rto = hctx->ccid3hctx_t_rto; + info->tcpi_rtt = hctx->ccid3hctx_rtt; } static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct tfrc_tx_info tfrc; + const struct ccid3_hc_tx_sock *hctx; const void *val; + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return -EINVAL; + + hctx = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(tfrc)) + if (len < sizeof(hctx->ccid3hctx_tfrc)) return -EINVAL; - tfrc.tfrctx_x = hctx->x; - tfrc.tfrctx_x_recv = hctx->x_recv; - tfrc.tfrctx_x_calc = hctx->x_calc; - tfrc.tfrctx_rtt = hctx->rtt; - tfrc.tfrctx_p = hctx->p; - tfrc.tfrctx_rto = hctx->t_rto; - tfrc.tfrctx_ipi = hctx->t_ipi; - len = sizeof(tfrc); - val = &tfrc; + len = sizeof(hctx->ccid3hctx_tfrc); + val = &hctx->ccid3hctx_tfrc; break; default: return -ENOPROTOOPT; @@ -564,82 +634,112 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, /* * Receiver Half-Connection Routines */ + +/* CCID3 feedback types */ +enum ccid3_fback_type { + CCID3_FBACK_NONE = 0, + CCID3_FBACK_INITIAL, + CCID3_FBACK_PERIODIC, + CCID3_FBACK_PARAM_CHANGE +}; + +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) +{ + static char *ccid3_rx_state_names[] = { + [TFRC_RSTATE_NO_DATA] = "NO_DATA", + [TFRC_RSTATE_DATA] = "DATA", + [TFRC_RSTATE_TERM] = "TERM", + }; + + return ccid3_rx_state_names[state]; +} +#endif + +static void ccid3_hc_rx_set_state(struct sock *sk, + enum ccid3_hc_rx_states state) +{ + struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_rx_state_name(oldstate), + ccid3_rx_state_name(state)); + WARN_ON(state == oldstate); + hcrx->ccid3hcrx_state = state; +} + static void ccid3_hc_rx_send_feedback(struct sock *sk, const struct sk_buff *skb, enum ccid3_fback_type fbtype) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + ktime_t now; + s64 delta = 0; + + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM)) + return; + + now = ktime_get_real(); switch (fbtype) { case CCID3_FBACK_INITIAL: - hcrx->x_recv = 0; - hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ + hcrx->ccid3hcrx_x_recv = 0; + hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ break; case CCID3_FBACK_PARAM_CHANGE: - if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) { - /* - * rfc3448bis-06, 6.3.1: First packet(s) lost or marked - * FIXME: in rfc3448bis the receiver returns X_recv=0 - * here as it normally would in the first feedback packet. - * However this is not possible yet, since the code still - * uses RFC 3448, i.e. - * If (p > 0) - * Calculate X_calc using the TCP throughput equation. - * X = max(min(X_calc, 2*X_recv), s/t_mbi); - * would bring X down to s/t_mbi. That is why we return - * X_recv according to rfc3448bis-06 for the moment. - */ - u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), - rtt = tfrc_rx_hist_rtt(&hcrx->hist); - - hcrx->x_recv = scaled_div32(s, 2 * rtt); - break; - } /* * When parameters change (new loss or p > p_prev), we do not * have a reliable estimate for R_m of [RFC 3448, 6.2] and so - * always check whether at least RTT time units were covered. + * need to reuse the previous value of X_recv. However, when + * X_recv was 0 (due to early loss), this would kill X down to + * s/t_mbi (i.e. one packet in 64 seconds). + * To avoid such drastic reduction, we approximate X_recv as + * the number of bytes since last feedback. + * This is a safe fallback, since X is bounded above by X_calc. */ - hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); - break; + if (hcrx->ccid3hcrx_x_recv > 0) + break; + /* fall through */ case CCID3_FBACK_PERIODIC: - /* - * Step (2) of rfc3448bis-06, 6.2: - * - if no data packets have been received, just restart timer - * - if data packets have been received, re-compute X_recv - */ - if (hcrx->hist.bytes_recvd == 0) - goto prepare_for_next_time; - hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); + delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); + if (delta <= 0) + DCCP_BUG("delta (%ld) <= 0", (long)delta); + else + hcrx->ccid3hcrx_x_recv = + scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); break; default: return; } - ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse); + ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, + hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv); - dccp_sk(sk)->dccps_hc_rx_insert_options = 1; - dccp_send_ack(sk); + hcrx->ccid3hcrx_tstamp_last_feedback = now; + hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval; + hcrx->ccid3hcrx_bytes_recv = 0; -prepare_for_next_time: - tfrc_rx_hist_restart_byte_counter(&hcrx->hist); - hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; - hcrx->feedback = fbtype; + dp->dccps_hc_rx_insert_options = 1; + dccp_send_ack(sk); } static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { - const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + const struct ccid3_hc_rx_sock *hcrx; __be32 x_recv, pinv; if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return 0; + hcrx = ccid3_hc_rx_sk(sk); + if (dccp_packet_without_ack(skb)) return 0; - x_recv = htonl(hcrx->x_recv); - pinv = htonl(hcrx->p_inverse); + x_recv = htonl(hcrx->ccid3hcrx_x_recv); + pinv = htonl(hcrx->ccid3hcrx_pinv); if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, &pinv, sizeof(pinv)) || @@ -662,95 +762,171 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) static u32 ccid3_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), - rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p; + u32 x_recv, p, delta; u64 fval; - /* - * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p - * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p - * is about 20.64%. This yields an interval length of 4.84 (rounded up). - */ - if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) - return 5; + if (hcrx->ccid3hcrx_rtt == 0) { + DCCP_WARN("No RTT estimate available, using fallback RTT\n"); + hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; + } - x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); - if (x_recv == 0) - goto failed; + delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); + x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); + if (x_recv == 0) { /* would also trigger divide-by-zero */ + DCCP_WARN("X_recv==0\n"); + if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) { + DCCP_BUG("stored value of X_recv is zero"); + return ~0U; + } + } - fval = scaled_div32(scaled_div(s, rtt), x_recv); + fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); + fval = scaled_div32(fval, x_recv); p = tfrc_calc_x_reverse_lookup(fval); ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); - if (p > 0) - return scaled_div(1, p); -failed: - return UINT_MAX; + return p == 0 ? ~0U : scaled_div(1, p); } static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; const bool is_data_packet = dccp_data_packet(skb); + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + do_feedback = CCID3_FBACK_INITIAL; + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); + hcrx->ccid3hcrx_s = payload; + /* + * Not necessary to update ccid3hcrx_bytes_recv here, + * since X_recv = 0 for the first feedback packet (cf. + * RFC 3448, 6.3) -- gerrit + */ + } + goto update_records; + } + + if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb)) + return; /* done receiving */ + + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + /* + * Update moving-average of s and the sum of received payload bytes + */ + hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9); + hcrx->ccid3hcrx_bytes_recv += payload; + } + /* * Perform loss detection and handle pending losses */ - if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist, - skb, ndp, ccid3_first_li, sk)) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE); + if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist, + skb, ndp, ccid3_first_li, sk)) { + do_feedback = CCID3_FBACK_PARAM_CHANGE; + goto done_receiving; + } + + if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist)) + return; /* done receiving */ + /* - * Feedback for first non-empty data packet (RFC 3448, 6.3) + * Handle data packets: RTT sampling and monitoring p */ - else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet)) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL); + if (unlikely(!is_data_packet)) + goto update_records; + + if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) { + const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb); + /* + * Empty loss history: no loss so far, hence p stays 0. + * Sample RTT values, since an RTT estimate is required for the + * computation of p when the first loss occurs; RFC 3448, 6.3.1. + */ + if (sample != 0) + hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9); + + } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) { + /* + * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean + * has decreased (resp. p has increased), send feedback now. + */ + do_feedback = CCID3_FBACK_PARAM_CHANGE; + } + /* * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 */ - else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet && - SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC); + if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) + do_feedback = CCID3_FBACK_PERIODIC; + +update_records: + tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp); + +done_receiving: + if (do_feedback) + ccid3_hc_rx_send_feedback(sk, skb, do_feedback); } static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); - tfrc_lh_init(&hcrx->li_hist); - return tfrc_rx_hist_init(&hcrx->hist, sk); + hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; + tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); + return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist); } static void ccid3_hc_rx_exit(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - tfrc_rx_hist_purge(&hcrx->hist); - tfrc_lh_cleanup(&hcrx->li_hist); + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); + + tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist); + tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist); } static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { + const struct ccid3_hc_rx_sock *hcrx; + + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return; + + hcrx = ccid3_hc_rx_sk(sk); + info->tcpi_ca_state = hcrx->ccid3hcrx_state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist); + info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + const struct ccid3_hc_rx_sock *hcrx; struct tfrc_rx_info rx_info; const void *val; + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return -EINVAL; + + hcrx = ccid3_hc_rx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: if (len < sizeof(rx_info)) return -EINVAL; - rx_info.tfrcrx_x_recv = hcrx->x_recv; - rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist); - rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); + rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; + rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; + rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U : + scaled_div(1, hcrx->ccid3hcrx_pinv); len = sizeof(rx_info); val = &rx_info; break; @@ -786,9 +962,6 @@ static struct ccid_operations ccid3 = { .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, }; -module_param(do_osc_prev, bool, 0644); -MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)"); - #ifdef CONFIG_IP_DCCP_CCID3_DEBUG module_param(ccid3_debug, bool, 0644); MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); @@ -796,19 +969,6 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); static __init int ccid3_module_init(void) { - struct timespec tp; - - /* - * Without a fine-grained clock resolution, RTTs/X_recv are not sampled - * correctly and feedback is sent either too early or too late. - */ - hrtimer_get_res(CLOCK_MONOTONIC, &tp); - if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) { - printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec" - " resolution - check your clocksource.\n", __func__, - tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION); - return -ESOCKTNOSUPPORT; - } return ccid_register(&ccid3); } module_init(ccid3_module_init); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index af6e1bf937d..49ca32bd7e7 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -47,22 +47,11 @@ /* Two seconds as per RFC 3448 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) -/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */ -#define TFRC_T_MBI (64 * USEC_PER_SEC) +/* In usecs - half the scheduling granularity as per RFC3448 4.6 */ +#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) -/* - * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are - * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. - * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse - * resolution of HZ < 500 means that the error is below one timer tick (t_gran) - * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). - */ -#if (HZ >= 500) -# define TFRC_T_DELTA USEC_PER_MSEC -#else -# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) -#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC. -#endif +/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ +#define TFRC_T_MBI 64 enum ccid3_options { TFRC_OPT_LOSS_EVENT_RATE = 192, @@ -70,43 +59,62 @@ enum ccid3_options { TFRC_OPT_RECEIVE_RATE = 194, }; +struct ccid3_options_received { + u64 ccid3or_seqno:48, + ccid3or_loss_intervals_idx:16; + u16 ccid3or_loss_intervals_len; + u32 ccid3or_loss_event_rate; + u32 ccid3or_receive_rate; +}; + +/* TFRC sender states */ +enum ccid3_hc_tx_states { + TFRC_SSTATE_NO_SENT = 1, + TFRC_SSTATE_NO_FBACK, + TFRC_SSTATE_FBACK, + TFRC_SSTATE_TERM, +}; + /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket * - * @x - Current sending rate in 64 * bytes per second - * @x_recv - Receive rate in 64 * bytes per second - * @x_calc - Calculated rate in bytes per second - * @rtt - Estimate of current round trip time in usecs - * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5) - * @p - Current loss event rate (0-1) scaled by 1000000 - * @s - Packet size in bytes - * @t_rto - Nofeedback Timer setting in usecs - * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs - * @feedback - Whether feedback has been received or not - * @last_win_count - Last window counter sent - * @t_last_win_count - Timestamp of earliest packet with - * last_win_count value sent - * @no_feedback_timer - Handle to no feedback timer - * @t_ld - Time last doubled during slow start - * @t_nom - Nominal send time of next packet - * @hist - Packet history + * @ccid3hctx_x - Current sending rate in 64 * bytes per second + * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second + * @ccid3hctx_x_calc - Calculated rate in bytes per second + * @ccid3hctx_rtt - Estimate of current round trip time in usecs + * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 + * @ccid3hctx_s - Packet size in bytes + * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs + * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs + * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states + * @ccid3hctx_last_win_count - Last window counter sent + * @ccid3hctx_t_last_win_count - Timestamp of earliest packet + * with last_win_count value sent + * @ccid3hctx_no_feedback_timer - Handle to no feedback timer + * @ccid3hctx_t_ld - Time last doubled during slow start + * @ccid3hctx_t_nom - Nominal send time of next packet + * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs + * @ccid3hctx_hist - Packet history + * @ccid3hctx_options_received - Parsed set of retrieved options */ struct ccid3_hc_tx_sock { - u64 x; - u64 x_recv; - u32 x_calc; - u32 rtt; - u16 r_sqmean; - u32 p; - u32 t_rto; - u32 t_ipi; - u16 s; - bool feedback:1; - u8 last_win_count; - ktime_t t_last_win_count; - struct timer_list no_feedback_timer; - ktime_t t_ld; - ktime_t t_nom; - struct tfrc_tx_hist_entry *hist; + struct tfrc_tx_info ccid3hctx_tfrc; +#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x +#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv +#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc +#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt +#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p +#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto +#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi + u16 ccid3hctx_s; + enum ccid3_hc_tx_states ccid3hctx_state:8; + u8 ccid3hctx_last_win_count; + ktime_t ccid3hctx_t_last_win_count; + struct timer_list ccid3hctx_no_feedback_timer; + ktime_t ccid3hctx_t_ld; + ktime_t ccid3hctx_t_nom; + u32 ccid3hctx_delta; + struct tfrc_tx_hist_entry *ccid3hctx_hist; + struct ccid3_options_received ccid3hctx_options_received; }; static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) @@ -116,32 +124,41 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) return hctx; } - -enum ccid3_fback_type { - CCID3_FBACK_NONE = 0, - CCID3_FBACK_INITIAL, - CCID3_FBACK_PERIODIC, - CCID3_FBACK_PARAM_CHANGE +/* TFRC receiver states */ +enum ccid3_hc_rx_states { + TFRC_RSTATE_NO_DATA = 1, + TFRC_RSTATE_DATA, + TFRC_RSTATE_TERM = 127, }; /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket * - * @last_counter - Tracks window counter (RFC 4342, 8.1) - * @feedback - The type of the feedback last sent - * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) - * @tstamp_last_feedback - Time at which last feedback was sent - * @hist - Packet history (loss detection + RTT sampling) - * @li_hist - Loss Interval database - * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) + * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) + * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) + * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) + * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states + * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) + * @ccid3hcrx_rtt - Receiver estimate of RTT + * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent + * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent + * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling) + * @ccid3hcrx_li_hist - Loss Interval database + * @ccid3hcrx_s - Received packet size in bytes + * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) */ struct ccid3_hc_rx_sock { - u8 last_counter:4; - enum ccid3_fback_type feedback:4; - u32 x_recv; - ktime_t tstamp_last_feedback; - struct tfrc_rx_hist hist; - struct tfrc_loss_hist li_hist; -#define p_inverse li_hist.i_mean + u8 ccid3hcrx_last_counter:4; + enum ccid3_hc_rx_states ccid3hcrx_state:8; + u32 ccid3hcrx_bytes_recv; + u32 ccid3hcrx_x_recv; + u32 ccid3hcrx_rtt; + ktime_t ccid3hcrx_tstamp_last_feedback; + struct tfrc_rx_hist ccid3hcrx_hist; + struct tfrc_loss_hist ccid3hcrx_li_hist; + u16 ccid3hcrx_s; +#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean }; static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index b1ae8f8259e..5b3ce0688c5 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -86,26 +86,21 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) /** * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 - * This updates I_mean as the sequence numbers increase. As a consequence, the - * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1) - * decreases, and thus there is no need to send renewed feedback. + * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev */ -void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) +u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) { struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); + u32 old_i_mean = lh->i_mean; s64 len; if (cur == NULL) /* not initialised */ - return; - - /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */ - if (!dccp_data_packet(skb)) - return; + return 0; len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ - return; + return 0; if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) /* @@ -119,11 +114,14 @@ void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) cur->li_is_closed = 1; if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ - return; + return 0; cur->li_length = len; tfrc_lh_calc_i_mean(lh); + + return (lh->i_mean < old_i_mean); } +EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean); /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, @@ -140,18 +138,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, * @sk: Used by @calc_first_li in caller-specific way (subtyping) * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. */ -bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, - u32 (*calc_first_li)(struct sock *), struct sock *sk) +int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, + u32 (*calc_first_li)(struct sock *), struct sock *sk) { struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) - return false; + return 0; new = tfrc_lh_demand_next(lh); if (unlikely(new == NULL)) { DCCP_CRIT("Cannot allocate/add loss record."); - return false; + return 0; } new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; @@ -169,7 +167,7 @@ bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, tfrc_lh_calc_i_mean(lh); } - return true; + return 1; } EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h index d08a226db43..246018a3b26 100644 --- a/net/dccp/ccids/lib/loss_interval.h +++ b/net/dccp/ccids/lib/loss_interval.h @@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) struct tfrc_rx_hist; -extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, +extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, u32 (*first_li)(struct sock *), struct sock *); -extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); +extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); #endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index cce9f03bda3..6cc108afdc3 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -40,6 +40,18 @@ #include "packet_history.h" #include "../../dccp.h" +/** + * tfrc_tx_hist_entry - Simple singly-linked TX history list + * @next: next oldest entry (LIFO order) + * @seqno: sequence number of this entry + * @stamp: send time of packet with sequence number @seqno + */ +struct tfrc_tx_hist_entry { + struct tfrc_tx_hist_entry *next; + u64 seqno; + ktime_t stamp; +}; + /* * Transmitter History Routines */ @@ -61,6 +73,15 @@ void tfrc_tx_packet_history_exit(void) } } +static struct tfrc_tx_hist_entry * + tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) +{ + while (head != NULL && head->seqno != seqno) + head = head->next; + + return head; +} + int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) { struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); @@ -90,6 +111,25 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) } EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); +u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, + const ktime_t now) +{ + u32 rtt = 0; + struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); + + if (packet != NULL) { + rtt = ktime_us_delta(now, packet->stamp); + /* + * Garbage-collect older (irrelevant) entries: + */ + tfrc_tx_hist_purge(&packet->next); + } + + return rtt; +} +EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); + + /* * Receiver History Routines */ @@ -151,31 +191,14 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); - -static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) -{ - struct tfrc_rx_hist_entry *tmp = h->ring[a]; - - h->ring[a] = h->ring[b]; - h->ring[b] = tmp; -} - static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) { - __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a), - tfrc_rx_hist_index(h, b)); -} + const u8 idx_a = tfrc_rx_hist_index(h, a), + idx_b = tfrc_rx_hist_index(h, b); + struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; -/** - * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling - * This is called after loss detection has finished, when the history entry - * with the index of `loss_count' holds the highest-received sequence number. - * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt). - */ -static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h) -{ - __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count)); - h->loss_count = h->loss_start = 0; + h->ring[idx_a] = h->ring[idx_b]; + h->ring[idx_b] = tmp; } /* @@ -192,8 +215,10 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, s1 = DCCP_SKB_CB(skb)->dccpd_seq; - if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */ + if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ h->loss_count = 1; + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); + } } static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) @@ -215,7 +240,8 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2 if (dccp_loss_free(s2, s1, n1)) { /* hole is filled: S0, S2, and S1 are consecutive */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_count = 0; + h->loss_start = tfrc_rx_hist_index(h, 1); } else /* gap between S2 and S1: just update loss_prev */ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); @@ -268,7 +294,8 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) if (dccp_loss_free(s1, s2, n2)) { /* entire hole filled by S0, S3, S1, S2 */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_start = tfrc_rx_hist_index(h, 2); + h->loss_count = 0; } else { /* gap remains between S1 and S2 */ h->loss_start = tfrc_rx_hist_index(h, 1); @@ -312,7 +339,8 @@ static void __three_after_loss(struct tfrc_rx_hist *h) if (dccp_loss_free(s2, s3, n3)) { /* no gap between S2 and S3: entire hole is filled */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_start = tfrc_rx_hist_index(h, 3); + h->loss_count = 0; } else { /* gap between S2 and S3 */ h->loss_start = tfrc_rx_hist_index(h, 2); @@ -326,13 +354,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h) } /** - * tfrc_rx_congestion_event - Loss detection and further processing - * @h: The non-empty RX history object - * @lh: Loss Intervals database to update - * @skb: Currently received packet - * @ndp: The NDP count belonging to @skb - * @first_li: Caller-dependent computation of first loss interval in @lh - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) + * tfrc_rx_handle_loss - Loss detection and further processing + * @h: The non-empty RX history object + * @lh: Loss Intervals database to update + * @skb: Currently received packet + * @ndp: The NDP count belonging to @skb + * @calc_first_li: Caller-dependent computation of first loss interval in @lh + * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) * Chooses action according to pending loss, updates LI database when a new * loss was detected, and does required post-processing. Returns 1 when caller * should send feedback, 0 otherwise. @@ -340,20 +368,15 @@ static void __three_after_loss(struct tfrc_rx_hist *h) * records accordingly, the caller should not perform any more RX history * operations when loss_count is greater than 0 after calling this function. */ -bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *), struct sock *sk) +int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*calc_first_li)(struct sock *), struct sock *sk) { - bool new_event = false; - - if (tfrc_rx_hist_duplicate(h, skb)) - return 0; + int is_new_loss = 0; if (h->loss_count == 0) { __do_track_loss(h, skb, ndp); - tfrc_rx_hist_sample_rtt(h, skb); - tfrc_rx_hist_add_packet(h, skb, ndp); } else if (h->loss_count == 1) { __one_after_loss(h, skb, ndp); } else if (h->loss_count != 2) { @@ -362,57 +385,34 @@ bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, /* * Update Loss Interval database and recycle RX records */ - new_event = tfrc_lh_interval_add(lh, h, first_li, sk); + is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); __three_after_loss(h); } - - /* - * Update moving-average of `s' and the sum of received payload bytes. - */ - if (dccp_data_packet(skb)) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - - h->packet_size = tfrc_ewma(h->packet_size, payload, 9); - h->bytes_recvd += payload; - } - - /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */ - if (!new_event) - tfrc_lh_update_i_mean(lh, skb); - - return new_event; + return is_new_loss; } -EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event); +EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); -/* Compute the sending rate X_recv measured between feedback intervals */ -u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv) +int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) { - u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate; - s64 delta = ktime_to_us(net_timedelta(h->bytes_start)); - - WARN_ON(delta <= 0); - /* - * Ensure that the sampling interval for X_recv is at least one RTT, - * by extending the sampling interval backwards in time, over the last - * R_(m-1) seconds, as per rfc3448bis-06, 6.2. - * To reduce noise (e.g. when the RTT changes often), this is only - * done when delta is smaller than RTT/2. - */ - if (last_x_recv > 0 && delta < last_rtt/2) { - tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n", - (long)delta, (unsigned)last_rtt); + int i; - delta = (bytes ? delta : 0) + last_rtt; - bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC); + for (i = 0; i <= TFRC_NDUPACK; i++) { + h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); + if (h->ring[i] == NULL) + goto out_free; } - if (unlikely(bytes == 0)) { - DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv); - return last_x_recv; + h->loss_count = h->loss_start = 0; + return 0; + +out_free: + while (i-- != 0) { + kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); + h->ring[i] = NULL; } - return scaled_div32(bytes, delta); + return -ENOBUFS; } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv); +EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) { @@ -426,81 +426,73 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); -static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) +/** + * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) { - int i; - - memset(h, 0, sizeof(*h)); - - for (i = 0; i <= TFRC_NDUPACK; i++) { - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); - if (h->ring[i] == NULL) { - tfrc_rx_hist_purge(h); - return -ENOBUFS; - } - } - return 0; + return h->ring[0]; } -int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk) +/** + * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) { - if (tfrc_rx_hist_alloc(h)) - return -ENOBUFS; - /* - * Initialise first entry with GSR to start loss detection as early as - * possible. Code using this must not use any other fields. The entry - * will be overwritten once the CCID updates its received packets. - */ - tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr; - return 0; + return h->ring[h->rtt_sample_prev]; } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); /** * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal - * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss - * is pending and uses the following history entries (via rtt_sample_prev): - * - h->ring[0] contains the most recent history entry prior to @skb; - * - h->ring[1] is an unused `dummy' entry when the current difference is 0; + * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able + * to compute a sample with given data - calling function should check this. */ -void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) +u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) { - struct tfrc_rx_hist_entry *last = h->ring[0]; - u32 sample, delta_v; - - /* - * When not to sample: - * - on non-data packets - * (RFC 4342, 8.1: CCVal only fully defined for data packets); - * - when no data packets have been received yet - * (FIXME: using sampled packet size as indicator here); - * - as long as there are gaps in the sequence space (pending loss). - */ - if (!dccp_data_packet(skb) || h->packet_size == 0 || - tfrc_rx_hist_loss_pending(h)) - return; + u32 sample = 0, + delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + + if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ + if (h->rtt_sample_prev == 2) { /* previous candidate stored */ + sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + if (sample) + sample = 4 / sample * + ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); + else /* + * FIXME: This condition is in principle not + * possible but occurs when CCID is used for + * two-way data traffic. I have tried to trace + * it, but the cause does not seem to be here. + */ + DCCP_BUG("please report to dccp@vger.kernel.org" + " => prev = %u, last = %u", + tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + } else if (delta_v < 1) { + h->rtt_sample_prev = 1; + goto keep_ref_for_next_time; + } - h->rtt_sample_prev = 0; /* reset previous candidate */ + } else if (delta_v == 4) /* optimal match */ + sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); + else { /* suboptimal match */ + h->rtt_sample_prev = 2; + goto keep_ref_for_next_time; + } - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval); - if (delta_v == 0) { /* less than RTT/4 difference */ - h->rtt_sample_prev = 1; - return; + if (unlikely(sample > DCCP_SANE_RTT_MAX)) { + DCCP_WARN("RTT sample %u too large, using max\n", sample); + sample = DCCP_SANE_RTT_MAX; } - sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp))); - if (delta_v <= 4) /* between RTT/4 and RTT */ - sample *= 4 / delta_v; - else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2)) - /* - * Optimisation: CCVal difference is greater than 1 RTT, yet the - * sample is less than the local RTT estimate; which means that - * the RTT estimate is too high. - * To avoid noise, it is not done if the sample is below RTT/2. - */ - return; + h->rtt_sample_prev = 0; /* use current entry as next reference */ +keep_ref_for_next_time: - /* Use a lower weight than usual to increase responsiveness */ - h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5); + return sample; } EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 555e65cd73a..461cc91cce8 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -40,28 +40,12 @@ #include #include "tfrc.h" -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - -static inline struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - return head; -} +struct tfrc_tx_hist_entry; extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); +extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, + const u64 seqno, const ktime_t now); /* Subtraction a-b modulo-16, respects circular wrap-around */ #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) @@ -91,22 +75,12 @@ struct tfrc_rx_hist_entry { * @loss_count: Number of entries in circular history * @loss_start: Movable index (for loss detection) * @rtt_sample_prev: Used during RTT sampling, points to candidate entry - * @rtt_estimate: Receiver RTT estimate - * @packet_size: Packet size in bytes (as per RFC 3448, 3.1) - * @bytes_recvd: Number of bytes received since @bytes_start - * @bytes_start: Start time for counting @bytes_recvd */ struct tfrc_rx_hist { struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; u8 loss_count:2, loss_start:2; - /* Receiver RTT sampling */ #define rtt_sample_prev loss_start - u32 rtt_estimate; - /* Receiver sampling of application payload lengths */ - u32 packet_size, - bytes_recvd; - ktime_t bytes_start; }; /** @@ -150,50 +124,20 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) return h->loss_count > 0; } -/* - * Accessor functions to retrieve parameters sampled by the RX history - */ -static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h) -{ - if (h->packet_size == 0) { - DCCP_WARN("No sample for s, using fallback\n"); - return TCP_MIN_RCVMSS; - } - return h->packet_size; - -} -static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h) -{ - if (h->rtt_estimate == 0) { - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); - return DCCP_FALLBACK_RTT; - } - return h->rtt_estimate; -} - -static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h) -{ - h->bytes_recvd = 0; - h->bytes_start = ktime_get_real(); -} - -extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv); - - extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, const u64 ndp); extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); struct tfrc_loss_hist; -extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *sk), - struct sock *sk); -extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, - const struct sk_buff *skb); -extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); +extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*first_li)(struct sock *sk), + struct sock *sk); +extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, + const struct sk_buff *skb); +extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); #endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index ede12f53de5..ed9857527ac 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -47,21 +47,6 @@ static inline u32 scaled_div32(u64 a, u64 b) return result; } -/** - * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1 - * Uses scaling to improve accuracy of the integer approximation of sqrt(). The - * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for - * clamped RTT samples (dccp_sample_rtt). - * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the - * scaling factor is neutralised. For this purpose, it avoids returning zero. - */ -static inline u16 tfrc_scaled_sqrt(const u32 sample) -{ - const unsigned long non_zero_sample = sample ? : 1; - - return int_sqrt(non_zero_sample << 10); -} - /** * tfrc_ewma - Exponentially weighted moving average * @weight: Weight to be used as damping factor, in units of 1/10 @@ -73,7 +58,6 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); -extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); extern int tfrc_tx_packet_history_init(void); extern void tfrc_tx_packet_history_exit(void); diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 38239c4d5e1..2f20a29cffe 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -632,16 +632,8 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ - /* - * In the congestion-avoidance phase p decays towards 0 - * when there are no further losses, so this case is - * natural. Truncating to p_min = 0.01% means that the - * maximum achievable throughput is limited to about - * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g. - * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps. - */ - tfrc_pr_debug("Value of p (%d) below resolution. " - "Substituting %d\n", p, TFRC_SMALLEST_P); + DCCP_WARN("Value of p (%d) below resolution. " + "Substituting %d\n", p, TFRC_SMALLEST_P); index = 0; } else /* 0.0001 <= p <= 0.05 */ index = p/TFRC_SMALLEST_P - 1; @@ -666,6 +658,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) result = scaled_div(s, R); return scaled_div32(result, f); } + EXPORT_SYMBOL_GPL(tfrc_calc_x); /** @@ -700,19 +693,5 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) index = tfrc_binsearch(fvalue, 0); return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } -EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); -/** - * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% - * When @loss_event_rate is large, there is a chance that p is truncated to 0. - * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. - */ -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) -{ - if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ - return 0; - if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ - return 1000000; - return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); -} -EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate); +EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 5281190aa19..b4bc6e095a0 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -42,11 +42,9 @@ extern int dccp_debug; #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) -#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) #else #define dccp_pr_debug(format, a...) #define dccp_pr_debug_cat(format, a...) -#define dccp_debug(format, a...) #endif extern struct inet_hashinfo dccp_hashinfo; @@ -63,14 +61,11 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields * Hence a safe upper bound for the maximum option length is 1020-28 = 992 */ -#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) +#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) #define DCCP_MAX_PACKET_HDR 28 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) -/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ -#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) - #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ @@ -86,13 +81,10 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); */ #define DCCP_RTO_MAX ((unsigned)(64 * HZ)) -/* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 13.3) */ -#define DCCP_TIME_RESOLUTION 10 - /* * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 */ -#define DCCP_SANE_RTT_MIN (10 * DCCP_TIME_RESOLUTION) +#define DCCP_SANE_RTT_MIN 100 #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) @@ -103,6 +95,12 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); extern int sysctl_dccp_request_retries; extern int sysctl_dccp_retries1; extern int sysctl_dccp_retries2; +extern int sysctl_dccp_feat_sequence_window; +extern int sysctl_dccp_feat_rx_ccid; +extern int sysctl_dccp_feat_tx_ccid; +extern int sysctl_dccp_feat_ack_ratio; +extern int sysctl_dccp_feat_send_ack_vector; +extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; @@ -237,22 +235,8 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); -/* - * TX Packet Dequeueing Interface - */ -extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); -extern bool dccp_qpolicy_full(struct sock *sk); -extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); -extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); -extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); -extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); - -/* - * TX Packet Output and TX Timers - */ -extern void dccp_write_xmit(struct sock *sk); +extern void dccp_write_xmit(struct sock *sk, int block); extern void dccp_write_space(struct sock *sk); -extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); extern void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) @@ -268,8 +252,7 @@ extern const char *dccp_state_name(const int state); extern void dccp_set_state(struct sock *sk, const int state); extern void dccp_done(struct sock *sk); -extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, - struct sk_buff const *skb); +extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); @@ -334,14 +317,7 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk, extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); extern void dccp_send_close(struct sock *sk, const int active); extern int dccp_invalid_packet(struct sk_buff *skb); - -static inline u32 dccp_sane_rtt(long usec_sample) -{ - if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX)) - DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample); - return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX); -} -extern u32 dccp_sample_rtt(struct sock *sk, long delta); +extern u32 dccp_sample_rtt(struct sock *sk, long delta); static inline int dccp_bad_service_code(const struct sock *sk, const __be32 service) @@ -435,62 +411,36 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, static inline void dccp_update_gsr(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); + const struct dccp_minisock *dmsk = dccp_msk(sk); dp->dccps_gsr = seq; - /* Sequence validity window depends on remote Sequence Window (7.5.1) */ - dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); - /* - * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, - * 7.5.1 we perform this check beyond the initial handshake: W/W' are - * always > 32, so for the first W/W' packets in the lifetime of a - * connection we always have to adjust SWL. - * A second reason why we are doing this is that the window depends on - * the feature-remote value of Sequence Window: nothing stops the peer - * from updating this value while we are busy adjusting SWL for the - * first W packets (we would have to count from scratch again then). - * Therefore it is safer to always make sure that the Sequence Window - * is not artificially extended by a peer who grows SWL downwards by - * continually updating the feature-remote Sequence-Window. - * If sequence numbers wrap it is bad luck. But that will take a while - * (48 bit), and this measure prevents Sequence-number attacks. - */ - if (before48(dp->dccps_swl, dp->dccps_isr)) - dp->dccps_swl = dp->dccps_isr; - dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); + dccp_set_seqno(&dp->dccps_swl, + dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); + dccp_set_seqno(&dp->dccps_swh, + dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); } static inline void dccp_update_gss(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - dp->dccps_gss = seq; - /* Ack validity window depends on local Sequence Window value (7.5.1) */ - dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); - /* Adjust AWL so that it is not below ISS - see comment above for SWL */ - if (before48(dp->dccps_awl, dp->dccps_iss)) - dp->dccps_awl = dp->dccps_iss; - dp->dccps_awh = dp->dccps_gss; -} - -static inline int dccp_ackvec_pending(const struct sock *sk) -{ - return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && - !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); + dp->dccps_awh = dp->dccps_gss = seq; + dccp_set_seqno(&dp->dccps_awl, + (dp->dccps_gss - + dccp_msk(sk)->dccpms_sequence_window + 1)); } static inline int dccp_ack_pending(const struct sock *sk) { - return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); + const struct dccp_sock *dp = dccp_sk(sk); + return dp->dccps_timestamp_echo != 0 || +#ifdef CONFIG_IP_DCCP_ACKVEC + (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || +#endif + inet_csk_ack_scheduled(sk); } -extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); -extern int dccp_feat_finalise_settings(struct dccp_sock *dp); -extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); -extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, - struct sk_buff *skb); -extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); -extern void dccp_feat_list_purge(struct list_head *fn_list); - extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); extern int dccp_insert_option_elapsed_time(struct sock *sk, diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 93aae7c9555..d8a3509b26f 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_backoff = icsk->icsk_backoff; info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - if (dp->dccps_hc_rx_ackvec != NULL) + if (dccp_msk(sk)->dccpms_send_ack_vector) info->tcpi_options |= TCPI_OPT_SACK; ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index f94c7c9d1a7..933a0ecf8d4 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1,19 +1,11 @@ /* * net/dccp/feat.c * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * - * Copyright (c) 2008 The University of Aberdeen, Scotland, UK - * Copyright (c) 2008 Gerrit Renker - * Rewrote from scratch, some bits from earlier code by - * Copyright (c) 2005 Andrea Bittau - * + * An implementation of the DCCP protocol + * Andrea Bittau * * ASSUMPTIONS * ----------- - * o Feature negotiation is coordinated with connection setup (as in TCP), wild - * changes of parameters of an established connection are not supported. - * o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN. * o All currently known SP features have 1-byte quantities. If in the future * extensions of RFCs 4340..42 define features with item lengths larger than * one byte, a feature-specific extension of the code will be required. @@ -23,1510 +15,635 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + #include + #include "ccid.h" #include "feat.h" -/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ -unsigned long sysctl_dccp_sequence_window __read_mostly = 100; -int sysctl_dccp_rx_ccid __read_mostly = 2, - sysctl_dccp_tx_ccid __read_mostly = 2; +#define DCCP_FEAT_SP_NOAGREE (-123) -/* - * Feature activation handlers. - * - * These all use an u64 argument, to provide enough room for NN/SP features. At - * this stage the negotiated values have been checked to be within their range. - */ -static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) +int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, + u8 *val, u8 len, gfp_t gfp) { - struct dccp_sock *dp = dccp_sk(sk); - struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any()); + struct dccp_opt_pend *opt; - if (new_ccid == NULL) - return -ENOMEM; + dccp_feat_debug(type, feature, *val); - if (rx) { - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = new_ccid; - } else { - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = new_ccid; + if (len > 3) { + DCCP_WARN("invalid length %d\n", len); + return -EINVAL; + } + /* XXX add further sanity checks */ + + /* check if that feature is already being negotiated */ + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + /* ok we found a negotiation for this option already */ + if (opt->dccpop_feat == feature && opt->dccpop_type == type) { + dccp_pr_debug("Replacing old\n"); + /* replace */ + BUG_ON(opt->dccpop_val == NULL); + kfree(opt->dccpop_val); + opt->dccpop_val = val; + opt->dccpop_len = len; + opt->dccpop_conf = 0; + return 0; + } } - return 0; -} -static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); + /* negotiation for a new feature */ + opt = kmalloc(sizeof(*opt), gfp); + if (opt == NULL) + return -ENOMEM; - if (rx) { - dp->dccps_r_seq_win = seq_win; - /* propagate changes to update SWL/SWH */ - dccp_update_gsr(sk, dp->dccps_gsr); - } else { - dp->dccps_l_seq_win = seq_win; - /* propagate changes to update AWL */ - dccp_update_gss(sk, dp->dccps_gss); - } - return 0; -} + opt->dccpop_type = type; + opt->dccpop_feat = feature; + opt->dccpop_len = len; + opt->dccpop_val = val; + opt->dccpop_conf = 0; + opt->dccpop_sc = NULL; -static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) -{ -#ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__ - /* - * FIXME: This is required until several problems in the CCID-2 code are - * resolved. The CCID-2 code currently does not cope well; using dynamic - * Ack Ratios greater than 1 caused instabilities. These were manifest - * in hangups and long RTO timeouts (1...3 seconds). Until this has been - * stabilised, it is safer not to activate dynamic Ack Ratio changes. - */ - dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n", - rx ? "RX" : "TX", (u16)ratio); - ratio = 1; -#endif - if (rx) - dccp_sk(sk)->dccps_r_ack_ratio = ratio; - else - dccp_sk(sk)->dccps_l_ack_ratio = ratio; + BUG_ON(opt->dccpop_val == NULL); + + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending); return 0; } -static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) +EXPORT_SYMBOL_GPL(dccp_feat_change); + +static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); + /* figure out if we are changing our CCID or the peer's */ + const int rx = type == DCCPO_CHANGE_R; + const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid; + struct ccid *new_ccid; + + /* Check if nothing is being changed. */ + if (ccid_nr == new_ccid_nr) + return 0; + + new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC); + if (new_ccid == NULL) + return -ENOMEM; if (rx) { - if (enable && dp->dccps_hc_rx_ackvec == NULL) { - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); - if (dp->dccps_hc_rx_ackvec == NULL) - return -ENOMEM; - } else if (!enable) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + dp->dccps_hc_rx_ccid = new_ccid; + dmsk->dccpms_rx_ccid = new_ccid_nr; + } else { + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_tx_ccid = new_ccid; + dmsk->dccpms_tx_ccid = new_ccid_nr; } - return 0; -} -static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) -{ - if (!rx) - dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); return 0; } -/* - * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that - * `rx' holds when the sending peer informs about his partial coverage via a - * ChangeR() option. In the other case, we are the sender and the receiver - * announces its coverage via ChangeL() options. The policy here is to honour - * such communication by enabling the corresponding partial coverage - but only - * if it has not been set manually before; the warning here means that all - * packets will be dropped. - */ -static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) +static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) { - struct dccp_sock *dp = dccp_sk(sk); + dccp_feat_debug(type, feat, val); - if (rx) - dp->dccps_pcrlen = cscov; - else { - if (dp->dccps_pcslen == 0) - dp->dccps_pcslen = cscov; - else if (cscov > dp->dccps_pcslen) - DCCP_WARN("CsCov %u too small, peer requires >= %u\n", - dp->dccps_pcslen, (u8)cscov); + switch (feat) { + case DCCPF_CCID: + return dccp_feat_update_ccid(sk, type, val); + default: + dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n", + dccp_feat_typename(type), feat); + break; } return 0; } -static const struct { - u8 feat_num; /* DCCPF_xxx */ - enum dccp_feat_type rxtx; /* RX or TX */ - enum dccp_feat_type reconciliation; /* SP or NN */ - u8 default_value; /* as in 6.4 */ - int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); -/* - * Lookup table for location and type of features (from RFC 4340/4342) - * +--------------------------+----+-----+----+----+---------+-----------+ - * | Feature | Location | Reconc. | Initial | Section | - * | | RX | TX | SP | NN | Value | Reference | - * +--------------------------+----+-----+----+----+---------+-----------+ - * | DCCPF_CCID | | X | X | | 2 | 10 | - * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | - * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | - * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | - * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | - * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | - * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | - * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | - * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | - * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | - * +--------------------------+----+-----+----+----+---------+-----------+ - */ -} dccp_feat_table[] = { - { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, - { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, - { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, - { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, - { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, - { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, - { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, - { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, -}; -#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) - -/** - * dccp_feat_index - Hash function to map feature number into array position - * Returns consecutive array index or -1 if the feature is not understood. - */ -static int dccp_feat_index(u8 feat_num) +static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, + u8 *rpref, u8 rlen) { - /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ - if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) - return feat_num - 1; + struct dccp_sock *dp = dccp_sk(sk); + u8 *spref, slen, *res = NULL; + int i, j, rc, agree = 1; + BUG_ON(rpref == NULL); + + /* check if we are the black sheep */ + if (dp->dccps_role == DCCP_ROLE_CLIENT) { + spref = rpref; + slen = rlen; + rpref = opt->dccpop_val; + rlen = opt->dccpop_len; + } else { + spref = opt->dccpop_val; + slen = opt->dccpop_len; + } /* - * Other features: add cases for new feature types here after adding - * them to the above table. + * Now we have server preference list in spref and client preference in + * rpref */ - switch (feat_num) { - case DCCPF_SEND_LEV_RATE: - return DCCP_FEAT_SUPPORTED_MAX - 1; - } - return -1; -} - -static u8 dccp_feat_type(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - - if (idx < 0) - return FEAT_UNKNOWN; - return dccp_feat_table[idx].reconciliation; -} + BUG_ON(spref == NULL); + BUG_ON(rpref == NULL); -static int dccp_feat_default_value(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); + /* FIXME sanity check vals */ - return idx < 0 ? : dccp_feat_table[idx].default_value; -} - -/* - * Debugging and verbose-printing section - */ -static const char *dccp_feat_fname(const u8 feat) -{ - static const char *feature_names[] = { - [DCCPF_RESERVED] = "Reserved", - [DCCPF_CCID] = "CCID", - [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", - [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", - [DCCPF_ECN_INCAPABLE] = "ECN Incapable", - [DCCPF_ACK_RATIO] = "Ack Ratio", - [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", - [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", - [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", - [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", - }; - if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) - return feature_names[DCCPF_RESERVED]; - - if (feat == DCCPF_SEND_LEV_RATE) - return "Send Loss Event Rate"; - if (feat >= DCCPF_MIN_CCID_SPECIFIC) - return "CCID-specific"; - - return feature_names[feat]; -} - -static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING", - "UNSTABLE", "STABLE" }; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_feat_oname(const u8 opt) -{ - switch (opt) { - case DCCPO_CHANGE_L: return "Change_L"; - case DCCPO_CONFIRM_L: return "Confirm_L"; - case DCCPO_CHANGE_R: return "Change_R"; - case DCCPO_CONFIRM_R: return "Confirm_R"; + /* Are values in any order? XXX Lame "algorithm" here */ + for (i = 0; i < slen; i++) { + for (j = 0; j < rlen; j++) { + if (spref[i] == rpref[j]) { + res = &spref[i]; + break; + } + } + if (res) + break; } - return NULL; -} -static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) -{ - u8 i, type = dccp_feat_type(feat_num); - - if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) - dccp_pr_debug_cat("(NULL)"); - else if (type == FEAT_SP) - for (i = 0; i < val->sp.len; i++) - dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); - else if (type == FEAT_NN) - dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); - else - dccp_pr_debug_cat("unknown type %u", type); -} - -static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) -{ - u8 type = dccp_feat_type(feat_num); - dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; - - if (type == FEAT_NN) - fval.nn = dccp_decode_value_var(list, len); - dccp_feat_printval(feat_num, &fval); -} + /* we didn't agree on anything */ + if (res == NULL) { + /* confirm previous value */ + switch (opt->dccpop_feat) { + case DCCPF_CCID: + /* XXX did i get this right? =P */ + if (opt->dccpop_type == DCCPO_CHANGE_L) + res = &dccp_msk(sk)->dccpms_tx_ccid; + else + res = &dccp_msk(sk)->dccpms_rx_ccid; + break; -static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) -{ - dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", - dccp_feat_fname(entry->feat_num)); - dccp_feat_printval(entry->feat_num, &entry->val); - dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], - entry->needs_confirm ? "(Confirm pending)" : ""); -} + default: + DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat); + /* XXX implement res */ + return -EFAULT; + } -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ - dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ - dccp_feat_printvals(feat, val, len); \ - dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) - -#define dccp_feat_print_fnlist(fn_list) { \ - const struct dccp_feat_entry *___entry; \ - \ - dccp_pr_debug("List Dump:\n"); \ - list_for_each_entry(___entry, fn_list, node) \ - dccp_feat_print_entry(___entry); \ -} -#else /* ! CONFIG_IP_DCCP_DEBUG */ -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) -#define dccp_feat_print_fnlist(fn_list) -#endif + dccp_pr_debug("Don't agree... reconfirming %d\n", *res); + agree = 0; /* this is used for mandatory options... */ + } -static int __dccp_feat_activate(struct sock *sk, const int idx, - const bool is_local, dccp_feat_val const *fval) -{ - bool rx; - u64 val; + /* need to put result and our preference list */ + rlen = 1 + opt->dccpop_len; + rpref = kmalloc(rlen, GFP_ATOMIC); + if (rpref == NULL) + return -ENOMEM; - if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) - return -1; - if (dccp_feat_table[idx].activation_hdlr == NULL) - return 0; + *rpref = *res; + memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len); - if (fval == NULL) { - val = dccp_feat_table[idx].default_value; - } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { - if (fval->sp.vec == NULL) { - /* - * This can happen when an empty Confirm is sent - * for an SP (i.e. known) feature. In this case - * we would be using the default anyway. - */ - DCCP_CRIT("Feature #%d undefined: using default", idx); - val = dccp_feat_table[idx].default_value; - } else { - val = fval->sp.vec[0]; + /* put it in the "confirm queue" */ + if (opt->dccpop_sc == NULL) { + opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC); + if (opt->dccpop_sc == NULL) { + kfree(rpref); + return -ENOMEM; } } else { - val = fval->nn; + /* recycle the confirm slot */ + BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); + kfree(opt->dccpop_sc->dccpoc_val); + dccp_pr_debug("recycling confirm slot\n"); + } + memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc)); + + opt->dccpop_sc->dccpoc_val = rpref; + opt->dccpop_sc->dccpoc_len = rlen; + + /* update the option on our side [we are about to send the confirm] */ + rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res); + if (rc) { + kfree(opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc); + opt->dccpop_sc = NULL; + return rc; } - /* Location is RX if this is a local-RX or remote-TX feature */ - rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); - - dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", - dccp_feat_fname(dccp_feat_table[idx].feat_num), - fval ? "" : "default ", (unsigned long long)val); - - return dccp_feat_table[idx].activation_hdlr(sk, val, rx); -} - -/** - * dccp_feat_activate - Activate feature value on socket - * @sk: fully connected DCCP socket (after handshake is complete) - * @feat_num: feature to activate, one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @fval: the value (SP or NN) to activate, or NULL to use the default value - * For general use this function is preferable over __dccp_feat_activate(). - */ -static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, - dccp_feat_val const *fval) -{ - return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); -} - -/* Test for "Req'd" feature (RFC 4340, 6.4) */ -static inline int dccp_feat_must_be_understood(u8 feat_num) -{ - return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || - feat_num == DCCPF_SEQUENCE_WINDOW; -} + dccp_pr_debug("Will confirm %d\n", *rpref); -/* copy constructor, fval must not already contain allocated memory */ -static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) -{ - fval->sp.len = len; - if (fval->sp.len > 0) { - fval->sp.vec = kmemdup(val, len, gfp_any()); - if (fval->sp.vec == NULL) { - fval->sp.len = 0; - return -ENOBUFS; - } + /* say we want to change to X but we just got a confirm X, suppress our + * change + */ + if (!opt->dccpop_conf) { + if (*opt->dccpop_val == *res) + opt->dccpop_conf = 1; + dccp_pr_debug("won't ask for change of same feature\n"); } - return 0; -} -static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) -{ - if (unlikely(val == NULL)) - return; - if (dccp_feat_type(feat_num) == FEAT_SP) - kfree(val->sp.vec); - memset(val, 0, sizeof(*val)); + return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */ } -static struct dccp_feat_entry * - dccp_feat_clone_entry(struct dccp_feat_entry const *original) +static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) { - struct dccp_feat_entry *new; - u8 type = dccp_feat_type(original->feat_num); - - if (type == FEAT_UNKNOWN) - return NULL; + struct dccp_minisock *dmsk = dccp_msk(sk); + struct dccp_opt_pend *opt; + int rc = 1; + u8 t; - new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); - if (new == NULL) - return NULL; + /* + * We received a CHANGE. We gotta match it against our own preference + * list. If we got a CHANGE_R it means it's a change for us, so we need + * to compare our CHANGE_L list. + */ + if (type == DCCPO_CHANGE_L) + t = DCCPO_CHANGE_R; + else + t = DCCPO_CHANGE_L; - if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, - original->val.sp.vec, - original->val.sp.len)) { - kfree(new); - return NULL; - } - return new; -} + /* find our preference list for this feature */ + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + if (opt->dccpop_type != t || opt->dccpop_feat != feature) + continue; -static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) -{ - if (entry != NULL) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - kfree(entry); + /* find the winner from the two preference lists */ + rc = dccp_feat_reconcile(sk, opt, val, len); + break; } -} -/* - * List management functions - * - * Feature negotiation lists rely on and maintain the following invariants: - * - each feat_num in the list is known, i.e. we know its type and default value - * - each feat_num/is_local combination is unique (old entries are overwritten) - * - SP values are always freshly allocated - * - list is sorted in increasing order of feature number (faster lookup) - */ -static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, - u8 feat_num, bool is_local) -{ - struct dccp_feat_entry *entry; + /* We didn't deal with the change. This can happen if we have no + * preference list for the feature. In fact, it just shouldn't + * happen---if we understand a feature, we should have a preference list + * with at least the default value. + */ + BUG_ON(rc == 1); - list_for_each_entry(entry, fn_list, node) - if (entry->feat_num == feat_num && entry->is_local == is_local) - return entry; - else if (entry->feat_num > feat_num) - break; - return NULL; + return rc; } -/** - * dccp_feat_entry_new - Central list update routine (called by all others) - * @head: list to add to - * @feat: feature number - * @local: whether the local (1) or remote feature with number @feat is meant - * This is the only constructor and serves to ensure the above invariants. - */ -static struct dccp_feat_entry * - dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) +static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) { - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, head, node) - if (entry->feat_num == feat && entry->is_local == local) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - return entry; - } else if (entry->feat_num > feat) { - head = &entry->node; - break; - } + struct dccp_opt_pend *opt; + struct dccp_minisock *dmsk = dccp_msk(sk); + u8 *copy; + int rc; - entry = kmalloc(sizeof(*entry), gfp_any()); - if (entry != NULL) { - entry->feat_num = feat; - entry->is_local = local; - list_add_tail(&entry->node, head); + /* NN features must be Change L (sec. 6.3.2) */ + if (type != DCCPO_CHANGE_L) { + dccp_pr_debug("received %s for NN feature %d\n", + dccp_feat_typename(type), feature); + return -EFAULT; } - return entry; -} -/** - * dccp_feat_push_change - Add/overwrite a Change option in the list - * @fn_list: feature-negotiation list to update - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @needs_mandatory: whether to use Mandatory feature negotiation options - * @fval: pointer to NN/SP value to be inserted (will be copied) - */ -static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, - u8 mandatory, dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); + /* XXX sanity check opt val */ - if (new == NULL) + /* copy option so we can confirm it */ + opt = kzalloc(sizeof(*opt), GFP_ATOMIC); + if (opt == NULL) return -ENOMEM; - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_INITIALISING; - new->needs_confirm = 0; - new->empty_confirm = 0; - new->val = *fval; - new->needs_mandatory = mandatory; + copy = kmemdup(val, len, GFP_ATOMIC); + if (copy == NULL) { + kfree(opt); + return -ENOMEM; + } - return 0; -} + opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */ + opt->dccpop_feat = feature; + opt->dccpop_val = copy; + opt->dccpop_len = len; -/** - * dccp_feat_push_confirm - Add a Confirm entry to the FN list - * @fn_list: feature-negotiation list to add to - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is being confirmed - * @fval: pointer to NN/SP value to be inserted or NULL - * Returns 0 on success, a Reset code for further processing otherwise. - */ -static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, - dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); + /* change feature */ + rc = dccp_feat_update(sk, type, feature, *val); + if (rc) { + kfree(opt->dccpop_val); + kfree(opt); + return rc; + } - if (new == NULL) - return DCCP_RESET_CODE_TOO_BUSY; + dccp_feat_debug(type, feature, *copy); - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_STABLE; /* transition in 6.6.2 */ - new->needs_confirm = 1; - new->empty_confirm = (fval == NULL); - new->val.nn = 0; /* zeroes the whole structure */ - if (!new->empty_confirm) - new->val = *fval; - new->needs_mandatory = 0; + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); return 0; } -static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) +static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, + u8 type, u8 feature) { - return dccp_feat_push_confirm(fn_list, feat, local, NULL); -} + /* XXX check if other confirms for that are queued and recycle slot */ + struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC); -static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) -{ - list_del(&entry->node); - dccp_feat_entry_destructor(entry); -} - -void dccp_feat_list_purge(struct list_head *fn_list) -{ - struct dccp_feat_entry *entry, *next; - - list_for_each_entry_safe(entry, next, fn_list, node) - dccp_feat_entry_destructor(entry); - INIT_LIST_HEAD(fn_list); -} -EXPORT_SYMBOL_GPL(dccp_feat_list_purge); - -/* generate @to as full clone of @from - @to must not contain any nodes */ -int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) -{ - struct dccp_feat_entry *entry, *new; - - INIT_LIST_HEAD(to); - list_for_each_entry(entry, from, node) { - new = dccp_feat_clone_entry(entry); - if (new == NULL) - goto cloning_failed; - list_add_tail(&new->node, to); + if (opt == NULL) { + /* XXX what do we do? Ignoring should be fine. It's a change + * after all =P + */ + return; } - return 0; -cloning_failed: - dccp_feat_list_purge(to); - return -ENOMEM; -} + switch (type) { + case DCCPO_CHANGE_L: + opt->dccpop_type = DCCPO_CONFIRM_R; + break; + case DCCPO_CHANGE_R: + opt->dccpop_type = DCCPO_CONFIRM_L; + break; + default: + DCCP_WARN("invalid type %d\n", type); + kfree(opt); + return; + } + opt->dccpop_feat = feature; + opt->dccpop_val = NULL; + opt->dccpop_len = 0; -/** - * dccp_feat_valid_nn_length - Enforce length constraints on NN options - * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, - * incoming options are accepted as long as their values are valid. - */ -static u8 dccp_feat_valid_nn_length(u8 feat_num) -{ - if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ - return 2; - if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ - return 6; - return 0; -} + /* change feature */ + dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature); -static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) -{ - switch (feat_num) { - case DCCPF_ACK_RATIO: - return val <= DCCPF_ACK_RATIO_MAX; - case DCCPF_SEQUENCE_WINDOW: - return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; - } - return 0; /* feature unknown - so we can't tell */ + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); } -/* check that SP values are within the ranges defined in RFC 4340 */ -static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) +static void dccp_feat_flush_confirm(struct sock *sk) { - switch (feat_num) { - case DCCPF_CCID: - return val == DCCPC_CCID2 || val == DCCPC_CCID3; - /* Type-check Boolean feature values: */ - case DCCPF_SHORT_SEQNOS: - case DCCPF_ECN_INCAPABLE: - case DCCPF_SEND_ACK_VECTOR: - case DCCPF_SEND_NDP_COUNT: - case DCCPF_DATA_CHECKSUM: - case DCCPF_SEND_LEV_RATE: - return val < 2; - case DCCPF_MIN_CSUM_COVER: - return val < 16; - } - return 0; /* feature unknown */ -} + struct dccp_minisock *dmsk = dccp_msk(sk); + /* Check if there is anything to confirm in the first place */ + int yes = !list_empty(&dmsk->dccpms_conf); -static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) -{ - if (sp_list == NULL || sp_len < 1) - return 0; - while (sp_len--) - if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) - return 0; - return 1; -} + if (!yes) { + struct dccp_opt_pend *opt; -/** - * dccp_feat_insert_opts - Generate FN options from current list state - * @skb: next sk_buff to be sent to the peer - * @dp: for client during handshake and general negotiation - * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) - */ -int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - struct dccp_feat_entry *pos, *next; - u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; - bool rpt; - - /* put entries into @skb in the order they appear in the list */ - list_for_each_entry_safe_reverse(pos, next, fn, node) { - opt = dccp_feat_genopt(pos); - type = dccp_feat_type(pos->feat_num); - rpt = false; - - if (pos->empty_confirm) { - len = 0; - ptr = NULL; - } else { - if (type == FEAT_SP) { - len = pos->val.sp.len; - ptr = pos->val.sp.vec; - rpt = pos->needs_confirm; - } else if (type == FEAT_NN) { - len = dccp_feat_valid_nn_length(pos->feat_num); - ptr = nn_in_nbo; - dccp_encode_value_var(pos->val.nn, ptr, len); - } else { - DCCP_BUG("unknown feature %u", pos->feat_num); - return -1; + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + if (opt->dccpop_conf) { + yes = 1; + break; } } - dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); - - if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) - return -1; - if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) - return -1; - /* - * Enter CHANGING after transmitting the Change option (6.6.2). - */ - if (pos->state == FEAT_INITIALISING) - pos->state = FEAT_CHANGING; } - return 0; -} - -/** - * __feat_register_nn - Register new NN value on socket - * @fn: feature-negotiation list to register with - * @feat: an NN feature from %dccp_feature_numbers - * @mandatory: use Mandatory option if 1 - * @nn_val: value to register (restricted to 4 bytes) - * Note that NN features are local by definition (RFC 4340, 6.3.2). - */ -static int __feat_register_nn(struct list_head *fn, u8 feat, - u8 mandatory, u64 nn_val) -{ - dccp_feat_val fval = { .nn = nn_val }; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - /* Don't bother with default values, they will be activated anyway. */ - if (nn_val - (u64)dccp_feat_default_value(feat) == 0) - return 0; - - return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); -} - -/** - * __feat_register_sp - Register new SP value/list on socket - * @fn: feature-negotiation list to register with - * @feat: an SP feature from %dccp_feature_numbers - * @is_local: whether the local (1) or the remote (0) @feat is meant - * @mandatory: use Mandatory option if 1 - * @sp_val: SP value followed by optional preference list - * @sp_len: length of @sp_val in bytes - */ -static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, - u8 mandatory, u8 const *sp_val, u8 sp_len) -{ - dccp_feat_val fval; - if (dccp_feat_type(feat) != FEAT_SP || - !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) - return -EINVAL; - - /* Avoid negotiating alien CCIDs by only advertising supported ones */ - if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) - return -EOPNOTSUPP; - - if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) - return -ENOMEM; + if (!yes) + return; - return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); + /* OK there is something to confirm... */ + /* XXX check if packet is in flight? Send delayed ack?? */ + if (sk->sk_state == DCCP_OPEN) + dccp_send_ack(sk); } -/** - * dccp_feat_register_sp - Register requests to change SP feature values - * @sk: client or listening socket - * @feat: one of %dccp_feature_numbers - * @is_local: whether the local (1) or remote (0) @feat is meant - * @list: array of preferred values, in descending order of preference - * @len: length of @list in bytes - */ -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len) -{ /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_SP) - return -EINVAL; - return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, - 0, list, len); -} - -/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ -int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) +int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) { - /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_NN) - return -EINVAL; - return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); -} + int rc; -/** - * dccp_feat_signal_nn_change - Update NN values for an established connection - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * @nn_val: the new value to use - * This function is used to communicate NN updates out-of-band. The difference - * to feature negotiation during connection setup is that values are activated - * immediately after validation, i.e. we don't wait for the Confirm: either the - * value is accepted by the peer (and then the waiting is futile), or it is not - * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values - * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2). - */ -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - dccp_feat_val fval = { .nn = nn_val }; - struct dccp_feat_entry *entry; + dccp_feat_debug(type, feature, *val); - if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) - return 0; + /* figure out if it's SP or NN feature */ + switch (feature) { + /* deal with SP features */ + case DCCPF_CCID: + rc = dccp_feat_sp(sk, type, feature, val, len); + break; - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; + /* deal with NN features */ + case DCCPF_ACK_RATIO: + rc = dccp_feat_nn(sk, type, feature, val, len); + break; - entry = dccp_feat_list_lookup(fn, feat, 1); - if (entry != NULL) { - dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n", - (unsigned long long)nn_val, - (unsigned long long)entry->val.nn, - dccp_feat_sname[entry->state]); - return 0; + /* XXX implement other features */ + default: + dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n", + dccp_feat_typename(type), feature); + rc = -EFAULT; + break; } - if (dccp_feat_activate(sk, feat, 1, &fval)) - return -EADV; - - inet_csk_schedule_ack(sk); - return dccp_feat_push_change(fn, feat, 1, 0, &fval); -} -EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); - -/* - * Tracking features whose value depend on the choice of CCID - * - * This is designed with an extension in mind so that a list walk could be done - * before activating any features. However, the existing framework was found to - * work satisfactorily up until now, the automatic verification is left open. - * When adding new CCIDs, add a corresponding dependency table here. - */ -static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) -{ - static const struct ccid_dependency ccid2_dependencies[2][2] = { - /* - * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX - * feature and Send Ack Vector is an RX feature, `is_local' - * needs to be reversed. + /* check if there were problems changing features */ + if (rc) { + /* If we don't agree on SP, we sent a confirm for old value. + * However we propagate rc to caller in case option was + * mandatory */ - { /* Dependencies of the receiver-side (remote) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - }, - { /* Dependencies of the sender-side (local) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - static const struct ccid_dependency ccid3_dependencies[2][5] = { - { /* - * Dependencies of the receiver-side CCID3 - */ - { /* locally disable Ack Vectors */ - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* see below why Send Loss Event Rate is on */ - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { /* NDP Count is needed as per RFC 4342, 6.1.1 */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 }, - }, - { /* - * CCID3 at the TX side: we request that the HC-receiver - * will not send Ack Vectors (they will be ignored, so - * Mandatory is not set); we enable Send Loss Event Rate - * (Mandatory since the implementation does not support - * the Loss Intervals option of RFC 4342, 8.6). - * The last two options are for peer's information only. - */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = false, - .val = 0 - }, - { - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { /* this CCID does not support Ack Ratio */ - .dependent_feat = DCCPF_ACK_RATIO, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* tell receiver we are sending NDP counts */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = true, - .is_mandatory = false, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - switch (ccid) { - case DCCPC_CCID2: - return ccid2_dependencies[is_local]; - case DCCPC_CCID3: - return ccid3_dependencies[is_local]; - default: - return NULL; + if (rc != DCCP_FEAT_SP_NOAGREE) + dccp_feat_empty_confirm(dccp_msk(sk), type, feature); } -} -/** - * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID - * @fn: feature-negotiation list to update - * @id: CCID number to track - * @is_local: whether TX CCID (1) or RX CCID (0) is meant - * This function needs to be called after registering all other features. - */ -static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) -{ - const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); - int i, rc = (table == NULL); - - for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) - if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) - rc = __feat_register_sp(fn, table[i].dependent_feat, - table[i].is_local, - table[i].is_mandatory, - &table[i].val, 1); - else - rc = __feat_register_nn(fn, table[i].dependent_feat, - table[i].is_mandatory, - table[i].val); - return rc; -} - -/** - * dccp_feat_finalise_settings - Finalise settings before starting negotiation - * @dp: client or listening socket (settings will be inherited) - * This is called after all registrations (socket initialisation, sysctls, and - * sockopt calls), and before sending the first packet containing Change options - * (ie. client-Request or server-Response), to ensure internal consistency. - */ -int dccp_feat_finalise_settings(struct dccp_sock *dp) -{ - struct list_head *fn = &dp->dccps_featneg; - struct dccp_feat_entry *entry; - int i = 2, ccids[2] = { -1, -1 }; + /* generate the confirm [if required] */ + dccp_feat_flush_confirm(sk); - /* - * Propagating CCIDs: - * 1) not useful to propagate CCID settings if this host advertises more - * than one CCID: the choice of CCID may still change - if this is - * the client, or if this is the server and the client sends - * singleton CCID values. - * 2) since is that propagate_ccid changes the list, we defer changing - * the sorted list until after the traversal. - */ - list_for_each_entry(entry, fn, node) - if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) - ccids[entry->is_local] = entry->val.sp.vec[0]; - while (i--) - if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) - return -1; - dccp_feat_print_fnlist(fn); - return 0; + return rc; } -/** - * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features - * It is the server which resolves the dependencies once the CCID has been - * fully negotiated. If no CCID has been negotiated, it uses the default CCID. - */ -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) -{ - struct list_head *fn = &dreq->dreq_featneg; - struct dccp_feat_entry *entry; - u8 is_local, ccid; - - for (is_local = 0; is_local <= 1; is_local++) { - entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); - - if (entry != NULL && !entry->empty_confirm) - ccid = entry->val.sp.vec[0]; - else - ccid = dccp_feat_default_value(DCCPF_CCID); - - if (dccp_feat_propagate_ccid(fn, ccid, is_local)) - return -1; - } - return 0; -} +EXPORT_SYMBOL_GPL(dccp_feat_change_recv); -/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ -static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) +int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, + u8 *val, u8 len) { - u8 c, s; + u8 t; + struct dccp_opt_pend *opt; + struct dccp_minisock *dmsk = dccp_msk(sk); + int found = 0; + int all_confirmed = 1; - for (s = 0; s < slen; s++) - for (c = 0; c < clen; c++) - if (servlist[s] == clilist[c]) - return servlist[s]; - return -1; -} + dccp_feat_debug(type, feature, *val); -/** - * dccp_feat_prefer - Move preferred entry to the start of array - * Reorder the @array_len elements in @array so that @preferred_value comes - * first. Returns >0 to indicate that @preferred_value does occur in @array. - */ -static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) -{ - u8 i, does_occur = 0; + /* locate our change request */ + switch (type) { + case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break; + case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break; + default: DCCP_WARN("invalid type %d\n", type); + return 1; - if (array != NULL) { - for (i = 0; i < array_len; i++) - if (array[i] == preferred_value) { - array[i] = array[0]; - does_occur++; - } - if (does_occur) - array[0] = preferred_value; } - return does_occur; -} + /* XXX sanity check feature value */ -/** - * dccp_feat_reconcile - Reconcile SP preference lists - * @fval: SP list to reconcile into - * @arr: received SP preference list - * @len: length of @arr in bytes - * @is_server: whether this side is the server (and @fv is the server's list) - * @reorder: whether to reorder the list in @fv after reconciling with @arr - * When successful, > 0 is returned and the reconciled list is in @fval. - * A value of 0 means that negotiation failed (no shared entry). - */ -static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, - bool is_server, bool reorder) -{ - int rc; + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + if (!opt->dccpop_conf && opt->dccpop_type == t && + opt->dccpop_feat == feature) { + found = 1; + dccp_pr_debug("feature %d found\n", opt->dccpop_feat); - if (!fv->sp.vec || !arr) { - DCCP_CRIT("NULL feature value or array"); - return 0; - } + /* XXX do sanity check */ - if (is_server) - rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); - else - rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); - - if (!reorder) - return rc; - if (rc < 0) - return 0; + opt->dccpop_conf = 1; - /* - * Reorder list: used for activating features and in dccp_insert_fn_opt. - */ - return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); -} + /* We got a confirmation---change the option */ + dccp_feat_update(sk, opt->dccpop_type, + opt->dccpop_feat, *val); -/** - * dccp_feat_change_recv - Process incoming ChangeL/R options - * @fn: feature-negotiation list to update - * @is_mandatory: whether the Change was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is the server (1) or the client (0) - */ -static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 defval, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CHANGE_R); - struct dccp_feat_entry *entry; - dccp_feat_val fval; - - if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ - goto unknown_feature_or_value; - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - /* - * Negotiation of NN features: Change R is invalid, so there is no - * simultaneous negotiation; hence we do not look up in the list. - */ - if (type == FEAT_NN) { - if (local || len > sizeof(fval.nn)) - goto unknown_feature_or_value; - - /* 6.3.2: "The feature remote MUST accept any valid value..." */ - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto unknown_feature_or_value; + /* XXX check the return value of dccp_feat_update */ + break; + } - return dccp_feat_push_confirm(fn, feat, local, &fval); + if (!opt->dccpop_conf) + all_confirmed = 0; } - /* - * Unidirectional/simultaneous negotiation of SP features (6.3.1) + /* fix re-transmit timer */ + /* XXX gotta make sure that no option negotiation occurs during + * connection shutdown. Consider that the CLOSEREQ is sent and timer is + * on. if all options are confirmed it might kill timer which should + * remain alive until close is received. */ - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL) { - /* - * No particular preferences have been registered. We deal with - * this situation by assuming that all valid values are equally - * acceptable, and apply the following checks: - * - if the peer's list is a singleton, we accept a valid value; - * - if we are the server, we first try to see if the peer (the - * client) advertises the default value. If yes, we use it, - * otherwise we accept the preferred value; - * - else if we are the client, we use the first list element. - */ - if (dccp_feat_clone_sp_val(&fval, val, 1)) - return DCCP_RESET_CODE_TOO_BUSY; - - if (len > 1 && server) { - defval = dccp_feat_default_value(feat); - if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) - fval.sp.vec[0] = defval; - } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { - kfree(fval.sp.vec); - goto unknown_feature_or_value; - } - - /* Treat unsupported CCIDs like invalid values */ - if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { - kfree(fval.sp.vec); - goto not_valid_or_not_known; - } - - return dccp_feat_push_confirm(fn, feat, local, &fval); - - } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ - return 0; + if (all_confirmed) { + dccp_pr_debug("clear feat negotiation timer %p\n", sk); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } - if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { - entry->empty_confirm = 0; - } else if (is_mandatory) { - return DCCP_RESET_CODE_MANDATORY_ERROR; - } else if (entry->state == FEAT_INITIALISING) { - /* - * Failed simultaneous negotiation (server only): try to `save' - * the connection by checking whether entry contains the default - * value for @feat. If yes, send an empty Confirm to signal that - * the received Change was not understood - which implies using - * the default value. - * If this also fails, we use Reset as the last resort. - */ - WARN_ON(!server); - defval = dccp_feat_default_value(feat); - if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) - return DCCP_RESET_CODE_OPTION_ERROR; - entry->empty_confirm = 1; - } - entry->needs_confirm = 1; - entry->needs_mandatory = 0; - entry->state = FEAT_STABLE; + if (!found) + dccp_pr_debug("%s(%d, ...) never requested\n", + dccp_feat_typename(type), feature); return 0; - -unknown_feature_or_value: - if (!is_mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -not_valid_or_not_known: - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; } -/** - * dccp_feat_confirm_recv - Process received Confirm options - * @fn: feature-negotiation list to update - * @is_mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is server (1) or client (0) - */ -static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 *plist, plen, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - if (entry == NULL) { /* nothing queued: ignore or handle error */ - if (is_mandatory && type == FEAT_UNKNOWN) - return DCCP_RESET_CODE_MANDATORY_ERROR; - - if (!local && type == FEAT_NN) /* 6.3.2 */ - goto confirmation_failed; - return 0; - } - - if (entry->state != FEAT_CHANGING) /* 6.6.2 */ - return 0; - - if (len == 0) { - if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ - goto confirmation_failed; - /* - * Empty Confirm during connection setup: this means reverting - * to the `old' value, which in this case is the default. Since - * we handle default values automatically when no other values - * have been set, we revert to the old value by removing this - * entry from the list. - */ - dccp_feat_list_pop(entry); - return 0; - } +EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv); - if (type == FEAT_NN) { - if (len > sizeof(entry->val.nn)) - goto confirmation_failed; +void dccp_feat_clean(struct dccp_minisock *dmsk) +{ + struct dccp_opt_pend *opt, *next; - if (entry->val.nn == dccp_decode_value_var(val, len)) - goto confirmation_succeeded; + list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending, + dccpop_node) { + BUG_ON(opt->dccpop_val == NULL); + kfree(opt->dccpop_val); - DCCP_WARN("Bogus Confirm for non-existing value\n"); - goto confirmation_failed; - } + if (opt->dccpop_sc != NULL) { + BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); + kfree(opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc); + } - /* - * Parsing SP Confirms: the first element of @val is the preferred - * SP value which the peer confirms, the remainder depends on @len. - * Note that only the confirmed value need to be a valid SP value. - */ - if (!dccp_feat_is_valid_sp_val(feat, *val)) - goto confirmation_failed; - - if (len == 1) { /* peer didn't supply a preference list */ - plist = val; - plen = len; - } else { /* preferred value + preference list */ - plist = val + 1; - plen = len - 1; + kfree(opt); } + INIT_LIST_HEAD(&dmsk->dccpms_pending); - /* Check whether the peer got the reconciliation right (6.6.8) */ - if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { - DCCP_WARN("Confirm selected the wrong value %u\n", *val); - return DCCP_RESET_CODE_OPTION_ERROR; + list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { + BUG_ON(opt == NULL); + if (opt->dccpop_val != NULL) + kfree(opt->dccpop_val); + kfree(opt); } - entry->val.sp.vec[0] = *val; - -confirmation_succeeded: - entry->state = FEAT_STABLE; - return 0; - -confirmation_failed: - DCCP_WARN("Confirmation failed\n"); - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; + INIT_LIST_HEAD(&dmsk->dccpms_conf); } -/** - * dccp_feat_handle_nn_established - Fast-path reception of NN options - * @sk: socket of an established DCCP connection - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) - * @feat: NN number, one of %dccp_feature_numbers - * @val: NN value - * @len: length of @val in bytes - * This function combines the functionality of change_recv/confirm_recv, with - * the following differences (reset codes are the same): - * - cleanup after receiving the Confirm; - * - values are directly activated after successful parsing; - * - deliberately restricted to NN features. - * The restriction to NN features is essential since SP features can have non- - * predictable outcomes (depending on the remote configuration), and are inter- - * dependent (CCIDs for instance cause further dependencies). +EXPORT_SYMBOL_GPL(dccp_feat_clean); + +/* this is to be called only when a listening sock creates its child. It is + * assumed by the function---the confirm is not duplicated, but rather it is + * "passed on". */ -static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, - u8 feat, u8 *val, u8 len) +int dccp_feat_clone(struct sock *oldsk, struct sock *newsk) { - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry; - u8 type = dccp_feat_type(feat); - dccp_feat_val fval; + struct dccp_minisock *olddmsk = dccp_msk(oldsk); + struct dccp_minisock *newdmsk = dccp_msk(newsk); + struct dccp_opt_pend *opt; + int rc = 0; - dccp_feat_print_opt(opt, feat, val, len, mandatory); + INIT_LIST_HEAD(&newdmsk->dccpms_pending); + INIT_LIST_HEAD(&newdmsk->dccpms_conf); - /* Ignore non-mandatory unknown and non-NN features */ - if (type == FEAT_UNKNOWN) { - if (local && !mandatory) - return 0; - goto fast_path_unknown; - } else if (type != FEAT_NN) { - return 0; - } - - /* - * We don't accept empty Confirms, since in fast-path feature - * negotiation the values are enabled immediately after sending - * the Change option. - * Empty Changes on the other hand are invalid (RFC 4340, 6.1). - */ - if (len == 0 || len > sizeof(fval.nn)) - goto fast_path_unknown; - - if (opt == DCCPO_CHANGE_L) { - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto fast_path_unknown; + list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) { + struct dccp_opt_pend *newopt; + /* copy the value of the option */ + u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC); - if (dccp_feat_push_confirm(fn, feat, local, &fval) || - dccp_feat_activate(sk, feat, local, &fval)) - return DCCP_RESET_CODE_TOO_BUSY; + if (val == NULL) + goto out_clean; - /* set the `Ack Pending' flag to piggyback a Confirm */ - inet_csk_schedule_ack(sk); - - } else if (opt == DCCPO_CONFIRM_R) { - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL || entry->state != FEAT_CHANGING) - return 0; - - fval.nn = dccp_decode_value_var(val, len); - if (fval.nn != entry->val.nn) { - DCCP_WARN("Bogus Confirm for non-existing value\n"); - goto fast_path_failed; + newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC); + if (newopt == NULL) { + kfree(val); + goto out_clean; } - /* It has been confirmed - so remove the entry */ - dccp_feat_list_pop(entry); + /* insert the option */ + newopt->dccpop_val = val; + list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending); - } else { - DCCP_WARN("Received illegal option %u\n", opt); - goto fast_path_failed; + /* XXX what happens with backlogs and multiple connections at + * once... + */ + /* the master socket no longer needs to worry about confirms */ + opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */ + + /* reset state for a new socket */ + opt->dccpop_conf = 0; } - return 0; -fast_path_unknown: - if (!mandatory) - return dccp_push_empty_confirm(fn, feat, local); + /* XXX not doing anything about the conf queue */ + +out: + return rc; -fast_path_failed: - return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; +out_clean: + dccp_feat_clean(newdmsk); + rc = -ENOMEM; + goto out; } -/** - * dccp_feat_parse_options - Process Feature-Negotiation Options - * @sk: for general use and used by the client during connection setup - * @dreq: used by the server during connection setup - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: value contents of @opt - * @len: length of @val in bytes - * Returns 0 on success, a Reset code for ending the connection otherwise. - */ -int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) +EXPORT_SYMBOL_GPL(dccp_feat_clone); + +static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat, + u8 *val, u8 len) { - struct dccp_sock *dp = dccp_sk(sk); - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - bool server = false; + int rc = -ENOMEM; + u8 *copy = kmemdup(val, len, GFP_KERNEL); - switch (sk->sk_state) { - /* - * Negotiation during connection setup - */ - case DCCP_LISTEN: - server = true; /* fall through */ - case DCCP_REQUESTING: - switch (opt) { - case DCCPO_CHANGE_L: - case DCCPO_CHANGE_R: - return dccp_feat_change_recv(fn, mandatory, opt, feat, - val, len, server); - case DCCPO_CONFIRM_R: - case DCCPO_CONFIRM_L: - return dccp_feat_confirm_recv(fn, mandatory, opt, feat, - val, len, server); - } - break; - /* - * Support for exchanging NN options on an established connection - * This is currently restricted to Ack Ratio (RFC 4341, 6.1.2) - */ - case DCCP_OPEN: - case DCCP_PARTOPEN: - return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, - val, len); + if (copy != NULL) { + rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL); + if (rc) + kfree(copy); } - return 0; /* ignore FN options in all other states */ + return rc; } -/** - * dccp_feat_init - Seed feature negotiation with host-specific defaults - * This initialises global defaults, depending on the value of the sysctls. - * These can later be overridden by registering changes via setsockopt calls. - * The last link in the chain is finalise_settings, to make sure that between - * here and the start of actual feature negotiation no inconsistencies enter. - * - * All features not appearing below use either defaults or are otherwise - * later adjusted through dccp_feat_finalise_settings(). - */ -int dccp_feat_init(struct sock *sk) +int dccp_feat_init(struct dccp_minisock *dmsk) { - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - u8 on = 1, off = 0; int rc; - struct { - u8 *val; - u8 len; - } tx, rx; - - /* Non-negotiable (NN) features */ - rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, - sysctl_dccp_sequence_window); - if (rc) - return rc; - /* Server-priority (SP) features */ - - /* Advertise that short seqnos are not supported (7.6.1) */ - rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); - if (rc) - return rc; + INIT_LIST_HEAD(&dmsk->dccpms_pending); + INIT_LIST_HEAD(&dmsk->dccpms_conf); - /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ - rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); + /* CCID L */ + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID, + &dmsk->dccpms_tx_ccid, 1); if (rc) - return rc; - - /* - * We advertise the available list of CCIDs and reorder according to - * preferences, to avoid failure resulting from negotiating different - * singleton values (which always leads to failure). - * These settings can still (later) be overridden via sockopts. - */ - if (ccid_get_builtin_ccids(&tx.val, &tx.len) || - ccid_get_builtin_ccids(&rx.val, &rx.len)) - return -ENOBUFS; - - /* Pre-load all CCID modules that are going to be advertised */ - rc = -EUNATCH; - if (ccid_request_modules(tx.val, tx.len)) - goto free_ccid_lists; - - if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || - !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) - goto free_ccid_lists; + goto out; - rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); + /* CCID R */ + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID, + &dmsk->dccpms_rx_ccid, 1); if (rc) - goto free_ccid_lists; + goto out; - rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); - -free_ccid_lists: - kfree(tx.val); - kfree(rx.val); + /* Ack ratio */ + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO, + &dmsk->dccpms_ack_ratio, 1); +out: return rc; } -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *cur, *next; - int idx; - dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { - [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } - }; - - list_for_each_entry(cur, fn_list, node) { - /* - * An empty Confirm means that either an unknown feature type - * or an invalid value was present. In the first case there is - * nothing to activate, in the other the default value is used. - */ - if (cur->empty_confirm) - continue; +EXPORT_SYMBOL_GPL(dccp_feat_init); - idx = dccp_feat_index(cur->feat_num); - if (idx < 0) { - DCCP_BUG("Unknown feature %u", cur->feat_num); - goto activation_failed; - } - if (cur->state != FEAT_STABLE) { - DCCP_CRIT("Negotiation of %s %s failed in state %s", - cur->is_local ? "local" : "remote", - dccp_feat_fname(cur->feat_num), - dccp_feat_sname[cur->state]); - goto activation_failed; - } - fvals[idx][cur->is_local] = &cur->val; +#ifdef CONFIG_IP_DCCP_DEBUG +const char *dccp_feat_typename(const u8 type) +{ + switch(type) { + case DCCPO_CHANGE_L: return("ChangeL"); + case DCCPO_CONFIRM_L: return("ConfirmL"); + case DCCPO_CHANGE_R: return("ChangeR"); + case DCCPO_CONFIRM_R: return("ConfirmR"); + /* the following case must not appear in feature negotation */ + default: dccp_pr_debug("unknown type %d [BUG!]\n", type); } + return NULL; +} - /* - * Activate in decreasing order of index, so that the CCIDs are always - * activated as the last feature. This avoids the case where a CCID - * relies on the initialisation of one or more features that it depends - * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). - */ - for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) - if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || - __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { - DCCP_CRIT("Could not activate %d", idx); - goto activation_failed; - } +EXPORT_SYMBOL_GPL(dccp_feat_typename); - /* Clean up Change options which have been confirmed already */ - list_for_each_entry_safe(cur, next, fn_list, node) - if (!cur->needs_confirm) - dccp_feat_list_pop(cur); +const char *dccp_feat_name(const u8 feat) +{ + static const char *feature_names[] = { + [DCCPF_RESERVED] = "Reserved", + [DCCPF_CCID] = "CCID", + [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", + [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", + [DCCPF_ECN_INCAPABLE] = "ECN Incapable", + [DCCPF_ACK_RATIO] = "Ack Ratio", + [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", + [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", + [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", + [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", + }; + if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) + return feature_names[DCCPF_RESERVED]; - dccp_pr_debug("Activation OK\n"); - return 0; + if (feat >= DCCPF_MIN_CCID_SPECIFIC) + return "CCID-specific"; -activation_failed: - /* - * We clean up everything that may have been allocated, since - * it is difficult to track at which stage negotiation failed. - * This is ok, since all allocation functions below are robust - * against NULL arguments. - */ - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - return -1; + return feature_names[feat]; } + +EXPORT_SYMBOL_GPL(dccp_feat_name); +#endif /* CONFIG_IP_DCCP_DEBUG */ diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 2217066e22d..e272222c7ac 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -3,134 +3,38 @@ /* * net/dccp/feat.h * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * Copyright (c) 2008 Gerrit Renker + * An implementation of the DCCP protocol * Copyright (c) 2005 Andrea Bittau * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. */ + #include #include "dccp.h" -/* - * Known limit values - */ -/* Ack Ratio takes 2-byte integer values (11.3) */ -#define DCCPF_ACK_RATIO_MAX 0xFFFF -/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ -#define DCCPF_SEQ_WMIN 32 -#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull -/* Maximum number of SP values that fit in a single (Confirm) option */ -#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) - -enum dccp_feat_type { - FEAT_AT_RX = 1, /* located at RX side of half-connection */ - FEAT_AT_TX = 2, /* located at TX side of half-connection */ - FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ - FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ - FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ -}; - -enum dccp_feat_state { - FEAT_DEFAULT = 0, /* using default values from 6.4 */ - FEAT_INITIALISING, /* feature is being initialised */ - FEAT_CHANGING, /* Change sent but not confirmed yet */ - FEAT_UNSTABLE, /* local modification in state CHANGING */ - FEAT_STABLE /* both ends (think they) agree */ -}; +#ifdef CONFIG_IP_DCCP_DEBUG +extern const char *dccp_feat_typename(const u8 type); +extern const char *dccp_feat_name(const u8 feat); -/** - * dccp_feat_val - Container for SP or NN feature values - * @nn: single NN value - * @sp.vec: single SP value plus optional preference list - * @sp.len: length of @sp.vec in bytes - */ -typedef union { - u64 nn; - struct { - u8 *vec; - u8 len; - } sp; -} dccp_feat_val; - -/** - * struct feat_entry - Data structure to perform feature negotiation - * @feat_num: one of %dccp_feature_numbers - * @val: feature's current value (SP features may have preference list) - * @state: feature's current state - * @needs_mandatory: whether Mandatory options should be sent - * @needs_confirm: whether to send a Confirm instead of a Change - * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) - * @is_local: feature location (1) or feature-remote (0) - * @node: list pointers, entries arranged in FIFO order - */ -struct dccp_feat_entry { - u8 feat_num; - dccp_feat_val val; - enum dccp_feat_state state:8; - bool needs_mandatory:1, - needs_confirm:1, - empty_confirm:1, - is_local:1; - - struct list_head node; -}; - -static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) +static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) { - if (entry->needs_confirm) - return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; - return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; + dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type), + dccp_feat_name(feat), feat, val); } +#else +#define dccp_feat_debug(type, feat, val) +#endif /* CONFIG_IP_DCCP_DEBUG */ + +extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, + u8 *val, u8 len, gfp_t gfp); +extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, + u8 *val, u8 len); +extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, + u8 *val, u8 len); +extern void dccp_feat_clean(struct dccp_minisock *dmsk); +extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); +extern int dccp_feat_init(struct dccp_minisock *dmsk); -/** - * struct ccid_dependency - Track changes resulting from choosing a CCID - * @dependent_feat: one of %dccp_feature_numbers - * @is_local: local (1) or remote (0) @dependent_feat - * @is_mandatory: whether presence of @dependent_feat is mission-critical or not - * @val: corresponding default value for @dependent_feat (u8 is sufficient here) - */ -struct ccid_dependency { - u8 dependent_feat; - bool is_local:1, - is_mandatory:1; - u8 val; -}; - -/* - * Sysctls to seed defaults for feature negotiation - */ -extern unsigned long sysctl_dccp_sequence_window; -extern int sysctl_dccp_rx_ccid; -extern int sysctl_dccp_tx_ccid; - -extern int dccp_feat_init(struct sock *sk); -extern void dccp_feat_initialise_sysctls(void); -extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len); -extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); -extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, - u8 mand, u8 opt, u8 feat, u8 *val, u8 len); -extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); - -/* - * Encoding variable-length options and their maximum length. - * - * This affects NN options (SP options are all u8) and other variable-length - * options (see table 3 in RFC 4340). The limit is currently given the Sequence - * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other - * options consume less than 6 bytes (timestamps are 4 bytes). - * When updating this constant (e.g. due to new internet drafts / RFCs), make - * sure that you also update all code which refers to it. - */ -#define DCCP_OPTVAL_MAXLEN 6 - -extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); -extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); - -extern int dccp_insert_option_mandatory(struct sk_buff *skb); -extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len, bool repeat_first); #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c index df0e6714aa1..779d0ed9ae9 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -159,15 +159,13 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) dccp_time_wait(sk, DCCP_TIME_WAIT, 0); } -static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) +static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) { - struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; + struct dccp_sock *dp = dccp_sk(sk); - if (av == NULL) - return; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); - dccp_ackvec_input(av, skb); + if (dccp_msk(sk)->dccpms_send_ack_vector) + dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq); } static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) @@ -366,13 +364,22 @@ discard: int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned len) { + struct dccp_sock *dp = dccp_sk(sk); + if (dccp_check_seqno(sk, skb)) goto discard; if (dccp_parse_options(sk, NULL, skb)) return 1; - dccp_handle_ackvec_processing(sk, skb); + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto discard; dccp_deliver_input_to_ccids(sk, skb); return __dccp_rcv_established(sk, skb, dh, len); @@ -414,33 +421,40 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } - /* - * If option processing (Step 8) failed, return 1 here so that - * dccp_v4_do_rcv() sends a Reset. The Reset code depends on - * the option type and is set in dccp_parse_options(). - */ if (dccp_parse_options(sk, NULL, skb)) - return 1; + goto out_invalid_packet; /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - dp->dccps_options_received.dccpor_timestamp_echo)); + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto out_invalid_packet; /* FIXME: change error code */ + /* Stop the REQUEST timer */ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); WARN_ON(sk->sk_send_head == NULL); kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; + dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_update_gsr(sk, dp->dccps_isr); /* - * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect - * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH - * is done as part of activating the feature values below, since - * these settings depend on the local/remote Sequence Window - * features, which were undefined or not confirmed until now. + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + * + * AWL was adjusted in dccp_v4_connect -acme */ - dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_set_seqno(&dp->dccps_swl, + max48(dp->dccps_swl, dp->dccps_isr)); dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -461,15 +475,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, */ dccp_set_state(sk, DCCP_PARTOPEN); - /* - * If feature negotiation was successful, activate features now; - * an activation failure means that this host could not activate - * one ore more features (e.g. insufficient memory), which would - * leave at least one feature in an undefined state. - */ - if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) - goto unable_to_proceed; - /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); @@ -504,16 +509,6 @@ out_invalid_packet: /* dccp_v4_do_rcv will send a reset */ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; return 1; - -unable_to_proceed: - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; - /* - * We mark this socket as no longer usable, so that the loop in - * dccp_sendmsg() terminates and the application gets notified. - */ - dccp_set_state(sk, DCCP_CLOSED); - sk->sk_err = ECOMM; - return 1; } static int dccp_rcv_respond_partopen_state_process(struct sock *sk, @@ -595,6 +590,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) < 0) return 1; + + /* FIXME: do congestion control initialization */ goto discard; } if (dh->dccph_type == DCCP_PKT_RESET) @@ -603,35 +600,29 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Caller (dccp_v4_do_rcv) will send Reset */ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; return 1; - } else if (sk->sk_state == DCCP_CLOSED) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; } - /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ - if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) - goto discard; + if (sk->sk_state != DCCP_REQUESTING) { + if (dccp_check_seqno(sk, skb)) + goto discard; - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if ((dp->dccps_role != DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_RESPONSE) || - (dp->dccps_role == DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_REQUEST) || - (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); - goto discard; - } + /* + * Step 8: Process options and mark acknowledgeable + */ + if (dccp_parse_options(sk, NULL, skb)) + return 1; - /* Step 8: Process options */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; + if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto discard; + + dccp_deliver_input_to_ccids(sk, skb); + } /* * Step 9: Process Reset @@ -640,22 +631,44 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * S.state := TIMEWAIT * Set TIMEWAIT timer * Drop packet and return - */ + */ if (dh->dccph_type == DCCP_PKT_RESET) { dccp_rcv_reset(sk, skb); return 0; - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_RESPONSE) || + (dp->dccps_role == DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_REQUEST) || + (sk->sk_state == DCCP_RESPOND && + dh->dccph_type == DCCP_PKT_DATA)) { + dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); + goto discard; + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { if (dccp_rcv_closereq(sk, skb)) return 0; goto discard; - } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ + } else if (dh->dccph_type == DCCP_PKT_CLOSE) { if (dccp_rcv_close(sk, skb)) return 0; goto discard; } switch (sk->sk_state) { + case DCCP_CLOSED: + dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + return 1; + case DCCP_REQUESTING: + /* FIXME: do congestion control initialization */ + queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); if (queued >= 0) return queued; @@ -663,12 +676,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, __kfree_skb(skb); return 0; - case DCCP_PARTOPEN: - /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - /* fall through */ case DCCP_RESPOND: + case DCCP_PARTOPEN: queued = dccp_rcv_respond_partopen_state_process(sk, skb, dh, len); break; @@ -707,7 +716,16 @@ u32 dccp_sample_rtt(struct sock *sk, long delta) /* dccpor_elapsed_time is either zeroed out or set and > 0 */ delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; - return dccp_sane_rtt(delta); + if (unlikely(delta <= 0)) { + DCCP_WARN("unusable RTT sample %ld, using min\n", delta); + return DCCP_SANE_RTT_MIN; + } + if (unlikely(delta > DCCP_SANE_RTT_MAX)) { + DCCP_WARN("RTT sample %ld too large, using max\n", delta); + return DCCP_SANE_RTT_MAX; + } + + return delta; } EXPORT_SYMBOL_GPL(dccp_sample_rtt); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b623f6b2548..882c5c4de69 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -545,7 +545,6 @@ out: static void dccp_v4_reqsk_destructor(struct request_sock *req) { - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); kfree(inet_rsk(req)->opt); } @@ -596,8 +595,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; + dccp_reqsk_init(req, skb); dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ad6212e0043..5e1ee0da2c4 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -302,7 +302,6 @@ done: static void dccp_v6_reqsk_destructor(struct request_sock *req) { - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); if (inet6_rsk(req)->pktopts != NULL) kfree_skb(inet6_rsk(req)->pktopts); } @@ -425,8 +424,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; + dccp_reqsk_init(req, skb); dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index f4d9c8f60ed..b2804e2d1b8 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -42,6 +42,16 @@ struct inet_timewait_death_row dccp_death_row = { EXPORT_SYMBOL_GPL(dccp_death_row); +void dccp_minisock_init(struct dccp_minisock *dmsk) +{ + dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; + dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; + dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; + dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; + dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; + dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; +} + void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -102,9 +112,10 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); if (newsk != NULL) { - struct dccp_request_sock *dreq = dccp_rsk(req); + const struct dccp_request_sock *dreq = dccp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct dccp_sock *newdp = dccp_sk(newsk); + struct dccp_minisock *newdmsk = dccp_msk(newsk); newdp->dccps_role = DCCP_ROLE_SERVER; newdp->dccps_hc_rx_ackvec = NULL; @@ -114,32 +125,65 @@ struct sock *dccp_create_openreq_child(struct sock *sk, newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; newicsk->icsk_rto = DCCP_TIMEOUT_INIT; - INIT_LIST_HEAD(&newdp->dccps_featneg); + if (dccp_feat_clone(sk, newsk)) + goto out_free; + + if (newdmsk->dccpms_send_ack_vector) { + newdp->dccps_hc_rx_ackvec = + dccp_ackvec_alloc(GFP_ATOMIC); + if (unlikely(newdp->dccps_hc_rx_ackvec == NULL)) + goto out_free; + } + + newdp->dccps_hc_rx_ccid = + ccid_hc_rx_new(newdmsk->dccpms_rx_ccid, + newsk, GFP_ATOMIC); + newdp->dccps_hc_tx_ccid = + ccid_hc_tx_new(newdmsk->dccpms_tx_ccid, + newsk, GFP_ATOMIC); + if (unlikely(newdp->dccps_hc_rx_ccid == NULL || + newdp->dccps_hc_tx_ccid == NULL)) { + dccp_ackvec_free(newdp->dccps_hc_rx_ackvec); + ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk); + ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk); +out_free: + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; + } + /* * Step 3: Process LISTEN state * * Choose S.ISS (initial seqno) or set from Init Cookies * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR from packet (or Init Cookies) - * - * Setting AWL/AWH and SWL/SWH happens as part of the feature - * activation below, as these windows all depend on the local - * and remote Sequence Window feature values (7.5.2). + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies */ - newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; - newdp->dccps_gar = newdp->dccps_iss; - newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; + + /* See dccp_v4_conn_request */ + newdmsk->dccpms_sequence_window = req->rcv_wnd; + + newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; + dccp_update_gss(newsk, dreq->dreq_iss); + + newdp->dccps_isr = dreq->dreq_isr; + dccp_update_gsr(newsk, dreq->dreq_isr); /* - * Activate features: initialise CCIDs, sequence windows etc. + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. */ - if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - return NULL; - } + dccp_set_seqno(&newdp->dccps_swl, + max48(newdp->dccps_swl, newdp->dccps_isr)); + dccp_set_seqno(&newdp->dccps_awl, + max48(newdp->dccps_awl, newdp->dccps_iss)); + dccp_init_xmit_timers(newsk); DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); @@ -260,17 +304,14 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); -int dccp_reqsk_init(struct request_sock *req, - struct dccp_sock const *dp, struct sk_buff const *skb) +void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) { struct dccp_request_sock *dreq = dccp_rsk(req); inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; inet_rsk(req)->acked = 0; + req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; - - /* inherit feature negotiation options from listening socket */ - return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); } EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/options.c b/net/dccp/options.c index e5a32979d7d..0809b63cb05 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -23,20 +23,23 @@ #include "dccp.h" #include "feat.h" -u64 dccp_decode_value_var(const u8 *bf, const u8 len) +int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; +int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; +int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; +int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; +int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; +int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; + +static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) { - u64 value = 0; + u32 value = 0; - if (len >= DCCP_OPTVAL_MAXLEN) - value += ((u64)*bf++) << 40; - if (len > 4) - value += ((u64)*bf++) << 32; if (len > 3) - value += ((u64)*bf++) << 24; + value += *bf++ << 24; if (len > 2) - value += ((u64)*bf++) << 16; + value += *bf++ << 16; if (len > 1) - value += ((u64)*bf++) << 8; + value += *bf++ << 8; if (len > 0) value += *bf; @@ -54,6 +57,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, struct dccp_sock *dp = dccp_sk(sk); const struct dccp_hdr *dh = dccp_hdr(skb); const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; + u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); unsigned char *opt_ptr = options; const unsigned char *opt_end = (unsigned char *)dh + @@ -95,11 +99,18 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, } /* + * CCID-Specific Options (from RFC 4340, sec. 10.3): + * + * Option numbers 128 through 191 are for options sent from the + * HC-Sender to the HC-Receiver; option numbers 192 through 255 + * are for options sent from the HC-Receiver to the HC-Sender. + * * CCID-specific options are ignored during connection setup, as * negotiation may still be in progress (see RFC 4340, 10.3). * The same applies to Ack Vectors, as these depend on the CCID. + * */ - if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || + if (dreq != NULL && (opt >= 128 || opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) goto ignore_option; @@ -120,13 +131,43 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), (unsigned long long)opt_recv->dccpor_ndp); break; - case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: - if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ + case DCCPO_CHANGE_L: + /* fall through */ + case DCCPO_CHANGE_R: + if (pkt_type == DCCP_PKT_DATA) break; - rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, - *value, value + 1, len - 1); - if (rc) - goto out_featneg_failed; + if (len < 2) + goto out_invalid_option; + rc = dccp_feat_change_recv(sk, opt, *value, value + 1, + len - 1); + /* + * When there is a change error, change_recv is + * responsible for dealing with it. i.e. reply with an + * empty confirm. + * If the change was mandatory, then we need to die. + */ + if (rc && mandatory) + goto out_invalid_option; + break; + case DCCPO_CONFIRM_L: + /* fall through */ + case DCCPO_CONFIRM_R: + if (pkt_type == DCCP_PKT_DATA) + break; + if (len < 2) /* FIXME this disallows empty confirm */ + goto out_invalid_option; + if (dccp_feat_confirm_recv(sk, opt, *value, + value + 1, len - 1)) + goto out_invalid_option; + break; + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ + break; + + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) + goto out_invalid_option; break; case DCCPO_TIMESTAMP: if (len != 4) @@ -154,8 +195,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_role(sk), ntohl(opt_val), (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq); - /* schedule an Ack in case this sender is quiescent */ - inet_csk_schedule_ack(sk); break; case DCCPO_TIMESTAMP_ECHO: if (len != 4 && len != 6 && len != 8) @@ -212,25 +251,23 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; - case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: + case 128 ... 191: { + const u16 idx = value - options; + if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, - pkt_type, opt, value, len)) + opt, len, idx, + value) != 0) goto out_invalid_option; + } break; - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ - break; - /* - * Ack vectors are processed by the TX CCID if it is - * interested. The RX CCID need not parse Ack Vectors, - * since it is only interested in clearing old state. - * Fall through. - */ - case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: + case 192 ... 255: { + const u16 idx = value - options; + if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, - pkt_type, opt, value, len)) + opt, len, idx, + value) != 0) goto out_invalid_option; + } break; default: DCCP_CRIT("DCCP(%p): option %d(len=%d) not " @@ -252,10 +289,8 @@ out_nonsensical_length: out_invalid_option: DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); - rc = DCCP_RESET_CODE_OPTION_ERROR; -out_featneg_failed: - DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); - DCCP_SKB_CB(skb)->dccpd_reset_code = rc; + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; + DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len); DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; @@ -264,12 +299,9 @@ out_featneg_failed: EXPORT_SYMBOL_GPL(dccp_parse_options); -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) +static void dccp_encode_value_var(const u32 value, unsigned char *to, + const unsigned int len) { - if (len >= DCCP_OPTVAL_MAXLEN) - *to++ = (value & 0xFF0000000000ull) >> 40; - if (len > 4) - *to++ = (value & 0xFF00000000ull) >> 32; if (len > 3) *to++ = (value & 0xFF000000) >> 24; if (len > 2) @@ -429,140 +461,92 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, return 0; } -static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) +static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, + u8 *val, u8 len) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const u16 buflen = dccp_ackvec_buflen(av); - /* Figure out how many options do we need to represent the ackvec */ - const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); - u16 len = buflen + 2 * nr_opts; - u8 i, nonce = 0; - const unsigned char *tail, *from; - unsigned char *to; + u8 *to; - if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, - dccp_packet_name(dcb->dccpd_type)); + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) { + DCCP_WARN("packet too small for feature %d option!\n", feat); return -1; } - /* - * Since Ack Vectors are variable-length, we can not always predict - * their size. To catch exception cases where the space is running out - * on the skb, a separate Sync is scheduled to carry the Ack Vector. - */ - if (len > DCCPAV_MIN_OPTLEN && - len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { - DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " - "MPS=%u ==> reduce payload size?\n", len, skb->len, - dcb->dccpd_opt_len, dp->dccps_mss_cache); - dp->dccps_sync_scheduled = 1; - return 0; - } - dcb->dccpd_opt_len += len; - to = skb_push(skb, len); - len = buflen; - from = av->av_buf + av->av_buf_head; - tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; + DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3; - for (i = 0; i < nr_opts; ++i) { - int copylen = len; - - if (len > DCCP_SINGLE_OPT_MAXLEN) - copylen = DCCP_SINGLE_OPT_MAXLEN; - - /* - * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via - * its type; ack_nonce is the sum of all individual buf_nonce's. - */ - nonce ^= av->av_buf_nonce[i]; - - *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; - *to++ = copylen + 2; - - /* Check if buf_head wraps */ - if (from + copylen > tail) { - const u16 tailsize = tail - from; - - memcpy(to, from, tailsize); - to += tailsize; - len -= tailsize; - copylen -= tailsize; - from = av->av_buf; - } - - memcpy(to, from, copylen); - from += copylen; - to += copylen; - len -= copylen; - } - /* - * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. - */ - if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) - return -ENOBUFS; - return 0; -} + to = skb_push(skb, len + 3); + *to++ = type; + *to++ = len + 3; + *to++ = feat; -/** - * dccp_insert_option_mandatory - Mandatory option (5.8.2) - * Note that since we are using skb_push, this function needs to be called - * _after_ inserting the option it is supposed to influence (stack order). - */ -int dccp_insert_option_mandatory(struct sk_buff *skb) -{ - if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) - return -1; + if (len) + memcpy(to, val, len); - DCCP_SKB_CB(skb)->dccpd_opt_len++; - *skb_push(skb, 1) = DCCPO_MANDATORY; + dccp_pr_debug("%s(%s (%d), ...), length %d\n", + dccp_feat_typename(type), + dccp_feat_name(feat), feat, len); return 0; } -/** - * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb - * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R - * @feat: one out of %dccp_feature_numbers - * @val: NN value or SP array (preferred element first) to copy - * @len: true length of @val in bytes (excluding first element repetition) - * @repeat_first: whether to copy the first element of @val twice - * The last argument is used to construct Confirm options, where the preferred - * value and the preference list appear separately (RFC 4340, 6.3.1). Preference - * lists are kept such that the preferred entry is always first, so we only need - * to copy twice, and avoid the overhead of cloning into a bigger array. - */ -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len, bool repeat_first) +static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) { - u8 tot_len, *to; + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); + struct dccp_opt_pend *opt, *next; + int change = 0; + + /* confirm any options [NN opts] */ + list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { + dccp_insert_feat_opt(skb, opt->dccpop_type, + opt->dccpop_feat, opt->dccpop_val, + opt->dccpop_len); + /* fear empty confirms */ + if (opt->dccpop_val) + kfree(opt->dccpop_val); + kfree(opt); + } + INIT_LIST_HEAD(&dmsk->dccpms_conf); + + /* see which features we need to send */ + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + /* see if we need to send any confirm */ + if (opt->dccpop_sc) { + dccp_insert_feat_opt(skb, opt->dccpop_type + 1, + opt->dccpop_feat, + opt->dccpop_sc->dccpoc_val, + opt->dccpop_sc->dccpoc_len); + + BUG_ON(!opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc); + opt->dccpop_sc = NULL; + } - /* take the `Feature' field and possible repetition into account */ - if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { - DCCP_WARN("length %u for feature %u too large\n", len, feat); - return -1; + /* any option not confirmed, re-send it */ + if (!opt->dccpop_conf) { + dccp_insert_feat_opt(skb, opt->dccpop_type, + opt->dccpop_feat, opt->dccpop_val, + opt->dccpop_len); + change++; + } } - if (unlikely(val == NULL || len == 0)) - len = repeat_first = 0; - tot_len = 3 + repeat_first + len; + /* Retransmit timer. + * If this is the master listening sock, we don't set a timer on it. It + * should be fine because if the dude doesn't receive our RESPONSE + * [which will contain the CHANGE] he will send another REQUEST which + * will "retrnasmit" the change. + */ + if (change && dp->dccps_role != DCCP_ROLE_LISTEN) { + dccp_pr_debug("reset feat negotiation timer %p\n", sk); - if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("packet too small for feature %d option!\n", feat); - return -1; + /* XXX don't reset the timer on re-transmissions. I.e. reset it + * only when sending new stuff i guess. Currently the timer + * never backs off because on re-transmission it just resets it! + */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, DCCP_RTO_MAX); } - DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; - - to = skb_push(skb, tot_len); - *to++ = type; - *to++ = tot_len; - *to++ = feat; - if (repeat_first) - *to++ = *val; - if (len) - memcpy(to, val, len); return 0; } @@ -581,30 +565,19 @@ static void dccp_insert_option_padding(struct sk_buff *skb) int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) + if (dmsk->dccpms_send_ndp_count && + dccp_insert_option_ndp(sk, skb)) return -1; - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { - - /* Feature Negotiation */ - if (dccp_feat_insert_opts(dp, NULL, skb)) + if (!dccp_packet_without_ack(skb)) { + if (dmsk->dccpms_send_ack_vector && + dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && + dccp_insert_option_ackvec(sk, skb)) return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { - /* - * Obtain RTT sample from Request/Response exchange. - * This is currently used in CCID 3 initialisation. - */ - if (dccp_insert_option_timestamp(sk, skb)) - return -1; - - } else if (dccp_ackvec_pending(sk) && - dccp_insert_option_ackvec(sk, skb)) { - return -1; - } } if (dp->dccps_hc_rx_insert_options) { @@ -613,6 +586,21 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) dp->dccps_hc_rx_insert_options = 0; } + /* Feature negotiation */ + /* Data packets can't do feat negotiation */ + if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA && + DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK && + dccp_insert_options_feat(sk, skb)) + return -1; + + /* + * Obtain RTT sample from Request/Response exchange. + * This is currently used in CCID 3 initialisation. + */ + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST && + dccp_insert_option_timestamp(sk, skb)) + return -1; + if (dp->dccps_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(dp, NULL, skb)) return -1; @@ -625,9 +613,6 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) { DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (dccp_feat_insert_opts(NULL, dreq, skb)) - return -1; - if (dreq->dreq_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(NULL, dreq, skb)) return -1; diff --git a/net/dccp/output.c b/net/dccp/output.c index 2532797a800..d06945c7d3d 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -26,13 +26,11 @@ static inline void dccp_event_ack_sent(struct sock *sk) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } -/* enqueue @skb on sk_send_head for retransmission, return clone to send now */ -static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) +static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb) { skb_set_owner_w(skb, sk); WARN_ON(sk->sk_send_head); sk->sk_send_head = skb; - return skb_clone(sk->sk_send_head, gfp_any()); } /* @@ -163,27 +161,21 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); u32 ccmps = dccp_determine_ccmps(dp); - u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; + int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; /* Account for header lengths and IPv4/v6 option overhead */ cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); /* - * Leave enough headroom for common DCCP header options. - * This only considers options which may appear on DCCP-Data packets, as - * per table 3 in RFC 4340, 5.8. When running out of space for other - * options (eg. Ack Vector which can take up to 255 bytes), it is better - * to schedule a separate Ack. Thus we leave headroom for the following: - * - 1 byte for Slow Receiver (11.6) - * - 6 bytes for Timestamp (13.1) - * - 10 bytes for Timestamp Echo (13.3) - * - 8 bytes for NDP count (7.7, when activated) - * - 6 bytes for Data Checksum (9.3) - * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) + * FIXME: this should come from the CCID infrastructure, where, say, + * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets + * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED + * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to + * make it a multiple of 4 */ - cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + - (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); + + cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; @@ -208,158 +200,95 @@ void dccp_write_space(struct sock *sk) } /** - * dccp_wait_for_ccid - Await CCID send permission + * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet * @sk: socket to wait for - * @delay: timeout in jiffies - * This is used by CCIDs which need to delay the send time in process context. + * @skb: current skb to pass on for waiting + * @delay: sleep timeout in milliseconds (> 0) + * This function is called by default when the socket is closed, and + * when a non-zero linger time is set on the socket. For consistency */ -static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) +static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) { + struct dccp_sock *dp = dccp_sk(sk); DEFINE_WAIT(wait); - long remaining; - - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); - sk->sk_write_pending++; - release_sock(sk); + unsigned long jiffdelay; + int rc; - remaining = schedule_timeout(delay); - - lock_sock(sk); - sk->sk_write_pending--; - finish_wait(sk->sk_sleep, &wait); + do { + dccp_pr_debug("delayed send by %d msec\n", delay); + jiffdelay = msecs_to_jiffies(delay); - if (signal_pending(current) || sk->sk_err) - return -1; - return remaining; -} - -/** - * dccp_xmit_packet - Send data packet under control of CCID - * Transmits next-queued payload and informs CCID to account for the packet. - */ -static void dccp_xmit_packet(struct sock *sk) -{ - int err, len; - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = dccp_qpolicy_pop(sk); + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); - if (unlikely(skb == NULL)) - return; - len = skb->len; + sk->sk_write_pending++; + release_sock(sk); + schedule_timeout(jiffdelay); + lock_sock(sk); + sk->sk_write_pending--; - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } + if (sk->sk_err) + goto do_error; + if (signal_pending(current)) + goto do_interrupted; - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; - } - - err = dccp_transmit_skb(sk, skb); - if (err) - dccp_pr_debug("transmit_skb() returned err=%d\n", err); - /* - * Register this one as sent even if an error occurred. To the remote - * end a local packet drop is indistinguishable from network loss, i.e. - * any local drop will eventually be reported via receiver feedback. - */ - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - - /* - * If the CCID needs to transfer additional header options out-of-band - * (e.g. Ack Vectors or feature-negotiation options), it activates this - * flag to schedule a Sync. The Sync will automatically incorporate all - * currently pending header options, thus clearing the backlog. - */ - if (dp->dccps_sync_scheduled) - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); + rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); + } while ((delay = rc) > 0); +out: + finish_wait(sk->sk_sleep, &wait); + return rc; + +do_error: + rc = -EPIPE; + goto out; +do_interrupted: + rc = -EINTR; + goto out; } -/** - * dccp_flush_write_queue - Drain queue at end of connection - * Since dccp_sendmsg queues packets without waiting for them to be sent, it may - * happen that the TX queue is not empty at the end of a connection. We give the - * HC-sender CCID a grace period of up to @time_budget jiffies. If this function - * returns with a non-empty write queue, it will be purged later. - */ -void dccp_flush_write_queue(struct sock *sk, long *time_budget) +void dccp_write_xmit(struct sock *sk, int block) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; - long delay, rc; - - while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { - rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - /* - * If the CCID determines when to send, the next sending - * time is unknown or the CCID may not even send again - * (e.g. remote host crashes or lost Ack packets). - */ - DCCP_WARN("CCID did not manage to send all packets\n"); - return; - case CCID_PACKET_DELAY: - delay = msecs_to_jiffies(rc); - if (delay > *time_budget) - return; - rc = dccp_wait_for_ccid(sk, delay); - if (rc < 0) - return; - *time_budget -= (delay - rc); - /* check again if we can send now */ - break; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); - dccp_pr_debug("packet discarded due to err=%ld\n", rc); + while ((skb = skb_peek(&sk->sk_write_queue))) { + int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); + + if (err > 0) { + if (!block) { + sk_reset_timer(sk, &dp->dccps_xmit_timer, + msecs_to_jiffies(err)+jiffies); + break; + } else + err = dccp_wait_for_ccid(sk, skb, err); + if (err && err != -EINTR) + DCCP_BUG("err=%d after dccp_wait_for_ccid", err); } - } -} -void dccp_write_xmit(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; + skb_dequeue(&sk->sk_write_queue); + if (err == 0) { + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + const int len = skb->len; - while ((skb = dccp_qpolicy_top(sk))) { - int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - return; - case CCID_PACKET_DELAY: - sk_reset_timer(sk, &dp->dccps_xmit_timer, - jiffies + msecs_to_jiffies(rc)); - return; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - dccp_qpolicy_drop(sk, skb); - dccp_pr_debug("packet discarded due to err=%d\n", rc); + if (sk->sk_state == DCCP_PARTOPEN) { + /* See 8.1.5. Handshake Completion */ + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); + dcb->dccpd_type = DCCP_PKT_DATAACK; + } else if (dccp_ack_pending(sk)) + dcb->dccpd_type = DCCP_PKT_DATAACK; + else + dcb->dccpd_type = DCCP_PKT_DATA; + + err = dccp_transmit_skb(sk, skb); + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); + if (err) + DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", + err); + } else { + dccp_pr_debug("packet discarded due to err=%d\n", err); + kfree_skb(skb); } } } @@ -410,12 +339,10 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; - /* Resolve feature dependencies resulting from choice of CCID */ - if (dccp_feat_server_ccid_dependencies(dreq)) - goto response_failed; - - if (dccp_insert_options_rsk(dreq, skb)) - goto response_failed; + if (dccp_insert_options_rsk(dreq, skb)) { + kfree_skb(skb); + return NULL; + } /* Build and checksum header */ dh = dccp_zeroed_hdr(skb, dccp_header_size); @@ -436,9 +363,6 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, inet_rsk(req)->acked = 1; DCCP_INC_STATS(DCCP_MIB_OUTSEGS); return skb; -response_failed: - kfree_skb(skb); - return NULL; } EXPORT_SYMBOL_GPL(dccp_make_response); @@ -523,9 +447,8 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) /* * Do all connect socket setups that can be done AF independent. */ -int dccp_connect(struct sock *sk) +static inline void dccp_connect_init(struct sock *sk) { - struct sk_buff *skb; struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -535,13 +458,19 @@ int dccp_connect(struct sock *sk) dccp_sync_mss(sk, dst_mtu(dst)); - /* do not connect if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dccp_sk(sk))) - return -EPROTO; - /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ dp->dccps_gar = dp->dccps_iss; + icsk->icsk_retransmits = 0; +} + +int dccp_connect(struct sock *sk) +{ + struct sk_buff *skb; + struct inet_connection_sock *icsk = inet_csk(sk); + + dccp_connect_init(sk); + skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); if (unlikely(skb == NULL)) return -ENOBUFS; @@ -551,11 +480,11 @@ int dccp_connect(struct sock *sk) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; - dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); + dccp_skb_entail(sk, skb); + dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); /* Timer for repeating the REQUEST until an answer. */ - icsk->icsk_retransmits = 0; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, DCCP_RTO_MAX); return 0; @@ -642,12 +571,6 @@ void dccp_send_sync(struct sock *sk, const u64 ackno, DCCP_SKB_CB(skb)->dccpd_type = pkt_type; DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; - /* - * Clear the flag in case the Sync was scheduled for out-of-band data, - * such as carrying a long Ack Vector. - */ - dccp_sk(sk)->dccps_sync_scheduled = 0; - dccp_transmit_skb(sk, skb); } @@ -676,7 +599,9 @@ void dccp_send_close(struct sock *sk, const int active) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; if (active) { - skb = dccp_skb_entail(sk, skb); + dccp_write_xmit(sk, 1); + dccp_skb_entail(sk, skb); + dccp_transmit_skb(sk, skb_clone(skb, prio)); /* * Retransmission timer for active-close: RFC 4340, 8.3 requires * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ @@ -689,6 +614,6 @@ void dccp_send_close(struct sock *sk, const int active) */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); - } - dccp_transmit_skb(sk, skb); + } else + dccp_transmit_skb(sk, skb); } diff --git a/net/dccp/probe.c b/net/dccp/probe.c index eaa59d82ab0..81368a7f537 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -46,54 +46,75 @@ static struct { struct kfifo *fifo; spinlock_t lock; wait_queue_head_t wait; - ktime_t start; + struct timespec tstart; } dccpw; -static void jdccp_write_xmit(struct sock *sk) +static void printl(const char *fmt, ...) { - const struct inet_sock *inet = inet_sk(sk); - struct ccid3_hc_tx_sock *hctx = NULL; - struct timespec tv; - char buf[256]; - int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk)); + va_list args; + int len; + struct timespec now; + char tbuf[256]; - if (ccid == DCCPC_CCID3) - hctx = ccid3_hc_tx_sk(sk); + va_start(args, fmt); + getnstimeofday(&now); - if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { + now = timespec_sub(now, dccpw.tstart); - tv = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start)); - len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d", - (unsigned long)tv.tv_sec, - (unsigned long)tv.tv_nsec, - NIPQUAD(inet->saddr), ntohs(inet->sport), - NIPQUAD(inet->daddr), ntohs(inet->dport), ccid); + len = sprintf(tbuf, "%lu.%06lu ", + (unsigned long) now.tv_sec, + (unsigned long) now.tv_nsec / NSEC_PER_USEC); + len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); + va_end(args); + kfifo_put(dccpw.fifo, tbuf, len); + wake_up(&dccpw.wait); +} + +static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size) +{ + const struct dccp_minisock *dmsk = dccp_msk(sk); + const struct inet_sock *inet = inet_sk(sk); + const struct ccid3_hc_tx_sock *hctx; + + if (dmsk->dccpms_tx_ccid == DCCPC_CCID3) + hctx = ccid3_hc_tx_sk(sk); + else + hctx = NULL; + + if (port == 0 || ntohs(inet->dport) == port || + ntohs(inet->sport) == port) { if (hctx) - len += sprintf(buf + len, " %d %d %d %u %u %u %d", - hctx->s, hctx->rtt, hctx->p, hctx->x_calc, - (unsigned)(hctx->x_recv >> 6), - (unsigned)(hctx->x >> 6), hctx->t_ipi); - - len += sprintf(buf + len, "\n"); - kfifo_put(dccpw.fifo, buf, len); - wake_up(&dccpw.wait); + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u " + "%llu %llu %d\n", + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), size, + hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc, + hctx->ccid3hctx_x_recv >> 6, + hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi); + else + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n", + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), size); } jprobe_return(); + return 0; } static struct jprobe dccp_send_probe = { .kp = { - .symbol_name = "dccp_write_xmit", + .symbol_name = "dccp_sendmsg", }, - .entry = jdccp_write_xmit, + .entry = jdccp_sendmsg, }; static int dccpprobe_open(struct inode *inode, struct file *file) { kfifo_reset(dccpw.fifo); - dccpw.start = ktime_get(); + getnstimeofday(&dccpw.tstart); return 0; } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ecf3be961e1..d0bd3481976 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -67,9 +67,6 @@ void dccp_set_state(struct sock *sk, const int state) case DCCP_OPEN: if (oldstate != DCCP_OPEN) DCCP_INC_STATS(DCCP_MIB_CURRESTAB); - /* Client retransmits all Confirm options until entering OPEN */ - if (oldstate == DCCP_PARTOPEN) - dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); break; case DCCP_CLOSED: @@ -178,25 +175,63 @@ EXPORT_SYMBOL_GPL(dccp_state_name); int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + dccp_minisock_init(&dp->dccps_minisock); + icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; icsk->icsk_sync_mss = dccp_sync_mss; - dp->dccps_mss_cache = TCP_MIN_RCVMSS; + dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; - dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; + dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; dccp_init_xmit_timers(sk); - INIT_LIST_HEAD(&dp->dccps_featneg); - /* control socket doesn't need feat nego */ - if (likely(ctl_sock_initialized)) - return dccp_feat_init(sk); + /* + * FIXME: We're hardcoding the CCID, and doing this at this point makes + * the listening (master) sock get CCID control blocks, which is not + * necessary, but for now, to not mess with the test userspace apps, + * lets leave it here, later the real solution is to do this in a + * setsockopt(CCIDs-I-want/accept). -acme + */ + if (likely(ctl_sock_initialized)) { + int rc = dccp_feat_init(dmsk); + + if (rc) + return rc; + + if (dmsk->dccpms_send_ack_vector) { + dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL); + if (dp->dccps_hc_rx_ackvec == NULL) + return -ENOMEM; + } + dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid, + sk, GFP_KERNEL); + dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid, + sk, GFP_KERNEL); + if (unlikely(dp->dccps_hc_rx_ccid == NULL || + dp->dccps_hc_tx_ccid == NULL)) { + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + if (dmsk->dccpms_send_ack_vector) { + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); + dp->dccps_hc_rx_ackvec = NULL; + } + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + return -ENOMEM; + } + } else { + /* control socket doesn't need feat nego */ + INIT_LIST_HEAD(&dmsk->dccpms_pending); + INIT_LIST_HEAD(&dmsk->dccpms_conf); + } + return 0; } @@ -205,6 +240,7 @@ EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); /* * DCCP doesn't use sk_write_queue, just sk_send_head @@ -222,7 +258,7 @@ void dccp_destroy_sock(struct sock *sk) kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; - if (dp->dccps_hc_rx_ackvec != NULL) { + if (dmsk->dccpms_send_ack_vector) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } @@ -231,7 +267,7 @@ void dccp_destroy_sock(struct sock *sk) dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; /* clean up feature negotiation state */ - dccp_feat_list_purge(&dp->dccps_featneg); + dccp_feat_clean(dmsk); } EXPORT_SYMBOL_GPL(dccp_destroy_sock); @@ -241,9 +277,6 @@ static inline int dccp_listen_start(struct sock *sk, int backlog) struct dccp_sock *dp = dccp_sk(sk); dp->dccps_role = DCCP_ROLE_LISTEN; - /* do not start to listen if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dp)) - return -EPROTO; return inet_csk_listen_start(sk, backlog); } @@ -433,70 +466,42 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, return 0; } -static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) +/* byte 1 is feature. the rest is the preference list */ +static int dccp_setsockopt_change(struct sock *sk, int type, + struct dccp_so_feat __user *optval) { - u8 *list, len; - int i, rc; + struct dccp_so_feat opt; + u8 *val; + int rc; - if (cscov < 0 || cscov > 15) - return -EINVAL; + if (copy_from_user(&opt, optval, sizeof(opt))) + return -EFAULT; /* - * Populate a list of permissible values, in the range cscov...15. This - * is necessary since feature negotiation of single values only works if - * both sides incidentally choose the same value. Since the list starts - * lowest-value first, negotiation will pick the smallest shared value. + * rfc4340: 6.1. Change Options */ - if (cscov == 0) - return 0; - len = 16 - cscov; - - list = kmalloc(len, GFP_KERNEL); - if (list == NULL) - return -ENOBUFS; - - for (i = 0; i < len; i++) - list[i] = cscov++; - - rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); - - if (rc == 0) { - if (rx) - dccp_sk(sk)->dccps_pcrlen = cscov; - else - dccp_sk(sk)->dccps_pcslen = cscov; - } - kfree(list); - return rc; -} - -static int dccp_setsockopt_ccid(struct sock *sk, int type, - char __user *optval, int optlen) -{ - u8 *val; - int rc = 0; - - if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) + if (opt.dccpsf_len < 1) return -EINVAL; - val = kmalloc(optlen, GFP_KERNEL); - if (val == NULL) + val = kmalloc(opt.dccpsf_len, GFP_KERNEL); + if (!val) return -ENOMEM; - if (copy_from_user(val, optval, optlen)) { - kfree(val); - return -EFAULT; + if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) { + rc = -EFAULT; + goto out_free_val; } - lock_sock(sk); - if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); + rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat, + val, opt.dccpsf_len, GFP_KERNEL); + if (rc) + goto out_free_val; - if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); - release_sock(sk); +out: + return rc; +out_free_val: kfree(val); - return rc; + goto out; } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, @@ -505,21 +510,7 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CHANGE_L: - case DCCP_SOCKOPT_CHANGE_R: - DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CCID: - case DCCP_SOCKOPT_RX_CCID: - case DCCP_SOCKOPT_TX_CCID: - return dccp_setsockopt_ccid(sk, optname, optval, optlen); - } - - if (optlen < (int)sizeof(int)) + if (optlen < sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) @@ -530,38 +521,53 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, lock_sock(sk); switch (optname) { + case DCCP_SOCKOPT_PACKET_SIZE: + DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); + err = 0; + break; + case DCCP_SOCKOPT_CHANGE_L: + if (optlen != sizeof(struct dccp_so_feat)) + err = -EINVAL; + else + err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L, + (struct dccp_so_feat __user *) + optval); + break; + case DCCP_SOCKOPT_CHANGE_R: + if (optlen != sizeof(struct dccp_so_feat)) + err = -EINVAL; + else + err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R, + (struct dccp_so_feat __user *) + optval); + break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) err = -EOPNOTSUPP; else dp->dccps_server_timewait = (val != 0); break; - case DCCP_SOCKOPT_SEND_CSCOV: - err = dccp_setsockopt_cscov(sk, val, false); - break; - case DCCP_SOCKOPT_RECV_CSCOV: - err = dccp_setsockopt_cscov(sk, val, true); - break; - case DCCP_SOCKOPT_QPOLICY_ID: - if (sk->sk_state != DCCP_CLOSED) - err = -EISCONN; - else if (val < 0 || val >= DCCPQ_POLICY_MAX) + case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ + if (val < 0 || val > 15) err = -EINVAL; else - dp->dccps_qpolicy = val; + dp->dccps_pcslen = val; break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - if (val < 0) + case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */ + if (val < 0 || val > 15) err = -EINVAL; - else - dp->dccps_tx_qlen = val; + else { + dp->dccps_pcrlen = val; + /* FIXME: add feature negotiation, + * ChangeL(MinimumChecksumCoverage, val) */ + } break; default: err = -ENOPROTOOPT; break; } - release_sock(sk); + release_sock(sk); return err; } @@ -642,18 +648,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_GET_CUR_MPS: val = dp->dccps_mss_cache; break; - case DCCP_SOCKOPT_AVAILABLE_CCIDS: - return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); - case DCCP_SOCKOPT_TX_CCID: - val = ccid_get_current_tx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_RX_CCID: - val = ccid_get_current_rx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; @@ -663,12 +657,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; - case DCCP_SOCKOPT_QPOLICY_ID: - val = dp->dccps_qpolicy; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - val = dp->dccps_tx_qlen; - break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); @@ -711,47 +699,6 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); #endif -static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) -{ - struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); - - /* - * Assign an (opaque) qpolicy priority value to skb->priority. - * - * We are overloading this skb field for use with the qpolicy subystem. - * The skb->priority is normally used for the SO_PRIORITY option, which - * is initialised from sk_priority. Since the assignment of sk_priority - * to skb->priority happens later (on layer 3), we overload this field - * for use with queueing priorities as long as the skb is on layer 4. - * The default priority value (if nothing is set) is 0. - */ - skb->priority = 0; - - for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { - - if (!CMSG_OK(msg, cmsg)) - return -EINVAL; - - if (cmsg->cmsg_level != SOL_DCCP) - continue; - - if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && - !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) - return -EINVAL; - - switch (cmsg->cmsg_type) { - case DCCP_SCM_PRIORITY: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) - return -EINVAL; - skb->priority = *(__u32 *)CMSG_DATA(cmsg); - break; - default: - return -EINVAL; - } - } - return 0; -} - int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -767,7 +714,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, lock_sock(sk); - if (dccp_qpolicy_full(sk)) { + if (sysctl_dccp_tx_qlen && + (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) { rc = -EAGAIN; goto out_release; } @@ -795,12 +743,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (rc != 0) goto out_discard; - rc = dccp_msghdr_parse(msg, skb); - if (rc != 0) - goto out_discard; - - dccp_qpolicy_push(sk, skb); - dccp_write_xmit(sk); + skb_queue_tail(&sk->sk_write_queue, skb); + dccp_write_xmit(sk,0); out_release: release_sock(sk); return rc ? : len; @@ -1023,22 +967,9 @@ void dccp_close(struct sock *sk, long timeout) /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { - /* - * Normal connection termination. May need to wait if there are - * still packets in the TX queue that are delayed by the CCID. - */ - dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } - /* - * Flush write queue. This may be necessary in several cases: - * - we have been closed by the peer but still have application data; - * - abortive termination (unread data or zero linger time), - * - normal termination but queue could not be flushed within time limit - */ - __skb_queue_purge(&sk->sk_write_queue); - sk_stream_wait_close(sk, timeout); adjudge_to_death: diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c deleted file mode 100644 index 27383f88c75..00000000000 --- a/net/dccp/qpolicy.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * net/dccp/qpolicy.c - * - * Policy-based packet dequeueing interface for DCCP. - * - * Copyright (c) 2008 Tomasz Grobelny - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License v2 - * as published by the Free Software Foundation. - */ -#include "dccp.h" - -/* - * Simple Dequeueing Policy: - * If tx_qlen is different from 0, enqueue up to tx_qlen elements. - */ -static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) -{ - skb_queue_tail(&sk->sk_write_queue, skb); -} - -static bool qpolicy_simple_full(struct sock *sk) -{ - return dccp_sk(sk)->dccps_tx_qlen && - sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; -} - -static struct sk_buff *qpolicy_simple_top(struct sock *sk) -{ - return skb_peek(&sk->sk_write_queue); -} - -/* - * Priority-based Dequeueing Policy: - * If tx_qlen is different from 0 and the queue has reached its upper bound - * of tx_qlen elements, replace older packets lowest-priority-first. - */ -static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) -{ - struct sk_buff *skb, *best = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (best == NULL || skb->priority > best->priority) - best = skb; - return best; -} - -static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) -{ - struct sk_buff *skb, *worst = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (worst == NULL || skb->priority < worst->priority) - worst = skb; - return worst; -} - -static bool qpolicy_prio_full(struct sock *sk) -{ - if (qpolicy_simple_full(sk)) - dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); - return false; -} - -/** - * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface - * @push: add a new @skb to the write queue - * @full: indicates that no more packets will be admitted - * @top: peeks at whatever the queueing policy defines as its `top' - */ -static struct dccp_qpolicy_operations { - void (*push) (struct sock *sk, struct sk_buff *skb); - bool (*full) (struct sock *sk); - struct sk_buff* (*top) (struct sock *sk); - __be32 params; - -} qpol_table[DCCPQ_POLICY_MAX] = { - [DCCPQ_POLICY_SIMPLE] = { - .push = qpolicy_simple_push, - .full = qpolicy_simple_full, - .top = qpolicy_simple_top, - .params = 0, - }, - [DCCPQ_POLICY_PRIO] = { - .push = qpolicy_simple_push, - .full = qpolicy_prio_full, - .top = qpolicy_prio_best_skb, - .params = DCCP_SCM_PRIORITY, - }, -}; - -/* - * Externally visible interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) -{ - qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); -} - -bool dccp_qpolicy_full(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); -} - -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) -{ - if (skb != NULL) { - skb_unlink(skb, &sk->sk_write_queue); - kfree_skb(skb); - } -} - -struct sk_buff *dccp_qpolicy_top(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); -} - -struct sk_buff *dccp_qpolicy_pop(struct sock *sk) -{ - struct sk_buff *skb = dccp_qpolicy_top(sk); - - /* Clear any skb fields that we used internally */ - skb->priority = 0; - - if (skb) - skb_unlink(skb, &sk->sk_write_queue); - return skb; -} - -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) -{ - /* check if exactly one bit is set */ - if (!param || (param & (param - 1))) - return false; - return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; -} diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index a5a1856234e..21295993fdb 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -18,72 +18,76 @@ #error This file should not be compiled without CONFIG_SYSCTL defined #endif -/* Boundary values */ -static int zero = 0, - u8_max = 0xFF; -static unsigned long seqw_min = 32; - static struct ctl_table dccp_default_table[] = { { .procname = "seq_window", - .data = &sysctl_dccp_sequence_window, - .maxlen = sizeof(sysctl_dccp_sequence_window), + .data = &sysctl_dccp_feat_sequence_window, + .maxlen = sizeof(sysctl_dccp_feat_sequence_window), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ + .proc_handler = proc_dointvec, }, { .procname = "rx_ccid", - .data = &sysctl_dccp_rx_ccid, - .maxlen = sizeof(sysctl_dccp_rx_ccid), + .data = &sysctl_dccp_feat_rx_ccid, + .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, /* RFC 4340, 10. */ + .proc_handler = proc_dointvec, }, { .procname = "tx_ccid", - .data = &sysctl_dccp_tx_ccid, - .maxlen = sizeof(sysctl_dccp_tx_ccid), + .data = &sysctl_dccp_feat_tx_ccid, + .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ack_ratio", + .data = &sysctl_dccp_feat_ack_ratio, + .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "send_ackvec", + .data = &sysctl_dccp_feat_send_ack_vector, + .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "send_ndp", + .data = &sysctl_dccp_feat_send_ndp_count, + .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, /* RFC 4340, 10. */ + .proc_handler = proc_dointvec, }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, .maxlen = sizeof(sysctl_dccp_request_retries), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, + .proc_handler = proc_dointvec, }, { .procname = "retries1", .data = &sysctl_dccp_retries1, .maxlen = sizeof(sysctl_dccp_retries1), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, + .proc_handler = proc_dointvec, }, { .procname = "retries2", .data = &sysctl_dccp_retries2, .maxlen = sizeof(sysctl_dccp_retries2), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, + .proc_handler = proc_dointvec, }, { .procname = "tx_qlen", .data = &sysctl_dccp_tx_qlen, .maxlen = sizeof(sysctl_dccp_tx_qlen), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .proc_handler = proc_dointvec, }, { .procname = "sync_ratelimit", diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 16359e29e7f..54b3c7e9e01 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -87,6 +87,17 @@ static void dccp_retransmit_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + /* retransmit timer is used for feature negotiation throughout + * connection. In this case, no packet is re-transmitted, but rather an + * ack is generated and pending changes are placed into its options. + */ + if (sk->sk_send_head == NULL) { + dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk); + if (sk->sk_state == DCCP_OPEN) + dccp_send_ack(sk); + goto backoff; + } + /* * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was * sent, no need to retransmit, this sock is dead. @@ -115,6 +126,7 @@ static void dccp_retransmit_timer(struct sock *sk) return; } +backoff: icsk->icsk_backoff++; icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); @@ -237,35 +249,32 @@ out: sock_put(sk); } -/** - * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface - * See the comments above %ccid_dequeueing_decision for supported modes. - */ -static void dccp_write_xmitlet(unsigned long data) +/* Transmit-delay timer: used by the CCIDs to delay actual send time */ +static void dccp_write_xmit_timer(unsigned long data) { struct sock *sk = (struct sock *)data; + struct dccp_sock *dp = dccp_sk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); + sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); else - dccp_write_xmit(sk); + dccp_write_xmit(sk, 0); bh_unlock_sock(sk); + sock_put(sk); } -static void dccp_write_xmit_timer(unsigned long data) +static void dccp_init_write_xmit_timer(struct sock *sk) { - dccp_write_xmitlet(data); - sock_put((struct sock *)data); + struct dccp_sock *dp = dccp_sk(sk); + + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, + (unsigned long)sk); } void dccp_init_xmit_timers(struct sock *sk) { - struct dccp_sock *dp = dccp_sk(sk); - - tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); - setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, - (unsigned long)sk); + dccp_init_write_xmit_timer(sk); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } @@ -281,7 +290,8 @@ u32 dccp_timestamp(void) { s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); - return div_u64(delta, DCCP_TIME_RESOLUTION); + do_div(delta, 10); + return delta; } EXPORT_SYMBOL_GPL(dccp_timestamp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9da9f19ece8..f79a5160729 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -811,12 +811,25 @@ void tcp_update_metrics(struct sock *sk) } } +/* Numbers are taken from RFC3390. + * + * John Heffner states: + * + * The RFC specifies a window of no more than 4380 bytes + * unless 2*MSS > 4380. Reading the pseudocode in the RFC + * is a bit misleading because they use a clamp at 4380 bytes + * rather than use a multiplier in the relevant range. + */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); - if (!cwnd) - cwnd = rfc3390_bytes_to_packets(tp->mss_cache); + if (!cwnd) { + if (tp->mss_cache > 1460) + cwnd = 2; + else + cwnd = (tp->mss_cache > 1095) ? 3 : 4; + } return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -- cgit v1.2.3-70-g09d2 From 63f2c0464875b6ef2132cecb19b2a5abbf061227 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 12 Sep 2008 23:23:50 -0700 Subject: net: ip_vs_proto_{tcp,udp} build fix Signed-off-by: Stephen Rothwell Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_proto_tcp.c | 1 + net/ipv4/ipvs/ip_vs_proto_udp.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 537f616776d..dd4566ea2bf 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -18,6 +18,7 @@ #include /* for tcphdr */ #include #include /* for csum_tcpudp_magic */ +#include #include #include diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index e3ee26bd1de..6eb6039d634 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -22,6 +22,7 @@ #include #include +#include static struct ip_vs_conn * udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, -- cgit v1.2.3-70-g09d2 From 93821778def10ec1e69aa3ac10adee975dad4ff3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Sep 2008 11:48:46 -0700 Subject: udp: Fix rcv socket locking The previous patch in response to the recursive locking on IPsec reception is broken as it tries to drop the BH socket lock while in user context. This patch fixes it by shrinking the section protected by the socket lock to sock_queue_rcv_skb only. The only reason we added the lock is for the accounting which happens in that function. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/udp.c | 62 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 29 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8e42fbbd576..57e26fa6618 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -951,6 +951,27 @@ int udp_disconnect(struct sock *sk, int flags) return 0; } +static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int is_udplite = IS_UDPLITE(sk); + int rc; + + if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) { + /* Note that an ENOMEM error is charged twice */ + if (rc == -ENOMEM) + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); + goto drop; + } + + return 0; + +drop: + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + kfree_skb(skb); + return -1; +} + /* returns: * -1: error * 0: success @@ -989,9 +1010,7 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) up->encap_rcv != NULL) { int ret; - bh_unlock_sock(sk); ret = (*up->encap_rcv)(sk, skb); - bh_lock_sock(sk); if (ret <= 0) { UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INDATAGRAMS, @@ -1044,17 +1063,16 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) goto drop; } - if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { - /* Note that an ENOMEM error is charged twice */ - if (rc == -ENOMEM) { - UDP_INC_STATS_BH(sock_net(sk), - UDP_MIB_RCVBUFERRORS, is_udplite); - atomic_inc(&sk->sk_drops); - } - goto drop; - } + rc = 0; - return 0; + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + rc = __udp_queue_rcv_skb(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + return rc; drop: UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); @@ -1092,15 +1110,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) { - int ret = 0; - - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) - ret = udp_queue_rcv_skb(sk, skb1); - else - sk_add_backlog(sk, skb1); - bh_unlock_sock(sk); - + int ret = udp_queue_rcv_skb(sk, skb1); if (ret > 0) /* we should probably re-process instead * of dropping packets here. */ @@ -1195,13 +1205,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], uh->dest, inet_iif(skb), udptable); if (sk != NULL) { - int ret = 0; - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) - ret = udp_queue_rcv_skb(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); + int ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but @@ -1494,7 +1498,7 @@ struct proto udp_prot = { .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .sendpage = udp_sendpage, - .backlog_rcv = udp_queue_rcv_skb, + .backlog_rcv = __udp_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, -- cgit v1.2.3-70-g09d2 From 9e691ed68d94ab3047e028736641445b4cf74d67 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 17 Sep 2008 10:10:41 +1000 Subject: ipvs: only unlock in ip_vs_edit_service() if already locked Jumping to out unlocks __ip_vs_svc_lock, but that lock is not taken until after code that may jump to out. This problem was detected by sparse. make C=1 CHECK net/ipv4/ipvs/ip_vs_ctl.c net/ipv4/ipvs/ip_vs_ctl.c:1332:2: warning: context imbalance in 'ip_vs_edit_service' - unexpected unlock Acked-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 993a83fb0d5..60ca24b9ec0 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1305,7 +1305,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) */ if ((ret = ip_vs_unbind_scheduler(svc))) { old_sched = sched; - goto out; + goto out_unlock; } /* @@ -1324,12 +1324,13 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) */ ip_vs_bind_scheduler(svc, old_sched); old_sched = sched; - goto out; + goto out_unlock; } } - out: + out_unlock: write_unlock_bh(&__ip_vs_svc_lock); + out: if (old_sched) ip_vs_scheduler_put(old_sched); -- cgit v1.2.3-70-g09d2 From dff630ddad3884b99fae3ad92f5eccbf26618679 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 17 Sep 2008 10:10:42 +1000 Subject: ipvs: supply a valid 0 address to ip_vs_conn_new() ip_vs_conn_new expects a union nf_inet_addr as the type for its address parameters, not a plain integer. This problem was detected by sparse. make C=1 CHECK net/ipv4/ipvs/ip_vs_core.c net/ipv4/ipvs/ip_vs_core.c:469:9: warning: Using plain integer as NULL pointer Acked-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 80a4fcf33a5..ece748dbd0c 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -457,6 +457,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { int ret, cs; struct ip_vs_conn *cp; + union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; ip_vs_service_put(svc); @@ -465,7 +466,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, cp = ip_vs_conn_new(svc->af, iph.protocol, &iph.saddr, pptr[0], &iph.daddr, pptr[1], - 0, 0, + &daddr, 0, IP_VS_CONN_F_BYPASS, NULL); if (cp == NULL) -- cgit v1.2.3-70-g09d2 From 563e94f072714657a82a59a3bf81a719a6a25591 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 17 Sep 2008 10:10:42 +1000 Subject: ipvs: add __aquire/__release annotations to ip_vs_info_seq_start/ip_vs_info_seq_stop This teaches sparse that the following are not problems: make C=1 CHECK net/ipv4/ipvs/ip_vs_ctl.c net/ipv4/ipvs/ip_vs_ctl.c:1793:14: warning: context imbalance in 'ip_vs_info_seq_start' - wrong count at exit net/ipv4/ipvs/ip_vs_ctl.c:1842:13: warning: context imbalance in 'ip_vs_info_seq_stop' - unexpected unlock Acked-by: Sven Wegener Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 60ca24b9ec0..771551d8fba 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1787,6 +1787,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) } static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) +__acquires(__ip_vs_svc_lock) { read_lock_bh(&__ip_vs_svc_lock); @@ -1840,6 +1841,7 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) +__releases(__ip_vs_svc_lock) { read_unlock_bh(&__ip_vs_svc_lock); } -- cgit v1.2.3-70-g09d2 From d286600e199aa2f1058a1f883d234e73626304d2 Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Tue, 16 Sep 2008 11:11:11 -0400 Subject: ipvs: change some __constant_htons() to htons() Change __contant_htons() to htons() in the IPVS code when not in an initializer. -Brian Signed-off-by: Brian Haley Acked-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_proto.c | 2 +- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index b06da1c3445..0791f9e08fe 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -237,7 +237,7 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == __constant_htons(ETH_P_IPV6)) + if (skb->protocol == htons(ETH_P_IPV6)) ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); else #endif diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c index 2b18a78d039..80ab0c8e5b4 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c @@ -167,7 +167,7 @@ ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == __constant_htons(ETH_P_IPV6)) + if (skb->protocol == htons(ETH_P_IPV6)) ah_esp_debug_packet_v6(pp, skb, offset, msg); else #endif -- cgit v1.2.3-70-g09d2 From 64edc2736e23994e0334b70c5ff08dc33e2ebbd9 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:18:32 -0700 Subject: tcp: Partial hint clearing has again become meaningless MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ie., the difference between partial and all clearing doesn't exists anymore since the SACK optimizations got dropped by an sacktag rewrite. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/net/tcp.h | 7 +------ net/ipv4/tcp_input.c | 5 ++--- net/ipv4/tcp_output.c | 4 ++-- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8983386356a..b7167632695 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1035,7 +1035,7 @@ static inline void tcp_mib_init(struct net *net) } /* from STCP */ -static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) +static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { tp->lost_skb_hint = NULL; tp->scoreboard_skb_hint = NULL; @@ -1043,11 +1043,6 @@ static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) tp->forward_skb_hint = NULL; } -static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) -{ - tcp_clear_retrans_hints_partial(tp); -} - /* MD5 Signature */ struct crypto_hash; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f79a5160729..7306bfb16cd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1883,7 +1883,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); - tcp_clear_retrans_hints_partial(tp); + tcp_clear_all_retrans_hints(tp); } static void tcp_clear_retrans_partial(struct tcp_sock *tp) @@ -1934,12 +1934,11 @@ void tcp_enter_loss(struct sock *sk, int how) /* Push undo marker, if it was plain RTO and nothing * was retransmitted. */ tp->undo_marker = tp->snd_una; - tcp_clear_retrans_hints_partial(tp); } else { tp->sacked_out = 0; tp->fackets_out = 0; - tcp_clear_all_retrans_hints(tp); } + tcp_clear_all_retrans_hints(tp); tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8165f5aa8c7..11490958a09 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -750,7 +750,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, BUG_ON(len > skb->len); - tcp_clear_retrans_hints_partial(tp); + tcp_clear_all_retrans_hints(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -1823,7 +1823,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, tp->packets_out -= tcp_skb_pcount(next_skb); /* changed transmit queue under us so clear hints */ - tcp_clear_retrans_hints_partial(tp); + tcp_clear_all_retrans_hints(tp); sk_wmem_free_skb(sk, next_skb); } -- cgit v1.2.3-70-g09d2 From c8c213f20ce97c66fe2ff86f33814d1ca0f9d7ea Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:18:55 -0700 Subject: tcp: move tcp_verify_retransmit_hint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7306bfb16cd..9e95ad637db 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -979,6 +979,19 @@ static void tcp_update_reordering(struct sock *sk, const int metric, } } +/* RFC: This is from the original, I doubt that this is necessary at all: + * clear xmit_retrans hint if seq of this skb is beyond hint. How could we + * retransmitted past LOST markings in the first place? I'm not fully sure + * about undo and end of connection cases, which can cause R without L? + */ +static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) +{ + if ((tp->retransmit_skb_hint != NULL) && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + tp->retransmit_skb_hint = NULL; +} + /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -2156,19 +2169,6 @@ static int tcp_time_to_recover(struct sock *sk) return 0; } -/* RFC: This is from the original, I doubt that this is necessary at all: - * clear xmit_retrans hint if seq of this skb is beyond hint. How could we - * retransmitted past LOST markings in the first place? I'm not fully sure - * about undo and end of connection cases, which can cause R without L? - */ -static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) -{ - if ((tp->retransmit_skb_hint != NULL) && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) - tp->retransmit_skb_hint = NULL; -} - /* Mark head of queue up as lost. With RFC3517 SACK, the packets is * is against sacked "cnt", otherwise it's against facked "cnt" */ -- cgit v1.2.3-70-g09d2 From 41ea36e35a0daa75377b3e70680e5c3a3f83fe27 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:19:22 -0700 Subject: tcp: add helper for lost bit toggling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This useful because we'd need to verifying soon in many places which makes things slightly more complex than it used to be. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9e95ad637db..12512336dbd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -992,6 +992,16 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) tp->retransmit_skb_hint = NULL; } +static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tcp_verify_retransmit_hint(tp, skb); + + tp->lost_out += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + } +} + /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -2216,11 +2226,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) cnt = packets; } - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - tcp_verify_retransmit_hint(tp, skb); - } + tcp_skb_mark_lost(tp, skb); } tcp_verify_left_out(tp); } @@ -2262,11 +2268,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) if (!tcp_skb_timedout(sk, skb)) break; - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - tcp_verify_retransmit_hint(tp, skb); - } + tcp_skb_mark_lost(tp, skb); } tp->scoreboard_skb_hint = skb; -- cgit v1.2.3-70-g09d2 From 006f582c73f4eda35e06fd323193c3df43fb3459 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:20:20 -0700 Subject: tcp: convert retransmit_cnt_hint to seqno MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Main benefit in this is that we can then freely point the retransmit_skb_hint to anywhere we want to because there's no longer need to know what would be the count changes involve, and since this is really used only as a terminator, unnecessary work is one time walk at most, and if some retransmissions are necessary after that point later on, the walk is not full waste of time anyway. Since retransmit_high must be kept valid, all lost markers must ensure that. Now I also have learned how those "holes" in the rexmittable skbs can appear, mtu probe does them. So I removed the misleading comment as well. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- include/net/tcp.h | 2 ++ net/ipv4/tcp_input.c | 34 ++++++++++++++++++++-------------- net/ipv4/tcp_output.c | 25 +++++++------------------ 4 files changed, 30 insertions(+), 33 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2e2557388e3..d7637c4b284 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -358,7 +358,7 @@ struct tcp_sock { */ int lost_cnt_hint; - int retransmit_cnt_hint; + u32 retransmit_high; /* L-bits may be on up to this seqno */ u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */ diff --git a/include/net/tcp.h b/include/net/tcp.h index b7167632695..d0e90c50722 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -472,6 +472,8 @@ extern void tcp_send_delayed_ack(struct sock *sk); /* tcp_input.c */ extern void tcp_cwnd_application_limited(struct sock *sk); +extern void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, + struct sk_buff *skb); /* tcp_timer.c */ extern void tcp_init_xmit_timers(struct sock *); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 12512336dbd..d271cc82500 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -979,17 +979,17 @@ static void tcp_update_reordering(struct sock *sk, const int metric, } } -/* RFC: This is from the original, I doubt that this is necessary at all: - * clear xmit_retrans hint if seq of this skb is beyond hint. How could we - * retransmitted past LOST markings in the first place? I'm not fully sure - * about undo and end of connection cases, which can cause R without L? - */ +/* This must be called before lost_out is incremented */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { - if ((tp->retransmit_skb_hint != NULL) && + if ((tp->retransmit_skb_hint == NULL) || before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) - tp->retransmit_skb_hint = NULL; + tp->retransmit_skb_hint = skb; + + if (!tp->lost_out || + after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) @@ -1002,6 +1002,16 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) } } +void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) +{ + tcp_verify_retransmit_hint(tp, skb); + + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tp->lost_out += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + } +} + /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -1178,13 +1188,7 @@ static void tcp_mark_lost_retrans(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { - tp->lost_out += tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - } + tcp_skb_mark_lost_uncond_verify(tp, skb); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); } else { if (before(ack_seq, new_low_seq)) @@ -1890,6 +1894,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); @@ -1974,6 +1979,7 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 11490958a09..cfae61b40c4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1838,7 +1838,7 @@ void tcp_simple_retransmit(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk, 0); - int lost = 0; + u32 prior_lost = tp->lost_out; tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) @@ -1849,17 +1849,13 @@ void tcp_simple_retransmit(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); } - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - lost = 1; - } + tcp_skb_mark_lost_uncond_verify(tp, skb); } } tcp_clear_all_retrans_hints(tp); - if (!lost) + if (prior_lost == tp->lost_out) return; if (tcp_is_reno(tp)) @@ -2009,15 +2005,11 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int packet_cnt; - if (tp->retransmit_skb_hint) { + if (tp->retransmit_skb_hint) skb = tp->retransmit_skb_hint; - packet_cnt = tp->retransmit_cnt_hint; - } else { + else skb = tcp_write_queue_head(sk); - packet_cnt = 0; - } /* First pass: retransmit lost packets. */ if (tp->lost_out) { @@ -2028,7 +2020,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) break; /* we could do better than to assign each time */ tp->retransmit_skb_hint = skb; - tp->retransmit_cnt_hint = packet_cnt; /* Assume this retransmit will generate * only one packet for congestion window @@ -2039,6 +2030,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; + if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) + break; if (sacked & TCPCB_LOST) { if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { @@ -2059,10 +2052,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } - - packet_cnt += tcp_skb_pcount(skb); - if (packet_cnt >= tp->lost_out) - break; } } } -- cgit v1.2.3-70-g09d2 From f09142eddb75005e41b0af3e5214979d8b534b1d Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:20:50 -0700 Subject: tcp: Kill precaution that's very likely obsolete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I suspect it might have been related to the changed amount of lost skbs, which was counted by retransmit_cnt_hint that got changed. The place for this clearing was very illogical anyway, it should have been after the LOST-bit clearing loop to make any sense. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d271cc82500..28e93f1e421 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2385,10 +2385,6 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) } tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; - - /* There is something screwy going on with the retrans hints after - an undo */ - tcp_clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) -- cgit v1.2.3-70-g09d2 From 184d68b2b0b836587f92887b14baea41033ffeef Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:21:16 -0700 Subject: tcp: No need to clear retransmit_skb_hint when SACKing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because lost counter no longer requires tuning, this is trivial to remove (the tuning wouldn't have been too hard either) because no "new" retransmittable skb appeared below retransmit_skb_hint when SACKing for sure. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 28e93f1e421..d017aed6edd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1298,9 +1298,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb); - - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; } } else { if (!(sacked & TCPCB_RETRANS)) { @@ -1319,9 +1316,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; tp->lost_out -= tcp_skb_pcount(skb); - - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; } } @@ -1351,7 +1345,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); - tp->retransmit_skb_hint = NULL; } return flag; -- cgit v1.2.3-70-g09d2 From b5afe7bc71a1689376c9b547376d17568469f3b3 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:21:54 -0700 Subject: tcp: add tcp_can_forward_retransmit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cfae61b40c4..957c4e3d217 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1992,6 +1992,33 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return err; } +static int tcp_can_forward_retransmit(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* Forward retransmissions are possible only during Recovery. */ + if (icsk->icsk_ca_state != TCP_CA_Recovery) + return 0; + + /* No forward retransmissions in Reno are possible. */ + if (tcp_is_reno(tp)) + return 0; + + /* Yeah, we have to make difficult choice between forward transmission + * and retransmission... Both ways have their merits... + * + * For now we do not retransmit anything, while we have some new + * segments to send. In the other cases, follow rule 3 for + * NextSeg() specified in RFC3517. + */ + + if (tcp_may_send_now(sk)) + return 0; + + return 1; +} + /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either @@ -2057,24 +2084,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) } /* OK, demanded retransmission is finished. */ - - /* Forward retransmissions are possible only during Recovery. */ - if (icsk->icsk_ca_state != TCP_CA_Recovery) - return; - - /* No forward retransmissions in Reno are possible. */ - if (tcp_is_reno(tp)) - return; - - /* Yeah, we have to make difficult choice between forward transmission - * and retransmission... Both ways have their merits... - * - * For now we do not retransmit anything, while we have some new - * segments to send. In the other cases, follow rule 3 for - * NextSeg() specified in RFC3517. - */ - - if (tcp_may_send_now(sk)) + if (!tcp_can_forward_retransmit(sk)) return; /* If nothing is SACKed, highest_sack in the loop won't be valid */ -- cgit v1.2.3-70-g09d2 From 34638570b58290e8cb875fb24dcbe836ffeb6cb8 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:22:17 -0700 Subject: tcp: remove obsolete validity concern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 957c4e3d217..6f2a3f4a1af 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2087,10 +2087,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (!tcp_can_forward_retransmit(sk)) return; - /* If nothing is SACKed, highest_sack in the loop won't be valid */ - if (!tp->sacked_out) - return; - if (tp->forward_skb_hint) skb = tp->forward_skb_hint; else -- cgit v1.2.3-70-g09d2 From 61eb55f4db7eaf5fb2d5ec12981a8cda755bb0e1 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:22:59 -0700 Subject: tcp: Reorganize skb tagbit checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6f2a3f4a1af..2f24ecc3706 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2032,6 +2032,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + int mib_idx; if (tp->retransmit_skb_hint) skb = tp->retransmit_skb_hint; @@ -2059,27 +2060,26 @@ void tcp_xmit_retransmit_queue(struct sock *sk) return; if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) break; + if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) + continue; - if (sacked & TCPCB_LOST) { - if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - int mib_idx; - - if (tcp_retransmit_skb(sk, skb)) { - tp->retransmit_skb_hint = NULL; - return; - } - if (icsk->icsk_ca_state != TCP_CA_Loss) - mib_idx = LINUX_MIB_TCPFASTRETRANS; - else - mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; - NET_INC_STATS_BH(sock_net(sk), mib_idx); - - if (skb == tcp_write_queue_head(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); - } + if (!(sacked & TCPCB_LOST)) + continue; + + if (tcp_retransmit_skb(sk, skb)) { + tp->retransmit_skb_hint = NULL; + return; } + if (icsk->icsk_ca_state != TCP_CA_Loss) + mib_idx = LINUX_MIB_TCPFASTRETRANS; + else + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; + NET_INC_STATS_BH(sock_net(sk), mib_idx); + + if (skb == tcp_write_queue_head(sk)) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } } -- cgit v1.2.3-70-g09d2 From 08ebd1721ab8fd362e90ae17b461c07b23fa2824 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:23:49 -0700 Subject: tcp: remove tp->lost_out guard to make joining diff nicer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The validity of the retransmit_high must then be ensured if no L'ed skb exits! This makes a minor change to behavior, we now have to iterate the head to find out that the loop terminates. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 75 ++++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2f24ecc3706..9f44be633ef 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2034,53 +2034,54 @@ void tcp_xmit_retransmit_queue(struct sock *sk) struct sk_buff *skb; int mib_idx; + if (!tp->lost_out) + tp->retransmit_high = tp->snd_una; + if (tp->retransmit_skb_hint) skb = tp->retransmit_skb_hint; else skb = tcp_write_queue_head(sk); /* First pass: retransmit lost packets. */ - if (tp->lost_out) { - tcp_for_write_queue_from(skb, sk) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; + tcp_for_write_queue_from(skb, sk) { + __u8 sacked = TCP_SKB_CB(skb)->sacked; - if (skb == tcp_send_head(sk)) - break; - /* we could do better than to assign each time */ - tp->retransmit_skb_hint = skb; - - /* Assume this retransmit will generate - * only one packet for congestion window - * calculation purposes. This works because - * tcp_retransmit_skb() will chop up the - * packet to be MSS sized and all the - * packet counting works out. - */ - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) - return; - if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) - break; - if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) - continue; + if (skb == tcp_send_head(sk)) + break; + /* we could do better than to assign each time */ + tp->retransmit_skb_hint = skb; + + /* Assume this retransmit will generate + * only one packet for congestion window + * calculation purposes. This works because + * tcp_retransmit_skb() will chop up the + * packet to be MSS sized and all the + * packet counting works out. + */ + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + return; + if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) + break; + if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) + continue; - if (!(sacked & TCPCB_LOST)) - continue; + if (!(sacked & TCPCB_LOST)) + continue; - if (tcp_retransmit_skb(sk, skb)) { - tp->retransmit_skb_hint = NULL; - return; - } - if (icsk->icsk_ca_state != TCP_CA_Loss) - mib_idx = LINUX_MIB_TCPFASTRETRANS; - else - mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; - NET_INC_STATS_BH(sock_net(sk), mib_idx); - - if (skb == tcp_write_queue_head(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); + if (tcp_retransmit_skb(sk, skb)) { + tp->retransmit_skb_hint = NULL; + return; } + if (icsk->icsk_ca_state != TCP_CA_Loss) + mib_idx = LINUX_MIB_TCPFASTRETRANS; + else + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; + NET_INC_STATS_BH(sock_net(sk), mib_idx); + + if (skb == tcp_write_queue_head(sk)) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } /* OK, demanded retransmission is finished. */ -- cgit v1.2.3-70-g09d2 From 0e1c54c2a405494281e0639aacc90db03b50ae77 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:24:21 -0700 Subject: tcp: reorganize retransmit code loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both loops are quite similar, so they can be combined with little effort. As a result, forward_skb_hint becomes obsolete as well. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 - include/net/tcp.h | 1 - net/ipv4/tcp_output.c | 79 +++++++++++++++++++++------------------------------ 3 files changed, 33 insertions(+), 48 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d7637c4b284..76729062829 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -342,7 +342,6 @@ struct tcp_sock { struct sk_buff* lost_skb_hint; struct sk_buff *scoreboard_skb_hint; struct sk_buff *retransmit_skb_hint; - struct sk_buff *forward_skb_hint; struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d0e90c50722..220f54cf42e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1042,7 +1042,6 @@ static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) tp->lost_skb_hint = NULL; tp->scoreboard_skb_hint = NULL; tp->retransmit_skb_hint = NULL; - tp->forward_skb_hint = NULL; } /* MD5 Signature */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9f44be633ef..b5b4ddcdda4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2032,7 +2032,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + struct sk_buff *hole = NULL; int mib_idx; + int fwd_rexmitting = 0; if (!tp->lost_out) tp->retransmit_high = tp->snd_una; @@ -2049,7 +2051,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == tcp_send_head(sk)) break; /* we could do better than to assign each time */ - tp->retransmit_skb_hint = skb; + if (hole == NULL) + tp->retransmit_skb_hint = skb; /* Assume this retransmit will generate * only one packet for congestion window @@ -2060,65 +2063,49 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; - if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) - break; - if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) - continue; - - if (!(sacked & TCPCB_LOST)) - continue; - - if (tcp_retransmit_skb(sk, skb)) { - tp->retransmit_skb_hint = NULL; - return; - } - if (icsk->icsk_ca_state != TCP_CA_Loss) - mib_idx = LINUX_MIB_TCPFASTRETRANS; - else - mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; - NET_INC_STATS_BH(sock_net(sk), mib_idx); - - if (skb == tcp_write_queue_head(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); - } - - /* OK, demanded retransmission is finished. */ - if (!tcp_can_forward_retransmit(sk)) - return; - if (tp->forward_skb_hint) - skb = tp->forward_skb_hint; - else - skb = tcp_write_queue_head(sk); + if (fwd_rexmitting) { +begin_fwd: + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + break; + mib_idx = LINUX_MIB_TCPFORWARDRETRANS; - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - tp->forward_skb_hint = skb; + } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { + if (!tcp_can_forward_retransmit(sk)) + break; + /* Backtrack if necessary to non-L'ed skb */ + if (hole != NULL) { + skb = hole; + hole = NULL; + } + fwd_rexmitting = 1; + goto begin_fwd; - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) - break; + } else if (!(sacked & TCPCB_LOST)) { + if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) + hole = skb; + continue; - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) - break; + } else { + if (icsk->icsk_ca_state != TCP_CA_Loss) + mib_idx = LINUX_MIB_TCPFASTRETRANS; + else + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; + } - if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; - /* Ok, retransmit it. */ if (tcp_retransmit_skb(sk, skb)) { - tp->forward_skb_hint = NULL; - break; + tp->retransmit_skb_hint = NULL; + return; } + NET_INC_STATS_BH(sock_net(sk), mib_idx); if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); - - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFORWARDRETRANS); } } -- cgit v1.2.3-70-g09d2 From f0ceb0ed86b4792a4ed9d3438f5f7572e48f9803 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:24:49 -0700 Subject: tcp: remove retransmit_skb_hint clearing from failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This doesn't much sense here afaict, probably never has. Since fragmenting and collapsing deal the hints by themselves, there should be very little reason for the rexmit loop to do that. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b5b4ddcdda4..f900fae8b87 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2096,10 +2096,8 @@ begin_fwd: if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; - if (tcp_retransmit_skb(sk, skb)) { - tp->retransmit_skb_hint = NULL; + if (tcp_retransmit_skb(sk, skb)) return; - } NET_INC_STATS_BH(sock_net(sk), mib_idx); if (skb == tcp_write_queue_head(sk)) -- cgit v1.2.3-70-g09d2 From ef9da47c7cc64d69526331f315e76b5680d4048f Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:25:15 -0700 Subject: tcp: don't clear retransmit_skb_hint when not necessary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most importantly avoid doing it with cumulative ACK. Not clearing means that we no longer need n^2 processing in resolution of each fast recovery. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/net/tcp.h | 7 ++++++- net/ipv4/tcp_input.c | 4 +++- net/ipv4/tcp_output.c | 8 +++++--- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 220f54cf42e..ea815723d41 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1037,10 +1037,15 @@ static inline void tcp_mib_init(struct net *net) } /* from STCP */ -static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) +static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) { tp->lost_skb_hint = NULL; tp->scoreboard_skb_hint = NULL; +} + +static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) +{ + tcp_clear_retrans_hints_partial(tp); tp->retransmit_skb_hint = NULL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d017aed6edd..44a4fffc2cc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2925,7 +2925,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); + if (skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = NULL; } if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f900fae8b87..239cea7b6c0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -750,7 +750,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, BUG_ON(len > skb->len); - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -1823,7 +1823,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, tp->packets_out -= tcp_skb_pcount(next_skb); /* changed transmit queue under us so clear hints */ - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); + if (next_skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = skb; sk_wmem_free_skb(sk, next_skb); } @@ -1853,7 +1855,7 @@ void tcp_simple_retransmit(struct sock *sk) } } - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); if (prior_lost == tp->lost_out) return; -- cgit v1.2.3-70-g09d2 From 90638a04ad8484b6b6c567656fb3f6d0689e23da Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:25:52 -0700 Subject: tcp: don't clear lost_skb_hint when not necessary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most importantly avoid doing it with cumulative ACK. However, since we have lost_cnt_hint in the picture as well needing adjustments, it's not as trivial as dealing with retransmit_skb_hint (and cannot be done in the all place we could trivially leave retransmit_skb_hint untouched). With the previous patch, this should mostly remove O(n^2) behavior while cumulative ACKs start flowing once rexmit after a lossy round-trip made it through. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 44a4fffc2cc..85627f83665 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2844,6 +2844,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) int flag = 0; u32 pkts_acked = 0; u32 reord = tp->packets_out; + u32 prior_sacked = tp->sacked_out; s32 seq_rtt = -1; s32 ca_seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); @@ -2925,9 +2926,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - tcp_clear_retrans_hints_partial(tp); + tp->scoreboard_skb_hint = NULL; if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = NULL; + if (skb == tp->lost_skb_hint) + tp->lost_skb_hint = NULL; } if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) @@ -2946,6 +2949,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) /* Non-retransmitted hole got filled? That's reordering */ if (reord < prior_fackets) tcp_update_reordering(sk, tp->fackets_out - reord, 0); + + /* No need to care for underflows here because + * the lost_skb_hint gets NULLed if we're past it + * (or something non-trivial happened) + */ + if (tcp_is_fack(tp)) + tp->lost_cnt_hint -= pkts_acked; + else + tp->lost_cnt_hint -= prior_sacked - tp->sacked_out; } tp->fackets_out -= min(pkts_acked, tp->fackets_out); -- cgit v1.2.3-70-g09d2 From 618d9f25548ba6fc3a9cd2ce5cd56f4f015b0635 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 20 Sep 2008 21:26:22 -0700 Subject: tcp: back retransmit_high when it over-estimated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If lost skb is sacked, we might have nothing to retransmit as high as the retransmit_high is pointing to, so place it lower to avoid unnecessary walking. This is mainly for the case where high L'ed skbs gets sacked. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 239cea7b6c0..8f9793a37b6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2035,16 +2035,22 @@ void tcp_xmit_retransmit_queue(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; struct sk_buff *hole = NULL; + u32 last_lost; int mib_idx; int fwd_rexmitting = 0; if (!tp->lost_out) tp->retransmit_high = tp->snd_una; - if (tp->retransmit_skb_hint) + if (tp->retransmit_skb_hint) { skb = tp->retransmit_skb_hint; - else + last_lost = TCP_SKB_CB(skb)->end_seq; + if (after(last_lost, tp->retransmit_high)) + last_lost = tp->retransmit_high; + } else { skb = tcp_write_queue_head(sk); + last_lost = tp->snd_una; + } /* First pass: retransmit lost packets. */ tcp_for_write_queue_from(skb, sk) { @@ -2073,6 +2079,7 @@ begin_fwd: mib_idx = LINUX_MIB_TCPFORWARDRETRANS; } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { + tp->retransmit_high = last_lost; if (!tcp_can_forward_retransmit(sk)) break; /* Backtrack if necessary to non-L'ed skb */ @@ -2089,6 +2096,7 @@ begin_fwd: continue; } else { + last_lost = TCP_SKB_CB(skb)->end_seq; if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else -- cgit v1.2.3-70-g09d2 From 6067804047b64dde89f4f133fc7eba48ee44107d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 20 Sep 2008 22:20:49 -0700 Subject: net: Use hton[sl]() instead of __constant_hton[sl]() where applicable Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 2 +- net/atm/br2684.c | 8 ++++---- net/core/dev.c | 4 ++-- net/ethernet/eth.c | 2 +- net/ipv4/ipvs/ip_vs_core.c | 4 ++-- net/ipv6/ip6_tunnel.c | 4 ++-- net/mac80211/wme.c | 2 +- net/sched/cls_flow.c | 28 ++++++++++++++-------------- net/sched/sch_dsmark.c | 8 ++++---- net/sched/sch_sfq.c | 4 ++-- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++-- 11 files changed, 35 insertions(+), 35 deletions(-) (limited to 'net/ipv4') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 97688cdb550..8883e9c8a22 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -48,7 +48,7 @@ static int vlan_dev_rebuild_header(struct sk_buff *skb) switch (veth->h_vlan_encapsulated_proto) { #ifdef CONFIG_INET - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): /* TODO: Confirm this will work with VLAN headers... */ return arp_find(veth->h_dest, skb); diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 8d9a6f15888..280de481edc 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -375,11 +375,11 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) if (memcmp (skb->data + 6, ethertype_ipv6, sizeof(ethertype_ipv6)) == 0) - skb->protocol = __constant_htons(ETH_P_IPV6); + skb->protocol = htons(ETH_P_IPV6); else if (memcmp (skb->data + 6, ethertype_ipv4, sizeof(ethertype_ipv4)) == 0) - skb->protocol = __constant_htons(ETH_P_IP); + skb->protocol = htons(ETH_P_IP); else goto error; skb_pull(skb, sizeof(llc_oui_ipv4)); @@ -404,9 +404,9 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) skb_reset_network_header(skb); iph = ip_hdr(skb); if (iph->version == 4) - skb->protocol = __constant_htons(ETH_P_IP); + skb->protocol = htons(ETH_P_IP); else if (iph->version == 6) - skb->protocol = __constant_htons(ETH_P_IPV6); + skb->protocol = htons(ETH_P_IPV6); else goto error; skb->pkt_type = PACKET_HOST; diff --git a/net/core/dev.c b/net/core/dev.c index f48d1b24f9c..fdfc4b6a644 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1675,13 +1675,13 @@ static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) } switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): ip_proto = ip_hdr(skb)->protocol; addr1 = ip_hdr(skb)->saddr; addr2 = ip_hdr(skb)->daddr; ihl = ip_hdr(skb)->ihl; break; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): ip_proto = ipv6_hdr(skb)->nexthdr; addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3]; addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3]; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index a80839b02e3..647a9edee37 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -129,7 +129,7 @@ int eth_rebuild_header(struct sk_buff *skb) switch (eth->h_proto) { #ifdef CONFIG_INET - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): return arp_find(eth->h_dest, skb); #endif default: diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index ece748dbd0c..958abf3e5f8 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -938,7 +938,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, EnterFunction(11); - af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6; + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; if (skb->ipvs_property) return NF_ACCEPT; @@ -1258,7 +1258,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, struct ip_vs_conn *cp; int ret, restart, af; - af = (skb->protocol == __constant_htons(ETH_P_IP)) ? AF_INET : AF_INET6; + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 17c7b098cdb..64ce3d33d9c 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1050,10 +1050,10 @@ ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) } switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): ret = ip4ip6_tnl_xmit(skb, dev); break; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): ret = ip6ip6_tnl_xmit(skb, dev); break; default: diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 6748dedcab5..c703f8b44e9 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -39,7 +39,7 @@ static unsigned int classify_1d(struct sk_buff *skb) return skb->priority - 256; switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): dscp = ip_hdr(skb)->tos & 0xfc; break; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 8f63a1a9401..0ebaff637e3 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -67,9 +67,9 @@ static inline u32 addr_fold(void *addr) static u32 flow_get_src(const struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): return ntohl(ip_hdr(skb)->saddr); - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]); default: return addr_fold(skb->sk); @@ -79,9 +79,9 @@ static u32 flow_get_src(const struct sk_buff *skb) static u32 flow_get_dst(const struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): return ntohl(ip_hdr(skb)->daddr); - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]); default: return addr_fold(skb->dst) ^ (__force u16)skb->protocol; @@ -91,9 +91,9 @@ static u32 flow_get_dst(const struct sk_buff *skb) static u32 flow_get_proto(const struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): return ip_hdr(skb)->protocol; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): return ipv6_hdr(skb)->nexthdr; default: return 0; @@ -120,7 +120,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb) u32 res = 0; switch (skb->protocol) { - case __constant_htons(ETH_P_IP): { + case htons(ETH_P_IP): { struct iphdr *iph = ip_hdr(skb); if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && @@ -128,7 +128,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb) res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4)); break; } - case __constant_htons(ETH_P_IPV6): { + case htons(ETH_P_IPV6): { struct ipv6hdr *iph = ipv6_hdr(skb); if (has_ports(iph->nexthdr)) @@ -147,7 +147,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb) u32 res = 0; switch (skb->protocol) { - case __constant_htons(ETH_P_IP): { + case htons(ETH_P_IP): { struct iphdr *iph = ip_hdr(skb); if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && @@ -155,7 +155,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb) res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2)); break; } - case __constant_htons(ETH_P_IPV6): { + case htons(ETH_P_IPV6): { struct ipv6hdr *iph = ipv6_hdr(skb); if (has_ports(iph->nexthdr)) @@ -213,9 +213,9 @@ static u32 flow_get_nfct(const struct sk_buff *skb) static u32 flow_get_nfct_src(const struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, src.u3.ip)); - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): return ntohl(CTTUPLE(skb, src.u3.ip6[3])); } fallback: @@ -225,9 +225,9 @@ fallback: static u32 flow_get_nfct_dst(const struct sk_buff *skb) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, dst.u3.ip)); - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): return ntohl(CTTUPLE(skb, dst.u3.ip6[3])); } fallback: diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index edd1298f85f..ba43aab3a85 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -202,7 +202,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (p->set_tc_index) { switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): if (skb_cow_head(skb, sizeof(struct iphdr))) goto drop; @@ -210,7 +210,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) & ~INET_ECN_MASK; break; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): if (skb_cow_head(skb, sizeof(struct ipv6hdr))) goto drop; @@ -289,11 +289,11 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) pr_debug("index %d->%d\n", skb->tc_index, index); switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): ipv4_change_dsfield(ip_hdr(skb), p->mask[index], p->value[index]); break; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index], p->value[index]); break; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 6e041d10dbd..fe1508ef0d3 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -119,7 +119,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) u32 h, h2; switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): { const struct iphdr *iph = ip_hdr(skb); h = iph->daddr; @@ -134,7 +134,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) h2 ^= *(((u32*)iph) + iph->ihl); break; } - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): { struct ipv6hdr *iph = ipv6_hdr(skb); h = iph->daddr.s6_addr32[3]; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index e55427f73df..5c1954d28d0 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -769,7 +769,7 @@ repost: /* check for expected message types */ /* The order of some of these tests is important. */ switch (headerp->rm_type) { - case __constant_htonl(RDMA_MSG): + case htonl(RDMA_MSG): /* never expect read chunks */ /* never expect reply chunks (two ways to check) */ /* never expect write chunks without having offered RDMA */ @@ -802,7 +802,7 @@ repost: rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len); break; - case __constant_htonl(RDMA_NOMSG): + case htonl(RDMA_NOMSG): /* never expect read or write chunks, always reply chunks */ if (headerp->rm_body.rm_chunks[0] != xdr_zero || headerp->rm_body.rm_chunks[1] != xdr_zero || -- cgit v1.2.3-70-g09d2 From f5fff5dc8a7a3f395b0525c02ba92c95d42b7390 Mon Sep 17 00:00:00 2001 From: Tom Quetchenbach Date: Sun, 21 Sep 2008 00:21:51 -0700 Subject: tcp: advertise MSS requested by user I'm trying to use the TCP_MAXSEG option to setsockopt() to set the MSS for both sides of a bidirectional connection. man tcp says: "If this option is set before connection establishment, it also changes the MSS value announced to the other end in the initial packet." However, the kernel only uses the MTU/route cache to set the advertised MSS. That means if I set the MSS to, say, 500 before calling connect(), I will send at most 500-byte packets, but I will still receive 1500-byte packets in reply. This is a bug, either in the kernel or the documentation. This patch (applies to latest net-2.6) reduces the advertised value to that requested by the user as long as setsockopt() is called before connect() or accept(). This seems like the behavior that one would expect as well as that which is documented. I've tried to make sure that things that depend on the advertised MSS are set correctly. Signed-off-by: Tom Quetchenbach Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++++ net/ipv4/tcp_output.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3dfbc21e555..44aef1c1f37 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1364,6 +1364,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8f9793a37b6..c3d58ee3e16 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2232,6 +2232,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct sk_buff *skb; struct tcp_md5sig_key *md5; __u8 *md5_hash_location; + int mss; skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) @@ -2242,13 +2243,17 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb->dst = dst_clone(dst); + mss = dst_metric(dst, RTAX_ADVMSS); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) + mss = tp->rx_opt.user_mss; + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ __u8 rcv_wscale; /* Set this up on the first call only */ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), - dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, @@ -2258,8 +2263,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_header_size = tcp_synack_options(sk, req, - dst_metric(dst, RTAX_ADVMSS), + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5) + sizeof(struct tcphdr); @@ -2333,6 +2337,9 @@ static void tcp_connect_init(struct sock *sk) if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) + tp->advmss = tp->rx_opt.user_mss; + tcp_initialize_rcv_mss(sk); tcp_select_initial_window(tcp_full_space(sk), -- cgit v1.2.3-70-g09d2 From e6f225ebb7c35fe30fdf8608927c5cf8fce6de7d Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Fri, 19 Sep 2008 20:41:56 +0200 Subject: ipvs: Restrict sync message to 255 connections The nr_conns variable in the sync message header is only eight bits wide and will overflow on interfaces with a large MTU. As a result the backup won't parse all connections contained in the sync buffer. On regular ethernet with an MTU of 1500 this isn't a problem, because we can't overflow the value, but consider jumbo frames being used on a cross-over connection between both directors. We now restrict the size of the sync buffer, so that we never put more than 255 connections into a single sync buffer. Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_sync.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 28237a5f62e..de5e7e118ee 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -99,6 +100,7 @@ struct ip_vs_sync_thread_data { */ #define SYNC_MESG_HEADER_LEN 4 +#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ struct ip_vs_sync_mesg { __u8 nr_conns; @@ -516,8 +518,8 @@ static int set_sync_mesg_maxlen(int sync_state) num = (dev->mtu - sizeof(struct iphdr) - sizeof(struct udphdr) - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - sync_send_mesg_maxlen = - SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num; + sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + + SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); IP_VS_DBG(7, "setting the maximum length of sync sending " "message %d.\n", sync_send_mesg_maxlen); } else if (sync_state == IP_VS_STATE_BACKUP) { -- cgit v1.2.3-70-g09d2 From 8d5803bf6fbe5264000afc8c34bff08e8ecc023b Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Sat, 20 Sep 2008 11:48:33 +0200 Subject: ipvs: Fix unused label warning Signed-off-by: Sven Wegener Signed-off-by: Simon Horman --- net/ipv4/ipvs/ip_vs_ctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 771551d8fba..0302cf3e503 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1330,7 +1330,9 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) out_unlock: write_unlock_bh(&__ip_vs_svc_lock); +#ifdef CONFIG_IP_VS_IPV6 out: +#endif if (old_sched) ip_vs_scheduler_put(old_sched); -- cgit v1.2.3-70-g09d2 From 43f59c89399fd76883a06c551f24794e98409432 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 21 Sep 2008 21:28:51 -0700 Subject: net: Remove __skb_insert() calls outside of skbuff internals. This minor cleanup simplifies later changes which will convert struct sk_buff and friends over to using struct list_head. Signed-off-by: David S. Miller --- drivers/net/cassini.c | 2 +- drivers/net/ppp_generic.c | 2 +- drivers/net/pppol2tp.c | 2 +- include/net/tcp.h | 4 ++-- net/ipv4/tcp_input.c | 4 ++-- net/sctp/ulpqueue.c | 5 ++--- 6 files changed, 9 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c index f1936d51b45..40ff6a90d0d 100644 --- a/drivers/net/cassini.c +++ b/drivers/net/cassini.c @@ -2182,7 +2182,7 @@ static inline void cas_rx_flow_pkt(struct cas *cp, const u64 *words, * do any additional locking here. stick the buffer * at the end. */ - __skb_insert(skb, flow->prev, (struct sk_buff *) flow, flow); + __skb_queue_tail(flow, skb); if (words[0] & RX_COMP1_RELEASE_FLOW) { while ((skb = __skb_dequeue(flow))) { cas_skb_release(skb); diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index ddccc074a76..98e04958fef 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -1864,7 +1864,7 @@ ppp_mp_insert(struct ppp *ppp, struct sk_buff *skb) for (p = list->next; p != (struct sk_buff *)list; p = p->next) if (seq_before(seq, p->sequence)) break; - __skb_insert(skb, p->prev, p, list); + __skb_queue_before(list, p, skb); } /* diff --git a/drivers/net/pppol2tp.c b/drivers/net/pppol2tp.c index ff175e8f36b..185b1dff10a 100644 --- a/drivers/net/pppol2tp.c +++ b/drivers/net/pppol2tp.c @@ -353,7 +353,7 @@ static void pppol2tp_recv_queue_skb(struct pppol2tp_session *session, struct sk_ spin_lock_bh(&session->reorder_q.lock); skb_queue_walk_safe(&session->reorder_q, skbp, tmp) { if (PPPOL2TP_SKB_CB(skbp)->ns > ns) { - __skb_insert(skb, skbp->prev, skbp, &session->reorder_q); + __skb_queue_before(&session->reorder_q, skbp, skb); PRINTK(session->debug, PPPOL2TP_MSG_SEQ, KERN_DEBUG, "%s: pkt %hu, inserted before %hu, reorder_q len=%d\n", session->name, ns, PPPOL2TP_SKB_CB(skbp)->ns, diff --git a/include/net/tcp.h b/include/net/tcp.h index ea815723d41..f857c3eff71 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1268,12 +1268,12 @@ static inline void tcp_insert_write_queue_after(struct sk_buff *skb, __skb_queue_after(&sk->sk_write_queue, skb, buff); } -/* Insert skb between prev and next on the write queue of sk. */ +/* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, struct sock *sk) { - __skb_insert(new, skb->prev, skb, &sk->sk_write_queue); + __skb_queue_before(&sk->sk_write_queue, skb, new); if (sk->sk_send_head == skb) sk->sk_send_head = new; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 85627f83665..cbfe13d5f42 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4156,7 +4156,7 @@ drop: skb1 = skb1->prev; } } - __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue); + __skb_queue_after(&tp->out_of_order_queue, skb1, skb); /* And clean segments covered by new one as whole. */ while ((skb1 = skb->next) != @@ -4254,7 +4254,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, memcpy(nskb->head, skb->head, header); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_insert(nskb, skb->prev, skb, list); + __skb_queue_before(list, skb, nskb); skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 5061a26c502..7b23803343c 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -317,7 +317,7 @@ static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq, } /* Insert before pos. */ - __skb_insert(sctp_event2skb(event), pos->prev, pos, &ulpq->reasm); + __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event)); } @@ -825,8 +825,7 @@ static void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq, /* Insert before pos. */ - __skb_insert(sctp_event2skb(event), pos->prev, pos, &ulpq->lobby); - + __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, -- cgit v1.2.3-70-g09d2 From 77d40a0952b16e020ce07c4cf9fb22024448275b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Sep 2008 01:29:23 -0700 Subject: tcp: Fix order of tests in tcp_retransmit_skb() tcp_write_queue_next() must only be made if we know that tcp_skb_is_last() evaluates to false. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c3d58ee3e16..a8499ef3234 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1932,8 +1932,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Collapse two adjacent packets if worthwhile and we can. */ if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && (skb->len < (cur_mss >> 1)) && - (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && (!tcp_skb_is_last(sk, skb)) && + (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && (tcp_skb_pcount(skb) == 1 && -- cgit v1.2.3-70-g09d2 From 28e3487b7dd8a9791baac924bc887140ec747bed Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Sep 2008 02:51:41 -0700 Subject: tcp: Fix queue traversal in tcp_use_frto(). We must check tcp_skb_is_last() before doing a tcp_write_queue_next(). Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cbfe13d5f42..3b76bce769d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1746,6 +1746,8 @@ int tcp_use_frto(struct sock *sk) return 0; skb = tcp_write_queue_head(sk); + if (tcp_skb_is_last(sk, skb)) + return 1; skb = tcp_write_queue_next(sk, skb); /* Skips head */ tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) -- cgit v1.2.3-70-g09d2 From 4dd7972d1204c3851a4092cecd2207e05eb29b09 Mon Sep 17 00:00:00 2001 From: Vitaliy Gusev Date: Wed, 1 Oct 2008 01:51:39 -0700 Subject: tcp: Fix NULL dereference in tcp_4_send_ack() Fix NULL dereference in tcp_4_send_ack(). As skb->dev is reset to NULL in tcp_v4_rcv() thus OOPS occurs: BUG: unable to handle kernel NULL pointer dereference at 00000000000004d0 IP: [] tcp_v4_send_ack+0x203/0x250 Stack: ffff810005dbb000 ffff810015c8acc0 e77b2c6e5f861600 a01610802e90cb6d 0a08010100000000 88afffff88afffff 0000000080762be8 0000000115c872e8 0004122000000000 0000000000000001 ffffffff80762b88 0000000000000020 Call Trace: [] tcp_v4_reqsk_send_ack+0x20/0x22 [] tcp_check_req+0x108/0x14c [] ? rt_intern_hash+0x322/0x33c [] tcp_v4_do_rcv+0x399/0x4ec [] ? skb_checksum+0x4f/0x272 [] ? __inet_lookup_listener+0x14a/0x15c [] tcp_v4_rcv+0x6a1/0x701 [] ip_local_deliver_finish+0x157/0x24a [] ip_local_deliver+0x72/0x7c [] ip_rcv_finish+0x38d/0x3b2 [] ? scsi_io_completion+0x19d/0x39e [] ip_rcv+0x2a2/0x2e5 [] netif_receive_skb+0x293/0x303 [] process_backlog+0x80/0xd0 [] ? __rcu_process_callbacks+0x125/0x1b4 [] net_rx_action+0xb9/0x17f [] __do_softirq+0xa3/0x164 [] call_softirq+0x1c/0x28 [] do_softirq+0x34/0x72 [] local_bh_enable_ip+0x3f/0x50 [] _spin_unlock_bh+0x12/0x14 [] release_sock+0xb8/0xc1 [] inet_stream_connect+0x146/0x25c [] ? autoremove_wake_function+0x0/0x38 [] sys_connect+0x68/0x8e [] ? fd_install+0x5f/0x68 [] ? sock_map_fd+0x55/0x62 [] system_call_after_swapgs+0x7b/0x80 Code: 41 10 11 d0 83 d0 00 4d 85 ed 89 45 c0 c7 45 c4 08 00 00 00 74 07 41 8b 45 04 89 45 c8 48 8b 43 20 8b 4d b8 48 8d 55 b0 48 89 de <48> 8b 80 d0 04 00 00 48 8b b8 60 01 00 00 e8 20 ae fe ff 65 48 RIP [] tcp_v4_send_ack+0x203/0x250 RSP CR2: 00000000000004d0 Signed-off-by: Vitaliy Gusev Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1b4fee20fc9..011478e46c4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -618,7 +618,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ]; } rep; struct ip_reply_arg arg; - struct net *net = dev_net(skb->dev); + struct net *net = dev_net(skb->dst->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); -- cgit v1.2.3-70-g09d2 From a210d01ae3ee006b59e54e772a7f212486e0f021 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 1 Oct 2008 07:28:28 -0700 Subject: ipv4: Loosen source address check on IPv4 output ip_route_output() contains a check to make sure that no flows with non-local source IP addresses are routed. This obviously makes using such addresses impossible. This patch introduces a flowi flag which makes omitting this check possible. The new flag provides a way of handling transparent and non-transparent connections differently. Signed-off-by: Julian Anastasov Signed-off-by: KOVACS Krisztian Signed-off-by: David S. Miller --- include/net/flow.h | 2 ++ net/ipv4/route.c | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/flow.h b/include/net/flow.h index 228b2477cee..b45a5e4fcad 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -47,6 +47,8 @@ struct flowi { #define fl4_scope nl_u.ip4_u.scope __u8 proto; + __u8 flags; +#define FLOWI_FLAG_ANYSRC 0x01 union { struct { __be16 sport; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f62187bb6d0..a6d7c584f53 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2361,11 +2361,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, ipv4_is_zeronet(oldflp->fl4_src)) goto out; - /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); - if (dev_out == NULL) - goto out; - /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: 1. ip_dev_find(net, saddr) can return wrong iface, if saddr @@ -2377,6 +2372,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (oldflp->oif == 0 && (ipv4_is_multicast(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(net, oldflp->fl4_src); + if (dev_out == NULL) + goto out; + /* Special hack: user can direct multicasts and limited broadcast via necessary interface without fiddling with IP_MULTICAST_IF or IP_PKTINFO. @@ -2395,9 +2395,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, fl.oif = dev_out->ifindex; goto make_route; } - if (dev_out) + + if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(net, oldflp->fl4_src); + if (dev_out == NULL) + goto out; dev_put(dev_out); - dev_out = NULL; + dev_out = NULL; + } } -- cgit v1.2.3-70-g09d2 From f5715aea4564f233767ea1d944b2637a5fd7cd2e Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 1 Oct 2008 07:30:02 -0700 Subject: ipv4: Implement IP_TRANSPARENT socket option This patch introduces the IP_TRANSPARENT socket option: enabling that will make the IPv4 routing omit the non-local source address check on output. Setting IP_TRANSPARENT requires NET_ADMIN capability. Signed-off-by: KOVACS Krisztian Signed-off-by: David S. Miller --- include/linux/in.h | 1 + include/net/inet_sock.h | 3 ++- include/net/inet_timewait_sock.h | 3 ++- net/ipv4/inet_timewait_sock.c | 1 + net/ipv4/ip_sockglue.c | 15 ++++++++++++++- 5 files changed, 20 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/in.h b/include/linux/in.h index 4065313cd7e..db458beef19 100644 --- a/include/linux/in.h +++ b/include/linux/in.h @@ -75,6 +75,7 @@ struct in_addr { #define IP_IPSEC_POLICY 16 #define IP_XFRM_POLICY 17 #define IP_PASSSEC 18 +#define IP_TRANSPARENT 19 /* BSD compatibility */ #define IP_RECVRETOPTS IP_RETOPTS diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 643e26be058..e97b66e2a9d 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -129,7 +129,8 @@ struct inet_sock { is_icsk:1, freebind:1, hdrincl:1, - mc_loop:1; + mc_loop:1, + transparent:1; int mc_index; __be32 mc_addr; struct ip_mc_socklist *mc_list; diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 91324908fcc..80e4977631b 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -128,7 +128,8 @@ struct inet_timewait_sock { __be16 tw_dport; __u16 tw_num; /* And these are ours. */ - __u8 tw_ipv6only:1; + __u8 tw_ipv6only:1, + tw_transparent:1; /* 15 bits hole, try to pack */ __u16 tw_ipv6_offset; unsigned long tw_ttd; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 743f011b9a8..1c5fd38f882 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -126,6 +126,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_reuse = sk->sk_reuse; tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; + tw->tw_transparent = inet->transparent; tw->tw_prot = sk->sk_prot_creator; twsk_net_set(tw, hold_net(sock_net(sk))); atomic_set(&tw->tw_refcnt, 1); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 105d92a039b..465abf0a986 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -419,7 +419,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, (1<= sizeof(int)) { @@ -878,6 +878,16 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = xfrm_user_policy(sk, optname, optval, optlen); break; + case IP_TRANSPARENT: + if (!capable(CAP_NET_ADMIN)) { + err = -EPERM; + break; + } + if (optlen < 1) + goto e_inval; + inet->transparent = !!val; + break; + default: err = -ENOPROTOOPT; break; @@ -1130,6 +1140,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_FREEBIND: val = inet->freebind; break; + case IP_TRANSPARENT: + val = inet->transparent; + break; default: release_sock(sk); return -ENOPROTOOPT; -- cgit v1.2.3-70-g09d2 From b9fb15067ce93497bef852c05e406d7a96212a9a Mon Sep 17 00:00:00 2001 From: Tóth László Attila Date: Wed, 1 Oct 2008 07:31:24 -0700 Subject: ipv4: Allow binding to non-local addresses if IP_TRANSPARENT is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setting IP_TRANSPARENT is not really useful without allowing non-local binds for the socket. To make user-space code simpler we allow these binds even if IP_TRANSPARENT is set but IP_FREEBIND is not. Signed-off-by: Tóth László Attila Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8a3ac1fa71a..1fbff5fa424 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -469,7 +469,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ err = -EADDRNOTAVAIL; if (!sysctl_ip_nonlocal_bind && - !inet->freebind && + !(inet->freebind || inet->transparent) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && -- cgit v1.2.3-70-g09d2 From 1668e010cbe1a7567c81d4c02d31dde9859e9da1 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 1 Oct 2008 07:33:10 -0700 Subject: ipv4: Make inet_sock.h independent of route.h inet_iif() in inet_sock.h requires route.h. Since users of inet_iif() usually require other route.h functionality anyway this patch moves inet_iif() to route.h. Signed-off-by: KOVACS Krisztian Signed-off-by: David S. Miller --- include/net/inet_sock.h | 7 ------- include/net/ip_vs.h | 1 + include/net/route.h | 5 +++++ net/ipv4/netfilter/nf_nat_helper.c | 1 + net/ipv6/af_inet6.c | 1 + 5 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index e97b66e2a9d..139b78b4dfe 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -24,7 +24,6 @@ #include #include #include -#include #include /** struct ip_options - IP Options @@ -195,12 +194,6 @@ static inline int inet_sk_ehashfn(const struct sock *sk) return inet_ehashfn(net, laddr, lport, faddr, fport); } - -static inline int inet_iif(const struct sk_buff *skb) -{ - return skb->rtable->rt_iif; -} - static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops) { struct request_sock *req = reqsk_alloc(ops); diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 33e2ac6ceb3..0b2071d9326 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -22,6 +22,7 @@ #include #include /* for union nf_inet_addr */ +#include #include /* for struct ipv6hdr */ #include /* for ipv6_addr_copy */ diff --git a/include/net/route.h b/include/net/route.h index 4f0d8c14736..31d1485b624 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -204,4 +204,9 @@ static inline struct inet_peer *rt_get_peer(struct rtable *rt) return rt->peer; } +static inline int inet_iif(const struct sk_buff *skb) +{ + return skb->rtable->rt_iif; +} + #endif /* _ROUTE_H */ diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 11976ea2988..112dcfa1290 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 95055f8c3f3..f018704ecb8 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 88ef4a5a78e63420dd1dd770f1bd1dc198926b04 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 1 Oct 2008 07:41:00 -0700 Subject: tcp: Handle TCP SYN+ACK/ACK/RST transparency The TCP stack sends out SYN+ACK/ACK/RST reply packets in response to incoming packets. The non-local source address check on output bites us again, as replies for transparently redirected traffic won't have a chance to leave the node. This patch selectively sets the FLOWI_FLAG_ANYSRC flag when doing the route lookup for those replies. Transparent replies are enabled if the listening socket has the transparent socket flag set. Signed-off-by: KOVACS Krisztian Signed-off-by: David S. Miller --- include/net/inet_sock.h | 8 +++++++- include/net/ip.h | 3 +++ net/ipv4/tcp_ipv4.c | 12 +++++++++--- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 139b78b4dfe..dced3f64f97 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -72,7 +72,8 @@ struct inet_request_sock { sack_ok : 1, wscale_ok : 1, ecn_ok : 1, - acked : 1; + acked : 1, + no_srccheck: 1; struct ip_options *opt; }; @@ -204,4 +205,9 @@ static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops return req; } +static inline __u8 inet_sk_flowi_flags(const struct sock *sk) +{ + return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0; +} + #endif /* _INET_SOCK_H */ diff --git a/include/net/ip.h b/include/net/ip.h index 250e6ef025a..90b27f634b7 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -140,12 +140,15 @@ static inline void ip_tr_mc_map(__be32 addr, char *buf) struct ip_reply_arg { struct kvec iov[1]; + int flags; __wsum csum; int csumoffset; /* u16 offset of csum in iov[0].iov_base */ /* -1 if not needed */ int bound_dev_if; }; +#define IP_REPLY_ARG_NOSRCCHECK 1 + void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d13688e3558..8b24bd833cb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -591,6 +591,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) ip_hdr(skb)->saddr, /* XXX */ sizeof(struct tcphdr), IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; + arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; net = dev_net(skb->dst->dev); ip_send_reply(net->ipv4.tcp_sock, skb, @@ -606,7 +607,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts, int oif, - struct tcp_md5sig_key *key) + struct tcp_md5sig_key *key, + int reply_flags) { struct tcphdr *th = tcp_hdr(skb); struct { @@ -659,6 +661,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ip_hdr(skb)->daddr, &rep.th); } #endif + arg.flags = reply_flags; arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); @@ -681,7 +684,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent, tw->tw_bound_dev_if, - tcp_twsk_md5_key(tcptw) + tcp_twsk_md5_key(tcptw), + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0 ); inet_twsk_put(tw); @@ -694,7 +698,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent, 0, - tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr)); + tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr), + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0); } /* @@ -1244,6 +1249,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; + ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(sk, skb); if (!want_cookie) TCP_ECN_create_request(req, tcp_hdr(skb)); -- cgit v1.2.3-70-g09d2 From 86b08d867d7de001ab224180ed7865fab93fd56e Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 1 Oct 2008 07:44:42 -0700 Subject: ipv4: Make Netfilter's ip_route_me_harder() non-local address compatible Netfilter's ip_route_me_harder() tries to re-route packets either generated or re-routed by Netfilter. This patch changes ip_route_me_harder() to handle packets from non-locally-bound sockets with IP_TRANSPARENT set as local and to set the appropriate flowi flags when re-doing the routing lookup. Signed-off-by: KOVACS Krisztian Signed-off-by: David S. Miller --- include/net/ip.h | 6 ++++++ net/ipv4/inet_connection_sock.c | 1 + net/ipv4/ip_output.c | 4 +++- net/ipv4/netfilter.c | 3 +++ net/ipv4/syncookies.c | 2 ++ 5 files changed, 15 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index 90b27f634b7..d678ea3d474 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -29,6 +29,7 @@ #include #include +#include struct sock; @@ -149,6 +150,11 @@ struct ip_reply_arg { #define IP_REPLY_ARG_NOSRCCHECK 1 +static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) +{ + return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; +} + void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0c1ae68ee84..432c570c9f5 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -335,6 +335,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk, .saddr = ireq->loc_addr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, + .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { .sport = inet_sk(sk)->sport, .dport = ireq->rmt_port } } }; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d533a89e08d..d2a8f8bb78a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -340,6 +340,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) .saddr = inet->saddr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, + .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { .sport = inet->sport, .dport = inet->dport } } }; @@ -1371,7 +1372,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar .uli_u = { .ports = { .sport = tcp_hdr(skb)->dest, .dport = tcp_hdr(skb)->source } }, - .proto = sk->sk_protocol }; + .proto = sk->sk_protocol, + .flags = ip_reply_arg_flowi_flags(arg) }; security_skb_classify_flow(skb, &fl); if (ip_route_output_key(sock_net(sk), &rt, &fl)) return; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index f8edacdf991..01671ad51ed 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -20,6 +20,8 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) unsigned int type; type = inet_addr_type(&init_net, iph->saddr); + if (skb->sk && inet_sk(skb->sk)->transparent) + type = RTN_LOCAL; if (addr_type == RTN_UNSPEC) addr_type = type; @@ -33,6 +35,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; fl.mark = skb->mark; + fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; if (ip_route_output_key(&init_net, &rt, &fl) != 0) return -1; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 9d38005abba..929302b2ba9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -16,6 +16,7 @@ #include #include #include +#include /* Timestamps: lowest 9 bits store TCP options */ #define TSBITS 9 @@ -337,6 +338,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, .saddr = ireq->loc_addr, .tos = RT_CONN_FLAGS(sk) } }, .proto = IPPROTO_TCP, + .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { .sport = th->dest, .dport = th->source } } }; -- cgit v1.2.3-70-g09d2 From a3116ac5c216fc3c145906a46df9ce542ff7dcf2 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 1 Oct 2008 07:46:49 -0700 Subject: tcp: Port redirection support for TCP Current TCP code relies on the local port of the listening socket being the same as the destination address of the incoming connection. Port redirection used by many transparent proxying techniques obviously breaks this, so we have to store the original destination port address. This patch extends struct inet_request_sock and stores the incoming destination port value there. It also modifies the handshake code to use that value as the source port when sending reply packets. Signed-off-by: KOVACS Krisztian Signed-off-by: David S. Miller --- include/net/inet_sock.h | 2 +- include/net/tcp.h | 1 + net/ipv4/inet_connection_sock.c | 2 ++ net/ipv4/syncookies.c | 1 + net/ipv4/tcp_output.c | 2 +- 5 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index dced3f64f97..de0ecc71cf0 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -61,8 +61,8 @@ struct inet_request_sock { struct request_sock req; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) u16 inet6_rsk_offset; - /* 2 bytes hole, try to pack */ #endif + __be16 loc_port; __be32 loc_addr; __be32 rmt_addr; __be16 rmt_port; diff --git a/include/net/tcp.h b/include/net/tcp.h index 12c9b4fec04..f6cc3414315 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -976,6 +976,7 @@ static inline void tcp_openreq_init(struct request_sock *req, ireq->acked = 0; ireq->ecn_ok = 0; ireq->rmt_port = tcp_hdr(skb)->source; + ireq->loc_port = tcp_hdr(skb)->dest; } extern void tcp_enter_memory_pressure(struct sock *sk); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 432c570c9f5..21fcc5a9045 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -516,6 +516,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, newicsk->icsk_bind_hash = NULL; inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; + inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port); + inet_sk(newsk)->sport = inet_rsk(req)->loc_port; newsk->sk_write_space = sk_stream_write_space; newicsk->icsk_retransmits = 0; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 929302b2ba9..d346c22aa6a 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -297,6 +297,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; req->mss = mss; + ireq->loc_port = th->dest; ireq->rmt_port = th->source; ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a8499ef3234..493553c71d3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2275,7 +2275,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->syn = 1; th->ack = 1; TCP_ECN_make_synack(req, th); - th->source = inet_sk(sk)->sport; + th->source = ireq->loc_port; th->dest = ireq->rmt_port; /* Setting of flags are superfluous here for callers (and ECE is * not even correctly set) -- cgit v1.2.3-70-g09d2 From bcd41303f422015ab662c9276d108414aa75b796 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 1 Oct 2008 07:48:10 -0700 Subject: udp: Export UDP socket lookup function The iptables tproxy code has to be able to do UDP socket hash lookups, so we have to provide an exported lookup function for this purpose. Signed-off-by: KOVACS Krisztian Signed-off-by: David S. Miller --- include/net/udp.h | 4 ++++ net/ipv4/udp.c | 7 +++++++ 2 files changed, 11 insertions(+) (limited to 'net/ipv4') diff --git a/include/net/udp.h b/include/net/udp.h index addcdc67234..d38f6f2419f 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -148,6 +148,10 @@ extern int udp_lib_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen, int (*push_pending_frames)(struct sock *)); +extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, + __be32 daddr, __be16 dport, + int dif); + DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6); /* UDP-Lite does not have a standardized MIB yet, so we inherit from UDP */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 57e26fa6618..c83d0ef469c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -302,6 +302,13 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, return result; } +struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, + __be32 daddr, __be16 dport, int dif) +{ + return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash); +} +EXPORT_SYMBOL_GPL(udp4_lib_lookup); + static inline struct sock *udp_v4_mcast_next(struct sock *sk, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, -- cgit v1.2.3-70-g09d2 From c7004482e8dcb7c3c72666395cfa98a216a4fb70 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 6 Oct 2008 10:43:54 -0700 Subject: tcp: Respect SO_RCVLOWAT in tcp_poll(). Based upon a report by Vito Caputo. Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1ab341e5d3e..7d81a1ee550 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -384,13 +384,17 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Connected? */ if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + int target = sock_rcvlowat(sk, 0, INT_MAX); + + if (tp->urg_seq == tp->copied_seq && + !sock_flag(sk, SOCK_URGINLINE) && + tp->urg_data) + target--; + /* Potential race condition. If read of tp below will * escape above sk->sk_state, we can be illegally awaken * in SYN_* states. */ - if ((tp->rcv_nxt != tp->copied_seq) && - (tp->urg_seq != tp->copied_seq || - tp->rcv_nxt != tp->copied_seq + 1 || - sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) + if (tp->rcv_nxt - tp->copied_seq >= target) mask |= POLLIN | POLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { -- cgit v1.2.3-70-g09d2 From cb7f6a7b716e801097b564dec3ccb58d330aef56 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Fri, 19 Sep 2008 12:32:57 +0200 Subject: IPVS: Move IPVS to net/netfilter/ipvs Since IPVS now has partial IPv6 support, this patch moves IPVS from net/ipv4/ipvs to net/netfilter/ipvs. It's a result of: $ git mv net/ipv4/ipvs net/netfilter and adapting the relevant Kconfigs/Makefiles to the new path. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/ipv4/Kconfig | 2 - net/ipv4/Makefile | 1 - net/ipv4/ipvs/Kconfig | 239 --- net/ipv4/ipvs/Makefile | 33 - net/ipv4/ipvs/ip_vs_app.c | 622 ------ net/ipv4/ipvs/ip_vs_conn.c | 1110 ---------- net/ipv4/ipvs/ip_vs_core.c | 1542 -------------- net/ipv4/ipvs/ip_vs_ctl.c | 3443 ------------------------------- net/ipv4/ipvs/ip_vs_est.c | 166 -- net/ipv4/ipvs/ip_vs_ftp.c | 410 ---- net/ipv4/ipvs/ip_vs_lblc.c | 555 ----- net/ipv4/ipvs/ip_vs_lblcr.c | 755 ------- net/ipv4/ipvs/ip_vs_lc.c | 103 - net/ipv4/ipvs/ip_vs_nq.c | 138 -- net/ipv4/ipvs/ip_vs_proto.c | 288 --- net/ipv4/ipvs/ip_vs_proto_ah_esp.c | 235 --- net/ipv4/ipvs/ip_vs_proto_tcp.c | 732 ------- net/ipv4/ipvs/ip_vs_proto_udp.c | 533 ----- net/ipv4/ipvs/ip_vs_rr.c | 112 - net/ipv4/ipvs/ip_vs_sched.c | 251 --- net/ipv4/ipvs/ip_vs_sed.c | 140 -- net/ipv4/ipvs/ip_vs_sh.c | 258 --- net/ipv4/ipvs/ip_vs_sync.c | 942 --------- net/ipv4/ipvs/ip_vs_wlc.c | 128 -- net/ipv4/ipvs/ip_vs_wrr.c | 237 --- net/ipv4/ipvs/ip_vs_xmit.c | 1004 --------- net/netfilter/Kconfig | 2 + net/netfilter/Makefile | 3 + net/netfilter/ipvs/Kconfig | 239 +++ net/netfilter/ipvs/Makefile | 33 + net/netfilter/ipvs/ip_vs_app.c | 622 ++++++ net/netfilter/ipvs/ip_vs_conn.c | 1110 ++++++++++ net/netfilter/ipvs/ip_vs_core.c | 1542 ++++++++++++++ net/netfilter/ipvs/ip_vs_ctl.c | 3443 +++++++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_dh.c | 261 +++ net/netfilter/ipvs/ip_vs_est.c | 166 ++ net/netfilter/ipvs/ip_vs_ftp.c | 410 ++++ net/netfilter/ipvs/ip_vs_lblc.c | 555 +++++ net/netfilter/ipvs/ip_vs_lblcr.c | 755 +++++++ net/netfilter/ipvs/ip_vs_lc.c | 103 + net/netfilter/ipvs/ip_vs_nq.c | 138 ++ net/netfilter/ipvs/ip_vs_proto.c | 288 +++ net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 235 +++ net/netfilter/ipvs/ip_vs_proto_tcp.c | 732 +++++++ net/netfilter/ipvs/ip_vs_proto_udp.c | 533 +++++ net/netfilter/ipvs/ip_vs_rr.c | 112 + net/netfilter/ipvs/ip_vs_sched.c | 251 +++ net/netfilter/ipvs/ip_vs_sed.c | 140 ++ net/netfilter/ipvs/ip_vs_sh.c | 258 +++ net/netfilter/ipvs/ip_vs_sync.c | 942 +++++++++ net/netfilter/ipvs/ip_vs_wlc.c | 128 ++ net/netfilter/ipvs/ip_vs_wrr.c | 237 +++ net/netfilter/ipvs/ip_vs_xmit.c | 1004 +++++++++ 53 files changed, 14242 insertions(+), 13979 deletions(-) delete mode 100644 net/ipv4/ipvs/Kconfig delete mode 100644 net/ipv4/ipvs/Makefile delete mode 100644 net/ipv4/ipvs/ip_vs_app.c delete mode 100644 net/ipv4/ipvs/ip_vs_conn.c delete mode 100644 net/ipv4/ipvs/ip_vs_core.c delete mode 100644 net/ipv4/ipvs/ip_vs_ctl.c delete mode 100644 net/ipv4/ipvs/ip_vs_est.c delete mode 100644 net/ipv4/ipvs/ip_vs_ftp.c delete mode 100644 net/ipv4/ipvs/ip_vs_lblc.c delete mode 100644 net/ipv4/ipvs/ip_vs_lblcr.c delete mode 100644 net/ipv4/ipvs/ip_vs_lc.c delete mode 100644 net/ipv4/ipvs/ip_vs_nq.c delete mode 100644 net/ipv4/ipvs/ip_vs_proto.c delete mode 100644 net/ipv4/ipvs/ip_vs_proto_ah_esp.c delete mode 100644 net/ipv4/ipvs/ip_vs_proto_tcp.c delete mode 100644 net/ipv4/ipvs/ip_vs_proto_udp.c delete mode 100644 net/ipv4/ipvs/ip_vs_rr.c delete mode 100644 net/ipv4/ipvs/ip_vs_sched.c delete mode 100644 net/ipv4/ipvs/ip_vs_sed.c delete mode 100644 net/ipv4/ipvs/ip_vs_sh.c delete mode 100644 net/ipv4/ipvs/ip_vs_sync.c delete mode 100644 net/ipv4/ipvs/ip_vs_wlc.c delete mode 100644 net/ipv4/ipvs/ip_vs_wrr.c delete mode 100644 net/ipv4/ipvs/ip_vs_xmit.c create mode 100644 net/netfilter/ipvs/Kconfig create mode 100644 net/netfilter/ipvs/Makefile create mode 100644 net/netfilter/ipvs/ip_vs_app.c create mode 100644 net/netfilter/ipvs/ip_vs_conn.c create mode 100644 net/netfilter/ipvs/ip_vs_core.c create mode 100644 net/netfilter/ipvs/ip_vs_ctl.c create mode 100644 net/netfilter/ipvs/ip_vs_dh.c create mode 100644 net/netfilter/ipvs/ip_vs_est.c create mode 100644 net/netfilter/ipvs/ip_vs_ftp.c create mode 100644 net/netfilter/ipvs/ip_vs_lblc.c create mode 100644 net/netfilter/ipvs/ip_vs_lblcr.c create mode 100644 net/netfilter/ipvs/ip_vs_lc.c create mode 100644 net/netfilter/ipvs/ip_vs_nq.c create mode 100644 net/netfilter/ipvs/ip_vs_proto.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_ah_esp.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_tcp.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_udp.c create mode 100644 net/netfilter/ipvs/ip_vs_rr.c create mode 100644 net/netfilter/ipvs/ip_vs_sched.c create mode 100644 net/netfilter/ipvs/ip_vs_sed.c create mode 100644 net/netfilter/ipvs/ip_vs_sh.c create mode 100644 net/netfilter/ipvs/ip_vs_sync.c create mode 100644 net/netfilter/ipvs/ip_vs_wlc.c create mode 100644 net/netfilter/ipvs/ip_vs_wrr.c create mode 100644 net/netfilter/ipvs/ip_vs_xmit.c (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 591ea23639c..691268f3a35 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -630,5 +630,3 @@ config TCP_MD5SIG If unsure, say N. -source "net/ipv4/ipvs/Kconfig" - diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ad40ef3f9eb..80ff87ce43a 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ -obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig deleted file mode 100644 index de6004de80b..00000000000 --- a/net/ipv4/ipvs/Kconfig +++ /dev/null @@ -1,239 +0,0 @@ -# -# IP Virtual Server configuration -# -menuconfig IP_VS - tristate "IP virtual server support (EXPERIMENTAL)" - depends on NETFILTER - ---help--- - IP Virtual Server support will let you build a high-performance - virtual server based on cluster of two or more real servers. This - option must be enabled for at least one of the clustered computers - that will take care of intercepting incoming connections to a - single IP address and scheduling them to real servers. - - Three request dispatching techniques are implemented, they are - virtual server via NAT, virtual server via tunneling and virtual - server via direct routing. The several scheduling algorithms can - be used to choose which server the connection is directed to, - thus load balancing can be achieved among the servers. For more - information and its administration program, please visit the - following URL: . - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -if IP_VS - -config IP_VS_IPV6 - bool "IPv6 support for IPVS (DANGEROUS)" - depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) - ---help--- - Add IPv6 support to IPVS. This is incomplete and might be dangerous. - - Say N if unsure. - -config IP_VS_DEBUG - bool "IP virtual server debugging" - ---help--- - Say Y here if you want to get additional messages useful in - debugging the IP virtual server code. You can change the debug - level in /proc/sys/net/ipv4/vs/debug_level - -config IP_VS_TAB_BITS - int "IPVS connection table size (the Nth power of 2)" - range 8 20 - default 12 - ---help--- - The IPVS connection hash table uses the chaining scheme to handle - hash collisions. Using a big IPVS connection hash table will greatly - reduce conflicts when there are hundreds of thousands of connections - in the hash table. - - Note the table size must be power of 2. The table size will be the - value of 2 to the your input number power. The number to choose is - from 8 to 20, the default number is 12, which means the table size - is 4096. Don't input the number too small, otherwise you will lose - performance on it. You can adapt the table size yourself, according - to your virtual server application. It is good to set the table size - not far less than the number of connections per second multiplying - average lasting time of connection in the table. For example, your - virtual server gets 200 connections per second, the connection lasts - for 200 seconds in average in the connection table, the table size - should be not far less than 200x200, it is good to set the table - size 32768 (2**15). - - Another note that each connection occupies 128 bytes effectively and - each hash entry uses 8 bytes, so you can estimate how much memory is - needed for your box. - -comment "IPVS transport protocol load balancing support" - -config IP_VS_PROTO_TCP - bool "TCP load balancing support" - ---help--- - This option enables support for load balancing TCP transport - protocol. Say Y if unsure. - -config IP_VS_PROTO_UDP - bool "UDP load balancing support" - ---help--- - This option enables support for load balancing UDP transport - protocol. Say Y if unsure. - -config IP_VS_PROTO_AH_ESP - bool - depends on UNDEFINED - -config IP_VS_PROTO_ESP - bool "ESP load balancing support" - select IP_VS_PROTO_AH_ESP - ---help--- - This option enables support for load balancing ESP (Encapsulation - Security Payload) transport protocol. Say Y if unsure. - -config IP_VS_PROTO_AH - bool "AH load balancing support" - select IP_VS_PROTO_AH_ESP - ---help--- - This option enables support for load balancing AH (Authentication - Header) transport protocol. Say Y if unsure. - -comment "IPVS scheduler" - -config IP_VS_RR - tristate "round-robin scheduling" - ---help--- - The robin-robin scheduling algorithm simply directs network - connections to different real servers in a round-robin manner. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_WRR - tristate "weighted round-robin scheduling" - ---help--- - The weighted robin-robin scheduling algorithm directs network - connections to different real servers based on server weights - in a round-robin manner. Servers with higher weights receive - new connections first than those with less weights, and servers - with higher weights get more connections than those with less - weights and servers with equal weights get equal connections. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LC - tristate "least-connection scheduling" - ---help--- - The least-connection scheduling algorithm directs network - connections to the server with the least number of active - connections. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_WLC - tristate "weighted least-connection scheduling" - ---help--- - The weighted least-connection scheduling algorithm directs network - connections to the server with the least active connections - normalized by the server weight. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LBLC - tristate "locality-based least-connection scheduling" - ---help--- - The locality-based least-connection scheduling algorithm is for - destination IP load balancing. It is usually used in cache cluster. - This algorithm usually directs packet destined for an IP address to - its server if the server is alive and under load. If the server is - overloaded (its active connection numbers is larger than its weight) - and there is a server in its half load, then allocate the weighted - least-connection server to this IP address. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LBLCR - tristate "locality-based least-connection with replication scheduling" - ---help--- - The locality-based least-connection with replication scheduling - algorithm is also for destination IP load balancing. It is - usually used in cache cluster. It differs from the LBLC scheduling - as follows: the load balancer maintains mappings from a target - to a set of server nodes that can serve the target. Requests for - a target are assigned to the least-connection node in the target's - server set. If all the node in the server set are over loaded, - it picks up a least-connection node in the cluster and adds it - in the sever set for the target. If the server set has not been - modified for the specified time, the most loaded node is removed - from the server set, in order to avoid high degree of replication. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_DH - tristate "destination hashing scheduling" - ---help--- - The destination hashing scheduling algorithm assigns network - connections to the servers through looking up a statically assigned - hash table by their destination IP addresses. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_SH - tristate "source hashing scheduling" - ---help--- - The source hashing scheduling algorithm assigns network - connections to the servers through looking up a statically assigned - hash table by their source IP addresses. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_SED - tristate "shortest expected delay scheduling" - ---help--- - The shortest expected delay scheduling algorithm assigns network - connections to the server with the shortest expected delay. The - expected delay that the job will experience is (Ci + 1) / Ui if - sent to the ith server, in which Ci is the number of connections - on the ith server and Ui is the fixed service rate (weight) - of the ith server. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_NQ - tristate "never queue scheduling" - ---help--- - The never queue scheduling algorithm adopts a two-speed model. - When there is an idle server available, the job will be sent to - the idle server, instead of waiting for a fast one. When there - is no idle server available, the job will be sent to the server - that minimize its expected delay (The Shortest Expected Delay - scheduling algorithm). - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -comment 'IPVS application helper' - -config IP_VS_FTP - tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP - ---help--- - FTP is a protocol that transfers IP address and/or port number in - the payload. In the virtual server via Network Address Translation, - the IP address and port number of real servers cannot be sent to - clients in ftp connections directly, so FTP protocol helper is - required for tracking the connection and mangling it back to that of - virtual service. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -endif # IP_VS diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile deleted file mode 100644 index 73a46fe1fe4..00000000000 --- a/net/ipv4/ipvs/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -# -# Makefile for the IPVS modules on top of IPv4. -# - -# IPVS transport protocol load balancing support -ip_vs_proto-objs-y := -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o - -ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ - ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ - ip_vs_est.o ip_vs_proto.o \ - $(ip_vs_proto-objs-y) - - -# IPVS core -obj-$(CONFIG_IP_VS) += ip_vs.o - -# IPVS schedulers -obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o -obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o -obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o -obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o -obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o -obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o -obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o -obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o -obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o -obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o - -# IPVS application helpers -obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c deleted file mode 100644 index 201b8ea3020..00000000000 --- a/net/ipv4/ipvs/ip_vs_app.c +++ /dev/null @@ -1,622 +0,0 @@ -/* - * ip_vs_app.c: Application module support for IPVS - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference - * is that ip_vs_app module handles the reverse direction (incoming requests - * and outgoing responses). - * - * IP_MASQ_APP application masquerading module - * - * Author: Juan Jose Ciarlante, - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -EXPORT_SYMBOL(register_ip_vs_app); -EXPORT_SYMBOL(unregister_ip_vs_app); -EXPORT_SYMBOL(register_ip_vs_app_inc); - -/* ipvs application list head */ -static LIST_HEAD(ip_vs_app_list); -static DEFINE_MUTEX(__ip_vs_app_mutex); - - -/* - * Get an ip_vs_app object - */ -static inline int ip_vs_app_get(struct ip_vs_app *app) -{ - return try_module_get(app->module); -} - - -static inline void ip_vs_app_put(struct ip_vs_app *app) -{ - module_put(app->module); -} - - -/* - * Allocate/initialize app incarnation and register it in proto apps. - */ -static int -ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) -{ - struct ip_vs_protocol *pp; - struct ip_vs_app *inc; - int ret; - - if (!(pp = ip_vs_proto_get(proto))) - return -EPROTONOSUPPORT; - - if (!pp->unregister_app) - return -EOPNOTSUPP; - - inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); - if (!inc) - return -ENOMEM; - INIT_LIST_HEAD(&inc->p_list); - INIT_LIST_HEAD(&inc->incs_list); - inc->app = app; - inc->port = htons(port); - atomic_set(&inc->usecnt, 0); - - if (app->timeouts) { - inc->timeout_table = - ip_vs_create_timeout_table(app->timeouts, - app->timeouts_size); - if (!inc->timeout_table) { - ret = -ENOMEM; - goto out; - } - } - - ret = pp->register_app(inc); - if (ret) - goto out; - - list_add(&inc->a_list, &app->incs_list); - IP_VS_DBG(9, "%s application %s:%u registered\n", - pp->name, inc->name, inc->port); - - return 0; - - out: - kfree(inc->timeout_table); - kfree(inc); - return ret; -} - - -/* - * Release app incarnation - */ -static void -ip_vs_app_inc_release(struct ip_vs_app *inc) -{ - struct ip_vs_protocol *pp; - - if (!(pp = ip_vs_proto_get(inc->protocol))) - return; - - if (pp->unregister_app) - pp->unregister_app(inc); - - IP_VS_DBG(9, "%s App %s:%u unregistered\n", - pp->name, inc->name, inc->port); - - list_del(&inc->a_list); - - kfree(inc->timeout_table); - kfree(inc); -} - - -/* - * Get reference to app inc (only called from softirq) - * - */ -int ip_vs_app_inc_get(struct ip_vs_app *inc) -{ - int result; - - atomic_inc(&inc->usecnt); - if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) - atomic_dec(&inc->usecnt); - return result; -} - - -/* - * Put the app inc (only called from timer or net softirq) - */ -void ip_vs_app_inc_put(struct ip_vs_app *inc) -{ - ip_vs_app_put(inc->app); - atomic_dec(&inc->usecnt); -} - - -/* - * Register an application incarnation in protocol applications - */ -int -register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) -{ - int result; - - mutex_lock(&__ip_vs_app_mutex); - - result = ip_vs_app_inc_new(app, proto, port); - - mutex_unlock(&__ip_vs_app_mutex); - - return result; -} - - -/* - * ip_vs_app registration routine - */ -int register_ip_vs_app(struct ip_vs_app *app) -{ - /* increase the module use count */ - ip_vs_use_count_inc(); - - mutex_lock(&__ip_vs_app_mutex); - - list_add(&app->a_list, &ip_vs_app_list); - - mutex_unlock(&__ip_vs_app_mutex); - - return 0; -} - - -/* - * ip_vs_app unregistration routine - * We are sure there are no app incarnations attached to services - */ -void unregister_ip_vs_app(struct ip_vs_app *app) -{ - struct ip_vs_app *inc, *nxt; - - mutex_lock(&__ip_vs_app_mutex); - - list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { - ip_vs_app_inc_release(inc); - } - - list_del(&app->a_list); - - mutex_unlock(&__ip_vs_app_mutex); - - /* decrease the module use count */ - ip_vs_use_count_dec(); -} - - -/* - * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) - */ -int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) -{ - return pp->app_conn_bind(cp); -} - - -/* - * Unbind cp from application incarnation (called by cp destructor) - */ -void ip_vs_unbind_app(struct ip_vs_conn *cp) -{ - struct ip_vs_app *inc = cp->app; - - if (!inc) - return; - - if (inc->unbind_conn) - inc->unbind_conn(inc, cp); - if (inc->done_conn) - inc->done_conn(inc, cp); - ip_vs_app_inc_put(inc); - cp->app = NULL; -} - - -/* - * Fixes th->seq based on ip_vs_seq info. - */ -static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) -{ - __u32 seq = ntohl(th->seq); - - /* - * Adjust seq with delta-offset for all packets after - * the most recent resized pkt seq and with previous_delta offset - * for all packets before most recent resized pkt seq. - */ - if (vseq->delta || vseq->previous_delta) { - if(after(seq, vseq->init_seq)) { - th->seq = htonl(seq + vseq->delta); - IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", - vseq->delta); - } else { - th->seq = htonl(seq + vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " - "(%d) to seq\n", vseq->previous_delta); - } - } -} - - -/* - * Fixes th->ack_seq based on ip_vs_seq info. - */ -static inline void -vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) -{ - __u32 ack_seq = ntohl(th->ack_seq); - - /* - * Adjust ack_seq with delta-offset for - * the packets AFTER most recent resized pkt has caused a shift - * for packets before most recent resized pkt, use previous_delta - */ - if (vseq->delta || vseq->previous_delta) { - /* since ack_seq is the number of octet that is expected - to receive next, so compare it with init_seq+delta */ - if(after(ack_seq, vseq->init_seq+vseq->delta)) { - th->ack_seq = htonl(ack_seq - vseq->delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " - "(%d) from ack_seq\n", vseq->delta); - - } else { - th->ack_seq = htonl(ack_seq - vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " - "previous_delta (%d) from ack_seq\n", - vseq->previous_delta); - } - } -} - - -/* - * Updates ip_vs_seq if pkt has been resized - * Assumes already checked proto==IPPROTO_TCP and diff!=0. - */ -static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, - unsigned flag, __u32 seq, int diff) -{ - /* spinlock is to keep updating cp->flags atomic */ - spin_lock(&cp->lock); - if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { - vseq->previous_delta = vseq->delta; - vseq->delta += diff; - vseq->init_seq = seq; - cp->flags |= flag; - } - spin_unlock(&cp->lock); -} - -static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) -{ - int diff; - const unsigned int tcp_offset = ip_hdrlen(skb); - struct tcphdr *th; - __u32 seq; - - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) - return 0; - - th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); - - /* - * Remember seq number in case this pkt gets resized - */ - seq = ntohl(th->seq); - - /* - * Fix seq stuff if flagged as so. - */ - if (cp->flags & IP_VS_CONN_F_OUT_SEQ) - vs_fix_seq(&cp->out_seq, th); - if (cp->flags & IP_VS_CONN_F_IN_SEQ) - vs_fix_ack_seq(&cp->in_seq, th); - - /* - * Call private output hook function - */ - if (app->pkt_out == NULL) - return 1; - - if (!app->pkt_out(app, cp, skb, &diff)) - return 0; - - /* - * Update ip_vs seq stuff if len has changed. - */ - if (diff != 0) - vs_seq_update(cp, &cp->out_seq, - IP_VS_CONN_F_OUT_SEQ, seq, diff); - - return 1; -} - -/* - * Output pkt hook. Will call bound ip_vs_app specific function - * called by ipvs packet handler, assumes previously checked cp!=NULL - * returns false if it can't handle packet (oom) - */ -int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_app *app; - - /* - * check if application module is bound to - * this ip_vs_conn. - */ - if ((app = cp->app) == NULL) - return 1; - - /* TCP is complicated */ - if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_out(cp, skb, app); - - /* - * Call private output hook function - */ - if (app->pkt_out == NULL) - return 1; - - return app->pkt_out(app, cp, skb, NULL); -} - - -static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) -{ - int diff; - const unsigned int tcp_offset = ip_hdrlen(skb); - struct tcphdr *th; - __u32 seq; - - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) - return 0; - - th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); - - /* - * Remember seq number in case this pkt gets resized - */ - seq = ntohl(th->seq); - - /* - * Fix seq stuff if flagged as so. - */ - if (cp->flags & IP_VS_CONN_F_IN_SEQ) - vs_fix_seq(&cp->in_seq, th); - if (cp->flags & IP_VS_CONN_F_OUT_SEQ) - vs_fix_ack_seq(&cp->out_seq, th); - - /* - * Call private input hook function - */ - if (app->pkt_in == NULL) - return 1; - - if (!app->pkt_in(app, cp, skb, &diff)) - return 0; - - /* - * Update ip_vs seq stuff if len has changed. - */ - if (diff != 0) - vs_seq_update(cp, &cp->in_seq, - IP_VS_CONN_F_IN_SEQ, seq, diff); - - return 1; -} - -/* - * Input pkt hook. Will call bound ip_vs_app specific function - * called by ipvs packet handler, assumes previously checked cp!=NULL. - * returns false if can't handle packet (oom). - */ -int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_app *app; - - /* - * check if application module is bound to - * this ip_vs_conn. - */ - if ((app = cp->app) == NULL) - return 1; - - /* TCP is complicated */ - if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_in(cp, skb, app); - - /* - * Call private input hook function - */ - if (app->pkt_in == NULL) - return 1; - - return app->pkt_in(app, cp, skb, NULL); -} - - -#ifdef CONFIG_PROC_FS -/* - * /proc/net/ip_vs_app entry function - */ - -static struct ip_vs_app *ip_vs_app_idx(loff_t pos) -{ - struct ip_vs_app *app, *inc; - - list_for_each_entry(app, &ip_vs_app_list, a_list) { - list_for_each_entry(inc, &app->incs_list, a_list) { - if (pos-- == 0) - return inc; - } - } - return NULL; - -} - -static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) -{ - mutex_lock(&__ip_vs_app_mutex); - - return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; -} - -static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ip_vs_app *inc, *app; - struct list_head *e; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_app_idx(0); - - inc = v; - app = inc->app; - - if ((e = inc->a_list.next) != &app->incs_list) - return list_entry(e, struct ip_vs_app, a_list); - - /* go on to next application */ - for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { - app = list_entry(e, struct ip_vs_app, a_list); - list_for_each_entry(inc, &app->incs_list, a_list) { - return inc; - } - } - return NULL; -} - -static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) -{ - mutex_unlock(&__ip_vs_app_mutex); -} - -static int ip_vs_app_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "prot port usecnt name\n"); - else { - const struct ip_vs_app *inc = v; - - seq_printf(seq, "%-3s %-7u %-6d %-17s\n", - ip_vs_proto_name(inc->protocol), - ntohs(inc->port), - atomic_read(&inc->usecnt), - inc->name); - } - return 0; -} - -static const struct seq_operations ip_vs_app_seq_ops = { - .start = ip_vs_app_seq_start, - .next = ip_vs_app_seq_next, - .stop = ip_vs_app_seq_stop, - .show = ip_vs_app_seq_show, -}; - -static int ip_vs_app_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_app_seq_ops); -} - -static const struct file_operations ip_vs_app_fops = { - .owner = THIS_MODULE, - .open = ip_vs_app_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - - -/* - * Replace a segment of data with a new segment - */ -int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, - char *o_buf, int o_len, char *n_buf, int n_len) -{ - int diff; - int o_offset; - int o_left; - - EnterFunction(9); - - diff = n_len - o_len; - o_offset = o_buf - (char *)skb->data; - /* The length of left data after o_buf+o_len in the skb data */ - o_left = skb->len - (o_offset + o_len); - - if (diff <= 0) { - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - skb_trim(skb, skb->len + diff); - } else if (diff <= skb_tailroom(skb)) { - skb_put(skb, diff); - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - } else { - if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) - return -ENOMEM; - skb_put(skb, diff); - memmove(skb->data + o_offset + n_len, - skb->data + o_offset + o_len, o_left); - skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); - } - - /* must update the iph total length here */ - ip_hdr(skb)->tot_len = htons(skb->len); - - LeaveFunction(9); - return 0; -} - - -int __init ip_vs_app_init(void) -{ - /* we will replace it with proc_net_ipvs_create() soon */ - proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); - return 0; -} - - -void ip_vs_app_cleanup(void) -{ - proc_net_remove(&init_net, "ip_vs_app"); -} diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c deleted file mode 100644 index 9a24332fbed..00000000000 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ /dev/null @@ -1,1110 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, - * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms - * and others. Many code here is taken from IP MASQ code of kernel 2.2. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include -#include /* for proc_net_* */ -#include -#include -#include - -#include -#include - - -/* - * Connection hash table: for input and output packets lookups of IPVS - */ -static struct list_head *ip_vs_conn_tab; - -/* SLAB cache for IPVS connections */ -static struct kmem_cache *ip_vs_conn_cachep __read_mostly; - -/* counter for current IPVS connections */ -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); - -/* counter for no client port connections */ -static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); - -/* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd; - -/* - * Fine locking granularity for big connection hash table - */ -#define CT_LOCKARRAY_BITS 4 -#define CT_LOCKARRAY_SIZE (1<ip, (__force u32)port, proto, - ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; -} - - -/* - * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. - * returns bool success. - */ -static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) -{ - unsigned hash; - int ret; - - /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); - - ct_write_lock(hash); - - if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - list_add(&cp->c_list, &ip_vs_conn_tab[hash]); - cp->flags |= IP_VS_CONN_F_HASHED; - atomic_inc(&cp->refcnt); - ret = 1; - } else { - IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - ret = 0; - } - - ct_write_unlock(hash); - - return ret; -} - - -/* - * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. - */ -static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) -{ - unsigned hash; - int ret; - - /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); - - ct_write_lock(hash); - - if (cp->flags & IP_VS_CONN_F_HASHED) { - list_del(&cp->c_list); - cp->flags &= ~IP_VS_CONN_F_HASHED; - atomic_dec(&cp->refcnt); - ret = 1; - } else - ret = 0; - - ct_write_unlock(hash); - - return ret; -} - - -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from OUTside-to-INside. - * s_addr, s_port: pkt source address (foreign host) - * d_addr, d_port: pkt dest address (load balancer) - */ -static inline struct ip_vs_conn *__ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && - ip_vs_addr_equal(af, d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && - ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); - return cp; - } - } - - ct_read_unlock(hash); - - return NULL; -} - -struct ip_vs_conn *ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - struct ip_vs_conn *cp; - - cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); - if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) - cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, - d_port); - - IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), - cp ? "hit" : "not hit"); - - return cp; -} - -/* Get reference to connection template */ -struct ip_vs_conn *ip_vs_ct_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && - ip_vs_addr_equal(af, d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && - cp->flags & IP_VS_CONN_F_TEMPLATE && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - goto out; - } - } - cp = NULL; - - out: - ct_read_unlock(hash); - - IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), - cp ? "hit" : "not hit"); - - return cp; -} - -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from inside-to-OUTside. - * s_addr, s_port: pkt source address (inside host) - * d_addr, d_port: pkt dest address (foreign host) - */ -struct ip_vs_conn *ip_vs_conn_out_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp, *ret=NULL; - - /* - * Check for "full" addressed entries - */ - hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, d_addr, &cp->caddr) && - ip_vs_addr_equal(af, s_addr, &cp->daddr) && - d_port == cp->cport && s_port == cp->dport && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - ret = cp; - break; - } - } - - ct_read_unlock(hash); - - IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), - ret ? "hit" : "not hit"); - - return ret; -} - - -/* - * Put back the conn and restart its timer with its timeout - */ -void ip_vs_conn_put(struct ip_vs_conn *cp) -{ - /* reset it expire in its timeout */ - mod_timer(&cp->timer, jiffies+cp->timeout); - - __ip_vs_conn_put(cp); -} - - -/* - * Fill a no_client_port connection with a client port number - */ -void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) -{ - if (ip_vs_conn_unhash(cp)) { - spin_lock(&cp->lock); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) { - atomic_dec(&ip_vs_conn_no_cport_cnt); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; - } - spin_unlock(&cp->lock); - - /* hash on new dport */ - ip_vs_conn_hash(cp); - } -} - - -/* - * Bind a connection entry with the corresponding packet_xmit. - * Called by ip_vs_conn_new. - */ -static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) -{ - switch (IP_VS_FWD_METHOD(cp)) { - case IP_VS_CONN_F_MASQ: - cp->packet_xmit = ip_vs_nat_xmit; - break; - - case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit; - break; - - case IP_VS_CONN_F_DROUTE: - cp->packet_xmit = ip_vs_dr_xmit; - break; - - case IP_VS_CONN_F_LOCALNODE: - cp->packet_xmit = ip_vs_null_xmit; - break; - - case IP_VS_CONN_F_BYPASS: - cp->packet_xmit = ip_vs_bypass_xmit; - break; - } -} - -#ifdef CONFIG_IP_VS_IPV6 -static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) -{ - switch (IP_VS_FWD_METHOD(cp)) { - case IP_VS_CONN_F_MASQ: - cp->packet_xmit = ip_vs_nat_xmit_v6; - break; - - case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit_v6; - break; - - case IP_VS_CONN_F_DROUTE: - cp->packet_xmit = ip_vs_dr_xmit_v6; - break; - - case IP_VS_CONN_F_LOCALNODE: - cp->packet_xmit = ip_vs_null_xmit; - break; - - case IP_VS_CONN_F_BYPASS: - cp->packet_xmit = ip_vs_bypass_xmit_v6; - break; - } -} -#endif - - -static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) -{ - return atomic_read(&dest->activeconns) - + atomic_read(&dest->inactconns); -} - -/* - * Bind a connection entry with a virtual service destination - * Called just after a new connection entry is created. - */ -static inline void -ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) -{ - /* if dest is NULL, then return directly */ - if (!dest) - return; - - /* Increase the refcnt counter of the dest */ - atomic_inc(&dest->refcnt); - - /* Bind with the destination and its corresponding transmitter */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) - /* if the connection is not template and is created - * by sync, preserve the activity flag. - */ - cp->flags |= atomic_read(&dest->conn_flags) & - (~IP_VS_CONN_F_INACTIVE); - else - cp->flags |= atomic_read(&dest->conn_flags); - cp->dest = dest; - - IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " - "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); - - /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so increase the inactive - connection counter because it is in TCP SYNRECV - state (inactive) or other protocol inacive state */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_INACTIVE))) - atomic_inc(&dest->activeconns); - else - atomic_inc(&dest->inactconns); - } else { - /* It is a persistent connection/template, so increase - the peristent connection counter */ - atomic_inc(&dest->persistconns); - } - - if (dest->u_threshold != 0 && - ip_vs_dest_totalconns(dest) >= dest->u_threshold) - dest->flags |= IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Check if there is a destination for the connection, if so - * bind the connection to the destination. - */ -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) -{ - struct ip_vs_dest *dest; - - if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, - &cp->vaddr, cp->vport, - cp->protocol); - ip_vs_bind_dest(cp, dest); - return dest; - } else - return NULL; -} - - -/* - * Unbind a connection entry with its VS destination - * Called by the ip_vs_conn_expire function. - */ -static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) -{ - struct ip_vs_dest *dest = cp->dest; - - if (!dest) - return; - - IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " - "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); - - /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so decrease the inactconns - or activeconns counter */ - if (cp->flags & IP_VS_CONN_F_INACTIVE) { - atomic_dec(&dest->inactconns); - } else { - atomic_dec(&dest->activeconns); - } - } else { - /* It is a persistent connection/template, so decrease - the peristent connection counter */ - atomic_dec(&dest->persistconns); - } - - if (dest->l_threshold != 0) { - if (ip_vs_dest_totalconns(dest) < dest->l_threshold) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } else if (dest->u_threshold != 0) { - if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } else { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } - - /* - * Simply decrease the refcnt of the dest, because the - * dest will be either in service's destination list - * or in the trash. - */ - atomic_dec(&dest->refcnt); -} - - -/* - * Checking if the destination of a connection template is available. - * If available, return 1, otherwise invalidate this connection - * template and return 0. - */ -int ip_vs_check_template(struct ip_vs_conn *ct) -{ - struct ip_vs_dest *dest = ct->dest; - - /* - * Checking the dest server status. - */ - if ((dest == NULL) || - !(dest->flags & IP_VS_DEST_F_AVAILABLE) || - (sysctl_ip_vs_expire_quiescent_template && - (atomic_read(&dest->weight) == 0))) { - IP_VS_DBG_BUF(9, "check_template: dest not available for " - "protocol %s s:%s:%d v:%s:%d " - "-> d:%s:%d\n", - ip_vs_proto_name(ct->protocol), - IP_VS_DBG_ADDR(ct->af, &ct->caddr), - ntohs(ct->cport), - IP_VS_DBG_ADDR(ct->af, &ct->vaddr), - ntohs(ct->vport), - IP_VS_DBG_ADDR(ct->af, &ct->daddr), - ntohs(ct->dport)); - - /* - * Invalidate the connection template - */ - if (ct->vport != htons(0xffff)) { - if (ip_vs_conn_unhash(ct)) { - ct->dport = htons(0xffff); - ct->vport = htons(0xffff); - ct->cport = 0; - ip_vs_conn_hash(ct); - } - } - - /* - * Simply decrease the refcnt of the template, - * don't restart its timer. - */ - atomic_dec(&ct->refcnt); - return 0; - } - return 1; -} - -static void ip_vs_conn_expire(unsigned long data) -{ - struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - - cp->timeout = 60*HZ; - - /* - * hey, I'm using it - */ - atomic_inc(&cp->refcnt); - - /* - * do I control anybody? - */ - if (atomic_read(&cp->n_control)) - goto expire_later; - - /* - * unhash it if it is hashed in the conn table - */ - if (!ip_vs_conn_unhash(cp)) - goto expire_later; - - /* - * refcnt==1 implies I'm the only one referrer - */ - if (likely(atomic_read(&cp->refcnt) == 1)) { - /* delete the timer if it is activated by other users */ - if (timer_pending(&cp->timer)) - del_timer(&cp->timer); - - /* does anybody control me? */ - if (cp->control) - ip_vs_control_del(cp); - - if (unlikely(cp->app != NULL)) - ip_vs_unbind_app(cp); - ip_vs_unbind_dest(cp); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) - atomic_dec(&ip_vs_conn_no_cport_cnt); - atomic_dec(&ip_vs_conn_count); - - kmem_cache_free(ip_vs_conn_cachep, cp); - return; - } - - /* hash it back to the table */ - ip_vs_conn_hash(cp); - - expire_later: - IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", - atomic_read(&cp->refcnt)-1, - atomic_read(&cp->n_control)); - - ip_vs_conn_put(cp); -} - - -void ip_vs_conn_expire_now(struct ip_vs_conn *cp) -{ - if (del_timer(&cp->timer)) - mod_timer(&cp->timer, jiffies); -} - - -/* - * Create a new connection entry and hash it into the ip_vs_conn_tab - */ -struct ip_vs_conn * -ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, - const union nf_inet_addr *vaddr, __be16 vport, - const union nf_inet_addr *daddr, __be16 dport, unsigned flags, - struct ip_vs_dest *dest) -{ - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); - - cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); - if (cp == NULL) { - IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); - return NULL; - } - - INIT_LIST_HEAD(&cp->c_list); - setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); - cp->af = af; - cp->protocol = proto; - ip_vs_addr_copy(af, &cp->caddr, caddr); - cp->cport = cport; - ip_vs_addr_copy(af, &cp->vaddr, vaddr); - cp->vport = vport; - ip_vs_addr_copy(af, &cp->daddr, daddr); - cp->dport = dport; - cp->flags = flags; - spin_lock_init(&cp->lock); - - /* - * Set the entry is referenced by the current thread before hashing - * it in the table, so that other thread run ip_vs_random_dropentry - * but cannot drop this entry. - */ - atomic_set(&cp->refcnt, 1); - - atomic_set(&cp->n_control, 0); - atomic_set(&cp->in_pkts, 0); - - atomic_inc(&ip_vs_conn_count); - if (flags & IP_VS_CONN_F_NO_CPORT) - atomic_inc(&ip_vs_conn_no_cport_cnt); - - /* Bind the connection with a destination server */ - ip_vs_bind_dest(cp, dest); - - /* Set its state and timeout */ - cp->state = 0; - cp->timeout = 3*HZ; - - /* Bind its packet transmitter */ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - ip_vs_bind_xmit_v6(cp); - else -#endif - ip_vs_bind_xmit(cp); - - if (unlikely(pp && atomic_read(&pp->appcnt))) - ip_vs_bind_app(cp, pp); - - /* Hash it in the ip_vs_conn_tab finally */ - ip_vs_conn_hash(cp); - - return cp; -} - - -/* - * /proc/net/ip_vs_conn entries - */ -#ifdef CONFIG_PROC_FS - -static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) -{ - int idx; - struct ip_vs_conn *cp; - - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - if (pos-- == 0) { - seq->private = &ip_vs_conn_tab[idx]; - return cp; - } - } - ct_read_unlock_bh(idx); - } - - return NULL; -} - -static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) -{ - seq->private = NULL; - return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; -} - -static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ip_vs_conn *cp = v; - struct list_head *e, *l = seq->private; - int idx; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(seq, 0); - - /* more on same hash chain? */ - if ((e = cp->c_list.next) != l) - return list_entry(e, struct ip_vs_conn, c_list); - - idx = l - ip_vs_conn_tab; - ct_read_unlock_bh(idx); - - while (++idx < IP_VS_CONN_TAB_SIZE) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - seq->private = &ip_vs_conn_tab[idx]; - return cp; - } - ct_read_unlock_bh(idx); - } - seq->private = NULL; - return NULL; -} - -static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) -{ - struct list_head *l = seq->private; - - if (l) - ct_read_unlock_bh(l - ip_vs_conn_tab); -} - -static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) -{ - - if (v == SEQ_START_TOKEN) - seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); - else { - const struct ip_vs_conn *cp = v; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - seq_printf(seq, - "%-3s " NIP6_FMT " %04X " NIP6_FMT - " %04X " NIP6_FMT " %04X %-11s %7lu\n", - ip_vs_proto_name(cp->protocol), - NIP6(cp->caddr.in6), ntohs(cp->cport), - NIP6(cp->vaddr.in6), ntohs(cp->vport), - NIP6(cp->daddr.in6), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); - else -#endif - seq_printf(seq, - "%-3s %08X %04X %08X %04X" - " %08X %04X %-11s %7lu\n", - ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr.ip), ntohs(cp->cport), - ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); - } - return 0; -} - -static const struct seq_operations ip_vs_conn_seq_ops = { - .start = ip_vs_conn_seq_start, - .next = ip_vs_conn_seq_next, - .stop = ip_vs_conn_seq_stop, - .show = ip_vs_conn_seq_show, -}; - -static int ip_vs_conn_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_conn_seq_ops); -} - -static const struct file_operations ip_vs_conn_fops = { - .owner = THIS_MODULE, - .open = ip_vs_conn_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const char *ip_vs_origin_name(unsigned flags) -{ - if (flags & IP_VS_CONN_F_SYNC) - return "SYNC"; - else - return "LOCAL"; -} - -static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) -{ - - if (v == SEQ_START_TOKEN) - seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); - else { - const struct ip_vs_conn *cp = v; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - seq_printf(seq, - "%-3s " NIP6_FMT " %04X " NIP6_FMT - " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n", - ip_vs_proto_name(cp->protocol), - NIP6(cp->caddr.in6), ntohs(cp->cport), - NIP6(cp->vaddr.in6), ntohs(cp->vport), - NIP6(cp->daddr.in6), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - ip_vs_origin_name(cp->flags), - (cp->timer.expires-jiffies)/HZ); - else -#endif - seq_printf(seq, - "%-3s %08X %04X %08X %04X " - "%08X %04X %-11s %-6s %7lu\n", - ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr.ip), ntohs(cp->cport), - ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - ip_vs_origin_name(cp->flags), - (cp->timer.expires-jiffies)/HZ); - } - return 0; -} - -static const struct seq_operations ip_vs_conn_sync_seq_ops = { - .start = ip_vs_conn_seq_start, - .next = ip_vs_conn_seq_next, - .stop = ip_vs_conn_seq_stop, - .show = ip_vs_conn_sync_seq_show, -}; - -static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_conn_sync_seq_ops); -} - -static const struct file_operations ip_vs_conn_sync_fops = { - .owner = THIS_MODULE, - .open = ip_vs_conn_sync_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#endif - - -/* - * Randomly drop connection entries before running out of memory - */ -static inline int todrop_entry(struct ip_vs_conn *cp) -{ - /* - * The drop rate array needs tuning for real environments. - * Called from timer bh only => no locking - */ - static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - static char todrop_counter[9] = {0}; - int i; - - /* if the conn entry hasn't lasted for 60 seconds, don't drop it. - This will leave enough time for normal connection to get - through. */ - if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) - return 0; - - /* Don't drop the entry if its number of incoming packets is not - located in [0, 8] */ - i = atomic_read(&cp->in_pkts); - if (i > 8 || i < 0) return 0; - - if (!todrop_rate[i]) return 0; - if (--todrop_counter[i] > 0) return 0; - - todrop_counter[i] = todrop_rate[i]; - return 1; -} - -/* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(void) -{ - int idx; - struct ip_vs_conn *cp; - - /* - * Randomly scan 1/32 of the whole table every second - */ - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; - - /* - * Lock is actually needed in this loop. - */ - ct_write_lock_bh(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; - - if (cp->protocol == IPPROTO_TCP) { - switch(cp->state) { - case IP_VS_TCP_S_SYN_RECV: - case IP_VS_TCP_S_SYNACK: - break; - - case IP_VS_TCP_S_ESTABLISHED: - if (todrop_entry(cp)) - break; - continue; - - default: - continue; - } - } else { - if (!todrop_entry(cp)) - continue; - } - - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); - if (cp->control) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); - } - } - ct_write_unlock_bh(hash); - } -} - - -/* - * Flush all the connection entries in the ip_vs_conn_tab - */ -static void ip_vs_conn_flush(void) -{ - int idx; - struct ip_vs_conn *cp; - - flush_again: - for (idx=0; idxcontrol) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); - } - } - ct_write_unlock_bh(idx); - } - - /* the counter may be not NULL, because maybe some conn entries - are run by slow timer handler or unhashed but still referred */ - if (atomic_read(&ip_vs_conn_count) != 0) { - schedule(); - goto flush_again; - } -} - - -int __init ip_vs_conn_init(void) -{ - int idx; - - /* - * Allocate the connection hash table and initialize its list heads - */ - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); - if (!ip_vs_conn_tab) - return -ENOMEM; - - /* Allocate ip_vs_conn slab cache */ - ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", - sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!ip_vs_conn_cachep) { - vfree(ip_vs_conn_tab); - return -ENOMEM; - } - - IP_VS_INFO("Connection hash table configured " - "(size=%d, memory=%ldKbytes)\n", - IP_VS_CONN_TAB_SIZE, - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); - IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", - sizeof(struct ip_vs_conn)); - - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); - } - - for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); - } - - proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); - proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); - - /* calculate the random value for connection hash */ - get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - - return 0; -} - - -void ip_vs_conn_cleanup(void) -{ - /* flush all the connection entries first */ - ip_vs_conn_flush(); - - /* Release the empty cache */ - kmem_cache_destroy(ip_vs_conn_cachep); - proc_net_remove(&init_net, "ip_vs_conn"); - proc_net_remove(&init_net, "ip_vs_conn_sync"); - vfree(ip_vs_conn_tab); -} diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c deleted file mode 100644 index 958abf3e5f8..00000000000 --- a/net/ipv4/ipvs/ip_vs_core.c +++ /dev/null @@ -1,1542 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, - * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms - * and others. - * - * Changes: - * Paul `Rusty' Russell properly handle non-linear skbs - * Harald Welte don't use nfcache - * - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include /* for icmp_send */ -#include - -#include -#include - -#ifdef CONFIG_IP_VS_IPV6 -#include -#include -#endif - -#include - - -EXPORT_SYMBOL(register_ip_vs_scheduler); -EXPORT_SYMBOL(unregister_ip_vs_scheduler); -EXPORT_SYMBOL(ip_vs_skb_replace); -EXPORT_SYMBOL(ip_vs_proto_name); -EXPORT_SYMBOL(ip_vs_conn_new); -EXPORT_SYMBOL(ip_vs_conn_in_get); -EXPORT_SYMBOL(ip_vs_conn_out_get); -#ifdef CONFIG_IP_VS_PROTO_TCP -EXPORT_SYMBOL(ip_vs_tcp_conn_listen); -#endif -EXPORT_SYMBOL(ip_vs_conn_put); -#ifdef CONFIG_IP_VS_DEBUG -EXPORT_SYMBOL(ip_vs_get_debug_level); -#endif - - -/* ID used in ICMP lookups */ -#define icmp_id(icmph) (((icmph)->un).echo.id) -#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) - -const char *ip_vs_proto_name(unsigned proto) -{ - static char buf[20]; - - switch (proto) { - case IPPROTO_IP: - return "IP"; - case IPPROTO_UDP: - return "UDP"; - case IPPROTO_TCP: - return "TCP"; - case IPPROTO_ICMP: - return "ICMP"; -#ifdef CONFIG_IP_VS_IPV6 - case IPPROTO_ICMPV6: - return "ICMPv6"; -#endif - default: - sprintf(buf, "IP_%d", proto); - return buf; - } -} - -void ip_vs_init_hash_table(struct list_head *table, int rows) -{ - while (--rows >= 0) - INIT_LIST_HEAD(&table[rows]); -} - -static inline void -ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.ustats.inpkts++; - dest->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.inpkts++; - dest->svc->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.inpkts++; - ip_vs_stats.ustats.inbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); - } -} - - -static inline void -ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.ustats.outpkts++; - dest->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.outpkts++; - dest->svc->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.outpkts++; - ip_vs_stats.ustats.outbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); - } -} - - -static inline void -ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) -{ - spin_lock(&cp->dest->stats.lock); - cp->dest->stats.ustats.conns++; - spin_unlock(&cp->dest->stats.lock); - - spin_lock(&svc->stats.lock); - svc->stats.ustats.conns++; - spin_unlock(&svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.conns++; - spin_unlock(&ip_vs_stats.lock); -} - - -static inline int -ip_vs_set_state(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - if (unlikely(!pp->state_transition)) - return 0; - return pp->state_transition(cp, direction, skb, pp); -} - - -/* - * IPVS persistent scheduling function - * It creates a connection entry according to its template if exists, - * or selects a server and creates a connection entry plus a template. - * Locking: we are svc user (svc->refcnt), so we hold all dests too - * Protocols supported: TCP, UDP - */ -static struct ip_vs_conn * -ip_vs_sched_persist(struct ip_vs_service *svc, - const struct sk_buff *skb, - __be16 ports[2]) -{ - struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; - struct ip_vs_dest *dest; - struct ip_vs_conn *ct; - __be16 dport; /* destination port to forward */ - union nf_inet_addr snet; /* source network of the client, - after masking */ - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - - /* Mask saddr with the netmask to adjust template granularity */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); - else -#endif - snet.ip = iph.saddr.ip & svc->netmask; - - IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " - "mnet %s\n", - IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), - IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), - IP_VS_DBG_ADDR(svc->af, &snet)); - - /* - * As far as we know, FTP is a very complicated network protocol, and - * it uses control connection and data connections. For active FTP, - * FTP server initialize data connection to the client, its source port - * is often 20. For passive FTP, FTP server tells the clients the port - * that it passively listens to, and the client issues the data - * connection. In the tunneling or direct routing mode, the load - * balancer is on the client-to-server half of connection, the port - * number is unknown to the load balancer. So, a conn template like - * is created for persistent FTP - * service, and a template like - * is created for other persistent services. - */ - if (ports[1] == svc->port) { - /* Check if a template already exists */ - if (svc->port != FTPPORT) - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, ports[1]); - else - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * No template found or the dest of the connection - * template is not available. - */ - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template like for non-ftp service, - * and - * for ftp service. - */ - if (svc->port != FTPPORT) - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, - ports[1], - &dest->addr, dest->port, - IP_VS_CONN_F_TEMPLATE, - dest); - else - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; - } - dport = dest->port; - } else { - /* - * Note: persistent fwmark-based services and persistent - * port zero service are handled here. - * fwmark template: - * port zero template: - */ - if (svc->fwmark) { - union nf_inet_addr fwmark = { - .all = { 0, 0, 0, htonl(svc->fwmark) } - }; - - ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0, - &fwmark, 0); - } else - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * If it is not persistent port zero, return NULL, - * otherwise create a connection template. - */ - if (svc->port) - return NULL; - - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template according to the service - */ - if (svc->fwmark) { - union nf_inet_addr fwmark = { - .all = { 0, 0, 0, htonl(svc->fwmark) } - }; - - ct = ip_vs_conn_new(svc->af, IPPROTO_IP, - &snet, 0, - &fwmark, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - } else - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; - } - dport = ports[1]; - } - - /* - * Create a new connection according to the template - */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, ports[0], - &iph.daddr, ports[1], - &dest->addr, dport, - 0, - dest); - if (cp == NULL) { - ip_vs_conn_put(ct); - return NULL; - } - - /* - * Add its control - */ - ip_vs_control_add(cp, ct); - ip_vs_conn_put(ct); - - ip_vs_conn_stats(cp, svc); - return cp; -} - - -/* - * IPVS main scheduling function - * It selects a server according to the virtual service, and - * creates a connection entry. - * Protocols supported: TCP, UDP - */ -struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_conn *cp = NULL; - struct ip_vs_iphdr iph; - struct ip_vs_dest *dest; - __be16 _ports[2], *pptr; - - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - /* - * Persistent service - */ - if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, pptr); - - /* - * Non-persistent service - */ - if (!svc->fwmark && pptr[1] != svc->port) { - if (!svc->port) - IP_VS_ERR("Schedule: port zero only supported " - "in persistent services, " - "check your ipvs configuration\n"); - return NULL; - } - - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "Schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a connection entry. - */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &dest->addr, dest->port ? dest->port : pptr[1], - 0, - dest); - if (cp == NULL) - return NULL; - - IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " - "d:%s:%u conn->flags:%X conn->refcnt:%d\n", - ip_vs_fwd_tag(cp), - IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), - cp->flags, atomic_read(&cp->refcnt)); - - ip_vs_conn_stats(cp, svc); - return cp; -} - - -/* - * Pass or drop the packet. - * Called by ip_vs_in, when the virtual service is available but - * no destination is available for a new connection. - */ -int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - __be16 _ports[2], *pptr; - struct ip_vs_iphdr iph; - int unicast; - ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); - - pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); - if (pptr == NULL) { - ip_vs_service_put(svc); - return NF_DROP; - } - -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; - else -#endif - unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); - - /* if it is fwmark-based service, the cache_bypass sysctl is up - and the destination is a non-local unicast, then create - a cache_bypass connection entry */ - if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { - int ret, cs; - struct ip_vs_conn *cp; - union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; - - ip_vs_service_put(svc); - - /* create a new connection entry */ - IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &daddr, 0, - IP_VS_CONN_F_BYPASS, - NULL); - if (cp == NULL) - return NF_DROP; - - /* statistics */ - ip_vs_in_stats(cp, skb); - - /* set state */ - cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); - - /* transmit the first SYN packet */ - ret = cp->packet_xmit(skb, cp, pp); - /* do not touch skb anymore */ - - atomic_inc(&cp->in_pkts); - ip_vs_conn_put(cp); - return ret; - } - - /* - * When the virtual ftp service is presented, packets destined - * for other services on the VIP may get here (except services - * listed in the ipvs table), pass the packets, because it is - * not ipvs job to decide to drop the packets. - */ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { - ip_vs_service_put(svc); - return NF_ACCEPT; - } - - ip_vs_service_put(svc); - - /* - * Notify the client that the destination is unreachable, and - * release the socket buffer. - * Since it is in IP layer, the TCP socket is not actually - * created, the TCP RST packet cannot be sent, instead that - * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ - */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, - skb->dev); - else -#endif - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - - return NF_DROP; -} - - -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain, and is used for VS/NAT. - * It detects packets for VS/NAT connections and sends the packets - * immediately. This can avoid that iptable_nat mangles the packets - * for VS/NAT. - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; -} - -__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) -{ - return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); -} - -static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) -{ - int err = ip_defrag(skb, user); - - if (!err) - ip_send_check(ip_hdr(skb)); - - return err; -} - -#ifdef CONFIG_IP_VS_IPV6 -static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) -{ - /* TODO IPv6: Find out what to do here for IPv6 */ - return 0; -} -#endif - -/* - * Packet has been made sufficiently writable in caller - * - inout: 1=in->out, 0=out->in - */ -void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int inout) -{ - struct iphdr *iph = ip_hdr(skb); - unsigned int icmp_offset = iph->ihl*4; - struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + - icmp_offset); - struct iphdr *ciph = (struct iphdr *)(icmph + 1); - - if (inout) { - iph->saddr = cp->vaddr.ip; - ip_send_check(iph); - ciph->daddr = cp->vaddr.ip; - ip_send_check(ciph); - } else { - iph->daddr = cp->daddr.ip; - ip_send_check(iph); - ciph->saddr = cp->daddr.ip; - ip_send_check(ciph); - } - - /* the TCP/UDP port */ - if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { - __be16 *ports = (void *)ciph + ciph->ihl*4; - - if (inout) - ports[1] = cp->vport; - else - ports[0] = cp->dport; - } - - /* And finally the ICMP checksum */ - icmph->checksum = 0; - icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMP"); - else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMP"); -} - -#ifdef CONFIG_IP_VS_IPV6 -void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int inout) -{ - struct ipv6hdr *iph = ipv6_hdr(skb); - unsigned int icmp_offset = sizeof(struct ipv6hdr); - struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + - icmp_offset); - struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); - - if (inout) { - iph->saddr = cp->vaddr.in6; - ciph->daddr = cp->vaddr.in6; - } else { - iph->daddr = cp->daddr.in6; - ciph->saddr = cp->daddr.in6; - } - - /* the TCP/UDP port */ - if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) { - __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); - - if (inout) - ports[1] = cp->vport; - else - ports[0] = cp->dport; - } - - /* And finally the ICMP checksum */ - icmph->icmp6_cksum = 0; - /* TODO IPv6: is this correct for ICMPv6? */ - ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMPv6"); - else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMPv6"); -} -#endif - -/* Handle relevant response ICMP messages - forward to the right - * destination host. Used for NAT and local client. - */ -static int handle_response_icmp(int af, struct sk_buff *skb, - union nf_inet_addr *snet, - __u8 protocol, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, - unsigned int offset, unsigned int ihl) -{ - unsigned int verdict = NF_DROP; - - if (IP_VS_FWD_METHOD(cp) != 0) { - IP_VS_ERR("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", - IP_VS_DBG_ADDR(af, snet)); - goto out; - } - - if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol) - offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) - goto out; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - ip_vs_nat_icmp_v6(skb, pp, cp, 1); - else -#endif - ip_vs_nat_icmp(skb, pp, cp, 1); - - /* do the statistics and put it back */ - ip_vs_out_stats(cp, skb); - - skb->ipvs_property = 1; - verdict = NF_ACCEPT; - -out: - __ip_vs_conn_put(cp); - - return verdict; -} - -/* - * Handle ICMP messages in the inside-to-outside direction (outgoing). - * Find any that might be relevant, check against existing connections. - * Currently handles error types - unreachable, quench, ttl exceeded. - */ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related) -{ - struct iphdr *iph; - struct icmphdr _icmph, *ic; - struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, ihl; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - } - - iph = ip_hdr(skb); - offset = ihl = iph->ihl * 4; - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - ic->type, ntohs(icmp_id(ic)), - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->type != ICMP_DEST_UNREACH) && - (ic->type != ICMP_SOURCE_QUENCH) && - (ic->type != ICMP_TIME_EXCEEDED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->protocol); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & htons(IP_OFFSET) && - pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); - - offset += cih->ihl * 4; - - ip_vs_fill_iphdr(AF_INET, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); - if (!cp) - return NF_ACCEPT; - - snet.ip = iph->saddr; - return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, - pp, offset, ihl); -} - -#ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) -{ - struct ipv6hdr *iph; - struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - NIP6(iph->saddr), NIP6(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->nexthdr); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); - - offset += sizeof(struct ipv6hdr); - - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (!cp) - return NF_ACCEPT; - - ipv6_addr_copy(&snet.in6, &iph->saddr); - return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, - pp, offset, sizeof(struct ipv6hdr)); -} -#endif - -static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) -{ - struct tcphdr _tcph, *th; - - th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - return th->rst; -} - -/* Handle response packets: rewrite addresses and send away... - * Used for NAT and local client. - */ -static unsigned int -handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int ihl) -{ - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); - - if (!skb_make_writable(skb, ihl)) - goto drop; - - /* mangle the packet */ - if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) - goto drop; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - ipv6_hdr(skb)->saddr = cp->vaddr.in6; - else -#endif - { - ip_hdr(skb)->saddr = cp->vaddr.ip; - ip_send_check(ip_hdr(skb)); - } - - /* For policy routing, packets originating from this - * machine itself may be routed differently to packets - * passing through. We want this packet to be routed as - * if it came from this machine itself. So re-compute - * the routing information. - */ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ip6_route_me_harder(skb) != 0) - goto drop; - } else -#endif - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); - - ip_vs_out_stats(cp, skb); - ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); - ip_vs_conn_put(cp); - - skb->ipvs_property = 1; - - LeaveFunction(11); - return NF_ACCEPT; - -drop: - ip_vs_conn_put(cp); - kfree_skb(skb); - return NF_STOLEN; -} - -/* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. - * Check if outgoing packet belongs to the established ip_vs_conn. - */ -static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_vs_iphdr iph; - struct ip_vs_protocol *pp; - struct ip_vs_conn *cp; - int af; - - EnterFunction(11); - - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - - if (skb->ipvs_property) - return NF_ACCEPT; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); - - if (related) - return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - } else -#endif - if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_out_icmp(skb, &related); - - if (related) - return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) - return NF_ACCEPT; - - /* reassemble IP fragments */ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); - - if (related) - return verdict; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - } else -#endif - if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && - !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - - /* - * Check if the packet belongs to an existing entry - */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - - if (unlikely(!cp)) { - if (sysctl_ip_vs_nat_icmp_send && - (pp->protocol == IPPROTO_TCP || - pp->protocol == IPPROTO_UDP)) { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, iph.len, - sizeof(_ports), _ports); - if (pptr == NULL) - return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(af, iph.protocol, - &iph.saddr, - pptr[0])) { - /* - * Notify the real server: there is no - * existing entry if it is not RST - * packet or not TCP packet. - */ - if (iph.protocol != IPPROTO_TCP - || !is_tcp_reset(skb, iph.len)) { -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - icmpv6_send(skb, - ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, - 0, skb->dev); - else -#endif - icmp_send(skb, - ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); - return NF_DROP; - } - } - } - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; - } - - return handle_response(af, skb, pp, cp, iph.len); -} - - -/* - * Handle ICMP messages in the outside-to-inside direction (incoming). - * Find any that might be relevant, check against existing connections, - * forward to the right destination host if relevant. - * Currently handles error types - unreachable, quench, ttl exceeded. - */ -static int -ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) -{ - struct iphdr *iph; - struct icmphdr _icmph, *ic; - struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, ihl, verdict; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) - return NF_STOLEN; - } - - iph = ip_hdr(skb); - offset = ihl = iph->ihl * 4; - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - ic->type, ntohs(icmp_id(ic)), - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->type != ICMP_DEST_UNREACH) && - (ic->type != ICMP_SOURCE_QUENCH) && - (ic->type != ICMP_TIME_EXCEEDED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->protocol); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & htons(IP_OFFSET) && - pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); - - offset += cih->ihl * 4; - - ip_vs_fill_iphdr(AF_INET, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); - if (!cp) { - /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); - if (cp) { - snet.ip = iph->saddr; - return handle_response_icmp(AF_INET, skb, &snet, - cih->protocol, cp, pp, - offset, ihl); - } - return NF_ACCEPT; - } - - verdict = NF_DROP; - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); - goto out; - } - - /* do the statistics and put it back */ - ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) - offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); - /* do not touch skb anymore */ - - out: - __ip_vs_conn_put(cp); - - return verdict; -} - -#ifdef CONFIG_IP_VS_IPV6 -static int -ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) -{ - struct ipv6hdr *iph; - struct icmp6hdr _icmph, *ic; - struct ipv6hdr _ciph, *cih; /* The ip header contained - within the ICMP */ - struct ip_vs_iphdr ciph; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, verdict; - union nf_inet_addr snet; - - *related = 1; - - /* reassemble IP fragments */ - if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : - IP_DEFRAG_VS_FWD)) - return NF_STOLEN; - } - - iph = ipv6_hdr(skb); - offset = sizeof(struct ipv6hdr); - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", - ic->icmp6_type, ntohs(icmpv6_id(ic)), - NIP6(iph->saddr), NIP6(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && - (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && - (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->nexthdr); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - /* TODO: we don't support fragmentation at the moment anyways */ - if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); - - offset += sizeof(struct ipv6hdr); - - ip_vs_fill_iphdr(AF_INET6, cih, &ciph); - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (!cp) { - /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); - if (cp) { - ipv6_addr_copy(&snet.in6, &iph->saddr); - return handle_response_icmp(AF_INET6, skb, &snet, - cih->nexthdr, - cp, pp, offset, - sizeof(struct ipv6hdr)); - } - return NF_ACCEPT; - } - - verdict = NF_DROP; - - /* do the statistics and put it back */ - ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) - offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); - /* do not touch skb anymore */ - - __ip_vs_conn_put(cp); - - return verdict; -} -#endif - - -/* - * Check if it's for virtual services, look it up, - * and send it on its way... - */ -static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct ip_vs_iphdr iph; - struct ip_vs_protocol *pp; - struct ip_vs_conn *cp; - int ret, restart, af; - - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - /* - * Big tappo: only PACKET_HOST, including loopback for local client - * Don't handle local packets on IPv6 for now - */ - if (unlikely(skb->pkt_type != PACKET_HOST)) { - IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", - skb->pkt_type, - iph.protocol, - IP_VS_DBG_ADDR(af, &iph.daddr)); - return NF_ACCEPT; - } - - if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); - - if (related) - return verdict; - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - } - - /* Protocol supported? */ - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) - return NF_ACCEPT; - - /* - * Check if the packet belongs to an existing connection entry - */ - cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); - - if (unlikely(!cp)) { - int v; - - /* For local client packets, it could be a response */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - if (cp) - return handle_response(af, skb, pp, cp, iph.len); - - if (!pp->conn_schedule(af, skb, pp, &v, &cp)) - return v; - } - - if (unlikely(!cp)) { - /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; - } - - IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); - - /* Check the server status */ - if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { - /* the destination server is not available */ - - if (sysctl_ip_vs_expire_nodest_conn) { - /* try to expire the connection immediately */ - ip_vs_conn_expire_now(cp); - } - /* don't restart its timer, and silently - drop the packet. */ - __ip_vs_conn_put(cp); - return NF_DROP; - } - - ip_vs_in_stats(cp, skb); - restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); - if (cp->packet_xmit) - ret = cp->packet_xmit(skb, cp, pp); - /* do not touch skb anymore */ - else { - IP_VS_DBG_RL("warning: packet_xmit is null"); - ret = NF_ACCEPT; - } - - /* Increase its packet counter and check if it is needed - * to be synchronized - * - * Sync connection if it is about to close to - * encorage the standby servers to update the connections timeout - */ - atomic_inc(&cp->in_pkts); - if (af == AF_INET && - (ip_vs_sync_state & IP_VS_STATE_MASTER) && - (((cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) || - ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && - ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || - (cp->state == IP_VS_TCP_S_TIME_WAIT))))) - ip_vs_sync_conn(cp); - cp->old_state = cp->state; - - ip_vs_conn_put(cp); - return ret; -} - - -/* - * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP - * related packets destined for 0.0.0.0/0. - * When fwmark-based virtual service is used, such as transparent - * cache cluster, TCP packets can be marked and routed to ip_vs_in, - * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and - * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain - * and send them to ip_vs_in_icmp. - */ -static unsigned int -ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - int r; - - if (ip_hdr(skb)->protocol != IPPROTO_ICMP) - return NF_ACCEPT; - - return ip_vs_in_icmp(skb, &r, hooknum); -} - -#ifdef CONFIG_IP_VS_IPV6 -static unsigned int -ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - int r; - - if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) - return NF_ACCEPT; - - return ip_vs_in_icmp_v6(skb, &r, hooknum); -} -#endif - - -static struct nf_hook_ops ip_vs_ops[] __read_mostly = { - /* After packet filtering, forward packet through VS/DR, VS/TUN, - * or VS/NAT(change destination), so that filtering rules can be - * applied to IPVS. */ - { - .hook = ip_vs_in, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, - }, - /* After packet filtering, change source only for VS/NAT */ - { - .hook = ip_vs_out, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 100, - }, - /* After packet filtering (but before ip_vs_out_icmp), catch icmp - * destined for 0.0.0.0/0, which is for incoming IPVS connections */ - { - .hook = ip_vs_forward_icmp, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 99, - }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, - }, -#ifdef CONFIG_IP_VS_IPV6 - /* After packet filtering, forward packet through VS/DR, VS/TUN, - * or VS/NAT(change destination), so that filtering rules can be - * applied to IPVS. */ - { - .hook = ip_vs_in, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, - }, - /* After packet filtering, change source only for VS/NAT */ - { - .hook = ip_vs_out, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 100, - }, - /* After packet filtering (but before ip_vs_out_icmp), catch icmp - * destined for 0.0.0.0/0, which is for incoming IPVS connections */ - { - .hook = ip_vs_forward_icmp_v6, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 99, - }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_NAT_SRC-1, - }, -#endif -}; - - -/* - * Initialize IP Virtual Server - */ -static int __init ip_vs_init(void) -{ - int ret; - - ip_vs_estimator_init(); - - ret = ip_vs_control_init(); - if (ret < 0) { - IP_VS_ERR("can't setup control.\n"); - goto cleanup_estimator; - } - - ip_vs_protocol_init(); - - ret = ip_vs_app_init(); - if (ret < 0) { - IP_VS_ERR("can't setup application helper.\n"); - goto cleanup_protocol; - } - - ret = ip_vs_conn_init(); - if (ret < 0) { - IP_VS_ERR("can't setup connection table.\n"); - goto cleanup_app; - } - - ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - if (ret < 0) { - IP_VS_ERR("can't register hooks.\n"); - goto cleanup_conn; - } - - IP_VS_INFO("ipvs loaded.\n"); - return ret; - - cleanup_conn: - ip_vs_conn_cleanup(); - cleanup_app: - ip_vs_app_cleanup(); - cleanup_protocol: - ip_vs_protocol_cleanup(); - ip_vs_control_cleanup(); - cleanup_estimator: - ip_vs_estimator_cleanup(); - return ret; -} - -static void __exit ip_vs_cleanup(void) -{ - nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - ip_vs_conn_cleanup(); - ip_vs_app_cleanup(); - ip_vs_protocol_cleanup(); - ip_vs_control_cleanup(); - ip_vs_estimator_cleanup(); - IP_VS_INFO("ipvs unloaded.\n"); -} - -module_init(ip_vs_init); -module_exit(ip_vs_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c deleted file mode 100644 index 0302cf3e503..00000000000 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ /dev/null @@ -1,3443 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the NetFilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#ifdef CONFIG_IP_VS_IPV6 -#include -#include -#endif -#include -#include -#include - -#include - -#include - -/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ -static DEFINE_MUTEX(__ip_vs_mutex); - -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - -/* lock for table with the real services */ -static DEFINE_RWLOCK(__ip_vs_rs_lock); - -/* lock for state and timeout tables */ -static DEFINE_RWLOCK(__ip_vs_securetcp_lock); - -/* lock for drop entry handling */ -static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); - -/* lock for drop packet handling */ -static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); - -/* 1/rate drop and drop-entry variables */ -int ip_vs_drop_rate = 0; -int ip_vs_drop_counter = 0; -static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); - -/* number of virtual services */ -static int ip_vs_num_services = 0; - -/* sysctl variables */ -static int sysctl_ip_vs_drop_entry = 0; -static int sysctl_ip_vs_drop_packet = 0; -static int sysctl_ip_vs_secure_tcp = 0; -static int sysctl_ip_vs_amemthresh = 1024; -static int sysctl_ip_vs_am_droprate = 10; -int sysctl_ip_vs_cache_bypass = 0; -int sysctl_ip_vs_expire_nodest_conn = 0; -int sysctl_ip_vs_expire_quiescent_template = 0; -int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; -int sysctl_ip_vs_nat_icmp_send = 0; - - -#ifdef CONFIG_IP_VS_DEBUG -static int sysctl_ip_vs_debug_level = 0; - -int ip_vs_get_debug_level(void) -{ - return sysctl_ip_vs_debug_level; -} -#endif - -#ifdef CONFIG_IP_VS_IPV6 -/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ -static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) -{ - struct rt6_info *rt; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = *addr, - .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) - return 1; - - return 0; -} -#endif -/* - * update_defense_level is called from keventd and from sysctl, - * so it needs to protect itself from softirqs - */ -static void update_defense_level(void) -{ - struct sysinfo i; - static int old_secure_tcp = 0; - int availmem; - int nomem; - int to_change = -1; - - /* we only count free and buffered memory (in pages) */ - si_meminfo(&i); - availmem = i.freeram + i.bufferram; - /* however in linux 2.5 the i.bufferram is total page cache size, - we need adjust it */ - /* si_swapinfo(&i); */ - /* availmem = availmem - (i.totalswap - i.freeswap); */ - - nomem = (availmem < sysctl_ip_vs_amemthresh); - - local_bh_disable(); - - /* drop_entry */ - spin_lock(&__ip_vs_dropentry_lock); - switch (sysctl_ip_vs_drop_entry) { - case 0: - atomic_set(&ip_vs_dropentry, 0); - break; - case 1: - if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - sysctl_ip_vs_drop_entry = 2; - } else { - atomic_set(&ip_vs_dropentry, 0); - } - break; - case 2: - if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - } else { - atomic_set(&ip_vs_dropentry, 0); - sysctl_ip_vs_drop_entry = 1; - }; - break; - case 3: - atomic_set(&ip_vs_dropentry, 1); - break; - } - spin_unlock(&__ip_vs_dropentry_lock); - - /* drop_packet */ - spin_lock(&__ip_vs_droppacket_lock); - switch (sysctl_ip_vs_drop_packet) { - case 0: - ip_vs_drop_rate = 0; - break; - case 1: - if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - sysctl_ip_vs_drop_packet = 2; - } else { - ip_vs_drop_rate = 0; - } - break; - case 2: - if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - } else { - ip_vs_drop_rate = 0; - sysctl_ip_vs_drop_packet = 1; - } - break; - case 3: - ip_vs_drop_rate = sysctl_ip_vs_am_droprate; - break; - } - spin_unlock(&__ip_vs_droppacket_lock); - - /* secure_tcp */ - write_lock(&__ip_vs_securetcp_lock); - switch (sysctl_ip_vs_secure_tcp) { - case 0: - if (old_secure_tcp >= 2) - to_change = 0; - break; - case 1: - if (nomem) { - if (old_secure_tcp < 2) - to_change = 1; - sysctl_ip_vs_secure_tcp = 2; - } else { - if (old_secure_tcp >= 2) - to_change = 0; - } - break; - case 2: - if (nomem) { - if (old_secure_tcp < 2) - to_change = 1; - } else { - if (old_secure_tcp >= 2) - to_change = 0; - sysctl_ip_vs_secure_tcp = 1; - } - break; - case 3: - if (old_secure_tcp < 2) - to_change = 1; - break; - } - old_secure_tcp = sysctl_ip_vs_secure_tcp; - if (to_change >= 0) - ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); - write_unlock(&__ip_vs_securetcp_lock); - - local_bh_enable(); -} - - -/* - * Timer for checking the defense - */ -#define DEFENSE_TIMER_PERIOD 1*HZ -static void defense_work_handler(struct work_struct *work); -static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); - -static void defense_work_handler(struct work_struct *work) -{ - update_defense_level(); - if (atomic_read(&ip_vs_dropentry)) - ip_vs_random_dropentry(); - - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); -} - -int -ip_vs_use_count_inc(void) -{ - return try_module_get(THIS_MODULE); -} - -void -ip_vs_use_count_dec(void) -{ - module_put(THIS_MODULE); -} - - -/* - * Hash table: for virtual service lookups - */ -#define IP_VS_SVC_TAB_BITS 8 -#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) -#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) - -/* the service table hashed by */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; -/* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; - -/* - * Hash table: for real service lookups - */ -#define IP_VS_RTAB_BITS 4 -#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) -#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) - -static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; - -/* - * Trash for destinations - */ -static LIST_HEAD(ip_vs_dest_trash); - -/* - * FTP & NULL virtual service counters - */ -static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); -static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); - - -/* - * Returns hash value for virtual service - */ -static __inline__ unsigned -ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, - __be16 port) -{ - register unsigned porth = ntohs(port); - __be32 addr_fold = addr->ip; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - addr_fold = addr->ip6[0]^addr->ip6[1]^ - addr->ip6[2]^addr->ip6[3]; -#endif - - return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) - & IP_VS_SVC_TAB_MASK; -} - -/* - * Returns hash value of fwmark for virtual service lookup - */ -static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) -{ - return fwmark & IP_VS_SVC_TAB_MASK; -} - -/* - * Hashes a service in the ip_vs_svc_table by - * or in the ip_vs_svc_fwm_table by fwmark. - * Should be called with locked tables. - */ -static int ip_vs_svc_hash(struct ip_vs_service *svc) -{ - unsigned hash; - - if (svc->flags & IP_VS_SVC_F_HASHED) { - IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - if (svc->fwmark == 0) { - /* - * Hash it by in ip_vs_svc_table - */ - hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, - svc->port); - list_add(&svc->s_list, &ip_vs_svc_table[hash]); - } else { - /* - * Hash it by fwmark in ip_vs_svc_fwm_table - */ - hash = ip_vs_svc_fwm_hashkey(svc->fwmark); - list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); - } - - svc->flags |= IP_VS_SVC_F_HASHED; - /* increase its refcnt because it is referenced by the svc table */ - atomic_inc(&svc->refcnt); - return 1; -} - - -/* - * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. - * Should be called with locked tables. - */ -static int ip_vs_svc_unhash(struct ip_vs_service *svc) -{ - if (!(svc->flags & IP_VS_SVC_F_HASHED)) { - IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - if (svc->fwmark == 0) { - /* Remove it from the ip_vs_svc_table table */ - list_del(&svc->s_list); - } else { - /* Remove it from the ip_vs_svc_fwm_table table */ - list_del(&svc->f_list); - } - - svc->flags &= ~IP_VS_SVC_F_HASHED; - atomic_dec(&svc->refcnt); - return 1; -} - - -/* - * Get service by {proto,addr,port} in the service table. - */ -static inline struct ip_vs_service * -__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, - __be16 vport) -{ - unsigned hash; - struct ip_vs_service *svc; - - /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); - - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ - if ((svc->af == af) - && ip_vs_addr_equal(af, &svc->addr, vaddr) - && (svc->port == vport) - && (svc->protocol == protocol)) { - /* HIT */ - atomic_inc(&svc->usecnt); - return svc; - } - } - - return NULL; -} - - -/* - * Get service by {fwmark} in the service table. - */ -static inline struct ip_vs_service * -__ip_vs_svc_fwm_get(int af, __u32 fwmark) -{ - unsigned hash; - struct ip_vs_service *svc; - - /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(fwmark); - - list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark && svc->af == af) { - /* HIT */ - atomic_inc(&svc->usecnt); - return svc; - } - } - - return NULL; -} - -struct ip_vs_service * -ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, - const union nf_inet_addr *vaddr, __be16 vport) -{ - struct ip_vs_service *svc; - - read_lock(&__ip_vs_svc_lock); - - /* - * Check the table hashed by fwmark first - */ - if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) - goto out; - - /* - * Check the table hashed by - * for "full" addressed entries - */ - svc = __ip_vs_service_get(af, protocol, vaddr, vport); - - if (svc == NULL - && protocol == IPPROTO_TCP - && atomic_read(&ip_vs_ftpsvc_counter) - && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { - /* - * Check if ftp service entry exists, the packet - * might belong to FTP data connections. - */ - svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); - } - - if (svc == NULL - && atomic_read(&ip_vs_nullsvc_counter)) { - /* - * Check if the catch-all port (port zero) exists - */ - svc = __ip_vs_service_get(af, protocol, vaddr, 0); - } - - out: - read_unlock(&__ip_vs_svc_lock); - - IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", - fwmark, ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), - svc ? "hit" : "not hit"); - - return svc; -} - - -static inline void -__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - atomic_inc(&svc->refcnt); - dest->svc = svc; -} - -static inline void -__ip_vs_unbind_svc(struct ip_vs_dest *dest) -{ - struct ip_vs_service *svc = dest->svc; - - dest->svc = NULL; - if (atomic_dec_and_test(&svc->refcnt)) - kfree(svc); -} - - -/* - * Returns hash value for real service - */ -static inline unsigned ip_vs_rs_hashkey(int af, - const union nf_inet_addr *addr, - __be16 port) -{ - register unsigned porth = ntohs(port); - __be32 addr_fold = addr->ip; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - addr_fold = addr->ip6[0]^addr->ip6[1]^ - addr->ip6[2]^addr->ip6[3]; -#endif - - return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) - & IP_VS_RTAB_MASK; -} - -/* - * Hashes ip_vs_dest in ip_vs_rtable by . - * should be called with locked tables. - */ -static int ip_vs_rs_hash(struct ip_vs_dest *dest) -{ - unsigned hash; - - if (!list_empty(&dest->d_list)) { - return 0; - } - - /* - * Hash by proto,addr,port, - * which are the parameters of the real service. - */ - hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); - - list_add(&dest->d_list, &ip_vs_rtable[hash]); - - return 1; -} - -/* - * UNhashes ip_vs_dest from ip_vs_rtable. - * should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) -{ - /* - * Remove it from the ip_vs_rtable table. - */ - if (!list_empty(&dest->d_list)) { - list_del(&dest->d_list); - INIT_LIST_HEAD(&dest->d_list); - } - - return 1; -} - -/* - * Lookup real service by in the real service table. - */ -struct ip_vs_dest * -ip_vs_lookup_real_service(int af, __u16 protocol, - const union nf_inet_addr *daddr, - __be16 dport) -{ - unsigned hash; - struct ip_vs_dest *dest; - - /* - * Check for "full" addressed entries - * Return the first found entry - */ - hash = ip_vs_rs_hashkey(af, daddr, dport); - - read_lock(&__ip_vs_rs_lock); - list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { - if ((dest->af == af) - && ip_vs_addr_equal(af, &dest->addr, daddr) - && (dest->port == dport) - && ((dest->protocol == protocol) || - dest->vfwmark)) { - /* HIT */ - read_unlock(&__ip_vs_rs_lock); - return dest; - } - } - read_unlock(&__ip_vs_rs_lock); - - return NULL; -} - -/* - * Lookup destination by {addr,port} in the given service - */ -static struct ip_vs_dest * -ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) -{ - struct ip_vs_dest *dest; - - /* - * Find the destination for the given service - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->af == svc->af) - && ip_vs_addr_equal(svc->af, &dest->addr, daddr) - && (dest->port == dport)) { - /* HIT */ - return dest; - } - } - - return NULL; -} - -/* - * Find destination by {daddr,dport,vaddr,protocol} - * Cretaed to be used in ip_vs_process_message() in - * the backup synchronization daemon. It finds the - * destination to be bound to the received connection - * on the backup. - * - * ip_vs_lookup_real_service() looked promissing, but - * seems not working as expected. - */ -struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, - __be16 dport, - const union nf_inet_addr *vaddr, - __be16 vport, __u16 protocol) -{ - struct ip_vs_dest *dest; - struct ip_vs_service *svc; - - svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); - if (!svc) - return NULL; - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest) - atomic_inc(&dest->refcnt); - ip_vs_service_put(svc); - return dest; -} - -/* - * Lookup dest by {svc,addr,port} in the destination trash. - * The destination trash is used to hold the destinations that are removed - * from the service table but are still referenced by some conn entries. - * The reason to add the destination trash is when the dest is temporary - * down (either by administrator or by monitor program), the dest can be - * picked back from the trash, the remaining connections to the dest can - * continue, and the counting information of the dest is also useful for - * scheduling. - */ -static struct ip_vs_dest * -ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) -{ - struct ip_vs_dest *dest, *nxt; - - /* - * Find the destination in trash - */ - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " - "dest->refcnt=%d\n", - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port), - atomic_read(&dest->refcnt)); - if (dest->af == svc->af && - ip_vs_addr_equal(svc->af, &dest->addr, daddr) && - dest->port == dport && - dest->vfwmark == svc->fwmark && - dest->protocol == svc->protocol && - (svc->fwmark || - (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && - dest->vport == svc->port))) { - /* HIT */ - return dest; - } - - /* - * Try to purge the destination from trash if not referenced - */ - if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " - "from trash\n", - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), - ntohs(dest->port)); - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); - } - } - - return NULL; -} - - -/* - * Clean up all the destinations in the trash - * Called by the ip_vs_control_cleanup() - * - * When the ip_vs_control_clearup is activated by ipvs module exit, - * the service tables must have been flushed and all the connections - * are expired, and the refcnt of each destination in the trash must - * be 1, so we simply release them here. - */ -static void ip_vs_trash_cleanup(void) -{ - struct ip_vs_dest *dest, *nxt; - - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); - } -} - - -static void -ip_vs_zero_stats(struct ip_vs_stats *stats) -{ - spin_lock_bh(&stats->lock); - - memset(&stats->ustats, 0, sizeof(stats->ustats)); - ip_vs_zero_estimator(stats); - - spin_unlock_bh(&stats->lock); -} - -/* - * Update a destination in the given service - */ -static void -__ip_vs_update_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) -{ - int conn_flags; - - /* set the weight and the flags */ - atomic_set(&dest->weight, udest->weight); - conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; - - /* check if local node and update the flags */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { - if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - } else -#endif - if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - - /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ - if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { - conn_flags |= IP_VS_CONN_F_NOOUTPUT; - } else { - /* - * Put the real service in ip_vs_rtable if not present. - * For now only for NAT! - */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_hash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - } - atomic_set(&dest->conn_flags, conn_flags); - - /* bind the service */ - if (!dest->svc) { - __ip_vs_bind_svc(dest, svc); - } else { - if (dest->svc != svc) { - __ip_vs_unbind_svc(dest); - ip_vs_zero_stats(&dest->stats); - __ip_vs_bind_svc(dest, svc); - } - } - - /* set the dest status flags */ - dest->flags |= IP_VS_DEST_F_AVAILABLE; - - if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - dest->u_threshold = udest->u_threshold; - dest->l_threshold = udest->l_threshold; -} - - -/* - * Create a destination for the given service - */ -static int -ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, - struct ip_vs_dest **dest_p) -{ - struct ip_vs_dest *dest; - unsigned atype; - - EnterFunction(2); - -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { - atype = ipv6_addr_type(&udest->addr.in6); - if ((!(atype & IPV6_ADDR_UNICAST) || - atype & IPV6_ADDR_LINKLOCAL) && - !__ip_vs_addr_is_local_v6(&udest->addr.in6)) - return -EINVAL; - } else -#endif - { - atype = inet_addr_type(&init_net, udest->addr.ip); - if (atype != RTN_LOCAL && atype != RTN_UNICAST) - return -EINVAL; - } - - dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); - if (dest == NULL) { - IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); - return -ENOMEM; - } - - dest->af = svc->af; - dest->protocol = svc->protocol; - dest->vaddr = svc->addr; - dest->vport = svc->port; - dest->vfwmark = svc->fwmark; - ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); - dest->port = udest->port; - - atomic_set(&dest->activeconns, 0); - atomic_set(&dest->inactconns, 0); - atomic_set(&dest->persistconns, 0); - atomic_set(&dest->refcnt, 0); - - INIT_LIST_HEAD(&dest->d_list); - spin_lock_init(&dest->dst_lock); - spin_lock_init(&dest->stats.lock); - __ip_vs_update_dest(svc, dest, udest); - ip_vs_new_estimator(&dest->stats); - - *dest_p = dest; - - LeaveFunction(2); - return 0; -} - - -/* - * Add a destination into an existing service - */ -static int -ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) -{ - struct ip_vs_dest *dest; - union nf_inet_addr daddr; - __be16 dport = udest->port; - int ret; - - EnterFunction(2); - - if (udest->weight < 0) { - IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); - return -ERANGE; - } - - if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " - "upper threshold\n"); - return -ERANGE; - } - - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - - /* - * Check if the dest already exists in the list - */ - dest = ip_vs_lookup_dest(svc, &daddr, dport); - - if (dest != NULL) { - IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); - return -EEXIST; - } - - /* - * Check if the dest already exists in the trash and - * is from the same service - */ - dest = ip_vs_trash_get_dest(svc, &daddr, dport); - - if (dest != NULL) { - IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " - "dest->refcnt=%d, service %u/%s:%u\n", - IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), - atomic_read(&dest->refcnt), - dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->vaddr), - ntohs(dest->vport)); - - __ip_vs_update_dest(svc, dest, udest); - - /* - * Get the destination from the trash - */ - list_del(&dest->n_list); - - ip_vs_new_estimator(&dest->stats); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - return 0; - } - - /* - * Allocate and initialize the dest structure - */ - ret = ip_vs_new_dest(svc, udest, &dest); - if (ret) { - return ret; - } - - /* - * Add the dest entry into the list - */ - atomic_inc(&dest->refcnt); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - LeaveFunction(2); - - return 0; -} - - -/* - * Edit a destination in the given service - */ -static int -ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) -{ - struct ip_vs_dest *dest; - union nf_inet_addr daddr; - __be16 dport = udest->port; - - EnterFunction(2); - - if (udest->weight < 0) { - IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); - return -ERANGE; - } - - if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " - "upper threshold\n"); - return -ERANGE; - } - - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); - - /* - * Lookup the destination list - */ - dest = ip_vs_lookup_dest(svc, &daddr, dport); - - if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); - return -ENOENT; - } - - __ip_vs_update_dest(svc, dest, udest); - - write_lock_bh(&__ip_vs_svc_lock); - - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* call the update_service, because server weight may be changed */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - LeaveFunction(2); - - return 0; -} - - -/* - * Delete a destination (must be already unlinked from the service) - */ -static void __ip_vs_del_dest(struct ip_vs_dest *dest) -{ - ip_vs_kill_estimator(&dest->stats); - - /* - * Remove it from the d-linked list with the real services. - */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_unhash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - - /* - * Decrease the refcnt of the dest, and free the dest - * if nobody refers to it (refcnt=0). Otherwise, throw - * the destination into the trash. - */ - if (atomic_dec_and_test(&dest->refcnt)) { - ip_vs_dst_reset(dest); - /* simply decrease svc->refcnt here, let the caller check - and release the service if nobody refers to it. - Only user context can release destination and service, - and only one user context can update virtual service at a - time, so the operation here is OK */ - atomic_dec(&dest->svc->refcnt); - kfree(dest); - } else { - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " - "dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), - ntohs(dest->port), - atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ip_vs_dest_trash); - atomic_inc(&dest->refcnt); - } -} - - -/* - * Unlink a destination from the given service - */ -static void __ip_vs_unlink_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, - int svcupd) -{ - dest->flags &= ~IP_VS_DEST_F_AVAILABLE; - - /* - * Remove it from the d-linked destination list. - */ - list_del(&dest->n_list); - svc->num_dests--; - - /* - * Call the update_service function of its scheduler - */ - if (svcupd && svc->scheduler->update_service) - svc->scheduler->update_service(svc); -} - - -/* - * Delete a destination server in the given service - */ -static int -ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) -{ - struct ip_vs_dest *dest; - __be16 dport = udest->port; - - EnterFunction(2); - - dest = ip_vs_lookup_dest(svc, &udest->addr, dport); - - if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); - return -ENOENT; - } - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* - * Unlink dest from the service - */ - __ip_vs_unlink_dest(svc, dest, 1); - - write_unlock_bh(&__ip_vs_svc_lock); - - /* - * Delete the destination - */ - __ip_vs_del_dest(dest); - - LeaveFunction(2); - - return 0; -} - - -/* - * Add a service into the service hash table - */ -static int -ip_vs_add_service(struct ip_vs_service_user_kern *u, - struct ip_vs_service **svc_p) -{ - int ret = 0; - struct ip_vs_scheduler *sched = NULL; - struct ip_vs_service *svc = NULL; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - /* Lookup the scheduler by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); - ret = -ENOENT; - goto out_mod_dec; - } - -#ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6) { - if (!sched->supports_ipv6) { - ret = -EAFNOSUPPORT; - goto out_err; - } - if ((u->netmask < 1) || (u->netmask > 128)) { - ret = -EINVAL; - goto out_err; - } - } -#endif - - svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); - if (svc == NULL) { - IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); - ret = -ENOMEM; - goto out_err; - } - - /* I'm the first user of the service */ - atomic_set(&svc->usecnt, 1); - atomic_set(&svc->refcnt, 0); - - svc->af = u->af; - svc->protocol = u->protocol; - ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); - svc->port = u->port; - svc->fwmark = u->fwmark; - svc->flags = u->flags; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; - - INIT_LIST_HEAD(&svc->destinations); - rwlock_init(&svc->sched_lock); - spin_lock_init(&svc->stats.lock); - - /* Bind the scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) - goto out_err; - sched = NULL; - - /* Update the virtual service counters */ - if (svc->port == FTPPORT) - atomic_inc(&ip_vs_ftpsvc_counter); - else if (svc->port == 0) - atomic_inc(&ip_vs_nullsvc_counter); - - ip_vs_new_estimator(&svc->stats); - - /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) - ip_vs_num_services++; - - /* Hash the service into the service table */ - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_hash(svc); - write_unlock_bh(&__ip_vs_svc_lock); - - *svc_p = svc; - return 0; - - out_err: - if (svc != NULL) { - if (svc->scheduler) - ip_vs_unbind_scheduler(svc); - if (svc->inc) { - local_bh_disable(); - ip_vs_app_inc_put(svc->inc); - local_bh_enable(); - } - kfree(svc); - } - ip_vs_scheduler_put(sched); - - out_mod_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return ret; -} - - -/* - * Edit a service and bind it with a new scheduler - */ -static int -ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) -{ - struct ip_vs_scheduler *sched, *old_sched; - int ret = 0; - - /* - * Lookup the scheduler, by 'u->sched_name' - */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); - return -ENOENT; - } - old_sched = sched; - -#ifdef CONFIG_IP_VS_IPV6 - if (u->af == AF_INET6) { - if (!sched->supports_ipv6) { - ret = -EAFNOSUPPORT; - goto out; - } - if ((u->netmask < 1) || (u->netmask > 128)) { - ret = -EINVAL; - goto out; - } - } -#endif - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* - * Set the flags and timeout value - */ - svc->flags = u->flags | IP_VS_SVC_F_HASHED; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; - - old_sched = svc->scheduler; - if (sched != old_sched) { - /* - * Unbind the old scheduler - */ - if ((ret = ip_vs_unbind_scheduler(svc))) { - old_sched = sched; - goto out_unlock; - } - - /* - * Bind the new scheduler - */ - if ((ret = ip_vs_bind_scheduler(svc, sched))) { - /* - * If ip_vs_bind_scheduler fails, restore the old - * scheduler. - * The main reason of failure is out of memory. - * - * The question is if the old scheduler can be - * restored all the time. TODO: if it cannot be - * restored some time, we must delete the service, - * otherwise the system may crash. - */ - ip_vs_bind_scheduler(svc, old_sched); - old_sched = sched; - goto out_unlock; - } - } - - out_unlock: - write_unlock_bh(&__ip_vs_svc_lock); -#ifdef CONFIG_IP_VS_IPV6 - out: -#endif - - if (old_sched) - ip_vs_scheduler_put(old_sched); - - return ret; -} - - -/* - * Delete a service from the service list - * - The service must be unlinked, unlocked and not referenced! - * - We are called under _bh lock - */ -static void __ip_vs_del_service(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest, *nxt; - struct ip_vs_scheduler *old_sched; - - /* Count only IPv4 services for old get/setsockopt interface */ - if (svc->af == AF_INET) - ip_vs_num_services--; - - ip_vs_kill_estimator(&svc->stats); - - /* Unbind scheduler */ - old_sched = svc->scheduler; - ip_vs_unbind_scheduler(svc); - if (old_sched) - ip_vs_scheduler_put(old_sched); - - /* Unbind app inc */ - if (svc->inc) { - ip_vs_app_inc_put(svc->inc); - svc->inc = NULL; - } - - /* - * Unlink the whole destination list - */ - list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { - __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(dest); - } - - /* - * Update the virtual service counters - */ - if (svc->port == FTPPORT) - atomic_dec(&ip_vs_ftpsvc_counter); - else if (svc->port == 0) - atomic_dec(&ip_vs_nullsvc_counter); - - /* - * Free the service if nobody refers to it - */ - if (atomic_read(&svc->refcnt) == 0) - kfree(svc); - - /* decrease the module use count */ - ip_vs_use_count_dec(); -} - -/* - * Delete a service from the service list - */ -static int ip_vs_del_service(struct ip_vs_service *svc) -{ - if (svc == NULL) - return -EEXIST; - - /* - * Unhash it from the service table - */ - write_lock_bh(&__ip_vs_svc_lock); - - ip_vs_svc_unhash(svc); - - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - __ip_vs_del_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - return 0; -} - - -/* - * Flush all the virtual services - */ -static int ip_vs_flush(void) -{ - int idx; - struct ip_vs_service *svc, *nxt; - - /* - * Flush the service table hashed by - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); - } - } - - /* - * Flush the service table hashed by fwmark - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, - &ip_vs_svc_fwm_table[idx], f_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); - } - } - - return 0; -} - - -/* - * Zero counters in a service or all services - */ -static int ip_vs_zero_service(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - - write_lock_bh(&__ip_vs_svc_lock); - list_for_each_entry(dest, &svc->destinations, n_list) { - ip_vs_zero_stats(&dest->stats); - } - ip_vs_zero_stats(&svc->stats); - write_unlock_bh(&__ip_vs_svc_lock); - return 0; -} - -static int ip_vs_zero_all(void) -{ - int idx; - struct ip_vs_service *svc; - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - ip_vs_zero_service(svc); - } - } - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - ip_vs_zero_service(svc); - } - } - - ip_vs_zero_stats(&ip_vs_stats); - return 0; -} - - -static int -proc_do_defense_mode(ctl_table *table, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val = *valp; - int rc; - - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); - if (write && (*valp != val)) { - if ((*valp < 0) || (*valp > 3)) { - /* Restore the correct value */ - *valp = val; - } else { - update_defense_level(); - } - } - return rc; -} - - -static int -proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val[2]; - int rc; - - /* backup the value first */ - memcpy(val, valp, sizeof(val)); - - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { - /* Restore the correct value */ - memcpy(valp, val, sizeof(val)); - } - return rc; -} - - -/* - * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) - */ - -static struct ctl_table vs_vars[] = { - { - .procname = "amemthresh", - .data = &sysctl_ip_vs_amemthresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#ifdef CONFIG_IP_VS_DEBUG - { - .procname = "debug_level", - .data = &sysctl_ip_vs_debug_level, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .procname = "am_droprate", - .data = &sysctl_ip_vs_am_droprate, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "drop_entry", - .data = &sysctl_ip_vs_drop_entry, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, - { - .procname = "drop_packet", - .data = &sysctl_ip_vs_drop_packet, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, - { - .procname = "secure_tcp", - .data = &sysctl_ip_vs_secure_tcp, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, -#if 0 - { - .procname = "timeout_established", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synsent", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synrecv", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_finwait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_timewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_close", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_closewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_lastack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_listen", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_udp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_icmp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, -#endif - { - .procname = "cache_bypass", - .data = &sysctl_ip_vs_cache_bypass, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "expire_nodest_conn", - .data = &sysctl_ip_vs_expire_nodest_conn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "expire_quiescent_template", - .data = &sysctl_ip_vs_expire_quiescent_template, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "sync_threshold", - .data = &sysctl_ip_vs_sync_threshold, - .maxlen = sizeof(sysctl_ip_vs_sync_threshold), - .mode = 0644, - .proc_handler = &proc_do_sync_threshold, - }, - { - .procname = "nat_icmp_send", - .data = &sysctl_ip_vs_nat_icmp_send, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { .ctl_name = 0 } -}; - -const struct ctl_path net_vs_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "vs", }, - { } -}; -EXPORT_SYMBOL_GPL(net_vs_ctl_path); - -static struct ctl_table_header * sysctl_header; - -#ifdef CONFIG_PROC_FS - -struct ip_vs_iter { - struct list_head *table; - int bucket; -}; - -/* - * Write the contents of the VS rule table to a PROCfs file. - * (It is kept just for backward compatibility) - */ -static inline const char *ip_vs_fwd_name(unsigned flags) -{ - switch (flags & IP_VS_CONN_F_FWD_MASK) { - case IP_VS_CONN_F_LOCALNODE: - return "Local"; - case IP_VS_CONN_F_TUNNEL: - return "Tunnel"; - case IP_VS_CONN_F_DROUTE: - return "Route"; - default: - return "Masq"; - } -} - - -/* Get the Nth entry in the two lists */ -static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) -{ - struct ip_vs_iter *iter = seq->private; - int idx; - struct ip_vs_service *svc; - - /* look in hash by protocol */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (pos-- == 0){ - iter->table = ip_vs_svc_table; - iter->bucket = idx; - return svc; - } - } - } - - /* keep looking in fwmark */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (pos-- == 0) { - iter->table = ip_vs_svc_fwm_table; - iter->bucket = idx; - return svc; - } - } - } - - return NULL; -} - -static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) -__acquires(__ip_vs_svc_lock) -{ - - read_lock_bh(&__ip_vs_svc_lock); - return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; -} - - -static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct list_head *e; - struct ip_vs_iter *iter; - struct ip_vs_service *svc; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_info_array(seq,0); - - svc = v; - iter = seq->private; - - if (iter->table == ip_vs_svc_table) { - /* next service in table hashed by protocol */ - if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, s_list); - - - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], - s_list) { - return svc; - } - } - - iter->table = ip_vs_svc_fwm_table; - iter->bucket = -1; - goto scan_fwmark; - } - - /* next service in hashed by fwmark */ - if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, f_list); - - scan_fwmark: - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], - f_list) - return svc; - } - - return NULL; -} - -static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) -__releases(__ip_vs_svc_lock) -{ - read_unlock_bh(&__ip_vs_svc_lock); -} - - -static int ip_vs_info_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) { - seq_printf(seq, - "IP Virtual Server version %d.%d.%d (size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); - seq_puts(seq, - "Prot LocalAddress:Port Scheduler Flags\n"); - seq_puts(seq, - " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); - } else { - const struct ip_vs_service *svc = v; - const struct ip_vs_iter *iter = seq->private; - const struct ip_vs_dest *dest; - - if (iter->table == ip_vs_svc_table) { -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) - seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ", - ip_vs_proto_name(svc->protocol), - NIP6(svc->addr.in6), - ntohs(svc->port), - svc->scheduler->name); - else -#endif - seq_printf(seq, "%s %08X:%04X %s ", - ip_vs_proto_name(svc->protocol), - ntohl(svc->addr.ip), - ntohs(svc->port), - svc->scheduler->name); - } else { - seq_printf(seq, "FWM %08X %s ", - svc->fwmark, svc->scheduler->name); - } - - if (svc->flags & IP_VS_SVC_F_PERSISTENT) - seq_printf(seq, "persistent %d %08X\n", - svc->timeout, - ntohl(svc->netmask)); - else - seq_putc(seq, '\n'); - - list_for_each_entry(dest, &svc->destinations, n_list) { -#ifdef CONFIG_IP_VS_IPV6 - if (dest->af == AF_INET6) - seq_printf(seq, - " -> [" NIP6_FMT "]:%04X" - " %-7s %-6d %-10d %-10d\n", - NIP6(dest->addr.in6), - ntohs(dest->port), - ip_vs_fwd_name(atomic_read(&dest->conn_flags)), - atomic_read(&dest->weight), - atomic_read(&dest->activeconns), - atomic_read(&dest->inactconns)); - else -#endif - seq_printf(seq, - " -> %08X:%04X " - "%-7s %-6d %-10d %-10d\n", - ntohl(dest->addr.ip), - ntohs(dest->port), - ip_vs_fwd_name(atomic_read(&dest->conn_flags)), - atomic_read(&dest->weight), - atomic_read(&dest->activeconns), - atomic_read(&dest->inactconns)); - - } - } - return 0; -} - -static const struct seq_operations ip_vs_info_seq_ops = { - .start = ip_vs_info_seq_start, - .next = ip_vs_info_seq_next, - .stop = ip_vs_info_seq_stop, - .show = ip_vs_info_seq_show, -}; - -static int ip_vs_info_open(struct inode *inode, struct file *file) -{ - return seq_open_private(file, &ip_vs_info_seq_ops, - sizeof(struct ip_vs_iter)); -} - -static const struct file_operations ip_vs_info_fops = { - .owner = THIS_MODULE, - .open = ip_vs_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -#endif - -struct ip_vs_stats ip_vs_stats = { - .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), -}; - -#ifdef CONFIG_PROC_FS -static int ip_vs_stats_show(struct seq_file *seq, void *v) -{ - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ - seq_puts(seq, - " Total Incoming Outgoing Incoming Outgoing\n"); - seq_printf(seq, - " Conns Packets Packets Bytes Bytes\n"); - - spin_lock_bh(&ip_vs_stats.lock); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, - ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, - (unsigned long long) ip_vs_stats.ustats.inbytes, - (unsigned long long) ip_vs_stats.ustats.outbytes); - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ - seq_puts(seq, - " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq,"%8X %8X %8X %16X %16X\n", - ip_vs_stats.ustats.cps, - ip_vs_stats.ustats.inpps, - ip_vs_stats.ustats.outpps, - ip_vs_stats.ustats.inbps, - ip_vs_stats.ustats.outbps); - spin_unlock_bh(&ip_vs_stats.lock); - - return 0; -} - -static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) -{ - return single_open(file, ip_vs_stats_show, NULL); -} - -static const struct file_operations ip_vs_stats_fops = { - .owner = THIS_MODULE, - .open = ip_vs_stats_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -#endif - -/* - * Set timeout values for tcp tcpfin udp in the timeout_table. - */ -static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) -{ - IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", - u->tcp_timeout, - u->tcp_fin_timeout, - u->udp_timeout); - -#ifdef CONFIG_IP_VS_PROTO_TCP - if (u->tcp_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] - = u->tcp_timeout * HZ; - } - - if (u->tcp_fin_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] - = u->tcp_fin_timeout * HZ; - } -#endif - -#ifdef CONFIG_IP_VS_PROTO_UDP - if (u->udp_timeout) { - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] - = u->udp_timeout * HZ; - } -#endif - return 0; -} - - -#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) -#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ - sizeof(struct ip_vs_dest_user)) -#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) -#define MAX_ARG_LEN SVCDEST_ARG_LEN - -static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { - [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, - [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, -}; - -static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, - struct ip_vs_service_user *usvc_compat) -{ - usvc->af = AF_INET; - usvc->protocol = usvc_compat->protocol; - usvc->addr.ip = usvc_compat->addr; - usvc->port = usvc_compat->port; - usvc->fwmark = usvc_compat->fwmark; - - /* Deep copy of sched_name is not needed here */ - usvc->sched_name = usvc_compat->sched_name; - - usvc->flags = usvc_compat->flags; - usvc->timeout = usvc_compat->timeout; - usvc->netmask = usvc_compat->netmask; -} - -static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, - struct ip_vs_dest_user *udest_compat) -{ - udest->addr.ip = udest_compat->addr; - udest->port = udest_compat->port; - udest->conn_flags = udest_compat->conn_flags; - udest->weight = udest_compat->weight; - udest->u_threshold = udest_compat->u_threshold; - udest->l_threshold = udest_compat->l_threshold; -} - -static int -do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) -{ - int ret; - unsigned char arg[MAX_ARG_LEN]; - struct ip_vs_service_user *usvc_compat; - struct ip_vs_service_user_kern usvc; - struct ip_vs_service *svc; - struct ip_vs_dest_user *udest_compat; - struct ip_vs_dest_user_kern udest; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (len != set_arglen[SET_CMDID(cmd)]) { - IP_VS_ERR("set_ctl: len %u != %u\n", - len, set_arglen[SET_CMDID(cmd)]); - return -EINVAL; - } - - if (copy_from_user(arg, user, len) != 0) - return -EFAULT; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - if (mutex_lock_interruptible(&__ip_vs_mutex)) { - ret = -ERESTARTSYS; - goto out_dec; - } - - if (cmd == IP_VS_SO_SET_FLUSH) { - /* Flush the virtual service */ - ret = ip_vs_flush(); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_TIMEOUT) { - /* Set timeout values for (tcp tcpfin udp) */ - ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = stop_sync_thread(dm->state); - goto out_unlock; - } - - usvc_compat = (struct ip_vs_service_user *)arg; - udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); - - /* We only use the new structs internally, so copy userspace compat - * structs to extended internal versions */ - ip_vs_copy_usvc_compat(&usvc, usvc_compat); - ip_vs_copy_udest_compat(&udest, udest_compat); - - if (cmd == IP_VS_SO_SET_ZERO) { - /* if no service address is set, zero counters in all */ - if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { - ret = ip_vs_zero_all(); - goto out_unlock; - } - } - - /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ - if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) { - IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", - usvc.protocol, NIPQUAD(usvc.addr.ip), - ntohs(usvc.port), usvc.sched_name); - ret = -EFAULT; - goto out_unlock; - } - - /* Lookup the exact service by or fwmark */ - if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); - else - svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - - if (cmd != IP_VS_SO_SET_ADD - && (svc == NULL || svc->protocol != usvc.protocol)) { - ret = -ESRCH; - goto out_unlock; - } - - switch (cmd) { - case IP_VS_SO_SET_ADD: - if (svc != NULL) - ret = -EEXIST; - else - ret = ip_vs_add_service(&usvc, &svc); - break; - case IP_VS_SO_SET_EDIT: - ret = ip_vs_edit_service(svc, &usvc); - break; - case IP_VS_SO_SET_DEL: - ret = ip_vs_del_service(svc); - if (!ret) - goto out_unlock; - break; - case IP_VS_SO_SET_ZERO: - ret = ip_vs_zero_service(svc); - break; - case IP_VS_SO_SET_ADDDEST: - ret = ip_vs_add_dest(svc, &udest); - break; - case IP_VS_SO_SET_EDITDEST: - ret = ip_vs_edit_dest(svc, &udest); - break; - case IP_VS_SO_SET_DELDEST: - ret = ip_vs_del_dest(svc, &udest); - break; - default: - ret = -EINVAL; - } - - if (svc) - ip_vs_service_put(svc); - - out_unlock: - mutex_unlock(&__ip_vs_mutex); - out_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return ret; -} - - -static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) -{ - spin_lock_bh(&src->lock); - memcpy(dst, &src->ustats, sizeof(*dst)); - spin_unlock_bh(&src->lock); -} - -static void -ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) -{ - dst->protocol = src->protocol; - dst->addr = src->addr.ip; - dst->port = src->port; - dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); - dst->flags = src->flags; - dst->timeout = src->timeout / HZ; - dst->netmask = src->netmask; - dst->num_dests = src->num_dests; - ip_vs_copy_stats(&dst->stats, &src->stats); -} - -static inline int -__ip_vs_get_service_entries(const struct ip_vs_get_services *get, - struct ip_vs_get_services __user *uptr) -{ - int idx, count=0; - struct ip_vs_service *svc; - struct ip_vs_service_entry entry; - int ret = 0; - - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) - continue; - - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } - - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) - continue; - - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } - out: - return ret; -} - -static inline int -__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, - struct ip_vs_get_dests __user *uptr) -{ - struct ip_vs_service *svc; - union nf_inet_addr addr = { .ip = get->addr }; - int ret = 0; - - if (get->fwmark) - svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); - else - svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, - get->port); - - if (svc) { - int count = 0; - struct ip_vs_dest *dest; - struct ip_vs_dest_entry entry; - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (count >= get->num_dests) - break; - - entry.addr = dest->addr.ip; - entry.port = dest->port; - entry.conn_flags = atomic_read(&dest->conn_flags); - entry.weight = atomic_read(&dest->weight); - entry.u_threshold = dest->u_threshold; - entry.l_threshold = dest->l_threshold; - entry.activeconns = atomic_read(&dest->activeconns); - entry.inactconns = atomic_read(&dest->inactconns); - entry.persistconns = atomic_read(&dest->persistconns); - ip_vs_copy_stats(&entry.stats, &dest->stats); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - break; - } - count++; - } - ip_vs_service_put(svc); - } else - ret = -ESRCH; - return ret; -} - -static inline void -__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) -{ -#ifdef CONFIG_IP_VS_PROTO_TCP - u->tcp_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; - u->tcp_fin_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - u->udp_timeout = - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; -#endif -} - - -#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) -#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) -#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) -#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) -#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) - -static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { - [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, - [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, -}; - -static int -do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - unsigned char arg[128]; - int ret = 0; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (*len < get_arglen[GET_CMDID(cmd)]) { - IP_VS_ERR("get_ctl: len %u < %u\n", - *len, get_arglen[GET_CMDID(cmd)]); - return -EINVAL; - } - - if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) - return -EFAULT; - - if (mutex_lock_interruptible(&__ip_vs_mutex)) - return -ERESTARTSYS; - - switch (cmd) { - case IP_VS_SO_GET_VERSION: - { - char buf[64]; - - sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); - if (copy_to_user(user, buf, strlen(buf)+1) != 0) { - ret = -EFAULT; - goto out; - } - *len = strlen(buf)+1; - } - break; - - case IP_VS_SO_GET_INFO: - { - struct ip_vs_getinfo info; - info.version = IP_VS_VERSION_CODE; - info.size = IP_VS_CONN_TAB_SIZE; - info.num_services = ip_vs_num_services; - if (copy_to_user(user, &info, sizeof(info)) != 0) - ret = -EFAULT; - } - break; - - case IP_VS_SO_GET_SERVICES: - { - struct ip_vs_get_services *get; - int size; - - get = (struct ip_vs_get_services *)arg; - size = sizeof(*get) + - sizeof(struct ip_vs_service_entry) * get->num_services; - if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_service_entries(get, user); - } - break; - - case IP_VS_SO_GET_SERVICE: - { - struct ip_vs_service_entry *entry; - struct ip_vs_service *svc; - union nf_inet_addr addr; - - entry = (struct ip_vs_service_entry *)arg; - addr.ip = entry->addr; - if (entry->fwmark) - svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); - else - svc = __ip_vs_service_get(AF_INET, entry->protocol, - &addr, entry->port); - if (svc) { - ip_vs_copy_service(entry, svc); - if (copy_to_user(user, entry, sizeof(*entry)) != 0) - ret = -EFAULT; - ip_vs_service_put(svc); - } else - ret = -ESRCH; - } - break; - - case IP_VS_SO_GET_DESTS: - { - struct ip_vs_get_dests *get; - int size; - - get = (struct ip_vs_get_dests *)arg; - size = sizeof(*get) + - sizeof(struct ip_vs_dest_entry) * get->num_dests; - if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_dest_entries(get, user); - } - break; - - case IP_VS_SO_GET_TIMEOUT: - { - struct ip_vs_timeout_user t; - - __ip_vs_get_timeouts(&t); - if (copy_to_user(user, &t, sizeof(t)) != 0) - ret = -EFAULT; - } - break; - - case IP_VS_SO_GET_DAEMON: - { - struct ip_vs_daemon_user d[2]; - - memset(&d, 0, sizeof(d)); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) { - d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ip_vs_master_syncid; - } - if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { - d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ip_vs_backup_syncid; - } - if (copy_to_user(user, &d, sizeof(d)) != 0) - ret = -EFAULT; - } - break; - - default: - ret = -EINVAL; - } - - out: - mutex_unlock(&__ip_vs_mutex); - return ret; -} - - -static struct nf_sockopt_ops ip_vs_sockopts = { - .pf = PF_INET, - .set_optmin = IP_VS_BASE_CTL, - .set_optmax = IP_VS_SO_SET_MAX+1, - .set = do_ip_vs_set_ctl, - .get_optmin = IP_VS_BASE_CTL, - .get_optmax = IP_VS_SO_GET_MAX+1, - .get = do_ip_vs_get_ctl, - .owner = THIS_MODULE, -}; - -/* - * Generic Netlink interface - */ - -/* IPVS genetlink family */ -static struct genl_family ip_vs_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = IPVS_GENL_NAME, - .version = IPVS_GENL_VERSION, - .maxattr = IPVS_CMD_MAX, -}; - -/* Policy used for first-level command attributes */ -static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { - [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, - [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, - [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, - [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, - [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, - [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, -}; - -/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ -static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { - [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, - [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, - .len = IP_VS_IFNAME_MAXLEN }, - [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, -}; - -/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ -static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { - [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, - [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, - [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, - .len = sizeof(union nf_inet_addr) }, - [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, - [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, - [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, - .len = IP_VS_SCHEDNAME_MAXLEN }, - [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, - .len = sizeof(struct ip_vs_flags) }, - [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, - [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, - [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, -}; - -/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ -static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { - [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, - .len = sizeof(union nf_inet_addr) }, - [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, - [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, - [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, -}; - -static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, - struct ip_vs_stats *stats) -{ - struct nlattr *nl_stats = nla_nest_start(skb, container_type); - if (!nl_stats) - return -EMSGSIZE; - - spin_lock_bh(&stats->lock); - - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); - NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); - NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); - - spin_unlock_bh(&stats->lock); - - nla_nest_end(skb, nl_stats); - - return 0; - -nla_put_failure: - spin_unlock_bh(&stats->lock); - nla_nest_cancel(skb, nl_stats); - return -EMSGSIZE; -} - -static int ip_vs_genl_fill_service(struct sk_buff *skb, - struct ip_vs_service *svc) -{ - struct nlattr *nl_service; - struct ip_vs_flags flags = { .flags = svc->flags, - .mask = ~0 }; - - nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); - if (!nl_service) - return -EMSGSIZE; - - NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); - - if (svc->fwmark) { - NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); - } else { - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); - NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); - NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); - } - - NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); - NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); - NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); - - if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) - goto nla_put_failure; - - nla_nest_end(skb, nl_service); - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nl_service); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_service(struct sk_buff *skb, - struct ip_vs_service *svc, - struct netlink_callback *cb) -{ - void *hdr; - - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - &ip_vs_genl_family, NLM_F_MULTI, - IPVS_CMD_NEW_SERVICE); - if (!hdr) - return -EMSGSIZE; - - if (ip_vs_genl_fill_service(skb, svc) < 0) - goto nla_put_failure; - - return genlmsg_end(skb, hdr); - -nla_put_failure: - genlmsg_cancel(skb, hdr); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_services(struct sk_buff *skb, - struct netlink_callback *cb) -{ - int idx = 0, i; - int start = cb->args[0]; - struct ip_vs_service *svc; - - mutex_lock(&__ip_vs_mutex); - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { - if (++idx <= start) - continue; - if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { - idx--; - goto nla_put_failure; - } - } - } - - for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { - if (++idx <= start) - continue; - if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { - idx--; - goto nla_put_failure; - } - } - } - -nla_put_failure: - mutex_unlock(&__ip_vs_mutex); - cb->args[0] = idx; - - return skb->len; -} - -static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, - struct nlattr *nla, int full_entry) -{ - struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; - struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; - - /* Parse mandatory identifying service fields first */ - if (nla == NULL || - nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) - return -EINVAL; - - nla_af = attrs[IPVS_SVC_ATTR_AF]; - nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; - nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; - nla_port = attrs[IPVS_SVC_ATTR_PORT]; - nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; - - if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) - return -EINVAL; - - usvc->af = nla_get_u16(nla_af); -#ifdef CONFIG_IP_VS_IPV6 - if (usvc->af != AF_INET && usvc->af != AF_INET6) -#else - if (usvc->af != AF_INET) -#endif - return -EAFNOSUPPORT; - - if (nla_fwmark) { - usvc->protocol = IPPROTO_TCP; - usvc->fwmark = nla_get_u32(nla_fwmark); - } else { - usvc->protocol = nla_get_u16(nla_protocol); - nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); - usvc->port = nla_get_u16(nla_port); - usvc->fwmark = 0; - } - - /* If a full entry was requested, check for the additional fields */ - if (full_entry) { - struct nlattr *nla_sched, *nla_flags, *nla_timeout, - *nla_netmask; - struct ip_vs_flags flags; - struct ip_vs_service *svc; - - nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; - nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; - nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; - nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; - - if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) - return -EINVAL; - - nla_memcpy(&flags, nla_flags, sizeof(flags)); - - /* prefill flags from service if it already exists */ - if (usvc->fwmark) - svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark); - else - svc = __ip_vs_service_get(usvc->af, usvc->protocol, - &usvc->addr, usvc->port); - if (svc) { - usvc->flags = svc->flags; - ip_vs_service_put(svc); - } else - usvc->flags = 0; - - /* set new flags from userland */ - usvc->flags = (usvc->flags & ~flags.mask) | - (flags.flags & flags.mask); - usvc->sched_name = nla_data(nla_sched); - usvc->timeout = nla_get_u32(nla_timeout); - usvc->netmask = nla_get_u32(nla_netmask); - } - - return 0; -} - -static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) -{ - struct ip_vs_service_user_kern usvc; - int ret; - - ret = ip_vs_genl_parse_service(&usvc, nla, 0); - if (ret) - return ERR_PTR(ret); - - if (usvc.fwmark) - return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - else - return __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); -} - -static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) -{ - struct nlattr *nl_dest; - - nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); - if (!nl_dest) - return -EMSGSIZE; - - NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); - NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); - - NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, - atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, - atomic_read(&dest->activeconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, - atomic_read(&dest->inactconns)); - NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns)); - - if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) - goto nla_put_failure; - - nla_nest_end(skb, nl_dest); - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nl_dest); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, - struct netlink_callback *cb) -{ - void *hdr; - - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - &ip_vs_genl_family, NLM_F_MULTI, - IPVS_CMD_NEW_DEST); - if (!hdr) - return -EMSGSIZE; - - if (ip_vs_genl_fill_dest(skb, dest) < 0) - goto nla_put_failure; - - return genlmsg_end(skb, hdr); - -nla_put_failure: - genlmsg_cancel(skb, hdr); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_dests(struct sk_buff *skb, - struct netlink_callback *cb) -{ - int idx = 0; - int start = cb->args[0]; - struct ip_vs_service *svc; - struct ip_vs_dest *dest; - struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; - - mutex_lock(&__ip_vs_mutex); - - /* Try to find the service for which to dump destinations */ - if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, - IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) - goto out_err; - - svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); - if (IS_ERR(svc) || svc == NULL) - goto out_err; - - /* Dump the destinations */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (++idx <= start) - continue; - if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { - idx--; - goto nla_put_failure; - } - } - -nla_put_failure: - cb->args[0] = idx; - ip_vs_service_put(svc); - -out_err: - mutex_unlock(&__ip_vs_mutex); - - return skb->len; -} - -static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, - struct nlattr *nla, int full_entry) -{ - struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; - struct nlattr *nla_addr, *nla_port; - - /* Parse mandatory identifying destination fields first */ - if (nla == NULL || - nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) - return -EINVAL; - - nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; - nla_port = attrs[IPVS_DEST_ATTR_PORT]; - - if (!(nla_addr && nla_port)) - return -EINVAL; - - nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); - udest->port = nla_get_u16(nla_port); - - /* If a full entry was requested, check for the additional fields */ - if (full_entry) { - struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, - *nla_l_thresh; - - nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; - nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; - nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; - nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; - - if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) - return -EINVAL; - - udest->conn_flags = nla_get_u32(nla_fwd) - & IP_VS_CONN_F_FWD_MASK; - udest->weight = nla_get_u32(nla_weight); - udest->u_threshold = nla_get_u32(nla_u_thresh); - udest->l_threshold = nla_get_u32(nla_l_thresh); - } - - return 0; -} - -static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid) -{ - struct nlattr *nl_daemon; - - nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); - if (!nl_daemon) - return -EMSGSIZE; - - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); - NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); - NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); - - nla_nest_end(skb, nl_daemon); - - return 0; - -nla_put_failure: - nla_nest_cancel(skb, nl_daemon); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, - const char *mcast_ifn, __be32 syncid, - struct netlink_callback *cb) -{ - void *hdr; - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - &ip_vs_genl_family, NLM_F_MULTI, - IPVS_CMD_NEW_DAEMON); - if (!hdr) - return -EMSGSIZE; - - if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) - goto nla_put_failure; - - return genlmsg_end(skb, hdr); - -nla_put_failure: - genlmsg_cancel(skb, hdr); - return -EMSGSIZE; -} - -static int ip_vs_genl_dump_daemons(struct sk_buff *skb, - struct netlink_callback *cb) -{ - mutex_lock(&__ip_vs_mutex); - if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { - if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, - ip_vs_master_mcast_ifn, - ip_vs_master_syncid, cb) < 0) - goto nla_put_failure; - - cb->args[0] = 1; - } - - if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { - if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, - ip_vs_backup_mcast_ifn, - ip_vs_backup_syncid, cb) < 0) - goto nla_put_failure; - - cb->args[1] = 1; - } - -nla_put_failure: - mutex_unlock(&__ip_vs_mutex); - - return skb->len; -} - -static int ip_vs_genl_new_daemon(struct nlattr **attrs) -{ - if (!(attrs[IPVS_DAEMON_ATTR_STATE] && - attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && - attrs[IPVS_DAEMON_ATTR_SYNC_ID])) - return -EINVAL; - - return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), - nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), - nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); -} - -static int ip_vs_genl_del_daemon(struct nlattr **attrs) -{ - if (!attrs[IPVS_DAEMON_ATTR_STATE]) - return -EINVAL; - - return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); -} - -static int ip_vs_genl_set_config(struct nlattr **attrs) -{ - struct ip_vs_timeout_user t; - - __ip_vs_get_timeouts(&t); - - if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) - t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); - - if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) - t.tcp_fin_timeout = - nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); - - if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) - t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); - - return ip_vs_set_timeout(&t); -} - -static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) -{ - struct ip_vs_service *svc = NULL; - struct ip_vs_service_user_kern usvc; - struct ip_vs_dest_user_kern udest; - int ret = 0, cmd; - int need_full_svc = 0, need_full_dest = 0; - - cmd = info->genlhdr->cmd; - - mutex_lock(&__ip_vs_mutex); - - if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(); - goto out; - } else if (cmd == IPVS_CMD_SET_CONFIG) { - ret = ip_vs_genl_set_config(info->attrs); - goto out; - } else if (cmd == IPVS_CMD_NEW_DAEMON || - cmd == IPVS_CMD_DEL_DAEMON) { - - struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; - - if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || - nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, - info->attrs[IPVS_CMD_ATTR_DAEMON], - ip_vs_daemon_policy)) { - ret = -EINVAL; - goto out; - } - - if (cmd == IPVS_CMD_NEW_DAEMON) - ret = ip_vs_genl_new_daemon(daemon_attrs); - else - ret = ip_vs_genl_del_daemon(daemon_attrs); - goto out; - } else if (cmd == IPVS_CMD_ZERO && - !info->attrs[IPVS_CMD_ATTR_SERVICE]) { - ret = ip_vs_zero_all(); - goto out; - } - - /* All following commands require a service argument, so check if we - * received a valid one. We need a full service specification when - * adding / editing a service. Only identifying members otherwise. */ - if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) - need_full_svc = 1; - - ret = ip_vs_genl_parse_service(&usvc, - info->attrs[IPVS_CMD_ATTR_SERVICE], - need_full_svc); - if (ret) - goto out; - - /* Lookup the exact service by or fwmark */ - if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); - else - svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - - /* Unless we're adding a new service, the service must already exist */ - if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { - ret = -ESRCH; - goto out; - } - - /* Destination commands require a valid destination argument. For - * adding / editing a destination, we need a full destination - * specification. */ - if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || - cmd == IPVS_CMD_DEL_DEST) { - if (cmd != IPVS_CMD_DEL_DEST) - need_full_dest = 1; - - ret = ip_vs_genl_parse_dest(&udest, - info->attrs[IPVS_CMD_ATTR_DEST], - need_full_dest); - if (ret) - goto out; - } - - switch (cmd) { - case IPVS_CMD_NEW_SERVICE: - if (svc == NULL) - ret = ip_vs_add_service(&usvc, &svc); - else - ret = -EEXIST; - break; - case IPVS_CMD_SET_SERVICE: - ret = ip_vs_edit_service(svc, &usvc); - break; - case IPVS_CMD_DEL_SERVICE: - ret = ip_vs_del_service(svc); - break; - case IPVS_CMD_NEW_DEST: - ret = ip_vs_add_dest(svc, &udest); - break; - case IPVS_CMD_SET_DEST: - ret = ip_vs_edit_dest(svc, &udest); - break; - case IPVS_CMD_DEL_DEST: - ret = ip_vs_del_dest(svc, &udest); - break; - case IPVS_CMD_ZERO: - ret = ip_vs_zero_service(svc); - break; - default: - ret = -EINVAL; - } - -out: - if (svc) - ip_vs_service_put(svc); - mutex_unlock(&__ip_vs_mutex); - - return ret; -} - -static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) -{ - struct sk_buff *msg; - void *reply; - int ret, cmd, reply_cmd; - - cmd = info->genlhdr->cmd; - - if (cmd == IPVS_CMD_GET_SERVICE) - reply_cmd = IPVS_CMD_NEW_SERVICE; - else if (cmd == IPVS_CMD_GET_INFO) - reply_cmd = IPVS_CMD_SET_INFO; - else if (cmd == IPVS_CMD_GET_CONFIG) - reply_cmd = IPVS_CMD_SET_CONFIG; - else { - IP_VS_ERR("unknown Generic Netlink command\n"); - return -EINVAL; - } - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - mutex_lock(&__ip_vs_mutex); - - reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); - if (reply == NULL) - goto nla_put_failure; - - switch (cmd) { - case IPVS_CMD_GET_SERVICE: - { - struct ip_vs_service *svc; - - svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); - if (IS_ERR(svc)) { - ret = PTR_ERR(svc); - goto out_err; - } else if (svc) { - ret = ip_vs_genl_fill_service(msg, svc); - ip_vs_service_put(svc); - if (ret) - goto nla_put_failure; - } else { - ret = -ESRCH; - goto out_err; - } - - break; - } - - case IPVS_CMD_GET_CONFIG: - { - struct ip_vs_timeout_user t; - - __ip_vs_get_timeouts(&t); -#ifdef CONFIG_IP_VS_PROTO_TCP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, - t.tcp_fin_timeout); -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); -#endif - - break; - } - - case IPVS_CMD_GET_INFO: - NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); - NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - IP_VS_CONN_TAB_SIZE); - break; - } - - genlmsg_end(msg, reply); - ret = genlmsg_unicast(msg, info->snd_pid); - goto out; - -nla_put_failure: - IP_VS_ERR("not enough space in Netlink message\n"); - ret = -EMSGSIZE; - -out_err: - nlmsg_free(msg); -out: - mutex_unlock(&__ip_vs_mutex); - - return ret; -} - - -static struct genl_ops ip_vs_genl_ops[] __read_mostly = { - { - .cmd = IPVS_CMD_NEW_SERVICE, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_SET_SERVICE, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_DEL_SERVICE, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_SERVICE, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_get_cmd, - .dumpit = ip_vs_genl_dump_services, - .policy = ip_vs_cmd_policy, - }, - { - .cmd = IPVS_CMD_NEW_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_SET_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_DEL_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_DEST, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .dumpit = ip_vs_genl_dump_dests, - }, - { - .cmd = IPVS_CMD_NEW_DAEMON, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_DEL_DAEMON, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_DAEMON, - .flags = GENL_ADMIN_PERM, - .dumpit = ip_vs_genl_dump_daemons, - }, - { - .cmd = IPVS_CMD_SET_CONFIG, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_GET_CONFIG, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_get_cmd, - }, - { - .cmd = IPVS_CMD_GET_INFO, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_get_cmd, - }, - { - .cmd = IPVS_CMD_ZERO, - .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, - .doit = ip_vs_genl_set_cmd, - }, - { - .cmd = IPVS_CMD_FLUSH, - .flags = GENL_ADMIN_PERM, - .doit = ip_vs_genl_set_cmd, - }, -}; - -static int __init ip_vs_genl_register(void) -{ - int ret, i; - - ret = genl_register_family(&ip_vs_genl_family); - if (ret) - return ret; - - for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) { - ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]); - if (ret) - goto err_out; - } - return 0; - -err_out: - genl_unregister_family(&ip_vs_genl_family); - return ret; -} - -static void ip_vs_genl_unregister(void) -{ - genl_unregister_family(&ip_vs_genl_family); -} - -/* End of Generic Netlink interface definitions */ - - -int __init ip_vs_control_init(void) -{ - int ret; - int idx; - - EnterFunction(2); - - ret = nf_register_sockopt(&ip_vs_sockopts); - if (ret) { - IP_VS_ERR("cannot register sockopt.\n"); - return ret; - } - - ret = ip_vs_genl_register(); - if (ret) { - IP_VS_ERR("cannot register Generic Netlink interface.\n"); - nf_unregister_sockopt(&ip_vs_sockopts); - return ret; - } - - proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); - - /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_rtable[idx]); - } - - ip_vs_new_estimator(&ip_vs_stats); - - /* Hook the defense timer */ - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); - - LeaveFunction(2); - return 0; -} - - -void ip_vs_control_cleanup(void) -{ - EnterFunction(2); - ip_vs_trash_cleanup(); - cancel_rearming_delayed_work(&defense_work); - cancel_work_sync(&defense_work.work); - ip_vs_kill_estimator(&ip_vs_stats); - unregister_sysctl_table(sysctl_header); - proc_net_remove(&init_net, "ip_vs_stats"); - proc_net_remove(&init_net, "ip_vs"); - ip_vs_genl_unregister(); - nf_unregister_sockopt(&ip_vs_sockopts); - LeaveFunction(2); -} diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c deleted file mode 100644 index 2eb2860dabb..00000000000 --- a/net/ipv4/ipvs/ip_vs_est.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * ip_vs_est.c: simple rate estimator for IPVS - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - This code is to estimate rate in a shorter interval (such as 8 - seconds) for virtual services and real servers. For measure rate in a - long interval, it is easy to implement a user level daemon which - periodically reads those statistical counters and measure rate. - - Currently, the measurement is activated by slow timer handler. Hope - this measurement will not introduce too much load. - - We measure rate during the last 8 seconds every 2 seconds: - - avgrate = avgrate*(1-W) + rate*W - - where W = 2^(-2) - - NOTES. - - * The stored value for average bps is scaled by 2^5, so that maximal - rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. - - * A lot code is taken from net/sched/estimator.c - */ - - -static void estimation_timer(unsigned long arg); - -static LIST_HEAD(est_list); -static DEFINE_SPINLOCK(est_lock); -static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); - -static void estimation_timer(unsigned long arg) -{ - struct ip_vs_estimator *e; - struct ip_vs_stats *s; - u32 n_conns; - u32 n_inpkts, n_outpkts; - u64 n_inbytes, n_outbytes; - u32 rate; - - spin_lock(&est_lock); - list_for_each_entry(e, &est_list, list) { - s = container_of(e, struct ip_vs_stats, est); - - spin_lock(&s->lock); - n_conns = s->ustats.conns; - n_inpkts = s->ustats.inpkts; - n_outpkts = s->ustats.outpkts; - n_inbytes = s->ustats.inbytes; - n_outbytes = s->ustats.outbytes; - - /* scaled by 2^10, but divided 2 seconds */ - rate = (n_conns - e->last_conns)<<9; - e->last_conns = n_conns; - e->cps += ((long)rate - (long)e->cps)>>2; - s->ustats.cps = (e->cps+0x1FF)>>10; - - rate = (n_inpkts - e->last_inpkts)<<9; - e->last_inpkts = n_inpkts; - e->inpps += ((long)rate - (long)e->inpps)>>2; - s->ustats.inpps = (e->inpps+0x1FF)>>10; - - rate = (n_outpkts - e->last_outpkts)<<9; - e->last_outpkts = n_outpkts; - e->outpps += ((long)rate - (long)e->outpps)>>2; - s->ustats.outpps = (e->outpps+0x1FF)>>10; - - rate = (n_inbytes - e->last_inbytes)<<4; - e->last_inbytes = n_inbytes; - e->inbps += ((long)rate - (long)e->inbps)>>2; - s->ustats.inbps = (e->inbps+0xF)>>5; - - rate = (n_outbytes - e->last_outbytes)<<4; - e->last_outbytes = n_outbytes; - e->outbps += ((long)rate - (long)e->outbps)>>2; - s->ustats.outbps = (e->outbps+0xF)>>5; - spin_unlock(&s->lock); - } - spin_unlock(&est_lock); - mod_timer(&est_timer, jiffies + 2*HZ); -} - -void ip_vs_new_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - INIT_LIST_HEAD(&est->list); - - est->last_conns = stats->ustats.conns; - est->cps = stats->ustats.cps<<10; - - est->last_inpkts = stats->ustats.inpkts; - est->inpps = stats->ustats.inpps<<10; - - est->last_outpkts = stats->ustats.outpkts; - est->outpps = stats->ustats.outpps<<10; - - est->last_inbytes = stats->ustats.inbytes; - est->inbps = stats->ustats.inbps<<5; - - est->last_outbytes = stats->ustats.outbytes; - est->outbps = stats->ustats.outbps<<5; - - spin_lock_bh(&est_lock); - list_add(&est->list, &est_list); - spin_unlock_bh(&est_lock); -} - -void ip_vs_kill_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - spin_lock_bh(&est_lock); - list_del(&est->list); - spin_unlock_bh(&est_lock); -} - -void ip_vs_zero_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - /* set counters zero, caller must hold the stats->lock lock */ - est->last_inbytes = 0; - est->last_outbytes = 0; - est->last_conns = 0; - est->last_inpkts = 0; - est->last_outpkts = 0; - est->cps = 0; - est->inpps = 0; - est->outpps = 0; - est->inbps = 0; - est->outbps = 0; -} - -int __init ip_vs_estimator_init(void) -{ - mod_timer(&est_timer, jiffies + 2 * HZ); - return 0; -} - -void ip_vs_estimator_cleanup(void) -{ - del_timer_sync(&est_timer); -} diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c deleted file mode 100644 index 2e7dbd8b73a..00000000000 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ /dev/null @@ -1,410 +0,0 @@ -/* - * ip_vs_ftp.c: IPVS ftp application module - * - * Authors: Wensong Zhang - * - * Changes: - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference - * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. - * - * IP_MASQ_FTP ftp masquerading module - * - * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 - * - * Author: Wouter Gadeyne - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - -#define SERVER_STRING "227 Entering Passive Mode (" -#define CLIENT_STRING "PORT " - - -/* - * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper - * First port is set to the default port. - */ -static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; -module_param_array(ports, ushort, NULL, 0); -MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); - - -/* Dummy variable */ -static int ip_vs_ftp_pasv; - - -static int -ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) -{ - return 0; -} - - -static int -ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) -{ - return 0; -} - - -/* - * Get from the string "xxx.xxx.xxx.xxx,ppp,ppp", started - * with the "pattern" and terminated with the "term" character. - * is in network order. - */ -static int ip_vs_ftp_get_addrport(char *data, char *data_limit, - const char *pattern, size_t plen, char term, - __be32 *addr, __be16 *port, - char **start, char **end) -{ - unsigned char p[6]; - int i = 0; - - if (data_limit - data < plen) { - /* check if there is partial match */ - if (strnicmp(data, pattern, data_limit - data) == 0) - return -1; - else - return 0; - } - - if (strnicmp(data, pattern, plen) != 0) { - return 0; - } - *start = data + plen; - - for (data = *start; *data != term; data++) { - if (data == data_limit) - return -1; - } - *end = data; - - memset(p, 0, sizeof(p)); - for (data = *start; data != *end; data++) { - if (*data >= '0' && *data <= '9') { - p[i] = p[i]*10 + *data - '0'; - } else if (*data == ',' && i < 5) { - i++; - } else { - /* unexpected character */ - return -1; - } - } - - if (i != 5) - return -1; - - *addr = get_unaligned((__be32 *)p); - *port = get_unaligned((__be16 *)(p + 4)); - return 1; -} - - -/* - * Look at outgoing ftp packets to catch the response to a PASV command - * from the server (inside-to-outside). - * When we see one, we build a connection entry with the client address, - * client port 0 (unknown at the moment), the server address and the - * server port. Mark the current connection entry as a control channel - * of the new entry. All this work is just to make the data connection - * can be scheduled to the right server later. - * - * The outgoing packet should be something like - * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". - * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. - */ -static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) -{ - struct iphdr *iph; - struct tcphdr *th; - char *data, *data_limit; - char *start, *end; - union nf_inet_addr from; - __be16 port; - struct ip_vs_conn *n_cp; - char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ - unsigned buf_len; - int ret; - -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - - *diff = 0; - - /* Only useful for established sessions */ - if (cp->state != IP_VS_TCP_S_ESTABLISHED) - return 1; - - /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) - return 0; - - if (cp->app_data == &ip_vs_ftp_pasv) { - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - data = (char *)th + (th->doff << 2); - data_limit = skb_tail_pointer(skb); - - if (ip_vs_ftp_get_addrport(data, data_limit, - SERVER_STRING, - sizeof(SERVER_STRING)-1, ')', - &from.ip, &port, - &start, &end) != 1) - return 1; - - IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " - "%u.%u.%u.%u:%d detected\n", - NIPQUAD(from.ip), ntohs(port), - NIPQUAD(cp->caddr.ip), 0); - - /* - * Now update or create an connection entry for it - */ - n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, - &cp->caddr, 0); - if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &cp->caddr, 0, - &cp->vaddr, port, - &from, port, - IP_VS_CONN_F_NO_CPORT, - cp->dest); - if (!n_cp) - return 0; - - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } - - /* - * Replace the old passive address with the new one - */ - from.ip = n_cp->vaddr.ip; - port = n_cp->vport; - sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip), - (ntohs(port)>>8)&255, ntohs(port)&255); - buf_len = strlen(buf); - - /* - * Calculate required delta-offset to keep TCP happy - */ - *diff = buf_len - (end-start); - - if (*diff == 0) { - /* simply replace it with new passive address */ - memcpy(start, buf, buf_len); - ret = 1; - } else { - ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, - end-start, buf, buf_len); - } - - cp->app_data = NULL; - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - return ret; - } - return 1; -} - - -/* - * Look at incoming ftp packets to catch the PASV/PORT command - * (outside-to-inside). - * - * The incoming packet having the PORT command should be something like - * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". - * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. - * In this case, we create a connection entry using the client address and - * port, so that the active ftp data connection from the server can reach - * the client. - */ -static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) -{ - struct iphdr *iph; - struct tcphdr *th; - char *data, *data_start, *data_limit; - char *start, *end; - union nf_inet_addr to; - __be16 port; - struct ip_vs_conn *n_cp; - -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - - /* no diff required for incoming packets */ - *diff = 0; - - /* Only useful for established sessions */ - if (cp->state != IP_VS_TCP_S_ESTABLISHED) - return 1; - - /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) - return 0; - - /* - * Detecting whether it is passive - */ - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - - /* Since there may be OPTIONS in the TCP packet and the HLEN is - the length of the header in 32-bit multiples, it is accurate - to calculate data address by th+HLEN*4 */ - data = data_start = (char *)th + (th->doff << 2); - data_limit = skb_tail_pointer(skb); - - while (data <= data_limit - 6) { - if (strnicmp(data, "PASV\r\n", 6) == 0) { - /* Passive mode on */ - IP_VS_DBG(7, "got PASV at %td of %td\n", - data - data_start, - data_limit - data_start); - cp->app_data = &ip_vs_ftp_pasv; - return 1; - } - data++; - } - - /* - * To support virtual FTP server, the scenerio is as follows: - * FTP client ----> Load Balancer ----> FTP server - * First detect the port number in the application data, - * then create a new connection entry for the coming data - * connection. - */ - if (ip_vs_ftp_get_addrport(data_start, data_limit, - CLIENT_STRING, sizeof(CLIENT_STRING)-1, - '\r', &to.ip, &port, - &start, &end) != 1) - return 1; - - IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", - NIPQUAD(to.ip), ntohs(port)); - - /* Passive mode off */ - cp->app_data = NULL; - - /* - * Now update or create a connection entry for it - */ - IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", - ip_vs_proto_name(iph->protocol), - NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); - - n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, - &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1)); - if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1), - &cp->daddr, htons(ntohs(cp->dport)-1), - 0, - cp->dest); - if (!n_cp) - return 0; - - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } - - /* - * Move tunnel to listen state - */ - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - - return 1; -} - - -static struct ip_vs_app ip_vs_ftp = { - .name = "ftp", - .type = IP_VS_APP_TYPE_FTP, - .protocol = IPPROTO_TCP, - .module = THIS_MODULE, - .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), - .init_conn = ip_vs_ftp_init_conn, - .done_conn = ip_vs_ftp_done_conn, - .bind_conn = NULL, - .unbind_conn = NULL, - .pkt_out = ip_vs_ftp_out, - .pkt_in = ip_vs_ftp_in, -}; - - -/* - * ip_vs_ftp initialization - */ -static int __init ip_vs_ftp_init(void) -{ - int i, ret; - struct ip_vs_app *app = &ip_vs_ftp; - - ret = register_ip_vs_app(app); - if (ret) - return ret; - - for (i=0; iprotocol, ports[i]); - if (ret) - break; - IP_VS_INFO("%s: loaded support on port[%d] = %d\n", - app->name, i, ports[i]); - } - - if (ret) - unregister_ip_vs_app(app); - - return ret; -} - - -/* - * ip_vs_ftp finish. - */ -static void __exit ip_vs_ftp_exit(void) -{ - unregister_ip_vs_app(&ip_vs_ftp); -} - - -module_init(ip_vs_ftp_init); -module_exit(ip_vs_ftp_exit); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c deleted file mode 100644 index 6ecef3518ca..00000000000 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ /dev/null @@ -1,555 +0,0 @@ -/* - * IPVS: Locality-Based Least-Connection scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Martin Hamilton : fixed the terrible locking bugs - * *lock(tbl->lock) ==> *lock(&tbl->lock) - * Wensong Zhang : fixed the uninitilized tbl->lock bug - * Wensong Zhang : added doing full expiration check to - * collect stale entries of 24+ hours when - * no partial expire check in a half hour - * Julian Anastasov : replaced del_timer call with del_timer_sync - * to avoid the possible race between timer - * handler and del_timer thread in SMP - * - */ - -/* - * The lblc algorithm is as follows (pseudo code): - * - * if cachenode[dest_ip] is null then - * n, cachenode[dest_ip] <- {weighted least-conn node}; - * else - * n <- cachenode[dest_ip]; - * if (n is dead) OR - * (n.conns>n.weight AND - * there is a node m with m.conns -#include -#include -#include -#include - -/* for sysctl */ -#include -#include - -#include - - -/* - * It is for garbage collection of stale IPVS lblc entries, - * when the table is full. - */ -#define CHECK_EXPIRE_INTERVAL (60*HZ) -#define ENTRY_TIMEOUT (6*60*HZ) - -/* - * It is for full expiration check. - * When there is no partial expiration check (garbage collection) - * in a half hour, do a full expiration check to collect stale - * entries that haven't been touched for a day. - */ -#define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; - - -/* - * for IPVS lblc entry hash table - */ -#ifndef CONFIG_IP_VS_LBLC_TAB_BITS -#define CONFIG_IP_VS_LBLC_TAB_BITS 10 -#endif -#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS -#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) -#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) - - -/* - * IPVS lblc entry represents an association between destination - * IP address and its destination server - */ -struct ip_vs_lblc_entry { - struct list_head list; - __be32 addr; /* destination IP address */ - struct ip_vs_dest *dest; /* real server (cache) */ - unsigned long lastuse; /* last used time */ -}; - - -/* - * IPVS lblc hash table - */ -struct ip_vs_lblc_table { - struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ - atomic_t entries; /* number of entries */ - int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ - int rover; /* rover for expire check */ - int counter; /* counter for no expire */ -}; - - -/* - * IPVS LBLC sysctl table - */ - -static ctl_table vs_vars_table[] = { - { - .procname = "lblc_expiration", - .data = &sysctl_ip_vs_lblc_expiration, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header * sysctl_header; - -static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) -{ - list_del(&en->list); - /* - * We don't kfree dest because it is refered either by its service - * or the trash dest list. - */ - atomic_dec(&en->dest->refcnt); - kfree(en); -} - - -/* - * Returns hash value for IPVS LBLC entry - */ -static inline unsigned ip_vs_lblc_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; -} - - -/* - * Hash an entry in the ip_vs_lblc_table. - * returns bool success. - */ -static void -ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) -{ - unsigned hash = ip_vs_lblc_hashkey(en->addr); - - list_add(&en->list, &tbl->bucket[hash]); - atomic_inc(&tbl->entries); -} - - -/* - * Get ip_vs_lblc_entry associated with supplied parameters. Called under read - * lock - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) -{ - unsigned hash = ip_vs_lblc_hashkey(addr); - struct ip_vs_lblc_entry *en; - - list_for_each_entry(en, &tbl->bucket[hash], list) - if (en->addr == addr) - return en; - - return NULL; -} - - -/* - * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP - * address to a server. Called under write lock. - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr, - struct ip_vs_dest *dest) -{ - struct ip_vs_lblc_entry *en; - - en = ip_vs_lblc_get(tbl, daddr); - if (!en) { - en = kmalloc(sizeof(*en), GFP_ATOMIC); - if (!en) { - IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); - return NULL; - } - - en->addr = daddr; - en->lastuse = jiffies; - - atomic_inc(&dest->refcnt); - en->dest = dest; - - ip_vs_lblc_hash(tbl, en); - } else if (en->dest != dest) { - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; - } - - return en; -} - - -/* - * Flush all the entries of the specified table. - */ -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) -{ - struct ip_vs_lblc_entry *en, *nxt; - int i; - - for (i=0; ibucket[i], list) { - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - } - } -} - - -static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) -{ - struct ip_vs_lblc_table *tbl = svc->sched_data; - struct ip_vs_lblc_entry *en, *nxt; - unsigned long now = jiffies; - int i, j; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, - en->lastuse + sysctl_ip_vs_lblc_expiration)) - continue; - - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&svc->sched_lock); - } - tbl->rover = j; -} - - -/* - * Periodical timer handler for IPVS lblc table - * It is used to collect stale entries when the number of entries - * exceeds the maximum size of the table. - * - * Fixme: we probably need more complicated algorithm to collect - * entries that have not been used for a long time even - * if the number of entries doesn't exceed the maximum size - * of the table. - * The full expiration check is for this purpose now. - */ -static void ip_vs_lblc_check_expire(unsigned long data) -{ - struct ip_vs_service *svc = (struct ip_vs_service *) data; - struct ip_vs_lblc_table *tbl = svc->sched_data; - unsigned long now = jiffies; - int goal; - int i, j; - struct ip_vs_lblc_entry *en, *nxt; - - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { - /* do full expiration check */ - ip_vs_lblc_full_check(svc); - tbl->counter = 1; - goto out; - } - - if (atomic_read(&tbl->entries) <= tbl->max_size) { - tbl->counter++; - goto out; - } - - goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; - if (goal > tbl->max_size/2) - goal = tbl->max_size/2; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) - continue; - - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - goal--; - } - write_unlock(&svc->sched_lock); - if (goal <= 0) - break; - } - tbl->rover = j; - - out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); -} - - -static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) -{ - int i; - struct ip_vs_lblc_table *tbl; - - /* - * Allocate the ip_vs_lblc_table for this service - */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " - "current service\n", sizeof(*tbl)); - - /* - * Initialize the hash buckets - */ - for (i=0; ibucket[i]); - } - tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; - tbl->rover = 0; - tbl->counter = 1; - - /* - * Hook periodic timer for garbage collection - */ - setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, - (unsigned long)svc); - mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); - - return 0; -} - - -static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_lblc_table *tbl = svc->sched_data; - - /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); - - /* got to clean up table entries here */ - ip_vs_lblc_flush(tbl); - - /* release the table itself */ - kfree(tbl); - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", - sizeof(*tbl)); - - return 0; -} - - -static inline struct ip_vs_dest * -__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) -{ - struct ip_vs_dest *dest, *least; - int loh, doh; - - /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connection. - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - if (atomic_read(&dest->weight) > 0) { - least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -/* - * If this destination server is overloaded and there is a less loaded - * server, then return true. - */ -static inline int -is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { - struct ip_vs_dest *d; - - list_for_each_entry(d, &svc->destinations, n_list) { - if (atomic_read(&d->activeconns)*2 - < atomic_read(&d->weight)) { - return 1; - } - } - } - return 0; -} - - -/* - * Locality-Based (weighted) Least-Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_lblc_table *tbl = svc->sched_data; - struct iphdr *iph = ip_hdr(skb); - struct ip_vs_dest *dest = NULL; - struct ip_vs_lblc_entry *en; - - IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); - - /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblc_get(tbl, iph->daddr); - if (en) { - /* We only hold a read lock, but this is atomic */ - en->lastuse = jiffies; - - /* - * If the destination is not available, i.e. it's in the trash, - * we must ignore it, as it may be removed from under our feet, - * if someone drops our reference count. Our caller only makes - * sure that destinations, that are not in the trash, are not - * moved to the trash, while we are scheduling. But anyone can - * free up entries from the trash at any time. - */ - - if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) - dest = en->dest; - } - read_unlock(&svc->sched_lock); - - /* If the destination has a weight and is not overloaded, use it */ - if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) - goto out; - - /* No cache entry or it is invalid, time to schedule */ - dest = __ip_vs_lblc_schedule(svc, iph); - if (!dest) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - - /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblc_new(tbl, iph->daddr, dest); - write_unlock(&svc->sched_lock); - -out: - IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->daddr), - NIPQUAD(dest->addr.ip), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS LBLC Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ - .name = "lblc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 0, -#endif - .init_service = ip_vs_lblc_init_svc, - .done_service = ip_vs_lblc_done_svc, - .schedule = ip_vs_lblc_schedule, -}; - - -static int __init ip_vs_lblc_init(void) -{ - int ret; - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); - ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); - if (ret) - unregister_sysctl_table(sysctl_header); - return ret; -} - - -static void __exit ip_vs_lblc_cleanup(void) -{ - unregister_sysctl_table(sysctl_header); - unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); -} - - -module_init(ip_vs_lblc_init); -module_exit(ip_vs_lblc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c deleted file mode 100644 index 1f75ea83bcf..00000000000 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ /dev/null @@ -1,755 +0,0 @@ -/* - * IPVS: Locality-Based Least-Connection with Replication scheduler - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Julian Anastasov : Added the missing (dest->weight>0) - * condition in the ip_vs_dest_set_max. - * - */ - -/* - * The lblc/r algorithm is as follows (pseudo code): - * - * if serverSet[dest_ip] is null then - * n, serverSet[dest_ip] <- {weighted least-conn node}; - * else - * n <- {least-conn (alive) node in serverSet[dest_ip]}; - * if (n is null) OR - * (n.conns>n.weight AND - * there is a node m with m.conns 1 AND - * now - serverSet[dest_ip].lastMod > T then - * m <- {most conn node in serverSet[dest_ip]}; - * remove m from serverSet[dest_ip]; - * if serverSet[dest_ip] changed then - * serverSet[dest_ip].lastMod <- now; - * - * return n; - * - */ - -#include -#include -#include -#include -#include - -/* for sysctl */ -#include -#include -#include - -#include - - -/* - * It is for garbage collection of stale IPVS lblcr entries, - * when the table is full. - */ -#define CHECK_EXPIRE_INTERVAL (60*HZ) -#define ENTRY_TIMEOUT (6*60*HZ) - -/* - * It is for full expiration check. - * When there is no partial expiration check (garbage collection) - * in a half hour, do a full expiration check to collect stale - * entries that haven't been touched for a day. - */ -#define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; - - -/* - * for IPVS lblcr entry hash table - */ -#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS -#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 -#endif -#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS -#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) -#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) - - -/* - * IPVS destination set structure and operations - */ -struct ip_vs_dest_list { - struct ip_vs_dest_list *next; /* list link */ - struct ip_vs_dest *dest; /* destination server */ -}; - -struct ip_vs_dest_set { - atomic_t size; /* set size */ - unsigned long lastmod; /* last modified time */ - struct ip_vs_dest_list *list; /* destination list */ - rwlock_t lock; /* lock for this list */ -}; - - -static struct ip_vs_dest_list * -ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) -{ - struct ip_vs_dest_list *e; - - for (e=set->list; e!=NULL; e=e->next) { - if (e->dest == dest) - /* already existed */ - return NULL; - } - - e = kmalloc(sizeof(*e), GFP_ATOMIC); - if (e == NULL) { - IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); - return NULL; - } - - atomic_inc(&dest->refcnt); - e->dest = dest; - - /* link it to the list */ - e->next = set->list; - set->list = e; - atomic_inc(&set->size); - - set->lastmod = jiffies; - return e; -} - -static void -ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) -{ - struct ip_vs_dest_list *e, **ep; - - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { - if (e->dest == dest) { - /* HIT */ - *ep = e->next; - atomic_dec(&set->size); - set->lastmod = jiffies; - atomic_dec(&e->dest->refcnt); - kfree(e); - break; - } - ep = &e->next; - } -} - -static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) -{ - struct ip_vs_dest_list *e, **ep; - - write_lock(&set->lock); - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { - *ep = e->next; - /* - * We don't kfree dest because it is refered either - * by its service or by the trash dest list. - */ - atomic_dec(&e->dest->refcnt); - kfree(e); - } - write_unlock(&set->lock); -} - -/* get weighted least-connection node in the destination set */ -static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) -{ - register struct ip_vs_dest_list *e; - struct ip_vs_dest *dest, *least; - int loh, doh; - - if (set == NULL) - return NULL; - - /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { - least = e->dest; - if (least->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - if ((atomic_read(&least->weight) > 0) - && (least->flags & IP_VS_DEST_F_AVAILABLE)) { - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* find the destination with the weighted least load */ - nextstage: - for (e=e->next; e!=NULL; e=e->next) { - dest = e->dest; - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if ((loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) - && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - return least; -} - - -/* get weighted most-connection node in the destination set */ -static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) -{ - register struct ip_vs_dest_list *e; - struct ip_vs_dest *dest, *most; - int moh, doh; - - if (set == NULL) - return NULL; - - /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { - most = e->dest; - if (atomic_read(&most->weight) > 0) { - moh = atomic_read(&most->activeconns) * 50 - + atomic_read(&most->inactconns); - goto nextstage; - } - } - return NULL; - - /* find the destination with the weighted most load */ - nextstage: - for (e=e->next; e!=NULL; e=e->next) { - dest = e->dest; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ - if ((moh * atomic_read(&dest->weight) < - doh * atomic_read(&most->weight)) - && (atomic_read(&dest->weight) > 0)) { - most = dest; - moh = doh; - } - } - - IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(most->addr.ip), ntohs(most->port), - atomic_read(&most->activeconns), - atomic_read(&most->refcnt), - atomic_read(&most->weight), moh); - return most; -} - - -/* - * IPVS lblcr entry represents an association between destination - * IP address and its destination server set - */ -struct ip_vs_lblcr_entry { - struct list_head list; - __be32 addr; /* destination IP address */ - struct ip_vs_dest_set set; /* destination server set */ - unsigned long lastuse; /* last used time */ -}; - - -/* - * IPVS lblcr hash table - */ -struct ip_vs_lblcr_table { - struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ - atomic_t entries; /* number of entries */ - int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ - int rover; /* rover for expire check */ - int counter; /* counter for no expire */ -}; - - -/* - * IPVS LBLCR sysctl table - */ - -static ctl_table vs_vars_table[] = { - { - .procname = "lblcr_expiration", - .data = &sysctl_ip_vs_lblcr_expiration, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header * sysctl_header; - -static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) -{ - list_del(&en->list); - ip_vs_dest_set_eraseall(&en->set); - kfree(en); -} - - -/* - * Returns hash value for IPVS LBLCR entry - */ -static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; -} - - -/* - * Hash an entry in the ip_vs_lblcr_table. - * returns bool success. - */ -static void -ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) -{ - unsigned hash = ip_vs_lblcr_hashkey(en->addr); - - list_add(&en->list, &tbl->bucket[hash]); - atomic_inc(&tbl->entries); -} - - -/* - * Get ip_vs_lblcr_entry associated with supplied parameters. Called under - * read lock. - */ -static inline struct ip_vs_lblcr_entry * -ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) -{ - unsigned hash = ip_vs_lblcr_hashkey(addr); - struct ip_vs_lblcr_entry *en; - - list_for_each_entry(en, &tbl->bucket[hash], list) - if (en->addr == addr) - return en; - - return NULL; -} - - -/* - * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. Called under write lock. - */ -static inline struct ip_vs_lblcr_entry * -ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr, - struct ip_vs_dest *dest) -{ - struct ip_vs_lblcr_entry *en; - - en = ip_vs_lblcr_get(tbl, daddr); - if (!en) { - en = kmalloc(sizeof(*en), GFP_ATOMIC); - if (!en) { - IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); - return NULL; - } - - en->addr = daddr; - en->lastuse = jiffies; - - /* initilize its dest set */ - atomic_set(&(en->set.size), 0); - en->set.list = NULL; - rwlock_init(&en->set.lock); - - ip_vs_lblcr_hash(tbl, en); - } - - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); - - return en; -} - - -/* - * Flush all the entries of the specified table. - */ -static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) -{ - int i; - struct ip_vs_lblcr_entry *en, *nxt; - - /* No locking required, only called during cleanup. */ - for (i=0; ibucket[i], list) { - ip_vs_lblcr_free(en); - } - } -} - - -static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) -{ - struct ip_vs_lblcr_table *tbl = svc->sched_data; - unsigned long now = jiffies; - int i, j; - struct ip_vs_lblcr_entry *en, *nxt; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, - now)) - continue; - - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&svc->sched_lock); - } - tbl->rover = j; -} - - -/* - * Periodical timer handler for IPVS lblcr table - * It is used to collect stale entries when the number of entries - * exceeds the maximum size of the table. - * - * Fixme: we probably need more complicated algorithm to collect - * entries that have not been used for a long time even - * if the number of entries doesn't exceed the maximum size - * of the table. - * The full expiration check is for this purpose now. - */ -static void ip_vs_lblcr_check_expire(unsigned long data) -{ - struct ip_vs_service *svc = (struct ip_vs_service *) data; - struct ip_vs_lblcr_table *tbl = svc->sched_data; - unsigned long now = jiffies; - int goal; - int i, j; - struct ip_vs_lblcr_entry *en, *nxt; - - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { - /* do full expiration check */ - ip_vs_lblcr_full_check(svc); - tbl->counter = 1; - goto out; - } - - if (atomic_read(&tbl->entries) <= tbl->max_size) { - tbl->counter++; - goto out; - } - - goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; - if (goal > tbl->max_size/2) - goal = tbl->max_size/2; - - for (i=0, j=tbl->rover; isched_lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) - continue; - - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - goal--; - } - write_unlock(&svc->sched_lock); - if (goal <= 0) - break; - } - tbl->rover = j; - - out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); -} - -static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) -{ - int i; - struct ip_vs_lblcr_table *tbl; - - /* - * Allocate the ip_vs_lblcr_table for this service - */ - tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " - "current service\n", sizeof(*tbl)); - - /* - * Initialize the hash buckets - */ - for (i=0; ibucket[i]); - } - tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; - tbl->rover = 0; - tbl->counter = 1; - - /* - * Hook periodic timer for garbage collection - */ - setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, - (unsigned long)svc); - mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); - - return 0; -} - - -static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_lblcr_table *tbl = svc->sched_data; - - /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); - - /* got to clean up table entries here */ - ip_vs_lblcr_flush(tbl); - - /* release the table itself */ - kfree(tbl); - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", - sizeof(*tbl)); - - return 0; -} - - -static inline struct ip_vs_dest * -__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) -{ - struct ip_vs_dest *dest, *least; - int loh, doh; - - /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connection. - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - if (atomic_read(&dest->weight) > 0) { - least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr.ip), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -/* - * If this destination server is overloaded and there is a less loaded - * server, then return true. - */ -static inline int -is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { - struct ip_vs_dest *d; - - list_for_each_entry(d, &svc->destinations, n_list) { - if (atomic_read(&d->activeconns)*2 - < atomic_read(&d->weight)) { - return 1; - } - } - } - return 0; -} - - -/* - * Locality-Based (weighted) Least-Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_lblcr_table *tbl = svc->sched_data; - struct iphdr *iph = ip_hdr(skb); - struct ip_vs_dest *dest = NULL; - struct ip_vs_lblcr_entry *en; - - IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); - - /* First look in our cache */ - read_lock(&svc->sched_lock); - en = ip_vs_lblcr_get(tbl, iph->daddr); - if (en) { - /* We only hold a read lock, but this is atomic */ - en->lastuse = jiffies; - - /* Get the least loaded destination */ - read_lock(&en->set.lock); - dest = ip_vs_dest_set_min(&en->set); - read_unlock(&en->set.lock); - - /* More than one destination + enough time passed by, cleanup */ - if (atomic_read(&en->set.size) > 1 && - time_after(jiffies, en->set.lastmod + - sysctl_ip_vs_lblcr_expiration)) { - struct ip_vs_dest *m; - - write_lock(&en->set.lock); - m = ip_vs_dest_set_max(&en->set); - if (m) - ip_vs_dest_set_erase(&en->set, m); - write_unlock(&en->set.lock); - } - - /* If the destination is not overloaded, use it */ - if (dest && !is_overloaded(dest, svc)) { - read_unlock(&svc->sched_lock); - goto out; - } - - /* The cache entry is invalid, time to schedule */ - dest = __ip_vs_lblcr_schedule(svc, iph); - if (!dest) { - IP_VS_DBG(1, "no destination available\n"); - read_unlock(&svc->sched_lock); - return NULL; - } - - /* Update our cache entry */ - write_lock(&en->set.lock); - ip_vs_dest_set_insert(&en->set, dest); - write_unlock(&en->set.lock); - } - read_unlock(&svc->sched_lock); - - if (dest) - goto out; - - /* No cache entry, time to schedule */ - dest = __ip_vs_lblcr_schedule(svc, iph); - if (!dest) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - - /* If we fail to create a cache entry, we'll just use the valid dest */ - write_lock(&svc->sched_lock); - ip_vs_lblcr_new(tbl, iph->daddr, dest); - write_unlock(&svc->sched_lock); - -out: - IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->daddr), - NIPQUAD(dest->addr.ip), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS LBLCR Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_lblcr_scheduler = -{ - .name = "lblcr", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 0, -#endif - .init_service = ip_vs_lblcr_init_svc, - .done_service = ip_vs_lblcr_done_svc, - .schedule = ip_vs_lblcr_schedule, -}; - - -static int __init ip_vs_lblcr_init(void) -{ - int ret; - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); - ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); - if (ret) - unregister_sysctl_table(sysctl_header); - return ret; -} - - -static void __exit ip_vs_lblcr_cleanup(void) -{ - unregister_sysctl_table(sysctl_header); - unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); -} - - -module_init(ip_vs_lblcr_init); -module_exit(ip_vs_lblcr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c deleted file mode 100644 index b69f808ac46..00000000000 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * IPVS: Least-Connection Scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : added the ip_vs_lc_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. (This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - -/* - * Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; - - IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); - - /* - * Simply select the server with the least number of - * (activeconns<<5) + inactconns - * Except whose weight is equal to zero. - * If the weight is equal to zero, it means that the server is - * quiesced, the existing connections to the server still get - * served, but no new connection is assigned to the server. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || - atomic_read(&dest->weight) == 0) - continue; - doh = ip_vs_lc_dest_overhead(dest); - if (!least || doh < loh) { - least = dest; - loh = doh; - } - } - - if (least) - IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->inactconns)); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_lc_scheduler = { - .name = "lc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_lc_schedule, -}; - - -static int __init ip_vs_lc_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; -} - -static void __exit ip_vs_lc_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); -} - -module_init(ip_vs_lc_init); -module_exit(ip_vs_lc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c deleted file mode 100644 index 9a2d8033f08..00000000000 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - * IPVS: Never Queue scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The NQ algorithm adopts a two-speed model. When there is an idle server - * available, the job will be sent to the idle server, instead of waiting - * for a fast one. When there is no idle server available, the job will be - * sent to the server that minimize its expected delay (The Shortest - * Expected Delay scheduling algorithm). - * - * See the following paper for more information: - * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing - * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, - * pages 986-994, 1988. - * - * Thanks must go to Marko Buuri for talking NQ to me. - * - * The difference between NQ and SED is that NQ can improve overall - * system utilization. - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We only use the active connection number in the cost - * calculation here. - */ - return atomic_read(&dest->activeconns) + 1; -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; - - IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (server expected overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - - if (dest->flags & IP_VS_DEST_F_OVERLOAD || - !atomic_read(&dest->weight)) - continue; - - doh = ip_vs_nq_dest_overhead(dest); - - /* return the server directly if it is idle */ - if (atomic_read(&dest->activeconns) == 0) { - least = dest; - loh = doh; - goto out; - } - - if (!least || - (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight))) { - least = dest; - loh = doh; - } - } - - if (!least) - return NULL; - - out: - IP_VS_DBG_BUF(6, "NQ: server %s:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_nq_scheduler = -{ - .name = "nq", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_nq_schedule, -}; - - -static int __init ip_vs_nq_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_nq_scheduler); -} - -static void __exit ip_vs_nq_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); -} - -module_init(ip_vs_nq_init); -module_exit(ip_vs_nq_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c deleted file mode 100644 index 0791f9e08fe..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ /dev/null @@ -1,288 +0,0 @@ -/* - * ip_vs_proto.c: transport protocol load balancing support for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - -/* - * IPVS protocols can only be registered/unregistered when the ipvs - * module is loaded/unloaded, so no lock is needed in accessing the - * ipvs protocol table. - */ - -#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ -#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) - -static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; - - -/* - * register an ipvs protocol - */ -static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) -{ - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); - - pp->next = ip_vs_proto_table[hash]; - ip_vs_proto_table[hash] = pp; - - if (pp->init != NULL) - pp->init(pp); - - return 0; -} - - -/* - * unregister an ipvs protocol - */ -static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) -{ - struct ip_vs_protocol **pp_p; - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); - - pp_p = &ip_vs_proto_table[hash]; - for (; *pp_p; pp_p = &(*pp_p)->next) { - if (*pp_p == pp) { - *pp_p = pp->next; - if (pp->exit != NULL) - pp->exit(pp); - return 0; - } - } - - return -ESRCH; -} - - -/* - * get ip_vs_protocol object by its proto. - */ -struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) -{ - struct ip_vs_protocol *pp; - unsigned hash = IP_VS_PROTO_HASH(proto); - - for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { - if (pp->protocol == proto) - return pp; - } - - return NULL; -} - - -/* - * Propagate event for state change to all protocols - */ -void ip_vs_protocol_timeout_change(int flags) -{ - struct ip_vs_protocol *pp; - int i; - - for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { - if (pp->timeout_change) - pp->timeout_change(pp, flags); - } - } -} - - -int * -ip_vs_create_timeout_table(int *table, int size) -{ - return kmemdup(table, size, GFP_ATOMIC); -} - - -/* - * Set timeout value for state specified by name - */ -int -ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) -{ - int i; - - if (!table || !name || !to) - return -EINVAL; - - for (i = 0; i < num; i++) { - if (strcmp(names[i], name)) - continue; - table[i] = to * HZ; - return 0; - } - return -ENOENT; -} - - -const char * ip_vs_state_name(__u16 proto, int state) -{ - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); - - if (pp == NULL || pp->state_name == NULL) - return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; - return pp->state_name(state); -} - - -static void -ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) -{ - char buf[128]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else if (ih->frag_off & htons(IP_OFFSET)) - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - else { - __be16 _ports[2], *pptr -; - pptr = skb_header_pointer(skb, offset + ih->ihl*4, - sizeof(_ports), _ports); - if (pptr == NULL) - sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, - NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - else - sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", - pp->name, - NIPQUAD(ih->saddr), - ntohs(pptr[0]), - NIPQUAD(ih->daddr), - ntohs(pptr[1])); - } - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - -#ifdef CONFIG_IP_VS_IPV6 -static void -ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) -{ - char buf[192]; - struct ipv6hdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else if (ih->nexthdr == IPPROTO_FRAGMENT) - sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag", - pp->name, NIP6(ih->saddr), - NIP6(ih->daddr)); - else { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), - sizeof(_ports), _ports); - if (pptr == NULL) - sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT, - pp->name, - NIP6(ih->saddr), - NIP6(ih->daddr)); - else - sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u", - pp->name, - NIP6(ih->saddr), - ntohs(pptr[0]), - NIP6(ih->daddr), - ntohs(pptr[1])); - } - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} -#endif - - -void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) - ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); - else -#endif - ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); -} - - -int __init ip_vs_protocol_init(void) -{ - char protocols[64]; -#define REGISTER_PROTOCOL(p) \ - do { \ - register_ip_vs_protocol(p); \ - strcat(protocols, ", "); \ - strcat(protocols, (p)->name); \ - } while (0) - - protocols[0] = '\0'; - protocols[2] = '\0'; -#ifdef CONFIG_IP_VS_PROTO_TCP - REGISTER_PROTOCOL(&ip_vs_protocol_tcp); -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - REGISTER_PROTOCOL(&ip_vs_protocol_udp); -#endif -#ifdef CONFIG_IP_VS_PROTO_AH - REGISTER_PROTOCOL(&ip_vs_protocol_ah); -#endif -#ifdef CONFIG_IP_VS_PROTO_ESP - REGISTER_PROTOCOL(&ip_vs_protocol_esp); -#endif - IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); - - return 0; -} - - -void ip_vs_protocol_cleanup(void) -{ - struct ip_vs_protocol *pp; - int i; - - /* unregister all the ipvs protocols */ - for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - while ((pp = ip_vs_proto_table[i]) != NULL) - unregister_ip_vs_protocol(pp); - } -} diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c deleted file mode 100644 index 80ab0c8e5b4..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS - * - * Authors: Julian Anastasov , February 2002 - * Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation; - * - */ - -#include -#include -#include -#include -#include -#include - -#include - - -/* TODO: - -struct isakmp_hdr { - __u8 icookie[8]; - __u8 rcookie[8]; - __u8 np; - __u8 version; - __u8 xchgtype; - __u8 flags; - __u32 msgid; - __u32 length; -}; - -*/ - -#define PORT_ISAKMP 500 - - -static struct ip_vs_conn * -ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - /* - * We are not sure if the packet is from our - * service, so our conn_schedule hook should return NF_ACCEPT - */ - IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " - "%s%s %s->%s\n", - inverse ? "ICMP+" : "", - pp->name, - IP_VS_DBG_ADDR(af, &iph->saddr), - IP_VS_DBG_ADDR(af, &iph->daddr)); - } - - return cp; -} - - -static struct ip_vs_conn * -ah_esp_conn_out_get(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " - "%s%s %s->%s\n", - inverse ? "ICMP+" : "", - pp->name, - IP_VS_DBG_ADDR(af, &iph->saddr), - IP_VS_DBG_ADDR(af, &iph->daddr)); - } - - return cp; -} - - -static int -ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - /* - * AH/ESP is only related traffic. Pass the packet to IP stack. - */ - *verdict = NF_ACCEPT; - return 0; -} - - -static void -ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - -#ifdef CONFIG_IP_VS_IPV6 -static void -ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct ipv6hdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT, - pp->name, NIP6(ih->saddr), - NIP6(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} -#endif - -static void -ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) - ah_esp_debug_packet_v6(pp, skb, offset, msg); - else -#endif - ah_esp_debug_packet_v4(pp, skb, offset, msg); -} - - -static void ah_esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void ah_esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -#ifdef CONFIG_IP_VS_PROTO_AH -struct ip_vs_protocol ip_vs_protocol_ah = { - .name = "AH", - .protocol = IPPROTO_AH, - .num_states = 1, - .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, - .conn_schedule = ah_esp_conn_schedule, - .conn_in_get = ah_esp_conn_in_get, - .conn_out_get = ah_esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ - .set_state_timeout = NULL, -}; -#endif - -#ifdef CONFIG_IP_VS_PROTO_ESP -struct ip_vs_protocol ip_vs_protocol_esp = { - .name = "ESP", - .protocol = IPPROTO_ESP, - .num_states = 1, - .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, - .conn_schedule = ah_esp_conn_schedule, - .conn_in_get = ah_esp_conn_in_get, - .conn_out_get = ah_esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ -}; -#endif diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c deleted file mode 100644 index dd4566ea2bf..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ /dev/null @@ -1,732 +0,0 @@ -/* - * ip_vs_proto_tcp.c: TCP load balancing support for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include /* for tcphdr */ -#include -#include /* for csum_tcpudp_magic */ -#include -#include -#include - -#include - - -static struct ip_vs_conn * -tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - -static struct ip_vs_conn * -tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - - -static int -tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - struct ip_vs_service *svc; - struct tcphdr _tcph, *th; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); - if (th == NULL) { - *verdict = NF_DROP; - return 0; - } - - if (th->syn && - (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, - th->dest))) { - if (ip_vs_todrop()) { - /* - * It seems that we are very loaded. - * We have to drop this packet :( - */ - ip_vs_service_put(svc); - *verdict = NF_DROP; - return 0; - } - - /* - * Let the virtual server select a real server for the - * incoming connection, and create a connection entry. - */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); - return 0; - } - ip_vs_service_put(svc); - } - return 1; -} - - -static inline void -tcp_fast_csum_update(int af, struct tcphdr *tcph, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldport, __be16 newport) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - tcph->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(tcph->check)))); - else -#endif - tcph->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(tcph->check)))); -} - - -static inline void -tcp_partial_csum_update(int af, struct tcphdr *tcph, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldlen, __be16 newlen) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - tcph->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); - else -#endif - tcph->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); -} - - -static int -tcp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct tcphdr *tcph; - unsigned int tcphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else -#endif - tcphoff = ip_hdrlen(skb); - oldlen = skb->len - tcphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) - return 0; - } - - tcph = (void *)skb_network_header(skb) + tcphoff; - tcph->source = cp->vport; - - /* Adjust TCP checksums */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - tcphoff)); - } else if (!cp->app) { - /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, - cp->dport, cp->vport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - tcph->check = 0; - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcph->check = csum_ipv6_magic(&cp->vaddr.in6, - &cp->caddr.in6, - skb->len - tcphoff, - cp->protocol, skb->csum); - else -#endif - tcph->check = csum_tcpudp_magic(cp->vaddr.ip, - cp->caddr.ip, - skb->len - tcphoff, - cp->protocol, - skb->csum); - - IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", - pp->name, tcph->check, - (char*)&(tcph->check) - (char*)tcph); - } - return 1; -} - - -static int -tcp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct tcphdr *tcph; - unsigned int tcphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else -#endif - tcphoff = ip_hdrlen(skb); - oldlen = skb->len - tcphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* - * Attempt ip_vs_app call. - * It will fix ip_vs_conn and iph ack_seq stuff - */ - if (!ip_vs_app_pkt_in(cp, skb)) - return 0; - } - - tcph = (void *)skb_network_header(skb) + tcphoff; - tcph->dest = cp->dport; - - /* - * Adjust TCP checksums - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - tcphoff)); - } else if (!cp->app) { - /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, - cp->vport, cp->dport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - tcph->check = 0; - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - tcph->check = csum_ipv6_magic(&cp->caddr.in6, - &cp->daddr.in6, - skb->len - tcphoff, - cp->protocol, skb->csum); - else -#endif - tcph->check = csum_tcpudp_magic(cp->caddr.ip, - cp->daddr.ip, - skb->len - tcphoff, - cp->protocol, - skb->csum); - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return 1; -} - - -static int -tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) -{ - unsigned int tcphoff; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else -#endif - tcphoff = ip_hdrlen(skb); - - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - case CHECKSUM_COMPLETE: -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, - skb->len - tcphoff, - ipv6_hdr(skb)->nexthdr, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - } else -#endif - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, - skb->len - tcphoff, - ip_hdr(skb)->protocol, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - break; - default: - /* No need to checksum. */ - break; - } - - return 1; -} - - -#define TCP_DIR_INPUT 0 -#define TCP_DIR_OUTPUT 4 -#define TCP_DIR_INPUT_ONLY 8 - -static const int tcp_state_off[IP_VS_DIR_LAST] = { - [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, - [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, - [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, -}; - -/* - * Timeout table[state] - */ -static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = 2*HZ, - [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, - [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, - [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, - [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, - [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, - [IP_VS_TCP_S_CLOSE] = 10*HZ, - [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, - [IP_VS_TCP_S_LAST_ACK] = 30*HZ, - [IP_VS_TCP_S_LISTEN] = 2*60*HZ, - [IP_VS_TCP_S_SYNACK] = 120*HZ, - [IP_VS_TCP_S_LAST] = 2*HZ, -}; - -static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = "NONE", - [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", - [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", - [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", - [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", - [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", - [IP_VS_TCP_S_CLOSE] = "CLOSE", - [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", - [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", - [IP_VS_TCP_S_LISTEN] = "LISTEN", - [IP_VS_TCP_S_SYNACK] = "SYNACK", - [IP_VS_TCP_S_LAST] = "BUG!", -}; - -#define sNO IP_VS_TCP_S_NONE -#define sES IP_VS_TCP_S_ESTABLISHED -#define sSS IP_VS_TCP_S_SYN_SENT -#define sSR IP_VS_TCP_S_SYN_RECV -#define sFW IP_VS_TCP_S_FIN_WAIT -#define sTW IP_VS_TCP_S_TIME_WAIT -#define sCL IP_VS_TCP_S_CLOSE -#define sCW IP_VS_TCP_S_CLOSE_WAIT -#define sLA IP_VS_TCP_S_LAST_ACK -#define sLI IP_VS_TCP_S_LISTEN -#define sSA IP_VS_TCP_S_SYNACK - -struct tcp_states_t { - int next_state[IP_VS_TCP_S_LAST]; -}; - -static const char * tcp_state_name(int state) -{ - if (state >= IP_VS_TCP_S_LAST) - return "ERR!"; - return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; -} - -static struct tcp_states_t tcp_states [] = { -/* INPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, -/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, - -/* OUTPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, -/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, -/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, -/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, - -/* INPUT-ONLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, -/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, -}; - -static struct tcp_states_t tcp_states_dos [] = { -/* INPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, -/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, -/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, - -/* OUTPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, -/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, -/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, -/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, - -/* INPUT-ONLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, -/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, -}; - -static struct tcp_states_t *tcp_state_table = tcp_states; - - -static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) -{ - int on = (flags & 1); /* secure_tcp */ - - /* - ** FIXME: change secure_tcp to independent sysctl var - ** or make it per-service or per-app because it is valid - ** for most if not for all of the applications. Something - ** like "capabilities" (flags) for each object. - */ - tcp_state_table = (on? tcp_states_dos : tcp_states); -} - -static int -tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, - tcp_state_name_table, sname, to); -} - -static inline int tcp_state_idx(struct tcphdr *th) -{ - if (th->rst) - return 3; - if (th->syn) - return 0; - if (th->fin) - return 1; - if (th->ack) - return 2; - return -1; -} - -static inline void -set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, - int direction, struct tcphdr *th) -{ - int state_idx; - int new_state = IP_VS_TCP_S_CLOSE; - int state_off = tcp_state_off[direction]; - - /* - * Update state offset to INPUT_ONLY if necessary - * or delete NO_OUTPUT flag if output packet detected - */ - if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { - if (state_off == TCP_DIR_OUTPUT) - cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; - else - state_off = TCP_DIR_INPUT_ONLY; - } - - if ((state_idx = tcp_state_idx(th)) < 0) { - IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); - goto tcp_state_out; - } - - new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; - - tcp_state_out: - if (new_state != cp->state) { - struct ip_vs_dest *dest = cp->dest; - - IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" - "%s:%d state: %s->%s conn->refcnt:%d\n", - pp->name, - ((state_off == TCP_DIR_OUTPUT) ? - "output " : "input "), - th->syn ? 'S' : '.', - th->fin ? 'F' : '.', - th->ack ? 'A' : '.', - th->rst ? 'R' : '.', - IP_VS_DBG_ADDR(cp->af, &cp->daddr), - ntohs(cp->dport), - IP_VS_DBG_ADDR(cp->af, &cp->caddr), - ntohs(cp->cport), - tcp_state_name(cp->state), - tcp_state_name(new_state), - atomic_read(&cp->refcnt)); - - if (dest) { - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } - } - - cp->timeout = pp->timeout_table[cp->state = new_state]; -} - - -/* - * Handle state transitions - */ -static int -tcp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - struct tcphdr _tcph, *th; - -#ifdef CONFIG_IP_VS_IPV6 - int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); -#else - int ihl = ip_hdrlen(skb); -#endif - - th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - - spin_lock(&cp->lock); - set_tcp_state(pp, cp, direction, th); - spin_unlock(&cp->lock); - - return 1; -} - - -/* - * Hash table for TCP application incarnations - */ -#define TCP_APP_TAB_BITS 4 -#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) -#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) - -static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(tcp_app_lock); - -static inline __u16 tcp_app_hashkey(__be16 port) -{ - return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) - & TCP_APP_TAB_MASK; -} - - -static int tcp_register_app(struct ip_vs_app *inc) -{ - struct ip_vs_app *i; - __u16 hash; - __be16 port = inc->port; - int ret = 0; - - hash = tcp_app_hashkey(port); - - spin_lock_bh(&tcp_app_lock); - list_for_each_entry(i, &tcp_apps[hash], p_list) { - if (i->port == port) { - ret = -EEXIST; - goto out; - } - } - list_add(&inc->p_list, &tcp_apps[hash]); - atomic_inc(&ip_vs_protocol_tcp.appcnt); - - out: - spin_unlock_bh(&tcp_app_lock); - return ret; -} - - -static void -tcp_unregister_app(struct ip_vs_app *inc) -{ - spin_lock_bh(&tcp_app_lock); - atomic_dec(&ip_vs_protocol_tcp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&tcp_app_lock); -} - - -static int -tcp_app_conn_bind(struct ip_vs_conn *cp) -{ - int hash; - struct ip_vs_app *inc; - int result = 0; - - /* Default binding: bind app only for NAT */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - return 0; - - /* Lookup application incarnations and bind the right one */ - hash = tcp_app_hashkey(cp->vport); - - spin_lock(&tcp_app_lock); - list_for_each_entry(inc, &tcp_apps[hash], p_list) { - if (inc->port == cp->vport) { - if (unlikely(!ip_vs_app_inc_get(inc))) - break; - spin_unlock(&tcp_app_lock); - - IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" - "%s:%u to app %s on port %u\n", - __func__, - IP_VS_DBG_ADDR(cp->af, &cp->caddr), - ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), - ntohs(cp->vport), - inc->name, ntohs(inc->port)); - - cp->app = inc; - if (inc->init_conn) - result = inc->init_conn(inc, cp); - goto out; - } - } - spin_unlock(&tcp_app_lock); - - out: - return result; -} - - -/* - * Set LISTEN timeout. (ip_vs_conn_put will setup timer) - */ -void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) -{ - spin_lock(&cp->lock); - cp->state = IP_VS_TCP_S_LISTEN; - cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; - spin_unlock(&cp->lock); -} - - -static void ip_vs_tcp_init(struct ip_vs_protocol *pp) -{ - IP_VS_INIT_HASH_TABLE(tcp_apps); - pp->timeout_table = tcp_timeouts; -} - - -static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) -{ -} - - -struct ip_vs_protocol ip_vs_protocol_tcp = { - .name = "TCP", - .protocol = IPPROTO_TCP, - .num_states = IP_VS_TCP_S_LAST, - .dont_defrag = 0, - .appcnt = ATOMIC_INIT(0), - .init = ip_vs_tcp_init, - .exit = ip_vs_tcp_exit, - .register_app = tcp_register_app, - .unregister_app = tcp_unregister_app, - .conn_schedule = tcp_conn_schedule, - .conn_in_get = tcp_conn_in_get, - .conn_out_get = tcp_conn_out_get, - .snat_handler = tcp_snat_handler, - .dnat_handler = tcp_dnat_handler, - .csum_check = tcp_csum_check, - .state_name = tcp_state_name, - .state_transition = tcp_state_transition, - .app_conn_bind = tcp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = tcp_timeout_change, - .set_state_timeout = tcp_set_state_timeout, -}; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c deleted file mode 100644 index 6eb6039d634..00000000000 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ /dev/null @@ -1,533 +0,0 @@ -/* - * ip_vs_proto_udp.c: UDP load balancing support for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -static struct ip_vs_conn * -udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - -static struct ip_vs_conn * -udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - -static int -udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - struct ip_vs_service *svc; - struct udphdr _udph, *uh; - struct ip_vs_iphdr iph; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - - uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); - if (uh == NULL) { - *verdict = NF_DROP; - return 0; - } - - svc = ip_vs_service_get(af, skb->mark, iph.protocol, - &iph.daddr, uh->dest); - if (svc) { - if (ip_vs_todrop()) { - /* - * It seems that we are very loaded. - * We have to drop this packet :( - */ - ip_vs_service_put(svc); - *verdict = NF_DROP; - return 0; - } - - /* - * Let the virtual server select a real server for the - * incoming connection, and create a connection entry. - */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); - return 0; - } - ip_vs_service_put(svc); - } - return 1; -} - - -static inline void -udp_fast_csum_update(int af, struct udphdr *uhdr, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldport, __be16 newport) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - uhdr->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(uhdr->check)))); - else -#endif - uhdr->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(uhdr->check)))); - if (!uhdr->check) - uhdr->check = CSUM_MANGLED_0; -} - -static inline void -udp_partial_csum_update(int af, struct udphdr *uhdr, - const union nf_inet_addr *oldip, - const union nf_inet_addr *newip, - __be16 oldlen, __be16 newlen) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - uhdr->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); - else -#endif - uhdr->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, - ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); -} - - -static int -udp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct udphdr *udph; - unsigned int udphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else -#endif - udphoff = ip_hdrlen(skb); - oldlen = skb->len - udphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* - * Call application helper if needed - */ - if (!ip_vs_app_pkt_out(cp, skb)) - return 0; - } - - udph = (void *)skb_network_header(skb) + udphoff; - udph->source = cp->vport; - - /* - * Adjust UDP checksums - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { - /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, - cp->dport, cp->vport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - udph->check = 0; - skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udph->check = csum_ipv6_magic(&cp->vaddr.in6, - &cp->caddr.in6, - skb->len - udphoff, - cp->protocol, skb->csum); - else -#endif - udph->check = csum_tcpudp_magic(cp->vaddr.ip, - cp->caddr.ip, - skb->len - udphoff, - cp->protocol, - skb->csum); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", - pp->name, udph->check, - (char*)&(udph->check) - (char*)udph); - } - return 1; -} - - -static int -udp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct udphdr *udph; - unsigned int udphoff; - int oldlen; - -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else -#endif - udphoff = ip_hdrlen(skb); - oldlen = skb->len - udphoff; - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) - return 0; - - /* - * Attempt ip_vs_app call. - * It will fix ip_vs_conn - */ - if (!ip_vs_app_pkt_in(cp, skb)) - return 0; - } - - udph = (void *)skb_network_header(skb) + udphoff; - udph->dest = cp->dport; - - /* - * Adjust UDP checksums - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, - htonl(oldlen), - htonl(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { - /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, - cp->vport, cp->dport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - udph->check = 0; - skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); -#ifdef CONFIG_IP_VS_IPV6 - if (cp->af == AF_INET6) - udph->check = csum_ipv6_magic(&cp->caddr.in6, - &cp->daddr.in6, - skb->len - udphoff, - cp->protocol, skb->csum); - else -#endif - udph->check = csum_tcpudp_magic(cp->caddr.ip, - cp->daddr.ip, - skb->len - udphoff, - cp->protocol, - skb->csum); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return 1; -} - - -static int -udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) -{ - struct udphdr _udph, *uh; - unsigned int udphoff; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else -#endif - udphoff = ip_hdrlen(skb); - - uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); - if (uh == NULL) - return 0; - - if (uh->check != 0) { - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = skb_checksum(skb, udphoff, - skb->len - udphoff, 0); - case CHECKSUM_COMPLETE: -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, - skb->len - udphoff, - ipv6_hdr(skb)->nexthdr, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - } else -#endif - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, - skb->len - udphoff, - ip_hdr(skb)->protocol, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - break; - default: - /* No need to checksum. */ - break; - } - } - return 1; -} - - -/* - * Note: the caller guarantees that only one of register_app, - * unregister_app or app_conn_bind is called each time. - */ - -#define UDP_APP_TAB_BITS 4 -#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) -#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) - -static struct list_head udp_apps[UDP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(udp_app_lock); - -static inline __u16 udp_app_hashkey(__be16 port) -{ - return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) - & UDP_APP_TAB_MASK; -} - - -static int udp_register_app(struct ip_vs_app *inc) -{ - struct ip_vs_app *i; - __u16 hash; - __be16 port = inc->port; - int ret = 0; - - hash = udp_app_hashkey(port); - - - spin_lock_bh(&udp_app_lock); - list_for_each_entry(i, &udp_apps[hash], p_list) { - if (i->port == port) { - ret = -EEXIST; - goto out; - } - } - list_add(&inc->p_list, &udp_apps[hash]); - atomic_inc(&ip_vs_protocol_udp.appcnt); - - out: - spin_unlock_bh(&udp_app_lock); - return ret; -} - - -static void -udp_unregister_app(struct ip_vs_app *inc) -{ - spin_lock_bh(&udp_app_lock); - atomic_dec(&ip_vs_protocol_udp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&udp_app_lock); -} - - -static int udp_app_conn_bind(struct ip_vs_conn *cp) -{ - int hash; - struct ip_vs_app *inc; - int result = 0; - - /* Default binding: bind app only for NAT */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - return 0; - - /* Lookup application incarnations and bind the right one */ - hash = udp_app_hashkey(cp->vport); - - spin_lock(&udp_app_lock); - list_for_each_entry(inc, &udp_apps[hash], p_list) { - if (inc->port == cp->vport) { - if (unlikely(!ip_vs_app_inc_get(inc))) - break; - spin_unlock(&udp_app_lock); - - IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" - "%s:%u to app %s on port %u\n", - __func__, - IP_VS_DBG_ADDR(cp->af, &cp->caddr), - ntohs(cp->cport), - IP_VS_DBG_ADDR(cp->af, &cp->vaddr), - ntohs(cp->vport), - inc->name, ntohs(inc->port)); - - cp->app = inc; - if (inc->init_conn) - result = inc->init_conn(inc, cp); - goto out; - } - } - spin_unlock(&udp_app_lock); - - out: - return result; -} - - -static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { - [IP_VS_UDP_S_NORMAL] = 5*60*HZ, - [IP_VS_UDP_S_LAST] = 2*HZ, -}; - -static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { - [IP_VS_UDP_S_NORMAL] = "UDP", - [IP_VS_UDP_S_LAST] = "BUG!", -}; - - -static int -udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, - udp_state_name_table, sname, to); -} - -static const char * udp_state_name(int state) -{ - if (state >= IP_VS_UDP_S_LAST) - return "ERR!"; - return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; -} - -static int -udp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; - return 1; -} - -static void udp_init(struct ip_vs_protocol *pp) -{ - IP_VS_INIT_HASH_TABLE(udp_apps); - pp->timeout_table = udp_timeouts; -} - -static void udp_exit(struct ip_vs_protocol *pp) -{ -} - - -struct ip_vs_protocol ip_vs_protocol_udp = { - .name = "UDP", - .protocol = IPPROTO_UDP, - .num_states = IP_VS_UDP_S_LAST, - .dont_defrag = 0, - .init = udp_init, - .exit = udp_exit, - .conn_schedule = udp_conn_schedule, - .conn_in_get = udp_conn_in_get, - .conn_out_get = udp_conn_out_get, - .snat_handler = udp_snat_handler, - .dnat_handler = udp_dnat_handler, - .csum_check = udp_csum_check, - .state_transition = udp_state_transition, - .state_name = udp_state_name, - .register_app = udp_register_app, - .unregister_app = udp_unregister_app, - .app_conn_bind = udp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = NULL, - .set_state_timeout = udp_set_state_timeout, -}; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c deleted file mode 100644 index a22195f68ac..00000000000 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * IPVS: Round-Robin Scheduling module - * - * Authors: Wensong Zhang - * Peter Kese - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Fixes/Changes: - * Wensong Zhang : changed the ip_vs_rr_schedule to return dest - * Julian Anastasov : fixed the NULL pointer access bug in debugging - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_rr_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include -#include - -#include - - -static int ip_vs_rr_init_svc(struct ip_vs_service *svc) -{ - svc->sched_data = &svc->destinations; - return 0; -} - - -static int ip_vs_rr_update_svc(struct ip_vs_service *svc) -{ - svc->sched_data = &svc->destinations; - return 0; -} - - -/* - * Round-Robin Scheduling - */ -static struct ip_vs_dest * -ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct list_head *p, *q; - struct ip_vs_dest *dest; - - IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); - - write_lock(&svc->sched_lock); - p = (struct list_head *)svc->sched_data; - p = p->next; - q = p; - do { - /* skip list head */ - if (q == &svc->destinations) { - q = q->next; - continue; - } - - dest = list_entry(q, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) - /* HIT */ - goto out; - q = q->next; - } while (q != p); - write_unlock(&svc->sched_lock); - return NULL; - - out: - svc->sched_data = q; - write_unlock(&svc->sched_lock); - IP_VS_DBG_BUF(6, "RR: server %s:%u " - "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), atomic_read(&dest->weight)); - - return dest; -} - - -static struct ip_vs_scheduler ip_vs_rr_scheduler = { - .name = "rr", /* name */ - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .init_service = ip_vs_rr_init_svc, - .update_service = ip_vs_rr_update_svc, - .schedule = ip_vs_rr_schedule, -}; - -static int __init ip_vs_rr_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_rr_scheduler); -} - -static void __exit ip_vs_rr_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); -} - -module_init(ip_vs_rr_init); -module_exit(ip_vs_rr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c deleted file mode 100644 index a46ad9e3501..00000000000 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * Peter Kese - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include -#include -#include -#include -#include - -#include - -/* - * IPVS scheduler list - */ -static LIST_HEAD(ip_vs_schedulers); - -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_sched_lock); - - -/* - * Bind a service with a scheduler - */ -int ip_vs_bind_scheduler(struct ip_vs_service *svc, - struct ip_vs_scheduler *scheduler) -{ - int ret; - - if (svc == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - if (scheduler == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); - return -EINVAL; - } - - svc->scheduler = scheduler; - - if (scheduler->init_service) { - ret = scheduler->init_service(svc); - if (ret) { - IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); - return ret; - } - } - - return 0; -} - - -/* - * Unbind a service with its scheduler - */ -int ip_vs_unbind_scheduler(struct ip_vs_service *svc) -{ - struct ip_vs_scheduler *sched; - - if (svc == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - - sched = svc->scheduler; - if (sched == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); - return -EINVAL; - } - - if (sched->done_service) { - if (sched->done_service(svc) != 0) { - IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); - return -EINVAL; - } - } - - svc->scheduler = NULL; - return 0; -} - - -/* - * Get scheduler in the scheduler list by name - */ -static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) -{ - struct ip_vs_scheduler *sched; - - IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", - sched_name); - - read_lock_bh(&__ip_vs_sched_lock); - - list_for_each_entry(sched, &ip_vs_schedulers, n_list) { - /* - * Test and get the modules atomically - */ - if (sched->module && !try_module_get(sched->module)) { - /* - * This scheduler is just deleted - */ - continue; - } - if (strcmp(sched_name, sched->name)==0) { - /* HIT */ - read_unlock_bh(&__ip_vs_sched_lock); - return sched; - } - if (sched->module) - module_put(sched->module); - } - - read_unlock_bh(&__ip_vs_sched_lock); - return NULL; -} - - -/* - * Lookup scheduler and try to load it if it doesn't exist - */ -struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) -{ - struct ip_vs_scheduler *sched; - - /* - * Search for the scheduler by sched_name - */ - sched = ip_vs_sched_getbyname(sched_name); - - /* - * If scheduler not found, load the module and search again - */ - if (sched == NULL) { - request_module("ip_vs_%s", sched_name); - sched = ip_vs_sched_getbyname(sched_name); - } - - return sched; -} - -void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) -{ - if (scheduler->module) - module_put(scheduler->module); -} - - -/* - * Register a scheduler in the scheduler list - */ -int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) -{ - struct ip_vs_scheduler *sched; - - if (!scheduler) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); - return -EINVAL; - } - - if (!scheduler->name) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); - return -EINVAL; - } - - /* increase the module use count */ - ip_vs_use_count_inc(); - - write_lock_bh(&__ip_vs_sched_lock); - - if (!list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); - ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already linked\n", scheduler->name); - return -EINVAL; - } - - /* - * Make sure that the scheduler with this name doesn't exist - * in the scheduler list. - */ - list_for_each_entry(sched, &ip_vs_schedulers, n_list) { - if (strcmp(scheduler->name, sched->name) == 0) { - write_unlock_bh(&__ip_vs_sched_lock); - ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already existed in the system\n", - scheduler->name); - return -EINVAL; - } - } - /* - * Add it into the d-linked scheduler list - */ - list_add(&scheduler->n_list, &ip_vs_schedulers); - write_unlock_bh(&__ip_vs_sched_lock); - - IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); - - return 0; -} - - -/* - * Unregister a scheduler from the scheduler list - */ -int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) -{ - if (!scheduler) { - IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); - return -EINVAL; - } - - write_lock_bh(&__ip_vs_sched_lock); - if (list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); - IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " - "is not in the list. failed\n", scheduler->name); - return -EINVAL; - } - - /* - * Remove it from the d-linked scheduler list - */ - list_del(&scheduler->n_list); - write_unlock_bh(&__ip_vs_sched_lock); - - /* decrease the module use count */ - ip_vs_use_count_dec(); - - IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); - - return 0; -} diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c deleted file mode 100644 index 7d2f22f04b8..00000000000 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * IPVS: Shortest Expected Delay scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The SED algorithm attempts to minimize each job's expected delay until - * completion. The expected delay that the job will experience is - * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of - * jobs on the ith server and Ui is the fixed service rate (weight) of - * the ith server. The SED algorithm adopts a greedy policy that each does - * what is in its own best interest, i.e. to join the queue which would - * minimize its expected delay of completion. - * - * See the following paper for more information: - * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing - * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, - * pages 986-994, 1988. - * - * Thanks must go to Marko Buuri for talking SED to me. - * - * The difference between SED and WLC is that SED includes the incoming - * job in the cost function (the increment of 1). SED may outperform - * WLC, while scheduling big jobs under larger heterogeneous systems - * (the server weight varies a lot). - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We only use the active connection number in the cost - * calculation here. - */ - return atomic_read(&dest->activeconns) + 1; -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least; - unsigned int loh, doh; - - IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (server expected overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) { - least = dest; - loh = ip_vs_sed_dest_overhead(least); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - doh = ip_vs_sed_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG_BUF(6, "SED: server %s:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_sed_scheduler = -{ - .name = "sed", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_sed_schedule, -}; - - -static int __init ip_vs_sed_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_sed_scheduler); -} - -static void __exit ip_vs_sed_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); -} - -module_init(ip_vs_sed_init); -module_exit(ip_vs_sed_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c deleted file mode 100644 index 1d96de27fef..00000000000 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ /dev/null @@ -1,258 +0,0 @@ -/* - * IPVS: Source Hashing scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The sh algorithm is to select server by the hash key of source IP - * address. The pseudo code is as follows: - * - * n <- servernode[src_ip]; - * if (n is dead) OR - * (n is overloaded) or (n.weight <= 0) then - * return NULL; - * - * return n; - * - * Notes that servernode is a 256-bucket hash table that maps the hash - * index derived from packet source IP address to the current server - * array. If the sh scheduler is used in cache cluster, it is good to - * combine it with cache_bypass feature. When the statically assigned - * server is dead or overloaded, the load balancer can bypass the cache - * server and send requests to the original server directly. - * - */ - -#include -#include -#include -#include - -#include - - -/* - * IPVS SH bucket - */ -struct ip_vs_sh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ -}; - -/* - * for IPVS SH entry hash table - */ -#ifndef CONFIG_IP_VS_SH_TAB_BITS -#define CONFIG_IP_VS_SH_TAB_BITS 8 -#endif -#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS -#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) -#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) - - -/* - * Returns hash value for IPVS SH entry - */ -static inline unsigned ip_vs_sh_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; -} - - -/* - * Get ip_vs_dest associated with supplied parameters. - */ -static inline struct ip_vs_dest * -ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr) -{ - return (tbl[ip_vs_sh_hashkey(addr)]).dest; -} - - -/* - * Assign all the hash buckets of the specified table with the service. - */ -static int -ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) -{ - int i; - struct ip_vs_sh_bucket *b; - struct list_head *p; - struct ip_vs_dest *dest; - - b = tbl; - p = &svc->destinations; - for (i=0; idest = NULL; - } else { - if (p == &svc->destinations) - p = p->next; - - dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; - - p = p->next; - } - b++; - } - return 0; -} - - -/* - * Flush all the hash buckets of the specified table. - */ -static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) -{ - int i; - struct ip_vs_sh_bucket *b; - - b = tbl; - for (i=0; idest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; - } - b++; - } -} - - -static int ip_vs_sh_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl; - - /* allocate the SH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); - - return 0; -} - - -static int ip_vs_sh_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - return 0; -} - - -static int ip_vs_sh_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); - - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); - - return 0; -} - - -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. - */ -static inline int is_overloaded(struct ip_vs_dest *dest) -{ - return dest->flags & IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Source Hashing scheduling - */ -static struct ip_vs_dest * -ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_sh_bucket *tbl; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_sh_bucket *)svc->sched_data; - dest = ip_vs_sh_get(tbl, iph->saddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { - return NULL; - } - - IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->saddr), - NIPQUAD(dest->addr.ip), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS SH Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_sh_scheduler = -{ - .name = "sh", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 0, -#endif - .init_service = ip_vs_sh_init_svc, - .done_service = ip_vs_sh_done_svc, - .update_service = ip_vs_sh_update_svc, - .schedule = ip_vs_sh_schedule, -}; - - -static int __init ip_vs_sh_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_sh_scheduler); -} - - -static void __exit ip_vs_sh_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); -} - - -module_init(ip_vs_sh_init); -module_exit(ip_vs_sh_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c deleted file mode 100644 index de5e7e118ee..00000000000 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ /dev/null @@ -1,942 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the NetFilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang - * - * ip_vs_sync: sync connection info from master load balancer to backups - * through multicast - * - * Changes: - * Alexandre Cassen : Added master & backup support at a time. - * Alexandre Cassen : Added SyncID support for incoming sync - * messages filtering. - * Justin Ossevoort : Fix endian problem on sync message size. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for ip_mc_join_group */ -#include -#include -#include -#include -#include - -#include -#include - -#include - -#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ -#define IP_VS_SYNC_PORT 8848 /* multicast port */ - - -/* - * IPVS sync connection entry - */ -struct ip_vs_sync_conn { - __u8 reserved; - - /* Protocol, addresses and port numbers */ - __u8 protocol; /* Which protocol (TCP/UDP) */ - __be16 cport; - __be16 vport; - __be16 dport; - __be32 caddr; /* client address */ - __be32 vaddr; /* virtual address */ - __be32 daddr; /* destination address */ - - /* Flags and state transition */ - __be16 flags; /* status flags */ - __be16 state; /* state info */ - - /* The sequence options start here */ -}; - -struct ip_vs_sync_conn_options { - struct ip_vs_seq in_seq; /* incoming seq. struct */ - struct ip_vs_seq out_seq; /* outgoing seq. struct */ -}; - -struct ip_vs_sync_thread_data { - struct socket *sock; - char *buf; -}; - -#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) -#define FULL_CONN_SIZE \ -(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) - - -/* - The master mulitcasts messages to the backup load balancers in the - following format. - - 0 1 2 3 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Count Conns | SyncID | Size | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (1) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | . | - | . | - | . | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (n) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -*/ - -#define SYNC_MESG_HEADER_LEN 4 -#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ - -struct ip_vs_sync_mesg { - __u8 nr_conns; - __u8 syncid; - __u16 size; - - /* ip_vs_sync_conn entries start here */ -}; - -/* the maximum length of sync (sending/receiving) message */ -static int sync_send_mesg_maxlen; -static int sync_recv_mesg_maxlen; - -struct ip_vs_sync_buff { - struct list_head list; - unsigned long firstuse; - - /* pointers for the message data */ - struct ip_vs_sync_mesg *mesg; - unsigned char *head; - unsigned char *end; -}; - - -/* the sync_buff list head and the lock */ -static LIST_HEAD(ip_vs_sync_queue); -static DEFINE_SPINLOCK(ip_vs_sync_lock); - -/* current sync_buff for accepting new conn entries */ -static struct ip_vs_sync_buff *curr_sb = NULL; -static DEFINE_SPINLOCK(curr_sb_lock); - -/* ipvs sync daemon state */ -volatile int ip_vs_sync_state = IP_VS_STATE_NONE; -volatile int ip_vs_master_syncid = 0; -volatile int ip_vs_backup_syncid = 0; - -/* multicast interface name */ -char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - -/* sync daemon tasks */ -static struct task_struct *sync_master_thread; -static struct task_struct *sync_backup_thread; - -/* multicast addr */ -static struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = __constant_htons(IP_VS_SYNC_PORT), - .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), -}; - - -static inline struct ip_vs_sync_buff *sb_dequeue(void) -{ - struct ip_vs_sync_buff *sb; - - spin_lock_bh(&ip_vs_sync_lock); - if (list_empty(&ip_vs_sync_queue)) { - sb = NULL; - } else { - sb = list_entry(ip_vs_sync_queue.next, - struct ip_vs_sync_buff, - list); - list_del(&sb->list); - } - spin_unlock_bh(&ip_vs_sync_lock); - - return sb; -} - -static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) -{ - struct ip_vs_sync_buff *sb; - - if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) - return NULL; - - if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { - kfree(sb); - return NULL; - } - sb->mesg->nr_conns = 0; - sb->mesg->syncid = ip_vs_master_syncid; - sb->mesg->size = 4; - sb->head = (unsigned char *)sb->mesg + 4; - sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; - sb->firstuse = jiffies; - return sb; -} - -static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) -{ - kfree(sb->mesg); - kfree(sb); -} - -static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) -{ - spin_lock(&ip_vs_sync_lock); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) - list_add_tail(&sb->list, &ip_vs_sync_queue); - else - ip_vs_sync_buff_release(sb); - spin_unlock(&ip_vs_sync_lock); -} - -/* - * Get the current sync buffer if it has been created for more - * than the specified time or the specified time is zero. - */ -static inline struct ip_vs_sync_buff * -get_curr_sync_buff(unsigned long time) -{ - struct ip_vs_sync_buff *sb; - - spin_lock_bh(&curr_sb_lock); - if (curr_sb && (time == 0 || - time_before(jiffies - curr_sb->firstuse, time))) { - sb = curr_sb; - curr_sb = NULL; - } else - sb = NULL; - spin_unlock_bh(&curr_sb_lock); - return sb; -} - - -/* - * Add an ip_vs_conn information into the current sync_buff. - * Called by ip_vs_in. - */ -void ip_vs_sync_conn(struct ip_vs_conn *cp) -{ - struct ip_vs_sync_mesg *m; - struct ip_vs_sync_conn *s; - int len; - - spin_lock(&curr_sb_lock); - if (!curr_sb) { - if (!(curr_sb=ip_vs_sync_buff_create())) { - spin_unlock(&curr_sb_lock); - IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); - return; - } - } - - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; - m = curr_sb->mesg; - s = (struct ip_vs_sync_conn *)curr_sb->head; - - /* copy members */ - s->protocol = cp->protocol; - s->cport = cp->cport; - s->vport = cp->vport; - s->dport = cp->dport; - s->caddr = cp->caddr.ip; - s->vaddr = cp->vaddr.ip; - s->daddr = cp->daddr.ip; - s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); - s->state = htons(cp->state); - if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { - struct ip_vs_sync_conn_options *opt = - (struct ip_vs_sync_conn_options *)&s[1]; - memcpy(opt, &cp->in_seq, sizeof(*opt)); - } - - m->nr_conns++; - m->size += len; - curr_sb->head += len; - - /* check if there is a space for next one */ - if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { - sb_queue_tail(curr_sb); - curr_sb = NULL; - } - spin_unlock(&curr_sb_lock); - - /* synchronize its controller if it has */ - if (cp->control) - ip_vs_sync_conn(cp->control); -} - - -/* - * Process received multicast message and create the corresponding - * ip_vs_conn entries. - */ -static void ip_vs_process_message(const char *buffer, const size_t buflen) -{ - struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; - struct ip_vs_sync_conn *s; - struct ip_vs_sync_conn_options *opt; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - struct ip_vs_dest *dest; - char *p; - int i; - - if (buflen < sizeof(struct ip_vs_sync_mesg)) { - IP_VS_ERR_RL("sync message header too short\n"); - return; - } - - /* Convert size back to host byte order */ - m->size = ntohs(m->size); - - if (buflen != m->size) { - IP_VS_ERR_RL("bogus sync message size\n"); - return; - } - - /* SyncID sanity check */ - if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { - IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", - m->syncid); - return; - } - - p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); - for (i=0; inr_conns; i++) { - unsigned flags, state; - - if (p + SIMPLE_CONN_SIZE > buffer+buflen) { - IP_VS_ERR_RL("bogus conn in sync message\n"); - return; - } - s = (struct ip_vs_sync_conn *) p; - flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; - flags &= ~IP_VS_CONN_F_HASHED; - if (flags & IP_VS_CONN_F_SEQ_MASK) { - opt = (struct ip_vs_sync_conn_options *)&s[1]; - p += FULL_CONN_SIZE; - if (p > buffer+buflen) { - IP_VS_ERR_RL("bogus conn options in sync message\n"); - return; - } - } else { - opt = NULL; - p += SIMPLE_CONN_SIZE; - } - - state = ntohs(s->state); - if (!(flags & IP_VS_CONN_F_TEMPLATE)) { - pp = ip_vs_proto_get(s->protocol); - if (!pp) { - IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", - s->protocol); - continue; - } - if (state >= pp->num_states) { - IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", - pp->name, state); - continue; - } - } else { - /* protocol in templates is not used for state/timeout */ - pp = NULL; - if (state > 0) { - IP_VS_DBG(2, "Invalid template state %u in sync msg\n", - state); - state = 0; - } - } - - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); - else - cp = ip_vs_ct_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); - if (!cp) { - /* - * Find the appropriate destination for the connection. - * If it is not found the connection will remain unbound - * but still handled. - */ - dest = ip_vs_find_dest(AF_INET, - (union nf_inet_addr *)&s->daddr, - s->dport, - (union nf_inet_addr *)&s->vaddr, - s->vport, - s->protocol); - /* Set the approprite ativity flag */ - if (s->protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } - cp = ip_vs_conn_new(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport, - (union nf_inet_addr *)&s->daddr, - s->dport, - flags, dest); - if (dest) - atomic_dec(&dest->refcnt); - if (!cp) { - IP_VS_ERR("ip_vs_conn_new failed\n"); - return; - } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } - - if (opt) - memcpy(&cp->in_seq, opt, sizeof(*opt)); - atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->state = state; - cp->old_state = cp->state; - /* - * We can not recover the right timeout for templates - * in all cases, we can not find the right fwmark - * virtual service. If needed, we can do it for - * non-fwmark persistent services. - */ - if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) - cp->timeout = pp->timeout_table[state]; - else - cp->timeout = (3*60*HZ); - ip_vs_conn_put(cp); - } -} - - -/* - * Setup loopback of outgoing multicasts on a sending socket - */ -static void set_mcast_loop(struct sock *sk, u_char loop) -{ - struct inet_sock *inet = inet_sk(sk); - - /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ - lock_sock(sk); - inet->mc_loop = loop ? 1 : 0; - release_sock(sk); -} - -/* - * Specify TTL for outgoing multicasts on a sending socket - */ -static void set_mcast_ttl(struct sock *sk, u_char ttl) -{ - struct inet_sock *inet = inet_sk(sk); - - /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ - lock_sock(sk); - inet->mc_ttl = ttl; - release_sock(sk); -} - -/* - * Specifiy default interface for outgoing multicasts - */ -static int set_mcast_if(struct sock *sk, char *ifname) -{ - struct net_device *dev; - struct inet_sock *inet = inet_sk(sk); - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - - if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) - return -EINVAL; - - lock_sock(sk); - inet->mc_index = dev->ifindex; - /* inet->mc_addr = 0; */ - release_sock(sk); - - return 0; -} - - -/* - * Set the maximum length of sync message according to the - * specified interface's MTU. - */ -static int set_sync_mesg_maxlen(int sync_state) -{ - struct net_device *dev; - int num; - - if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) - return -ENODEV; - - num = (dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr) - - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + - SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); - IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", sync_send_mesg_maxlen); - } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) - return -ENODEV; - - sync_recv_mesg_maxlen = dev->mtu - - sizeof(struct iphdr) - sizeof(struct udphdr); - IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", sync_recv_mesg_maxlen); - } - - return 0; -} - - -/* - * Join a multicast group. - * the group is specified by a class D multicast address 224.0.0.0/8 - * in the in_addr structure passed in as a parameter. - */ -static int -join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) -{ - struct ip_mreqn mreq; - struct net_device *dev; - int ret; - - memset(&mreq, 0, sizeof(mreq)); - memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) - return -EINVAL; - - mreq.imr_ifindex = dev->ifindex; - - lock_sock(sk); - ret = ip_mc_join_group(sk, &mreq); - release_sock(sk); - - return ret; -} - - -static int bind_mcastif_addr(struct socket *sock, char *ifname) -{ - struct net_device *dev; - __be32 addr; - struct sockaddr_in sin; - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - - addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); - if (!addr) - IP_VS_ERR("You probably need to specify IP address on " - "multicast interface.\n"); - - IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", - ifname, NIPQUAD(addr)); - - /* Now bind the socket with the address of multicast interface */ - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr; - sin.sin_port = 0; - - return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); -} - -/* - * Set up sending multicast socket over UDP - */ -static struct socket * make_send_sock(void) -{ - struct socket *sock; - int result; - - /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (result < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); - return ERR_PTR(result); - } - - result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error setting outbound mcast interface\n"); - goto error; - } - - set_mcast_loop(sock->sk, 0); - set_mcast_ttl(sock->sk, 1); - - result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error binding address of the mcast interface\n"); - goto error; - } - - result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr), 0); - if (result < 0) { - IP_VS_ERR("Error connecting to the multicast addr\n"); - goto error; - } - - return sock; - - error: - sock_release(sock); - return ERR_PTR(result); -} - - -/* - * Set up receiving multicast socket over UDP - */ -static struct socket * make_receive_sock(void) -{ - struct socket *sock; - int result; - - /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (result < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); - return ERR_PTR(result); - } - - /* it is equivalent to the REUSEADDR option in user-space */ - sock->sk->sk_reuse = 1; - - result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr)); - if (result < 0) { - IP_VS_ERR("Error binding to the multicast addr\n"); - goto error; - } - - /* join the multicast group */ - result = join_mcast_group(sock->sk, - (struct in_addr *) &mcast_addr.sin_addr, - ip_vs_backup_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error joining to the multicast group\n"); - goto error; - } - - return sock; - - error: - sock_release(sock); - return ERR_PTR(result); -} - - -static int -ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) -{ - struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; - struct kvec iov; - int len; - - EnterFunction(7); - iov.iov_base = (void *)buffer; - iov.iov_len = length; - - len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); - - LeaveFunction(7); - return len; -} - -static void -ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) -{ - int msize; - - msize = msg->size; - - /* Put size in network byte order */ - msg->size = htons(msg->size); - - if (ip_vs_send_async(sock, (char *)msg, msize) != msize) - IP_VS_ERR("ip_vs_send_async error\n"); -} - -static int -ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) -{ - struct msghdr msg = {NULL,}; - struct kvec iov; - int len; - - EnterFunction(7); - - /* Receive a packet */ - iov.iov_base = buffer; - iov.iov_len = (size_t)buflen; - - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); - - if (len < 0) - return -1; - - LeaveFunction(7); - return len; -} - - -static int sync_thread_master(void *data) -{ - struct ip_vs_sync_thread_data *tinfo = data; - struct ip_vs_sync_buff *sb; - - IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_master_mcast_ifn, ip_vs_master_syncid); - - while (!kthread_should_stop()) { - while ((sb = sb_dequeue())) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); - } - - /* check if entries stay in curr_sb for 2 seconds */ - sb = get_curr_sync_buff(2 * HZ); - if (sb) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); - } - - schedule_timeout_interruptible(HZ); - } - - /* clean up the sync_buff queue */ - while ((sb=sb_dequeue())) { - ip_vs_sync_buff_release(sb); - } - - /* clean up the current sync_buff */ - if ((sb = get_curr_sync_buff(0))) { - ip_vs_sync_buff_release(sb); - } - - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo); - - return 0; -} - - -static int sync_thread_backup(void *data) -{ - struct ip_vs_sync_thread_data *tinfo = data; - int len; - - IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); - - while (!kthread_should_stop()) { - wait_event_interruptible(*tinfo->sock->sk->sk_sleep, - !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) - || kthread_should_stop()); - - /* do we have data now? */ - while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { - len = ip_vs_receive(tinfo->sock, tinfo->buf, - sync_recv_mesg_maxlen); - if (len <= 0) { - IP_VS_ERR("receiving message error\n"); - break; - } - - /* disable bottom half, because it accesses the data - shared by softirq while getting/creating conns */ - local_bh_disable(); - ip_vs_process_message(tinfo->buf, len); - local_bh_enable(); - } - } - - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo->buf); - kfree(tinfo); - - return 0; -} - - -int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) -{ - struct ip_vs_sync_thread_data *tinfo; - struct task_struct **realtask, *task; - struct socket *sock; - char *name, *buf = NULL; - int (*threadfn)(void *data); - int result = -ENOMEM; - - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", - sizeof(struct ip_vs_sync_conn)); - - if (state == IP_VS_STATE_MASTER) { - if (sync_master_thread) - return -EEXIST; - - strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, - sizeof(ip_vs_master_mcast_ifn)); - ip_vs_master_syncid = syncid; - realtask = &sync_master_thread; - name = "ipvs_syncmaster"; - threadfn = sync_thread_master; - sock = make_send_sock(); - } else if (state == IP_VS_STATE_BACKUP) { - if (sync_backup_thread) - return -EEXIST; - - strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, - sizeof(ip_vs_backup_mcast_ifn)); - ip_vs_backup_syncid = syncid; - realtask = &sync_backup_thread; - name = "ipvs_syncbackup"; - threadfn = sync_thread_backup; - sock = make_receive_sock(); - } else { - return -EINVAL; - } - - if (IS_ERR(sock)) { - result = PTR_ERR(sock); - goto out; - } - - set_sync_mesg_maxlen(state); - if (state == IP_VS_STATE_BACKUP) { - buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); - if (!buf) - goto outsocket; - } - - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - goto outbuf; - - tinfo->sock = sock; - tinfo->buf = buf; - - task = kthread_run(threadfn, tinfo, name); - if (IS_ERR(task)) { - result = PTR_ERR(task); - goto outtinfo; - } - - /* mark as active */ - *realtask = task; - ip_vs_sync_state |= state; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - return 0; - -outtinfo: - kfree(tinfo); -outbuf: - kfree(buf); -outsocket: - sock_release(sock); -out: - return result; -} - - -int stop_sync_thread(int state) -{ - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - - if (state == IP_VS_STATE_MASTER) { - if (!sync_master_thread) - return -ESRCH; - - IP_VS_INFO("stopping master sync thread %d ...\n", - task_pid_nr(sync_master_thread)); - - /* - * The lock synchronizes with sb_queue_tail(), so that we don't - * add sync buffers to the queue, when we are already in - * progress of stopping the master sync daemon. - */ - - spin_lock_bh(&ip_vs_sync_lock); - ip_vs_sync_state &= ~IP_VS_STATE_MASTER; - spin_unlock_bh(&ip_vs_sync_lock); - kthread_stop(sync_master_thread); - sync_master_thread = NULL; - } else if (state == IP_VS_STATE_BACKUP) { - if (!sync_backup_thread) - return -ESRCH; - - IP_VS_INFO("stopping backup sync thread %d ...\n", - task_pid_nr(sync_backup_thread)); - - ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; - kthread_stop(sync_backup_thread); - sync_backup_thread = NULL; - } else { - return -EINVAL; - } - - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return 0; -} diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c deleted file mode 100644 index 8c596e71259..00000000000 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * IPVS: Weighted Least-Connection Scheduling module - * - * Authors: Wensong Zhang - * Peter Kese - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest - * Wensong Zhang : changed to use the inactconns in scheduling - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_wlc_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include -#include - -#include - - -static inline unsigned int -ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. (This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least; - unsigned int loh, doh; - - IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) { - least = dest; - loh = ip_vs_wlc_dest_overhead(least); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - doh = ip_vs_wlc_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG_BUF(6, "WLC: server %s:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_wlc_scheduler = -{ - .name = "wlc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .schedule = ip_vs_wlc_schedule, -}; - - -static int __init ip_vs_wlc_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); -} - -static void __exit ip_vs_wlc_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); -} - -module_init(ip_vs_wlc_init); -module_exit(ip_vs_wlc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c deleted file mode 100644 index 7ea92fed50b..00000000000 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * IPVS: Weighted Round-Robin Scheduling module - * - * Authors: Wensong Zhang - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_wrr_update_svc - * Julian Anastasov : fixed the bug of returning destination - * with weight 0 when all weights are zero - * - */ - -#include -#include -#include - -#include - -/* - * current destination pointer for weighted round-robin scheduling - */ -struct ip_vs_wrr_mark { - struct list_head *cl; /* current list head */ - int cw; /* current weight */ - int mw; /* maximum weight */ - int di; /* decreasing interval */ -}; - - -/* - * Get the gcd of server weights - */ -static int gcd(int a, int b) -{ - int c; - - while ((c = a % b)) { - a = b; - b = c; - } - return b; -} - -static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - int weight; - int g = 0; - - list_for_each_entry(dest, &svc->destinations, n_list) { - weight = atomic_read(&dest->weight); - if (weight > 0) { - if (g > 0) - g = gcd(weight, g); - else - g = weight; - } - } - return g ? g : 1; -} - - -/* - * Get the maximum weight of the service destinations. - */ -static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - int weight = 0; - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (atomic_read(&dest->weight) > weight) - weight = atomic_read(&dest->weight); - } - - return weight; -} - - -static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_wrr_mark *mark; - - /* - * Allocate the mark variable for WRR scheduling - */ - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); - if (mark == NULL) { - IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); - return -ENOMEM; - } - mark->cl = &svc->destinations; - mark->cw = 0; - mark->mw = ip_vs_wrr_max_weight(svc); - mark->di = ip_vs_wrr_gcd_weight(svc); - svc->sched_data = mark; - - return 0; -} - - -static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) -{ - /* - * Release the mark variable - */ - kfree(svc->sched_data); - - return 0; -} - - -static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_wrr_mark *mark = svc->sched_data; - - mark->cl = &svc->destinations; - mark->mw = ip_vs_wrr_max_weight(svc); - mark->di = ip_vs_wrr_gcd_weight(svc); - if (mark->cw > mark->mw) - mark->cw = 0; - return 0; -} - - -/* - * Weighted Round-Robin Scheduling - */ -static struct ip_vs_dest * -ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_wrr_mark *mark = svc->sched_data; - struct list_head *p; - - IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); - - /* - * This loop will always terminate, because mark->cw in (0, max_weight] - * and at least one server has its weight equal to max_weight. - */ - write_lock(&svc->sched_lock); - p = mark->cl; - while (1) { - if (mark->cl == &svc->destinations) { - /* it is at the head of the destination list */ - - if (mark->cl == mark->cl->next) { - /* no dest entry */ - dest = NULL; - goto out; - } - - mark->cl = svc->destinations.next; - mark->cw -= mark->di; - if (mark->cw <= 0) { - mark->cw = mark->mw; - /* - * Still zero, which means no available servers. - */ - if (mark->cw == 0) { - mark->cl = &svc->destinations; - IP_VS_ERR_RL("ip_vs_wrr_schedule(): " - "no available servers\n"); - dest = NULL; - goto out; - } - } - } else - mark->cl = mark->cl->next; - - if (mark->cl != &svc->destinations) { - /* not at the head of the list */ - dest = list_entry(mark->cl, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) >= mark->cw) { - /* got it */ - break; - } - } - - if (mark->cl == p && mark->cw == mark->di) { - /* back to the start, and no dest is found. - It is only possible when all dests are OVERLOADED */ - dest = NULL; - goto out; - } - } - - IP_VS_DBG_BUF(6, "WRR: server %s:%u " - "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), - atomic_read(&dest->weight)); - - out: - write_unlock(&svc->sched_lock); - return dest; -} - - -static struct ip_vs_scheduler ip_vs_wrr_scheduler = { - .name = "wrr", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 1, -#endif - .init_service = ip_vs_wrr_init_svc, - .done_service = ip_vs_wrr_done_svc, - .update_service = ip_vs_wrr_update_svc, - .schedule = ip_vs_wrr_schedule, -}; - -static int __init ip_vs_wrr_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; -} - -static void __exit ip_vs_wrr_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); -} - -module_init(ip_vs_wrr_init); -module_exit(ip_vs_wrr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c deleted file mode 100644 index 02ddc2b3ce2..00000000000 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ /dev/null @@ -1,1004 +0,0 @@ -/* - * ip_vs_xmit.c: various packet transmitters for IPVS - * - * Authors: Wensong Zhang - * Julian Anastasov - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include -#include /* for tcphdr */ -#include -#include /* for csum_tcpudp_magic */ -#include -#include /* for icmp_send */ -#include /* for ip_route_output */ -#include -#include -#include -#include -#include - -#include - - -/* - * Destination cache to speed up outgoing route lookup - */ -static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) -{ - struct dst_entry *old_dst; - - old_dst = dest->dst_cache; - dest->dst_cache = dst; - dest->dst_rtos = rtos; - dst_release(old_dst); -} - -static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) -{ - struct dst_entry *dst = dest->dst_cache; - - if (!dst) - return NULL; - if ((dst->obsolete - || (dest->af == AF_INET && rtos != dest->dst_rtos)) && - dst->ops->check(dst, cookie) == NULL) { - dest->dst_cache = NULL; - dst_release(dst); - return NULL; - } - dst_hold(dst); - return dst; -} - -static struct rtable * -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) -{ - struct rtable *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; - - if (dest) { - spin_lock(&dest->dst_lock); - if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos, 0))) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = dest->addr.ip, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip_route_output error, " - "dest: %u.%u.%u.%u\n", - NIPQUAD(dest->addr.ip)); - return NULL; - } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); - IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", - NIPQUAD(dest->addr.ip), - atomic_read(&rt->u.dst.__refcnt), rtos); - } - spin_unlock(&dest->dst_lock); - } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = cp->daddr.ip, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_route_output error, dest: " - "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip)); - return NULL; - } - } - - return rt; -} - -#ifdef CONFIG_IP_VS_IPV6 -static struct rt6_info * -__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) -{ - struct rt6_info *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; - - if (dest) { - spin_lock(&dest->dst_lock); - rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); - if (!rt) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = dest->addr.in6, - .saddr = { - .s6_addr32 = - { 0, 0, 0, 0 }, - }, - }, - }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, - NULL, &fl); - if (!rt) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip6_route_output error, " - "dest: " NIP6_FMT "\n", - NIP6(dest->addr.in6)); - return NULL; - } - __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst)); - IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n", - NIP6(dest->addr.in6), - atomic_read(&rt->u.dst.__refcnt)); - } - spin_unlock(&dest->dst_lock); - } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = cp->daddr.in6, - .saddr = { - .s6_addr32 = { 0, 0, 0, 0 }, - }, - }, - }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("ip6_route_output error, dest: " - NIP6_FMT "\n", NIP6(cp->daddr.in6)); - return NULL; - } - } - - return rt; -} -#endif - - -/* - * Release dest->dst_cache before a dest is removed - */ -void -ip_vs_dst_reset(struct ip_vs_dest *dest) -{ - struct dst_entry *old_dst; - - old_dst = dest->dst_cache; - dest->dst_cache = NULL; - dst_release(old_dst); -} - -#define IP_VS_XMIT(pf, skb, rt) \ -do { \ - (skb)->ipvs_property = 1; \ - skb_forward_csum(skb); \ - NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - (rt)->u.dst.dev, dst_output); \ -} while (0) - - -/* - * NULL transmitter (do nothing except return NF_ACCEPT) - */ -int -ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - /* we do not touch skb and do not need pskb ptr */ - return NF_ACCEPT; -} - - -/* - * Bypass transmitter - * Let packets bypass the destination when the destination is not - * available, it may be only used in transparent cache cluster. - */ -int -ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - u8 tos = iph->tos; - int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = iph->daddr, - .saddr = 0, - .tos = RT_TOS(tos), } }, - }; - - EnterFunction(10); - - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " - "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); - goto tx_error_icmp; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - struct ipv6hdr *iph = ipv6_hdr(skb); - int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = iph->daddr, - .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, - }; - - EnterFunction(10); - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, " - "dest: " NIP6_FMT "\n", NIP6(iph->daddr)); - goto tx_error_icmp; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); - return NF_STOLEN; - } - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} -#endif - -/* - * NAT transmitter (only for outside-to-inside nat forwarding) - * Not used for related ICMP - */ -int -ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - int mtu; - struct iphdr *iph = ip_hdr(skb); - - EnterFunction(10); - - /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { - __be16 _pt, *p; - p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); - if (p == NULL) - goto tx_error; - ip_vs_conn_fill_cport(cp, *p); - IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); - } - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error; - ip_hdr(skb)->daddr = cp->daddr.ip; - ip_send_check(ip_hdr(skb)); - - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - - /* FIXME: when application helper enlarges the packet and the length - is larger than the MTU of outgoing device, there will be still - MTU problem. */ - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - LeaveFunction(10); - kfree_skb(skb); - return NF_STOLEN; - tx_error_put: - ip_rt_put(rt); - goto tx_error; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - int mtu; - - EnterFunction(10); - - /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { - __be16 _pt, *p; - p = skb_header_pointer(skb, sizeof(struct ipv6hdr), - sizeof(_pt), &_pt); - if (p == NULL) - goto tx_error; - ip_vs_conn_fill_cport(cp, *p); - IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); - } - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "ip_vs_nat_xmit_v6(): frag needed for"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error; - ipv6_hdr(skb)->daddr = cp->daddr.in6; - - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - - /* FIXME: when application helper enlarges the packet and the length - is larger than the MTU of outgoing device, there will be still - MTU problem. */ - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - LeaveFunction(10); - kfree_skb(skb); - return NF_STOLEN; -tx_error_put: - dst_release(&rt->u.dst); - goto tx_error; -} -#endif - - -/* - * IP Tunneling transmitter - * - * This function encapsulates the packet in a new IP packet, its - * destination will be set to cp->daddr. Most code of this function - * is taken from ipip.c. - * - * It is used in VS/TUN cluster. The load balancer selects a real - * server from a cluster based on a scheduling algorithm, - * encapsulates the request packet and forwards it to the selected - * server. For example, all real servers are configured with - * "ifconfig tunl0 up". When the server receives - * the encapsulated packet, it will decapsulate the packet, processe - * the request and return the response packets directly to the client - * without passing the load balancer. This can greatly increase the - * scalability of virtual server. - * - * Used for ANY protocol - */ -int -ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); - u8 tos = old_iph->tos; - __be16 df = old_iph->frag_off; - sk_buff_data_t old_transport_header = skb->transport_header; - struct iphdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - int mtu; - - EnterFunction(10); - - if (skb->protocol != htons(ETH_P_IP)) { - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " - "ETH_P_IP: %d, skb protocol: %d\n", - htons(ETH_P_IP), skb->protocol); - goto tx_error; - } - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) - goto tx_error_icmp; - - tdev = rt->u.dst.dev; - - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); - if (mtu < 68) { - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); - goto tx_error; - } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); - - df |= (old_iph->frag_off & htons(IP_DF)); - - if ((old_iph->frag_off & htons(IP_DF)) - && mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Okay, now see if we can stuff it in the buffer as-is. - */ - max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - kfree_skb(skb); - IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); - return NF_STOLEN; - } - kfree_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); - } - - skb->transport_header = old_transport_header; - - /* fix old IP header checksum */ - ip_send_check(old_iph); - - skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* - * Push down and install the IPIP header. - */ - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr)>>2; - iph->frag_off = df; - iph->protocol = IPPROTO_IPIP; - iph->tos = tos; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; - iph->ttl = old_iph->ttl; - ip_select_ident(iph, &rt->u.dst, NULL); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - ip_local_out(skb); - - LeaveFunction(10); - - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct ipv6hdr *old_iph = ipv6_hdr(skb); - sk_buff_data_t old_transport_header = skb->transport_header; - struct ipv6hdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - int mtu; - - EnterFunction(10); - - if (skb->protocol != htons(ETH_P_IPV6)) { - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, " - "ETH_P_IPV6: %d, skb protocol: %d\n", - htons(ETH_P_IPV6), skb->protocol); - goto tx_error; - } - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - tdev = rt->u.dst.dev; - - mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr); - /* TODO IPv6: do we need this check in IPv6? */ - if (mtu < 1280) { - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n"); - goto tx_error; - } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); - - if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n"); - goto tx_error; - } - - /* - * Okay, now see if we can stuff it in the buffer as-is. - */ - max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); - - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - dst_release(&rt->u.dst); - kfree_skb(skb); - IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n"); - return NF_STOLEN; - } - kfree_skb(skb); - skb = new_skb; - old_iph = ipv6_hdr(skb); - } - - skb->transport_header = old_transport_header; - - skb_push(skb, sizeof(struct ipv6hdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* - * Push down and install the IPIP header. - */ - iph = ipv6_hdr(skb); - iph->version = 6; - iph->nexthdr = IPPROTO_IPV6; - iph->payload_len = old_iph->payload_len + sizeof(old_iph); - iph->priority = old_iph->priority; - memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); - iph->daddr = rt->rt6i_dst.addr; - iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ - iph->hop_limit = old_iph->hop_limit; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - ip6_local_out(skb); - - LeaveFunction(10); - - return NF_STOLEN; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} -#endif - - -/* - * Direct Routing transmitter - * Used for ANY protocol - */ -int -ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - int mtu; - - EnterFunction(10); - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rt6_info *rt; /* Route to the other host */ - int mtu; - - EnterFunction(10); - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - dst_release(&rt->u.dst); - IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(skb == NULL)) { - dst_release(&rt->u.dst); - return NF_STOLEN; - } - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} -#endif - - -/* - * ICMP packet transmitter - * called by the ip_vs_in_icmp - */ -int -ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset) -{ - struct rtable *rt; /* Route to the other host */ - int mtu; - int rc; - - EnterFunction(10); - - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be - forwarded directly here, because there is no need to - translate address/port back */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); - else - rc = NF_ACCEPT; - /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); - goto out; - } - - /* - * mangle and send the packet here (only for VS/NAT) - */ - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - ip_vs_nat_icmp(skb, pp, cp, 0); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET, skb, rt); - - rc = NF_STOLEN; - goto out; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - dev_kfree_skb(skb); - rc = NF_STOLEN; - out: - LeaveFunction(10); - return rc; - tx_error_put: - ip_rt_put(rt); - goto tx_error; -} - -#ifdef CONFIG_IP_VS_IPV6 -int -ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset) -{ - struct rt6_info *rt; /* Route to the other host */ - int mtu; - int rc; - - EnterFunction(10); - - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be - forwarded directly here, because there is no need to - translate address/port back */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); - else - rc = NF_ACCEPT; - /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); - goto out; - } - - /* - * mangle and send the packet here (only for VS/NAT) - */ - - rt = __ip_vs_get_out_rt_v6(cp); - if (!rt) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if (skb->len > mtu) { - dst_release(&rt->u.dst); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - ip_vs_nat_icmp_v6(skb, pp, cp, 0); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(PF_INET6, skb, rt); - - rc = NF_STOLEN; - goto out; - -tx_error_icmp: - dst_link_failure(skb); -tx_error: - dev_kfree_skb(skb); - rc = NF_STOLEN; -out: - LeaveFunction(10); - return rc; -tx_error_put: - dst_release(&rt->u.dst); - goto tx_error; -} -#endif diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ee898e74808..73f9378e1bf 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -838,3 +838,5 @@ config NETFILTER_XT_MATCH_HASHLIMIT endmenu +source "net/netfilter/ipvs/Kconfig" + diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3bd2cc556ae..cf75055be83 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -83,3 +83,6 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o + +# IPVS +obj-$(CONFIG_IP_VS) += ipvs/ diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig new file mode 100644 index 00000000000..de6004de80b --- /dev/null +++ b/net/netfilter/ipvs/Kconfig @@ -0,0 +1,239 @@ +# +# IP Virtual Server configuration +# +menuconfig IP_VS + tristate "IP virtual server support (EXPERIMENTAL)" + depends on NETFILTER + ---help--- + IP Virtual Server support will let you build a high-performance + virtual server based on cluster of two or more real servers. This + option must be enabled for at least one of the clustered computers + that will take care of intercepting incoming connections to a + single IP address and scheduling them to real servers. + + Three request dispatching techniques are implemented, they are + virtual server via NAT, virtual server via tunneling and virtual + server via direct routing. The several scheduling algorithms can + be used to choose which server the connection is directed to, + thus load balancing can be achieved among the servers. For more + information and its administration program, please visit the + following URL: . + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +if IP_VS + +config IP_VS_IPV6 + bool "IPv6 support for IPVS (DANGEROUS)" + depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) + ---help--- + Add IPv6 support to IPVS. This is incomplete and might be dangerous. + + Say N if unsure. + +config IP_VS_DEBUG + bool "IP virtual server debugging" + ---help--- + Say Y here if you want to get additional messages useful in + debugging the IP virtual server code. You can change the debug + level in /proc/sys/net/ipv4/vs/debug_level + +config IP_VS_TAB_BITS + int "IPVS connection table size (the Nth power of 2)" + range 8 20 + default 12 + ---help--- + The IPVS connection hash table uses the chaining scheme to handle + hash collisions. Using a big IPVS connection hash table will greatly + reduce conflicts when there are hundreds of thousands of connections + in the hash table. + + Note the table size must be power of 2. The table size will be the + value of 2 to the your input number power. The number to choose is + from 8 to 20, the default number is 12, which means the table size + is 4096. Don't input the number too small, otherwise you will lose + performance on it. You can adapt the table size yourself, according + to your virtual server application. It is good to set the table size + not far less than the number of connections per second multiplying + average lasting time of connection in the table. For example, your + virtual server gets 200 connections per second, the connection lasts + for 200 seconds in average in the connection table, the table size + should be not far less than 200x200, it is good to set the table + size 32768 (2**15). + + Another note that each connection occupies 128 bytes effectively and + each hash entry uses 8 bytes, so you can estimate how much memory is + needed for your box. + +comment "IPVS transport protocol load balancing support" + +config IP_VS_PROTO_TCP + bool "TCP load balancing support" + ---help--- + This option enables support for load balancing TCP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_UDP + bool "UDP load balancing support" + ---help--- + This option enables support for load balancing UDP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_AH_ESP + bool + depends on UNDEFINED + +config IP_VS_PROTO_ESP + bool "ESP load balancing support" + select IP_VS_PROTO_AH_ESP + ---help--- + This option enables support for load balancing ESP (Encapsulation + Security Payload) transport protocol. Say Y if unsure. + +config IP_VS_PROTO_AH + bool "AH load balancing support" + select IP_VS_PROTO_AH_ESP + ---help--- + This option enables support for load balancing AH (Authentication + Header) transport protocol. Say Y if unsure. + +comment "IPVS scheduler" + +config IP_VS_RR + tristate "round-robin scheduling" + ---help--- + The robin-robin scheduling algorithm simply directs network + connections to different real servers in a round-robin manner. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WRR + tristate "weighted round-robin scheduling" + ---help--- + The weighted robin-robin scheduling algorithm directs network + connections to different real servers based on server weights + in a round-robin manner. Servers with higher weights receive + new connections first than those with less weights, and servers + with higher weights get more connections than those with less + weights and servers with equal weights get equal connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LC + tristate "least-connection scheduling" + ---help--- + The least-connection scheduling algorithm directs network + connections to the server with the least number of active + connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WLC + tristate "weighted least-connection scheduling" + ---help--- + The weighted least-connection scheduling algorithm directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLC + tristate "locality-based least-connection scheduling" + ---help--- + The locality-based least-connection scheduling algorithm is for + destination IP load balancing. It is usually used in cache cluster. + This algorithm usually directs packet destined for an IP address to + its server if the server is alive and under load. If the server is + overloaded (its active connection numbers is larger than its weight) + and there is a server in its half load, then allocate the weighted + least-connection server to this IP address. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLCR + tristate "locality-based least-connection with replication scheduling" + ---help--- + The locality-based least-connection with replication scheduling + algorithm is also for destination IP load balancing. It is + usually used in cache cluster. It differs from the LBLC scheduling + as follows: the load balancer maintains mappings from a target + to a set of server nodes that can serve the target. Requests for + a target are assigned to the least-connection node in the target's + server set. If all the node in the server set are over loaded, + it picks up a least-connection node in the cluster and adds it + in the sever set for the target. If the server set has not been + modified for the specified time, the most loaded node is removed + from the server set, in order to avoid high degree of replication. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_DH + tristate "destination hashing scheduling" + ---help--- + The destination hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their destination IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SH + tristate "source hashing scheduling" + ---help--- + The source hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their source IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SED + tristate "shortest expected delay scheduling" + ---help--- + The shortest expected delay scheduling algorithm assigns network + connections to the server with the shortest expected delay. The + expected delay that the job will experience is (Ci + 1) / Ui if + sent to the ith server, in which Ci is the number of connections + on the ith server and Ui is the fixed service rate (weight) + of the ith server. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_NQ + tristate "never queue scheduling" + ---help--- + The never queue scheduling algorithm adopts a two-speed model. + When there is an idle server available, the job will be sent to + the idle server, instead of waiting for a fast one. When there + is no idle server available, the job will be sent to the server + that minimize its expected delay (The Shortest Expected Delay + scheduling algorithm). + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +comment 'IPVS application helper' + +config IP_VS_FTP + tristate "FTP protocol helper" + depends on IP_VS_PROTO_TCP + ---help--- + FTP is a protocol that transfers IP address and/or port number in + the payload. In the virtual server via Network Address Translation, + the IP address and port number of real servers cannot be sent to + clients in ftp connections directly, so FTP protocol helper is + required for tracking the connection and mangling it back to that of + virtual service. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +endif # IP_VS diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile new file mode 100644 index 00000000000..73a46fe1fe4 --- /dev/null +++ b/net/netfilter/ipvs/Makefile @@ -0,0 +1,33 @@ +# +# Makefile for the IPVS modules on top of IPv4. +# + +# IPVS transport protocol load balancing support +ip_vs_proto-objs-y := +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o + +ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ + ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ + ip_vs_est.o ip_vs_proto.o \ + $(ip_vs_proto-objs-y) + + +# IPVS core +obj-$(CONFIG_IP_VS) += ip_vs.o + +# IPVS schedulers +obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o +obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o +obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o +obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o +obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o +obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o +obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o +obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o + +# IPVS application helpers +obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c new file mode 100644 index 00000000000..201b8ea3020 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -0,0 +1,622 @@ +/* + * ip_vs_app.c: Application module support for IPVS + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference + * is that ip_vs_app module handles the reverse direction (incoming requests + * and outgoing responses). + * + * IP_MASQ_APP application masquerading module + * + * Author: Juan Jose Ciarlante, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +EXPORT_SYMBOL(register_ip_vs_app); +EXPORT_SYMBOL(unregister_ip_vs_app); +EXPORT_SYMBOL(register_ip_vs_app_inc); + +/* ipvs application list head */ +static LIST_HEAD(ip_vs_app_list); +static DEFINE_MUTEX(__ip_vs_app_mutex); + + +/* + * Get an ip_vs_app object + */ +static inline int ip_vs_app_get(struct ip_vs_app *app) +{ + return try_module_get(app->module); +} + + +static inline void ip_vs_app_put(struct ip_vs_app *app) +{ + module_put(app->module); +} + + +/* + * Allocate/initialize app incarnation and register it in proto apps. + */ +static int +ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + struct ip_vs_protocol *pp; + struct ip_vs_app *inc; + int ret; + + if (!(pp = ip_vs_proto_get(proto))) + return -EPROTONOSUPPORT; + + if (!pp->unregister_app) + return -EOPNOTSUPP; + + inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); + if (!inc) + return -ENOMEM; + INIT_LIST_HEAD(&inc->p_list); + INIT_LIST_HEAD(&inc->incs_list); + inc->app = app; + inc->port = htons(port); + atomic_set(&inc->usecnt, 0); + + if (app->timeouts) { + inc->timeout_table = + ip_vs_create_timeout_table(app->timeouts, + app->timeouts_size); + if (!inc->timeout_table) { + ret = -ENOMEM; + goto out; + } + } + + ret = pp->register_app(inc); + if (ret) + goto out; + + list_add(&inc->a_list, &app->incs_list); + IP_VS_DBG(9, "%s application %s:%u registered\n", + pp->name, inc->name, inc->port); + + return 0; + + out: + kfree(inc->timeout_table); + kfree(inc); + return ret; +} + + +/* + * Release app incarnation + */ +static void +ip_vs_app_inc_release(struct ip_vs_app *inc) +{ + struct ip_vs_protocol *pp; + + if (!(pp = ip_vs_proto_get(inc->protocol))) + return; + + if (pp->unregister_app) + pp->unregister_app(inc); + + IP_VS_DBG(9, "%s App %s:%u unregistered\n", + pp->name, inc->name, inc->port); + + list_del(&inc->a_list); + + kfree(inc->timeout_table); + kfree(inc); +} + + +/* + * Get reference to app inc (only called from softirq) + * + */ +int ip_vs_app_inc_get(struct ip_vs_app *inc) +{ + int result; + + atomic_inc(&inc->usecnt); + if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) + atomic_dec(&inc->usecnt); + return result; +} + + +/* + * Put the app inc (only called from timer or net softirq) + */ +void ip_vs_app_inc_put(struct ip_vs_app *inc) +{ + ip_vs_app_put(inc->app); + atomic_dec(&inc->usecnt); +} + + +/* + * Register an application incarnation in protocol applications + */ +int +register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + int result; + + mutex_lock(&__ip_vs_app_mutex); + + result = ip_vs_app_inc_new(app, proto, port); + + mutex_unlock(&__ip_vs_app_mutex); + + return result; +} + + +/* + * ip_vs_app registration routine + */ +int register_ip_vs_app(struct ip_vs_app *app) +{ + /* increase the module use count */ + ip_vs_use_count_inc(); + + mutex_lock(&__ip_vs_app_mutex); + + list_add(&app->a_list, &ip_vs_app_list); + + mutex_unlock(&__ip_vs_app_mutex); + + return 0; +} + + +/* + * ip_vs_app unregistration routine + * We are sure there are no app incarnations attached to services + */ +void unregister_ip_vs_app(struct ip_vs_app *app) +{ + struct ip_vs_app *inc, *nxt; + + mutex_lock(&__ip_vs_app_mutex); + + list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { + ip_vs_app_inc_release(inc); + } + + list_del(&app->a_list); + + mutex_unlock(&__ip_vs_app_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + + +/* + * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) + */ +int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) +{ + return pp->app_conn_bind(cp); +} + + +/* + * Unbind cp from application incarnation (called by cp destructor) + */ +void ip_vs_unbind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *inc = cp->app; + + if (!inc) + return; + + if (inc->unbind_conn) + inc->unbind_conn(inc, cp); + if (inc->done_conn) + inc->done_conn(inc, cp); + ip_vs_app_inc_put(inc); + cp->app = NULL; +} + + +/* + * Fixes th->seq based on ip_vs_seq info. + */ +static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + if (vseq->delta || vseq->previous_delta) { + if(after(seq, vseq->init_seq)) { + th->seq = htonl(seq + vseq->delta); + IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", + vseq->delta); + } else { + th->seq = htonl(seq + vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " + "(%d) to seq\n", vseq->previous_delta); + } + } +} + + +/* + * Fixes th->ack_seq based on ip_vs_seq info. + */ +static inline void +vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 ack_seq = ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + if (vseq->delta || vseq->previous_delta) { + /* since ack_seq is the number of octet that is expected + to receive next, so compare it with init_seq+delta */ + if(after(ack_seq, vseq->init_seq+vseq->delta)) { + th->ack_seq = htonl(ack_seq - vseq->delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " + "(%d) from ack_seq\n", vseq->delta); + + } else { + th->ack_seq = htonl(ack_seq - vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " + "previous_delta (%d) from ack_seq\n", + vseq->previous_delta); + } + } +} + + +/* + * Updates ip_vs_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. + */ +static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, + unsigned flag, __u32 seq, int diff) +{ + /* spinlock is to keep updating cp->flags atomic */ + spin_lock(&cp->lock); + if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { + vseq->previous_delta = vseq->delta; + vseq->delta += diff; + vseq->init_seq = seq; + cp->flags |= flag; + } + spin_unlock(&cp->lock); +} + +static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_seq(&cp->out_seq, th); + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_ack_seq(&cp->in_seq, th); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + if (!app->pkt_out(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->out_seq, + IP_VS_CONN_F_OUT_SEQ, seq, diff); + + return 1; +} + +/* + * Output pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL + * returns false if it can't handle packet (oom) + */ +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_out(cp, skb, app); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + return app->pkt_out(app, cp, skb, NULL); +} + + +static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_seq(&cp->in_seq, th); + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_ack_seq(&cp->out_seq, th); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + if (!app->pkt_in(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->in_seq, + IP_VS_CONN_F_IN_SEQ, seq, diff); + + return 1; +} + +/* + * Input pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL. + * returns false if can't handle packet (oom). + */ +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_in(cp, skb, app); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + return app->pkt_in(app, cp, skb, NULL); +} + + +#ifdef CONFIG_PROC_FS +/* + * /proc/net/ip_vs_app entry function + */ + +static struct ip_vs_app *ip_vs_app_idx(loff_t pos) +{ + struct ip_vs_app *app, *inc; + + list_for_each_entry(app, &ip_vs_app_list, a_list) { + list_for_each_entry(inc, &app->incs_list, a_list) { + if (pos-- == 0) + return inc; + } + } + return NULL; + +} + +static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) +{ + mutex_lock(&__ip_vs_app_mutex); + + return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_app *inc, *app; + struct list_head *e; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_app_idx(0); + + inc = v; + app = inc->app; + + if ((e = inc->a_list.next) != &app->incs_list) + return list_entry(e, struct ip_vs_app, a_list); + + /* go on to next application */ + for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { + app = list_entry(e, struct ip_vs_app, a_list); + list_for_each_entry(inc, &app->incs_list, a_list) { + return inc; + } + } + return NULL; +} + +static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) +{ + mutex_unlock(&__ip_vs_app_mutex); +} + +static int ip_vs_app_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "prot port usecnt name\n"); + else { + const struct ip_vs_app *inc = v; + + seq_printf(seq, "%-3s %-7u %-6d %-17s\n", + ip_vs_proto_name(inc->protocol), + ntohs(inc->port), + atomic_read(&inc->usecnt), + inc->name); + } + return 0; +} + +static const struct seq_operations ip_vs_app_seq_ops = { + .start = ip_vs_app_seq_start, + .next = ip_vs_app_seq_next, + .stop = ip_vs_app_seq_stop, + .show = ip_vs_app_seq_show, +}; + +static int ip_vs_app_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_app_seq_ops); +} + +static const struct file_operations ip_vs_app_fops = { + .owner = THIS_MODULE, + .open = ip_vs_app_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + + +/* + * Replace a segment of data with a new segment + */ +int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, + char *o_buf, int o_len, char *n_buf, int n_len) +{ + int diff; + int o_offset; + int o_left; + + EnterFunction(9); + + diff = n_len - o_len; + o_offset = o_buf - (char *)skb->data; + /* The length of left data after o_buf+o_len in the skb data */ + o_left = skb->len - (o_offset + o_len); + + if (diff <= 0) { + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + skb_trim(skb, skb->len + diff); + } else if (diff <= skb_tailroom(skb)) { + skb_put(skb, diff); + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + } else { + if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) + return -ENOMEM; + skb_put(skb, diff); + memmove(skb->data + o_offset + n_len, + skb->data + o_offset + o_len, o_left); + skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); + } + + /* must update the iph total length here */ + ip_hdr(skb)->tot_len = htons(skb->len); + + LeaveFunction(9); + return 0; +} + + +int __init ip_vs_app_init(void) +{ + /* we will replace it with proc_net_ipvs_create() soon */ + proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); + return 0; +} + + +void ip_vs_app_cleanup(void) +{ + proc_net_remove(&init_net, "ip_vs_app"); +} diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c new file mode 100644 index 00000000000..9a24332fbed --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -0,0 +1,1110 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. Many code here is taken from IP MASQ code of kernel 2.2. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include /* for proc_net_* */ +#include +#include +#include + +#include +#include + + +/* + * Connection hash table: for input and output packets lookups of IPVS + */ +static struct list_head *ip_vs_conn_tab; + +/* SLAB cache for IPVS connections */ +static struct kmem_cache *ip_vs_conn_cachep __read_mostly; + +/* counter for current IPVS connections */ +static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); + +/* counter for no client port connections */ +static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); + +/* random value for IPVS connection hash */ +static unsigned int ip_vs_conn_rnd; + +/* + * Fine locking granularity for big connection hash table + */ +#define CT_LOCKARRAY_BITS 4 +#define CT_LOCKARRAY_SIZE (1<ip, (__force u32)port, proto, + ip_vs_conn_rnd) + & IP_VS_CONN_TAB_MASK; +} + + +/* + * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. + * returns bool success. + */ +static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + /* Hash by protocol, client address and port */ + hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + + ct_write_lock(hash); + + if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + list_add(&cp->c_list, &ip_vs_conn_tab[hash]); + cp->flags |= IP_VS_CONN_F_HASHED; + atomic_inc(&cp->refcnt); + ret = 1; + } else { + IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + ret = 0; + } + + ct_write_unlock(hash); + + return ret; +} + + +/* + * UNhashes ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + /* unhash it and decrease its reference counter */ + hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + + ct_write_lock(hash); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + list_del(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + atomic_dec(&cp->refcnt); + ret = 1; + } else + ret = 0; + + ct_write_unlock(hash); + + return ret; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from OUTside-to-INside. + * s_addr, s_port: pkt source address (foreign host) + * d_addr, d_port: pkt dest address (load balancer) + */ +static inline struct ip_vs_conn *__ip_vs_conn_in_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == af && + ip_vs_addr_equal(af, s_addr, &cp->caddr) && + ip_vs_addr_equal(af, d_addr, &cp->vaddr) && + s_port == cp->cport && d_port == cp->vport && + ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ct_read_unlock(hash); + return cp; + } + } + + ct_read_unlock(hash); + + return NULL; +} + +struct ip_vs_conn *ip_vs_conn_in_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + struct ip_vs_conn *cp; + + cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) + cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, + d_port); + + IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + cp ? "hit" : "not hit"); + + return cp; +} + +/* Get reference to connection template */ +struct ip_vs_conn *ip_vs_ct_in_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == af && + ip_vs_addr_equal(af, s_addr, &cp->caddr) && + ip_vs_addr_equal(af, d_addr, &cp->vaddr) && + s_port == cp->cport && d_port == cp->vport && + cp->flags & IP_VS_CONN_F_TEMPLATE && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + goto out; + } + } + cp = NULL; + + out: + ct_read_unlock(hash); + + IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + cp ? "hit" : "not hit"); + + return cp; +} + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * s_addr, s_port: pkt source address (inside host) + * d_addr, d_port: pkt dest address (foreign host) + */ +struct ip_vs_conn *ip_vs_conn_out_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp, *ret=NULL; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == af && + ip_vs_addr_equal(af, d_addr, &cp->caddr) && + ip_vs_addr_equal(af, s_addr, &cp->daddr) && + d_port == cp->cport && s_port == cp->dport && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ret = cp; + break; + } + } + + ct_read_unlock(hash); + + IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ret ? "hit" : "not hit"); + + return ret; +} + + +/* + * Put back the conn and restart its timer with its timeout + */ +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + /* reset it expire in its timeout */ + mod_timer(&cp->timer, jiffies+cp->timeout); + + __ip_vs_conn_put(cp); +} + + +/* + * Fill a no_client_port connection with a client port number + */ +void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) +{ + if (ip_vs_conn_unhash(cp)) { + spin_lock(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + atomic_dec(&ip_vs_conn_no_cport_cnt); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + } + spin_unlock(&cp->lock); + + /* hash on new dport */ + ip_vs_conn_hash(cp); + } +} + + +/* + * Bind a connection entry with the corresponding packet_xmit. + * Called by ip_vs_conn_new. + */ +static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit; + break; + } +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit_v6; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit_v6; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit_v6; + break; + } +} +#endif + + +static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->activeconns) + + atomic_read(&dest->inactconns); +} + +/* + * Bind a connection entry with a virtual service destination + * Called just after a new connection entry is created. + */ +static inline void +ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) +{ + /* if dest is NULL, then return directly */ + if (!dest) + return; + + /* Increase the refcnt counter of the dest */ + atomic_inc(&dest->refcnt); + + /* Bind with the destination and its corresponding transmitter */ + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) + /* if the connection is not template and is created + * by sync, preserve the activity flag. + */ + cp->flags |= atomic_read(&dest->conn_flags) & + (~IP_VS_CONN_F_INACTIVE); + else + cp->flags |= atomic_read(&dest->conn_flags); + cp->dest = dest; + + IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so increase the inactive + connection counter because it is in TCP SYNRECV + state (inactive) or other protocol inacive state */ + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + atomic_inc(&dest->activeconns); + else + atomic_inc(&dest->inactconns); + } else { + /* It is a persistent connection/template, so increase + the peristent connection counter */ + atomic_inc(&dest->persistconns); + } + + if (dest->u_threshold != 0 && + ip_vs_dest_totalconns(dest) >= dest->u_threshold) + dest->flags |= IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Check if there is a destination for the connection, if so + * bind the connection to the destination. + */ +struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest; + + if ((cp) && (!cp->dest)) { + dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, + &cp->vaddr, cp->vport, + cp->protocol); + ip_vs_bind_dest(cp, dest); + return dest; + } else + return NULL; +} + + +/* + * Unbind a connection entry with its VS destination + * Called by the ip_vs_conn_expire function. + */ +static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest = cp->dest; + + if (!dest) + return; + + IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so decrease the inactconns + or activeconns counter */ + if (cp->flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } else { + /* It is a persistent connection/template, so decrease + the peristent connection counter */ + atomic_dec(&dest->persistconns); + } + + if (dest->l_threshold != 0) { + if (ip_vs_dest_totalconns(dest) < dest->l_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else if (dest->u_threshold != 0) { + if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } + + /* + * Simply decrease the refcnt of the dest, because the + * dest will be either in service's destination list + * or in the trash. + */ + atomic_dec(&dest->refcnt); +} + + +/* + * Checking if the destination of a connection template is available. + * If available, return 1, otherwise invalidate this connection + * template and return 0. + */ +int ip_vs_check_template(struct ip_vs_conn *ct) +{ + struct ip_vs_dest *dest = ct->dest; + + /* + * Checking the dest server status. + */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE) || + (sysctl_ip_vs_expire_quiescent_template && + (atomic_read(&dest->weight) == 0))) { + IP_VS_DBG_BUF(9, "check_template: dest not available for " + "protocol %s s:%s:%d v:%s:%d " + "-> d:%s:%d\n", + ip_vs_proto_name(ct->protocol), + IP_VS_DBG_ADDR(ct->af, &ct->caddr), + ntohs(ct->cport), + IP_VS_DBG_ADDR(ct->af, &ct->vaddr), + ntohs(ct->vport), + IP_VS_DBG_ADDR(ct->af, &ct->daddr), + ntohs(ct->dport)); + + /* + * Invalidate the connection template + */ + if (ct->vport != htons(0xffff)) { + if (ip_vs_conn_unhash(ct)) { + ct->dport = htons(0xffff); + ct->vport = htons(0xffff); + ct->cport = 0; + ip_vs_conn_hash(ct); + } + } + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. + */ + atomic_dec(&ct->refcnt); + return 0; + } + return 1; +} + +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + + cp->timeout = 60*HZ; + + /* + * hey, I'm using it + */ + atomic_inc(&cp->refcnt); + + /* + * do I control anybody? + */ + if (atomic_read(&cp->n_control)) + goto expire_later; + + /* + * unhash it if it is hashed in the conn table + */ + if (!ip_vs_conn_unhash(cp)) + goto expire_later; + + /* + * refcnt==1 implies I'm the only one referrer + */ + if (likely(atomic_read(&cp->refcnt) == 1)) { + /* delete the timer if it is activated by other users */ + if (timer_pending(&cp->timer)) + del_timer(&cp->timer); + + /* does anybody control me? */ + if (cp->control) + ip_vs_control_del(cp); + + if (unlikely(cp->app != NULL)) + ip_vs_unbind_app(cp); + ip_vs_unbind_dest(cp); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) + atomic_dec(&ip_vs_conn_no_cport_cnt); + atomic_dec(&ip_vs_conn_count); + + kmem_cache_free(ip_vs_conn_cachep, cp); + return; + } + + /* hash it back to the table */ + ip_vs_conn_hash(cp); + + expire_later: + IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", + atomic_read(&cp->refcnt)-1, + atomic_read(&cp->n_control)); + + ip_vs_conn_put(cp); +} + + +void ip_vs_conn_expire_now(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) + mod_timer(&cp->timer, jiffies); +} + + +/* + * Create a new connection entry and hash it into the ip_vs_conn_tab + */ +struct ip_vs_conn * +ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + const union nf_inet_addr *daddr, __be16 dport, unsigned flags, + struct ip_vs_dest *dest) +{ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); + if (cp == NULL) { + IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); + return NULL; + } + + INIT_LIST_HEAD(&cp->c_list); + setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + cp->af = af; + cp->protocol = proto; + ip_vs_addr_copy(af, &cp->caddr, caddr); + cp->cport = cport; + ip_vs_addr_copy(af, &cp->vaddr, vaddr); + cp->vport = vport; + ip_vs_addr_copy(af, &cp->daddr, daddr); + cp->dport = dport; + cp->flags = flags; + spin_lock_init(&cp->lock); + + /* + * Set the entry is referenced by the current thread before hashing + * it in the table, so that other thread run ip_vs_random_dropentry + * but cannot drop this entry. + */ + atomic_set(&cp->refcnt, 1); + + atomic_set(&cp->n_control, 0); + atomic_set(&cp->in_pkts, 0); + + atomic_inc(&ip_vs_conn_count); + if (flags & IP_VS_CONN_F_NO_CPORT) + atomic_inc(&ip_vs_conn_no_cport_cnt); + + /* Bind the connection with a destination server */ + ip_vs_bind_dest(cp, dest); + + /* Set its state and timeout */ + cp->state = 0; + cp->timeout = 3*HZ; + + /* Bind its packet transmitter */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + if (unlikely(pp && atomic_read(&pp->appcnt))) + ip_vs_bind_app(cp, pp); + + /* Hash it in the ip_vs_conn_tab finally */ + ip_vs_conn_hash(cp); + + return cp; +} + + +/* + * /proc/net/ip_vs_conn entries + */ +#ifdef CONFIG_PROC_FS + +static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +{ + int idx; + struct ip_vs_conn *cp; + + for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + ct_read_lock_bh(idx); + list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + if (pos-- == 0) { + seq->private = &ip_vs_conn_tab[idx]; + return cp; + } + } + ct_read_unlock_bh(idx); + } + + return NULL; +} + +static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) +{ + seq->private = NULL; + return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; +} + +static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_conn *cp = v; + struct list_head *e, *l = seq->private; + int idx; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_conn_array(seq, 0); + + /* more on same hash chain? */ + if ((e = cp->c_list.next) != l) + return list_entry(e, struct ip_vs_conn, c_list); + + idx = l - ip_vs_conn_tab; + ct_read_unlock_bh(idx); + + while (++idx < IP_VS_CONN_TAB_SIZE) { + ct_read_lock_bh(idx); + list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + seq->private = &ip_vs_conn_tab[idx]; + return cp; + } + ct_read_unlock_bh(idx); + } + seq->private = NULL; + return NULL; +} + +static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) +{ + struct list_head *l = seq->private; + + if (l) + ct_read_unlock_bh(l - ip_vs_conn_tab); +} + +static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); + else { + const struct ip_vs_conn *cp = v; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, + "%-3s " NIP6_FMT " %04X " NIP6_FMT + " %04X " NIP6_FMT " %04X %-11s %7lu\n", + ip_vs_proto_name(cp->protocol), + NIP6(cp->caddr.in6), ntohs(cp->cport), + NIP6(cp->vaddr.in6), ntohs(cp->vport), + NIP6(cp->daddr.in6), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X" + " %08X %04X %-11s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_seq_show, +}; + +static int ip_vs_conn_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_conn_seq_ops); +} + +static const struct file_operations ip_vs_conn_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const char *ip_vs_origin_name(unsigned flags) +{ + if (flags & IP_VS_CONN_F_SYNC) + return "SYNC"; + else + return "LOCAL"; +} + +static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); + else { + const struct ip_vs_conn *cp = v; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, + "%-3s " NIP6_FMT " %04X " NIP6_FMT + " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + NIP6(cp->caddr.in6), ntohs(cp->cport), + NIP6(cp->vaddr.in6), ntohs(cp->vport), + NIP6(cp->daddr.in6), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X " + "%08X %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_sync_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_sync_seq_show, +}; + +static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_conn_sync_seq_ops); +} + +static const struct file_operations ip_vs_conn_sync_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_sync_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + + +/* + * Randomly drop connection entries before running out of memory + */ +static inline int todrop_entry(struct ip_vs_conn *cp) +{ + /* + * The drop rate array needs tuning for real environments. + * Called from timer bh only => no locking + */ + static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static char todrop_counter[9] = {0}; + int i; + + /* if the conn entry hasn't lasted for 60 seconds, don't drop it. + This will leave enough time for normal connection to get + through. */ + if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) + return 0; + + /* Don't drop the entry if its number of incoming packets is not + located in [0, 8] */ + i = atomic_read(&cp->in_pkts); + if (i > 8 || i < 0) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + +/* Called from keventd and must protect itself from softirqs */ +void ip_vs_random_dropentry(void) +{ + int idx; + struct ip_vs_conn *cp; + + /* + * Randomly scan 1/32 of the whole table every second + */ + for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { + unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; + + /* + * Lock is actually needed in this loop. + */ + ct_write_lock_bh(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + /* connection template */ + continue; + + if (cp->protocol == IPPROTO_TCP) { + switch(cp->state) { + case IP_VS_TCP_S_SYN_RECV: + case IP_VS_TCP_S_SYNACK: + break; + + case IP_VS_TCP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + + default: + continue; + } + } else { + if (!todrop_entry(cp)) + continue; + } + + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (cp->control) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp->control); + } + } + ct_write_unlock_bh(hash); + } +} + + +/* + * Flush all the connection entries in the ip_vs_conn_tab + */ +static void ip_vs_conn_flush(void) +{ + int idx; + struct ip_vs_conn *cp; + + flush_again: + for (idx=0; idxcontrol) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp->control); + } + } + ct_write_unlock_bh(idx); + } + + /* the counter may be not NULL, because maybe some conn entries + are run by slow timer handler or unhashed but still referred */ + if (atomic_read(&ip_vs_conn_count) != 0) { + schedule(); + goto flush_again; + } +} + + +int __init ip_vs_conn_init(void) +{ + int idx; + + /* + * Allocate the connection hash table and initialize its list heads + */ + ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); + if (!ip_vs_conn_tab) + return -ENOMEM; + + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; + } + + IP_VS_INFO("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + IP_VS_CONN_TAB_SIZE, + (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); + IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", + sizeof(struct ip_vs_conn)); + + for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); + } + + for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { + rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + } + + proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); + proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + + /* calculate the random value for connection hash */ + get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); + + return 0; +} + + +void ip_vs_conn_cleanup(void) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(); + + /* Release the empty cache */ + kmem_cache_destroy(ip_vs_conn_cachep); + proc_net_remove(&init_net, "ip_vs_conn"); + proc_net_remove(&init_net, "ip_vs_conn_sync"); + vfree(ip_vs_conn_tab); +} diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c new file mode 100644 index 00000000000..958abf3e5f8 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -0,0 +1,1542 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. + * + * Changes: + * Paul `Rusty' Russell properly handle non-linear skbs + * Harald Welte don't use nfcache + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include /* for icmp_send */ +#include + +#include +#include + +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#endif + +#include + + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_skb_replace); +EXPORT_SYMBOL(ip_vs_proto_name); +EXPORT_SYMBOL(ip_vs_conn_new); +EXPORT_SYMBOL(ip_vs_conn_in_get); +EXPORT_SYMBOL(ip_vs_conn_out_get); +#ifdef CONFIG_IP_VS_PROTO_TCP +EXPORT_SYMBOL(ip_vs_tcp_conn_listen); +#endif +EXPORT_SYMBOL(ip_vs_conn_put); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif + + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) (((icmph)->un).echo.id) +#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) + +const char *ip_vs_proto_name(unsigned proto) +{ + static char buf[20]; + + switch (proto) { + case IPPROTO_IP: + return "IP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_ICMP: + return "ICMP"; +#ifdef CONFIG_IP_VS_IPV6 + case IPPROTO_ICMPV6: + return "ICMPv6"; +#endif + default: + sprintf(buf, "IP_%d", proto); + return buf; + } +} + +void ip_vs_init_hash_table(struct list_head *table, int rows) +{ + while (--rows >= 0) + INIT_LIST_HEAD(&table[rows]); +} + +static inline void +ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.ustats.inpkts++; + dest->stats.ustats.inbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.ustats.inpkts++; + dest->svc->stats.ustats.inbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.ustats.inpkts++; + ip_vs_stats.ustats.inbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.ustats.outpkts++; + dest->stats.ustats.outbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.ustats.outpkts++; + dest->svc->stats.ustats.outbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.ustats.outpkts++; + ip_vs_stats.ustats.outbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) +{ + spin_lock(&cp->dest->stats.lock); + cp->dest->stats.ustats.conns++; + spin_unlock(&cp->dest->stats.lock); + + spin_lock(&svc->stats.lock); + svc->stats.ustats.conns++; + spin_unlock(&svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.ustats.conns++; + spin_unlock(&ip_vs_stats.lock); +} + + +static inline int +ip_vs_set_state(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + if (unlikely(!pp->state_transition)) + return 0; + return pp->state_transition(cp, direction, skb, pp); +} + + +/* + * IPVS persistent scheduling function + * It creates a connection entry according to its template if exists, + * or selects a server and creates a connection entry plus a template. + * Locking: we are svc user (svc->refcnt), so we hold all dests too + * Protocols supported: TCP, UDP + */ +static struct ip_vs_conn * +ip_vs_sched_persist(struct ip_vs_service *svc, + const struct sk_buff *skb, + __be16 ports[2]) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest; + struct ip_vs_conn *ct; + __be16 dport; /* destination port to forward */ + union nf_inet_addr snet; /* source network of the client, + after masking */ + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + /* Mask saddr with the netmask to adjust template granularity */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); + else +#endif + snet.ip = iph.saddr.ip & svc->netmask; + + IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " + "mnet %s\n", + IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), + IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), + IP_VS_DBG_ADDR(svc->af, &snet)); + + /* + * As far as we know, FTP is a very complicated network protocol, and + * it uses control connection and data connections. For active FTP, + * FTP server initialize data connection to the client, its source port + * is often 20. For passive FTP, FTP server tells the clients the port + * that it passively listens to, and the client issues the data + * connection. In the tunneling or direct routing mode, the load + * balancer is on the client-to-server half of connection, the port + * number is unknown to the load balancer. So, a conn template like + * is created for persistent FTP + * service, and a template like + * is created for other persistent services. + */ + if (ports[1] == svc->port) { + /* Check if a template already exists */ + if (svc->port != FTPPORT) + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, + &iph.daddr, ports[1]); + else + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, + &iph.daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * No template found or the dest of the connection + * template is not available. + */ + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template like for non-ftp service, + * and + * for ftp service. + */ + if (svc->port != FTPPORT) + ct = ip_vs_conn_new(svc->af, iph.protocol, + &snet, 0, + &iph.daddr, + ports[1], + &dest->addr, dest->port, + IP_VS_CONN_F_TEMPLATE, + dest); + else + ct = ip_vs_conn_new(svc->af, iph.protocol, + &snet, 0, + &iph.daddr, 0, + &dest->addr, 0, + IP_VS_CONN_F_TEMPLATE, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = dest->port; + } else { + /* + * Note: persistent fwmark-based services and persistent + * port zero service are handled here. + * fwmark template: + * port zero template: + */ + if (svc->fwmark) { + union nf_inet_addr fwmark = { + .all = { 0, 0, 0, htonl(svc->fwmark) } + }; + + ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0, + &fwmark, 0); + } else + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, + &iph.daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * If it is not persistent port zero, return NULL, + * otherwise create a connection template. + */ + if (svc->port) + return NULL; + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template according to the service + */ + if (svc->fwmark) { + union nf_inet_addr fwmark = { + .all = { 0, 0, 0, htonl(svc->fwmark) } + }; + + ct = ip_vs_conn_new(svc->af, IPPROTO_IP, + &snet, 0, + &fwmark, 0, + &dest->addr, 0, + IP_VS_CONN_F_TEMPLATE, + dest); + } else + ct = ip_vs_conn_new(svc->af, iph.protocol, + &snet, 0, + &iph.daddr, 0, + &dest->addr, 0, + IP_VS_CONN_F_TEMPLATE, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = ports[1]; + } + + /* + * Create a new connection according to the template + */ + cp = ip_vs_conn_new(svc->af, iph.protocol, + &iph.saddr, ports[0], + &iph.daddr, ports[1], + &dest->addr, dport, + 0, + dest); + if (cp == NULL) { + ip_vs_conn_put(ct); + return NULL; + } + + /* + * Add its control + */ + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * IPVS main scheduling function + * It selects a server according to the virtual service, and + * creates a connection entry. + * Protocols supported: TCP, UDP + */ +struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest; + __be16 _ports[2], *pptr; + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, skb, pptr); + + /* + * Non-persistent service + */ + if (!svc->fwmark && pptr[1] != svc->port) { + if (!svc->port) + IP_VS_ERR("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); + return NULL; + } + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a connection entry. + */ + cp = ip_vs_conn_new(svc->af, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], + &dest->addr, dest->port ? dest->port : pptr[1], + 0, + dest); + if (cp == NULL) + return NULL; + + IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * Pass or drop the packet. + * Called by ip_vs_in, when the virtual service is available but + * no destination is available for a new connection. + */ +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + __be16 _ports[2], *pptr; + struct ip_vs_iphdr iph; + int unicast; + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + if (pptr == NULL) { + ip_vs_service_put(svc); + return NF_DROP; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; + else +#endif + unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); + + /* if it is fwmark-based service, the cache_bypass sysctl is up + and the destination is a non-local unicast, then create + a cache_bypass connection entry */ + if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { + int ret, cs; + struct ip_vs_conn *cp; + union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; + + ip_vs_service_put(svc); + + /* create a new connection entry */ + IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); + cp = ip_vs_conn_new(svc->af, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], + &daddr, 0, + IP_VS_CONN_F_BYPASS, + NULL); + if (cp == NULL) + return NF_DROP; + + /* statistics */ + ip_vs_in_stats(cp, skb); + + /* set state */ + cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + + /* transmit the first SYN packet */ + ret = cp->packet_xmit(skb, cp, pp); + /* do not touch skb anymore */ + + atomic_inc(&cp->in_pkts); + ip_vs_conn_put(cp); + return ret; + } + + /* + * When the virtual ftp service is presented, packets destined + * for other services on the VIP may get here (except services + * listed in the ipvs table), pass the packets, because it is + * not ipvs job to decide to drop the packets. + */ + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { + ip_vs_service_put(svc); + return NF_ACCEPT; + } + + ip_vs_service_put(svc); + + /* + * Notify the client that the destination is unreachable, and + * release the socket buffer. + * Since it is in IP layer, the TCP socket is not actually + * created, the TCP RST packet cannot be sent, instead that + * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ + */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, + skb->dev); + else +#endif + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + return NF_DROP; +} + + +/* + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING + * chain, and is used for VS/NAT. + * It detects packets for VS/NAT connections and sends the packets + * immediately. This can avoid that iptable_nat mangles the packets + * for VS/NAT. + */ +static unsigned int ip_vs_post_routing(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (!skb->ipvs_property) + return NF_ACCEPT; + /* The packet was sent from IPVS, exit this chain */ + return NF_STOP; +} + +__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) +{ + return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); +} + +static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + int err = ip_defrag(skb, user); + + if (!err) + ip_send_check(ip_hdr(skb)); + + return err; +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) +{ + /* TODO IPv6: Find out what to do here for IPv6 */ + return 0; +} +#endif + +/* + * Packet has been made sufficiently writable in caller + * - inout: 1=in->out, 0=out->in + */ +void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int icmp_offset = iph->ihl*4; + struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + + icmp_offset); + struct iphdr *ciph = (struct iphdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.ip; + ip_send_check(iph); + ciph->daddr = cp->vaddr.ip; + ip_send_check(ciph); + } else { + iph->daddr = cp->daddr.ip; + ip_send_check(iph); + ciph->saddr = cp->daddr.ip; + ip_send_check(ciph); + } + + /* the TCP/UDP port */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { + __be16 *ports = (void *)ciph + ciph->ihl*4; + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMP"); + else + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMP"); +} + +#ifdef CONFIG_IP_VS_IPV6 +void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int icmp_offset = sizeof(struct ipv6hdr); + struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + + icmp_offset); + struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.in6; + ciph->daddr = cp->vaddr.in6; + } else { + iph->daddr = cp->daddr.in6; + ciph->saddr = cp->daddr.in6; + } + + /* the TCP/UDP port */ + if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) { + __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->icmp6_cksum = 0; + /* TODO IPv6: is this correct for ICMPv6? */ + ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); + else + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); +} +#endif + +/* Handle relevant response ICMP messages - forward to the right + * destination host. Used for NAT and local client. + */ +static int handle_response_icmp(int af, struct sk_buff *skb, + union nf_inet_addr *snet, + __u8 protocol, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, + unsigned int offset, unsigned int ihl) +{ + unsigned int verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + IP_VS_ERR("shouldn't reach here, because the box is on the " + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", + IP_VS_DBG_ADDR(af, snet)); + goto out; + } + + if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol) + offset += 2 * sizeof(__u16); + if (!skb_make_writable(skb, offset)) + goto out; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_nat_icmp_v6(skb, pp, cp, 1); + else +#endif + ip_vs_nat_icmp(skb, pp, cp, 1); + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->ipvs_property = 1; + verdict = NF_ACCEPT; + +out: + __ip_vs_conn_put(cp); + + return verdict; +} + +/* + * Handle ICMP messages in the inside-to-outside direction (outgoing). + * Find any that might be relevant, check against existing connections. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int ip_vs_out_icmp(struct sk_buff *skb, int *related) +{ + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + ic->type, ntohs(icmp_id(ic)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); + + offset += cih->ihl * 4; + + ip_vs_fill_iphdr(AF_INET, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, + pp, offset, ihl); +} + +#ifdef CONFIG_IP_VS_IPV6 +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + NIP6(iph->saddr), NIP6(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->nexthdr); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + ipv6_addr_copy(&snet.in6, &iph->saddr); + return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, + pp, offset, sizeof(struct ipv6hdr)); +} +#endif + +static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) +{ + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + return th->rst; +} + +/* Handle response packets: rewrite addresses and send away... + * Used for NAT and local client. + */ +static unsigned int +handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int ihl) +{ + IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + + if (!skb_make_writable(skb, ihl)) + goto drop; + + /* mangle the packet */ + if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) + goto drop; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ipv6_hdr(skb)->saddr = cp->vaddr.in6; + else +#endif + { + ip_hdr(skb)->saddr = cp->vaddr.ip; + ip_send_check(ip_hdr(skb)); + } + + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (ip6_route_me_harder(skb) != 0) + goto drop; + } else +#endif + if (ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; + + IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + ip_vs_conn_put(cp); + + skb->ipvs_property = 1; + + LeaveFunction(11); + return NF_ACCEPT; + +drop: + ip_vs_conn_put(cp); + kfree_skb(skb); + return NF_STOLEN; +} + +/* + * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * Check if outgoing packet belongs to the established ip_vs_conn. + */ +static unsigned int +ip_vs_out(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + int af; + + EnterFunction(11); + + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; + + if (skb->ipvs_property) + return NF_ACCEPT; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related, verdict = ip_vs_out_icmp_v6(skb, &related); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_out_icmp(skb, &related); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + pp = ip_vs_proto_get(iph.protocol); + if (unlikely(!pp)) + return NF_ACCEPT; + + /* reassemble IP fragments */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related, verdict = ip_vs_out_icmp_v6(skb, &related); + + if (related) + return verdict; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && + !pp->dont_defrag)) { + if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); + + if (unlikely(!cp)) { + if (sysctl_ip_vs_nat_icmp_send && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP)) { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, iph.len, + sizeof(_ports), _ports); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_lookup_real_service(af, iph.protocol, + &iph.saddr, + pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if (iph.protocol != IPPROTO_TCP + || !is_tcp_reset(skb, iph.len)) { +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0, skb->dev); + else +#endif + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; + } + } + } + IP_VS_DBG_PKT(12, pp, skb, 0, + "packet continues traversal as normal"); + return NF_ACCEPT; + } + + return handle_response(af, skb, pp, cp, iph.len); +} + + +/* + * Handle ICMP messages in the outside-to-inside direction (incoming). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int +ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl, verdict; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? + IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + ic->type, ntohs(icmp_id(ic)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); + + offset += cih->ihl * 4; + + ip_vs_fill_iphdr(AF_INET, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); + if (!cp) { + /* The packet could also belong to a local client */ + cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + if (cp) { + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, + cih->protocol, cp, pp, + offset, ihl); + } + return NF_ACCEPT; + } + + verdict = NF_DROP; + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + goto out; + } + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); + /* do not touch skb anymore */ + + out: + __ip_vs_conn_put(cp); + + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 +static int +ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, verdict; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? + IP_DEFRAG_VS_IN : + IP_DEFRAG_VS_FWD)) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + NIP6(iph->saddr), NIP6(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->nexthdr); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (!cp) { + /* The packet could also belong to a local client */ + cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (cp) { + ipv6_addr_copy(&snet.in6, &iph->saddr); + return handle_response_icmp(AF_INET6, skb, &snet, + cih->nexthdr, + cp, pp, offset, + sizeof(struct ipv6hdr)); + } + return NF_ACCEPT; + } + + verdict = NF_DROP; + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); + /* do not touch skb anymore */ + + __ip_vs_conn_put(cp); + + return verdict; +} +#endif + + +/* + * Check if it's for virtual services, look it up, + * and send it on its way... + */ +static unsigned int +ip_vs_in(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + int ret, restart, af; + + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + /* + * Big tappo: only PACKET_HOST, including loopback for local client + * Don't handle local packets on IPv6 for now + */ + if (unlikely(skb->pkt_type != PACKET_HOST)) { + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", + skb->pkt_type, + iph.protocol, + IP_VS_DBG_ADDR(af, &iph.daddr)); + return NF_ACCEPT; + } + + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + /* Protocol supported? */ + pp = ip_vs_proto_get(iph.protocol); + if (unlikely(!pp)) + return NF_ACCEPT; + + /* + * Check if the packet belongs to an existing connection entry + */ + cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); + + if (unlikely(!cp)) { + int v; + + /* For local client packets, it could be a response */ + cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); + if (cp) + return handle_response(af, skb, pp, cp, iph.len); + + if (!pp->conn_schedule(af, skb, pp, &v, &cp)) + return v; + } + + if (unlikely(!cp)) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG_PKT(12, pp, skb, 0, + "packet continues traversal as normal"); + return NF_ACCEPT; + } + + IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); + + /* Check the server status */ + if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + /* the destination server is not available */ + + if (sysctl_ip_vs_expire_nodest_conn) { + /* try to expire the connection immediately */ + ip_vs_conn_expire_now(cp); + } + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); + return NF_DROP; + } + + ip_vs_in_stats(cp, skb); + restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + if (cp->packet_xmit) + ret = cp->packet_xmit(skb, cp, pp); + /* do not touch skb anymore */ + else { + IP_VS_DBG_RL("warning: packet_xmit is null"); + ret = NF_ACCEPT; + } + + /* Increase its packet counter and check if it is needed + * to be synchronized + * + * Sync connection if it is about to close to + * encorage the standby servers to update the connections timeout + */ + atomic_inc(&cp->in_pkts); + if (af == AF_INET && + (ip_vs_sync_state & IP_VS_STATE_MASTER) && + (((cp->protocol != IPPROTO_TCP || + cp->state == IP_VS_TCP_S_ESTABLISHED) && + (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] + == sysctl_ip_vs_sync_threshold[0])) || + ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && + ((cp->state == IP_VS_TCP_S_FIN_WAIT) || + (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || + (cp->state == IP_VS_TCP_S_TIME_WAIT))))) + ip_vs_sync_conn(cp); + cp->old_state = cp->state; + + ip_vs_conn_put(cp); + return ret; +} + + +/* + * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP + * related packets destined for 0.0.0.0/0. + * When fwmark-based virtual service is used, such as transparent + * cache cluster, TCP packets can be marked and routed to ip_vs_in, + * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and + * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain + * and send them to ip_vs_in_icmp. + */ +static unsigned int +ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + + if (ip_hdr(skb)->protocol != IPPROTO_ICMP) + return NF_ACCEPT; + + return ip_vs_in_icmp(skb, &r, hooknum); +} + +#ifdef CONFIG_IP_VS_IPV6 +static unsigned int +ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + + if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + return NF_ACCEPT; + + return ip_vs_in_icmp_v6(skb, &r, hooknum); +} +#endif + + +static struct nf_hook_ops ip_vs_ops[] __read_mostly = { + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = 100, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC-1, + }, +#ifdef CONFIG_IP_VS_IPV6 + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_IN, + .priority = 100, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp_v6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_NAT_SRC-1, + }, +#endif +}; + + +/* + * Initialize IP Virtual Server + */ +static int __init ip_vs_init(void) +{ + int ret; + + ip_vs_estimator_init(); + + ret = ip_vs_control_init(); + if (ret < 0) { + IP_VS_ERR("can't setup control.\n"); + goto cleanup_estimator; + } + + ip_vs_protocol_init(); + + ret = ip_vs_app_init(); + if (ret < 0) { + IP_VS_ERR("can't setup application helper.\n"); + goto cleanup_protocol; + } + + ret = ip_vs_conn_init(); + if (ret < 0) { + IP_VS_ERR("can't setup connection table.\n"); + goto cleanup_app; + } + + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + if (ret < 0) { + IP_VS_ERR("can't register hooks.\n"); + goto cleanup_conn; + } + + IP_VS_INFO("ipvs loaded.\n"); + return ret; + + cleanup_conn: + ip_vs_conn_cleanup(); + cleanup_app: + ip_vs_app_cleanup(); + cleanup_protocol: + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + cleanup_estimator: + ip_vs_estimator_cleanup(); + return ret; +} + +static void __exit ip_vs_cleanup(void) +{ + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ip_vs_conn_cleanup(); + ip_vs_app_cleanup(); + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + ip_vs_estimator_cleanup(); + IP_VS_INFO("ipvs unloaded.\n"); +} + +module_init(ip_vs_init); +module_exit(ip_vs_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c new file mode 100644 index 00000000000..0302cf3e503 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -0,0 +1,3443 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#endif +#include +#include +#include + +#include + +#include + +/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ +static DEFINE_MUTEX(__ip_vs_mutex); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_svc_lock); + +/* lock for table with the real services */ +static DEFINE_RWLOCK(__ip_vs_rs_lock); + +/* lock for state and timeout tables */ +static DEFINE_RWLOCK(__ip_vs_securetcp_lock); + +/* lock for drop entry handling */ +static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); + +/* lock for drop packet handling */ +static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); + +/* 1/rate drop and drop-entry variables */ +int ip_vs_drop_rate = 0; +int ip_vs_drop_counter = 0; +static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); + +/* number of virtual services */ +static int ip_vs_num_services = 0; + +/* sysctl variables */ +static int sysctl_ip_vs_drop_entry = 0; +static int sysctl_ip_vs_drop_packet = 0; +static int sysctl_ip_vs_secure_tcp = 0; +static int sysctl_ip_vs_amemthresh = 1024; +static int sysctl_ip_vs_am_droprate = 10; +int sysctl_ip_vs_cache_bypass = 0; +int sysctl_ip_vs_expire_nodest_conn = 0; +int sysctl_ip_vs_expire_quiescent_template = 0; +int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; +int sysctl_ip_vs_nat_icmp_send = 0; + + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + +#ifdef CONFIG_IP_VS_IPV6 +/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ +static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) +{ + struct rt6_info *rt; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = *addr, + .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) + return 1; + + return 0; +} +#endif +/* + * update_defense_level is called from keventd and from sysctl, + * so it needs to protect itself from softirqs + */ +static void update_defense_level(void) +{ + struct sysinfo i; + static int old_secure_tcp = 0; + int availmem; + int nomem; + int to_change = -1; + + /* we only count free and buffered memory (in pages) */ + si_meminfo(&i); + availmem = i.freeram + i.bufferram; + /* however in linux 2.5 the i.bufferram is total page cache size, + we need adjust it */ + /* si_swapinfo(&i); */ + /* availmem = availmem - (i.totalswap - i.freeswap); */ + + nomem = (availmem < sysctl_ip_vs_amemthresh); + + local_bh_disable(); + + /* drop_entry */ + spin_lock(&__ip_vs_dropentry_lock); + switch (sysctl_ip_vs_drop_entry) { + case 0: + atomic_set(&ip_vs_dropentry, 0); + break; + case 1: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + sysctl_ip_vs_drop_entry = 2; + } else { + atomic_set(&ip_vs_dropentry, 0); + } + break; + case 2: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + } else { + atomic_set(&ip_vs_dropentry, 0); + sysctl_ip_vs_drop_entry = 1; + }; + break; + case 3: + atomic_set(&ip_vs_dropentry, 1); + break; + } + spin_unlock(&__ip_vs_dropentry_lock); + + /* drop_packet */ + spin_lock(&__ip_vs_droppacket_lock); + switch (sysctl_ip_vs_drop_packet) { + case 0: + ip_vs_drop_rate = 0; + break; + case 1: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + sysctl_ip_vs_drop_packet = 2; + } else { + ip_vs_drop_rate = 0; + } + break; + case 2: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + } else { + ip_vs_drop_rate = 0; + sysctl_ip_vs_drop_packet = 1; + } + break; + case 3: + ip_vs_drop_rate = sysctl_ip_vs_am_droprate; + break; + } + spin_unlock(&__ip_vs_droppacket_lock); + + /* secure_tcp */ + write_lock(&__ip_vs_securetcp_lock); + switch (sysctl_ip_vs_secure_tcp) { + case 0: + if (old_secure_tcp >= 2) + to_change = 0; + break; + case 1: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + sysctl_ip_vs_secure_tcp = 2; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + } + break; + case 2: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + sysctl_ip_vs_secure_tcp = 1; + } + break; + case 3: + if (old_secure_tcp < 2) + to_change = 1; + break; + } + old_secure_tcp = sysctl_ip_vs_secure_tcp; + if (to_change >= 0) + ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); + write_unlock(&__ip_vs_securetcp_lock); + + local_bh_enable(); +} + + +/* + * Timer for checking the defense + */ +#define DEFENSE_TIMER_PERIOD 1*HZ +static void defense_work_handler(struct work_struct *work); +static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); + +static void defense_work_handler(struct work_struct *work) +{ + update_defense_level(); + if (atomic_read(&ip_vs_dropentry)) + ip_vs_random_dropentry(); + + schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); +} + +int +ip_vs_use_count_inc(void) +{ + return try_module_get(THIS_MODULE); +} + +void +ip_vs_use_count_dec(void) +{ + module_put(THIS_MODULE); +} + + +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* the service table hashed by */ +static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + +/* + * Hash table: for real service lookups + */ +#define IP_VS_RTAB_BITS 4 +#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) +#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) + +static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; + +/* + * Trash for destinations + */ +static LIST_HEAD(ip_vs_dest_trash); + +/* + * FTP & NULL virtual service counters + */ +static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); +static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); + + +/* + * Returns hash value for virtual service + */ +static __inline__ unsigned +ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, + __be16 port) +{ + register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) + & IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + */ +static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +{ + return fwmark & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes a service in the ip_vs_svc_table by + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + */ +static int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash it by in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, + svc->port); + list_add(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash it by fwmark in ip_vs_svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->fwmark); + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. + * Should be called with locked tables. + */ +static int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* Remove it from the ip_vs_svc_table table */ + list_del(&svc->s_list); + } else { + /* Remove it from the ip_vs_svc_fwm_table table */ + list_del(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Get service by {proto,addr,port} in the service table. + */ +static inline struct ip_vs_service * +__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, + __be16 vport) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); + + list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + if ((svc->af == af) + && ip_vs_addr_equal(af, &svc->addr, vaddr) + && (svc->port == vport) + && (svc->protocol == protocol)) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + + +/* + * Get service by {fwmark} in the service table. + */ +static inline struct ip_vs_service * +__ip_vs_svc_fwm_get(int af, __u32 fwmark) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashkey(fwmark); + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark && svc->af == af) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + +struct ip_vs_service * +ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) +{ + struct ip_vs_service *svc; + + read_lock(&__ip_vs_svc_lock); + + /* + * Check the table hashed by fwmark first + */ + if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) + goto out; + + /* + * Check the table hashed by + * for "full" addressed entries + */ + svc = __ip_vs_service_get(af, protocol, vaddr, vport); + + if (svc == NULL + && protocol == IPPROTO_TCP + && atomic_read(&ip_vs_ftpsvc_counter) + && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ip_vs_nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_service_get(af, protocol, vaddr, 0); + } + + out: + read_unlock(&__ip_vs_svc_lock); + + IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), + svc ? "hit" : "not hit"); + + return svc; +} + + +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + dest->svc = svc; +} + +static inline void +__ip_vs_unbind_svc(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = dest->svc; + + dest->svc = NULL; + if (atomic_dec_and_test(&svc->refcnt)) + kfree(svc); +} + + +/* + * Returns hash value for real service + */ +static inline unsigned ip_vs_rs_hashkey(int af, + const union nf_inet_addr *addr, + __be16 port) +{ + register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) + & IP_VS_RTAB_MASK; +} + +/* + * Hashes ip_vs_dest in ip_vs_rtable by . + * should be called with locked tables. + */ +static int ip_vs_rs_hash(struct ip_vs_dest *dest) +{ + unsigned hash; + + if (!list_empty(&dest->d_list)) { + return 0; + } + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + + list_add(&dest->d_list, &ip_vs_rtable[hash]); + + return 1; +} + +/* + * UNhashes ip_vs_dest from ip_vs_rtable. + * should be called with locked tables. + */ +static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the ip_vs_rtable table. + */ + if (!list_empty(&dest->d_list)) { + list_del(&dest->d_list); + INIT_LIST_HEAD(&dest->d_list); + } + + return 1; +} + +/* + * Lookup real service by in the real service table. + */ +struct ip_vs_dest * +ip_vs_lookup_real_service(int af, __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) +{ + unsigned hash; + struct ip_vs_dest *dest; + + /* + * Check for "full" addressed entries + * Return the first found entry + */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + read_lock(&__ip_vs_rs_lock); + list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { + if ((dest->af == af) + && ip_vs_addr_equal(af, &dest->addr, daddr) + && (dest->port == dport) + && ((dest->protocol == protocol) || + dest->vfwmark)) { + /* HIT */ + read_unlock(&__ip_vs_rs_lock); + return dest; + } + } + read_unlock(&__ip_vs_rs_lock); + + return NULL; +} + +/* + * Lookup destination by {addr,port} in the given service + */ +static struct ip_vs_dest * +ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) +{ + struct ip_vs_dest *dest; + + /* + * Find the destination for the given service + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->af == svc->af) + && ip_vs_addr_equal(svc->af, &dest->addr, daddr) + && (dest->port == dport)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* + * Find destination by {daddr,dport,vaddr,protocol} + * Cretaed to be used in ip_vs_process_message() in + * the backup synchronization daemon. It finds the + * destination to be bound to the received connection + * on the backup. + * + * ip_vs_lookup_real_service() looked promissing, but + * seems not working as expected. + */ +struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, + __be16 dport, + const union nf_inet_addr *vaddr, + __be16 vport, __u16 protocol) +{ + struct ip_vs_dest *dest; + struct ip_vs_service *svc; + + svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); + if (!svc) + return NULL; + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest) + atomic_inc(&dest->refcnt); + ip_vs_service_put(svc); + return dest; +} + +/* + * Lookup dest by {svc,addr,port} in the destination trash. + * The destination trash is used to hold the destinations that are removed + * from the service table but are still referenced by some conn entries. + * The reason to add the destination trash is when the dest is temporary + * down (either by administrator or by monitor program), the dest can be + * picked back from the trash, the remaining connections to the dest can + * continue, and the counting information of the dest is also useful for + * scheduling. + */ +static struct ip_vs_dest * +ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) +{ + struct ip_vs_dest *dest, *nxt; + + /* + * Find the destination in trash + */ + list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " + "dest->refcnt=%d\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->af == svc->af && + ip_vs_addr_equal(svc->af, &dest->addr, daddr) && + dest->port == dport && + dest->vfwmark == svc->fwmark && + dest->protocol == svc->protocol && + (svc->fwmark || + (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && + dest->vport == svc->port))) { + /* HIT */ + return dest; + } + + /* + * Try to purge the destination from trash if not referenced + */ + if (atomic_read(&dest->refcnt) == 1) { + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " + "from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } + } + + return NULL; +} + + +/* + * Clean up all the destinations in the trash + * Called by the ip_vs_control_cleanup() + * + * When the ip_vs_control_clearup is activated by ipvs module exit, + * the service tables must have been flushed and all the connections + * are expired, and the refcnt of each destination in the trash must + * be 1, so we simply release them here. + */ +static void ip_vs_trash_cleanup(void) +{ + struct ip_vs_dest *dest, *nxt; + + list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } +} + + +static void +ip_vs_zero_stats(struct ip_vs_stats *stats) +{ + spin_lock_bh(&stats->lock); + + memset(&stats->ustats, 0, sizeof(stats->ustats)); + ip_vs_zero_estimator(stats); + + spin_unlock_bh(&stats->lock); +} + +/* + * Update a destination in the given service + */ +static void +__ip_vs_update_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) +{ + int conn_flags; + + /* set the weight and the flags */ + atomic_set(&dest->weight, udest->weight); + conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; + + /* check if local node and update the flags */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + } else +#endif + if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { + conn_flags |= IP_VS_CONN_F_NOOUTPUT; + } else { + /* + * Put the real service in ip_vs_rtable if not present. + * For now only for NAT! + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_hash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + } + atomic_set(&dest->conn_flags, conn_flags); + + /* bind the service */ + if (!dest->svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (dest->svc != svc) { + __ip_vs_unbind_svc(dest); + ip_vs_zero_stats(&dest->stats); + __ip_vs_bind_svc(dest, svc); + } + } + + /* set the dest status flags */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; + + if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + dest->u_threshold = udest->u_threshold; + dest->l_threshold = udest->l_threshold; +} + + +/* + * Create a destination for the given service + */ +static int +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest **dest_p) +{ + struct ip_vs_dest *dest; + unsigned atype; + + EnterFunction(2); + +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + atype = ipv6_addr_type(&udest->addr.in6); + if ((!(atype & IPV6_ADDR_UNICAST) || + atype & IPV6_ADDR_LINKLOCAL) && + !__ip_vs_addr_is_local_v6(&udest->addr.in6)) + return -EINVAL; + } else +#endif + { + atype = inet_addr_type(&init_net, udest->addr.ip); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + } + + dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); + if (dest == NULL) { + IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); + return -ENOMEM; + } + + dest->af = svc->af; + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); + dest->port = udest->port; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->persistconns, 0); + atomic_set(&dest->refcnt, 0); + + INIT_LIST_HEAD(&dest->d_list); + spin_lock_init(&dest->dst_lock); + spin_lock_init(&dest->stats.lock); + __ip_vs_update_dest(svc, dest, udest); + ip_vs_new_estimator(&dest->stats); + + *dest_p = dest; + + LeaveFunction(2); + return 0; +} + + +/* + * Add a destination into an existing service + */ +static int +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + int ret; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " + "upper threshold\n"); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* + * Check if the dest already exists in the list + */ + dest = ip_vs_lookup_dest(svc, &daddr, dport); + + if (dest != NULL) { + IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); + return -EEXIST; + } + + /* + * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest(svc, &daddr, dport); + + if (dest != NULL) { + IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " + "dest->refcnt=%d, service %u/%s:%u\n", + IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->vaddr), + ntohs(dest->vport)); + + __ip_vs_update_dest(svc, dest, udest); + + /* + * Get the destination from the trash + */ + list_del(&dest->n_list); + + ip_vs_new_estimator(&dest->stats); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + return 0; + } + + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(svc, udest, &dest); + if (ret) { + return ret; + } + + /* + * Add the dest entry into the list + */ + atomic_inc(&dest->refcnt); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Edit a destination in the given service + */ +static int +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " + "upper threshold\n"); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(svc, &daddr, dport); + + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); + return -ENOENT; + } + + __ip_vs_update_dest(svc, dest, udest); + + write_lock_bh(&__ip_vs_svc_lock); + + /* Wait until all other svc users go away */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* call the update_service, because server weight may be changed */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Delete a destination (must be already unlinked from the service) + */ +static void __ip_vs_del_dest(struct ip_vs_dest *dest) +{ + ip_vs_kill_estimator(&dest->stats); + + /* + * Remove it from the d-linked list with the real services. + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_unhash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + + /* + * Decrease the refcnt of the dest, and free the dest + * if nobody refers to it (refcnt=0). Otherwise, throw + * the destination into the trash. + */ + if (atomic_dec_and_test(&dest->refcnt)) { + ip_vs_dst_reset(dest); + /* simply decrease svc->refcnt here, let the caller check + and release the service if nobody refers to it. + Only user context can release destination and service, + and only one user context can update virtual service at a + time, so the operation here is OK */ + atomic_dec(&dest->svc->refcnt); + kfree(dest); + } else { + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " + "dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + list_add(&dest->n_list, &ip_vs_dest_trash); + atomic_inc(&dest->refcnt); + } +} + + +/* + * Unlink a destination from the given service + */ +static void __ip_vs_unlink_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + int svcupd) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. + */ + list_del(&dest->n_list); + svc->num_dests--; + + /* + * Call the update_service function of its scheduler + */ + if (svcupd && svc->scheduler->update_service) + svc->scheduler->update_service(svc); +} + + +/* + * Delete a destination server in the given service + */ +static int +ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + __be16 dport = udest->port; + + EnterFunction(2); + + dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(svc, dest, 1); + + write_unlock_bh(&__ip_vs_svc_lock); + + /* + * Delete the destination + */ + __ip_vs_del_dest(dest); + + LeaveFunction(2); + + return 0; +} + + +/* + * Add a service into the service hash table + */ +static int +ip_vs_add_service(struct ip_vs_service_user_kern *u, + struct ip_vs_service **svc_p) +{ + int ret = 0; + struct ip_vs_scheduler *sched = NULL; + struct ip_vs_service *svc = NULL; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* Lookup the scheduler by 'u->sched_name' */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_mod_dec; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + if (!sched->supports_ipv6) { + ret = -EAFNOSUPPORT; + goto out_err; + } + if ((u->netmask < 1) || (u->netmask > 128)) { + ret = -EINVAL; + goto out_err; + } + } +#endif + + svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + if (svc == NULL) { + IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); + ret = -ENOMEM; + goto out_err; + } + + /* I'm the first user of the service */ + atomic_set(&svc->usecnt, 1); + atomic_set(&svc->refcnt, 0); + + svc->af = u->af; + svc->protocol = u->protocol; + ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); + svc->port = u->port; + svc->fwmark = u->fwmark; + svc->flags = u->flags; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + INIT_LIST_HEAD(&svc->destinations); + rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->stats.lock); + + /* Bind the scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + + /* Update the virtual service counters */ + if (svc->port == FTPPORT) + atomic_inc(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ip_vs_nullsvc_counter); + + ip_vs_new_estimator(&svc->stats); + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ip_vs_num_services++; + + /* Hash the service into the service table */ + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_hash(svc); + write_unlock_bh(&__ip_vs_svc_lock); + + *svc_p = svc; + return 0; + + out_err: + if (svc != NULL) { + if (svc->scheduler) + ip_vs_unbind_scheduler(svc); + if (svc->inc) { + local_bh_disable(); + ip_vs_app_inc_put(svc->inc); + local_bh_enable(); + } + kfree(svc); + } + ip_vs_scheduler_put(sched); + + out_mod_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +static int +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) +{ + struct ip_vs_scheduler *sched, *old_sched; + int ret = 0; + + /* + * Lookup the scheduler, by 'u->sched_name' + */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return -ENOENT; + } + old_sched = sched; + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + if (!sched->supports_ipv6) { + ret = -EAFNOSUPPORT; + goto out; + } + if ((u->netmask < 1) || (u->netmask > 128)) { + ret = -EINVAL; + goto out; + } + } +#endif + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Set the flags and timeout value + */ + svc->flags = u->flags | IP_VS_SVC_F_HASHED; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + old_sched = svc->scheduler; + if (sched != old_sched) { + /* + * Unbind the old scheduler + */ + if ((ret = ip_vs_unbind_scheduler(svc))) { + old_sched = sched; + goto out_unlock; + } + + /* + * Bind the new scheduler + */ + if ((ret = ip_vs_bind_scheduler(svc, sched))) { + /* + * If ip_vs_bind_scheduler fails, restore the old + * scheduler. + * The main reason of failure is out of memory. + * + * The question is if the old scheduler can be + * restored all the time. TODO: if it cannot be + * restored some time, we must delete the service, + * otherwise the system may crash. + */ + ip_vs_bind_scheduler(svc, old_sched); + old_sched = sched; + goto out_unlock; + } + } + + out_unlock: + write_unlock_bh(&__ip_vs_svc_lock); +#ifdef CONFIG_IP_VS_IPV6 + out: +#endif + + if (old_sched) + ip_vs_scheduler_put(old_sched); + + return ret; +} + + +/* + * Delete a service from the service list + * - The service must be unlinked, unlocked and not referenced! + * - We are called under _bh lock + */ +static void __ip_vs_del_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *nxt; + struct ip_vs_scheduler *old_sched; + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ip_vs_num_services--; + + ip_vs_kill_estimator(&svc->stats); + + /* Unbind scheduler */ + old_sched = svc->scheduler; + ip_vs_unbind_scheduler(svc); + if (old_sched) + ip_vs_scheduler_put(old_sched); + + /* Unbind app inc */ + if (svc->inc) { + ip_vs_app_inc_put(svc->inc); + svc->inc = NULL; + } + + /* + * Unlink the whole destination list + */ + list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { + __ip_vs_unlink_dest(svc, dest, 0); + __ip_vs_del_dest(dest); + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ip_vs_nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + if (atomic_read(&svc->refcnt) == 0) + kfree(svc); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + + /* + * Unhash it from the service table + */ + write_lock_bh(&__ip_vs_svc_lock); + + ip_vs_svc_unhash(svc); + + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + __ip_vs_del_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + return 0; +} + + +/* + * Flush all the virtual services + */ +static int ip_vs_flush(void) +{ + int idx; + struct ip_vs_service *svc, *nxt; + + /* + * Flush the service table hashed by + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, + &ip_vs_svc_fwm_table[idx], f_list) { + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + return 0; +} + + +/* + * Zero counters in a service or all services + */ +static int ip_vs_zero_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + + write_lock_bh(&__ip_vs_svc_lock); + list_for_each_entry(dest, &svc->destinations, n_list) { + ip_vs_zero_stats(&dest->stats); + } + ip_vs_zero_stats(&svc->stats); + write_unlock_bh(&__ip_vs_svc_lock); + return 0; +} + +static int ip_vs_zero_all(void) +{ + int idx; + struct ip_vs_service *svc; + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + ip_vs_zero_service(svc); + } + } + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + ip_vs_zero_service(svc); + } + } + + ip_vs_zero_stats(&ip_vs_stats); + return 0; +} + + +static int +proc_do_defense_mode(ctl_table *table, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 3)) { + /* Restore the correct value */ + *valp = val; + } else { + update_defense_level(); + } + } + return rc; +} + + +static int +proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val[2]; + int rc; + + /* backup the value first */ + memcpy(val, valp, sizeof(val)); + + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + /* Restore the correct value */ + memcpy(valp, val, sizeof(val)); + } + return rc; +} + + +/* + * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + */ + +static struct ctl_table vs_vars[] = { + { + .procname = "amemthresh", + .data = &sysctl_ip_vs_amemthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_IP_VS_DEBUG + { + .procname = "debug_level", + .data = &sysctl_ip_vs_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .procname = "am_droprate", + .data = &sysctl_ip_vs_am_droprate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "drop_entry", + .data = &sysctl_ip_vs_drop_entry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, + { + .procname = "drop_packet", + .data = &sysctl_ip_vs_drop_packet, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, + { + .procname = "secure_tcp", + .data = &sysctl_ip_vs_secure_tcp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, +#if 0 + { + .procname = "timeout_established", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_synsent", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_synrecv", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_finwait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_timewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_close", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_closewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_lastack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_listen", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_synack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_udp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_icmp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, +#endif + { + .procname = "cache_bypass", + .data = &sysctl_ip_vs_cache_bypass, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "expire_nodest_conn", + .data = &sysctl_ip_vs_expire_nodest_conn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "expire_quiescent_template", + .data = &sysctl_ip_vs_expire_quiescent_template, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "sync_threshold", + .data = &sysctl_ip_vs_sync_threshold, + .maxlen = sizeof(sysctl_ip_vs_sync_threshold), + .mode = 0644, + .proc_handler = &proc_do_sync_threshold, + }, + { + .procname = "nat_icmp_send", + .data = &sysctl_ip_vs_nat_icmp_send, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +const struct ctl_path net_vs_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "vs", }, + { } +}; +EXPORT_SYMBOL_GPL(net_vs_ctl_path); + +static struct ctl_table_header * sysctl_header; + +#ifdef CONFIG_PROC_FS + +struct ip_vs_iter { + struct list_head *table; + int bucket; +}; + +/* + * Write the contents of the VS rule table to a PROCfs file. + * (It is kept just for backward compatibility) + */ +static inline const char *ip_vs_fwd_name(unsigned flags) +{ + switch (flags & IP_VS_CONN_F_FWD_MASK) { + case IP_VS_CONN_F_LOCALNODE: + return "Local"; + case IP_VS_CONN_F_TUNNEL: + return "Tunnel"; + case IP_VS_CONN_F_DROUTE: + return "Route"; + default: + return "Masq"; + } +} + + +/* Get the Nth entry in the two lists */ +static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) +{ + struct ip_vs_iter *iter = seq->private; + int idx; + struct ip_vs_service *svc; + + /* look in hash by protocol */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (pos-- == 0){ + iter->table = ip_vs_svc_table; + iter->bucket = idx; + return svc; + } + } + } + + /* keep looking in fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (pos-- == 0) { + iter->table = ip_vs_svc_fwm_table; + iter->bucket = idx; + return svc; + } + } + } + + return NULL; +} + +static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) +__acquires(__ip_vs_svc_lock) +{ + + read_lock_bh(&__ip_vs_svc_lock); + return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; +} + + +static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *e; + struct ip_vs_iter *iter; + struct ip_vs_service *svc; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_info_array(seq,0); + + svc = v; + iter = seq->private; + + if (iter->table == ip_vs_svc_table) { + /* next service in table hashed by protocol */ + if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, s_list); + + + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], + s_list) { + return svc; + } + } + + iter->table = ip_vs_svc_fwm_table; + iter->bucket = -1; + goto scan_fwmark; + } + + /* next service in hashed by fwmark */ + if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, f_list); + + scan_fwmark: + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], + f_list) + return svc; + } + + return NULL; +} + +static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) +__releases(__ip_vs_svc_lock) +{ + read_unlock_bh(&__ip_vs_svc_lock); +} + + +static int ip_vs_info_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "IP Virtual Server version %d.%d.%d (size=%d)\n", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + seq_puts(seq, + "Prot LocalAddress:Port Scheduler Flags\n"); + seq_puts(seq, + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); + } else { + const struct ip_vs_service *svc = v; + const struct ip_vs_iter *iter = seq->private; + const struct ip_vs_dest *dest; + + if (iter->table == ip_vs_svc_table) { +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ", + ip_vs_proto_name(svc->protocol), + NIP6(svc->addr.in6), + ntohs(svc->port), + svc->scheduler->name); + else +#endif + seq_printf(seq, "%s %08X:%04X %s ", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr.ip), + ntohs(svc->port), + svc->scheduler->name); + } else { + seq_printf(seq, "FWM %08X %s ", + svc->fwmark, svc->scheduler->name); + } + + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + seq_printf(seq, "persistent %d %08X\n", + svc->timeout, + ntohl(svc->netmask)); + else + seq_putc(seq, '\n'); + + list_for_each_entry(dest, &svc->destinations, n_list) { +#ifdef CONFIG_IP_VS_IPV6 + if (dest->af == AF_INET6) + seq_printf(seq, + " -> [" NIP6_FMT "]:%04X" + " %-7s %-6d %-10d %-10d\n", + NIP6(dest->addr.in6), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + else +#endif + seq_printf(seq, + " -> %08X:%04X " + "%-7s %-6d %-10d %-10d\n", + ntohl(dest->addr.ip), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + + } + } + return 0; +} + +static const struct seq_operations ip_vs_info_seq_ops = { + .start = ip_vs_info_seq_start, + .next = ip_vs_info_seq_next, + .stop = ip_vs_info_seq_stop, + .show = ip_vs_info_seq_show, +}; + +static int ip_vs_info_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &ip_vs_info_seq_ops, + sizeof(struct ip_vs_iter)); +} + +static const struct file_operations ip_vs_info_fops = { + .owner = THIS_MODULE, + .open = ip_vs_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif + +struct ip_vs_stats ip_vs_stats = { + .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), +}; + +#ifdef CONFIG_PROC_FS +static int ip_vs_stats_show(struct seq_file *seq, void *v) +{ + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + " Conns Packets Packets Bytes Bytes\n"); + + spin_lock_bh(&ip_vs_stats.lock); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, + ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, + (unsigned long long) ip_vs_stats.ustats.inbytes, + (unsigned long long) ip_vs_stats.ustats.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq,"%8X %8X %8X %16X %16X\n", + ip_vs_stats.ustats.cps, + ip_vs_stats.ustats.inpps, + ip_vs_stats.ustats.outpps, + ip_vs_stats.ustats.inbps, + ip_vs_stats.ustats.outbps); + spin_unlock_bh(&ip_vs_stats.lock); + + return 0; +} + +static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, ip_vs_stats_show, NULL); +} + +static const struct file_operations ip_vs_stats_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +/* + * Set timeout values for tcp tcpfin udp in the timeout_table. + */ +static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) +{ + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", + u->tcp_timeout, + u->tcp_fin_timeout, + u->udp_timeout); + +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] + = u->tcp_timeout * HZ; + } + + if (u->tcp_fin_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] + = u->tcp_fin_timeout * HZ; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout) { + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] + = u->udp_timeout * HZ; + } +#endif + return 0; +} + + +#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) +#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ + sizeof(struct ip_vs_dest_user)) +#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) +#define MAX_ARG_LEN SVCDEST_ARG_LEN + +static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { + [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, + [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, +}; + +static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, + struct ip_vs_service_user *usvc_compat) +{ + usvc->af = AF_INET; + usvc->protocol = usvc_compat->protocol; + usvc->addr.ip = usvc_compat->addr; + usvc->port = usvc_compat->port; + usvc->fwmark = usvc_compat->fwmark; + + /* Deep copy of sched_name is not needed here */ + usvc->sched_name = usvc_compat->sched_name; + + usvc->flags = usvc_compat->flags; + usvc->timeout = usvc_compat->timeout; + usvc->netmask = usvc_compat->netmask; +} + +static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest_user *udest_compat) +{ + udest->addr.ip = udest_compat->addr; + udest->port = udest_compat->port; + udest->conn_flags = udest_compat->conn_flags; + udest->weight = udest_compat->weight; + udest->u_threshold = udest_compat->u_threshold; + udest->l_threshold = udest_compat->l_threshold; +} + +static int +do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + unsigned char arg[MAX_ARG_LEN]; + struct ip_vs_service_user *usvc_compat; + struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; + struct ip_vs_dest_user *udest_compat; + struct ip_vs_dest_user_kern udest; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (len != set_arglen[SET_CMDID(cmd)]) { + IP_VS_ERR("set_ctl: len %u != %u\n", + len, set_arglen[SET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, len) != 0) + return -EFAULT; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + if (mutex_lock_interruptible(&__ip_vs_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + + if (cmd == IP_VS_SO_SET_FLUSH) { + /* Flush the virtual service */ + ret = ip_vs_flush(); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_TIMEOUT) { + /* Set timeout values for (tcp tcpfin udp) */ + ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = stop_sync_thread(dm->state); + goto out_unlock; + } + + usvc_compat = (struct ip_vs_service_user *)arg; + udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); + + /* We only use the new structs internally, so copy userspace compat + * structs to extended internal versions */ + ip_vs_copy_usvc_compat(&usvc, usvc_compat); + ip_vs_copy_udest_compat(&udest, udest_compat); + + if (cmd == IP_VS_SO_SET_ZERO) { + /* if no service address is set, zero counters in all */ + if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { + ret = ip_vs_zero_all(); + goto out_unlock; + } + } + + /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ + if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) { + IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", + usvc.protocol, NIPQUAD(usvc.addr.ip), + ntohs(usvc.port), usvc.sched_name); + ret = -EFAULT; + goto out_unlock; + } + + /* Lookup the exact service by or fwmark */ + if (usvc.fwmark == 0) + svc = __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); + + if (cmd != IP_VS_SO_SET_ADD + && (svc == NULL || svc->protocol != usvc.protocol)) { + ret = -ESRCH; + goto out_unlock; + } + + switch (cmd) { + case IP_VS_SO_SET_ADD: + if (svc != NULL) + ret = -EEXIST; + else + ret = ip_vs_add_service(&usvc, &svc); + break; + case IP_VS_SO_SET_EDIT: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IP_VS_SO_SET_DEL: + ret = ip_vs_del_service(svc); + if (!ret) + goto out_unlock; + break; + case IP_VS_SO_SET_ZERO: + ret = ip_vs_zero_service(svc); + break; + case IP_VS_SO_SET_ADDDEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IP_VS_SO_SET_EDITDEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IP_VS_SO_SET_DELDEST: + ret = ip_vs_del_dest(svc, &udest); + break; + default: + ret = -EINVAL; + } + + if (svc) + ip_vs_service_put(svc); + + out_unlock: + mutex_unlock(&__ip_vs_mutex); + out_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +static void +ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ + spin_lock_bh(&src->lock); + memcpy(dst, &src->ustats, sizeof(*dst)); + spin_unlock_bh(&src->lock); +} + +static void +ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) +{ + dst->protocol = src->protocol; + dst->addr = src->addr.ip; + dst->port = src->port; + dst->fwmark = src->fwmark; + strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); + dst->flags = src->flags; + dst->timeout = src->timeout / HZ; + dst->netmask = src->netmask; + dst->num_dests = src->num_dests; + ip_vs_copy_stats(&dst->stats, &src->stats); +} + +static inline int +__ip_vs_get_service_entries(const struct ip_vs_get_services *get, + struct ip_vs_get_services __user *uptr) +{ + int idx, count=0; + struct ip_vs_service *svc; + struct ip_vs_service_entry entry; + int ret = 0; + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + out: + return ret; +} + +static inline int +__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, + struct ip_vs_get_dests __user *uptr) +{ + struct ip_vs_service *svc; + union nf_inet_addr addr = { .ip = get->addr }; + int ret = 0; + + if (get->fwmark) + svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); + else + svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, + get->port); + + if (svc) { + int count = 0; + struct ip_vs_dest *dest; + struct ip_vs_dest_entry entry; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (count >= get->num_dests) + break; + + entry.addr = dest->addr.ip; + entry.port = dest->port; + entry.conn_flags = atomic_read(&dest->conn_flags); + entry.weight = atomic_read(&dest->weight); + entry.u_threshold = dest->u_threshold; + entry.l_threshold = dest->l_threshold; + entry.activeconns = atomic_read(&dest->activeconns); + entry.inactconns = atomic_read(&dest->inactconns); + entry.persistconns = atomic_read(&dest->persistconns); + ip_vs_copy_stats(&entry.stats, &dest->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; + } + ip_vs_service_put(svc); + } else + ret = -ESRCH; + return ret; +} + +static inline void +__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) +{ +#ifdef CONFIG_IP_VS_PROTO_TCP + u->tcp_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + u->udp_timeout = + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; +#endif +} + + +#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) +#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) +#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) +#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) +#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) + +static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { + [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, + [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, +}; + +static int +do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + unsigned char arg[128]; + int ret = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (*len < get_arglen[GET_CMDID(cmd)]) { + IP_VS_ERR("get_ctl: len %u < %u\n", + *len, get_arglen[GET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) + return -EFAULT; + + if (mutex_lock_interruptible(&__ip_vs_mutex)) + return -ERESTARTSYS; + + switch (cmd) { + case IP_VS_SO_GET_VERSION: + { + char buf[64]; + + sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + if (copy_to_user(user, buf, strlen(buf)+1) != 0) { + ret = -EFAULT; + goto out; + } + *len = strlen(buf)+1; + } + break; + + case IP_VS_SO_GET_INFO: + { + struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; + info.size = IP_VS_CONN_TAB_SIZE; + info.num_services = ip_vs_num_services; + if (copy_to_user(user, &info, sizeof(info)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_SERVICES: + { + struct ip_vs_get_services *get; + int size; + + get = (struct ip_vs_get_services *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_service_entry) * get->num_services; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_service_entries(get, user); + } + break; + + case IP_VS_SO_GET_SERVICE: + { + struct ip_vs_service_entry *entry; + struct ip_vs_service *svc; + union nf_inet_addr addr; + + entry = (struct ip_vs_service_entry *)arg; + addr.ip = entry->addr; + if (entry->fwmark) + svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); + else + svc = __ip_vs_service_get(AF_INET, entry->protocol, + &addr, entry->port); + if (svc) { + ip_vs_copy_service(entry, svc); + if (copy_to_user(user, entry, sizeof(*entry)) != 0) + ret = -EFAULT; + ip_vs_service_put(svc); + } else + ret = -ESRCH; + } + break; + + case IP_VS_SO_GET_DESTS: + { + struct ip_vs_get_dests *get; + int size; + + get = (struct ip_vs_get_dests *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_dest_entry) * get->num_dests; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_dest_entries(get, user); + } + break; + + case IP_VS_SO_GET_TIMEOUT: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); + if (copy_to_user(user, &t, sizeof(t)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_DAEMON: + { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); + d[0].syncid = ip_vs_master_syncid; + } + if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); + d[1].syncid = ip_vs_backup_syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + } + break; + + default: + ret = -EINVAL; + } + + out: + mutex_unlock(&__ip_vs_mutex); + return ret; +} + + +static struct nf_sockopt_ops ip_vs_sockopts = { + .pf = PF_INET, + .set_optmin = IP_VS_BASE_CTL, + .set_optmax = IP_VS_SO_SET_MAX+1, + .set = do_ip_vs_set_ctl, + .get_optmin = IP_VS_BASE_CTL, + .get_optmax = IP_VS_SO_GET_MAX+1, + .get = do_ip_vs_get_ctl, + .owner = THIS_MODULE, +}; + +/* + * Generic Netlink interface + */ + +/* IPVS genetlink family */ +static struct genl_family ip_vs_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = IPVS_GENL_NAME, + .version = IPVS_GENL_VERSION, + .maxattr = IPVS_CMD_MAX, +}; + +/* Policy used for first-level command attributes */ +static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { + [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ +static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { + [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, + .len = IP_VS_IFNAME_MAXLEN }, + [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ +static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { + [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_SCHEDNAME_MAXLEN }, + [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, + .len = sizeof(struct ip_vs_flags) }, + [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ +static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { + [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, + struct ip_vs_stats *stats) +{ + struct nlattr *nl_stats = nla_nest_start(skb, container_type); + if (!nl_stats) + return -EMSGSIZE; + + spin_lock_bh(&stats->lock); + + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); + + spin_unlock_bh(&stats->lock); + + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + spin_unlock_bh(&stats->lock); + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_service(struct sk_buff *skb, + struct ip_vs_service *svc) +{ + struct nlattr *nl_service; + struct ip_vs_flags flags = { .flags = svc->flags, + .mask = ~0 }; + + nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); + if (!nl_service) + return -EMSGSIZE; + + NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); + + if (svc->fwmark) { + NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); + } else { + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); + NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); + } + + NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); + NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); + + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_service); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_service); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_service(struct sk_buff *skb, + struct ip_vs_service *svc, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_SERVICE); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_service(skb, svc) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_services(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0, i; + int start = cb->args[0]; + struct ip_vs_service *svc; + + mutex_lock(&__ip_vs_mutex); + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + cb->args[0] = idx; + + return skb->len; +} + +static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; + struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; + + /* Parse mandatory identifying service fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) + return -EINVAL; + + nla_af = attrs[IPVS_SVC_ATTR_AF]; + nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; + nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; + nla_port = attrs[IPVS_SVC_ATTR_PORT]; + nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; + + if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) + return -EINVAL; + + usvc->af = nla_get_u16(nla_af); +#ifdef CONFIG_IP_VS_IPV6 + if (usvc->af != AF_INET && usvc->af != AF_INET6) +#else + if (usvc->af != AF_INET) +#endif + return -EAFNOSUPPORT; + + if (nla_fwmark) { + usvc->protocol = IPPROTO_TCP; + usvc->fwmark = nla_get_u32(nla_fwmark); + } else { + usvc->protocol = nla_get_u16(nla_protocol); + nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); + usvc->port = nla_get_u16(nla_port); + usvc->fwmark = 0; + } + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_sched, *nla_flags, *nla_timeout, + *nla_netmask; + struct ip_vs_flags flags; + struct ip_vs_service *svc; + + nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; + nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; + nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; + + if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) + return -EINVAL; + + nla_memcpy(&flags, nla_flags, sizeof(flags)); + + /* prefill flags from service if it already exists */ + if (usvc->fwmark) + svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark); + else + svc = __ip_vs_service_get(usvc->af, usvc->protocol, + &usvc->addr, usvc->port); + if (svc) { + usvc->flags = svc->flags; + ip_vs_service_put(svc); + } else + usvc->flags = 0; + + /* set new flags from userland */ + usvc->flags = (usvc->flags & ~flags.mask) | + (flags.flags & flags.mask); + usvc->sched_name = nla_data(nla_sched); + usvc->timeout = nla_get_u32(nla_timeout); + usvc->netmask = nla_get_u32(nla_netmask); + } + + return 0; +} + +static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) +{ + struct ip_vs_service_user_kern usvc; + int ret; + + ret = ip_vs_genl_parse_service(&usvc, nla, 0); + if (ret) + return ERR_PTR(ret); + + if (usvc.fwmark) + return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); + else + return __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); +} + +static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) +{ + struct nlattr *nl_dest; + + nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); + if (!nl_dest) + return -EMSGSIZE; + + NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); + NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); + + NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, + atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns)); + + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_dest); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_dest); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DEST); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_dest(skb, dest) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dests(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + + mutex_lock(&__ip_vs_mutex); + + /* Try to find the service for which to dump destinations */ + if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, + IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) + goto out_err; + + svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc) || svc == NULL) + goto out_err; + + /* Dump the destinations */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + +nla_put_failure: + cb->args[0] = idx; + ip_vs_service_put(svc); + +out_err: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; + struct nlattr *nla_addr, *nla_port; + + /* Parse mandatory identifying destination fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) + return -EINVAL; + + nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; + nla_port = attrs[IPVS_DEST_ATTR_PORT]; + + if (!(nla_addr && nla_port)) + return -EINVAL; + + nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); + udest->port = nla_get_u16(nla_port); + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, + *nla_l_thresh; + + nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; + nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; + nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; + nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; + + if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) + return -EINVAL; + + udest->conn_flags = nla_get_u32(nla_fwd) + & IP_VS_CONN_F_FWD_MASK; + udest->weight = nla_get_u32(nla_weight); + udest->u_threshold = nla_get_u32(nla_u_thresh); + udest->l_threshold = nla_get_u32(nla_l_thresh); + } + + return 0; +} + +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid) +{ + struct nlattr *nl_daemon; + + nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); + if (!nl_daemon) + return -EMSGSIZE; + + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); + NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); + + nla_nest_end(skb, nl_daemon); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_daemon); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid, + struct netlink_callback *cb) +{ + void *hdr; + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DAEMON); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemons(struct sk_buff *skb, + struct netlink_callback *cb) +{ + mutex_lock(&__ip_vs_mutex); + if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, + ip_vs_master_mcast_ifn, + ip_vs_master_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[0] = 1; + } + + if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, + ip_vs_backup_mcast_ifn, + ip_vs_backup_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[1] = 1; + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_new_daemon(struct nlattr **attrs) +{ + if (!(attrs[IPVS_DAEMON_ATTR_STATE] && + attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && + attrs[IPVS_DAEMON_ATTR_SYNC_ID])) + return -EINVAL; + + return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); +} + +static int ip_vs_genl_del_daemon(struct nlattr **attrs) +{ + if (!attrs[IPVS_DAEMON_ATTR_STATE]) + return -EINVAL; + + return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); +} + +static int ip_vs_genl_set_config(struct nlattr **attrs) +{ + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) + t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) + t.tcp_fin_timeout = + nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) + t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); + + return ip_vs_set_timeout(&t); +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct ip_vs_service *svc = NULL; + struct ip_vs_service_user_kern usvc; + struct ip_vs_dest_user_kern udest; + int ret = 0, cmd; + int need_full_svc = 0, need_full_dest = 0; + + cmd = info->genlhdr->cmd; + + mutex_lock(&__ip_vs_mutex); + + if (cmd == IPVS_CMD_FLUSH) { + ret = ip_vs_flush(); + goto out; + } else if (cmd == IPVS_CMD_SET_CONFIG) { + ret = ip_vs_genl_set_config(info->attrs); + goto out; + } else if (cmd == IPVS_CMD_NEW_DAEMON || + cmd == IPVS_CMD_DEL_DAEMON) { + + struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; + + if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || + nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, + info->attrs[IPVS_CMD_ATTR_DAEMON], + ip_vs_daemon_policy)) { + ret = -EINVAL; + goto out; + } + + if (cmd == IPVS_CMD_NEW_DAEMON) + ret = ip_vs_genl_new_daemon(daemon_attrs); + else + ret = ip_vs_genl_del_daemon(daemon_attrs); + goto out; + } else if (cmd == IPVS_CMD_ZERO && + !info->attrs[IPVS_CMD_ATTR_SERVICE]) { + ret = ip_vs_zero_all(); + goto out; + } + + /* All following commands require a service argument, so check if we + * received a valid one. We need a full service specification when + * adding / editing a service. Only identifying members otherwise. */ + if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) + need_full_svc = 1; + + ret = ip_vs_genl_parse_service(&usvc, + info->attrs[IPVS_CMD_ATTR_SERVICE], + need_full_svc); + if (ret) + goto out; + + /* Lookup the exact service by or fwmark */ + if (usvc.fwmark == 0) + svc = __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); + + /* Unless we're adding a new service, the service must already exist */ + if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { + ret = -ESRCH; + goto out; + } + + /* Destination commands require a valid destination argument. For + * adding / editing a destination, we need a full destination + * specification. */ + if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || + cmd == IPVS_CMD_DEL_DEST) { + if (cmd != IPVS_CMD_DEL_DEST) + need_full_dest = 1; + + ret = ip_vs_genl_parse_dest(&udest, + info->attrs[IPVS_CMD_ATTR_DEST], + need_full_dest); + if (ret) + goto out; + } + + switch (cmd) { + case IPVS_CMD_NEW_SERVICE: + if (svc == NULL) + ret = ip_vs_add_service(&usvc, &svc); + else + ret = -EEXIST; + break; + case IPVS_CMD_SET_SERVICE: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IPVS_CMD_DEL_SERVICE: + ret = ip_vs_del_service(svc); + break; + case IPVS_CMD_NEW_DEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IPVS_CMD_SET_DEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IPVS_CMD_DEL_DEST: + ret = ip_vs_del_dest(svc, &udest); + break; + case IPVS_CMD_ZERO: + ret = ip_vs_zero_service(svc); + break; + default: + ret = -EINVAL; + } + +out: + if (svc) + ip_vs_service_put(svc); + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + +static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *reply; + int ret, cmd, reply_cmd; + + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_GET_SERVICE) + reply_cmd = IPVS_CMD_NEW_SERVICE; + else if (cmd == IPVS_CMD_GET_INFO) + reply_cmd = IPVS_CMD_SET_INFO; + else if (cmd == IPVS_CMD_GET_CONFIG) + reply_cmd = IPVS_CMD_SET_CONFIG; + else { + IP_VS_ERR("unknown Generic Netlink command\n"); + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&__ip_vs_mutex); + + reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); + if (reply == NULL) + goto nla_put_failure; + + switch (cmd) { + case IPVS_CMD_GET_SERVICE: + { + struct ip_vs_service *svc; + + svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc)) { + ret = PTR_ERR(svc); + goto out_err; + } else if (svc) { + ret = ip_vs_genl_fill_service(msg, svc); + ip_vs_service_put(svc); + if (ret) + goto nla_put_failure; + } else { + ret = -ESRCH; + goto out_err; + } + + break; + } + + case IPVS_CMD_GET_CONFIG: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); +#ifdef CONFIG_IP_VS_PROTO_TCP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); +#endif + + break; + } + + case IPVS_CMD_GET_INFO: + NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); + NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + IP_VS_CONN_TAB_SIZE); + break; + } + + genlmsg_end(msg, reply); + ret = genlmsg_unicast(msg, info->snd_pid); + goto out; + +nla_put_failure: + IP_VS_ERR("not enough space in Netlink message\n"); + ret = -EMSGSIZE; + +out_err: + nlmsg_free(msg); +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + + +static struct genl_ops ip_vs_genl_ops[] __read_mostly = { + { + .cmd = IPVS_CMD_NEW_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_SERVICE, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + .dumpit = ip_vs_genl_dump_services, + .policy = ip_vs_cmd_policy, + }, + { + .cmd = IPVS_CMD_NEW_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .dumpit = ip_vs_genl_dump_dests, + }, + { + .cmd = IPVS_CMD_NEW_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DAEMON, + .flags = GENL_ADMIN_PERM, + .dumpit = ip_vs_genl_dump_daemons, + }, + { + .cmd = IPVS_CMD_SET_CONFIG, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_CONFIG, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_GET_INFO, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_ZERO, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_FLUSH, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, +}; + +static int __init ip_vs_genl_register(void) +{ + int ret, i; + + ret = genl_register_family(&ip_vs_genl_family); + if (ret) + return ret; + + for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) { + ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]); + if (ret) + goto err_out; + } + return 0; + +err_out: + genl_unregister_family(&ip_vs_genl_family); + return ret; +} + +static void ip_vs_genl_unregister(void) +{ + genl_unregister_family(&ip_vs_genl_family); +} + +/* End of Generic Netlink interface definitions */ + + +int __init ip_vs_control_init(void) +{ + int ret; + int idx; + + EnterFunction(2); + + ret = nf_register_sockopt(&ip_vs_sockopts); + if (ret) { + IP_VS_ERR("cannot register sockopt.\n"); + return ret; + } + + ret = ip_vs_genl_register(); + if (ret) { + IP_VS_ERR("cannot register Generic Netlink interface.\n"); + nf_unregister_sockopt(&ip_vs_sockopts); + return ret; + } + + proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); + + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); + + /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_rtable[idx]); + } + + ip_vs_new_estimator(&ip_vs_stats); + + /* Hook the defense timer */ + schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); + + LeaveFunction(2); + return 0; +} + + +void ip_vs_control_cleanup(void) +{ + EnterFunction(2); + ip_vs_trash_cleanup(); + cancel_rearming_delayed_work(&defense_work); + cancel_work_sync(&defense_work.work); + ip_vs_kill_estimator(&ip_vs_stats); + unregister_sysctl_table(sysctl_header); + proc_net_remove(&init_net, "ip_vs_stats"); + proc_net_remove(&init_net, "ip_vs"); + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); + LeaveFunction(2); +} diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c new file mode 100644 index 00000000000..a16943fd72f --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -0,0 +1,261 @@ +/* + * IPVS: Destination Hashing scheduling module + * + * Authors: Wensong Zhang + * + * Inspired by the consistent hashing scheduler patch from + * Thomas Proell + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The dh algorithm is to select server by the hash key of destination IP + * address. The pseudo code is as follows: + * + * n <- servernode[dest_ip]; + * if (n is dead) OR + * (n is overloaded) OR (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet destination IP address to the current server + * array. If the dh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include +#include +#include +#include + +#include + + +/* + * IPVS DH bucket + */ +struct ip_vs_dh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS DH entry hash table + */ +#ifndef CONFIG_IP_VS_DH_TAB_BITS +#define CONFIG_IP_VS_DH_TAB_BITS 8 +#endif +#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS +#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) +#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS DH entry + */ +static inline unsigned ip_vs_dh_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr) +{ + return (tbl[ip_vs_dh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_dh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; idest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +{ + int i; + struct ip_vs_dh_bucket *b; + + b = tbl; + for (i=0; idest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_dh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl; + + /* allocate the DH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Destination hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_dh_bucket *tbl; + struct iphdr *iph = ip_hdr(skb); + + IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_dh_bucket *)svc->sched_data; + dest = ip_vs_dh_get(tbl, iph->daddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS DH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_dh_scheduler = +{ + .name = "dh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_dh_init_svc, + .done_service = ip_vs_dh_done_svc, + .update_service = ip_vs_dh_update_svc, + .schedule = ip_vs_dh_schedule, +}; + + +static int __init ip_vs_dh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +static void __exit ip_vs_dh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +module_init(ip_vs_dh_init); +module_exit(ip_vs_dh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c new file mode 100644 index 00000000000..2eb2860dabb --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -0,0 +1,166 @@ +/* + * ip_vs_est.c: simple rate estimator for IPVS + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + This code is to estimate rate in a shorter interval (such as 8 + seconds) for virtual services and real servers. For measure rate in a + long interval, it is easy to implement a user level daemon which + periodically reads those statistical counters and measure rate. + + Currently, the measurement is activated by slow timer handler. Hope + this measurement will not introduce too much load. + + We measure rate during the last 8 seconds every 2 seconds: + + avgrate = avgrate*(1-W) + rate*W + + where W = 2^(-2) + + NOTES. + + * The stored value for average bps is scaled by 2^5, so that maximal + rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. + + * A lot code is taken from net/sched/estimator.c + */ + + +static void estimation_timer(unsigned long arg); + +static LIST_HEAD(est_list); +static DEFINE_SPINLOCK(est_lock); +static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); + +static void estimation_timer(unsigned long arg) +{ + struct ip_vs_estimator *e; + struct ip_vs_stats *s; + u32 n_conns; + u32 n_inpkts, n_outpkts; + u64 n_inbytes, n_outbytes; + u32 rate; + + spin_lock(&est_lock); + list_for_each_entry(e, &est_list, list) { + s = container_of(e, struct ip_vs_stats, est); + + spin_lock(&s->lock); + n_conns = s->ustats.conns; + n_inpkts = s->ustats.inpkts; + n_outpkts = s->ustats.outpkts; + n_inbytes = s->ustats.inbytes; + n_outbytes = s->ustats.outbytes; + + /* scaled by 2^10, but divided 2 seconds */ + rate = (n_conns - e->last_conns)<<9; + e->last_conns = n_conns; + e->cps += ((long)rate - (long)e->cps)>>2; + s->ustats.cps = (e->cps+0x1FF)>>10; + + rate = (n_inpkts - e->last_inpkts)<<9; + e->last_inpkts = n_inpkts; + e->inpps += ((long)rate - (long)e->inpps)>>2; + s->ustats.inpps = (e->inpps+0x1FF)>>10; + + rate = (n_outpkts - e->last_outpkts)<<9; + e->last_outpkts = n_outpkts; + e->outpps += ((long)rate - (long)e->outpps)>>2; + s->ustats.outpps = (e->outpps+0x1FF)>>10; + + rate = (n_inbytes - e->last_inbytes)<<4; + e->last_inbytes = n_inbytes; + e->inbps += ((long)rate - (long)e->inbps)>>2; + s->ustats.inbps = (e->inbps+0xF)>>5; + + rate = (n_outbytes - e->last_outbytes)<<4; + e->last_outbytes = n_outbytes; + e->outbps += ((long)rate - (long)e->outbps)>>2; + s->ustats.outbps = (e->outbps+0xF)>>5; + spin_unlock(&s->lock); + } + spin_unlock(&est_lock); + mod_timer(&est_timer, jiffies + 2*HZ); +} + +void ip_vs_new_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + + INIT_LIST_HEAD(&est->list); + + est->last_conns = stats->ustats.conns; + est->cps = stats->ustats.cps<<10; + + est->last_inpkts = stats->ustats.inpkts; + est->inpps = stats->ustats.inpps<<10; + + est->last_outpkts = stats->ustats.outpkts; + est->outpps = stats->ustats.outpps<<10; + + est->last_inbytes = stats->ustats.inbytes; + est->inbps = stats->ustats.inbps<<5; + + est->last_outbytes = stats->ustats.outbytes; + est->outbps = stats->ustats.outbps<<5; + + spin_lock_bh(&est_lock); + list_add(&est->list, &est_list); + spin_unlock_bh(&est_lock); +} + +void ip_vs_kill_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + + spin_lock_bh(&est_lock); + list_del(&est->list); + spin_unlock_bh(&est_lock); +} + +void ip_vs_zero_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + + /* set counters zero, caller must hold the stats->lock lock */ + est->last_inbytes = 0; + est->last_outbytes = 0; + est->last_conns = 0; + est->last_inpkts = 0; + est->last_outpkts = 0; + est->cps = 0; + est->inpps = 0; + est->outpps = 0; + est->inbps = 0; + est->outbps = 0; +} + +int __init ip_vs_estimator_init(void) +{ + mod_timer(&est_timer, jiffies + 2 * HZ); + return 0; +} + +void ip_vs_estimator_cleanup(void) +{ + del_timer_sync(&est_timer); +} diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c new file mode 100644 index 00000000000..2e7dbd8b73a --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -0,0 +1,410 @@ +/* + * ip_vs_ftp.c: IPVS ftp application module + * + * Authors: Wensong Zhang + * + * Changes: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference + * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. + * + * IP_MASQ_FTP ftp masquerading module + * + * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 + * + * Author: Wouter Gadeyne + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define SERVER_STRING "227 Entering Passive Mode (" +#define CLIENT_STRING "PORT " + + +/* + * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper + * First port is set to the default port. + */ +static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; +module_param_array(ports, ushort, NULL, 0); +MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); + + +/* Dummy variable */ +static int ip_vs_ftp_pasv; + + +static int +ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +static int +ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +/* + * Get from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern" and terminated with the "term" character. + * is in network order. + */ +static int ip_vs_ftp_get_addrport(char *data, char *data_limit, + const char *pattern, size_t plen, char term, + __be32 *addr, __be16 *port, + char **start, char **end) +{ + unsigned char p[6]; + int i = 0; + + if (data_limit - data < plen) { + /* check if there is partial match */ + if (strnicmp(data, pattern, data_limit - data) == 0) + return -1; + else + return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { + return 0; + } + *start = data + plen; + + for (data = *start; *data != term; data++) { + if (data == data_limit) + return -1; + } + *end = data; + + memset(p, 0, sizeof(p)); + for (data = *start; data != *end; data++) { + if (*data >= '0' && *data <= '9') { + p[i] = p[i]*10 + *data - '0'; + } else if (*data == ',' && i < 5) { + i++; + } else { + /* unexpected character */ + return -1; + } + } + + if (i != 5) + return -1; + + *addr = get_unaligned((__be32 *)p); + *port = get_unaligned((__be16 *)(p + 4)); + return 1; +} + + +/* + * Look at outgoing ftp packets to catch the response to a PASV command + * from the server (inside-to-outside). + * When we see one, we build a connection entry with the client address, + * client port 0 (unknown at the moment), the server address and the + * server port. Mark the current connection entry as a control channel + * of the new entry. All this work is just to make the data connection + * can be scheduled to the right server later. + * + * The outgoing packet should be something like + * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". + * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + */ +static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff *skb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + char *start, *end; + union nf_inet_addr from; + __be16 port; + struct ip_vs_conn *n_cp; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned buf_len; + int ret; + +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (!skb_make_writable(skb, skb->len)) + return 0; + + if (cp->app_data == &ip_vs_ftp_pasv) { + iph = ip_hdr(skb); + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)th + (th->doff << 2); + data_limit = skb_tail_pointer(skb); + + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING, + sizeof(SERVER_STRING)-1, ')', + &from.ip, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " + "%u.%u.%u.%u:%d detected\n", + NIPQUAD(from.ip), ntohs(port), + NIPQUAD(cp->caddr.ip), 0); + + /* + * Now update or create an connection entry for it + */ + n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, + &cp->caddr, 0); + if (!n_cp) { + n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, + &cp->caddr, 0, + &cp->vaddr, port, + &from, port, + IP_VS_CONN_F_NO_CPORT, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Replace the old passive address with the new one + */ + from.ip = n_cp->vaddr.ip; + port = n_cp->vport; + sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip), + (ntohs(port)>>8)&255, ntohs(port)&255); + buf_len = strlen(buf); + + /* + * Calculate required delta-offset to keep TCP happy + */ + *diff = buf_len - (end-start); + + if (*diff == 0) { + /* simply replace it with new passive address */ + memcpy(start, buf, buf_len); + ret = 1; + } else { + ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, + end-start, buf, buf_len); + } + + cp->app_data = NULL; + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + return ret; + } + return 1; +} + + +/* + * Look at incoming ftp packets to catch the PASV/PORT command + * (outside-to-inside). + * + * The incoming packet having the PORT command should be something like + * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". + * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. + * In this case, we create a connection entry using the client address and + * port, so that the active ftp data connection from the server can reach + * the client. + */ +static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff *skb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_start, *data_limit; + char *start, *end; + union nf_inet_addr to; + __be16 port; + struct ip_vs_conn *n_cp; + +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + + /* no diff required for incoming packets */ + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (!skb_make_writable(skb, skb->len)) + return 0; + + /* + * Detecting whether it is passive + */ + iph = ip_hdr(skb); + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* Since there may be OPTIONS in the TCP packet and the HLEN is + the length of the header in 32-bit multiples, it is accurate + to calculate data address by th+HLEN*4 */ + data = data_start = (char *)th + (th->doff << 2); + data_limit = skb_tail_pointer(skb); + + while (data <= data_limit - 6) { + if (strnicmp(data, "PASV\r\n", 6) == 0) { + /* Passive mode on */ + IP_VS_DBG(7, "got PASV at %td of %td\n", + data - data_start, + data_limit - data_start); + cp->app_data = &ip_vs_ftp_pasv; + return 1; + } + data++; + } + + /* + * To support virtual FTP server, the scenerio is as follows: + * FTP client ----> Load Balancer ----> FTP server + * First detect the port number in the application data, + * then create a new connection entry for the coming data + * connection. + */ + if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING, sizeof(CLIENT_STRING)-1, + '\r', &to.ip, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", + NIPQUAD(to.ip), ntohs(port)); + + /* Passive mode off */ + cp->app_data = NULL; + + /* + * Now update or create a connection entry for it + */ + IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); + + n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, + &to, port, + &cp->vaddr, htons(ntohs(cp->vport)-1)); + if (!n_cp) { + n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, + &to, port, + &cp->vaddr, htons(ntohs(cp->vport)-1), + &cp->daddr, htons(ntohs(cp->dport)-1), + 0, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Move tunnel to listen state + */ + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + + return 1; +} + + +static struct ip_vs_app ip_vs_ftp = { + .name = "ftp", + .type = IP_VS_APP_TYPE_FTP, + .protocol = IPPROTO_TCP, + .module = THIS_MODULE, + .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), + .init_conn = ip_vs_ftp_init_conn, + .done_conn = ip_vs_ftp_done_conn, + .bind_conn = NULL, + .unbind_conn = NULL, + .pkt_out = ip_vs_ftp_out, + .pkt_in = ip_vs_ftp_in, +}; + + +/* + * ip_vs_ftp initialization + */ +static int __init ip_vs_ftp_init(void) +{ + int i, ret; + struct ip_vs_app *app = &ip_vs_ftp; + + ret = register_ip_vs_app(app); + if (ret) + return ret; + + for (i=0; iprotocol, ports[i]); + if (ret) + break; + IP_VS_INFO("%s: loaded support on port[%d] = %d\n", + app->name, i, ports[i]); + } + + if (ret) + unregister_ip_vs_app(app); + + return ret; +} + + +/* + * ip_vs_ftp finish. + */ +static void __exit ip_vs_ftp_exit(void) +{ + unregister_ip_vs_app(&ip_vs_ftp); +} + + +module_init(ip_vs_ftp_init); +module_exit(ip_vs_ftp_exit); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c new file mode 100644 index 00000000000..6ecef3518ca --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -0,0 +1,555 @@ +/* + * IPVS: Locality-Based Least-Connection scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitilized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * Julian Anastasov : replaced del_timer call with del_timer_sync + * to avoid the possible race between timer + * handler and del_timer thread in SMP + * + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns +#include +#include +#include +#include + +/* for sysctl */ +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 +static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct list_head list; + __be32 addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLC sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .procname = "lblc_expiration", + .data = &sysctl_ip_vs_lblc_expiration, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +{ + list_del(&en->list); + /* + * We don't kfree dest because it is refered either by its service + * or the trash dest list. + */ + atomic_dec(&en->dest->refcnt); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLC entry + */ +static inline unsigned ip_vs_lblc_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. + */ +static void +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned hash = ip_vs_lblc_hashkey(en->addr); + + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* + * Get ip_vs_lblc_entry associated with supplied parameters. Called under read + * lock + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) +{ + unsigned hash = ip_vs_lblc_hashkey(addr); + struct ip_vs_lblc_entry *en; + + list_for_each_entry(en, &tbl->bucket[hash], list) + if (en->addr == addr) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP + * address to a server. Called under write lock. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = ip_vs_lblc_get(tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); + return NULL; + } + + en->addr = daddr; + en->lastuse = jiffies; + + atomic_inc(&dest->refcnt); + en->dest = dest; + + ip_vs_lblc_hash(tbl, en); + } else if (en->dest != dest) { + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +{ + struct ip_vs_lblc_entry *en, *nxt; + int i; + + for (i=0; ibucket[i], list) { + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + } +} + + +static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en, *nxt; + unsigned long now = jiffies; + int i, j; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, + en->lastuse + sysctl_ip_vs_lblc_expiration)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblc_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblc_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblc_entry *en, *nxt; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblc_flush(tbl); + + /* release the table itself */ + kfree(tbl); + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); + + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr.ip), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct iphdr *iph = ip_hdr(skb); + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblc_entry *en; + + IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); + + /* First look in our cache */ + read_lock(&svc->sched_lock); + en = ip_vs_lblc_get(tbl, iph->daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* + * If the destination is not available, i.e. it's in the trash, + * we must ignore it, as it may be removed from under our feet, + * if someone drops our reference count. Our caller only makes + * sure that destinations, that are not in the trash, are not + * moved to the trash, while we are scheduling. But anyone can + * free up entries from the trash at any time. + */ + + if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) + dest = en->dest; + } + read_unlock(&svc->sched_lock); + + /* If the destination has a weight and is not overloaded, use it */ + if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; + + /* No cache entry or it is invalid, time to schedule */ + dest = __ip_vs_lblc_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblc_new(tbl, iph->daddr, dest); + write_unlock(&svc->sched_lock); + +out: + IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLC Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = +{ + .name = "lblc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_lblc_init_svc, + .done_service = ip_vs_lblc_done_svc, + .schedule = ip_vs_lblc_schedule, +}; + + +static int __init ip_vs_lblc_init(void) +{ + int ret; + + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); + if (ret) + unregister_sysctl_table(sysctl_header); + return ret; +} + + +static void __exit ip_vs_lblc_cleanup(void) +{ + unregister_sysctl_table(sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +module_init(ip_vs_lblc_init); +module_exit(ip_vs_lblc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c new file mode 100644 index 00000000000..1f75ea83bcf --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -0,0 +1,755 @@ +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. + * + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#include +#include +#include +#include +#include + +/* for sysctl */ +#include +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 +static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_list { + struct ip_vs_dest_list *next; /* list link */ + struct ip_vs_dest *dest; /* destination server */ +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct ip_vs_dest_list *list; /* destination list */ + rwlock_t lock; /* lock for this list */ +}; + + +static struct ip_vs_dest_list * +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e; + + for (e=set->list; e!=NULL; e=e->next) { + if (e->dest == dest) + /* already existed */ + return NULL; + } + + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (e == NULL) { + IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); + return NULL; + } + + atomic_inc(&dest->refcnt); + e->dest = dest; + + /* link it to the list */ + e->next = set->list; + set->list = e; + atomic_inc(&set->size); + + set->lastmod = jiffies; + return e; +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e, **ep; + + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + if (e->dest == dest) { + /* HIT */ + *ep = e->next; + atomic_dec(&set->size); + set->lastmod = jiffies; + atomic_dec(&e->dest->refcnt); + kfree(e); + break; + } + ep = &e->next; + } +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + *ep = e->next; + /* + * We don't kfree dest because it is refered either + * by its service or by the trash dest list. + */ + atomic_dec(&e->dest->refcnt); + kfree(e); + } + write_unlock(&set->lock); +} + +/* get weighted least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + least = e->dest; + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if ((atomic_read(&least->weight) > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if ((loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr.ip), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + most = e->dest; + if (atomic_read(&most->weight) > 0) { + moh = atomic_read(&most->activeconns) * 50 + + atomic_read(&most->inactconns); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if ((moh * atomic_read(&dest->weight) < + doh * atomic_read(&most->weight)) + && (atomic_read(&dest->weight) > 0)) { + most = dest; + moh = doh; + } + } + + IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(most->addr.ip), ntohs(most->port), + atomic_read(&most->activeconns), + atomic_read(&most->refcnt), + atomic_read(&most->weight), moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct list_head list; + __be32 addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLCR sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .procname = "lblcr_expiration", + .data = &sysctl_ip_vs_lblcr_expiration, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + list_del(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static void +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned hash = ip_vs_lblcr_hashkey(en->addr); + + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* + * Get ip_vs_lblcr_entry associated with supplied parameters. Called under + * read lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) +{ + unsigned hash = ip_vs_lblcr_hashkey(addr); + struct ip_vs_lblcr_entry *en; + + list_for_each_entry(en, &tbl->bucket[hash], list) + if (en->addr == addr) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. Called under write lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblcr_entry *en; + + en = ip_vs_lblcr_get(tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); + return NULL; + } + + en->addr = daddr; + en->lastuse = jiffies; + + /* initilize its dest set */ + atomic_set(&(en->set.size), 0); + en->set.list = NULL; + rwlock_init(&en->set.lock); + + ip_vs_lblcr_hash(tbl, en); + } + + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +{ + int i; + struct ip_vs_lblcr_entry *en, *nxt; + + /* No locking required, only called during cleanup. */ + for (i=0; ibucket[i], list) { + ip_vs_lblcr_free(en); + } + } +} + + +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, + now)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblcr_flush(tbl); + + /* release the table itself */ + kfree(tbl); + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); + + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr.ip), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + struct iphdr *iph = ip_hdr(skb); + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblcr_entry *en; + + IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); + + /* First look in our cache */ + read_lock(&svc->sched_lock); + en = ip_vs_lblcr_get(tbl, iph->daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* Get the least loaded destination */ + read_lock(&en->set.lock); + dest = ip_vs_dest_set_min(&en->set); + read_unlock(&en->set.lock); + + /* More than one destination + enough time passed by, cleanup */ + if (atomic_read(&en->set.size) > 1 && + time_after(jiffies, en->set.lastmod + + sysctl_ip_vs_lblcr_expiration)) { + struct ip_vs_dest *m; + + write_lock(&en->set.lock); + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + write_unlock(&en->set.lock); + } + + /* If the destination is not overloaded, use it */ + if (dest && !is_overloaded(dest, svc)) { + read_unlock(&svc->sched_lock); + goto out; + } + + /* The cache entry is invalid, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + read_unlock(&svc->sched_lock); + return NULL; + } + + /* Update our cache entry */ + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + } + read_unlock(&svc->sched_lock); + + if (dest) + goto out; + + /* No cache entry, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblcr_new(tbl, iph->daddr, dest); + write_unlock(&svc->sched_lock); + +out: + IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + .name = "lblcr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_lblcr_init_svc, + .done_service = ip_vs_lblcr_done_svc, + .schedule = ip_vs_lblcr_schedule, +}; + + +static int __init ip_vs_lblcr_init(void) +{ + int ret; + + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + if (ret) + unregister_sysctl_table(sysctl_header); + return ret; +} + + +static void __exit ip_vs_lblcr_cleanup(void) +{ + unregister_sysctl_table(sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +} + + +module_init(ip_vs_lblcr_init); +module_exit(ip_vs_lblcr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c new file mode 100644 index 00000000000..b69f808ac46 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -0,0 +1,103 @@ +/* + * IPVS: Least-Connection Scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. (This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->weight) == 0) + continue; + doh = ip_vs_lc_dest_overhead(dest); + if (!least || doh < loh) { + least = dest; + loh = doh; + } + } + + if (least) + IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + .name = "lc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_lc_schedule, +}; + + +static int __init ip_vs_lc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + +static void __exit ip_vs_lc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); +} + +module_init(ip_vs_lc_init); +module_exit(ip_vs_lc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c new file mode 100644 index 00000000000..9a2d8033f08 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -0,0 +1,138 @@ +/* + * IPVS: Never Queue scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The NQ algorithm adopts a two-speed model. When there is an idle server + * available, the job will be sent to the idle server, instead of waiting + * for a fast one. When there is no idle server available, the job will be + * sent to the server that minimize its expected delay (The Shortest + * Expected Delay scheduling algorithm). + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking NQ to me. + * + * The difference between NQ and SED is that NQ can improve overall + * system utilization. + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + + if (dest->flags & IP_VS_DEST_F_OVERLOAD || + !atomic_read(&dest->weight)) + continue; + + doh = ip_vs_nq_dest_overhead(dest); + + /* return the server directly if it is idle */ + if (atomic_read(&dest->activeconns) == 0) { + least = dest; + loh = doh; + goto out; + } + + if (!least || + (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight))) { + least = dest; + loh = doh; + } + } + + if (!least) + return NULL; + + out: + IP_VS_DBG_BUF(6, "NQ: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_nq_scheduler = +{ + .name = "nq", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_nq_schedule, +}; + + +static int __init ip_vs_nq_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +static void __exit ip_vs_nq_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +module_init(ip_vs_nq_init); +module_exit(ip_vs_nq_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c new file mode 100644 index 00000000000..0791f9e08fe --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -0,0 +1,288 @@ +/* + * ip_vs_proto.c: transport protocol load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * IPVS protocols can only be registered/unregistered when the ipvs + * module is loaded/unloaded, so no lock is needed in accessing the + * ipvs protocol table. + */ + +#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ +#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) + +static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; + + +/* + * register an ipvs protocol + */ +static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp->next = ip_vs_proto_table[hash]; + ip_vs_proto_table[hash] = pp; + + if (pp->init != NULL) + pp->init(pp); + + return 0; +} + + +/* + * unregister an ipvs protocol + */ +static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + struct ip_vs_protocol **pp_p; + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp_p = &ip_vs_proto_table[hash]; + for (; *pp_p; pp_p = &(*pp_p)->next) { + if (*pp_p == pp) { + *pp_p = pp->next; + if (pp->exit != NULL) + pp->exit(pp); + return 0; + } + } + + return -ESRCH; +} + + +/* + * get ip_vs_protocol object by its proto. + */ +struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) +{ + struct ip_vs_protocol *pp; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { + if (pp->protocol == proto) + return pp; + } + + return NULL; +} + + +/* + * Propagate event for state change to all protocols + */ +void ip_vs_protocol_timeout_change(int flags) +{ + struct ip_vs_protocol *pp; + int i; + + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { + if (pp->timeout_change) + pp->timeout_change(pp, flags); + } + } +} + + +int * +ip_vs_create_timeout_table(int *table, int size) +{ + return kmemdup(table, size, GFP_ATOMIC); +} + + +/* + * Set timeout value for state specified by name + */ +int +ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) +{ + int i; + + if (!table || !name || !to) + return -EINVAL; + + for (i = 0; i < num; i++) { + if (strcmp(names[i], name)) + continue; + table[i] = to * HZ; + return 0; + } + return -ENOENT; +} + + +const char * ip_vs_state_name(__u16 proto, int state) +{ + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + if (pp == NULL || pp->state_name == NULL) + return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; + return pp->state_name(state); +} + + +static void +ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[128]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->frag_off & htons(IP_OFFSET)) + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else { + __be16 _ports[2], *pptr +; + pptr = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, + NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else + sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", + pp->name, + NIPQUAD(ih->saddr), + ntohs(pptr[0]), + NIPQUAD(ih->daddr), + ntohs(pptr[1])); + } + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + +#ifdef CONFIG_IP_VS_IPV6 +static void +ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[192]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->nexthdr == IPPROTO_FRAGMENT) + sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag", + pp->name, NIP6(ih->saddr), + NIP6(ih->daddr)); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT, + pp->name, + NIP6(ih->saddr), + NIP6(ih->daddr)); + else + sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u", + pp->name, + NIP6(ih->saddr), + ntohs(pptr[0]), + NIP6(ih->daddr), + ntohs(pptr[1])); + } + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} +#endif + + +void +ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb->protocol == htons(ETH_P_IPV6)) + ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); +} + + +int __init ip_vs_protocol_init(void) +{ + char protocols[64]; +#define REGISTER_PROTOCOL(p) \ + do { \ + register_ip_vs_protocol(p); \ + strcat(protocols, ", "); \ + strcat(protocols, (p)->name); \ + } while (0) + + protocols[0] = '\0'; + protocols[2] = '\0'; +#ifdef CONFIG_IP_VS_PROTO_TCP + REGISTER_PROTOCOL(&ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + REGISTER_PROTOCOL(&ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + REGISTER_PROTOCOL(&ip_vs_protocol_ah); +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + REGISTER_PROTOCOL(&ip_vs_protocol_esp); +#endif + IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); + + return 0; +} + + +void ip_vs_protocol_cleanup(void) +{ + struct ip_vs_protocol *pp; + int i; + + /* unregister all the ipvs protocols */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pp = ip_vs_proto_table[i]) != NULL) + unregister_ip_vs_protocol(pp); + } +} diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c new file mode 100644 index 00000000000..80ab0c8e5b4 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -0,0 +1,235 @@ +/* + * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS + * + * Authors: Julian Anastasov , February 2002 + * Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include +#include +#include +#include +#include +#include + +#include + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + + +static struct ip_vs_conn * +ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(af, IPPROTO_UDP, + &iph->saddr, + htons(PORT_ISAKMP), + &iph->daddr, + htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_in_get(af, IPPROTO_UDP, + &iph->daddr, + htons(PORT_ISAKMP), + &iph->saddr, + htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + pp->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_esp_conn_out_get(int af, const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(af, IPPROTO_UDP, + &iph->saddr, + htons(PORT_ISAKMP), + &iph->daddr, + htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_out_get(af, IPPROTO_UDP, + &iph->daddr, + htons(PORT_ISAKMP), + &iph->saddr, + htons(PORT_ISAKMP)); + } + + if (!cp) { + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + pp->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static int +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * AH/ESP is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + + +static void +ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + +#ifdef CONFIG_IP_VS_IPV6 +static void +ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT, + pp->name, NIP6(ih->saddr), + NIP6(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} +#endif + +static void +ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb->protocol == htons(ETH_P_IPV6)) + ah_esp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ah_esp_debug_packet_v4(pp, skb, offset, msg); +} + + +static void ah_esp_init(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +static void ah_esp_exit(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +#ifdef CONFIG_IP_VS_PROTO_AH +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ + .set_state_timeout = NULL, +}; +#endif + +#ifdef CONFIG_IP_VS_PROTO_ESP +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c new file mode 100644 index 00000000000..dd4566ea2bf --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -0,0 +1,732 @@ +/* + * ip_vs_proto_tcp.c: TCP load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include /* for tcphdr */ +#include +#include /* for csum_tcpudp_magic */ +#include +#include +#include + +#include + + +static struct ip_vs_conn * +tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + return ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + return ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } +} + +static struct ip_vs_conn * +tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + return ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + return ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } +} + + +static int +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + struct tcphdr _tcph, *th; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); + if (th == NULL) { + *verdict = NF_DROP; + return 0; + } + + if (th->syn && + (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, + th->dest))) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +tcp_fast_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); +} + + +static inline void +tcp_partial_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(tcph->check)))); +} + + +static int +tcp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + if (!ip_vs_app_pkt_out(cp, skb)) + return 0; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->source = cp->vport; + + /* Adjust TCP checksums */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - tcphoff)); + } else if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, tcph->check, + (char*)&(tcph->check) - (char*)tcph); + } + return 1; +} + + +static int +tcp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn and iph ack_seq stuff + */ + if (!ip_vs_app_pkt_in(cp, skb)) + return 0; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->dest = cp->dport; + + /* + * Adjust TCP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - tcphoff)); + } else if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - tcphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - tcphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. */ + break; + } + + return 1; +} + + +#define TCP_DIR_INPUT 0 +#define TCP_DIR_OUTPUT 4 +#define TCP_DIR_INPUT_ONLY 8 + +static const int tcp_state_off[IP_VS_DIR_LAST] = { + [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, + [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, + [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, +}; + +/* + * Timeout table[state] + */ +static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 120*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + +static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = "NONE", + [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", + [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", + [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", + [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", + [IP_VS_TCP_S_CLOSE] = "CLOSE", + [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", + [IP_VS_TCP_S_LISTEN] = "LISTEN", + [IP_VS_TCP_S_SYNACK] = "SYNACK", + [IP_VS_TCP_S_LAST] = "BUG!", +}; + +#define sNO IP_VS_TCP_S_NONE +#define sES IP_VS_TCP_S_ESTABLISHED +#define sSS IP_VS_TCP_S_SYN_SENT +#define sSR IP_VS_TCP_S_SYN_RECV +#define sFW IP_VS_TCP_S_FIN_WAIT +#define sTW IP_VS_TCP_S_TIME_WAIT +#define sCL IP_VS_TCP_S_CLOSE +#define sCW IP_VS_TCP_S_CLOSE_WAIT +#define sLA IP_VS_TCP_S_LAST_ACK +#define sLI IP_VS_TCP_S_LISTEN +#define sSA IP_VS_TCP_S_SYNACK + +struct tcp_states_t { + int next_state[IP_VS_TCP_S_LAST]; +}; + +static const char * tcp_state_name(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return "ERR!"; + return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; +} + +static struct tcp_states_t tcp_states [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t tcp_states_dos [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, +/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t *tcp_state_table = tcp_states; + + +static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) +{ + int on = (flags & 1); /* secure_tcp */ + + /* + ** FIXME: change secure_tcp to independent sysctl var + ** or make it per-service or per-app because it is valid + ** for most if not for all of the applications. Something + ** like "capabilities" (flags) for each object. + */ + tcp_state_table = (on? tcp_states_dos : tcp_states); +} + +static int +tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, + tcp_state_name_table, sname, to); +} + +static inline int tcp_state_idx(struct tcphdr *th) +{ + if (th->rst) + return 3; + if (th->syn) + return 0; + if (th->fin) + return 1; + if (th->ack) + return 2; + return -1; +} + +static inline void +set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + int direction, struct tcphdr *th) +{ + int state_idx; + int new_state = IP_VS_TCP_S_CLOSE; + int state_off = tcp_state_off[direction]; + + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (state_off == TCP_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + state_off = TCP_DIR_INPUT_ONLY; + } + + if ((state_idx = tcp_state_idx(th)) < 0) { + IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); + goto tcp_state_out; + } + + new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; + + tcp_state_out: + if (new_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pp->name, + ((state_off == TCP_DIR_OUTPUT) ? + "output " : "input "), + th->syn ? 'S' : '.', + th->fin ? 'F' : '.', + th->ack ? 'A' : '.', + th->rst ? 'R' : '.', + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + atomic_read(&cp->refcnt)); + + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + + cp->timeout = pp->timeout_table[cp->state = new_state]; +} + + +/* + * Handle state transitions + */ +static int +tcp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + struct tcphdr _tcph, *th; + +#ifdef CONFIG_IP_VS_IPV6 + int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + int ihl = ip_hdrlen(skb); +#endif + + th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + + spin_lock(&cp->lock); + set_tcp_state(pp, cp, direction, th); + spin_unlock(&cp->lock); + + return 1; +} + + +/* + * Hash table for TCP application incarnations + */ +#define TCP_APP_TAB_BITS 4 +#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) +#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) + +static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; +static DEFINE_SPINLOCK(tcp_app_lock); + +static inline __u16 tcp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) + & TCP_APP_TAB_MASK; +} + + +static int tcp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + + hash = tcp_app_hashkey(port); + + spin_lock_bh(&tcp_app_lock); + list_for_each_entry(i, &tcp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &tcp_apps[hash]); + atomic_inc(&ip_vs_protocol_tcp.appcnt); + + out: + spin_unlock_bh(&tcp_app_lock); + return ret; +} + + +static void +tcp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&tcp_app_lock); + atomic_dec(&ip_vs_protocol_tcp.appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&tcp_app_lock); +} + + +static int +tcp_app_conn_bind(struct ip_vs_conn *cp) +{ + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = tcp_app_hashkey(cp->vport); + + spin_lock(&tcp_app_lock); + list_for_each_entry(inc, &tcp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&tcp_app_lock); + + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&tcp_app_lock); + + out: + return result; +} + + +/* + * Set LISTEN timeout. (ip_vs_conn_put will setup timer) + */ +void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) +{ + spin_lock(&cp->lock); + cp->state = IP_VS_TCP_S_LISTEN; + cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; + spin_unlock(&cp->lock); +} + + +static void ip_vs_tcp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(tcp_apps); + pp->timeout_table = tcp_timeouts; +} + + +static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) +{ +} + + +struct ip_vs_protocol ip_vs_protocol_tcp = { + .name = "TCP", + .protocol = IPPROTO_TCP, + .num_states = IP_VS_TCP_S_LAST, + .dont_defrag = 0, + .appcnt = ATOMIC_INIT(0), + .init = ip_vs_tcp_init, + .exit = ip_vs_tcp_exit, + .register_app = tcp_register_app, + .unregister_app = tcp_unregister_app, + .conn_schedule = tcp_conn_schedule, + .conn_in_get = tcp_conn_in_get, + .conn_out_get = tcp_conn_out_get, + .snat_handler = tcp_snat_handler, + .dnat_handler = tcp_dnat_handler, + .csum_check = tcp_csum_check, + .state_name = tcp_state_name, + .state_transition = tcp_state_transition, + .app_conn_bind = tcp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = tcp_timeout_change, + .set_state_timeout = tcp_set_state_timeout, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c new file mode 100644 index 00000000000..6eb6039d634 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -0,0 +1,533 @@ +/* + * ip_vs_proto_udp.c: UDP load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static struct ip_vs_conn * +udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + cp = ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } + + return cp; +} + + +static struct ip_vs_conn * +udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + cp = ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } + + return cp; +} + + +static int +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + struct udphdr _udph, *uh; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); + if (uh == NULL) { + *verdict = NF_DROP; + return 0; + } + + svc = ip_vs_service_get(af, skb->mark, iph.protocol, + &iph.daddr, uh->dest); + if (svc) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +udp_fast_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + if (!uhdr->check) + uhdr->check = CSUM_MANGLED_0; +} + +static inline void +udp_partial_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(uhdr->check)))); +} + + +static int +udp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Call application helper if needed + */ + if (!ip_vs_app_pkt_out(cp, skb)) + return 0; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->source = cp->vport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - udphoff)); + } else if (!cp->app && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, udph->check, + (char*)&(udph->check) - (char*)udph); + } + return 1; +} + + +static int +udp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn + */ + if (!ip_vs_app_pkt_in(cp, skb)) + return 0; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->dest = cp->dport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - udphoff)); + } else if (!cp->app && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + struct udphdr _udph, *uh; + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + + uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); + if (uh == NULL) + return 0; + + if (uh->check != 0) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, udphoff, + skb->len - udphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - udphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - udphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. */ + break; + } + } + return 1; +} + + +/* + * Note: the caller guarantees that only one of register_app, + * unregister_app or app_conn_bind is called each time. + */ + +#define UDP_APP_TAB_BITS 4 +#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) +#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) + +static struct list_head udp_apps[UDP_APP_TAB_SIZE]; +static DEFINE_SPINLOCK(udp_app_lock); + +static inline __u16 udp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) + & UDP_APP_TAB_MASK; +} + + +static int udp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + + hash = udp_app_hashkey(port); + + + spin_lock_bh(&udp_app_lock); + list_for_each_entry(i, &udp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &udp_apps[hash]); + atomic_inc(&ip_vs_protocol_udp.appcnt); + + out: + spin_unlock_bh(&udp_app_lock); + return ret; +} + + +static void +udp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&udp_app_lock); + atomic_dec(&ip_vs_protocol_udp.appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&udp_app_lock); +} + + +static int udp_app_conn_bind(struct ip_vs_conn *cp) +{ + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = udp_app_hashkey(cp->vport); + + spin_lock(&udp_app_lock); + list_for_each_entry(inc, &udp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&udp_app_lock); + + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&udp_app_lock); + + out: + return result; +} + + +static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = 5*60*HZ, + [IP_VS_UDP_S_LAST] = 2*HZ, +}; + +static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = "UDP", + [IP_VS_UDP_S_LAST] = "BUG!", +}; + + +static int +udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, + udp_state_name_table, sname, to); +} + +static const char * udp_state_name(int state) +{ + if (state >= IP_VS_UDP_S_LAST) + return "ERR!"; + return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; +} + +static int +udp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; + return 1; +} + +static void udp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(udp_apps); + pp->timeout_table = udp_timeouts; +} + +static void udp_exit(struct ip_vs_protocol *pp) +{ +} + + +struct ip_vs_protocol ip_vs_protocol_udp = { + .name = "UDP", + .protocol = IPPROTO_UDP, + .num_states = IP_VS_UDP_S_LAST, + .dont_defrag = 0, + .init = udp_init, + .exit = udp_exit, + .conn_schedule = udp_conn_schedule, + .conn_in_get = udp_conn_in_get, + .conn_out_get = udp_conn_out_get, + .snat_handler = udp_snat_handler, + .dnat_handler = udp_dnat_handler, + .csum_check = udp_csum_check, + .state_transition = udp_state_transition, + .state_name = udp_state_name, + .register_app = udp_register_app, + .unregister_app = udp_unregister_app, + .app_conn_bind = udp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, + .set_state_timeout = udp_set_state_timeout, +}; diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c new file mode 100644 index 00000000000..a22195f68ac --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -0,0 +1,112 @@ +/* + * IPVS: Round-Robin Scheduling module + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes/Changes: + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest + * Julian Anastasov : fixed the NULL pointer access bug in debugging + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_rr_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct list_head *p, *q; + struct ip_vs_dest *dest; + + IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); + + write_lock(&svc->sched_lock); + p = (struct list_head *)svc->sched_data; + p = p->next; + q = p; + do { + /* skip list head */ + if (q == &svc->destinations) { + q = q->next; + continue; + } + + dest = list_entry(q, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + q = q->next; + } while (q != p); + write_unlock(&svc->sched_lock); + return NULL; + + out: + svc->sched_data = q; + write_unlock(&svc->sched_lock); + IP_VS_DBG_BUF(6, "RR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + .name = "rr", /* name */ + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .init_service = ip_vs_rr_init_svc, + .update_service = ip_vs_rr_update_svc, + .schedule = ip_vs_rr_schedule, +}; + +static int __init ip_vs_rr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +static void __exit ip_vs_rr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +module_init(ip_vs_rr_init); +module_exit(ip_vs_rr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c new file mode 100644 index 00000000000..a46ad9e3501 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -0,0 +1,251 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +/* + * IPVS scheduler list + */ +static LIST_HEAD(ip_vs_schedulers); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_sched_lock); + + +/* + * Bind a service with a scheduler + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + int ret; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + if (scheduler == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); + return -EINVAL; + } + + svc->scheduler = scheduler; + + if (scheduler->init_service) { + ret = scheduler->init_service(svc); + if (ret) { + IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); + return ret; + } + } + + return 0; +} + + +/* + * Unbind a service with its scheduler + */ +int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + + sched = svc->scheduler; + if (sched == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); + return -EINVAL; + } + + if (sched->done_service) { + if (sched->done_service(svc) != 0) { + IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); + return -EINVAL; + } + } + + svc->scheduler = NULL; + return 0; +} + + +/* + * Get scheduler in the scheduler list by name + */ +static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", + sched_name); + + read_lock_bh(&__ip_vs_sched_lock); + + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + /* + * Test and get the modules atomically + */ + if (sched->module && !try_module_get(sched->module)) { + /* + * This scheduler is just deleted + */ + continue; + } + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + read_unlock_bh(&__ip_vs_sched_lock); + return sched; + } + if (sched->module) + module_put(sched->module); + } + + read_unlock_bh(&__ip_vs_sched_lock); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + request_module("ip_vs_%s", sched_name); + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + +void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) +{ + if (scheduler->module) + module_put(scheduler->module); +} + + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + struct ip_vs_scheduler *sched; + + if (!scheduler) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + if (!scheduler->name) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); + return -EINVAL; + } + + /* increase the module use count */ + ip_vs_use_count_inc(); + + write_lock_bh(&__ip_vs_sched_lock); + + if (!list_empty(&scheduler->n_list)) { + write_unlock_bh(&__ip_vs_sched_lock); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already linked\n", scheduler->name); + return -EINVAL; + } + + /* + * Make sure that the scheduler with this name doesn't exist + * in the scheduler list. + */ + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + if (strcmp(scheduler->name, sched->name) == 0) { + write_unlock_bh(&__ip_vs_sched_lock); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already existed in the system\n", + scheduler->name); + return -EINVAL; + } + } + /* + * Add it into the d-linked scheduler list + */ + list_add(&scheduler->n_list, &ip_vs_schedulers); + write_unlock_bh(&__ip_vs_sched_lock); + + IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); + + return 0; +} + + +/* + * Unregister a scheduler from the scheduler list + */ +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_sched_lock); + if (list_empty(&scheduler->n_list)) { + write_unlock_bh(&__ip_vs_sched_lock); + IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " + "is not in the list. failed\n", scheduler->name); + return -EINVAL; + } + + /* + * Remove it from the d-linked scheduler list + */ + list_del(&scheduler->n_list); + write_unlock_bh(&__ip_vs_sched_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); + + return 0; +} diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c new file mode 100644 index 00000000000..7d2f22f04b8 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -0,0 +1,140 @@ +/* + * IPVS: Shortest Expected Delay scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The SED algorithm attempts to minimize each job's expected delay until + * completion. The expected delay that the job will experience is + * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of + * jobs on the ith server and Ui is the fixed service rate (weight) of + * the ith server. The SED algorithm adopts a greedy policy that each does + * what is in its own best interest, i.e. to join the queue which would + * minimize its expected delay of completion. + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking SED to me. + * + * The difference between SED and WLC is that SED includes the incoming + * job in the cost function (the increment of 1). SED may outperform + * WLC, while scheduling big jobs under larger heterogeneous systems + * (the server weight varies a lot). + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_sed_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_sed_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "SED: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_sed_scheduler = +{ + .name = "sed", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_sed_schedule, +}; + + +static int __init ip_vs_sed_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +static void __exit ip_vs_sed_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +module_init(ip_vs_sed_init); +module_exit(ip_vs_sed_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c new file mode 100644 index 00000000000..1d96de27fef --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -0,0 +1,258 @@ +/* + * IPVS: Source Hashing scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The sh algorithm is to select server by the hash key of source IP + * address. The pseudo code is as follows: + * + * n <- servernode[src_ip]; + * if (n is dead) OR + * (n is overloaded) or (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet source IP address to the current server + * array. If the sh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include +#include +#include +#include + +#include + + +/* + * IPVS SH bucket + */ +struct ip_vs_sh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS SH entry hash table + */ +#ifndef CONFIG_IP_VS_SH_TAB_BITS +#define CONFIG_IP_VS_SH_TAB_BITS 8 +#endif +#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS +#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) +#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS SH entry + */ +static inline unsigned ip_vs_sh_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr) +{ + return (tbl[ip_vs_sh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_sh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; idest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +{ + int i; + struct ip_vs_sh_bucket *b; + + b = tbl; + for (i=0; idest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_sh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl; + + /* allocate the SH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Source Hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_sh_bucket *tbl; + struct iphdr *iph = ip_hdr(skb); + + IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_sh_bucket *)svc->sched_data; + dest = ip_vs_sh_get(tbl, iph->saddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->saddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS SH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_sh_scheduler = +{ + .name = "sh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_sh_init_svc, + .done_service = ip_vs_sh_done_svc, + .update_service = ip_vs_sh_update_svc, + .schedule = ip_vs_sh_schedule, +}; + + +static int __init ip_vs_sh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +static void __exit ip_vs_sh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +module_init(ip_vs_sh_init); +module_exit(ip_vs_sh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c new file mode 100644 index 00000000000..de5e7e118ee --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -0,0 +1,942 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * + * ip_vs_sync: sync connection info from master load balancer to backups + * through multicast + * + * Changes: + * Alexandre Cassen : Added master & backup support at a time. + * Alexandre Cassen : Added SyncID support for incoming sync + * messages filtering. + * Justin Ossevoort : Fix endian problem on sync message size. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ip_mc_join_group */ +#include +#include +#include +#include +#include + +#include +#include + +#include + +#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ +#define IP_VS_SYNC_PORT 8848 /* multicast port */ + + +/* + * IPVS sync connection entry + */ +struct ip_vs_sync_conn { + __u8 reserved; + + /* Protocol, addresses and port numbers */ + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + + /* Flags and state transition */ + __be16 flags; /* status flags */ + __be16 state; /* state info */ + + /* The sequence options start here */ +}; + +struct ip_vs_sync_conn_options { + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + +struct ip_vs_sync_thread_data { + struct socket *sock; + char *buf; +}; + +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) +#define FULL_CONN_SIZE \ +(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) + + +/* + The master mulitcasts messages to the backup load balancers in the + following format. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | . | + | . | + | . | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (n) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +#define SYNC_MESG_HEADER_LEN 4 +#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ + +struct ip_vs_sync_mesg { + __u8 nr_conns; + __u8 syncid; + __u16 size; + + /* ip_vs_sync_conn entries start here */ +}; + +/* the maximum length of sync (sending/receiving) message */ +static int sync_send_mesg_maxlen; +static int sync_recv_mesg_maxlen; + +struct ip_vs_sync_buff { + struct list_head list; + unsigned long firstuse; + + /* pointers for the message data */ + struct ip_vs_sync_mesg *mesg; + unsigned char *head; + unsigned char *end; +}; + + +/* the sync_buff list head and the lock */ +static LIST_HEAD(ip_vs_sync_queue); +static DEFINE_SPINLOCK(ip_vs_sync_lock); + +/* current sync_buff for accepting new conn entries */ +static struct ip_vs_sync_buff *curr_sb = NULL; +static DEFINE_SPINLOCK(curr_sb_lock); + +/* ipvs sync daemon state */ +volatile int ip_vs_sync_state = IP_VS_STATE_NONE; +volatile int ip_vs_master_syncid = 0; +volatile int ip_vs_backup_syncid = 0; + +/* multicast interface name */ +char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; +char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + +/* sync daemon tasks */ +static struct task_struct *sync_master_thread; +static struct task_struct *sync_backup_thread; + +/* multicast addr */ +static struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = __constant_htons(IP_VS_SYNC_PORT), + .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), +}; + + +static inline struct ip_vs_sync_buff *sb_dequeue(void) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ip_vs_sync_lock); + if (list_empty(&ip_vs_sync_queue)) { + sb = NULL; + } else { + sb = list_entry(ip_vs_sync_queue.next, + struct ip_vs_sync_buff, + list); + list_del(&sb->list); + } + spin_unlock_bh(&ip_vs_sync_lock); + + return sb; +} + +static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) +{ + struct ip_vs_sync_buff *sb; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { + kfree(sb); + return NULL; + } + sb->mesg->nr_conns = 0; + sb->mesg->syncid = ip_vs_master_syncid; + sb->mesg->size = 4; + sb->head = (unsigned char *)sb->mesg + 4; + sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; + sb->firstuse = jiffies; + return sb; +} + +static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) +{ + kfree(sb->mesg); + kfree(sb); +} + +static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +{ + spin_lock(&ip_vs_sync_lock); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) + list_add_tail(&sb->list, &ip_vs_sync_queue); + else + ip_vs_sync_buff_release(sb); + spin_unlock(&ip_vs_sync_lock); +} + +/* + * Get the current sync buffer if it has been created for more + * than the specified time or the specified time is zero. + */ +static inline struct ip_vs_sync_buff * +get_curr_sync_buff(unsigned long time) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&curr_sb_lock); + if (curr_sb && (time == 0 || + time_before(jiffies - curr_sb->firstuse, time))) { + sb = curr_sb; + curr_sb = NULL; + } else + sb = NULL; + spin_unlock_bh(&curr_sb_lock); + return sb; +} + + +/* + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. + */ +void ip_vs_sync_conn(struct ip_vs_conn *cp) +{ + struct ip_vs_sync_mesg *m; + struct ip_vs_sync_conn *s; + int len; + + spin_lock(&curr_sb_lock); + if (!curr_sb) { + if (!(curr_sb=ip_vs_sync_buff_create())) { + spin_unlock(&curr_sb_lock); + IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; + m = curr_sb->mesg; + s = (struct ip_vs_sync_conn *)curr_sb->head; + + /* copy members */ + s->protocol = cp->protocol; + s->cport = cp->cport; + s->vport = cp->vport; + s->dport = cp->dport; + s->caddr = cp->caddr.ip; + s->vaddr = cp->vaddr.ip; + s->daddr = cp->daddr.ip; + s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); + s->state = htons(cp->state); + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + struct ip_vs_sync_conn_options *opt = + (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(opt, &cp->in_seq, sizeof(*opt)); + } + + m->nr_conns++; + m->size += len; + curr_sb->head += len; + + /* check if there is a space for next one */ + if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { + sb_queue_tail(curr_sb); + curr_sb = NULL; + } + spin_unlock(&curr_sb_lock); + + /* synchronize its controller if it has */ + if (cp->control) + ip_vs_sync_conn(cp->control); +} + + +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. + */ +static void ip_vs_process_message(const char *buffer, const size_t buflen) +{ + struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; + struct ip_vs_sync_conn *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + struct ip_vs_dest *dest; + char *p; + int i; + + if (buflen < sizeof(struct ip_vs_sync_mesg)) { + IP_VS_ERR_RL("sync message header too short\n"); + return; + } + + /* Convert size back to host byte order */ + m->size = ntohs(m->size); + + if (buflen != m->size) { + IP_VS_ERR_RL("bogus sync message size\n"); + return; + } + + /* SyncID sanity check */ + if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { + IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", + m->syncid); + return; + } + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); + for (i=0; inr_conns; i++) { + unsigned flags, state; + + if (p + SIMPLE_CONN_SIZE > buffer+buflen) { + IP_VS_ERR_RL("bogus conn in sync message\n"); + return; + } + s = (struct ip_vs_sync_conn *) p; + flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; + flags &= ~IP_VS_CONN_F_HASHED; + if (flags & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + p += FULL_CONN_SIZE; + if (p > buffer+buflen) { + IP_VS_ERR_RL("bogus conn options in sync message\n"); + return; + } + } else { + opt = NULL; + p += SIMPLE_CONN_SIZE; + } + + state = ntohs(s->state); + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->protocol); + if (!pp) { + IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", + s->protocol); + continue; + } + if (state >= pp->num_states) { + IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", + pp->name, state); + continue; + } + } else { + /* protocol in templates is not used for state/timeout */ + pp = NULL; + if (state > 0) { + IP_VS_DBG(2, "Invalid template state %u in sync msg\n", + state); + state = 0; + } + } + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport); + else + cp = ip_vs_ct_in_get(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport); + if (!cp) { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + dest = ip_vs_find_dest(AF_INET, + (union nf_inet_addr *)&s->daddr, + s->dport, + (union nf_inet_addr *)&s->vaddr, + s->vport, + s->protocol); + /* Set the approprite ativity flag */ + if (s->protocol == IPPROTO_TCP) { + if (state != IP_VS_TCP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } + cp = ip_vs_conn_new(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport, + (union nf_inet_addr *)&s->daddr, + s->dport, + flags, dest); + if (dest) + atomic_dec(&dest->refcnt); + if (!cp) { + IP_VS_ERR("ip_vs_conn_new failed\n"); + return; + } + } else if (!cp->dest) { + dest = ip_vs_try_bind_dest(cp); + if (dest) + atomic_dec(&dest->refcnt); + } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && + (cp->state != state)) { + /* update active/inactive flag for the connection */ + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + + if (opt) + memcpy(&cp->in_seq, opt, sizeof(*opt)); + atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + cp->state = state; + cp->old_state = cp->state; + /* + * We can not recover the right timeout for templates + * in all cases, we can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + */ + if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) + cp->timeout = pp->timeout_table[state]; + else + cp->timeout = (3*60*HZ); + ip_vs_conn_put(cp); + } +} + + +/* + * Setup loopback of outgoing multicasts on a sending socket + */ +static void set_mcast_loop(struct sock *sk, u_char loop) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ + lock_sock(sk); + inet->mc_loop = loop ? 1 : 0; + release_sock(sk); +} + +/* + * Specify TTL for outgoing multicasts on a sending socket + */ +static void set_mcast_ttl(struct sock *sk, u_char ttl) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ + lock_sock(sk); + inet->mc_ttl = ttl; + release_sock(sk); +} + +/* + * Specifiy default interface for outgoing multicasts + */ +static int set_mcast_if(struct sock *sk, char *ifname) +{ + struct net_device *dev; + struct inet_sock *inet = inet_sk(sk); + + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + return -ENODEV; + + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + inet->mc_index = dev->ifindex; + /* inet->mc_addr = 0; */ + release_sock(sk); + + return 0; +} + + +/* + * Set the maximum length of sync message according to the + * specified interface's MTU. + */ +static int set_sync_mesg_maxlen(int sync_state) +{ + struct net_device *dev; + int num; + + if (sync_state == IP_VS_STATE_MASTER) { + if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) + return -ENODEV; + + num = (dev->mtu - sizeof(struct iphdr) - + sizeof(struct udphdr) - + SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; + sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + + SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); + IP_VS_DBG(7, "setting the maximum length of sync sending " + "message %d.\n", sync_send_mesg_maxlen); + } else if (sync_state == IP_VS_STATE_BACKUP) { + if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) + return -ENODEV; + + sync_recv_mesg_maxlen = dev->mtu - + sizeof(struct iphdr) - sizeof(struct udphdr); + IP_VS_DBG(7, "setting the maximum length of sync receiving " + "message %d.\n", sync_recv_mesg_maxlen); + } + + return 0; +} + + +/* + * Join a multicast group. + * the group is specified by a class D multicast address 224.0.0.0/8 + * in the in_addr structure passed in as a parameter. + */ +static int +join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) +{ + struct ip_mreqn mreq; + struct net_device *dev; + int ret; + + memset(&mreq, 0, sizeof(mreq)); + memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); + + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + mreq.imr_ifindex = dev->ifindex; + + lock_sock(sk); + ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); + + return ret; +} + + +static int bind_mcastif_addr(struct socket *sock, char *ifname) +{ + struct net_device *dev; + __be32 addr; + struct sockaddr_in sin; + + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + return -ENODEV; + + addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + if (!addr) + IP_VS_ERR("You probably need to specify IP address on " + "multicast interface.\n"); + + IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", + ifname, NIPQUAD(addr)); + + /* Now bind the socket with the address of multicast interface */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + sin.sin_port = 0; + + return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); +} + +/* + * Set up sending multicast socket over UDP + */ +static struct socket * make_send_sock(void) +{ + struct socket *sock; + int result; + + /* First create a socket */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return ERR_PTR(result); + } + + result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); + if (result < 0) { + IP_VS_ERR("Error setting outbound mcast interface\n"); + goto error; + } + + set_mcast_loop(sock->sk, 0); + set_mcast_ttl(sock->sk, 1); + + result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); + if (result < 0) { + IP_VS_ERR("Error binding address of the mcast interface\n"); + goto error; + } + + result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr), 0); + if (result < 0) { + IP_VS_ERR("Error connecting to the multicast addr\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return ERR_PTR(result); +} + + +/* + * Set up receiving multicast socket over UDP + */ +static struct socket * make_receive_sock(void) +{ + struct socket *sock; + int result; + + /* First create a socket */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return ERR_PTR(result); + } + + /* it is equivalent to the REUSEADDR option in user-space */ + sock->sk->sk_reuse = 1; + + result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr)); + if (result < 0) { + IP_VS_ERR("Error binding to the multicast addr\n"); + goto error; + } + + /* join the multicast group */ + result = join_mcast_group(sock->sk, + (struct in_addr *) &mcast_addr.sin_addr, + ip_vs_backup_mcast_ifn); + if (result < 0) { + IP_VS_ERR("Error joining to the multicast group\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return ERR_PTR(result); +} + + +static int +ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) +{ + struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; + struct kvec iov; + int len; + + EnterFunction(7); + iov.iov_base = (void *)buffer; + iov.iov_len = length; + + len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); + + LeaveFunction(7); + return len; +} + +static void +ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) +{ + int msize; + + msize = msg->size; + + /* Put size in network byte order */ + msg->size = htons(msg->size); + + if (ip_vs_send_async(sock, (char *)msg, msize) != msize) + IP_VS_ERR("ip_vs_send_async error\n"); +} + +static int +ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) +{ + struct msghdr msg = {NULL,}; + struct kvec iov; + int len; + + EnterFunction(7); + + /* Receive a packet */ + iov.iov_base = buffer; + iov.iov_len = (size_t)buflen; + + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + + if (len < 0) + return -1; + + LeaveFunction(7); + return len; +} + + +static int sync_thread_master(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + struct ip_vs_sync_buff *sb; + + IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_master_mcast_ifn, ip_vs_master_syncid); + + while (!kthread_should_stop()) { + while ((sb = sb_dequeue())) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + /* check if entries stay in curr_sb for 2 seconds */ + sb = get_curr_sync_buff(2 * HZ); + if (sb) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + schedule_timeout_interruptible(HZ); + } + + /* clean up the sync_buff queue */ + while ((sb=sb_dequeue())) { + ip_vs_sync_buff_release(sb); + } + + /* clean up the current sync_buff */ + if ((sb = get_curr_sync_buff(0))) { + ip_vs_sync_buff_release(sb); + } + + /* release the sending multicast socket */ + sock_release(tinfo->sock); + kfree(tinfo); + + return 0; +} + + +static int sync_thread_backup(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + int len; + + IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); + + while (!kthread_should_stop()) { + wait_event_interruptible(*tinfo->sock->sk->sk_sleep, + !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) + || kthread_should_stop()); + + /* do we have data now? */ + while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { + len = ip_vs_receive(tinfo->sock, tinfo->buf, + sync_recv_mesg_maxlen); + if (len <= 0) { + IP_VS_ERR("receiving message error\n"); + break; + } + + /* disable bottom half, because it accesses the data + shared by softirq while getting/creating conns */ + local_bh_disable(); + ip_vs_process_message(tinfo->buf, len); + local_bh_enable(); + } + } + + /* release the sending multicast socket */ + sock_release(tinfo->sock); + kfree(tinfo->buf); + kfree(tinfo); + + return 0; +} + + +int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +{ + struct ip_vs_sync_thread_data *tinfo; + struct task_struct **realtask, *task; + struct socket *sock; + char *name, *buf = NULL; + int (*threadfn)(void *data); + int result = -ENOMEM; + + IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", + sizeof(struct ip_vs_sync_conn)); + + if (state == IP_VS_STATE_MASTER) { + if (sync_master_thread) + return -EEXIST; + + strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, + sizeof(ip_vs_master_mcast_ifn)); + ip_vs_master_syncid = syncid; + realtask = &sync_master_thread; + name = "ipvs_syncmaster"; + threadfn = sync_thread_master; + sock = make_send_sock(); + } else if (state == IP_VS_STATE_BACKUP) { + if (sync_backup_thread) + return -EEXIST; + + strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, + sizeof(ip_vs_backup_mcast_ifn)); + ip_vs_backup_syncid = syncid; + realtask = &sync_backup_thread; + name = "ipvs_syncbackup"; + threadfn = sync_thread_backup; + sock = make_receive_sock(); + } else { + return -EINVAL; + } + + if (IS_ERR(sock)) { + result = PTR_ERR(sock); + goto out; + } + + set_sync_mesg_maxlen(state); + if (state == IP_VS_STATE_BACKUP) { + buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); + if (!buf) + goto outsocket; + } + + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + goto outbuf; + + tinfo->sock = sock; + tinfo->buf = buf; + + task = kthread_run(threadfn, tinfo, name); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } + + /* mark as active */ + *realtask = task; + ip_vs_sync_state |= state; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + return 0; + +outtinfo: + kfree(tinfo); +outbuf: + kfree(buf); +outsocket: + sock_release(sock); +out: + return result; +} + + +int stop_sync_thread(int state) +{ + IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); + + if (state == IP_VS_STATE_MASTER) { + if (!sync_master_thread) + return -ESRCH; + + IP_VS_INFO("stopping master sync thread %d ...\n", + task_pid_nr(sync_master_thread)); + + /* + * The lock synchronizes with sb_queue_tail(), so that we don't + * add sync buffers to the queue, when we are already in + * progress of stopping the master sync daemon. + */ + + spin_lock_bh(&ip_vs_sync_lock); + ip_vs_sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock_bh(&ip_vs_sync_lock); + kthread_stop(sync_master_thread); + sync_master_thread = NULL; + } else if (state == IP_VS_STATE_BACKUP) { + if (!sync_backup_thread) + return -ESRCH; + + IP_VS_INFO("stopping backup sync thread %d ...\n", + task_pid_nr(sync_backup_thread)); + + ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; + kthread_stop(sync_backup_thread); + sync_backup_thread = NULL; + } else { + return -EINVAL; + } + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return 0; +} diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c new file mode 100644 index 00000000000..8c596e71259 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -0,0 +1,128 @@ +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest + * Wensong Zhang : changed to use the inactconns in scheduling + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wlc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. (This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_wlc_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_wlc_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "WLC: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_wlc_scheduler = +{ + .name = "wlc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_wlc_schedule, +}; + + +static int __init ip_vs_wlc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +static void __exit ip_vs_wlc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +module_init(ip_vs_wlc_init); +module_exit(ip_vs_wlc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c new file mode 100644 index 00000000000..7ea92fed50b --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -0,0 +1,237 @@ +/* + * IPVS: Weighted Round-Robin Scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wrr_update_svc + * Julian Anastasov : fixed the bug of returning destination + * with weight 0 when all weights are zero + * + */ + +#include +#include +#include + +#include + +/* + * current destination pointer for weighted round-robin scheduling + */ +struct ip_vs_wrr_mark { + struct list_head *cl; /* current list head */ + int cw; /* current weight */ + int mw; /* maximum weight */ + int di; /* decreasing interval */ +}; + + +/* + * Get the gcd of server weights + */ +static int gcd(int a, int b) +{ + int c; + + while ((c = a % b)) { + a = b; + b = c; + } + return b; +} + +static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g ? g : 1; +} + + +/* + * Get the maximum weight of the service destinations. + */ +static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (atomic_read(&dest->weight) > weight) + weight = atomic_read(&dest->weight); + } + + return weight; +} + + +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark; + + /* + * Allocate the mark variable for WRR scheduling + */ + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); + if (mark == NULL) { + IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); + return -ENOMEM; + } + mark->cl = &svc->destinations; + mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + svc->sched_data = mark; + + return 0; +} + + +static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +{ + /* + * Release the mark variable + */ + kfree(svc->sched_data); + + return 0; +} + + +static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + mark->cl = &svc->destinations; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + if (mark->cw > mark->mw) + mark->cw = 0; + return 0; +} + + +/* + * Weighted Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_wrr_mark *mark = svc->sched_data; + struct list_head *p; + + IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); + + /* + * This loop will always terminate, because mark->cw in (0, max_weight] + * and at least one server has its weight equal to max_weight. + */ + write_lock(&svc->sched_lock); + p = mark->cl; + while (1) { + if (mark->cl == &svc->destinations) { + /* it is at the head of the destination list */ + + if (mark->cl == mark->cl->next) { + /* no dest entry */ + dest = NULL; + goto out; + } + + mark->cl = svc->destinations.next; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* + * Still zero, which means no available servers. + */ + if (mark->cw == 0) { + mark->cl = &svc->destinations; + IP_VS_ERR_RL("ip_vs_wrr_schedule(): " + "no available servers\n"); + dest = NULL; + goto out; + } + } + } else + mark->cl = mark->cl->next; + + if (mark->cl != &svc->destinations) { + /* not at the head of the list */ + dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) >= mark->cw) { + /* got it */ + break; + } + } + + if (mark->cl == p && mark->cw == mark->di) { + /* back to the start, and no dest is found. + It is only possible when all dests are OVERLOADED */ + dest = NULL; + goto out; + } + } + + IP_VS_DBG_BUF(6, "WRR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); + + out: + write_unlock(&svc->sched_lock); + return dest; +} + + +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { + .name = "wrr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .init_service = ip_vs_wrr_init_svc, + .done_service = ip_vs_wrr_done_svc, + .update_service = ip_vs_wrr_update_svc, + .schedule = ip_vs_wrr_schedule, +}; + +static int __init ip_vs_wrr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; +} + +static void __exit ip_vs_wrr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); +} + +module_init(ip_vs_wrr_init); +module_exit(ip_vs_wrr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c new file mode 100644 index 00000000000..02ddc2b3ce2 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -0,0 +1,1004 @@ +/* + * ip_vs_xmit.c: various packet transmitters for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include /* for tcphdr */ +#include +#include /* for csum_tcpudp_magic */ +#include +#include /* for icmp_send */ +#include /* for ip_route_output */ +#include +#include +#include +#include +#include + +#include + + +/* + * Destination cache to speed up outgoing route lookup + */ +static inline void +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = dst; + dest->dst_rtos = rtos; + dst_release(old_dst); +} + +static inline struct dst_entry * +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) +{ + struct dst_entry *dst = dest->dst_cache; + + if (!dst) + return NULL; + if ((dst->obsolete + || (dest->af == AF_INET && rtos != dest->dst_rtos)) && + dst->ops->check(dst, cookie) == NULL) { + dest->dst_cache = NULL; + dst_release(dst); + return NULL; + } + dst_hold(dst); + return dst; +} + +static struct rtable * +__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) +{ + struct rtable *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + if (!(rt = (struct rtable *) + __ip_vs_dst_check(dest, rtos, 0))) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = dest->addr.ip, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&init_net, &rt, &fl)) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip_route_output error, " + "dest: %u.%u.%u.%u\n", + NIPQUAD(dest->addr.ip)); + return NULL; + } + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", + NIPQUAD(dest->addr.ip), + atomic_read(&rt->u.dst.__refcnt), rtos); + } + spin_unlock(&dest->dst_lock); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = cp->daddr.ip, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&init_net, &rt, &fl)) { + IP_VS_DBG_RL("ip_route_output error, dest: " + "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip)); + return NULL; + } + } + + return rt; +} + +#ifdef CONFIG_IP_VS_IPV6 +static struct rt6_info * +__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); + if (!rt) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = dest->addr.in6, + .saddr = { + .s6_addr32 = + { 0, 0, 0, 0 }, + }, + }, + }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, + NULL, &fl); + if (!rt) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip6_route_output error, " + "dest: " NIP6_FMT "\n", + NIP6(dest->addr.in6)); + return NULL; + } + __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n", + NIP6(dest->addr.in6), + atomic_read(&rt->u.dst.__refcnt)); + } + spin_unlock(&dest->dst_lock); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = cp->daddr.in6, + .saddr = { + .s6_addr32 = { 0, 0, 0, 0 }, + }, + }, + }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (!rt) { + IP_VS_DBG_RL("ip6_route_output error, dest: " + NIP6_FMT "\n", NIP6(cp->daddr.in6)); + return NULL; + } + } + + return rt; +} +#endif + + +/* + * Release dest->dst_cache before a dest is removed + */ +void +ip_vs_dst_reset(struct ip_vs_dest *dest) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = NULL; + dst_release(old_dst); +} + +#define IP_VS_XMIT(pf, skb, rt) \ +do { \ + (skb)->ipvs_property = 1; \ + skb_forward_csum(skb); \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ + (rt)->u.dst.dev, dst_output); \ +} while (0) + + +/* + * NULL transmitter (do nothing except return NF_ACCEPT) + */ +int +ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + /* we do not touch skb and do not need pskb ptr */ + return NF_ACCEPT; +} + + +/* + * Bypass transmitter + * Let packets bypass the destination when the destination is not + * available, it may be only used in transparent cache cluster. + */ +int +ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = ip_hdr(skb); + u8 tos = iph->tos; + int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = 0, + .tos = RT_TOS(tos), } }, + }; + + EnterFunction(10); + + if (ip_route_output_key(&init_net, &rt, &fl)) { + IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " + "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(ip_hdr(skb)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ipv6hdr *iph = ipv6_hdr(skb); + int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = iph->daddr, + .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, + }; + + EnterFunction(10); + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (!rt) { + IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, " + "dest: " NIP6_FMT "\n", NIP6(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->u.dst); + return NF_STOLEN; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + +/* + * NAT transmitter (only for outside-to-inside nat forwarding) + * Not used for related ICMP + */ +int +ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + struct iphdr *iph = ip_hdr(skb); + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct iphdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto tx_error; + ip_hdr(skb)->daddr = cp->daddr.ip; + ip_send_check(ip_hdr(skb)); + + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; + tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, sizeof(struct ipv6hdr), + sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "ip_vs_nat_xmit_v6(): frag needed for"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto tx_error; + ipv6_hdr(skb)->daddr = cp->daddr.in6; + + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; +tx_error_put: + dst_release(&rt->u.dst); + goto tx_error; +} +#endif + + +/* + * IP Tunneling transmitter + * + * This function encapsulates the packet in a new IP packet, its + * destination will be set to cp->daddr. Most code of this function + * is taken from ipip.c. + * + * It is used in VS/TUN cluster. The load balancer selects a real + * server from a cluster based on a scheduling algorithm, + * encapsulates the request packet and forwards it to the selected + * server. For example, all real servers are configured with + * "ifconfig tunl0 up". When the server receives + * the encapsulated packet, it will decapsulate the packet, processe + * the request and return the response packets directly to the client + * without passing the load balancer. This can greatly increase the + * scalability of virtual server. + * + * Used for ANY protocol + */ +int +ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *old_iph = ip_hdr(skb); + u8 tos = old_iph->tos; + __be16 df = old_iph->frag_off; + sk_buff_data_t old_transport_header = skb->transport_header; + struct iphdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != htons(ETH_P_IP)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " + "ETH_P_IP: %d, skb protocol: %d\n", + htons(ETH_P_IP), skb->protocol); + goto tx_error; + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + if (mtu < 68) { + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + df |= (old_iph->frag_off & htons(IP_DF)); + + if ((old_iph->frag_off & htons(IP_DF)) + && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = ip_hdr(skb); + } + + skb->transport_header = old_transport_header; + + /* fix old IP header checksum */ + ip_send_check(old_iph); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->ttl = old_iph->ttl; + ip_select_ident(iph, &rt->u.dst, NULL); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ip_local_out(skb); + + LeaveFunction(10); + + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct ipv6hdr *old_iph = ipv6_hdr(skb); + sk_buff_data_t old_transport_header = skb->transport_header; + struct ipv6hdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != htons(ETH_P_IPV6)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, " + "ETH_P_IPV6: %d, skb protocol: %d\n", + htons(ETH_P_IPV6), skb->protocol); + goto tx_error; + } + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr); + /* TODO IPv6: do we need this check in IPv6? */ + if (mtu < 1280) { + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n"); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + dst_release(&rt->u.dst); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n"); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = ipv6_hdr(skb); + } + + skb->transport_header = old_transport_header; + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + iph = ipv6_hdr(skb); + iph->version = 6; + iph->nexthdr = IPPROTO_IPV6; + iph->payload_len = old_iph->payload_len + sizeof(old_iph); + iph->priority = old_iph->priority; + memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); + iph->daddr = rt->rt6i_dst.addr; + iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ + iph->hop_limit = old_iph->hop_limit; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ip6_local_out(skb); + + LeaveFunction(10); + + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + + +/* + * Direct Routing transmitter + * Used for ANY protocol + */ +int +ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = ip_hdr(skb); + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(ip_hdr(skb)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->u.dst); + return NF_STOLEN; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + + +/* + * ICMP packet transmitter + * called by the ip_vs_in_icmp + */ +int +ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + int rc; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop the old route when skb is not shared */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + ip_vs_nat_icmp(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + rc = NF_STOLEN; + goto out; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; + out: + LeaveFunction(10); + return rc; + tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + int rc; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop the old route when skb is not shared */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + ip_vs_nat_icmp_v6(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + rc = NF_STOLEN; + goto out; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; +out: + LeaveFunction(10); + return rc; +tx_error_put: + dst_release(&rt->u.dst); + goto tx_error; +} +#endif -- cgit v1.2.3-70-g09d2 From 9a1f27c48065ce713eb47f2fd475b717e63ef239 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 7 Oct 2008 11:41:57 -0700 Subject: inet_hashtables: Add inet_lookup_skb helpers To be able to use the cached socket reference in the skb during input processing we add a new set of lookup functions that receive the skb on their argument list. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: KOVACS Krisztian Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 11 +++++++++++ include/net/inet_hashtables.h | 14 ++++++++++++++ net/dccp/ipv4.c | 5 ++--- net/dccp/ipv6.c | 6 ++---- net/ipv4/tcp_ipv4.c | 3 +-- net/ipv6/tcp_ipv6.c | 6 +----- 6 files changed, 31 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index e48989f04c2..995efbb031a 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -91,6 +91,17 @@ static inline struct sock *__inet6_lookup(struct net *net, return inet6_lookup_listener(net, hashinfo, daddr, hnum, dif); } +static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, + struct sk_buff *skb, + const __be16 sport, + const __be16 dport) +{ + return __inet6_lookup(dev_net(skb->dst->dev), hashinfo, + &ipv6_hdr(skb)->saddr, sport, + &ipv6_hdr(skb)->daddr, ntohs(dport), + inet6_iif(skb)); +} + extern struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index bb619d80f2e..3522bbcd546 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -16,6 +16,7 @@ #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include #include @@ -371,6 +373,18 @@ static inline struct sock *inet_lookup(struct net *net, return sk; } +static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, + struct sk_buff *skb, + const __be16 sport, + const __be16 dport) +{ + const struct iphdr *iph = ip_hdr(skb); + + return __inet_lookup(dev_net(skb->dst->dev), hashinfo, + iph->saddr, sport, + iph->daddr, dport, inet_iif(skb)); +} + extern int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u32 port_offset, int (*check_established)(struct inet_timewait_death_row *, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 882c5c4de69..e3dfddab21c 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -811,9 +811,8 @@ static int dccp_v4_rcv(struct sk_buff *skb) /* Step 2: * Look up flow ID in table and get corresponding socket */ - sk = __inet_lookup(dev_net(skb->dst->dev), &dccp_hashinfo, - iph->saddr, dh->dccph_sport, - iph->daddr, dh->dccph_dport, inet_iif(skb)); + sk = __inet_lookup_skb(&dccp_hashinfo, skb, + dh->dccph_sport, dh->dccph_dport); /* * Step 2: * If no socket ... diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 5e1ee0da2c4..caa7f346962 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -805,10 +805,8 @@ static int dccp_v6_rcv(struct sk_buff *skb) /* Step 2: * Look up flow ID in table and get corresponding socket */ - sk = __inet6_lookup(dev_net(skb->dst->dev), &dccp_hashinfo, - &ipv6_hdr(skb)->saddr, dh->dccph_sport, - &ipv6_hdr(skb)->daddr, ntohs(dh->dccph_dport), - inet6_iif(skb)); + sk = __inet6_lookup_skb(&dccp_hashinfo, skb, + dh->dccph_sport, dh->dccph_dport); /* * Step 2: * If no socket ... diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8b24bd833cb..24ffc5e1d3d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1577,8 +1577,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr, - th->source, iph->daddr, th->dest, inet_iif(skb)); + sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index df16b68644e..6268d266c03 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1681,11 +1681,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(ipv6_hdr(skb)); TCP_SKB_CB(skb)->sacked = 0; - sk = __inet6_lookup(net, &tcp_hashinfo, - &ipv6_hdr(skb)->saddr, th->source, - &ipv6_hdr(skb)->daddr, ntohs(th->dest), - inet6_iif(skb)); - + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; -- cgit v1.2.3-70-g09d2 From 607c4aaf03041c8bd81555a0218050c0f895088e Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Tue, 7 Oct 2008 12:38:32 -0700 Subject: inet: Add udplib_lookup_skb() helpers To be able to use the cached socket reference in the skb during input processing we add a new set of lookup functions that receive the skb on their argument list. Signed-off-by: KOVACS Krisztian Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/udp.c | 14 ++++++++++++-- net/ipv6/udp.c | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c83d0ef469c..c7a90b546b2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -302,6 +302,17 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, return result; } +static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport, + struct hlist_head udptable[]) +{ + const struct iphdr *iph = ip_hdr(skb); + + return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + udptable); +} + struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { @@ -1208,8 +1219,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, udptable); - sk = __udp4_lib_lookup(net, saddr, uh->source, daddr, - uh->dest, inet_iif(skb), udptable); + sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { int ret = udp_queue_rcv_skb(sk, skb); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index a6aecf76a71..ce26c41d157 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -107,6 +107,17 @@ static struct sock *__udp6_lib_lookup(struct net *net, return result; } +static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport, + struct hlist_head udptable[]) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + + return __udp6_lib_lookup(dev_net(skb->dst->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), + udptable); +} + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -488,8 +499,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], * check socket cache ... must talk to Alan about his plans * for sock caches... i'll skip this for now. */ - sk = __udp6_lib_lookup(net, saddr, uh->source, - daddr, uh->dest, inet6_iif(skb), udptable); + sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk == NULL) { if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) -- cgit v1.2.3-70-g09d2 From 23542618deb77cfed312842fe8c41ed19fb16470 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Tue, 7 Oct 2008 12:41:01 -0700 Subject: inet: Don't lookup the socket if there's a socket attached to the skb Use the socket cached in the skb if it's present. Signed-off-by: KOVACS Krisztian Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 12 ++++++++---- include/net/inet_hashtables.h | 10 +++++++--- include/net/sock.h | 12 ++++++++++++ net/ipv4/udp.c | 10 +++++++--- net/ipv6/udp.c | 10 +++++++--- 5 files changed, 41 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 995efbb031a..f74665d7bea 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -96,10 +96,14 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, const __be16 sport, const __be16 dport) { - return __inet6_lookup(dev_net(skb->dst->dev), hashinfo, - &ipv6_hdr(skb)->saddr, sport, - &ipv6_hdr(skb)->daddr, ntohs(dport), - inet6_iif(skb)); + struct sock *sk; + + if (unlikely(sk = skb_steal_sock(skb))) + return sk; + else return __inet6_lookup(dev_net(skb->dst->dev), hashinfo, + &ipv6_hdr(skb)->saddr, sport, + &ipv6_hdr(skb)->daddr, ntohs(dport), + inet6_iif(skb)); } extern struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3522bbcd546..5cc182f9eca 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -378,11 +378,15 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const __be16 sport, const __be16 dport) { + struct sock *sk; const struct iphdr *iph = ip_hdr(skb); - return __inet_lookup(dev_net(skb->dst->dev), hashinfo, - iph->saddr, sport, - iph->daddr, dport, inet_iif(skb)); + if (unlikely(sk = skb_steal_sock(skb))) + return sk; + else + return __inet_lookup(dev_net(skb->dst->dev), hashinfo, + iph->saddr, sport, + iph->daddr, dport, inet_iif(skb)); } extern int __inet_hash_connect(struct inet_timewait_death_row *death_row, diff --git a/include/net/sock.h b/include/net/sock.h index 75a312d3888..18f96708f3a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1324,6 +1324,18 @@ static inline void sk_change_net(struct sock *sk, struct net *net) sock_net_set(sk, hold_net(net)); } +static inline struct sock *skb_steal_sock(struct sk_buff *skb) +{ + if (unlikely(skb->sk)) { + struct sock *sk = skb->sk; + + skb->destructor = NULL; + skb->sk = NULL; + return sk; + } + return NULL; +} + extern void sock_enable_timestamp(struct sock *sk); extern int sock_get_timestamp(struct sock *, struct timeval __user *); extern int sock_get_timestampns(struct sock *, struct timespec __user *); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c7a90b546b2..822c9deac83 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -306,11 +306,15 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct hlist_head udptable[]) { + struct sock *sk; const struct iphdr *iph = ip_hdr(skb); - return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport, - iph->daddr, dport, inet_iif(skb), - udptable); + if (unlikely(sk = skb_steal_sock(skb))) + return sk; + else + return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + udptable); } struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ce26c41d157..e51da8c092f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -111,11 +111,15 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct hlist_head udptable[]) { + struct sock *sk; struct ipv6hdr *iph = ipv6_hdr(skb); - return __udp6_lib_lookup(dev_net(skb->dst->dev), &iph->saddr, sport, - &iph->daddr, dport, inet6_iif(skb), - udptable); + if (unlikely(sk = skb_steal_sock(skb))) + return sk; + else + return __udp6_lib_lookup(dev_net(skb->dst->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), + udptable); } /* -- cgit v1.2.3-70-g09d2 From c57943a1c96214ee68f3890bb6772841ffbfd606 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Oct 2008 14:18:42 -0700 Subject: net: wrap sk->sk_backlog_rcv() Wrap calling sk->sk_backlog_rcv() in a function. This will allow extending the generic sk_backlog_rcv behaviour. Signed-off-by: Peter Zijlstra Signed-off-by: David S. Miller --- include/net/sock.h | 5 +++++ include/net/tcp.h | 2 +- net/core/sock.c | 4 ++-- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_timer.c | 2 +- 5 files changed, 10 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/sock.h b/include/net/sock.h index 18f96708f3a..ada50c04d09 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -482,6 +482,11 @@ static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb) skb->next = NULL; } +static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + return sk->sk_backlog_rcv(sk, skb); +} + #define sk_wait_event(__sk, __timeo, __condition) \ ({ int __rc; \ release_sock(__sk); \ diff --git a/include/net/tcp.h b/include/net/tcp.h index f6cc3414315..438014d5761 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -896,7 +896,7 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb) BUG_ON(sock_owned_by_user(sk)); while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) { - sk->sk_backlog_rcv(sk, skb1); + sk_backlog_rcv(sk, skb1); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED); } diff --git a/net/core/sock.c b/net/core/sock.c index 2d358dd8a03..5e2a3132a8c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -327,7 +327,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) */ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); - rc = sk->sk_backlog_rcv(sk, skb); + rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); } else @@ -1374,7 +1374,7 @@ static void __release_sock(struct sock *sk) struct sk_buff *next = skb->next; skb->next = NULL; - sk->sk_backlog_rcv(sk, skb); + sk_backlog_rcv(sk, skb); /* * We are in process context here with softirqs diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7d81a1ee550..7d3fe571d15 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1161,7 +1161,7 @@ static void tcp_prequeue_process(struct sock *sk) * necessary */ local_bh_disable(); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk->sk_backlog_rcv(sk, skb); + sk_backlog_rcv(sk, skb); local_bh_enable(); /* Clear memory counter. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5ab6ba19c3c..6b6dff1164b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -201,7 +201,7 @@ static void tcp_delack_timer(unsigned long data) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk->sk_backlog_rcv(sk, skb); + sk_backlog_rcv(sk, skb); tp->ucopy.memory = 0; } -- cgit v1.2.3-70-g09d2 From 33f5f57eeb0c6386fdd85f9c690dc8d700ba7928 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 7 Oct 2008 14:43:06 -0700 Subject: tcp: kill pointless urg_mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It all started from me noticing that this urgent check in tcp_clean_rtx_queue is unnecessarily inside the loop. Then I took a longer look to it and found out that the users of urg_mode can trivially do without, well almost, there was one gotcha. Bonus: those funny people who use urg with >= 2^31 write_seq - snd_una could now rejoice too (that's the only purpose for the between being there, otherwise a simple compare would have done the thing). Not that I assume that the rest of the tcp code happily lives with such mind-boggling numbers :-). Alas, it turned out to be impossible to set wmem to such numbers anyway, yes I really tried a big sendfile after setting some wmem but nothing happened :-). ...Tcp_wmem is int and so is sk_sndbuf... So I hacked a bit variable to long and found out that it seems to work... :-) Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/tcp.h | 9 ++++----- net/ipv4/tcp.c | 4 +--- net/ipv4/tcp_input.c | 11 ++++++----- net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 18 ++++++++++++------ 5 files changed, 24 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 76729062829..fe77e1499ab 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -312,8 +312,11 @@ struct tcp_sock { u32 retrans_out; /* Retransmitted packets out */ u16 urg_data; /* Saved octet of OOB data and control flags */ - u8 urg_mode; /* In urgent mode */ u8 ecn_flags; /* ECN status bits. */ + u8 reordering; /* Packet reordering metric. */ + u32 snd_up; /* Urgent pointer */ + + u8 keepalive_probes; /* num of allowed keep alive probes */ /* * Options received (usually on last packet, some only on SYN packets). */ @@ -361,8 +364,6 @@ struct tcp_sock { u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */ - u8 reordering; /* Packet reordering metric. */ - u8 keepalive_probes; /* num of allowed keep alive probes */ u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ @@ -374,8 +375,6 @@ struct tcp_sock { u32 total_retrans; /* Total retransmits for entire connection */ u32 urg_seq; /* Seq of received urgent pointer */ - u32 snd_up; /* Urgent pointer */ - unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7d3fe571d15..eccb7165a80 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -497,10 +497,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, struct sk_buff *skb) { - if (flags & MSG_OOB) { - tp->urg_mode = 1; + if (flags & MSG_OOB) tp->snd_up = tp->write_seq; - } } static inline void tcp_push(struct sock *sk, int flags, int mss_now, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b76bce769d..c19f429dc44 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2836,7 +2836,8 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) +static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, + u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2903,9 +2904,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; - if (unlikely(tp->urg_mode && !before(end_seq, tp->snd_up))) - tp->urg_mode = 0; - tp->packets_out -= acked_pcount; pkts_acked += acked_pcount; @@ -2935,6 +2933,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) tp->lost_skb_hint = NULL; } + if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) + tp->snd_up = tp->snd_una; + if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; @@ -3311,7 +3312,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fackets); + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f976fc57892..779f2e9d068 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -395,6 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->pred_flags = 0; newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; + newtp->snd_up = treq->snt_isn + 1; tcp_prequeue_init(newtp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 493553c71d3..990a5849323 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -345,6 +345,11 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->end_seq = seq; } +static inline int tcp_urg_mode(const struct tcp_sock *tp) +{ + return tp->snd_una != tp->snd_up; +} + #define OPTION_SACK_ADVERTISE (1 << 0) #define OPTION_TS (1 << 1) #define OPTION_MD5 (1 << 2) @@ -646,7 +651,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->check = 0; th->urg_ptr = 0; - if (unlikely(tp->urg_mode && + /* The urg_mode check is necessary during a below snd_una win probe */ + if (unlikely(tcp_urg_mode(tp) && between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; @@ -1012,7 +1018,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. * - * LARGESEND note: !urg_mode is overkill, only frames up to snd_up + * LARGESEND note: !tcp_urg_mode is overkill, only frames up to snd_up * cannot be large. However, taking into account rare use of URG, this * is not a big flaw. */ @@ -1029,7 +1035,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) mss_now = tp->mss_cache; - if (large_allowed && sk_can_gso(sk) && !tp->urg_mode) + if (large_allowed && sk_can_gso(sk) && !tcp_urg_mode(tp)) doing_tso = 1; if (dst) { @@ -1193,7 +1199,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, /* Don't use the nagle rule for urgent data (or for the final FIN). * Nagle can be ignored during F-RTO too (see RFC4138). */ - if (tp->urg_mode || (tp->frto_counter == 2) || + if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; @@ -2358,6 +2364,7 @@ static void tcp_connect_init(struct sock *sk) tcp_init_wl(tp, tp->write_seq, 0); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; + tp->snd_up = tp->write_seq; tp->rcv_nxt = 0; tp->rcv_wup = 0; tp->copied_seq = 0; @@ -2567,8 +2574,7 @@ int tcp_write_wakeup(struct sock *sk) tcp_event_new_data_sent(sk, skb); return err; } else { - if (tp->urg_mode && - between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) + if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) tcp_xmit_probe_skb(sk, 1); return tcp_xmit_probe_skb(sk, 0); } -- cgit v1.2.3-70-g09d2 From 4a7e56098f06d505f23f8d7c8d6762221065922a Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 7 Oct 2008 14:43:31 -0700 Subject: tcp: cleanup messy initializer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I'm quite sure that if I give this function in its old format for you to inspect, you start to wonder what is the type of demanded or if it's a global variable. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c19f429dc44..63da39372d4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4461,8 +4461,8 @@ static void tcp_new_space(struct sock *sk) if (tcp_should_expand_sndbuf(sk)) { int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + - MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), - demanded = max_t(unsigned int, tp->snd_cwnd, + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); + int demanded = max_t(unsigned int, tp->snd_cwnd, tp->reordering + 1); sndmem *= 2 * demanded; if (sndmem > sk->sk_sndbuf) -- cgit v1.2.3-70-g09d2 From 0c7ed677fb7013c8028045d409a48ac42151187a Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Tue, 7 Oct 2008 14:49:36 -0700 Subject: netns: make udpv6 mib per/namespace Signed-off-by: Denis V. Lunev Signed-off-by: David S. Miller --- include/net/netns/mib.h | 1 + include/net/udp.h | 12 ++++++------ net/ipv4/udp.c | 3 --- net/ipv6/af_inet6.c | 9 ++++----- net/ipv6/proc.c | 3 ++- 5 files changed, 13 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h index ffcef5de3d1..ba622b24b84 100644 --- a/include/net/netns/mib.h +++ b/include/net/netns/mib.h @@ -14,6 +14,7 @@ struct netns_mib { #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct proc_dir_entry *proc_net_devsnmp6; + DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6); #endif }; diff --git a/include/net/udp.h b/include/net/udp.h index d38f6f2419f..72f28c3ddaa 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -152,8 +152,6 @@ extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); -DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6); - /* UDP-Lite does not have a standardized MIB yet, so we inherit from UDP */ DECLARE_SNMP_STAT(struct udp_mib, udplite_stats_in6); @@ -167,12 +165,14 @@ DECLARE_SNMP_STAT(struct udp_mib, udplite_stats_in6); if (is_udplite) SNMP_INC_STATS_BH((net)->mib.udplite_statistics, field); \ else SNMP_INC_STATS_BH((net)->mib.udp_statistics, field); } while(0) -#define UDP6_INC_STATS_BH(net, field, is_udplite) do { (void)net; \ +#define UDP6_INC_STATS_BH(net, field, is_udplite) do { \ if (is_udplite) SNMP_INC_STATS_BH(udplite_stats_in6, field); \ - else SNMP_INC_STATS_BH(udp_stats_in6, field); } while(0) -#define UDP6_INC_STATS_USER(net, field, is_udplite) do { (void)net; \ + else SNMP_INC_STATS_BH((net)->mib.udp_stats_in6, field); \ +} while(0) +#define UDP6_INC_STATS_USER(net, field, is_udplite) do { \ if (is_udplite) SNMP_INC_STATS_USER(udplite_stats_in6, field); \ - else SNMP_INC_STATS_USER(udp_stats_in6, field); } while(0) + else SNMP_INC_STATS_USER((net)->mib.udp_stats_in6, field); \ +} while(0) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) #define UDPX_INC_STATS_BH(sk, field) \ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 822c9deac83..85f8e8e10b1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -108,9 +108,6 @@ * Snmp MIB for the UDP layer */ -DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly; -EXPORT_SYMBOL(udp_stats_in6); - struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e8f82eca160..e09139122ef 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -806,16 +806,12 @@ static int __init init_ipv6_mibs(void) if (snmp_mib_init((void **)icmpv6msg_statistics, sizeof(struct icmpv6msg_mib)) < 0) goto err_icmpmsg_mib; - if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib)) < 0) - goto err_udp_mib; if (snmp_mib_init((void **)udplite_stats_in6, sizeof (struct udp_mib)) < 0) goto err_udplite_mib; return 0; err_udplite_mib: - snmp_mib_free((void **)udp_stats_in6); -err_udp_mib: snmp_mib_free((void **)icmpv6msg_statistics); err_icmpmsg_mib: snmp_mib_free((void **)icmpv6_statistics); @@ -831,17 +827,20 @@ static void cleanup_ipv6_mibs(void) snmp_mib_free((void **)ipv6_statistics); snmp_mib_free((void **)icmpv6_statistics); snmp_mib_free((void **)icmpv6msg_statistics); - snmp_mib_free((void **)udp_stats_in6); snmp_mib_free((void **)udplite_stats_in6); } static int __net_init ipv6_init_mibs(struct net *net) { + if (snmp_mib_init((void **)net->mib.udp_stats_in6, + sizeof (struct udp_mib)) < 0) + return -ENOMEM; return 0; } static void __net_exit ipv6_cleanup_mibs(struct net *net) { + snmp_mib_free((void **)net->mib.udp_stats_in6); } static int inet6_net_init(struct net *net) diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 23e567fb1d3..3eaf20bf998 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -180,7 +180,8 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list); snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list); snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics); - snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list); + snmp6_seq_show_item(seq, (void **)net->mib.udp_stats_in6, + snmp6_udp6_list); snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list); return 0; } -- cgit v1.2.3-70-g09d2 From 53240c208776d557dba9d7afedbcdbf512774c16 Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Tue, 7 Oct 2008 15:31:19 -0700 Subject: tcp: Fix possible double-ack w/ user dma From: Ali Saidi When TCP receive copy offload is enabled it's possible that tcp_rcv_established() will cause two acks to be sent for a single packet. In the case that a tcp_dma_early_copy() is successful, copied_early is set to true which causes tcp_cleanup_rbuf() to be called early which can send an ack. Further along in tcp_rcv_established(), __tcp_ack_snd_check() is called and will schedule a delayed ACK. If no packets are processed before the delayed ack timer expires the packet will be acked twice. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 67ccce2a96b..7abc6b80d47 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4879,7 +4879,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto no_ack; } - __tcp_ack_snd_check(sk, 0); + if (!copied_early || tp->rcv_nxt != tp->rcv_wup) + __tcp_ack_snd_check(sk, 0); no_ack: #ifdef CONFIG_NET_DMA if (copied_early) -- cgit v1.2.3-70-g09d2 From b8bae41ed6a53cce56c50811a91cd963e3187d1c Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Tue, 7 Oct 2008 15:34:37 -0700 Subject: ipv4: add mc_count to in_device. This patch add mc_count to struct in_device and updates increment/decrement/initilaize of this field in IPv4 and in IPv6. - Also printing the vfs /proc entry (/proc/net/igmp) is adjusted to use the new mc_count. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 1 + net/ipv4/igmp.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index c6f51ad52d5..06fcdb45106 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -25,6 +25,7 @@ struct in_device struct in_ifaddr *ifa_list; /* IP ifaddr chain */ rwlock_t mc_list_lock; struct ip_mc_list *mc_list; /* IP multicast filter chain */ + int mc_count; /* Number of installed mcasts */ spinlock_t mc_tomb_lock; struct ip_mc_list *mc_tomb; unsigned long mr_v1_seen; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f70fac61259..7f9e337e390 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1234,6 +1234,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) write_lock_bh(&in_dev->mc_list_lock); im->next=in_dev->mc_list; in_dev->mc_list=im; + in_dev->mc_count++; write_unlock_bh(&in_dev->mc_list_lock); #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im->multiaddr); @@ -1282,6 +1283,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) if (--i->users == 0) { write_lock_bh(&in_dev->mc_list_lock); *ip = i->next; + in_dev->mc_count--; write_unlock_bh(&in_dev->mc_list_lock); igmp_group_dropped(i); @@ -1330,6 +1332,7 @@ void ip_mc_init_dev(struct in_device *in_dev) setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire, (unsigned long)in_dev); in_dev->mr_ifc_count = 0; + in_dev->mc_count = 0; setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, (unsigned long)in_dev); in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; @@ -1369,8 +1372,8 @@ void ip_mc_destroy_dev(struct in_device *in_dev) write_lock_bh(&in_dev->mc_list_lock); while ((i = in_dev->mc_list) != NULL) { in_dev->mc_list = i->next; + in_dev->mc_count--; write_unlock_bh(&in_dev->mc_list_lock); - igmp_group_dropped(i); ip_ma_put(i); @@ -2383,7 +2386,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) if (state->in_dev->mc_list == im) { seq_printf(seq, "%d\t%-10s: %5d %7s\n", - state->dev->ifindex, state->dev->name, state->dev->mc_count, querier); + state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); } seq_printf(seq, -- cgit v1.2.3-70-g09d2 From 9d2c27e17b7574023b5adb5c6a50d7aaeb915543 Mon Sep 17 00:00:00 2001 From: Daniele Lacamera Date: Tue, 7 Oct 2008 15:58:17 -0700 Subject: tcp: Fix tcp_hybla zero congestion window growth with small rho and large cwnd. Because of rounding, in certain conditions, i.e. when in congestion avoidance state rho is smaller than 1/128 of the current cwnd, TCP Hybla congestion control starves and the cwnd is kept constant forever. This patch forces an increment by one segment after #send_cwnd calls without increments(newreno behavior). Signed-off-by: Daniele Lacamera Signed-off-by: David S. Miller --- net/ipv4/tcp_hybla.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index bfcbd148a89..c209e054a63 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -150,7 +150,11 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) ca->snd_cwnd_cents -= 128; tp->snd_cwnd_cnt = 0; } - + /* check when cwnd has not been incremented for a while */ + if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } /* clamp down slowstart cwnd to ssthresh value. */ if (is_slowstart) tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); -- cgit v1.2.3-70-g09d2 From 76108cea065cda58366d16a7eb6ca90d717a1396 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:00 +0200 Subject: netfilter: Use unsigned types for hooknum and pf vars and (try to) consistently use u_int8_t for the L3 family. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter.h | 30 ++++++++-------- include/linux/netfilter/x_tables.h | 28 +++++++-------- include/net/netfilter/nf_conntrack_core.h | 2 +- include/net/netfilter/nf_conntrack_expect.h | 2 +- include/net/netfilter/nf_conntrack_l4proto.h | 4 +-- include/net/netfilter/nf_log.h | 8 ++--- include/net/netfilter/nf_queue.h | 6 ++-- net/bridge/br_netfilter.c | 4 +-- net/bridge/netfilter/ebt_log.c | 2 +- net/bridge/netfilter/ebt_ulog.c | 2 +- net/ipv4/netfilter/ipt_LOG.c | 2 +- net/ipv4/netfilter/ipt_ULOG.c | 2 +- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 4 +-- net/ipv6/netfilter/ip6t_LOG.c | 2 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 4 +-- net/netfilter/core.c | 4 +-- net/netfilter/nf_conntrack_core.c | 6 ++-- net/netfilter/nf_conntrack_expect.c | 2 +- net/netfilter/nf_conntrack_h323_main.c | 3 +- net/netfilter/nf_conntrack_proto_dccp.c | 4 +-- net/netfilter/nf_conntrack_proto_generic.c | 2 +- net/netfilter/nf_conntrack_proto_gre.c | 2 +- net/netfilter/nf_conntrack_proto_sctp.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 6 ++-- net/netfilter/nf_conntrack_proto_udp.c | 4 +-- net/netfilter/nf_conntrack_proto_udplite.c | 4 +-- net/netfilter/nf_internals.h | 4 +-- net/netfilter/nf_log.c | 6 ++-- net/netfilter/nf_queue.c | 10 +++--- net/netfilter/nf_sockopt.c | 15 ++++---- net/netfilter/nfnetlink_log.c | 4 +-- net/netfilter/x_tables.c | 47 ++++++++++++++------------ net/netfilter/xt_connlimit.c | 2 +- net/netfilter/xt_conntrack.c | 8 ++--- net/netfilter/xt_hashlimit.c | 11 +++--- 35 files changed, 127 insertions(+), 121 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 0c5eb7ed8b3..8c83d2e23bd 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -92,8 +92,8 @@ struct nf_hook_ops /* User fills in from here down. */ nf_hookfn *hook; struct module *owner; - int pf; - int hooknum; + u_int8_t pf; + unsigned int hooknum; /* Hooks are ordered in ascending priority. */ int priority; }; @@ -102,7 +102,7 @@ struct nf_sockopt_ops { struct list_head list; - int pf; + u_int8_t pf; /* Non-inclusive ranges: use 0/0/NULL to never get called. */ int set_optmin; @@ -140,7 +140,7 @@ extern struct ctl_path nf_net_ipv4_netfilter_sysctl_path[]; extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; -int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb, +int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int thresh); @@ -151,7 +151,7 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb, * okfn must be invoked by the caller in this case. Any other return * value indicates the packet has been consumed by the hook. */ -static inline int nf_hook_thresh(int pf, unsigned int hook, +static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, @@ -167,7 +167,7 @@ static inline int nf_hook_thresh(int pf, unsigned int hook, return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh); } -static inline int nf_hook(int pf, unsigned int hook, struct sk_buff *skb, +static inline int nf_hook(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *)) { @@ -212,14 +212,14 @@ __ret;}) NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, INT_MIN) /* Call setsockopt() */ -int nf_setsockopt(struct sock *sk, int pf, int optval, char __user *opt, +int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int len); -int nf_getsockopt(struct sock *sk, int pf, int optval, char __user *opt, +int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); -int compat_nf_setsockopt(struct sock *sk, int pf, int optval, +int compat_nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int len); -int compat_nf_getsockopt(struct sock *sk, int pf, int optval, +int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); /* Call this before modifying an existing packet: ensures it is @@ -292,7 +292,7 @@ extern void nf_unregister_afinfo(const struct nf_afinfo *afinfo); extern void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *); static inline void -nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family) +nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { #ifdef CONFIG_NF_NAT_NEEDED void (*decodefn)(struct sk_buff *, struct flowi *); @@ -315,7 +315,7 @@ extern struct proc_dir_entry *proc_net_netfilter; #else /* !CONFIG_NETFILTER */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) #define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) (okfn)(skb) -static inline int nf_hook_thresh(int pf, unsigned int hook, +static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, @@ -324,7 +324,7 @@ static inline int nf_hook_thresh(int pf, unsigned int hook, { return okfn(skb); } -static inline int nf_hook(int pf, unsigned int hook, struct sk_buff *skb, +static inline int nf_hook(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *)) { @@ -332,7 +332,9 @@ static inline int nf_hook(int pf, unsigned int hook, struct sk_buff *skb, } struct flowi; static inline void -nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family) {} +nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) +{ +} #endif /*CONFIG_NETFILTER*/ #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 2326296b6f2..6989b22716e 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -292,7 +292,7 @@ struct xt_table /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; - int af; /* address/protocol family */ + u_int8_t af; /* address/protocol family */ }; #include @@ -346,19 +346,19 @@ extern struct xt_table_info *xt_replace_table(struct xt_table *table, struct xt_table_info *newinfo, int *error); -extern struct xt_match *xt_find_match(int af, const char *name, u8 revision); -extern struct xt_target *xt_find_target(int af, const char *name, u8 revision); -extern struct xt_target *xt_request_find_target(int af, const char *name, +extern struct xt_match *xt_find_match(u8 af, const char *name, u8 revision); +extern struct xt_target *xt_find_target(u8 af, const char *name, u8 revision); +extern struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision); -extern int xt_find_revision(int af, const char *name, u8 revision, int target, - int *err); +extern int xt_find_revision(u8 af, const char *name, u8 revision, + int target, int *err); -extern struct xt_table *xt_find_table_lock(struct net *net, int af, +extern struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name); extern void xt_table_unlock(struct xt_table *t); -extern int xt_proto_init(struct net *net, int af); -extern void xt_proto_fini(struct net *net, int af); +extern int xt_proto_init(struct net *net, u_int8_t af); +extern void xt_proto_fini(struct net *net, u_int8_t af); extern struct xt_table_info *xt_alloc_table_info(unsigned int size); extern void xt_free_table_info(struct xt_table_info *info); @@ -423,12 +423,12 @@ struct compat_xt_counters_info #define COMPAT_XT_ALIGN(s) (((s) + (__alignof__(struct compat_xt_counters)-1)) \ & ~(__alignof__(struct compat_xt_counters)-1)) -extern void xt_compat_lock(int af); -extern void xt_compat_unlock(int af); +extern void xt_compat_lock(u_int8_t af); +extern void xt_compat_unlock(u_int8_t af); -extern int xt_compat_add_offset(int af, unsigned int offset, short delta); -extern void xt_compat_flush_offsets(int af); -extern short xt_compat_calc_jump(int af, unsigned int offset); +extern int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta); +extern void xt_compat_flush_offsets(u_int8_t af); +extern short xt_compat_calc_jump(u_int8_t af, unsigned int offset); extern int xt_compat_match_offset(const struct xt_match *match); extern int xt_compat_match_from_user(struct xt_entry_match *m, diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index a8177121093..05760d6a706 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -20,7 +20,7 @@ /* This header is used to share core functionality between the standalone connection tracking module, and the compatibility layer's use of connection tracking. */ -extern unsigned int nf_conntrack_in(int pf, +extern unsigned int nf_conntrack_in(u_int8_t pf, unsigned int hooknum, struct sk_buff *skb); diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index dfdf4b45947..4c4d894cb9b 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -86,7 +86,7 @@ void nf_ct_unexpect_related(struct nf_conntrack_expect *exp); /* Allocate space for an expectation: this is mandatory before calling nf_ct_expect_related. You will have to call put afterwards. */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me); -void nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, int, +void nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, u_int8_t, const union nf_inet_addr *, const union nf_inet_addr *, u_int8_t, const __be16 *, const __be16 *); diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 723df9d1cc3..d4376e97bae 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -39,7 +39,7 @@ struct nf_conntrack_l4proto const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum); /* Called when a new connection for this protocol found; @@ -52,7 +52,7 @@ struct nf_conntrack_l4proto int (*error)(struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, - int pf, unsigned int hooknum); + u_int8_t pf, unsigned int hooknum); /* Print out the per-protocol part of the tuple. Return like seq_* */ int (*print_tuple)(struct seq_file *s, diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 8c6b5ae4553..7182c06974f 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -28,7 +28,7 @@ struct nf_loginfo { } u; }; -typedef void nf_logfn(unsigned int pf, +typedef void nf_logfn(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -43,12 +43,12 @@ struct nf_logger { }; /* Function to register/unregister log function. */ -int nf_log_register(int pf, const struct nf_logger *logger); +int nf_log_register(u_int8_t pf, const struct nf_logger *logger); void nf_log_unregister(const struct nf_logger *logger); -void nf_log_unregister_pf(int pf); +void nf_log_unregister_pf(u_int8_t pf); /* Calls the registered backend logging function */ -void nf_log_packet(int pf, +void nf_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index d030044e923..252fd1010b7 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -8,7 +8,7 @@ struct nf_queue_entry { unsigned int id; struct nf_hook_ops *elem; - int pf; + u_int8_t pf; unsigned int hook; struct net_device *indev; struct net_device *outdev; @@ -24,9 +24,9 @@ struct nf_queue_handler { char *name; }; -extern int nf_register_queue_handler(int pf, +extern int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh); -extern int nf_unregister_queue_handler(int pf, +extern int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh); extern void nf_unregister_queue_handlers(const struct nf_queue_handler *qh); extern void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 6a9a6cd74b1..a4abed5b4c4 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -657,7 +657,7 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb, { struct nf_bridge_info *nf_bridge; struct net_device *parent; - int pf; + u_int8_t pf; if (!skb->nf_bridge) return NF_ACCEPT; @@ -791,7 +791,7 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb, { struct nf_bridge_info *nf_bridge = skb->nf_bridge; struct net_device *realoutdev = bridge_parent(skb->dev); - int pf; + u_int8_t pf; #ifdef CONFIG_NETFILTER_DEBUG /* Be very paranoid. This probably won't happen anymore, but let's diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 2f430d4ae91..3770cd8a7b3 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -84,7 +84,7 @@ print_ports(const struct sk_buff *skb, uint8_t protocol, int offset) #define myNIPQUAD(a) a[0], a[1], a[2], a[3] static void -ebt_log_packet(unsigned int pf, unsigned int hooknum, +ebt_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 2d4c9ef909f..c84bda61afb 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -223,7 +223,7 @@ alloc_failure: } /* this function is registered with the netfilter core */ -static void ebt_log_packet(unsigned int pf, unsigned int hooknum, +static void ebt_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *li, const char *prefix) diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 0af14137137..9330ba3577e 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -375,7 +375,7 @@ static struct nf_loginfo default_loginfo = { }; static void -ipt_log_packet(unsigned int pf, +ipt_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index b192756c6d0..d8241e6b077 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -292,7 +292,7 @@ ulog_tg(struct sk_buff *skb, const struct net_device *in, return XT_CONTINUE; } -static void ipt_logfn(unsigned int pf, +static void ipt_logfn(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 97791048fa9..da8edcdaef3 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -79,7 +79,7 @@ static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { /* Try to delete connection immediately after all replies: @@ -173,7 +173,7 @@ icmp_error_message(struct sk_buff *skb, /* Small and modified version of icmp_rcv */ static int icmp_error(struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) + enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { const struct icmphdr *icmph; struct icmphdr _ih; diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 3a2316974f8..0716f8afa0b 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -385,7 +385,7 @@ static struct nf_loginfo default_loginfo = { }; static void -ip6t_log_packet(unsigned int pf, +ip6t_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 14d47d83354..5756f30ebc6 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -81,7 +81,7 @@ static int icmpv6_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { /* Try to delete connection immediately after all replies: @@ -173,7 +173,7 @@ icmpv6_error_message(struct sk_buff *skb, static int icmpv6_error(struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) + enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 292fa28146f..26b8f489d7a 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -113,7 +113,7 @@ EXPORT_SYMBOL(nf_unregister_hooks); unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, - int hook, + unsigned int hook, const struct net_device *indev, const struct net_device *outdev, struct list_head **i, @@ -155,7 +155,7 @@ unsigned int nf_iterate(struct list_head *head, /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. */ -int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb, +int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9d1830da8e8..6aaf64b5ded 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -665,7 +665,7 @@ resolve_normal_ct(struct sk_buff *skb, } unsigned int -nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff *skb) +nf_conntrack_in(u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -683,7 +683,7 @@ nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff *skb) } /* rcu_read_lock()ed by nf_hook_slow */ - l3proto = __nf_ct_l3proto_find((u_int16_t)pf); + l3proto = __nf_ct_l3proto_find(pf); ret = l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, &protonum); if (ret <= 0) { @@ -693,7 +693,7 @@ nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff *skb) return -ret; } - l4proto = __nf_ct_l4proto_find((u_int16_t)pf, protonum); + l4proto = __nf_ct_l4proto_find(pf, protonum); /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index e8f0dead267..990fa12f2ee 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -241,7 +241,7 @@ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, - int family, + u_int8_t family, const union nf_inet_addr *saddr, const union nf_inet_addr *daddr, u_int8_t proto, const __be16 *src, const __be16 *dst) diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 2f83c158934..5dc0478108a 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -709,7 +709,8 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, /* If the calling party is on the same side of the forward-to party, * we don't need to track the second call */ static int callforward_do_filter(const union nf_inet_addr *src, - const union nf_inet_addr *dst, int family) + const union nf_inet_addr *dst, + u_int8_t family) { const struct nf_afinfo *afinfo; struct flowi fl1, fl2; diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index e7866dd3cde..edc30358dc1 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -461,7 +461,7 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh) static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, unsigned int hooknum) + u_int8_t pf, unsigned int hooknum) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; @@ -546,7 +546,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, } static int dccp_error(struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, int pf, + enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { struct dccp_hdr _dh, *dh; diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index e31b0e7bd0b..dbe680af85d 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -45,7 +45,7 @@ static int packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_generic_timeout); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9bd03967fea..c5a78220fa3 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -219,7 +219,7 @@ static int gre_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { /* If we've seen traffic both ways, this is a GRE connection. diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 30aa5b94a77..b5a90596d3f 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -287,7 +287,7 @@ static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { enum sctp_conntrack new_state, old_state; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 6f61261888e..539a8202025 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -486,7 +486,7 @@ static bool tcp_in_window(const struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, - int pf) + u_int8_t pf) { struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; @@ -749,7 +749,7 @@ static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] = static int tcp_error(struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { const struct tcphdr *th; @@ -804,7 +804,7 @@ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { struct nf_conntrack_tuple *tuple; diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8b21762e65d..2a965c4a0ea 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -66,7 +66,7 @@ static int udp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { /* If we've seen traffic both ways, this is some kind of UDP @@ -91,7 +91,7 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, static int udp_error(struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { unsigned int udplen = skb->len - dataoff; diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 1fa62f3c24f..4fb6c8d83a8 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -65,7 +65,7 @@ static int udplite_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { /* If we've seen traffic both ways, this is some kind of UDP @@ -91,7 +91,7 @@ static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb, static int udplite_error(struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { unsigned int udplen = skb->len - dataoff; diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 196269c1e58..bf6609978af 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -15,7 +15,7 @@ /* core.c */ extern unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, - int hook, + unsigned int hook, const struct net_device *indev, const struct net_device *outdev, struct list_head **i, @@ -25,7 +25,7 @@ extern unsigned int nf_iterate(struct list_head *head, /* nf_queue.c */ extern int nf_queue(struct sk_buff *skb, struct list_head *elem, - int pf, unsigned int hook, + u_int8_t pf, unsigned int hook, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 9fda6ee95a3..5c2f7332015 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -20,7 +20,7 @@ static DEFINE_MUTEX(nf_log_mutex); /* return EBUSY if somebody else is registered, EEXIST if the same logger * is registred, 0 on success. */ -int nf_log_register(int pf, const struct nf_logger *logger) +int nf_log_register(u_int8_t pf, const struct nf_logger *logger) { int ret; @@ -45,7 +45,7 @@ int nf_log_register(int pf, const struct nf_logger *logger) } EXPORT_SYMBOL(nf_log_register); -void nf_log_unregister_pf(int pf) +void nf_log_unregister_pf(u_int8_t pf) { if (pf >= NPROTO) return; @@ -73,7 +73,7 @@ void nf_log_unregister(const struct nf_logger *logger) } EXPORT_SYMBOL(nf_log_unregister); -void nf_log_packet(int pf, +void nf_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 582ec3efc8a..f285086f629 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -22,7 +22,7 @@ static DEFINE_MUTEX(queue_handler_mutex); /* return EBUSY when somebody else is registered, return EEXIST if the * same handler is registered, return 0 in case of success. */ -int nf_register_queue_handler(int pf, const struct nf_queue_handler *qh) +int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) { int ret; @@ -45,7 +45,7 @@ int nf_register_queue_handler(int pf, const struct nf_queue_handler *qh) EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ -int nf_unregister_queue_handler(int pf, const struct nf_queue_handler *qh) +int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) { if (pf >= NPROTO) return -EINVAL; @@ -67,7 +67,7 @@ EXPORT_SYMBOL(nf_unregister_queue_handler); void nf_unregister_queue_handlers(const struct nf_queue_handler *qh) { - int pf; + u_int8_t pf; mutex_lock(&queue_handler_mutex); for (pf = 0; pf < NPROTO; pf++) { @@ -107,7 +107,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) */ static int __nf_queue(struct sk_buff *skb, struct list_head *elem, - int pf, unsigned int hook, + u_int8_t pf, unsigned int hook, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), @@ -191,7 +191,7 @@ err: int nf_queue(struct sk_buff *skb, struct list_head *elem, - int pf, unsigned int hook, + u_int8_t pf, unsigned int hook, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index 01489681fa9..f9b46de6a3d 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -60,7 +60,7 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg) } EXPORT_SYMBOL(nf_unregister_sockopt); -static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, int pf, +static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf, int val, int get) { struct nf_sockopt_ops *ops; @@ -96,7 +96,7 @@ out: } /* Call get/setsockopt() */ -static int nf_sockopt(struct sock *sk, int pf, int val, +static int nf_sockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len, int get) { struct nf_sockopt_ops *ops; @@ -115,21 +115,22 @@ static int nf_sockopt(struct sock *sk, int pf, int val, return ret; } -int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt, +int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int len) { return nf_sockopt(sk, pf, val, opt, &len, 0); } EXPORT_SYMBOL(nf_setsockopt); -int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len) +int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, + int *len) { return nf_sockopt(sk, pf, val, opt, len, 1); } EXPORT_SYMBOL(nf_getsockopt); #ifdef CONFIG_COMPAT -static int compat_nf_sockopt(struct sock *sk, int pf, int val, +static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len, int get) { struct nf_sockopt_ops *ops; @@ -155,14 +156,14 @@ static int compat_nf_sockopt(struct sock *sk, int pf, int val, return ret; } -int compat_nf_setsockopt(struct sock *sk, int pf, +int compat_nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int len) { return compat_nf_sockopt(sk, pf, val, opt, &len, 0); } EXPORT_SYMBOL(compat_nf_setsockopt); -int compat_nf_getsockopt(struct sock *sk, int pf, +int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len) { return compat_nf_sockopt(sk, pf, val, opt, len, 1); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 9a35b57ab76..41e0105d382 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -359,7 +359,7 @@ static inline int __build_packet_message(struct nfulnl_instance *inst, const struct sk_buff *skb, unsigned int data_len, - unsigned int pf, + u_int8_t pf, unsigned int hooknum, const struct net_device *indev, const struct net_device *outdev, @@ -534,7 +534,7 @@ static struct nf_loginfo default_loginfo = { /* log handler for internal netfilter logging api */ static void -nfulnl_log_packet(unsigned int pf, +nfulnl_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 5d75cd86ebb..cf2f3e90cef 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -68,7 +68,8 @@ static const char *const xt_prefix[NPROTO] = { int xt_register_target(struct xt_target *target) { - int ret, af = target->family; + u_int8_t af = target->family; + int ret; ret = mutex_lock_interruptible(&xt[af].mutex); if (ret != 0) @@ -82,7 +83,7 @@ EXPORT_SYMBOL(xt_register_target); void xt_unregister_target(struct xt_target *target) { - int af = target->family; + u_int8_t af = target->family; mutex_lock(&xt[af].mutex); list_del(&target->list); @@ -123,7 +124,8 @@ EXPORT_SYMBOL(xt_unregister_targets); int xt_register_match(struct xt_match *match) { - int ret, af = match->family; + u_int8_t af = match->family; + int ret; ret = mutex_lock_interruptible(&xt[af].mutex); if (ret != 0) @@ -139,7 +141,7 @@ EXPORT_SYMBOL(xt_register_match); void xt_unregister_match(struct xt_match *match) { - int af = match->family; + u_int8_t af = match->family; mutex_lock(&xt[af].mutex); list_del(&match->list); @@ -185,7 +187,7 @@ EXPORT_SYMBOL(xt_unregister_matches); */ /* Find match, grabs ref. Returns ERR_PTR() on error. */ -struct xt_match *xt_find_match(int af, const char *name, u8 revision) +struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) { struct xt_match *m; int err = 0; @@ -210,7 +212,7 @@ struct xt_match *xt_find_match(int af, const char *name, u8 revision) EXPORT_SYMBOL(xt_find_match); /* Find target, grabs ref. Returns ERR_PTR() on error. */ -struct xt_target *xt_find_target(int af, const char *name, u8 revision) +struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) { struct xt_target *t; int err = 0; @@ -234,7 +236,7 @@ struct xt_target *xt_find_target(int af, const char *name, u8 revision) } EXPORT_SYMBOL(xt_find_target); -struct xt_target *xt_request_find_target(int af, const char *name, u8 revision) +struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) { struct xt_target *target; @@ -246,7 +248,7 @@ struct xt_target *xt_request_find_target(int af, const char *name, u8 revision) } EXPORT_SYMBOL_GPL(xt_request_find_target); -static int match_revfn(int af, const char *name, u8 revision, int *bestp) +static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) { const struct xt_match *m; int have_rev = 0; @@ -262,7 +264,7 @@ static int match_revfn(int af, const char *name, u8 revision, int *bestp) return have_rev; } -static int target_revfn(int af, const char *name, u8 revision, int *bestp) +static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) { const struct xt_target *t; int have_rev = 0; @@ -279,7 +281,7 @@ static int target_revfn(int af, const char *name, u8 revision, int *bestp) } /* Returns true or false (if no such extension at all) */ -int xt_find_revision(int af, const char *name, u8 revision, int target, +int xt_find_revision(u8 af, const char *name, u8 revision, int target, int *err) { int have_rev, best = -1; @@ -337,7 +339,7 @@ int xt_check_match(const struct xt_match *match, unsigned short family, EXPORT_SYMBOL_GPL(xt_check_match); #ifdef CONFIG_COMPAT -int xt_compat_add_offset(int af, unsigned int offset, short delta) +int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta) { struct compat_delta *tmp; @@ -359,7 +361,7 @@ int xt_compat_add_offset(int af, unsigned int offset, short delta) } EXPORT_SYMBOL_GPL(xt_compat_add_offset); -void xt_compat_flush_offsets(int af) +void xt_compat_flush_offsets(u_int8_t af) { struct compat_delta *tmp, *next; @@ -373,7 +375,7 @@ void xt_compat_flush_offsets(int af) } EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); -short xt_compat_calc_jump(int af, unsigned int offset) +short xt_compat_calc_jump(u_int8_t af, unsigned int offset) { struct compat_delta *tmp; short delta; @@ -590,7 +592,8 @@ void xt_free_table_info(struct xt_table_info *info) EXPORT_SYMBOL(xt_free_table_info); /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ -struct xt_table *xt_find_table_lock(struct net *net, int af, const char *name) +struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, + const char *name) { struct xt_table *t; @@ -612,13 +615,13 @@ void xt_table_unlock(struct xt_table *table) EXPORT_SYMBOL_GPL(xt_table_unlock); #ifdef CONFIG_COMPAT -void xt_compat_lock(int af) +void xt_compat_lock(u_int8_t af) { mutex_lock(&xt[af].compat_mutex); } EXPORT_SYMBOL_GPL(xt_compat_lock); -void xt_compat_unlock(int af) +void xt_compat_unlock(u_int8_t af) { mutex_unlock(&xt[af].compat_mutex); } @@ -722,13 +725,13 @@ EXPORT_SYMBOL_GPL(xt_unregister_table); #ifdef CONFIG_PROC_FS struct xt_names_priv { struct seq_net_private p; - int af; + u_int8_t af; }; static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) { struct xt_names_priv *priv = seq->private; struct net *net = seq_file_net(seq); - int af = priv->af; + u_int8_t af = priv->af; mutex_lock(&xt[af].mutex); return seq_list_start(&net->xt.tables[af], *pos); @@ -738,7 +741,7 @@ static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct xt_names_priv *priv = seq->private; struct net *net = seq_file_net(seq); - int af = priv->af; + u_int8_t af = priv->af; return seq_list_next(v, &net->xt.tables[af], pos); } @@ -746,7 +749,7 @@ static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void xt_table_seq_stop(struct seq_file *seq, void *v) { struct xt_names_priv *priv = seq->private; - int af = priv->af; + u_int8_t af = priv->af; mutex_unlock(&xt[af].mutex); } @@ -922,7 +925,7 @@ static const struct file_operations xt_target_ops = { #endif /* CONFIG_PROC_FS */ -int xt_proto_init(struct net *net, int af) +int xt_proto_init(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; @@ -974,7 +977,7 @@ out: } EXPORT_SYMBOL_GPL(xt_proto_init); -void xt_proto_fini(struct net *net, int af) +void xt_proto_fini(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 70907f6baac..1655e2cf25c 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -82,7 +82,7 @@ static inline bool already_closed(const struct nf_conn *conn) static inline unsigned int same_source_net(const union nf_inet_addr *addr, const union nf_inet_addr *mask, - const union nf_inet_addr *u3, unsigned int family) + const union nf_inet_addr *u3, u_int8_t family) { if (family == AF_INET) { return (addr->ip & mask->ip) == (u3->ip & mask->ip); diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index d61412f58ef..28a42a3fbff 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -133,7 +133,7 @@ conntrack_addrcmp(const union nf_inet_addr *kaddr, static inline bool conntrack_mt_origsrc(const struct nf_conn *ct, const struct xt_conntrack_mtinfo1 *info, - unsigned int family) + u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, &info->origsrc_addr, &info->origsrc_mask, family); @@ -142,7 +142,7 @@ conntrack_mt_origsrc(const struct nf_conn *ct, static inline bool conntrack_mt_origdst(const struct nf_conn *ct, const struct xt_conntrack_mtinfo1 *info, - unsigned int family) + u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3, &info->origdst_addr, &info->origdst_mask, family); @@ -151,7 +151,7 @@ conntrack_mt_origdst(const struct nf_conn *ct, static inline bool conntrack_mt_replsrc(const struct nf_conn *ct, const struct xt_conntrack_mtinfo1 *info, - unsigned int family) + u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3, &info->replsrc_addr, &info->replsrc_mask, family); @@ -160,7 +160,7 @@ conntrack_mt_replsrc(const struct nf_conn *ct, static inline bool conntrack_mt_repldst(const struct nf_conn *ct, const struct xt_conntrack_mtinfo1 *info, - unsigned int family) + u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3, &info->repldst_addr, &info->repldst_mask, family); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index d9418a26781..0c9268fd2e1 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -80,7 +80,7 @@ struct dsthash_ent { struct xt_hashlimit_htable { struct hlist_node node; /* global list of all htables */ atomic_t use; - int family; + u_int8_t family; struct hashlimit_cfg1 cfg; /* config */ @@ -185,7 +185,7 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) } static void htable_gc(unsigned long htlong); -static int htable_create_v0(struct xt_hashlimit_info *minfo, int family) +static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family) { struct xt_hashlimit_htable *hinfo; unsigned int size; @@ -258,8 +258,7 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, int family) return 0; } -static int htable_create(struct xt_hashlimit_mtinfo1 *minfo, - unsigned int family) +static int htable_create(struct xt_hashlimit_mtinfo1 *minfo, u_int8_t family) { struct xt_hashlimit_htable *hinfo; unsigned int size; @@ -378,7 +377,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo) } static struct xt_hashlimit_htable *htable_find_get(const char *name, - int family) + u_int8_t family) { struct xt_hashlimit_htable *hinfo; struct hlist_node *pos; @@ -901,7 +900,7 @@ static void dl_seq_stop(struct seq_file *s, void *v) spin_unlock_bh(&htable->lock); } -static int dl_seq_real_show(struct dsthash_ent *ent, int family, +static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { /* recalculate to show accurate numbers */ -- cgit v1.2.3-70-g09d2 From e948b20a71a06a740c925d6ea22b59b4e17cfa0c Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:00 +0200 Subject: netfilter: rename ipt_recent to xt_recent Like with other modules (such as ipt_state), ipt_recent.h is changed to forward definitions to (IOW include) xt_recent.h, and xt_recent.c is changed to use the new constant names. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/Kbuild | 1 + include/linux/netfilter/xt_recent.h | 26 ++ include/linux/netfilter_ipv4/ipt_recent.h | 28 +- net/ipv4/netfilter/Kconfig | 13 - net/ipv4/netfilter/Makefile | 1 - net/ipv4/netfilter/ipt_recent.c | 501 ----------------------------- net/netfilter/Kconfig | 11 + net/netfilter/Makefile | 1 + net/netfilter/xt_recent.c | 502 ++++++++++++++++++++++++++++++ 9 files changed, 552 insertions(+), 532 deletions(-) create mode 100644 include/linux/netfilter/xt_recent.h delete mode 100644 net/ipv4/netfilter/ipt_recent.c create mode 100644 net/netfilter/xt_recent.c (limited to 'net/ipv4') diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index 3aff513d12c..5a8af875bce 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -32,6 +32,7 @@ header-y += xt_owner.h header-y += xt_pkttype.h header-y += xt_rateest.h header-y += xt_realm.h +header-y += xt_recent.h header-y += xt_sctp.h header-y += xt_state.h header-y += xt_statistic.h diff --git a/include/linux/netfilter/xt_recent.h b/include/linux/netfilter/xt_recent.h new file mode 100644 index 00000000000..5cfeb81c679 --- /dev/null +++ b/include/linux/netfilter/xt_recent.h @@ -0,0 +1,26 @@ +#ifndef _LINUX_NETFILTER_XT_RECENT_H +#define _LINUX_NETFILTER_XT_RECENT_H 1 + +enum { + XT_RECENT_CHECK = 1 << 0, + XT_RECENT_SET = 1 << 1, + XT_RECENT_UPDATE = 1 << 2, + XT_RECENT_REMOVE = 1 << 3, + XT_RECENT_TTL = 1 << 4, + + XT_RECENT_SOURCE = 0, + XT_RECENT_DEST = 1, + + XT_RECENT_NAME_LEN = 200, +}; + +struct xt_recent_mtinfo { + u_int32_t seconds; + u_int32_t hit_count; + u_int8_t check_set; + u_int8_t invert; + char name[XT_RECENT_NAME_LEN]; + u_int8_t side; +}; + +#endif /* _LINUX_NETFILTER_XT_RECENT_H */ diff --git a/include/linux/netfilter_ipv4/ipt_recent.h b/include/linux/netfilter_ipv4/ipt_recent.h index 6508a459265..d636cca133c 100644 --- a/include/linux/netfilter_ipv4/ipt_recent.h +++ b/include/linux/netfilter_ipv4/ipt_recent.h @@ -1,27 +1,21 @@ #ifndef _IPT_RECENT_H #define _IPT_RECENT_H -#define RECENT_NAME "ipt_recent" -#define RECENT_VER "v0.3.1" +#include -#define IPT_RECENT_CHECK 1 -#define IPT_RECENT_SET 2 -#define IPT_RECENT_UPDATE 4 -#define IPT_RECENT_REMOVE 8 -#define IPT_RECENT_TTL 16 +#define ipt_recent_info xt_recent_mtinfo -#define IPT_RECENT_SOURCE 0 -#define IPT_RECENT_DEST 1 +enum { + IPT_RECENT_CHECK = XT_RECENT_CHECK, + IPT_RECENT_SET = XT_RECENT_SET, + IPT_RECENT_UPDATE = XT_RECENT_UPDATE, + IPT_RECENT_REMOVE = XT_RECENT_REMOVE, + IPT_RECENT_TTL = XT_RECENT_TTL, -#define IPT_RECENT_NAME_LEN 200 + IPT_RECENT_SOURCE = XT_RECENT_SOURCE, + IPT_RECENT_DEST = XT_RECENT_DEST, -struct ipt_recent_info { - u_int32_t seconds; - u_int32_t hit_count; - u_int8_t check_set; - u_int8_t invert; - char name[IPT_RECENT_NAME_LEN]; - u_int8_t side; + IPT_RECENT_NAME_LEN = XT_RECENT_NAME_LEN, }; #endif /*_IPT_RECENT_H*/ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 90eb7cb47e7..4e842d56642 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -57,19 +57,6 @@ config IP_NF_IPTABLES To compile it as a module, choose M here. If unsure, say N. # The matches. -config IP_NF_MATCH_RECENT - tristate '"recent" match support' - depends on IP_NF_IPTABLES - depends on NETFILTER_ADVANCED - help - This match is used for creating one or many lists of recently - used addresses and then matching against that/those list(s). - - Short options are available by using 'iptables -m recent -h' - Official Website: - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_MATCH_ECN tristate '"ecn" match support' depends on IP_NF_IPTABLES diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 3f31291f37c..1107edbe478 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o -obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o # targets diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c deleted file mode 100644 index 3974d7cae5c..00000000000 --- a/net/ipv4/netfilter/ipt_recent.c +++ /dev/null @@ -1,501 +0,0 @@ -/* - * Copyright (c) 2006 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This is a replacement of the old ipt_recent module, which carried the - * following copyright notice: - * - * Author: Stephen Frost - * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -MODULE_AUTHOR("Patrick McHardy "); -MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4"); -MODULE_LICENSE("GPL"); - -static unsigned int ip_list_tot = 100; -static unsigned int ip_pkt_list_tot = 20; -static unsigned int ip_list_hash_size = 0; -static unsigned int ip_list_perms = 0644; -static unsigned int ip_list_uid = 0; -static unsigned int ip_list_gid = 0; -module_param(ip_list_tot, uint, 0400); -module_param(ip_pkt_list_tot, uint, 0400); -module_param(ip_list_hash_size, uint, 0400); -module_param(ip_list_perms, uint, 0400); -module_param(ip_list_uid, uint, 0400); -module_param(ip_list_gid, uint, 0400); -MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); -MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); -MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); -MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); -MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files"); -MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files"); - -struct recent_entry { - struct list_head list; - struct list_head lru_list; - __be32 addr; - u_int8_t ttl; - u_int8_t index; - u_int16_t nstamps; - unsigned long stamps[0]; -}; - -struct recent_table { - struct list_head list; - char name[IPT_RECENT_NAME_LEN]; -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc; -#endif - unsigned int refcnt; - unsigned int entries; - struct list_head lru_list; - struct list_head iphash[0]; -}; - -static LIST_HEAD(tables); -static DEFINE_SPINLOCK(recent_lock); -static DEFINE_MUTEX(recent_mutex); - -#ifdef CONFIG_PROC_FS -static struct proc_dir_entry *proc_dir; -static const struct file_operations recent_fops; -#endif - -static u_int32_t hash_rnd; -static int hash_rnd_initted; - -static unsigned int recent_entry_hash(__be32 addr) -{ - if (!hash_rnd_initted) { - get_random_bytes(&hash_rnd, 4); - hash_rnd_initted = 1; - } - return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1); -} - -static struct recent_entry * -recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl) -{ - struct recent_entry *e; - unsigned int h; - - h = recent_entry_hash(addr); - list_for_each_entry(e, &table->iphash[h], list) - if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) - return e; - return NULL; -} - -static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) -{ - list_del(&e->list); - list_del(&e->lru_list); - kfree(e); - t->entries--; -} - -static struct recent_entry * -recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl) -{ - struct recent_entry *e; - - if (t->entries >= ip_list_tot) { - e = list_entry(t->lru_list.next, struct recent_entry, lru_list); - recent_entry_remove(t, e); - } - e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, - GFP_ATOMIC); - if (e == NULL) - return NULL; - e->addr = addr; - e->ttl = ttl; - e->stamps[0] = jiffies; - e->nstamps = 1; - e->index = 1; - list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); - list_add_tail(&e->lru_list, &t->lru_list); - t->entries++; - return e; -} - -static void recent_entry_update(struct recent_table *t, struct recent_entry *e) -{ - e->stamps[e->index++] = jiffies; - if (e->index > e->nstamps) - e->nstamps = e->index; - e->index %= ip_pkt_list_tot; - list_move_tail(&e->lru_list, &t->lru_list); -} - -static struct recent_table *recent_table_lookup(const char *name) -{ - struct recent_table *t; - - list_for_each_entry(t, &tables, list) - if (!strcmp(t->name, name)) - return t; - return NULL; -} - -static void recent_table_flush(struct recent_table *t) -{ - struct recent_entry *e, *next; - unsigned int i; - - for (i = 0; i < ip_list_hash_size; i++) - list_for_each_entry_safe(e, next, &t->iphash[i], list) - recent_entry_remove(t, e); -} - -static bool -recent_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) -{ - const struct ipt_recent_info *info = matchinfo; - struct recent_table *t; - struct recent_entry *e; - __be32 addr; - u_int8_t ttl; - bool ret = info->invert; - - if (info->side == IPT_RECENT_DEST) - addr = ip_hdr(skb)->daddr; - else - addr = ip_hdr(skb)->saddr; - - ttl = ip_hdr(skb)->ttl; - /* use TTL as seen before forwarding */ - if (out && !skb->sk) - ttl++; - - spin_lock_bh(&recent_lock); - t = recent_table_lookup(info->name); - e = recent_entry_lookup(t, addr, - info->check_set & IPT_RECENT_TTL ? ttl : 0); - if (e == NULL) { - if (!(info->check_set & IPT_RECENT_SET)) - goto out; - e = recent_entry_init(t, addr, ttl); - if (e == NULL) - *hotdrop = true; - ret = !ret; - goto out; - } - - if (info->check_set & IPT_RECENT_SET) - ret = !ret; - else if (info->check_set & IPT_RECENT_REMOVE) { - recent_entry_remove(t, e); - ret = !ret; - } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) { - unsigned long time = jiffies - info->seconds * HZ; - unsigned int i, hits = 0; - - for (i = 0; i < e->nstamps; i++) { - if (info->seconds && time_after(time, e->stamps[i])) - continue; - if (++hits >= info->hit_count) { - ret = !ret; - break; - } - } - } - - if (info->check_set & IPT_RECENT_SET || - (info->check_set & IPT_RECENT_UPDATE && ret)) { - recent_entry_update(t, e); - e->ttl = ttl; - } -out: - spin_unlock_bh(&recent_lock); - return ret; -} - -static bool -recent_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) -{ - const struct ipt_recent_info *info = matchinfo; - struct recent_table *t; - unsigned i; - bool ret = false; - - if (hweight8(info->check_set & - (IPT_RECENT_SET | IPT_RECENT_REMOVE | - IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1) - return false; - if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) && - (info->seconds || info->hit_count)) - return false; - if (info->hit_count > ip_pkt_list_tot) - return false; - if (info->name[0] == '\0' || - strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) - return false; - - mutex_lock(&recent_mutex); - t = recent_table_lookup(info->name); - if (t != NULL) { - t->refcnt++; - ret = true; - goto out; - } - - t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, - GFP_KERNEL); - if (t == NULL) - goto out; - t->refcnt = 1; - strcpy(t->name, info->name); - INIT_LIST_HEAD(&t->lru_list); - for (i = 0; i < ip_list_hash_size; i++) - INIT_LIST_HEAD(&t->iphash[i]); -#ifdef CONFIG_PROC_FS - t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops); - if (t->proc == NULL) { - kfree(t); - goto out; - } - t->proc->uid = ip_list_uid; - t->proc->gid = ip_list_gid; - t->proc->data = t; -#endif - spin_lock_bh(&recent_lock); - list_add_tail(&t->list, &tables); - spin_unlock_bh(&recent_lock); - ret = true; -out: - mutex_unlock(&recent_mutex); - return ret; -} - -static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) -{ - const struct ipt_recent_info *info = matchinfo; - struct recent_table *t; - - mutex_lock(&recent_mutex); - t = recent_table_lookup(info->name); - if (--t->refcnt == 0) { - spin_lock_bh(&recent_lock); - list_del(&t->list); - spin_unlock_bh(&recent_lock); -#ifdef CONFIG_PROC_FS - remove_proc_entry(t->name, proc_dir); -#endif - recent_table_flush(t); - kfree(t); - } - mutex_unlock(&recent_mutex); -} - -#ifdef CONFIG_PROC_FS -struct recent_iter_state { - struct recent_table *table; - unsigned int bucket; -}; - -static void *recent_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(recent_lock) -{ - struct recent_iter_state *st = seq->private; - const struct recent_table *t = st->table; - struct recent_entry *e; - loff_t p = *pos; - - spin_lock_bh(&recent_lock); - - for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) - list_for_each_entry(e, &t->iphash[st->bucket], list) - if (p-- == 0) - return e; - return NULL; -} - -static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct recent_iter_state *st = seq->private; - const struct recent_table *t = st->table; - struct recent_entry *e = v; - struct list_head *head = e->list.next; - - while (head == &t->iphash[st->bucket]) { - if (++st->bucket >= ip_list_hash_size) - return NULL; - head = t->iphash[st->bucket].next; - } - (*pos)++; - return list_entry(head, struct recent_entry, list); -} - -static void recent_seq_stop(struct seq_file *s, void *v) - __releases(recent_lock) -{ - spin_unlock_bh(&recent_lock); -} - -static int recent_seq_show(struct seq_file *seq, void *v) -{ - const struct recent_entry *e = v; - unsigned int i; - - i = (e->index - 1) % ip_pkt_list_tot; - seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", - NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); - for (i = 0; i < e->nstamps; i++) - seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); - seq_printf(seq, "\n"); - return 0; -} - -static const struct seq_operations recent_seq_ops = { - .start = recent_seq_start, - .next = recent_seq_next, - .stop = recent_seq_stop, - .show = recent_seq_show, -}; - -static int recent_seq_open(struct inode *inode, struct file *file) -{ - struct proc_dir_entry *pde = PDE(inode); - struct recent_iter_state *st; - - st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); - if (st == NULL) - return -ENOMEM; - - st->table = pde->data; - return 0; -} - -static ssize_t recent_proc_write(struct file *file, const char __user *input, - size_t size, loff_t *loff) -{ - const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - struct recent_table *t = pde->data; - struct recent_entry *e; - char buf[sizeof("+255.255.255.255")], *c = buf; - __be32 addr; - int add; - - if (size > sizeof(buf)) - size = sizeof(buf); - if (copy_from_user(buf, input, size)) - return -EFAULT; - while (isspace(*c)) - c++; - - if (size - (c - buf) < 5) - return c - buf; - if (!strncmp(c, "clear", 5)) { - c += 5; - spin_lock_bh(&recent_lock); - recent_table_flush(t); - spin_unlock_bh(&recent_lock); - return c - buf; - } - - switch (*c) { - case '-': - add = 0; - c++; - break; - case '+': - c++; - default: - add = 1; - break; - } - addr = in_aton(c); - - spin_lock_bh(&recent_lock); - e = recent_entry_lookup(t, addr, 0); - if (e == NULL) { - if (add) - recent_entry_init(t, addr, 0); - } else { - if (add) - recent_entry_update(t, e); - else - recent_entry_remove(t, e); - } - spin_unlock_bh(&recent_lock); - return size; -} - -static const struct file_operations recent_fops = { - .open = recent_seq_open, - .read = seq_read, - .write = recent_proc_write, - .release = seq_release_private, - .owner = THIS_MODULE, -}; -#endif /* CONFIG_PROC_FS */ - -static struct xt_match recent_mt_reg __read_mostly = { - .name = "recent", - .family = AF_INET, - .match = recent_mt, - .matchsize = sizeof(struct ipt_recent_info), - .checkentry = recent_mt_check, - .destroy = recent_mt_destroy, - .me = THIS_MODULE, -}; - -static int __init recent_mt_init(void) -{ - int err; - - if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) - return -EINVAL; - ip_list_hash_size = 1 << fls(ip_list_tot); - - err = xt_register_match(&recent_mt_reg); -#ifdef CONFIG_PROC_FS - if (err) - return err; - proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); - if (proc_dir == NULL) { - xt_unregister_match(&recent_mt_reg); - err = -ENOMEM; - } -#endif - return err; -} - -static void __exit recent_mt_exit(void) -{ - BUG_ON(!list_empty(&tables)); - xt_unregister_match(&recent_mt_reg); -#ifdef CONFIG_PROC_FS - remove_proc_entry("ipt_recent", init_net.proc_net); -#endif -} - -module_init(recent_mt_init); -module_exit(recent_mt_exit); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ee898e74808..ccc78b07a1a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -732,6 +732,17 @@ config NETFILTER_XT_MATCH_REALM If you want to compile it as a module, say M here and read . If unsure, say `N'. +config NETFILTER_XT_MATCH_RECENT + tristate '"recent" match support' + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + ---help--- + This match is used for creating one or many lists of recently + used addresses and then matching against that/those list(s). + + Short options are available by using 'iptables -m recent -h' + Official Website: + config NETFILTER_XT_MATCH_SCTP tristate '"sctp" protocol match support (EXPERIMENTAL)' depends on NETFILTER_XTABLES && EXPERIMENTAL diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3bd2cc556ae..f101cf61e6f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -76,6 +76,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o +obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c new file mode 100644 index 00000000000..422c0e4d66b --- /dev/null +++ b/net/netfilter/xt_recent.c @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2006 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This is a replacement of the old ipt_recent module, which carried the + * following copyright notice: + * + * Author: Stephen Frost + * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_recent"); + +static unsigned int ip_list_tot = 100; +static unsigned int ip_pkt_list_tot = 20; +static unsigned int ip_list_hash_size = 0; +static unsigned int ip_list_perms = 0644; +static unsigned int ip_list_uid = 0; +static unsigned int ip_list_gid = 0; +module_param(ip_list_tot, uint, 0400); +module_param(ip_pkt_list_tot, uint, 0400); +module_param(ip_list_hash_size, uint, 0400); +module_param(ip_list_perms, uint, 0400); +module_param(ip_list_uid, uint, 0400); +module_param(ip_list_gid, uint, 0400); +MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); +MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); +MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files"); +MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files"); + +struct recent_entry { + struct list_head list; + struct list_head lru_list; + __be32 addr; + u_int8_t ttl; + u_int8_t index; + u_int16_t nstamps; + unsigned long stamps[0]; +}; + +struct recent_table { + struct list_head list; + char name[XT_RECENT_NAME_LEN]; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc; +#endif + unsigned int refcnt; + unsigned int entries; + struct list_head lru_list; + struct list_head iphash[0]; +}; + +static LIST_HEAD(tables); +static DEFINE_SPINLOCK(recent_lock); +static DEFINE_MUTEX(recent_mutex); + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_dir; +static const struct file_operations recent_fops; +#endif + +static u_int32_t hash_rnd; +static int hash_rnd_initted; + +static unsigned int recent_entry_hash(__be32 addr) +{ + if (!hash_rnd_initted) { + get_random_bytes(&hash_rnd, 4); + hash_rnd_initted = 1; + } + return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1); +} + +static struct recent_entry * +recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl) +{ + struct recent_entry *e; + unsigned int h; + + h = recent_entry_hash(addr); + list_for_each_entry(e, &table->iphash[h], list) + if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) + return e; + return NULL; +} + +static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) +{ + list_del(&e->list); + list_del(&e->lru_list); + kfree(e); + t->entries--; +} + +static struct recent_entry * +recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl) +{ + struct recent_entry *e; + + if (t->entries >= ip_list_tot) { + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + recent_entry_remove(t, e); + } + e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, + GFP_ATOMIC); + if (e == NULL) + return NULL; + e->addr = addr; + e->ttl = ttl; + e->stamps[0] = jiffies; + e->nstamps = 1; + e->index = 1; + list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); + list_add_tail(&e->lru_list, &t->lru_list); + t->entries++; + return e; +} + +static void recent_entry_update(struct recent_table *t, struct recent_entry *e) +{ + e->stamps[e->index++] = jiffies; + if (e->index > e->nstamps) + e->nstamps = e->index; + e->index %= ip_pkt_list_tot; + list_move_tail(&e->lru_list, &t->lru_list); +} + +static struct recent_table *recent_table_lookup(const char *name) +{ + struct recent_table *t; + + list_for_each_entry(t, &tables, list) + if (!strcmp(t->name, name)) + return t; + return NULL; +} + +static void recent_table_flush(struct recent_table *t) +{ + struct recent_entry *e, *next; + unsigned int i; + + for (i = 0; i < ip_list_hash_size; i++) + list_for_each_entry_safe(e, next, &t->iphash[i], list) + recent_entry_remove(t, e); +} + +static bool +recent_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct xt_recent_mtinfo *info = matchinfo; + struct recent_table *t; + struct recent_entry *e; + __be32 addr; + u_int8_t ttl; + bool ret = info->invert; + + if (info->side == XT_RECENT_DEST) + addr = ip_hdr(skb)->daddr; + else + addr = ip_hdr(skb)->saddr; + + ttl = ip_hdr(skb)->ttl; + /* use TTL as seen before forwarding */ + if (out && !skb->sk) + ttl++; + + spin_lock_bh(&recent_lock); + t = recent_table_lookup(info->name); + e = recent_entry_lookup(t, addr, + info->check_set & XT_RECENT_TTL ? ttl : 0); + if (e == NULL) { + if (!(info->check_set & XT_RECENT_SET)) + goto out; + e = recent_entry_init(t, addr, ttl); + if (e == NULL) + *hotdrop = true; + ret = !ret; + goto out; + } + + if (info->check_set & XT_RECENT_SET) + ret = !ret; + else if (info->check_set & XT_RECENT_REMOVE) { + recent_entry_remove(t, e); + ret = !ret; + } else if (info->check_set & (XT_RECENT_CHECK | XT_RECENT_UPDATE)) { + unsigned long time = jiffies - info->seconds * HZ; + unsigned int i, hits = 0; + + for (i = 0; i < e->nstamps; i++) { + if (info->seconds && time_after(time, e->stamps[i])) + continue; + if (++hits >= info->hit_count) { + ret = !ret; + break; + } + } + } + + if (info->check_set & XT_RECENT_SET || + (info->check_set & XT_RECENT_UPDATE && ret)) { + recent_entry_update(t, e); + e->ttl = ttl; + } +out: + spin_unlock_bh(&recent_lock); + return ret; +} + +static bool +recent_mt_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) +{ + const struct xt_recent_mtinfo *info = matchinfo; + struct recent_table *t; + unsigned i; + bool ret = false; + + if (hweight8(info->check_set & + (XT_RECENT_SET | XT_RECENT_REMOVE | + XT_RECENT_CHECK | XT_RECENT_UPDATE)) != 1) + return false; + if ((info->check_set & (XT_RECENT_SET | XT_RECENT_REMOVE)) && + (info->seconds || info->hit_count)) + return false; + if (info->hit_count > ip_pkt_list_tot) + return false; + if (info->name[0] == '\0' || + strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN) + return false; + + mutex_lock(&recent_mutex); + t = recent_table_lookup(info->name); + if (t != NULL) { + t->refcnt++; + ret = true; + goto out; + } + + t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, + GFP_KERNEL); + if (t == NULL) + goto out; + t->refcnt = 1; + strcpy(t->name, info->name); + INIT_LIST_HEAD(&t->lru_list); + for (i = 0; i < ip_list_hash_size; i++) + INIT_LIST_HEAD(&t->iphash[i]); +#ifdef CONFIG_PROC_FS + t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops); + if (t->proc == NULL) { + kfree(t); + goto out; + } + t->proc->uid = ip_list_uid; + t->proc->gid = ip_list_gid; + t->proc->data = t; +#endif + spin_lock_bh(&recent_lock); + list_add_tail(&t->list, &tables); + spin_unlock_bh(&recent_lock); + ret = true; +out: + mutex_unlock(&recent_mutex); + return ret; +} + +static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) +{ + const struct xt_recent_mtinfo *info = matchinfo; + struct recent_table *t; + + mutex_lock(&recent_mutex); + t = recent_table_lookup(info->name); + if (--t->refcnt == 0) { + spin_lock_bh(&recent_lock); + list_del(&t->list); + spin_unlock_bh(&recent_lock); +#ifdef CONFIG_PROC_FS + remove_proc_entry(t->name, proc_dir); +#endif + recent_table_flush(t); + kfree(t); + } + mutex_unlock(&recent_mutex); +} + +#ifdef CONFIG_PROC_FS +struct recent_iter_state { + struct recent_table *table; + unsigned int bucket; +}; + +static void *recent_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(recent_lock) +{ + struct recent_iter_state *st = seq->private; + const struct recent_table *t = st->table; + struct recent_entry *e; + loff_t p = *pos; + + spin_lock_bh(&recent_lock); + + for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) + list_for_each_entry(e, &t->iphash[st->bucket], list) + if (p-- == 0) + return e; + return NULL; +} + +static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + const struct recent_table *t = st->table; + struct recent_entry *e = v; + struct list_head *head = e->list.next; + + while (head == &t->iphash[st->bucket]) { + if (++st->bucket >= ip_list_hash_size) + return NULL; + head = t->iphash[st->bucket].next; + } + (*pos)++; + return list_entry(head, struct recent_entry, list); +} + +static void recent_seq_stop(struct seq_file *s, void *v) + __releases(recent_lock) +{ + spin_unlock_bh(&recent_lock); +} + +static int recent_seq_show(struct seq_file *seq, void *v) +{ + const struct recent_entry *e = v; + unsigned int i; + + i = (e->index - 1) % ip_pkt_list_tot; + seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", + NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); + for (i = 0; i < e->nstamps; i++) + seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); + seq_printf(seq, "\n"); + return 0; +} + +static const struct seq_operations recent_seq_ops = { + .start = recent_seq_start, + .next = recent_seq_next, + .stop = recent_seq_stop, + .show = recent_seq_show, +}; + +static int recent_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct recent_iter_state *st; + + st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); + if (st == NULL) + return -ENOMEM; + + st->table = pde->data; + return 0; +} + +static ssize_t recent_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *loff) +{ + const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct recent_table *t = pde->data; + struct recent_entry *e; + char buf[sizeof("+255.255.255.255")], *c = buf; + __be32 addr; + int add; + + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size)) + return -EFAULT; + while (isspace(*c)) + c++; + + if (size - (c - buf) < 5) + return c - buf; + if (!strncmp(c, "clear", 5)) { + c += 5; + spin_lock_bh(&recent_lock); + recent_table_flush(t); + spin_unlock_bh(&recent_lock); + return c - buf; + } + + switch (*c) { + case '-': + add = 0; + c++; + break; + case '+': + c++; + default: + add = 1; + break; + } + addr = in_aton(c); + + spin_lock_bh(&recent_lock); + e = recent_entry_lookup(t, addr, 0); + if (e == NULL) { + if (add) + recent_entry_init(t, addr, 0); + } else { + if (add) + recent_entry_update(t, e); + else + recent_entry_remove(t, e); + } + spin_unlock_bh(&recent_lock); + return size; +} + +static const struct file_operations recent_fops = { + .open = recent_seq_open, + .read = seq_read, + .write = recent_proc_write, + .release = seq_release_private, + .owner = THIS_MODULE, +}; +#endif /* CONFIG_PROC_FS */ + +static struct xt_match recent_mt_reg __read_mostly = { + .name = "recent", + .family = AF_INET, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, +}; + +static int __init recent_mt_init(void) +{ + int err; + + if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) + return -EINVAL; + ip_list_hash_size = 1 << fls(ip_list_tot); + + err = xt_register_match(&recent_mt_reg); +#ifdef CONFIG_PROC_FS + if (err) + return err; + proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); + if (proc_dir == NULL) { + xt_unregister_match(&recent_mt_reg); + err = -ENOMEM; + } +#endif + return err; +} + +static void __exit recent_mt_exit(void) +{ + BUG_ON(!list_empty(&tables)); + xt_unregister_match(&recent_mt_reg); +#ifdef CONFIG_PROC_FS + remove_proc_entry("ipt_recent", init_net.proc_net); +#endif +} + +module_init(recent_mt_init); +module_exit(recent_mt_exit); -- cgit v1.2.3-70-g09d2 From ee999d8b9573df1b547aacdc6d79f86eb79c25cd Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:01 +0200 Subject: netfilter: x_tables: use NFPROTO_* in extensions Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/bridge/netfilter/ebt_log.c | 6 ++-- net/bridge/netfilter/ebt_ulog.c | 2 +- net/ipv4/netfilter/arp_tables.c | 58 +++++++++++++++++++----------------- net/ipv4/netfilter/arpt_mangle.c | 2 +- net/ipv4/netfilter/arptable_filter.c | 8 ++--- net/ipv4/netfilter/ipt_CLUSTERIP.c | 4 +-- net/ipv4/netfilter/ipt_ECN.c | 2 +- net/ipv4/netfilter/ipt_LOG.c | 6 ++-- net/ipv4/netfilter/ipt_MASQUERADE.c | 2 +- net/ipv4/netfilter/ipt_NETMAP.c | 2 +- net/ipv4/netfilter/ipt_REDIRECT.c | 2 +- net/ipv4/netfilter/ipt_REJECT.c | 2 +- net/ipv4/netfilter/ipt_TTL.c | 2 +- net/ipv4/netfilter/ipt_ULOG.c | 4 +-- net/ipv4/netfilter/ipt_addrtype.c | 4 +-- net/ipv4/netfilter/ipt_ah.c | 2 +- net/ipv4/netfilter/ipt_ecn.c | 2 +- net/ipv4/netfilter/ipt_ttl.c | 2 +- net/ipv6/netfilter/ip6t_HL.c | 2 +- net/ipv6/netfilter/ip6t_LOG.c | 7 +++-- net/ipv6/netfilter/ip6t_REJECT.c | 2 +- net/ipv6/netfilter/ip6t_ah.c | 2 +- net/ipv6/netfilter/ip6t_eui64.c | 2 +- net/ipv6/netfilter/ip6t_frag.c | 2 +- net/ipv6/netfilter/ip6t_hbh.c | 4 +-- net/ipv6/netfilter/ip6t_hl.c | 2 +- net/ipv6/netfilter/ip6t_ipv6header.c | 2 +- net/ipv6/netfilter/ip6t_mh.c | 2 +- net/ipv6/netfilter/ip6t_rt.c | 2 +- net/netfilter/xt_CLASSIFY.c | 4 +-- net/netfilter/xt_CONNMARK.c | 8 ++--- net/netfilter/xt_CONNSECMARK.c | 4 +-- net/netfilter/xt_DSCP.c | 10 +++---- net/netfilter/xt_MARK.c | 12 ++++---- net/netfilter/xt_NFLOG.c | 4 +-- net/netfilter/xt_NFQUEUE.c | 4 +-- net/netfilter/xt_NOTRACK.c | 4 +-- net/netfilter/xt_RATEEST.c | 4 +-- net/netfilter/xt_SECMARK.c | 4 +-- net/netfilter/xt_TCPMSS.c | 4 +-- net/netfilter/xt_TCPOPTSTRIP.c | 4 +-- net/netfilter/xt_TRACE.c | 4 +-- net/netfilter/xt_comment.c | 4 +-- net/netfilter/xt_connbytes.c | 4 +-- net/netfilter/xt_connlimit.c | 10 +++---- net/netfilter/xt_connmark.c | 8 ++--- net/netfilter/xt_conntrack.c | 10 +++---- net/netfilter/xt_dccp.c | 4 +-- net/netfilter/xt_dscp.c | 12 ++++---- net/netfilter/xt_esp.c | 4 +-- net/netfilter/xt_hashlimit.c | 40 ++++++++++++------------- net/netfilter/xt_helper.c | 4 +-- net/netfilter/xt_iprange.c | 6 ++-- net/netfilter/xt_length.c | 4 +-- net/netfilter/xt_limit.c | 4 +-- net/netfilter/xt_mac.c | 4 +-- net/netfilter/xt_mark.c | 8 ++--- net/netfilter/xt_multiport.c | 8 ++--- net/netfilter/xt_owner.c | 8 ++--- net/netfilter/xt_physdev.c | 4 +-- net/netfilter/xt_pkttype.c | 8 ++--- net/netfilter/xt_policy.c | 8 ++--- net/netfilter/xt_quota.c | 4 +-- net/netfilter/xt_rateest.c | 4 +-- net/netfilter/xt_realm.c | 2 +- net/netfilter/xt_recent.c | 21 ++++++------- net/netfilter/xt_sctp.c | 4 +-- net/netfilter/xt_state.c | 4 +-- net/netfilter/xt_statistic.c | 4 +-- net/netfilter/xt_string.c | 8 ++--- net/netfilter/xt_tcpmss.c | 4 +-- net/netfilter/xt_tcpudp.c | 12 ++++---- net/netfilter/xt_time.c | 4 +-- net/netfilter/xt_u32.c | 4 +-- 74 files changed, 225 insertions(+), 223 deletions(-) (limited to 'net/ipv4') diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 3770cd8a7b3..8b17c64bcd7 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -206,10 +206,10 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, li.u.log.logflags = info->bitmask; if (info->bitmask & EBT_LOG_NFLOG) - nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, + nf_log_packet(NFPROTO_BRIDGE, hooknr, skb, in, out, &li, "%s", info->prefix); else - ebt_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, + ebt_log_packet(NFPROTO_BRIDGE, hooknr, skb, in, out, &li, info->prefix); } @@ -234,7 +234,7 @@ static int __init ebt_log_init(void) ret = ebt_register_watcher(&log); if (ret < 0) return ret; - nf_log_register(PF_BRIDGE, &ebt_log_logger); + nf_log_register(NFPROTO_BRIDGE, &ebt_log_logger); return 0; } diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index c84bda61afb..3b1678cd66f 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -310,7 +310,7 @@ static int __init ebt_ulog_init(void) netlink_kernel_release(ebtulognl); if (ret == 0) - nf_log_register(PF_BRIDGE, &ebt_ulog_logger); + nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger); return ret; } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 03e83a65aec..b4a9a1799c9 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -463,7 +463,8 @@ static inline int check_target(struct arpt_entry *e, const char *name) t = arpt_get_target(e); target = t->u.kernel.target; - ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t), + ret = xt_check_target(target, NFPROTO_ARP, + t->u.target_size - sizeof(*t), name, e->comefrom, 0, 0); if (!ret && t->u.kernel.target->checkentry && !t->u.kernel.target->checkentry(name, e, target, t->data, @@ -488,7 +489,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, return ret; t = arpt_get_target(e); - target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name, + target = try_then_request_module(xt_find_target(NFPROTO_ARP, + t->u.user.name, t->u.user.revision), "arpt_%s", t->u.user.name); if (IS_ERR(target) || !target) { @@ -788,7 +790,7 @@ static void compat_standard_from_user(void *dst, void *src) int v = *(compat_int_t *)src; if (v > 0) - v += xt_compat_calc_jump(NF_ARP, v); + v += xt_compat_calc_jump(NFPROTO_ARP, v); memcpy(dst, &v, sizeof(v)); } @@ -797,7 +799,7 @@ static int compat_standard_to_user(void __user *dst, void *src) compat_int_t cv = *(int *)src; if (cv > 0) - cv -= xt_compat_calc_jump(NF_ARP, cv); + cv -= xt_compat_calc_jump(NFPROTO_ARP, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } @@ -815,7 +817,7 @@ static int compat_calc_entry(struct arpt_entry *e, t = arpt_get_target(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; - ret = xt_compat_add_offset(NF_ARP, entry_offset, off); + ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); if (ret) return ret; @@ -866,9 +868,9 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT if (compat) - xt_compat_lock(NF_ARP); + xt_compat_lock(NFPROTO_ARP); #endif - t = try_then_request_module(xt_find_table_lock(net, NF_ARP, name), + t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), "arptable_%s", name); if (t && !IS_ERR(t)) { struct arpt_getinfo info; @@ -878,7 +880,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (compat) { struct xt_table_info tmp; ret = compat_table_info(private, &tmp); - xt_compat_flush_offsets(NF_ARP); + xt_compat_flush_offsets(NFPROTO_ARP); private = &tmp; } #endif @@ -901,7 +903,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) ret = t ? PTR_ERR(t) : -ENOENT; #ifdef CONFIG_COMPAT if (compat) - xt_compat_unlock(NF_ARP); + xt_compat_unlock(NFPROTO_ARP); #endif return ret; } @@ -925,7 +927,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, return -EINVAL; } - t = xt_find_table_lock(net, NF_ARP, get.name); + t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (t && !IS_ERR(t)) { const struct xt_table_info *private = t->private; @@ -967,7 +969,7 @@ static int __do_replace(struct net *net, const char *name, goto out; } - t = try_then_request_module(xt_find_table_lock(net, NF_ARP, name), + t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), "arptable_%s", name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; @@ -1134,7 +1136,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - t = xt_find_table_lock(net, NF_ARP, name); + t = xt_find_table_lock(net, NFPROTO_ARP, name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; @@ -1218,7 +1220,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, entry_offset = (void *)e - (void *)base; t = compat_arpt_get_target(e); - target = try_then_request_module(xt_find_target(NF_ARP, + target = try_then_request_module(xt_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision), "arpt_%s", t->u.user.name); @@ -1232,7 +1234,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, off += xt_compat_target_offset(target); *size += off; - ret = xt_compat_add_offset(NF_ARP, entry_offset, off); + ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); if (ret) goto release_target; @@ -1333,7 +1335,7 @@ static int translate_compat_table(const char *name, duprintf("translate_compat_table: size %u\n", info->size); j = 0; - xt_compat_lock(NF_ARP); + xt_compat_lock(NFPROTO_ARP); /* Walk through entries, checking offsets. */ ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, check_compat_entry_size_and_hooks, @@ -1383,8 +1385,8 @@ static int translate_compat_table(const char *name, ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_copy_entry_from_user, &pos, &size, name, newinfo, entry1); - xt_compat_flush_offsets(NF_ARP); - xt_compat_unlock(NF_ARP); + xt_compat_flush_offsets(NFPROTO_ARP); + xt_compat_unlock(NFPROTO_ARP); if (ret) goto free_newinfo; @@ -1420,8 +1422,8 @@ out: COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); return ret; out_unlock: - xt_compat_flush_offsets(NF_ARP); - xt_compat_unlock(NF_ARP); + xt_compat_flush_offsets(NFPROTO_ARP); + xt_compat_unlock(NFPROTO_ARP); goto out; } @@ -1607,8 +1609,8 @@ static int compat_get_entries(struct net *net, return -EINVAL; } - xt_compat_lock(NF_ARP); - t = xt_find_table_lock(net, NF_ARP, get.name); + xt_compat_lock(NFPROTO_ARP); + t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (t && !IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; @@ -1623,13 +1625,13 @@ static int compat_get_entries(struct net *net, private->size, get.size); ret = -EAGAIN; } - xt_compat_flush_offsets(NF_ARP); + xt_compat_flush_offsets(NFPROTO_ARP); module_put(t->me); xt_table_unlock(t); } else ret = t ? PTR_ERR(t) : -ENOENT; - xt_compat_unlock(NF_ARP); + xt_compat_unlock(NFPROTO_ARP); return ret; } @@ -1709,7 +1711,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len break; } - try_then_request_module(xt_find_revision(NF_ARP, rev.name, + try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, rev.revision, 1, &ret), "arpt_%s", rev.name); break; @@ -1787,7 +1789,7 @@ void arpt_unregister_table(struct xt_table *table) static struct xt_target arpt_standard_target __read_mostly = { .name = ARPT_STANDARD_TARGET, .targetsize = sizeof(int), - .family = NF_ARP, + .family = NFPROTO_ARP, #ifdef CONFIG_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, @@ -1799,7 +1801,7 @@ static struct xt_target arpt_error_target __read_mostly = { .name = ARPT_ERROR_TARGET, .target = arpt_error, .targetsize = ARPT_FUNCTION_MAXNAMELEN, - .family = NF_ARP, + .family = NFPROTO_ARP, }; static struct nf_sockopt_ops arpt_sockopts = { @@ -1821,12 +1823,12 @@ static struct nf_sockopt_ops arpt_sockopts = { static int __net_init arp_tables_net_init(struct net *net) { - return xt_proto_init(net, NF_ARP); + return xt_proto_init(net, NFPROTO_ARP); } static void __net_exit arp_tables_net_exit(struct net *net) { - xt_proto_fini(net, NF_ARP); + xt_proto_fini(net, NFPROTO_ARP); } static struct pernet_operations arp_tables_net_ops = { diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index a385959d265..3f9e4ccd616 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -75,7 +75,7 @@ checkentry(const char *tablename, const void *e, const struct xt_target *target, static struct xt_target arpt_mangle_reg __read_mostly = { .name = "mangle", - .family = NF_ARP, + .family = NFPROTO_ARP, .target = target, .targetsize = sizeof(struct arpt_mangle), .checkentry = checkentry, diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 082f5dd3156..bee3d117661 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -51,7 +51,7 @@ static struct xt_table packet_filter = { .lock = __RW_LOCK_UNLOCKED(packet_filter.lock), .private = NULL, .me = THIS_MODULE, - .af = NF_ARP, + .af = NFPROTO_ARP, }; /* The work comes in here from netfilter.c */ @@ -89,21 +89,21 @@ static struct nf_hook_ops arpt_ops[] __read_mostly = { { .hook = arpt_in_hook, .owner = THIS_MODULE, - .pf = NF_ARP, + .pf = NFPROTO_ARP, .hooknum = NF_ARP_IN, .priority = NF_IP_PRI_FILTER, }, { .hook = arpt_out_hook, .owner = THIS_MODULE, - .pf = NF_ARP, + .pf = NFPROTO_ARP, .hooknum = NF_ARP_OUT, .priority = NF_IP_PRI_FILTER, }, { .hook = arpt_forward_hook, .owner = THIS_MODULE, - .pf = NF_ARP, + .pf = NFPROTO_ARP, .hooknum = NF_ARP_FORWARD, .priority = NF_IP_PRI_FILTER, }, diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index fafe8ebb4c5..63faddc18a1 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -445,7 +445,7 @@ struct compat_ipt_clusterip_tgt_info static struct xt_target clusterip_tg_reg __read_mostly = { .name = "CLUSTERIP", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = clusterip_tg, .checkentry = clusterip_tg_check, .destroy = clusterip_tg_destroy, @@ -546,7 +546,7 @@ arp_mangle(unsigned int hook, static struct nf_hook_ops cip_arp_ops __read_mostly = { .hook = arp_mangle, - .pf = NF_ARP, + .pf = NFPROTO_ARP, .hooknum = NF_ARP_OUT, .priority = -1 }; diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index d60139c134c..aee2364afff 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -124,7 +124,7 @@ ecn_tg_check(const char *tablename, const void *e_void, static struct xt_target ecn_tg_reg __read_mostly = { .name = "ECN", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = ecn_tg, .targetsize = sizeof(struct ipt_ECN_info), .table = "mangle", diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 9330ba3577e..1c9785df4df 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -437,7 +437,7 @@ log_tg(struct sk_buff *skb, const struct net_device *in, li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; - ipt_log_packet(PF_INET, hooknum, skb, in, out, &li, + ipt_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, &li, loginfo->prefix); return XT_CONTINUE; } @@ -463,7 +463,7 @@ log_tg_check(const char *tablename, const void *e, static struct xt_target log_tg_reg __read_mostly = { .name = "LOG", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = log_tg, .targetsize = sizeof(struct ipt_log_info), .checkentry = log_tg_check, @@ -483,7 +483,7 @@ static int __init log_tg_init(void) ret = xt_register_target(&log_tg_reg); if (ret < 0) return ret; - nf_log_register(PF_INET, &ipt_log_logger); + nf_log_register(NFPROTO_IPV4, &ipt_log_logger); return 0; } diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 0841aefaa50..9a4822f8243 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -153,7 +153,7 @@ static struct notifier_block masq_inet_notifier = { static struct xt_target masquerade_tg_reg __read_mostly = { .name = "MASQUERADE", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = masquerade_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index 6739abfd152..f281500bd7f 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -75,7 +75,7 @@ netmap_tg(struct sk_buff *skb, const struct net_device *in, static struct xt_target netmap_tg_reg __read_mostly = { .name = "NETMAP", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = netmap_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index 5c6292449d1..ef496105eae 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -92,7 +92,7 @@ redirect_tg(struct sk_buff *skb, const struct net_device *in, static struct xt_target redirect_tg_reg __read_mostly = { .name = "REDIRECT", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = redirect_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 2639872849d..9f5da0c2cae 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -201,7 +201,7 @@ reject_tg_check(const char *tablename, const void *e_void, static struct xt_target reject_tg_reg __read_mostly = { .name = "REJECT", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = reject_tg, .targetsize = sizeof(struct ipt_reject_info), .table = "filter", diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c index 30eed65e733..7d01d424a71 100644 --- a/net/ipv4/netfilter/ipt_TTL.c +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -80,7 +80,7 @@ ttl_tg_check(const char *tablename, const void *e, static struct xt_target ttl_tg_reg __read_mostly = { .name = "TTL", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = ttl_tg, .targetsize = sizeof(struct ipt_TTL_info), .table = "mangle", diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index d8241e6b077..9065e4a34fb 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -374,7 +374,7 @@ static int ulog_tg_compat_to_user(void __user *dst, void *src) static struct xt_target ulog_tg_reg __read_mostly = { .name = "ULOG", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = ulog_tg, .targetsize = sizeof(struct ipt_ulog_info), .checkentry = ulog_tg_check, @@ -419,7 +419,7 @@ static int __init ulog_tg_init(void) return ret; } if (nflog) - nf_log_register(PF_INET, &ipt_ulog_logger); + nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); return 0; } diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index 462a22c9787..2c9d88a6c83 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -108,14 +108,14 @@ addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void, static struct xt_match addrtype_mt_reg[] __read_mostly = { { .name = "addrtype", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = addrtype_mt_v0, .matchsize = sizeof(struct ipt_addrtype_info), .me = THIS_MODULE }, { .name = "addrtype", - .family = AF_INET, + .family = NFPROTO_IPV4, .revision = 1, .match = addrtype_mt_v1, .checkentry = addrtype_mt_checkentry_v1, diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index e977989629c..e2e993edd66 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -83,7 +83,7 @@ ah_mt_check(const char *tablename, const void *ip_void, static struct xt_match ah_mt_reg __read_mostly = { .name = "ah", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = ah_mt, .matchsize = sizeof(struct ipt_ah), .proto = IPPROTO_AH, diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index 749de8284ce..2c45b4be7c3 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -114,7 +114,7 @@ ecn_mt_check(const char *tablename, const void *ip_void, static struct xt_match ecn_mt_reg __read_mostly = { .name = "ecn", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = ecn_mt, .matchsize = sizeof(struct ipt_ecn_info), .checkentry = ecn_mt_check, diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c index e0b8caeb710..d4c3fdc2a79 100644 --- a/net/ipv4/netfilter/ipt_ttl.c +++ b/net/ipv4/netfilter/ipt_ttl.c @@ -46,7 +46,7 @@ ttl_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match ttl_mt_reg __read_mostly = { .name = "ttl", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = ttl_mt, .matchsize = sizeof(struct ipt_ttl_info), .me = THIS_MODULE, diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c index d5f8fd5f29d..7eebd350916 100644 --- a/net/ipv6/netfilter/ip6t_HL.c +++ b/net/ipv6/netfilter/ip6t_HL.c @@ -78,7 +78,7 @@ hl_tg6_check(const char *tablename, const void *entry, static struct xt_target hl_tg6_reg __read_mostly = { .name = "HL", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = hl_tg6, .targetsize = sizeof(struct ip6t_HL_info), .table = "mangle", diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 0716f8afa0b..fd148f3d842 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -449,7 +449,8 @@ log_tg6(struct sk_buff *skb, const struct net_device *in, li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; - ip6t_log_packet(PF_INET6, hooknum, skb, in, out, &li, loginfo->prefix); + ip6t_log_packet(NFPROTO_IPV6, hooknum, skb, in, out, + &li, loginfo->prefix); return XT_CONTINUE; } @@ -475,7 +476,7 @@ log_tg6_check(const char *tablename, const void *entry, static struct xt_target log_tg6_reg __read_mostly = { .name = "LOG", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = log_tg6, .targetsize = sizeof(struct ip6t_log_info), .checkentry = log_tg6_check, @@ -495,7 +496,7 @@ static int __init log_tg6_init(void) ret = xt_register_target(&log_tg6_reg); if (ret < 0) return ret; - nf_log_register(PF_INET6, &ip6t_logger); + nf_log_register(NFPROTO_IPV6, &ip6t_logger); return 0; } diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 44c8d65a243..672ad9ff3e2 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -237,7 +237,7 @@ reject_tg6_check(const char *tablename, const void *entry, static struct xt_target reject_tg6_reg __read_mostly = { .name = "REJECT", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = reject_tg6, .targetsize = sizeof(struct ip6t_reject_info), .table = "filter", diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index 429629fd63b..061f89beeb6 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -110,7 +110,7 @@ ah_mt6_check(const char *tablename, const void *entry, static struct xt_match ah_mt6_reg __read_mostly = { .name = "ah", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = ah_mt6, .matchsize = sizeof(struct ip6t_ah), .checkentry = ah_mt6_check, diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c index 8f331f12b2e..ba38df1116f 100644 --- a/net/ipv6/netfilter/ip6t_eui64.c +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -60,7 +60,7 @@ eui64_mt6(const struct sk_buff *skb, const struct net_device *in, static struct xt_match eui64_mt6_reg __read_mostly = { .name = "eui64", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = eui64_mt6, .matchsize = sizeof(int), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c index e2bbc63dba5..972f699af22 100644 --- a/net/ipv6/netfilter/ip6t_frag.c +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -127,7 +127,7 @@ frag_mt6_check(const char *tablename, const void *ip, static struct xt_match frag_mt6_reg __read_mostly = { .name = "frag", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = frag_mt6, .matchsize = sizeof(struct ip6t_frag), .checkentry = frag_mt6_check, diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c index 26654b26d7f..d5edb51a595 100644 --- a/net/ipv6/netfilter/ip6t_hbh.c +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -187,7 +187,7 @@ hbh_mt6_check(const char *tablename, const void *entry, static struct xt_match hbh_mt6_reg[] __read_mostly = { { .name = "hbh", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, @@ -196,7 +196,7 @@ static struct xt_match hbh_mt6_reg[] __read_mostly = { }, { .name = "dst", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c index 34567167384..25c1eb92fac 100644 --- a/net/ipv6/netfilter/ip6t_hl.c +++ b/net/ipv6/netfilter/ip6t_hl.c @@ -51,7 +51,7 @@ hl_mt6(const struct sk_buff *skb, const struct net_device *in, static struct xt_match hl_mt6_reg __read_mostly = { .name = "hl", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = hl_mt6, .matchsize = sizeof(struct ip6t_hl_info), .me = THIS_MODULE, diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 317a8960a75..ef0661aacea 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -138,7 +138,7 @@ ipv6header_mt6_check(const char *tablename, const void *ip, static struct xt_match ipv6header_mt6_reg __read_mostly = { .name = "ipv6header", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = ipv6header_mt6, .matchsize = sizeof(struct ip6t_ipv6header_info), .checkentry = ipv6header_mt6_check, diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c index e06678d07ec..dd876274ff6 100644 --- a/net/ipv6/netfilter/ip6t_mh.c +++ b/net/ipv6/netfilter/ip6t_mh.c @@ -84,7 +84,7 @@ mh_mt6_check(const char *tablename, const void *entry, static struct xt_match mh_mt6_reg __read_mostly = { .name = "mh", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = mh_mt6_check, .match = mh_mt6, .matchsize = sizeof(struct ip6t_mh), diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index 81aaf7aaaab..7c544ae591d 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -214,7 +214,7 @@ rt_mt6_check(const char *tablename, const void *entry, static struct xt_match rt_mt6_reg __read_mostly = { .name = "rt", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = rt_mt6, .matchsize = sizeof(struct ip6t_rt), .checkentry = rt_mt6_check, diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index 77a52bf8322..9d68da1748b 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -39,7 +39,7 @@ classify_tg(struct sk_buff *skb, const struct net_device *in, static struct xt_target classify_tg_reg[] __read_mostly = { { - .family = AF_INET, + .family = NFPROTO_IPV4, .name = "CLASSIFY", .target = classify_tg, .targetsize = sizeof(struct xt_classify_target_info), @@ -51,7 +51,7 @@ static struct xt_target classify_tg_reg[] __read_mostly = { }, { .name = "CLASSIFY", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = classify_tg, .targetsize = sizeof(struct xt_classify_target_info), .table = "mangle", diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index 5fecfb4794b..e72e5d01752 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -197,7 +197,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connmark_tg_check_v0, .destroy = connmark_tg_destroy, .target = connmark_tg_v0, @@ -212,7 +212,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connmark_tg_check_v0, .destroy = connmark_tg_destroy, .target = connmark_tg_v0, @@ -227,7 +227,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), @@ -237,7 +237,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 76ca1f2421e..ae939e54dfa 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -127,7 +127,7 @@ connsecmark_tg_destroy(const struct xt_target *target, void *targinfo) static struct xt_target connsecmark_tg_reg[] __read_mostly = { { .name = "CONNSECMARK", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connsecmark_tg_check, .destroy = connsecmark_tg_destroy, .target = connsecmark_tg, @@ -136,7 +136,7 @@ static struct xt_target connsecmark_tg_reg[] __read_mostly = { }, { .name = "CONNSECMARK", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connsecmark_tg_check, .destroy = connsecmark_tg_destroy, .target = connsecmark_tg, diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 97efd74c04f..f0b4958528e 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -165,7 +165,7 @@ tos_tg6(struct sk_buff *skb, const struct net_device *in, static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "DSCP", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = dscp_tg_check, .target = dscp_tg, .targetsize = sizeof(struct xt_DSCP_info), @@ -174,7 +174,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = { }, { .name = "DSCP", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = dscp_tg_check, .target = dscp_tg6, .targetsize = sizeof(struct xt_DSCP_info), @@ -184,7 +184,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "TOS", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .table = "mangle", .target = tos_tg_v0, .targetsize = sizeof(struct ipt_tos_target_info), @@ -194,7 +194,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "TOS", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .table = "mangle", .target = tos_tg, .targetsize = sizeof(struct xt_tos_target_info), @@ -203,7 +203,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "TOS", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .table = "mangle", .target = tos_tg6, .targetsize = sizeof(struct xt_tos_target_info), diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c index f9ce20b5898..55ef0796c76 100644 --- a/net/netfilter/xt_MARK.c +++ b/net/netfilter/xt_MARK.c @@ -161,7 +161,7 @@ static int mark_tg_compat_to_user_v1(void __user *dst, void *src) static struct xt_target mark_tg_reg[] __read_mostly = { { .name = "MARK", - .family = AF_INET, + .family = NFPROTO_IPV4, .revision = 0, .checkentry = mark_tg_check_v0, .target = mark_tg_v0, @@ -176,7 +176,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { }, { .name = "MARK", - .family = AF_INET, + .family = NFPROTO_IPV4, .revision = 1, .checkentry = mark_tg_check_v1, .target = mark_tg_v1, @@ -191,7 +191,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { }, { .name = "MARK", - .family = AF_INET6, + .family = NFPROTO_IPV6, .revision = 0, .checkentry = mark_tg_check_v0, .target = mark_tg_v0, @@ -206,7 +206,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { }, { .name = "MARK", - .family = AF_INET6, + .family = NFPROTO_IPV6, .revision = 1, .checkentry = mark_tg_check_v1, .target = mark_tg_v1, @@ -222,7 +222,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { { .name = "MARK", .revision = 2, - .family = AF_INET, + .family = NFPROTO_IPV4, .target = mark_tg, .targetsize = sizeof(struct xt_mark_tginfo2), .me = THIS_MODULE, @@ -230,7 +230,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { { .name = "MARK", .revision = 2, - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = mark_tg, .targetsize = sizeof(struct xt_mark_tginfo2), .me = THIS_MODULE, diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index 19ae8efae65..9b095520176 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -55,7 +55,7 @@ nflog_tg_check(const char *tablename, const void *entry, static struct xt_target nflog_tg_reg[] __read_mostly = { { .name = "NFLOG", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = nflog_tg_check, .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), @@ -63,7 +63,7 @@ static struct xt_target nflog_tg_reg[] __read_mostly = { }, { .name = "NFLOG", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = nflog_tg_check, .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index beb24d19a56..c03c2e8d06f 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -36,14 +36,14 @@ nfqueue_tg(struct sk_buff *skb, const struct net_device *in, static struct xt_target nfqueue_tg_reg[] __read_mostly = { { .name = "NFQUEUE", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, }, { .name = "NFQUEUE", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c index 6c9de611eb8..b9ee268b37c 100644 --- a/net/netfilter/xt_NOTRACK.c +++ b/net/netfilter/xt_NOTRACK.c @@ -35,14 +35,14 @@ notrack_tg(struct sk_buff *skb, const struct net_device *in, static struct xt_target notrack_tg_reg[] __read_mostly = { { .name = "NOTRACK", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = notrack_tg, .table = "raw", .me = THIS_MODULE, }, { .name = "NOTRACK", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = notrack_tg, .table = "raw", .me = THIS_MODULE, diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 64d6ad38029..f7114fc5cfc 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -159,7 +159,7 @@ static void xt_rateest_tg_destroy(const struct xt_target *target, static struct xt_target xt_rateest_target[] __read_mostly = { { - .family = AF_INET, + .family = NFPROTO_IPV4, .name = "RATEEST", .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, @@ -168,7 +168,7 @@ static struct xt_target xt_rateest_target[] __read_mostly = { .me = THIS_MODULE, }, { - .family = AF_INET6, + .family = NFPROTO_IPV6, .name = "RATEEST", .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 94f87ee7552..8f8f57b93a6 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -128,7 +128,7 @@ static void secmark_tg_destroy(const struct xt_target *target, void *targinfo) static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = secmark_tg_check, .destroy = secmark_tg_destroy, .target = secmark_tg, @@ -137,7 +137,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { }, { .name = "SECMARK", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = secmark_tg_check, .destroy = secmark_tg_destroy, .target = secmark_tg, diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index beb5094703c..b868f995239 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -289,7 +289,7 @@ tcpmss_tg6_check(const char *tablename, const void *entry, static struct xt_target tcpmss_tg_reg[] __read_mostly = { { - .family = AF_INET, + .family = NFPROTO_IPV4, .name = "TCPMSS", .checkentry = tcpmss_tg4_check, .target = tcpmss_tg4, @@ -299,7 +299,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = { }, #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) { - .family = AF_INET6, + .family = NFPROTO_IPV6, .name = "TCPMSS", .checkentry = tcpmss_tg6_check, .target = tcpmss_tg6, diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 9685b6fcbc8..2e0ae6cc5d9 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -106,7 +106,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct net_device *in, static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { { .name = "TCPOPTSTRIP", - .family = AF_INET, + .family = NFPROTO_IPV4, .table = "mangle", .proto = IPPROTO_TCP, .target = tcpoptstrip_tg4, @@ -116,7 +116,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { #if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE) { .name = "TCPOPTSTRIP", - .family = AF_INET6, + .family = NFPROTO_IPV6, .table = "mangle", .proto = IPPROTO_TCP, .target = tcpoptstrip_tg6, diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index 30dab79a343..e1bcad57914 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -22,14 +22,14 @@ trace_tg(struct sk_buff *skb, const struct net_device *in, static struct xt_target trace_tg_reg[] __read_mostly = { { .name = "TRACE", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = trace_tg, .table = "raw", .me = THIS_MODULE, }, { .name = "TRACE", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = trace_tg, .table = "raw", .me = THIS_MODULE, diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c index 89f47364e84..fa211b2ab87 100644 --- a/net/netfilter/xt_comment.c +++ b/net/netfilter/xt_comment.c @@ -28,14 +28,14 @@ comment_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match comment_mt_reg[] __read_mostly = { { .name = "comment", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = comment_mt, .matchsize = sizeof(struct xt_comment_info), .me = THIS_MODULE }, { .name = "comment", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = comment_mt, .matchsize = sizeof(struct xt_comment_info), .me = THIS_MODULE diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 3e39c4fe193..d2cd22a49c9 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -130,7 +130,7 @@ connbytes_mt_destroy(const struct xt_match *match, void *matchinfo) static struct xt_match connbytes_mt_reg[] __read_mostly = { { .name = "connbytes", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connbytes_mt_check, .match = connbytes_mt, .destroy = connbytes_mt_destroy, @@ -139,7 +139,7 @@ static struct xt_match connbytes_mt_reg[] __read_mostly = { }, { .name = "connbytes", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connbytes_mt_check, .match = connbytes_mt, .destroy = connbytes_mt_destroy, diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 1655e2cf25c..d2453d182d6 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -84,7 +84,7 @@ same_source_net(const union nf_inet_addr *addr, const union nf_inet_addr *mask, const union nf_inet_addr *u3, u_int8_t family) { - if (family == AF_INET) { + if (family == NFPROTO_IPV4) { return (addr->ip & mask->ip) == (u3->ip & mask->ip); } else { union nf_inet_addr lh, rh; @@ -114,7 +114,7 @@ static int count_them(struct xt_connlimit_data *data, int matches = 0; - if (match->family == AF_INET6) + if (match->family == NFPROTO_IPV6) hash = &data->iphash[connlimit_iphash6(addr, mask)]; else hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)]; @@ -198,7 +198,7 @@ connlimit_mt(const struct sk_buff *skb, const struct net_device *in, match->family, &tuple)) goto hotdrop; - if (match->family == AF_INET6) { + if (match->family == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr)); } else { @@ -276,7 +276,7 @@ connlimit_mt_destroy(const struct xt_match *match, void *matchinfo) static struct xt_match connlimit_mt_reg[] __read_mostly = { { .name = "connlimit", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connlimit_mt_check, .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), @@ -285,7 +285,7 @@ static struct xt_match connlimit_mt_reg[] __read_mostly = { }, { .name = "connlimit", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connlimit_mt_check, .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index aaa1b96691f..0577b8ff4e1 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -140,7 +140,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = { { .name = "connmark", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connmark_mt_check_v0, .match = connmark_mt_v0, .destroy = connmark_mt_destroy, @@ -155,7 +155,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = { { .name = "connmark", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connmark_mt_check_v0, .match = connmark_mt_v0, .destroy = connmark_mt_destroy, @@ -170,7 +170,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = { { .name = "connmark", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connmark_mt_check, .match = connmark_mt, .matchsize = sizeof(struct xt_connmark_mtinfo1), @@ -180,7 +180,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = { { .name = "connmark", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connmark_mt_check, .match = connmark_mt, .matchsize = sizeof(struct xt_connmark_mtinfo1), diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 28a42a3fbff..392b457f9c2 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -121,9 +121,9 @@ conntrack_addrcmp(const union nf_inet_addr *kaddr, const union nf_inet_addr *uaddr, const union nf_inet_addr *umask, unsigned int l3proto) { - if (l3proto == AF_INET) + if (l3proto == NFPROTO_IPV4) return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; - else if (l3proto == AF_INET6) + else if (l3proto == NFPROTO_IPV6) return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, &uaddr->in6) == 0; else @@ -356,7 +356,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = conntrack_mt_v0, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, @@ -371,7 +371,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .matchsize = sizeof(struct xt_conntrack_mtinfo1), .match = conntrack_mt, .checkentry = conntrack_mt_check, @@ -381,7 +381,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .matchsize = sizeof(struct xt_conntrack_mtinfo1), .match = conntrack_mt, .checkentry = conntrack_mt_check, diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index 8b6522186d9..87971f47132 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -138,7 +138,7 @@ dccp_mt_check(const char *tablename, const void *inf, static struct xt_match dccp_mt_reg[] __read_mostly = { { .name = "dccp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = dccp_mt_check, .match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), @@ -147,7 +147,7 @@ static struct xt_match dccp_mt_reg[] __read_mostly = { }, { .name = "dccp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = dccp_mt_check, .match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c index 26f4aab9c42..7f03aa13a95 100644 --- a/net/netfilter/xt_dscp.c +++ b/net/netfilter/xt_dscp.c @@ -80,7 +80,7 @@ static bool tos_mt(const struct sk_buff *skb, const struct net_device *in, { const struct xt_tos_match_info *info = matchinfo; - if (match->family == AF_INET) + if (match->family == NFPROTO_IPV4) return ((ip_hdr(skb)->tos & info->tos_mask) == info->tos_value) ^ !!info->invert; else @@ -91,7 +91,7 @@ static bool tos_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match dscp_mt_reg[] __read_mostly = { { .name = "dscp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = dscp_mt_check, .match = dscp_mt, .matchsize = sizeof(struct xt_dscp_info), @@ -99,7 +99,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = { }, { .name = "dscp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = dscp_mt_check, .match = dscp_mt6, .matchsize = sizeof(struct xt_dscp_info), @@ -108,7 +108,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = { { .name = "tos", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = tos_mt_v0, .matchsize = sizeof(struct ipt_tos_info), .me = THIS_MODULE, @@ -116,7 +116,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = { { .name = "tos", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = tos_mt, .matchsize = sizeof(struct xt_tos_match_info), .me = THIS_MODULE, @@ -124,7 +124,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = { { .name = "tos", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = tos_mt, .matchsize = sizeof(struct xt_tos_match_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c index a133eb9b23e..045c4deecaf 100644 --- a/net/netfilter/xt_esp.c +++ b/net/netfilter/xt_esp.c @@ -88,7 +88,7 @@ esp_mt_check(const char *tablename, const void *ip_void, static struct xt_match esp_mt_reg[] __read_mostly = { { .name = "esp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = esp_mt_check, .match = esp_mt, .matchsize = sizeof(struct xt_esp), @@ -97,7 +97,7 @@ static struct xt_match esp_mt_reg[] __read_mostly = { }, { .name = "esp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = esp_mt_check, .match = esp_mt, .matchsize = sizeof(struct xt_esp), diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 0c9268fd2e1..7bae369603d 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -218,7 +218,7 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family) hinfo->cfg.gc_interval = minfo->cfg.gc_interval; hinfo->cfg.expire = minfo->cfg.expire; - if (family == AF_INET) + if (family == NFPROTO_IPV4) hinfo->cfg.srcmask = hinfo->cfg.dstmask = 32; else hinfo->cfg.srcmask = hinfo->cfg.dstmask = 128; @@ -237,11 +237,10 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family) hinfo->family = family; hinfo->rnd_initialized = 0; spin_lock_init(&hinfo->lock); - hinfo->pde = - proc_create_data(minfo->name, 0, - family == AF_INET ? hashlimit_procdir4 : - hashlimit_procdir6, - &dl_file_ops, hinfo); + hinfo->pde = proc_create_data(minfo->name, 0, + (family == NFPROTO_IPV4) ? + hashlimit_procdir4 : hashlimit_procdir6, + &dl_file_ops, hinfo); if (!hinfo->pde) { vfree(hinfo); return -1; @@ -300,11 +299,10 @@ static int htable_create(struct xt_hashlimit_mtinfo1 *minfo, u_int8_t family) hinfo->rnd_initialized = 0; spin_lock_init(&hinfo->lock); - hinfo->pde = - proc_create_data(minfo->name, 0, - family == AF_INET ? hashlimit_procdir4 : - hashlimit_procdir6, - &dl_file_ops, hinfo); + hinfo->pde = proc_create_data(minfo->name, 0, + (family == NFPROTO_IPV4) ? + hashlimit_procdir4 : hashlimit_procdir6, + &dl_file_ops, hinfo); if (hinfo->pde == NULL) { vfree(hinfo); return -1; @@ -370,7 +368,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo) /* remove proc entry */ remove_proc_entry(hinfo->pde->name, - hinfo->family == AF_INET ? hashlimit_procdir4 : + hinfo->family == NFPROTO_IPV4 ? hashlimit_procdir4 : hashlimit_procdir6); htable_selective_cleanup(hinfo, select_all); vfree(hinfo); @@ -501,7 +499,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, memset(dst, 0, sizeof(*dst)); switch (hinfo->family) { - case AF_INET: + case NFPROTO_IPV4: if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) dst->ip.dst = maskl(ip_hdr(skb)->daddr, hinfo->cfg.dstmask); @@ -515,7 +513,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, nexthdr = ip_hdr(skb)->protocol; break; #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) - case AF_INET6: + case NFPROTO_IPV6: if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) { memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr, sizeof(dst->ip6.dst)); @@ -737,7 +735,7 @@ hashlimit_mt_check(const char *tablename, const void *inf, return false; if (info->name[sizeof(info->name)-1] != '\0') return false; - if (match->family == AF_INET) { + if (match->family == NFPROTO_IPV4) { if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32) return false; } else { @@ -805,7 +803,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { { .name = "hashlimit", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = hashlimit_mt_v0, .matchsize = sizeof(struct xt_hashlimit_info), #ifdef CONFIG_COMPAT @@ -820,7 +818,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { { .name = "hashlimit", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), .checkentry = hashlimit_mt_check, @@ -830,7 +828,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) { .name = "hashlimit", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = hashlimit_mt_v0, .matchsize = sizeof(struct xt_hashlimit_info), #ifdef CONFIG_COMPAT @@ -845,7 +843,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { { .name = "hashlimit", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), .checkentry = hashlimit_mt_check, @@ -907,7 +905,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, rateinfo_recalc(ent, jiffies); switch (family) { - case AF_INET: + case NFPROTO_IPV4: return seq_printf(s, "%ld %u.%u.%u.%u:%u->" "%u.%u.%u.%u:%u %u %u %u\n", (long)(ent->expires - jiffies)/HZ, @@ -918,7 +916,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, ent->rateinfo.credit, ent->rateinfo.credit_cap, ent->rateinfo.cost); #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) - case AF_INET6: + case NFPROTO_IPV6: return seq_printf(s, "%ld " NIP6_FMT ":%u->" NIP6_FMT ":%u %u %u %u\n", (long)(ent->expires - jiffies)/HZ, diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index dada2905d66..134d94324eb 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -81,7 +81,7 @@ static void helper_mt_destroy(const struct xt_match *match, void *matchinfo) static struct xt_match helper_mt_reg[] __read_mostly = { { .name = "helper", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = helper_mt_check, .match = helper_mt, .destroy = helper_mt_destroy, @@ -90,7 +90,7 @@ static struct xt_match helper_mt_reg[] __read_mostly = { }, { .name = "helper", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = helper_mt_check, .match = helper_mt, .destroy = helper_mt_destroy, diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c index c63e9333c75..a7498cc48dc 100644 --- a/net/netfilter/xt_iprange.c +++ b/net/netfilter/xt_iprange.c @@ -141,7 +141,7 @@ static struct xt_match iprange_mt_reg[] __read_mostly = { { .name = "iprange", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = iprange_mt_v0, .matchsize = sizeof(struct ipt_iprange_info), .me = THIS_MODULE, @@ -149,7 +149,7 @@ static struct xt_match iprange_mt_reg[] __read_mostly = { { .name = "iprange", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = iprange_mt4, .matchsize = sizeof(struct xt_iprange_mtinfo), .me = THIS_MODULE, @@ -157,7 +157,7 @@ static struct xt_match iprange_mt_reg[] __read_mostly = { { .name = "iprange", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = iprange_mt6, .matchsize = sizeof(struct xt_iprange_mtinfo), .me = THIS_MODULE, diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index b8640f97295..b8612d1914b 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -48,14 +48,14 @@ length_mt6(const struct sk_buff *skb, const struct net_device *in, static struct xt_match length_mt_reg[] __read_mostly = { { .name = "length", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = length_mt, .matchsize = sizeof(struct xt_length_info), .me = THIS_MODULE, }, { .name = "length", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = length_mt6, .matchsize = sizeof(struct xt_length_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index aad9ab8d204..584d66893c4 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -170,7 +170,7 @@ static int limit_mt_compat_to_user(void __user *dst, void *src) static struct xt_match limit_mt_reg[] __read_mostly = { { .name = "limit", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = limit_mt_check, .match = limit_mt, .matchsize = sizeof(struct xt_rateinfo), @@ -183,7 +183,7 @@ static struct xt_match limit_mt_reg[] __read_mostly = { }, { .name = "limit", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = limit_mt_check, .match = limit_mt, .matchsize = sizeof(struct xt_rateinfo), diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index b3e96a0ec17..60db240098a 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -42,7 +42,7 @@ mac_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match mac_mt_reg[] __read_mostly = { { .name = "mac", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = mac_mt, .matchsize = sizeof(struct xt_mac_info), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -52,7 +52,7 @@ static struct xt_match mac_mt_reg[] __read_mostly = { }, { .name = "mac", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = mac_mt, .matchsize = sizeof(struct xt_mac_info), .hooks = (1 << NF_INET_PRE_ROUTING) | diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 9f78f6120fb..c66affd5722 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -92,7 +92,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = { { .name = "mark", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = mark_mt_check_v0, .match = mark_mt_v0, .matchsize = sizeof(struct xt_mark_info), @@ -106,7 +106,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = { { .name = "mark", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = mark_mt_check_v0, .match = mark_mt_v0, .matchsize = sizeof(struct xt_mark_info), @@ -120,7 +120,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = { { .name = "mark", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = mark_mt, .matchsize = sizeof(struct xt_mark_mtinfo1), .me = THIS_MODULE, @@ -128,7 +128,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = { { .name = "mark", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = mark_mt, .matchsize = sizeof(struct xt_mark_mtinfo1), .me = THIS_MODULE, diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index fd88c489b70..f6fe008ab8c 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -216,7 +216,7 @@ multiport_mt6_check(const char *tablename, const void *info, static struct xt_match multiport_mt_reg[] __read_mostly = { { .name = "multiport", - .family = AF_INET, + .family = NFPROTO_IPV4, .revision = 0, .checkentry = multiport_mt_check_v0, .match = multiport_mt_v0, @@ -225,7 +225,7 @@ static struct xt_match multiport_mt_reg[] __read_mostly = { }, { .name = "multiport", - .family = AF_INET, + .family = NFPROTO_IPV4, .revision = 1, .checkentry = multiport_mt_check, .match = multiport_mt, @@ -234,7 +234,7 @@ static struct xt_match multiport_mt_reg[] __read_mostly = { }, { .name = "multiport", - .family = AF_INET6, + .family = NFPROTO_IPV6, .revision = 0, .checkentry = multiport_mt6_check_v0, .match = multiport_mt_v0, @@ -243,7 +243,7 @@ static struct xt_match multiport_mt_reg[] __read_mostly = { }, { .name = "multiport", - .family = AF_INET6, + .family = NFPROTO_IPV6, .revision = 1, .checkentry = multiport_mt6_check, .match = multiport_mt, diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 9059c16144c..d1c3b7ae9b4 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -153,7 +153,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = { { .name = "owner", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = owner_mt_v0, .matchsize = sizeof(struct ipt_owner_info), .checkentry = owner_mt_check_v0, @@ -164,7 +164,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = { { .name = "owner", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = owner_mt6_v0, .matchsize = sizeof(struct ip6t_owner_info), .checkentry = owner_mt6_check_v0, @@ -175,7 +175,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = { { .name = "owner", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = owner_mt, .matchsize = sizeof(struct xt_owner_match_info), .hooks = (1 << NF_INET_LOCAL_OUT) | @@ -185,7 +185,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = { { .name = "owner", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = owner_mt, .matchsize = sizeof(struct xt_owner_match_info), .hooks = (1 << NF_INET_LOCAL_OUT) | diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 4ec1094bda9..72a0bdd53fa 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -121,7 +121,7 @@ physdev_mt_check(const char *tablename, const void *ip, static struct xt_match physdev_mt_reg[] __read_mostly = { { .name = "physdev", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), @@ -129,7 +129,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = { }, { .name = "physdev", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c index 7936f7e2325..81e86d319a8 100644 --- a/net/netfilter/xt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -33,10 +33,10 @@ pkttype_mt(const struct sk_buff *skb, const struct net_device *in, if (skb->pkt_type != PACKET_LOOPBACK) type = skb->pkt_type; - else if (match->family == AF_INET && + else if (match->family == NFPROTO_IPV4 && ipv4_is_multicast(ip_hdr(skb)->daddr)) type = PACKET_MULTICAST; - else if (match->family == AF_INET6 && + else if (match->family == NFPROTO_IPV6 && ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) type = PACKET_MULTICAST; else @@ -48,14 +48,14 @@ pkttype_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match pkttype_mt_reg[] __read_mostly = { { .name = "pkttype", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = pkttype_mt, .matchsize = sizeof(struct xt_pkttype_info), .me = THIS_MODULE, }, { .name = "pkttype", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = pkttype_mt, .matchsize = sizeof(struct xt_pkttype_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index d351582b2a3..f1d514e9d0a 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -26,9 +26,9 @@ xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m, const union nf_inet_addr *a2, unsigned short family) { switch (family) { - case AF_INET: + case NFPROTO_IPV4: return ((a1->ip ^ a2->ip) & m->ip) == 0; - case AF_INET6: + case NFPROTO_IPV6: return ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6) == 0; } return false; @@ -165,7 +165,7 @@ policy_mt_check(const char *tablename, const void *ip_void, static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), @@ -173,7 +173,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = { }, { .name = "policy", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index 3b021d0c522..59f61e32b62 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -57,7 +57,7 @@ quota_mt_check(const char *tablename, const void *entry, static struct xt_match quota_mt_reg[] __read_mostly = { { .name = "quota", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = quota_mt_check, .match = quota_mt, .matchsize = sizeof(struct xt_quota_info), @@ -65,7 +65,7 @@ static struct xt_match quota_mt_reg[] __read_mostly = { }, { .name = "quota", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = quota_mt_check, .match = quota_mt, .matchsize = sizeof(struct xt_quota_info), diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index ebd84f1b4f6..ba1cb5760f4 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -139,7 +139,7 @@ static void xt_rateest_mt_destroy(const struct xt_match *match, static struct xt_match xt_rateest_match[] __read_mostly = { { - .family = AF_INET, + .family = NFPROTO_IPV4, .name = "rateest", .match = xt_rateest_mt, .checkentry = xt_rateest_mt_checkentry, @@ -148,7 +148,7 @@ static struct xt_match xt_rateest_match[] __read_mostly = { .me = THIS_MODULE, }, { - .family = AF_INET6, + .family = NFPROTO_IPV6, .name = "rateest", .match = xt_rateest_mt, .checkentry = xt_rateest_mt_checkentry, diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c index 7df1627c536..ef65756d489 100644 --- a/net/netfilter/xt_realm.c +++ b/net/netfilter/xt_realm.c @@ -39,7 +39,7 @@ static struct xt_match realm_mt_reg __read_mostly = { .matchsize = sizeof(struct xt_realm_info), .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), - .family = AF_INET, + .family = NFPROTO_IPV4, .me = THIS_MODULE }; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index adc2e2f1b09..4a916e2624d 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -124,7 +124,7 @@ recent_entry_lookup(const struct recent_table *table, struct recent_entry *e; unsigned int h; - if (family == AF_INET) + if (family == NFPROTO_IPV4) h = recent_entry_hash4(addrp); else h = recent_entry_hash6(addrp); @@ -165,7 +165,7 @@ recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr, e->nstamps = 1; e->index = 1; e->family = family; - if (family == AF_INET) + if (family == NFPROTO_IPV4) list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]); else list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]); @@ -216,7 +216,7 @@ recent_mt(const struct sk_buff *skb, const struct net_device *in, u_int8_t ttl; bool ret = info->invert; - if (match->family == AF_INET) { + if (match->family == NFPROTO_IPV4) { const struct iphdr *iph = ip_hdr(skb); if (info->side == XT_RECENT_DEST) @@ -429,7 +429,7 @@ static int recent_seq_show(struct seq_file *seq, void *v) unsigned int i; i = (e->index - 1) % ip_pkt_list_tot; - if (e->family == AF_INET) + if (e->family == NFPROTO_IPV4) seq_printf(seq, "src=" NIPQUAD_FMT " ttl: %u last_seen: %lu " "oldest_pkt: %u", NIPQUAD(e->addr.ip), e->ttl, e->stamps[i], e->index); @@ -519,10 +519,11 @@ static ssize_t recent_old_proc_write(struct file *file, addr = in_aton(c); spin_lock_bh(&recent_lock); - e = recent_entry_lookup(t, (const void *)&addr, PF_INET, 0); + e = recent_entry_lookup(t, (const void *)&addr, NFPROTO_IPV4, 0); if (e == NULL) { if (add) - recent_entry_init(t, (const void *)&addr, PF_INET, 0); + recent_entry_init(t, (const void *)&addr, + NFPROTO_IPV4, 0); } else { if (add) recent_entry_update(t, e); @@ -585,10 +586,10 @@ recent_mt_proc_write(struct file *file, const char __user *input, ++c; --size; if (strnchr(c, size, ':') != NULL) { - family = AF_INET6; + family = NFPROTO_IPV6; succ = in6_pton(c, size, (void *)&addr, '\n', NULL); } else { - family = AF_INET; + family = NFPROTO_IPV4; succ = in4_pton(c, size, (void *)&addr, '\n', NULL); } @@ -628,7 +629,7 @@ static struct xt_match recent_mt_reg[] __read_mostly = { { .name = "recent", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = recent_mt, .matchsize = sizeof(struct xt_recent_mtinfo), .checkentry = recent_mt_check, @@ -638,7 +639,7 @@ static struct xt_match recent_mt_reg[] __read_mostly = { { .name = "recent", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = recent_mt, .matchsize = sizeof(struct xt_recent_mtinfo), .checkentry = recent_mt_check, diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index e6e4681fa04..ab67aca4d8f 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -169,7 +169,7 @@ sctp_mt_check(const char *tablename, const void *inf, static struct xt_match sctp_mt_reg[] __read_mostly = { { .name = "sctp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = sctp_mt_check, .match = sctp_mt, .matchsize = sizeof(struct xt_sctp_info), @@ -178,7 +178,7 @@ static struct xt_match sctp_mt_reg[] __read_mostly = { }, { .name = "sctp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = sctp_mt_check, .match = sctp_mt, .matchsize = sizeof(struct xt_sctp_info), diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index a776dc36a19..f92f8bcc1e3 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -61,7 +61,7 @@ static void state_mt_destroy(const struct xt_match *match, void *matchinfo) static struct xt_match state_mt_reg[] __read_mostly = { { .name = "state", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = state_mt_check, .match = state_mt, .destroy = state_mt_destroy, @@ -70,7 +70,7 @@ static struct xt_match state_mt_reg[] __read_mostly = { }, { .name = "state", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = state_mt_check, .match = state_mt, .destroy = state_mt_destroy, diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 43133080da7..fd3bb1400df 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -69,7 +69,7 @@ statistic_mt_check(const char *tablename, const void *entry, static struct xt_match statistic_mt_reg[] __read_mostly = { { .name = "statistic", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = statistic_mt_check, .match = statistic_mt, .matchsize = sizeof(struct xt_statistic_info), @@ -77,7 +77,7 @@ static struct xt_match statistic_mt_reg[] __read_mostly = { }, { .name = "statistic", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = statistic_mt_check, .match = statistic_mt, .matchsize = sizeof(struct xt_statistic_info), diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 4903182a062..50169718377 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -85,7 +85,7 @@ static struct xt_match string_mt_reg[] __read_mostly = { { .name = "string", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, @@ -95,7 +95,7 @@ static struct xt_match string_mt_reg[] __read_mostly = { { .name = "string", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, @@ -105,7 +105,7 @@ static struct xt_match string_mt_reg[] __read_mostly = { { .name = "string", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, @@ -115,7 +115,7 @@ static struct xt_match string_mt_reg[] __read_mostly = { { .name = "string", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 6771bf01275..4791c7cbe5a 100644 --- a/net/netfilter/xt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -83,7 +83,7 @@ dropit: static struct xt_match tcpmss_mt_reg[] __read_mostly = { { .name = "tcpmss", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = tcpmss_mt, .matchsize = sizeof(struct xt_tcpmss_match_info), .proto = IPPROTO_TCP, @@ -91,7 +91,7 @@ static struct xt_match tcpmss_mt_reg[] __read_mostly = { }, { .name = "tcpmss", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = tcpmss_mt, .matchsize = sizeof(struct xt_tcpmss_match_info), .proto = IPPROTO_TCP, diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index 951b06b8d70..5a6268cbb9f 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -186,7 +186,7 @@ udp_mt_check(const char *tablename, const void *info, static struct xt_match tcpudp_mt_reg[] __read_mostly = { { .name = "tcp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), @@ -195,7 +195,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = { }, { .name = "tcp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), @@ -204,7 +204,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = { }, { .name = "udp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), @@ -213,7 +213,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = { }, { .name = "udp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), @@ -222,7 +222,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = { }, { .name = "udplite", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), @@ -231,7 +231,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = { }, { .name = "udplite", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 307a2c3c2df..fe9dae2b4f5 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -240,7 +240,7 @@ time_mt_check(const char *tablename, const void *ip, static struct xt_match time_mt_reg[] __read_mostly = { { .name = "time", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = time_mt, .matchsize = sizeof(struct xt_time_info), .checkentry = time_mt_check, @@ -248,7 +248,7 @@ static struct xt_match time_mt_reg[] __read_mostly = { }, { .name = "time", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = time_mt, .matchsize = sizeof(struct xt_time_info), .checkentry = time_mt_check, diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c index 627e0f336d5..ed9f8340611 100644 --- a/net/netfilter/xt_u32.c +++ b/net/netfilter/xt_u32.c @@ -102,14 +102,14 @@ u32_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match u32_mt_reg[] __read_mostly = { { .name = "u32", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = u32_mt, .matchsize = sizeof(struct xt_u32), .me = THIS_MODULE, }, { .name = "u32", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = u32_mt, .matchsize = sizeof(struct xt_u32), .me = THIS_MODULE, -- cgit v1.2.3-70-g09d2 From 48dc7865aa3db9404aedc8677d9daf8f8f469ab0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:01 +0200 Subject: netfilter: netns: remove nf_*_net() wrappers Now that dev_net() exists, the usefullness of them is even less. Also they're a big problem in resolving circular header dependencies necessary for NOTRACK-in-netns patch. See below. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/linux/netfilter.h | 53 ---------------------------------- net/ipv4/netfilter/iptable_filter.c | 6 ++-- net/ipv4/netfilter/iptable_mangle.c | 10 +++---- net/ipv4/netfilter/iptable_raw.c | 4 +-- net/ipv4/netfilter/iptable_security.c | 6 ++-- net/ipv6/netfilter/ip6table_filter.c | 6 ++-- net/ipv6/netfilter/ip6table_security.c | 6 ++-- 7 files changed, 19 insertions(+), 72 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index bf3afb0844f..48cfe51bfdd 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -5,13 +5,11 @@ #include #include #include -#include #include #include #include #include #include -#include #endif #include #include @@ -355,56 +353,5 @@ extern void (*nf_ct_destroy)(struct nf_conntrack *); static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif -static inline struct net *nf_pre_routing_net(const struct net_device *in, - const struct net_device *out) -{ -#ifdef CONFIG_NET_NS - return in->nd_net; -#else - return &init_net; -#endif -} - -static inline struct net *nf_local_in_net(const struct net_device *in, - const struct net_device *out) -{ -#ifdef CONFIG_NET_NS - return in->nd_net; -#else - return &init_net; -#endif -} - -static inline struct net *nf_forward_net(const struct net_device *in, - const struct net_device *out) -{ -#ifdef CONFIG_NET_NS - BUG_ON(in->nd_net != out->nd_net); - return in->nd_net; -#else - return &init_net; -#endif -} - -static inline struct net *nf_local_out_net(const struct net_device *in, - const struct net_device *out) -{ -#ifdef CONFIG_NET_NS - return out->nd_net; -#else - return &init_net; -#endif -} - -static inline struct net *nf_post_routing_net(const struct net_device *in, - const struct net_device *out) -{ -#ifdef CONFIG_NET_NS - return out->nd_net; -#else - return &init_net; -#endif -} - #endif /*__KERNEL__*/ #endif /*__LINUX_NETFILTER_H*/ diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 1ea677dcf84..c9224310eba 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -70,7 +70,7 @@ ipt_local_in_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_local_in_net(in, out)->ipv4.iptable_filter); + dev_net(in)->ipv4.iptable_filter); } static unsigned int @@ -81,7 +81,7 @@ ipt_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_forward_net(in, out)->ipv4.iptable_filter); + dev_net(in)->ipv4.iptable_filter); } static unsigned int @@ -101,7 +101,7 @@ ipt_local_out_hook(unsigned int hook, } return ipt_do_table(skb, hook, in, out, - nf_local_out_net(in, out)->ipv4.iptable_filter); + dev_net(out)->ipv4.iptable_filter); } static struct nf_hook_ops ipt_ops[] __read_mostly = { diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index da59182f222..69f2c428714 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -81,7 +81,7 @@ ipt_pre_routing_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_pre_routing_net(in, out)->ipv4.iptable_mangle); + dev_net(in)->ipv4.iptable_mangle); } static unsigned int @@ -92,7 +92,7 @@ ipt_post_routing_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_post_routing_net(in, out)->ipv4.iptable_mangle); + dev_net(out)->ipv4.iptable_mangle); } static unsigned int @@ -103,7 +103,7 @@ ipt_local_in_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_local_in_net(in, out)->ipv4.iptable_mangle); + dev_net(in)->ipv4.iptable_mangle); } static unsigned int @@ -114,7 +114,7 @@ ipt_forward_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_forward_net(in, out)->ipv4.iptable_mangle); + dev_net(in)->ipv4.iptable_mangle); } static unsigned int @@ -147,7 +147,7 @@ ipt_local_hook(unsigned int hook, tos = iph->tos; ret = ipt_do_table(skb, hook, in, out, - nf_local_out_net(in, out)->ipv4.iptable_mangle); + dev_net(out)->ipv4.iptable_mangle); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { iph = ip_hdr(skb); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index fddce7754b7..8faebfe638f 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -53,7 +53,7 @@ ipt_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_pre_routing_net(in, out)->ipv4.iptable_raw); + dev_net(in)->ipv4.iptable_raw); } static unsigned int @@ -72,7 +72,7 @@ ipt_local_hook(unsigned int hook, return NF_ACCEPT; } return ipt_do_table(skb, hook, in, out, - nf_local_out_net(in, out)->ipv4.iptable_raw); + dev_net(out)->ipv4.iptable_raw); } /* 'raw' is the very first table. */ diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index db6d312128e..36f3be3cc42 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -73,7 +73,7 @@ ipt_local_in_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_local_in_net(in, out)->ipv4.iptable_security); + dev_net(in)->ipv4.iptable_security); } static unsigned int @@ -84,7 +84,7 @@ ipt_forward_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ipt_do_table(skb, hook, in, out, - nf_forward_net(in, out)->ipv4.iptable_security); + dev_net(in)->ipv4.iptable_security); } static unsigned int @@ -103,7 +103,7 @@ ipt_local_out_hook(unsigned int hook, return NF_ACCEPT; } return ipt_do_table(skb, hook, in, out, - nf_local_out_net(in, out)->ipv4.iptable_security); + dev_net(out)->ipv4.iptable_security); } static struct nf_hook_ops ipt_ops[] __read_mostly = { diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 55a2c290bad..b110a8a85a1 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -68,7 +68,7 @@ ip6t_local_in_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ip6t_do_table(skb, hook, in, out, - nf_local_in_net(in, out)->ipv6.ip6table_filter); + dev_net(in)->ipv6.ip6table_filter); } static unsigned int @@ -79,7 +79,7 @@ ip6t_forward_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ip6t_do_table(skb, hook, in, out, - nf_forward_net(in, out)->ipv6.ip6table_filter); + dev_net(in)->ipv6.ip6table_filter); } static unsigned int @@ -100,7 +100,7 @@ ip6t_local_out_hook(unsigned int hook, #endif return ip6t_do_table(skb, hook, in, out, - nf_local_out_net(in, out)->ipv6.ip6table_filter); + dev_net(out)->ipv6.ip6table_filter); } static struct nf_hook_ops ip6t_ops[] __read_mostly = { diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 6e7131036bc..20bc52f13e4 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -72,7 +72,7 @@ ip6t_local_in_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ip6t_do_table(skb, hook, in, out, - nf_local_in_net(in, out)->ipv6.ip6table_security); + dev_net(in)->ipv6.ip6table_security); } static unsigned int @@ -83,7 +83,7 @@ ip6t_forward_hook(unsigned int hook, int (*okfn)(struct sk_buff *)) { return ip6t_do_table(skb, hook, in, out, - nf_forward_net(in, out)->ipv6.ip6table_security); + dev_net(in)->ipv6.ip6table_security); } static unsigned int @@ -95,7 +95,7 @@ ip6t_local_out_hook(unsigned int hook, { /* TBD: handle short packets via raw socket */ return ip6t_do_table(skb, hook, in, out, - nf_local_out_net(in, out)->ipv6.ip6table_security); + dev_net(out)->ipv6.ip6table_security); } static struct nf_hook_ops ip6t_ops[] __read_mostly = { -- cgit v1.2.3-70-g09d2 From 49ac8713b6d064adf7474080fdccebd7cce76be0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:03 +0200 Subject: netfilter: netns nf_conntrack: per-netns conntrack count Sysctls and proc files are stubbed to init_net's one. This is temporary. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack.h | 1 - include/net/netns/conntrack.h | 3 +++ net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 2 +- net/netfilter/nf_conntrack_core.c | 18 ++++++++---------- net/netfilter/nf_conntrack_standalone.c | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 2b8d6efecf3..5999c5313d0 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -288,7 +288,6 @@ static inline int nf_ct_is_untracked(const struct sk_buff *skb) extern int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); extern unsigned int nf_conntrack_htable_size; extern int nf_conntrack_checksum; -extern atomic_t nf_conntrack_count; extern int nf_conntrack_max; DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 82d80b83477..edf84714d7c 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -1,6 +1,9 @@ #ifndef __NETNS_CONNTRACK_H #define __NETNS_CONNTRACK_H +#include + struct netns_ct { + atomic_t count; }; #endif diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5a955c44036..31abee3e29f 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -254,7 +254,7 @@ static ctl_table ip_ct_sysctl_table[] = { { .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, .procname = "ip_conntrack_count", - .data = &nf_conntrack_count, + .data = &init_net.ct.count, .maxlen = sizeof(int), .mode = 0444, .proc_handler = &proc_dointvec, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 3a020720e40..4556805027f 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -314,7 +314,7 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); + unsigned int nr_conntracks = atomic_read(&init_net.ct.count); const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index cefc338f6e5..8299b3490e7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -44,10 +44,6 @@ DEFINE_SPINLOCK(nf_conntrack_lock); EXPORT_SYMBOL_GPL(nf_conntrack_lock); -/* nf_conntrack_standalone needs this */ -atomic_t nf_conntrack_count = ATOMIC_INIT(0); -EXPORT_SYMBOL_GPL(nf_conntrack_count); - unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); @@ -477,13 +473,13 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, } /* We don't want any race condition at early drop stage */ - atomic_inc(&nf_conntrack_count); + atomic_inc(&net->ct.count); if (nf_conntrack_max && - unlikely(atomic_read(&nf_conntrack_count) > nf_conntrack_max)) { + unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { unsigned int hash = hash_conntrack(orig); if (!early_drop(hash)) { - atomic_dec(&nf_conntrack_count); + atomic_dec(&net->ct.count); if (net_ratelimit()) printk(KERN_WARNING "nf_conntrack: table full, dropping" @@ -495,7 +491,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, ct = kmem_cache_zalloc(nf_conntrack_cachep, gfp); if (ct == NULL) { pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n"); - atomic_dec(&nf_conntrack_count); + atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); } @@ -516,10 +512,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc); static void nf_conntrack_free_rcu(struct rcu_head *head) { struct nf_conn *ct = container_of(head, struct nf_conn, rcu); + struct net *net = nf_ct_net(ct); nf_ct_ext_free(ct); kmem_cache_free(nf_conntrack_cachep, ct); - atomic_dec(&nf_conntrack_count); + atomic_dec(&net->ct.count); } void nf_conntrack_free(struct nf_conn *ct) @@ -1024,7 +1021,7 @@ void nf_conntrack_cleanup(struct net *net) nf_ct_event_cache_flush(); i_see_dead_people: nf_conntrack_flush(); - if (atomic_read(&nf_conntrack_count) != 0) { + if (atomic_read(&net->ct.count) != 0) { schedule(); goto i_see_dead_people; } @@ -1148,6 +1145,7 @@ int nf_conntrack_init(struct net *net) * entries. */ max_factor = 4; } + atomic_set(&net->ct.count, 0); nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, &nf_conntrack_vmalloc); if (!nf_conntrack_hash) { diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 81dec17196d..021b505907d 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -226,7 +226,7 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); + unsigned int nr_conntracks = atomic_read(&init_net.ct.count); const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { @@ -338,7 +338,7 @@ static ctl_table nf_ct_sysctl_table[] = { { .ctl_name = NET_NF_CONNTRACK_COUNT, .procname = "nf_conntrack_count", - .data = &nf_conntrack_count, + .data = &init_net.ct.count, .maxlen = sizeof(int), .mode = 0444, .proc_handler = &proc_dointvec, -- cgit v1.2.3-70-g09d2 From 400dad39d1c33fe797e47326d87a3f54d0ac5181 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:03 +0200 Subject: netfilter: netns nf_conntrack: per-netns conntrack hash * make per-netns conntrack hash Other solution is to add ->ct_net pointer to tuplehashes and still has one hash, I tried that it's ugly and requires more code deep down in protocol modules et al. * propagate netns pointer to where needed, e. g. to conntrack iterators. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack.h | 6 +- include/net/netfilter/nf_conntrack_core.h | 3 +- include/net/netns/conntrack.h | 2 + net/ipv4/netfilter/ipt_MASQUERADE.c | 3 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- .../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 4 +- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2 +- net/ipv4/netfilter/nf_nat_core.c | 2 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +- net/netfilter/nf_conntrack_core.c | 74 +++++++++++----------- net/netfilter/nf_conntrack_helper.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 16 ++--- net/netfilter/nf_conntrack_pptp.c | 2 +- net/netfilter/nf_conntrack_proto.c | 4 +- net/netfilter/nf_conntrack_standalone.c | 4 +- net/netfilter/xt_connlimit.c | 2 +- 16 files changed, 67 insertions(+), 63 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 5999c5313d0..f5447f14304 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -195,11 +195,11 @@ extern void nf_ct_free_hashtable(struct hlist_head *hash, int vmalloced, unsigned int size); extern struct nf_conntrack_tuple_hash * -__nf_conntrack_find(const struct nf_conntrack_tuple *tuple); +__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple); extern void nf_conntrack_hash_insert(struct nf_conn *ct); -extern void nf_conntrack_flush(void); +extern void nf_conntrack_flush(struct net *net); extern bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, @@ -261,7 +261,7 @@ extern struct nf_conn nf_conntrack_untracked; /* Iterate over all conntracks: if iter returns true, it's deleted. */ extern void -nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data); +nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data); extern void nf_conntrack_free(struct nf_conn *ct); extern struct nf_conn * nf_conntrack_alloc(struct net *net, diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 532aa200cbc..1c373564396 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -48,7 +48,7 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, /* Find a connection corresponding to a tuple. */ extern struct nf_conntrack_tuple_hash * -nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple); +nf_conntrack_find_get(struct net *net, const struct nf_conntrack_tuple *tuple); extern int __nf_conntrack_confirm(struct sk_buff *skb); @@ -71,7 +71,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *proto); -extern struct hlist_head *nf_conntrack_hash; extern spinlock_t nf_conntrack_lock ; extern struct hlist_head unconfirmed; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index edf84714d7c..b767683f112 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -5,5 +5,7 @@ struct netns_ct { atomic_t count; + struct hlist_head *hash; + int hash_vmalloc; }; #endif diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 9a4822f8243..5e1c81791e5 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -129,7 +129,8 @@ static int masq_device_event(struct notifier_block *this, and forget them. */ NF_CT_ASSERT(dev->ifindex != 0); - nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + nf_ct_iterate_cleanup(&init_net, device_cmp, + (void *)(long)dev->ifindex); } return NOTIFY_DONE; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 31abee3e29f..03dd108015c 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -323,7 +323,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; } - h = nf_conntrack_find_get(&tuple); + h = nf_conntrack_find_get(sock_net(sk), &tuple); if (h) { struct sockaddr_in sin; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 4556805027f..8e0afdc2b13 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -32,7 +32,7 @@ static struct hlist_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(nf_conntrack_hash[st->bucket].first); + n = rcu_dereference(init_net.ct.hash[st->bucket].first); if (n) return n; } @@ -48,7 +48,7 @@ static struct hlist_node *ct_get_next(struct seq_file *seq, while (head == NULL) { if (++st->bucket >= nf_conntrack_htable_size) return NULL; - head = rcu_dereference(nf_conntrack_hash[st->bucket].first); + head = rcu_dereference(init_net.ct.hash[st->bucket].first); } return head; } diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index da8edcdaef3..daf346377b6 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -155,7 +155,7 @@ icmp_error_message(struct sk_buff *skb, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(&innertuple); + h = nf_conntrack_find_get(&init_net, &innertuple); if (!h) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 6c6a3cba8d5..5d4a5b70da2 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -643,7 +643,7 @@ static int clean_nat(struct nf_conn *i, void *data) static void __exit nf_nat_cleanup(void) { - nf_ct_iterate_cleanup(&clean_nat, NULL); + nf_ct_iterate_cleanup(&init_net, &clean_nat, NULL); synchronize_rcu(); nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size); nf_ct_l3proto_put(l3proto); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 5756f30ebc6..548cf4f15c0 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -156,7 +156,7 @@ icmpv6_error_message(struct sk_buff *skb, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(&intuple); + h = nf_conntrack_find_get(&init_net, &intuple); if (!h) { pr_debug("icmpv6_error: no match\n"); return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8299b3490e7..da56b260552 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -50,15 +50,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); -struct hlist_head *nf_conntrack_hash __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_hash); - struct nf_conn nf_conntrack_untracked __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_untracked); unsigned int nf_ct_log_invalid __read_mostly; HLIST_HEAD(unconfirmed); -static int nf_conntrack_vmalloc __read_mostly; static struct kmem_cache *nf_conntrack_cachep __read_mostly; DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); @@ -242,7 +238,7 @@ static void death_by_timeout(unsigned long ul_conntrack) } struct nf_conntrack_tuple_hash * -__nf_conntrack_find(const struct nf_conntrack_tuple *tuple) +__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_tuple_hash *h; struct hlist_node *n; @@ -252,7 +248,7 @@ __nf_conntrack_find(const struct nf_conntrack_tuple *tuple) * at least once for the stats anyway. */ local_bh_disable(); - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) { + hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { if (nf_ct_tuple_equal(tuple, &h->tuple)) { NF_CT_STAT_INC(found); local_bh_enable(); @@ -268,13 +264,13 @@ EXPORT_SYMBOL_GPL(__nf_conntrack_find); /* Find a connection corresponding to a tuple. */ struct nf_conntrack_tuple_hash * -nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple) +nf_conntrack_find_get(struct net *net, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; rcu_read_lock(); - h = __nf_conntrack_find(tuple); + h = __nf_conntrack_find(net, tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) @@ -290,10 +286,12 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, unsigned int repl_hash) { + struct net *net = nf_ct_net(ct); + hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, - &nf_conntrack_hash[hash]); + &net->ct.hash[hash]); hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode, - &nf_conntrack_hash[repl_hash]); + &net->ct.hash[repl_hash]); } void nf_conntrack_hash_insert(struct nf_conn *ct) @@ -319,8 +317,10 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct nf_conn_help *help; struct hlist_node *n; enum ip_conntrack_info ctinfo; + struct net *net; ct = nf_ct_get(skb, &ctinfo); + net = nf_ct_net(ct); /* ipt_REJECT uses nf_conntrack_attach to attach related ICMP/TCP RST packets in other direction. Actual packet @@ -347,11 +347,11 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) + hlist_for_each_entry(h, n, &net->ct.hash[hash], hnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple)) goto out; - hlist_for_each_entry(h, n, &nf_conntrack_hash[repl_hash], hnode) + hlist_for_each_entry(h, n, &net->ct.hash[repl_hash], hnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple)) goto out; @@ -394,6 +394,7 @@ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { + struct net *net = nf_ct_net(ignored_conntrack); struct nf_conntrack_tuple_hash *h; struct hlist_node *n; unsigned int hash = hash_conntrack(tuple); @@ -402,7 +403,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, * least once for the stats anyway. */ rcu_read_lock_bh(); - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) { + hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple)) { NF_CT_STAT_INC(found); @@ -421,7 +422,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ -static noinline int early_drop(unsigned int hash) +static noinline int early_drop(struct net *net, unsigned int hash) { /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; @@ -432,7 +433,7 @@ static noinline int early_drop(unsigned int hash) rcu_read_lock(); for (i = 0; i < nf_conntrack_htable_size; i++) { - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], + hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { tmp = nf_ct_tuplehash_to_ctrack(h); if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) @@ -478,7 +479,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, if (nf_conntrack_max && unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { unsigned int hash = hash_conntrack(orig); - if (!early_drop(hash)) { + if (!early_drop(net, hash)) { atomic_dec(&net->ct.count); if (net_ratelimit()) printk(KERN_WARNING @@ -631,7 +632,7 @@ resolve_normal_ct(struct sk_buff *skb, } /* look for tuple match */ - h = nf_conntrack_find_get(&tuple); + h = nf_conntrack_find_get(&init_net, &tuple); if (!h) { h = init_conntrack(&init_net, &tuple, l3proto, l4proto, skb, dataoff); @@ -941,7 +942,7 @@ static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) /* Bring out ya dead! */ static struct nf_conn * -get_next_corpse(int (*iter)(struct nf_conn *i, void *data), +get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, unsigned int *bucket) { struct nf_conntrack_tuple_hash *h; @@ -950,7 +951,7 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), spin_lock_bh(&nf_conntrack_lock); for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { - hlist_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnode) { + hlist_for_each_entry(h, n, &net->ct.hash[*bucket], hnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) goto found; @@ -969,13 +970,14 @@ found: return ct; } -void -nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data) +void nf_ct_iterate_cleanup(struct net *net, + int (*iter)(struct nf_conn *i, void *data), + void *data) { struct nf_conn *ct; unsigned int bucket = 0; - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) death_by_timeout((unsigned long)ct); @@ -1001,9 +1003,9 @@ void nf_ct_free_hashtable(struct hlist_head *hash, int vmalloced, unsigned int s } EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -void nf_conntrack_flush(void) +void nf_conntrack_flush(struct net *net) { - nf_ct_iterate_cleanup(kill_all, NULL); + nf_ct_iterate_cleanup(net, kill_all, NULL); } EXPORT_SYMBOL_GPL(nf_conntrack_flush); @@ -1020,7 +1022,7 @@ void nf_conntrack_cleanup(struct net *net) nf_ct_event_cache_flush(); i_see_dead_people: - nf_conntrack_flush(); + nf_conntrack_flush(net); if (atomic_read(&net->ct.count) != 0) { schedule(); goto i_see_dead_people; @@ -1032,7 +1034,7 @@ void nf_conntrack_cleanup(struct net *net) rcu_assign_pointer(nf_ct_destroy, NULL); kmem_cache_destroy(nf_conntrack_cachep); - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); nf_conntrack_acct_fini(); @@ -1097,8 +1099,8 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) */ spin_lock_bh(&nf_conntrack_lock); for (i = 0; i < nf_conntrack_htable_size; i++) { - while (!hlist_empty(&nf_conntrack_hash[i])) { - h = hlist_entry(nf_conntrack_hash[i].first, + while (!hlist_empty(&init_net.ct.hash[i])) { + h = hlist_entry(init_net.ct.hash[i].first, struct nf_conntrack_tuple_hash, hnode); hlist_del_rcu(&h->hnode); bucket = __hash_conntrack(&h->tuple, hashsize, rnd); @@ -1106,12 +1108,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) } } old_size = nf_conntrack_htable_size; - old_vmalloced = nf_conntrack_vmalloc; - old_hash = nf_conntrack_hash; + old_vmalloced = init_net.ct.hash_vmalloc; + old_hash = init_net.ct.hash; nf_conntrack_htable_size = hashsize; - nf_conntrack_vmalloc = vmalloced; - nf_conntrack_hash = hash; + init_net.ct.hash_vmalloc = vmalloced; + init_net.ct.hash = hash; nf_conntrack_hash_rnd = rnd; spin_unlock_bh(&nf_conntrack_lock); @@ -1146,9 +1148,9 @@ int nf_conntrack_init(struct net *net) max_factor = 4; } atomic_set(&net->ct.count, 0); - nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, - &nf_conntrack_vmalloc); - if (!nf_conntrack_hash) { + net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, + &net->ct.hash_vmalloc); + if (!net->ct.hash) { printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); goto err_out; } @@ -1207,7 +1209,7 @@ out_fini_proto: err_free_conntrack_slab: kmem_cache_destroy(nf_conntrack_cachep); err_free_hash: - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); err_out: return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 8e0b4c8f62a..d91278dfdaf 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -159,7 +159,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) hlist_for_each_entry(h, n, &unconfirmed, hnode) unhelp(h, me); for (i = 0; i < nf_conntrack_htable_size; i++) { - hlist_for_each_entry(h, n, &nf_conntrack_hash[i], hnode) + hlist_for_each_entry(h, n, &init_net.ct.hash[i], hnode) unhelp(h, me); } spin_unlock_bh(&nf_conntrack_lock); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index da3cdc8db70..918a3358a12 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -549,7 +549,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conn *)cb->args[1]; for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[cb->args[0]], + hlist_for_each_entry_rcu(h, n, &init_net.ct.hash[cb->args[0]], hnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; @@ -794,14 +794,14 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); else { /* Flush the whole table */ - nf_conntrack_flush(); + nf_conntrack_flush(&init_net); return 0; } if (err < 0) return err; - h = nf_conntrack_find_get(&tuple); + h = nf_conntrack_find_get(&init_net, &tuple); if (!h) return -ENOENT; @@ -847,7 +847,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - h = nf_conntrack_find_get(&tuple); + h = nf_conntrack_find_get(&init_net, &tuple); if (!h) return -ENOENT; @@ -1213,9 +1213,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, spin_lock_bh(&nf_conntrack_lock); if (cda[CTA_TUPLE_ORIG]) - h = __nf_conntrack_find(&otuple); + h = __nf_conntrack_find(&init_net, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = __nf_conntrack_find(&rtuple); + h = __nf_conntrack_find(&init_net, &rtuple); if (h == NULL) { struct nf_conntrack_tuple master; @@ -1230,7 +1230,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, if (err < 0) goto out_unlock; - master_h = __nf_conntrack_find(&master); + master_h = __nf_conntrack_find(&init_net, &master); if (master_h == NULL) { err = -ENOENT; goto out_unlock; @@ -1670,7 +1670,7 @@ ctnetlink_create_expect(struct nlattr *cda[], u_int8_t u3) return err; /* Look for master conntrack of this expectation */ - h = nf_conntrack_find_get(&master_tuple); + h = nf_conntrack_find_get(&init_net, &master_tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 97e54b0e43a..7caf45b59d2 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -143,7 +143,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t) pr_debug("trying to timeout ct or exp for tuple "); nf_ct_dump_tuple(t); - h = nf_conntrack_find_get(t); + h = nf_conntrack_find_get(&init_net, t); if (h) { sibling = nf_ct_tuplehash_to_ctrack(h); pr_debug("setting timeout of conntrack %p to 0\n", sibling); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index a49fc932629..3a2f7ef997f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -219,7 +219,7 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) synchronize_rcu(); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(kill_l3proto, proto); + nf_ct_iterate_cleanup(&init_net, kill_l3proto, proto); } EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister); @@ -328,7 +328,7 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) synchronize_rcu(); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(kill_l4proto, l4proto); + nf_ct_iterate_cleanup(&init_net, kill_l4proto, l4proto); } EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 021b505907d..5456e4b9424 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -51,7 +51,7 @@ static struct hlist_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(nf_conntrack_hash[st->bucket].first); + n = rcu_dereference(init_net.ct.hash[st->bucket].first); if (n) return n; } @@ -67,7 +67,7 @@ static struct hlist_node *ct_get_next(struct seq_file *seq, while (head == NULL) { if (++st->bucket >= nf_conntrack_htable_size) return NULL; - head = rcu_dereference(nf_conntrack_hash[st->bucket].first); + head = rcu_dereference(init_net.ct.hash[st->bucket].first); } return head; } diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index d2453d182d6..bd00830ff69 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -123,7 +123,7 @@ static int count_them(struct xt_connlimit_data *data, /* check the saved connections */ list_for_each_entry_safe(conn, tmp, hash, list) { - found = __nf_conntrack_find(&conn->tuple); + found = __nf_conntrack_find(&init_net, &conn->tuple); found_ct = NULL; if (found != NULL) -- cgit v1.2.3-70-g09d2 From b21f89019399ff75d9c239010e38b840eb6e01e7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:03 +0200 Subject: netfilter: netns: fix {ip,6}_route_me_harder() in netns Take netns from skb->dst->dev. It should be safe because, they are called from LOCAL_OUT hook where dst is valid (though, I'm not exactly sure about IPVS and queueing packets to userspace). [Patrick: its safe everywhere since they already expect skb->dst to be set] Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/ipv4/netfilter.c | 7 ++++--- net/ipv6/netfilter.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 01671ad51ed..6efdb70b3eb 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -12,6 +12,7 @@ /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) { + struct net *net = dev_net(skb->dst->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi fl = {}; @@ -19,7 +20,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) unsigned int hh_len; unsigned int type; - type = inet_addr_type(&init_net, iph->saddr); + type = inet_addr_type(net, iph->saddr); if (skb->sk && inet_sk(skb->sk)->transparent) type = RTN_LOCAL; if (addr_type == RTN_UNSPEC) @@ -36,7 +37,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; fl.mark = skb->mark; fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; - if (ip_route_output_key(&init_net, &rt, &fl) != 0) + if (ip_route_output_key(net, &rt, &fl) != 0) return -1; /* Drop old route. */ @@ -46,7 +47,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ fl.nl_u.ip4_u.daddr = iph->saddr; - if (ip_route_output_key(&init_net, &rt, &fl) != 0) + if (ip_route_output_key(net, &rt, &fl) != 0) return -1; odst = skb->dst; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 8c6c5e71f21..4cb4844a322 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -23,7 +23,7 @@ int ip6_route_me_harder(struct sk_buff *skb) .saddr = iph->saddr, } }, }; - dst = ip6_route_output(&init_net, skb->sk, &fl); + dst = ip6_route_output(dev_net(skb->dst->dev), skb->sk, &fl); #ifdef CONFIG_XFRM if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && -- cgit v1.2.3-70-g09d2 From 9b03f38d0487f3908696242286d934c9b38f9d2a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:03 +0200 Subject: netfilter: netns nf_conntrack: per-netns expectations Make per-netns a) expectation hash and b) expectations count. Expectations always belongs to netns to which it's master conntrack belong. This is natural and doesn't bloat expectation. Proc files and leaf users are stubbed to init_net, this is temporary. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack_expect.h | 20 +++++--- include/net/netns/conntrack.h | 3 ++ .../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 6 ++- net/ipv4/netfilter/nf_nat_pptp.c | 2 +- net/netfilter/nf_conntrack_core.c | 8 ++-- net/netfilter/nf_conntrack_expect.c | 55 +++++++++++----------- net/netfilter/nf_conntrack_h323_main.c | 2 +- net/netfilter/nf_conntrack_helper.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 13 ++--- net/netfilter/nf_conntrack_pptp.c | 4 +- net/netfilter/nf_conntrack_sip.c | 2 +- 11 files changed, 66 insertions(+), 51 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 4c4d894cb9b..37a7fc1164b 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -6,7 +6,6 @@ #define _NF_CONNTRACK_EXPECT_H #include -extern struct hlist_head *nf_ct_expect_hash; extern unsigned int nf_ct_expect_hsize; extern unsigned int nf_ct_expect_max; @@ -56,6 +55,15 @@ struct nf_conntrack_expect struct rcu_head rcu; }; +static inline struct net *nf_ct_exp_net(struct nf_conntrack_expect *exp) +{ +#ifdef CONFIG_NET_NS + return exp->master->ct_net; /* by definition */ +#else + return &init_net; +#endif +} + struct nf_conntrack_expect_policy { unsigned int max_expected; @@ -67,17 +75,17 @@ struct nf_conntrack_expect_policy #define NF_CT_EXPECT_PERMANENT 0x1 #define NF_CT_EXPECT_INACTIVE 0x2 -int nf_conntrack_expect_init(void); -void nf_conntrack_expect_fini(void); +int nf_conntrack_expect_init(struct net *net); +void nf_conntrack_expect_fini(struct net *net); struct nf_conntrack_expect * -__nf_ct_expect_find(const struct nf_conntrack_tuple *tuple); +__nf_ct_expect_find(struct net *net, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * -nf_ct_expect_find_get(const struct nf_conntrack_tuple *tuple); +nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * -nf_ct_find_expectation(const struct nf_conntrack_tuple *tuple); +nf_ct_find_expectation(struct net *net, const struct nf_conntrack_tuple *tuple); void nf_ct_unlink_expect(struct nf_conntrack_expect *exp); void nf_ct_remove_expectations(struct nf_conn *ct); diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index b767683f112..e453a33f3e9 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -5,7 +5,10 @@ struct netns_ct { atomic_t count; + unsigned int expect_count; struct hlist_head *hash; + struct hlist_head *expect_hash; int hash_vmalloc; + int expect_vmalloc; }; #endif diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 8e0afdc2b13..f8636a57e8c 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -177,11 +177,12 @@ struct ct_expect_iter_state { static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { + struct net *net = &init_net; struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + n = rcu_dereference(net->ct.expect_hash[st->bucket].first); if (n) return n; } @@ -191,13 +192,14 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { + struct net *net = &init_net; struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(head->next); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + head = rcu_dereference(net->ct.expect_hash[st->bucket].first); } return head; } diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index da3d91a5ef5..e4bdddc6034 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -73,7 +73,7 @@ static void pptp_nat_expected(struct nf_conn *ct, pr_debug("trying to unexpect other dir: "); nf_ct_dump_tuple_ip(&t); - other_exp = nf_ct_expect_find_get(&t); + other_exp = nf_ct_expect_find_get(&init_net, &t); if (other_exp) { nf_ct_unexpect_related(other_exp); nf_ct_expect_put(other_exp); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index da56b260552..c188edea249 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -562,7 +562,7 @@ init_conntrack(struct net *net, nf_ct_acct_ext_add(ct, GFP_ATOMIC); spin_lock_bh(&nf_conntrack_lock); - exp = nf_ct_find_expectation(tuple); + exp = nf_ct_find_expectation(net, tuple); if (exp) { pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", ct, exp); @@ -1038,7 +1038,7 @@ void nf_conntrack_cleanup(struct net *net) nf_conntrack_htable_size); nf_conntrack_acct_fini(); - nf_conntrack_expect_fini(); + nf_conntrack_expect_fini(net); nf_conntrack_helper_fini(); nf_conntrack_proto_fini(); } @@ -1173,7 +1173,7 @@ int nf_conntrack_init(struct net *net) if (ret < 0) goto err_free_conntrack_slab; - ret = nf_conntrack_expect_init(); + ret = nf_conntrack_expect_init(net); if (ret < 0) goto out_fini_proto; @@ -1203,7 +1203,7 @@ int nf_conntrack_init(struct net *net) out_fini_helper: nf_conntrack_helper_fini(); out_fini_expect: - nf_conntrack_expect_fini(); + nf_conntrack_expect_fini(net); out_fini_proto: nf_conntrack_proto_fini(); err_free_conntrack_slab: diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index e6a79f2a7c5..5307316356e 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -28,17 +28,12 @@ #include #include -struct hlist_head *nf_ct_expect_hash __read_mostly; -EXPORT_SYMBOL_GPL(nf_ct_expect_hash); - unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); static unsigned int nf_ct_expect_hash_rnd __read_mostly; -static unsigned int nf_ct_expect_count; unsigned int nf_ct_expect_max __read_mostly; static int nf_ct_expect_hash_rnd_initted __read_mostly; -static int nf_ct_expect_vmalloc; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; @@ -46,12 +41,13 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly; void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { struct nf_conn_help *master_help = nfct_help(exp->master); + struct net *net = nf_ct_exp_net(exp); NF_CT_ASSERT(master_help); NF_CT_ASSERT(!timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); - nf_ct_expect_count--; + net->ct.expect_count--; hlist_del(&exp->lnode); master_help->expecting[exp->class]--; @@ -87,17 +83,17 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple } struct nf_conntrack_expect * -__nf_ct_expect_find(const struct nf_conntrack_tuple *tuple) +__nf_ct_expect_find(struct net *net, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; struct hlist_node *n; unsigned int h; - if (!nf_ct_expect_count) + if (!net->ct.expect_count) return NULL; h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry_rcu(i, n, &nf_ct_expect_hash[h], hnode) { + hlist_for_each_entry_rcu(i, n, &net->ct.expect_hash[h], hnode) { if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) return i; } @@ -107,12 +103,12 @@ EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. */ struct nf_conntrack_expect * -nf_ct_expect_find_get(const struct nf_conntrack_tuple *tuple) +nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; rcu_read_lock(); - i = __nf_ct_expect_find(tuple); + i = __nf_ct_expect_find(net, tuple); if (i && !atomic_inc_not_zero(&i->use)) i = NULL; rcu_read_unlock(); @@ -124,17 +120,17 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * -nf_ct_find_expectation(const struct nf_conntrack_tuple *tuple) +nf_ct_find_expectation(struct net *net, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i, *exp = NULL; struct hlist_node *n; unsigned int h; - if (!nf_ct_expect_count) + if (!net->ct.expect_count) return NULL; h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) { + hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { exp = i; @@ -311,6 +307,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_put); static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conn_help *master_help = nfct_help(exp->master); + struct net *net = nf_ct_exp_net(exp); const struct nf_conntrack_expect_policy *p; unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); @@ -319,8 +316,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) hlist_add_head(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; - hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); - nf_ct_expect_count++; + hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); + net->ct.expect_count++; setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); @@ -371,6 +368,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect) struct nf_conntrack_expect *i; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); + struct net *net = nf_ct_exp_net(expect); struct hlist_node *n; unsigned int h; int ret; @@ -383,7 +381,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect) goto out; } h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) { + hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { if (expect_matches(i, expect)) { /* Refresh timer: if it's dying, ignore.. */ if (refresh_timer(i)) { @@ -406,7 +404,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect) } } - if (nf_ct_expect_count >= nf_ct_expect_max) { + if (net->ct.expect_count >= nf_ct_expect_max) { if (net_ratelimit()) printk(KERN_WARNING "nf_conntrack: expectation table full\n"); @@ -430,11 +428,12 @@ struct ct_expect_iter_state { static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { + struct net *net = &init_net; struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + n = rcu_dereference(net->ct.expect_hash[st->bucket].first); if (n) return n; } @@ -444,13 +443,14 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { + struct net *net = &init_net; struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(head->next); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + head = rcu_dereference(net->ct.expect_hash[st->bucket].first); } return head; } @@ -558,7 +558,7 @@ static void exp_proc_remove(void) module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600); -int nf_conntrack_expect_init(void) +int nf_conntrack_expect_init(struct net *net) { int err = -ENOMEM; @@ -569,9 +569,10 @@ int nf_conntrack_expect_init(void) } nf_ct_expect_max = nf_ct_expect_hsize * 4; - nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, - &nf_ct_expect_vmalloc); - if (nf_ct_expect_hash == NULL) + net->ct.expect_count = 0; + net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, + &net->ct.expect_vmalloc); + if (net->ct.expect_hash == NULL) goto err1; nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", @@ -589,16 +590,16 @@ int nf_conntrack_expect_init(void) err3: kmem_cache_destroy(nf_ct_expect_cachep); err2: - nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc, + nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, nf_ct_expect_hsize); err1: return err; } -void nf_conntrack_expect_fini(void) +void nf_conntrack_expect_fini(struct net *net) { exp_proc_remove(); kmem_cache_destroy(nf_ct_expect_cachep); - nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc, + nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 5dc0478108a..dfb826c973d 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1219,7 +1219,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, tuple.dst.u.tcp.port = port; tuple.dst.protonum = IPPROTO_TCP; - exp = __nf_ct_expect_find(&tuple); + exp = __nf_ct_expect_find(&init_net, &tuple); if (exp && exp->master == ct) return exp; return NULL; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index d91278dfdaf..c793db810cd 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -145,7 +145,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) /* Get rid of expectations */ for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], hnode) { + &init_net.ct.expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((help->helper == me || exp->helper == me) && del_timer(&exp->timeout)) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 918a3358a12..cadfd15b44f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1458,6 +1458,7 @@ static int ctnetlink_exp_done(struct netlink_callback *cb) static int ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = &init_net; struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); struct hlist_node *n; @@ -1467,7 +1468,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: - hlist_for_each_entry(exp, n, &nf_ct_expect_hash[cb->args[0]], + hlist_for_each_entry(exp, n, &net->ct.expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; @@ -1529,7 +1530,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - exp = nf_ct_expect_find_get(&tuple); + exp = nf_ct_expect_find_get(&init_net, &tuple); if (!exp) return -ENOENT; @@ -1583,7 +1584,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, return err; /* bump usage count to 2 */ - exp = nf_ct_expect_find_get(&tuple); + exp = nf_ct_expect_find_get(&init_net, &tuple); if (!exp) return -ENOENT; @@ -1613,7 +1614,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], + &init_net.ct.expect_hash[i], hnode) { m_help = nfct_help(exp->master); if (m_help->helper == h @@ -1629,7 +1630,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, spin_lock_bh(&nf_conntrack_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], + &init_net.ct.expect_hash[i], hnode) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); @@ -1724,7 +1725,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, return err; spin_lock_bh(&nf_conntrack_lock); - exp = __nf_ct_expect_find(&tuple); + exp = __nf_ct_expect_find(&init_net, &tuple); if (!exp) { spin_unlock_bh(&nf_conntrack_lock); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 7caf45b59d2..5db7df5d19b 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -121,7 +121,7 @@ static void pptp_expectfn(struct nf_conn *ct, pr_debug("trying to unexpect other dir: "); nf_ct_dump_tuple(&inv_t); - exp_other = nf_ct_expect_find_get(&inv_t); + exp_other = nf_ct_expect_find_get(&init_net, &inv_t); if (exp_other) { /* delete other expectation. */ pr_debug("found\n"); @@ -154,7 +154,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t) nf_ct_put(sibling); return 1; } else { - exp = nf_ct_expect_find_get(t); + exp = nf_ct_expect_find_get(&init_net, t); if (exp) { pr_debug("unexpect_related of expect %p\n", exp); nf_ct_unexpect_related(exp); diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 1fa306be60f..a006080eb38 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -775,7 +775,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, rcu_read_lock(); do { - exp = __nf_ct_expect_find(&tuple); + exp = __nf_ct_expect_find(&init_net, &tuple); if (!exp || exp->master == ct || nfct_help(exp->master)->helper != nfct_help(ct)->helper || -- cgit v1.2.3-70-g09d2 From a702a65fc1376fc1f6757ec2a6960348af3f1876 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:04 +0200 Subject: netfilter: netns nf_conntrack: pass netns pointer to nf_conntrack_in() It's deducible from skb->dev or skb->dst->dev, but we know netns at the moment of call, so pass it down and use for finding and creating conntracks. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack_core.h | 3 ++- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 4 ++-- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 24 ++++++++++++++++-------- net/netfilter/nf_conntrack_core.c | 15 ++++++++------- 4 files changed, 28 insertions(+), 18 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index b4b45c541da..e78afe7f28e 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -20,7 +20,8 @@ /* This header is used to share core functionality between the standalone connection tracking module, and the compatibility layer's use of connection tracking. */ -extern unsigned int nf_conntrack_in(u_int8_t pf, +extern unsigned int nf_conntrack_in(struct net *net, + u_int8_t pf, unsigned int hooknum, struct sk_buff *skb); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 03dd108015c..2e4dd3fb002 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -172,7 +172,7 @@ static unsigned int ipv4_conntrack_in(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return nf_conntrack_in(PF_INET, hooknum, skb); + return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb); } static unsigned int ipv4_conntrack_local(unsigned int hooknum, @@ -188,7 +188,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum, printk("ipt_hook: happy cracking.\n"); return NF_ACCEPT; } - return nf_conntrack_in(PF_INET, hooknum, skb); + return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb); } /* Connection tracking may drop packets, but never alters them, so diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 85050c072ab..e91db16611d 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -211,11 +211,10 @@ static unsigned int ipv6_defrag(unsigned int hooknum, return NF_STOLEN; } -static unsigned int ipv6_conntrack_in(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int __ipv6_conntrack_in(struct net *net, + unsigned int hooknum, + struct sk_buff *skb, + int (*okfn)(struct sk_buff *)) { struct sk_buff *reasm = skb->nfct_reasm; @@ -225,7 +224,7 @@ static unsigned int ipv6_conntrack_in(unsigned int hooknum, if (!reasm->nfct) { unsigned int ret; - ret = nf_conntrack_in(PF_INET6, hooknum, reasm); + ret = nf_conntrack_in(net, PF_INET6, hooknum, reasm); if (ret != NF_ACCEPT) return ret; } @@ -235,7 +234,16 @@ static unsigned int ipv6_conntrack_in(unsigned int hooknum, return NF_ACCEPT; } - return nf_conntrack_in(PF_INET6, hooknum, skb); + return nf_conntrack_in(net, PF_INET6, hooknum, skb); +} + +static unsigned int ipv6_conntrack_in(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return __ipv6_conntrack_in(dev_net(in), hooknum, skb, okfn); } static unsigned int ipv6_conntrack_local(unsigned int hooknum, @@ -250,7 +258,7 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum, printk("ipv6_conntrack_local: packet too short\n"); return NF_ACCEPT; } - return ipv6_conntrack_in(hooknum, skb, in, out, okfn); + return __ipv6_conntrack_in(dev_net(out), hooknum, skb, okfn); } static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2a105db1330..5c96d9732c7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -611,7 +611,8 @@ init_conntrack(struct net *net, /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ static inline struct nf_conn * -resolve_normal_ct(struct sk_buff *skb, +resolve_normal_ct(struct net *net, + struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, @@ -632,10 +633,9 @@ resolve_normal_ct(struct sk_buff *skb, } /* look for tuple match */ - h = nf_conntrack_find_get(&init_net, &tuple); + h = nf_conntrack_find_get(net, &tuple); if (!h) { - h = init_conntrack(&init_net, &tuple, l3proto, l4proto, skb, - dataoff); + h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff); if (!h) return NULL; if (IS_ERR(h)) @@ -669,7 +669,8 @@ resolve_normal_ct(struct sk_buff *skb, } unsigned int -nf_conntrack_in(u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) +nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, + struct sk_buff *skb) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -709,8 +710,8 @@ nf_conntrack_in(u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) return -ret; } - ct = resolve_normal_ct(skb, dataoff, pf, protonum, l3proto, l4proto, - &set_reply, &ctinfo); + ct = resolve_normal_ct(net, skb, dataoff, pf, protonum, + l3proto, l4proto, &set_reply, &ctinfo); if (!ct) { /* Not valid part of a connection */ NF_CT_STAT_INC_ATOMIC(invalid); -- cgit v1.2.3-70-g09d2 From 74c51a1497033e6ff7b8096797daca233a4a30df Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:05 +0200 Subject: netfilter: netns nf_conntrack: pass netns pointer to L4 protocol's ->error hook Again, it's deducible from skb, but we're going to use it for nf_conntrack_checksum and statistics, so just pass it from upper layer. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack_l4proto.h | 2 +- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 8 ++++---- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 9 +++++---- net/netfilter/nf_conntrack_core.c | 12 +++++++----- net/netfilter/nf_conntrack_proto_dccp.c | 6 +++--- net/netfilter/nf_conntrack_proto_tcp.c | 3 ++- net/netfilter/nf_conntrack_proto_udp.c | 2 +- net/netfilter/nf_conntrack_proto_udplite.c | 4 +++- 8 files changed, 26 insertions(+), 20 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index d4376e97bae..97723d33c95 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -50,7 +50,7 @@ struct nf_conntrack_l4proto /* Called when a conntrack entry is destroyed */ void (*destroy)(struct nf_conn *ct); - int (*error)(struct sk_buff *skb, unsigned int dataoff, + int (*error)(struct net *net, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index daf346377b6..8c7ed5bc959 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -123,7 +123,7 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ static int -icmp_error_message(struct sk_buff *skb, +icmp_error_message(struct net *net, struct sk_buff *skb, enum ip_conntrack_info *ctinfo, unsigned int hooknum) { @@ -155,7 +155,7 @@ icmp_error_message(struct sk_buff *skb, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(&init_net, &innertuple); + h = nf_conntrack_find_get(net, &innertuple); if (!h) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; @@ -172,7 +172,7 @@ icmp_error_message(struct sk_buff *skb, /* Small and modified version of icmp_rcv */ static int -icmp_error(struct sk_buff *skb, unsigned int dataoff, +icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { const struct icmphdr *icmph; @@ -217,7 +217,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, && icmph->type != ICMP_REDIRECT) return NF_ACCEPT; - return icmp_error_message(skb, ctinfo, hooknum); + return icmp_error_message(net, skb, ctinfo, hooknum); } #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 548cf4f15c0..aabddfe2127 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -122,7 +122,8 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, } static int -icmpv6_error_message(struct sk_buff *skb, +icmpv6_error_message(struct net *net, + struct sk_buff *skb, unsigned int icmp6off, enum ip_conntrack_info *ctinfo, unsigned int hooknum) @@ -156,7 +157,7 @@ icmpv6_error_message(struct sk_buff *skb, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(&init_net, &intuple); + h = nf_conntrack_find_get(net, &intuple); if (!h) { pr_debug("icmpv6_error: no match\n"); return -NF_ACCEPT; @@ -172,7 +173,7 @@ icmpv6_error_message(struct sk_buff *skb, } static int -icmpv6_error(struct sk_buff *skb, unsigned int dataoff, +icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { const struct icmp6hdr *icmp6h; @@ -197,7 +198,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff, if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; - return icmpv6_error_message(skb, dataoff, ctinfo, hooknum); + return icmpv6_error_message(net, skb, dataoff, ctinfo, hooknum); } #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5c96d9732c7..251f020c7c1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -703,11 +703,13 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter * core what to do with the packet. */ - if (l4proto->error != NULL && - (ret = l4proto->error(skb, dataoff, &ctinfo, pf, hooknum)) <= 0) { - NF_CT_STAT_INC_ATOMIC(error); - NF_CT_STAT_INC_ATOMIC(invalid); - return -ret; + if (l4proto->error != NULL) { + ret = l4proto->error(net, skb, dataoff, &ctinfo, pf, hooknum); + if (ret <= 0) { + NF_CT_STAT_INC_ATOMIC(error); + NF_CT_STAT_INC_ATOMIC(invalid); + return -ret; + } } ct = resolve_normal_ct(net, skb, dataoff, pf, protonum, diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index edc30358dc1..6ead8da3e9e 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -545,9 +545,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, return NF_ACCEPT; } -static int dccp_error(struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, u_int8_t pf, - unsigned int hooknum) +static int dccp_error(struct net *net, struct sk_buff *skb, + unsigned int dataoff, enum ip_conntrack_info *ctinfo, + u_int8_t pf, unsigned int hooknum) { struct dccp_hdr _dh, *dh; unsigned int dccp_len = skb->len - dataoff; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 539a8202025..4e71de2405f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -746,7 +746,8 @@ static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] = }; /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ -static int tcp_error(struct sk_buff *skb, +static int tcp_error(struct net *net, + struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 2a965c4a0ea..8a245beb2c9 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -89,7 +89,7 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; } -static int udp_error(struct sk_buff *skb, unsigned int dataoff, +static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 4fb6c8d83a8..981701919a7 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -89,7 +89,9 @@ static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb, return true; } -static int udplite_error(struct sk_buff *skb, unsigned int dataoff, +static int udplite_error(struct net *net, + struct sk_buff *skb, + unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) -- cgit v1.2.3-70-g09d2 From 5e6b29972b7e9c9c39882227e36fe0cd3463fe96 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:06 +0200 Subject: netfilter: netns nf_conntrack: per-netns /proc/net/ip_conntrack, /proc/net/stat/ip_conntrack, /proc/net/ip_conntrack_expect Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- .../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 57 ++++++++++++++-------- 1 file changed, 38 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index f8636a57e8c..b2940836d10 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -21,18 +21,20 @@ #include struct ct_iter_state { + struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_get_first(struct seq_file *seq) { + struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(init_net.ct.hash[st->bucket].first); + n = rcu_dereference(net->ct.hash[st->bucket].first); if (n) return n; } @@ -42,13 +44,14 @@ static struct hlist_node *ct_get_first(struct seq_file *seq) static struct hlist_node *ct_get_next(struct seq_file *seq, struct hlist_node *head) { + struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(head->next); while (head == NULL) { if (++st->bucket >= nf_conntrack_htable_size) return NULL; - head = rcu_dereference(init_net.ct.hash[st->bucket].first); + head = rcu_dereference(net->ct.hash[st->bucket].first); } return head; } @@ -158,8 +161,8 @@ static const struct seq_operations ct_seq_ops = { static int ct_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &ct_seq_ops, - sizeof(struct ct_iter_state)); + return seq_open_net(inode, file, &ct_seq_ops, + sizeof(struct ct_iter_state)); } static const struct file_operations ct_file_ops = { @@ -167,17 +170,18 @@ static const struct file_operations ct_file_ops = { .open = ct_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; /* expects */ struct ct_expect_iter_state { + struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { - struct net *net = &init_net; + struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; @@ -192,7 +196,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { - struct net *net = &init_net; + struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(head->next); @@ -267,8 +271,8 @@ static const struct seq_operations exp_seq_ops = { static int exp_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &exp_seq_ops, - sizeof(struct ct_expect_iter_state)); + return seq_open_net(inode, file, &exp_seq_ops, + sizeof(struct ct_expect_iter_state)); } static const struct file_operations ip_exp_file_ops = { @@ -276,7 +280,7 @@ static const struct file_operations ip_exp_file_ops = { .open = exp_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) @@ -367,36 +371,51 @@ static const struct file_operations ct_cpu_seq_fops = { .release = seq_release, }; -int __init nf_conntrack_ipv4_compat_init(void) +static int __net_init ip_conntrack_net_init(struct net *net) { struct proc_dir_entry *proc, *proc_exp, *proc_stat; - proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops); + proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops); if (!proc) goto err1; - proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440, + proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440, &ip_exp_file_ops); if (!proc_exp) goto err2; proc_stat = proc_create("ip_conntrack", S_IRUGO, - init_net.proc_net_stat, &ct_cpu_seq_fops); + net->proc_net_stat, &ct_cpu_seq_fops); if (!proc_stat) goto err3; return 0; err3: - proc_net_remove(&init_net, "ip_conntrack_expect"); + proc_net_remove(net, "ip_conntrack_expect"); err2: - proc_net_remove(&init_net, "ip_conntrack"); + proc_net_remove(net, "ip_conntrack"); err1: return -ENOMEM; } +static void __net_exit ip_conntrack_net_exit(struct net *net) +{ + remove_proc_entry("ip_conntrack", net->proc_net_stat); + proc_net_remove(net, "ip_conntrack_expect"); + proc_net_remove(net, "ip_conntrack"); +} + +static struct pernet_operations ip_conntrack_net_ops = { + .init = ip_conntrack_net_init, + .exit = ip_conntrack_net_exit, +}; + +int __init nf_conntrack_ipv4_compat_init(void) +{ + return register_pernet_subsys(&ip_conntrack_net_ops); +} + void __exit nf_conntrack_ipv4_compat_fini(void) { - remove_proc_entry("ip_conntrack", init_net.proc_net_stat); - proc_net_remove(&init_net, "ip_conntrack_expect"); - proc_net_remove(&init_net, "ip_conntrack"); + unregister_pernet_subsys(&ip_conntrack_net_ops); } -- cgit v1.2.3-70-g09d2 From a71996fccce4b2086a26036aa3c915365ca36926 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:07 +0200 Subject: netfilter: netns nf_conntrack: pass conntrack to nf_conntrack_event_cache() not skb This is cleaner, we already know conntrack to which event is relevant. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack_ecache.h | 4 +--- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2 +- net/ipv4/netfilter/nf_nat_helper.c | 2 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +- net/netfilter/nf_conntrack_core.c | 10 +++++----- net/netfilter/nf_conntrack_ftp.c | 9 +++++---- net/netfilter/nf_conntrack_proto_gre.c | 2 +- net/netfilter/nf_conntrack_proto_sctp.c | 4 ++-- net/netfilter/nf_conntrack_proto_tcp.c | 6 +++--- net/netfilter/nf_conntrack_proto_udp.c | 2 +- net/netfilter/nf_conntrack_proto_udplite.c | 2 +- net/netfilter/xt_CONNMARK.c | 8 ++++---- net/netfilter/xt_CONNSECMARK.c | 2 +- 13 files changed, 27 insertions(+), 28 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index f0b9078235c..c1b406cecf9 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -28,10 +28,8 @@ extern void __nf_ct_event_cache_init(struct nf_conn *ct); extern void nf_ct_event_cache_flush(void); static inline void -nf_conntrack_event_cache(enum ip_conntrack_events event, - const struct sk_buff *skb) +nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) { - struct nf_conn *ct = (struct nf_conn *)skb->nfct; struct nf_conntrack_ecache *ecache; local_bh_disable(); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 8c7ed5bc959..205ba399d4a 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -91,7 +91,7 @@ static int icmp_packet(struct nf_conn *ct, nf_ct_kill_acct(ct, ctinfo, skb); } else { atomic_inc(&ct->proto.icmp.count); - nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct); nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); } diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 112dcfa1290..cf7a42bf982 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -193,7 +193,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, nf_conntrack_tcp_update(skb, ip_hdrlen(skb), ct, CTINFO2DIR(ctinfo)); - nf_conntrack_event_cache(IPCT_NATSEQADJ, skb); + nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); } return 1; } diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index aabddfe2127..df04de91e6e 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -93,7 +93,7 @@ static int icmpv6_packet(struct nf_conn *ct, nf_ct_kill_acct(ct, ctinfo, skb); } else { atomic_inc(&ct->proto.icmp.count); - nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct); nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 251f020c7c1..01f59c57730 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -370,14 +370,14 @@ __nf_conntrack_confirm(struct sk_buff *skb) spin_unlock_bh(&nf_conntrack_lock); help = nfct_help(ct); if (help && help->helper) - nf_conntrack_event_cache(IPCT_HELPER, skb); + nf_conntrack_event_cache(IPCT_HELPER, ct); #ifdef CONFIG_NF_NAT_NEEDED if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_NATINFO, skb); + nf_conntrack_event_cache(IPCT_NATINFO, ct); #endif nf_conntrack_event_cache(master_ct(ct) ? - IPCT_RELATED : IPCT_NEW, skb); + IPCT_RELATED : IPCT_NEW, ct); return NF_ACCEPT; out: @@ -740,7 +740,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, } if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); return ret; } @@ -853,7 +853,7 @@ acct: /* must be unlocked when calling event cache */ if (event) - nf_conntrack_event_cache(event, skb); + nf_conntrack_event_cache(event, ct); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index bb20672fe03..4f7107107e9 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -318,7 +318,8 @@ static int find_nl_seq(u32 seq, const struct nf_ct_ftp_master *info, int dir) } /* We don't update if it's older than what we have. */ -static void update_nl_seq(u32 nl_seq, struct nf_ct_ftp_master *info, int dir, +static void update_nl_seq(struct nf_conn *ct, u32 nl_seq, + struct nf_ct_ftp_master *info, int dir, struct sk_buff *skb) { unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; @@ -336,11 +337,11 @@ static void update_nl_seq(u32 nl_seq, struct nf_ct_ftp_master *info, int dir, if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; - nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, ct); } else if (oldest != NUM_SEQ_TO_REMEMBER && after(nl_seq, info->seq_aft_nl[dir][oldest])) { info->seq_aft_nl[dir][oldest] = nl_seq; - nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, ct); } } @@ -509,7 +510,7 @@ out_update_nl: /* Now if this ends in \n, update ftp info. Seq may have been * adjusted by NAT code. */ if (ends_in_nl) - update_nl_seq(seq, ct_ftp_info, dir, skb); + update_nl_seq(ct, seq, ct_ftp_info, dir, skb); out: spin_unlock_bh(&nf_ftp_lock); return ret; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index c5a78220fa3..5b1273a01fe 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -229,7 +229,7 @@ static int gre_packet(struct nf_conn *ct, ct->proto.gre.stream_timeout); /* Also, more likely to be important, and not a probe. */ set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); } else nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.timeout); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index b5a90596d3f..ae8c2609e23 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -369,7 +369,7 @@ static int sctp_packet(struct nf_conn *ct, ct->proto.sctp.state = new_state; if (old_state != new_state) - nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); } write_unlock_bh(&sctp_lock); @@ -380,7 +380,7 @@ static int sctp_packet(struct nf_conn *ct, new_state == SCTP_CONNTRACK_ESTABLISHED) { pr_debug("Setting assured bit\n"); set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); } return NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 4e71de2405f..b5d62d66e02 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -969,9 +969,9 @@ static int tcp_packet(struct nf_conn *ct, timeout = tcp_timeouts[new_state]; write_unlock_bh(&tcp_lock); - nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct); if (new_state != old_state) - nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { /* If only reply is a RST, we can consider ourselves not to @@ -990,7 +990,7 @@ static int tcp_packet(struct nf_conn *ct, after SYN_RECV or a valid answer for a picked up connection. */ set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); } nf_ct_refresh_acct(ct, ctinfo, skb, timeout); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8a245beb2c9..e0ee89e179c 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -75,7 +75,7 @@ static int udp_packet(struct nf_conn *ct, nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); } else nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout); diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 981701919a7..c5b77c8f86c 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -75,7 +75,7 @@ static int udplite_packet(struct nf_conn *ct, nf_ct_udplite_timeout_stream); /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); } else nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udplite_timeout); diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index e72e5d01752..e1415c3f5c9 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -54,7 +54,7 @@ connmark_tg_v0(struct sk_buff *skb, const struct net_device *in, newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; if (newmark != ct->mark) { ct->mark = newmark; - nf_conntrack_event_cache(IPCT_MARK, skb); + nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_SAVE: @@ -62,7 +62,7 @@ connmark_tg_v0(struct sk_buff *skb, const struct net_device *in, (skb->mark & markinfo->mask); if (ct->mark != newmark) { ct->mark = newmark; - nf_conntrack_event_cache(IPCT_MARK, skb); + nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_RESTORE: @@ -95,7 +95,7 @@ connmark_tg(struct sk_buff *skb, const struct net_device *in, newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; if (ct->mark != newmark) { ct->mark = newmark; - nf_conntrack_event_cache(IPCT_MARK, skb); + nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_SAVE: @@ -103,7 +103,7 @@ connmark_tg(struct sk_buff *skb, const struct net_device *in, (skb->mark & info->nfmask); if (ct->mark != newmark) { ct->mark = newmark; - nf_conntrack_event_cache(IPCT_MARK, skb); + nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_RESTORE: diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index ae939e54dfa..5f221c3bd35 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -43,7 +43,7 @@ static void secmark_save(const struct sk_buff *skb) ct = nf_ct_get(skb, &ctinfo); if (ct && !ct->secmark) { ct->secmark = skb->secmark; - nf_conntrack_event_cache(IPCT_SECMARK, skb); + nf_conntrack_event_cache(IPCT_SECMARK, ct); } } } -- cgit v1.2.3-70-g09d2 From 0d55af8791bfb42e04cc456b348910582f230343 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:07 +0200 Subject: netfilter: netns nf_conntrack: per-netns statistics Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack.h | 8 ++-- include/net/netns/conntrack.h | 1 + .../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 4 +- net/netfilter/nf_conntrack_core.c | 49 ++++++++++++---------- net/netfilter/nf_conntrack_expect.c | 4 +- net/netfilter/nf_conntrack_standalone.c | 4 +- 6 files changed, 38 insertions(+), 32 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index f5447f14304..c95561050f7 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -290,12 +290,12 @@ extern unsigned int nf_conntrack_htable_size; extern int nf_conntrack_checksum; extern int nf_conntrack_max; -DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); -#define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++) -#define NF_CT_STAT_INC_ATOMIC(count) \ +#define NF_CT_STAT_INC(net, count) \ + (per_cpu_ptr((net)->ct.stat, raw_smp_processor_id())->count++) +#define NF_CT_STAT_INC_ATOMIC(net, count) \ do { \ local_bh_disable(); \ - __get_cpu_var(nf_conntrack_stat).count++; \ + per_cpu_ptr((net)->ct.stat, raw_smp_processor_id())->count++; \ local_bh_enable(); \ } while (0) diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 9d5c1623c51..fc0a46d64cc 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -12,6 +12,7 @@ struct netns_ct { struct hlist_head *hash; struct hlist_head *expect_hash; struct hlist_head unconfirmed; + struct ip_conntrack_stat *stat; #ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_ecache *ecache; #endif diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index b2940836d10..fdc85b37078 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -294,7 +294,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) if (!cpu_possible(cpu)) continue; *pos = cpu+1; - return &per_cpu(nf_conntrack_stat, cpu); + return per_cpu_ptr(init_net.ct.stat, cpu); } return NULL; @@ -308,7 +308,7 @@ static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!cpu_possible(cpu)) continue; *pos = cpu+1; - return &per_cpu(nf_conntrack_stat, cpu); + return per_cpu_ptr(init_net.ct.stat, cpu); } return NULL; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index b55944e5e4e..1e87fa0cd3a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -56,9 +56,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_untracked); unsigned int nf_ct_log_invalid __read_mostly; static struct kmem_cache *nf_conntrack_cachep __read_mostly; -DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); -EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat); - static int nf_conntrack_hash_rnd_initted; static unsigned int nf_conntrack_hash_rnd; @@ -171,6 +168,7 @@ static void destroy_conntrack(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; + struct net *net = nf_ct_net(ct); struct nf_conntrack_l4proto *l4proto; pr_debug("destroy_conntrack(%p)\n", ct); @@ -203,7 +201,7 @@ destroy_conntrack(struct nf_conntrack *nfct) hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode); } - NF_CT_STAT_INC(delete); + NF_CT_STAT_INC(net, delete); spin_unlock_bh(&nf_conntrack_lock); if (ct->master) @@ -216,6 +214,7 @@ destroy_conntrack(struct nf_conntrack *nfct) static void death_by_timeout(unsigned long ul_conntrack) { struct nf_conn *ct = (void *)ul_conntrack; + struct net *net = nf_ct_net(ct); struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_helper *helper; @@ -230,7 +229,7 @@ static void death_by_timeout(unsigned long ul_conntrack) spin_lock_bh(&nf_conntrack_lock); /* Inside lock so preempt is disabled on module removal path. * Otherwise we can get spurious warnings. */ - NF_CT_STAT_INC(delete_list); + NF_CT_STAT_INC(net, delete_list); clean_from_lists(ct); spin_unlock_bh(&nf_conntrack_lock); nf_ct_put(ct); @@ -249,11 +248,11 @@ __nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple) local_bh_disable(); hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { if (nf_ct_tuple_equal(tuple, &h->tuple)) { - NF_CT_STAT_INC(found); + NF_CT_STAT_INC(net, found); local_bh_enable(); return h; } - NF_CT_STAT_INC(searched); + NF_CT_STAT_INC(net, searched); } local_bh_enable(); @@ -366,7 +365,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) add_timer(&ct->timeout); atomic_inc(&ct->ct_general.use); set_bit(IPS_CONFIRMED_BIT, &ct->status); - NF_CT_STAT_INC(insert); + NF_CT_STAT_INC(net, insert); spin_unlock_bh(&nf_conntrack_lock); help = nfct_help(ct); if (help && help->helper) @@ -381,7 +380,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; out: - NF_CT_STAT_INC(insert_failed); + NF_CT_STAT_INC(net, insert_failed); spin_unlock_bh(&nf_conntrack_lock); return NF_DROP; } @@ -405,11 +404,11 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple)) { - NF_CT_STAT_INC(found); + NF_CT_STAT_INC(net, found); rcu_read_unlock_bh(); return 1; } - NF_CT_STAT_INC(searched); + NF_CT_STAT_INC(net, searched); } rcu_read_unlock_bh(); @@ -454,7 +453,7 @@ static noinline int early_drop(struct net *net, unsigned int hash) if (del_timer(&ct->timeout)) { death_by_timeout((unsigned long)ct); dropped = 1; - NF_CT_STAT_INC_ATOMIC(early_drop); + NF_CT_STAT_INC_ATOMIC(net, early_drop); } nf_ct_put(ct); return dropped; @@ -581,7 +580,7 @@ init_conntrack(struct net *net, ct->secmark = exp->master->secmark; #endif nf_conntrack_get(&ct->master->ct_general); - NF_CT_STAT_INC(expect_new); + NF_CT_STAT_INC(net, expect_new); } else { struct nf_conntrack_helper *helper; @@ -591,7 +590,7 @@ init_conntrack(struct net *net, if (help) rcu_assign_pointer(help->helper, helper); } - NF_CT_STAT_INC(new); + NF_CT_STAT_INC(net, new); } /* Overload tuple linked list to put us in unconfirmed list. */ @@ -683,7 +682,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, /* Previously seen (loopback or untracked)? Ignore. */ if (skb->nfct) { - NF_CT_STAT_INC_ATOMIC(ignore); + NF_CT_STAT_INC_ATOMIC(net, ignore); return NF_ACCEPT; } @@ -693,8 +692,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, &dataoff, &protonum); if (ret <= 0) { pr_debug("not prepared to track yet or error occured\n"); - NF_CT_STAT_INC_ATOMIC(error); - NF_CT_STAT_INC_ATOMIC(invalid); + NF_CT_STAT_INC_ATOMIC(net, error); + NF_CT_STAT_INC_ATOMIC(net, invalid); return -ret; } @@ -706,8 +705,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, if (l4proto->error != NULL) { ret = l4proto->error(net, skb, dataoff, &ctinfo, pf, hooknum); if (ret <= 0) { - NF_CT_STAT_INC_ATOMIC(error); - NF_CT_STAT_INC_ATOMIC(invalid); + NF_CT_STAT_INC_ATOMIC(net, error); + NF_CT_STAT_INC_ATOMIC(net, invalid); return -ret; } } @@ -716,13 +715,13 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, l3proto, l4proto, &set_reply, &ctinfo); if (!ct) { /* Not valid part of a connection */ - NF_CT_STAT_INC_ATOMIC(invalid); + NF_CT_STAT_INC_ATOMIC(net, invalid); return NF_ACCEPT; } if (IS_ERR(ct)) { /* Too stressed to deal. */ - NF_CT_STAT_INC_ATOMIC(drop); + NF_CT_STAT_INC_ATOMIC(net, drop); return NF_DROP; } @@ -735,7 +734,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, pr_debug("nf_conntrack_in: Can't track with proto module\n"); nf_conntrack_put(skb->nfct); skb->nfct = NULL; - NF_CT_STAT_INC_ATOMIC(invalid); + NF_CT_STAT_INC_ATOMIC(net, invalid); return -ret; } @@ -1043,6 +1042,7 @@ void nf_conntrack_cleanup(struct net *net) nf_conntrack_acct_fini(); nf_conntrack_expect_fini(net); + free_percpu(net->ct.stat); nf_conntrack_helper_fini(); nf_conntrack_proto_fini(); } @@ -1152,6 +1152,9 @@ int nf_conntrack_init(struct net *net) max_factor = 4; } atomic_set(&net->ct.count, 0); + net->ct.stat = alloc_percpu(struct ip_conntrack_stat); + if (!net->ct.stat) + goto err_stat; ret = nf_conntrack_ecache_init(net); if (ret < 0) goto err_ecache; @@ -1222,5 +1225,7 @@ err_free_hash: err_hash: nf_conntrack_ecache_fini(net); err_ecache: + free_percpu(net->ct.stat); +err_stat: return -ENOMEM; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 6a09200049e..b7f75117161 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -53,7 +53,7 @@ void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) master_help->expecting[exp->class]--; nf_ct_expect_put(exp); - NF_CT_STAT_INC(expect_delete); + NF_CT_STAT_INC(net, expect_delete); } EXPORT_SYMBOL_GPL(nf_ct_unlink_expect); @@ -326,7 +326,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) add_timer(&exp->timeout); atomic_inc(&exp->use); - NF_CT_STAT_INC(expect_create); + NF_CT_STAT_INC(net, expect_create); } /* Race with expectations being used means we could have none to find; OK. */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 02eaf872277..a4fdbbf363a 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -203,7 +203,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) if (!cpu_possible(cpu)) continue; *pos = cpu + 1; - return &per_cpu(nf_conntrack_stat, cpu); + return per_cpu_ptr(init_net.ct.stat, cpu); } return NULL; @@ -217,7 +217,7 @@ static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!cpu_possible(cpu)) continue; *pos = cpu + 1; - return &per_cpu(nf_conntrack_stat, cpu); + return per_cpu_ptr(init_net.ct.stat, cpu); } return NULL; -- cgit v1.2.3-70-g09d2 From 8e9df80180b73d4107bf8fbf28b1633c541d2770 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:08 +0200 Subject: netfilter: netns nf_conntrack: per-netns /proc/net/stat/nf_conntrack, /proc/net/stat/ip_conntrack Show correct conntrack count, while I'm at it. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 14 +++++++++----- net/netfilter/nf_conntrack_standalone.c | 14 +++++++++----- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index fdc85b37078..313ebf00ee3 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -285,6 +285,7 @@ static const struct file_operations ip_exp_file_ops = { static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) { + struct net *net = seq_file_net(seq); int cpu; if (*pos == 0) @@ -294,7 +295,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) if (!cpu_possible(cpu)) continue; *pos = cpu+1; - return per_cpu_ptr(init_net.ct.stat, cpu); + return per_cpu_ptr(net->ct.stat, cpu); } return NULL; @@ -302,13 +303,14 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net *net = seq_file_net(seq); int cpu; for (cpu = *pos; cpu < NR_CPUS; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; - return per_cpu_ptr(init_net.ct.stat, cpu); + return per_cpu_ptr(net->ct.stat, cpu); } return NULL; @@ -320,7 +322,8 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&init_net.ct.count); + struct net *net = seq_file_net(seq); + unsigned int nr_conntracks = atomic_read(&net->ct.count); const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { @@ -360,7 +363,8 @@ static const struct seq_operations ct_cpu_seq_ops = { static int ct_cpu_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &ct_cpu_seq_ops); + return seq_open_net(inode, file, &ct_cpu_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations ct_cpu_seq_fops = { @@ -368,7 +372,7 @@ static const struct file_operations ct_cpu_seq_fops = { .open = ct_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; static int __net_init ip_conntrack_net_init(struct net *net) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index a4fdbbf363a..169760ddc16 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -194,6 +194,7 @@ static const struct file_operations ct_file_ops = { static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) { + struct net *net = seq_file_net(seq); int cpu; if (*pos == 0) @@ -203,7 +204,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) if (!cpu_possible(cpu)) continue; *pos = cpu + 1; - return per_cpu_ptr(init_net.ct.stat, cpu); + return per_cpu_ptr(net->ct.stat, cpu); } return NULL; @@ -211,13 +212,14 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net *net = seq_file_net(seq); int cpu; for (cpu = *pos; cpu < NR_CPUS; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; - return per_cpu_ptr(init_net.ct.stat, cpu); + return per_cpu_ptr(net->ct.stat, cpu); } return NULL; @@ -229,7 +231,8 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&init_net.ct.count); + struct net *net = seq_file_net(seq); + unsigned int nr_conntracks = atomic_read(&net->ct.count); const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { @@ -269,7 +272,8 @@ static const struct seq_operations ct_cpu_seq_ops = { static int ct_cpu_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &ct_cpu_seq_ops); + return seq_open_net(inode, file, &ct_cpu_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations ct_cpu_seq_fops = { @@ -277,7 +281,7 @@ static const struct file_operations ct_cpu_seq_fops = { .open = ct_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; static int nf_conntrack_standalone_init_proc(struct net *net) -- cgit v1.2.3-70-g09d2 From c04d05529a6e0bf97183a2caf76a0c7f07f5b78c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:08 +0200 Subject: netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_checksum sysctl Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack.h | 1 - include/net/netns/conntrack.h | 1 + net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +- net/netfilter/nf_conntrack_proto_dccp.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 2 +- net/netfilter/nf_conntrack_proto_udp.c | 2 +- net/netfilter/nf_conntrack_proto_udplite.c | 2 +- net/netfilter/nf_conntrack_standalone.c | 7 +++---- 10 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index c95561050f7..b76a8685b5b 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -287,7 +287,6 @@ static inline int nf_ct_is_untracked(const struct sk_buff *skb) extern int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); extern unsigned int nf_conntrack_htable_size; -extern int nf_conntrack_checksum; extern int nf_conntrack_max; #define NF_CT_STAT_INC(net, count) \ diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 2b50758df6a..38b6dae4d3d 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -17,6 +17,7 @@ struct netns_ct { #ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_ecache *ecache; #endif + int sysctl_checksum; #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_header; #endif diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 2e4dd3fb002..75871b1dd8a 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -270,7 +270,7 @@ static ctl_table ip_ct_sysctl_table[] = { { .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, .procname = "ip_conntrack_checksum", - .data = &nf_conntrack_checksum, + .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 205ba399d4a..ace66cbf921 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -188,7 +188,7 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, } /* See ip_conntrack_proto_tcp.c */ - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index df04de91e6e..fa12e57749a 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -187,7 +187,7 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, return -NF_ACCEPT; } - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, "nf_ct_icmpv6: ICMPv6 checksum failed\n"); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 6ead8da3e9e..769680e68b5 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -575,7 +575,7 @@ static int dccp_error(struct net *net, struct sk_buff *skb, } } - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP, pf)) { msg = "nf_ct_dccp: bad checksum "; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b5d62d66e02..131c9be4470 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -780,7 +780,7 @@ static int tcp_error(struct net *net, * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index e0ee89e179c..3d3fffe3f8b 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -123,7 +123,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, * We skip checking packets on the outgoing path * because the checksum is assumed to be correct. * FIXME: Source route IP option packets --RR */ - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { if (LOG_INVALID(IPPROTO_UDP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index c5b77c8f86c..3d1697c4f91 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -129,7 +129,7 @@ static int udplite_error(struct net *net, } /* Checksum invalid? Ignore. */ - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, pf)) { if (LOG_INVALID(IPPROTO_UDPLITE)) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 64b4f95b367..5cd06637977 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -322,9 +322,6 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) /* Sysctl support */ -int nf_conntrack_checksum __read_mostly = 1; -EXPORT_SYMBOL_GPL(nf_conntrack_checksum); - #ifdef CONFIG_SYSCTL /* Log invalid packets of a given protocol */ static int log_invalid_proto_min = 0; @@ -360,7 +357,7 @@ static ctl_table nf_ct_sysctl_table[] = { { .ctl_name = NET_NF_CONNTRACK_CHECKSUM, .procname = "nf_conntrack_checksum", - .data = &nf_conntrack_checksum, + .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -425,6 +422,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) goto out_kmemdup; table[1].data = &net->ct.count; + table[3].data = &net->ct.sysctl_checksum; net->ct.sysctl_header = register_net_sysctl_table(net, nf_net_netfilter_sysctl_path, table); @@ -474,6 +472,7 @@ static int nf_conntrack_net_init(struct net *net) ret = nf_conntrack_standalone_init_proc(net); if (ret < 0) goto out_proc; + net->ct.sysctl_checksum = 1; ret = nf_conntrack_standalone_init_sysctl(net); if (ret < 0) goto out_sysctl; -- cgit v1.2.3-70-g09d2 From c2a2c7e0cc39e7f9336cd67e8307a110bdba82f3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:08 +0200 Subject: netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_log_invalid sysctl Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/net/netfilter/nf_conntrack_l4proto.h | 15 +++++++-------- include/net/netns/conntrack.h | 1 + net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 6 +++--- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +- net/netfilter/nf_conntrack_core.c | 1 - net/netfilter/nf_conntrack_proto_dccp.c | 10 ++++++---- net/netfilter/nf_conntrack_proto_tcp.c | 18 ++++++++++-------- net/netfilter/nf_conntrack_proto_udp.c | 6 +++--- net/netfilter/nf_conntrack_proto_udplite.c | 8 ++++---- net/netfilter/nf_conntrack_standalone.c | 6 +++--- 11 files changed, 39 insertions(+), 36 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 97723d33c95..7f2f43c7728 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -117,20 +117,19 @@ extern int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t); extern const struct nla_policy nf_ct_port_nla_policy[]; -/* Log invalid packets */ -extern unsigned int nf_ct_log_invalid; - #ifdef CONFIG_SYSCTL #ifdef DEBUG_INVALID_PACKETS -#define LOG_INVALID(proto) \ - (nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW) +#define LOG_INVALID(net, proto) \ + ((net)->ct.sysctl_log_invalid == (proto) || \ + (net)->ct.sysctl_log_invalid == IPPROTO_RAW) #else -#define LOG_INVALID(proto) \ - ((nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW) \ +#define LOG_INVALID(net, proto) \ + (((net)->ct.sysctl_log_invalid == (proto) || \ + (net)->ct.sysctl_log_invalid == IPPROTO_RAW) \ && net_ratelimit()) #endif #else -#define LOG_INVALID(proto) 0 +#define LOG_INVALID(net, proto) 0 #endif /* CONFIG_SYSCTL */ #endif /*_NF_CONNTRACK_PROTOCOL_H*/ diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 38b6dae4d3d..503e37551b1 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -18,6 +18,7 @@ struct netns_ct { struct nf_conntrack_ecache *ecache; #endif int sysctl_checksum; + unsigned int sysctl_log_invalid; /* Log invalid packets */ #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_header; #endif diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 75871b1dd8a..af69acc1d0f 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -278,7 +278,7 @@ static ctl_table ip_ct_sysctl_table[] = { { .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, .procname = "ip_conntrack_log_invalid", - .data = &nf_ct_log_invalid, + .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index ace66cbf921..4e887922022 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -181,7 +181,7 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, /* Not enough header? */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { - if (LOG_INVALID(IPPROTO_ICMP)) + if (LOG_INVALID(net, IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "nf_ct_icmp: short packet "); return -NF_ACCEPT; @@ -190,7 +190,7 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, /* See ip_conntrack_proto_tcp.c */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { - if (LOG_INVALID(IPPROTO_ICMP)) + if (LOG_INVALID(net, IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "nf_ct_icmp: bad HW ICMP checksum "); return -NF_ACCEPT; @@ -203,7 +203,7 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, * discarded. */ if (icmph->type > NR_ICMP_TYPES) { - if (LOG_INVALID(IPPROTO_ICMP)) + if (LOG_INVALID(net, IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "nf_ct_icmp: invalid ICMP type "); return -NF_ACCEPT; diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index fa12e57749a..05726177903 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -181,7 +181,7 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { - if (LOG_INVALID(IPPROTO_ICMPV6)) + if (LOG_INVALID(net, IPPROTO_ICMPV6)) nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, "nf_ct_icmpv6: short packet "); return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1e87fa0cd3a..ade0bb3ab2e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -53,7 +53,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max); struct nf_conn nf_conntrack_untracked __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_untracked); -unsigned int nf_ct_log_invalid __read_mostly; static struct kmem_cache *nf_conntrack_cachep __read_mostly; static int nf_conntrack_hash_rnd_initted; diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 769680e68b5..8fcf1762fab 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -418,6 +418,7 @@ static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv, static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff) { + struct net *net = nf_ct_net(ct); struct dccp_hdr _dh, *dh; const char *msg; u_int8_t state; @@ -445,7 +446,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; out_invalid: - if (LOG_INVALID(IPPROTO_DCCP)) + if (LOG_INVALID(net, IPPROTO_DCCP)) nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg); return false; } @@ -463,6 +464,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, unsigned int hooknum) { + struct net *net = nf_ct_net(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; u_int8_t type, old_state, new_state; @@ -524,13 +526,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.dccp.last_pkt = type; write_unlock_bh(&dccp_lock); - if (LOG_INVALID(IPPROTO_DCCP)) + if (LOG_INVALID(net, IPPROTO_DCCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_dccp: invalid packet ignored "); return NF_ACCEPT; case CT_DCCP_INVALID: write_unlock_bh(&dccp_lock); - if (LOG_INVALID(IPPROTO_DCCP)) + if (LOG_INVALID(net, IPPROTO_DCCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_dccp: invalid state transition "); return -NF_ACCEPT; @@ -590,7 +592,7 @@ static int dccp_error(struct net *net, struct sk_buff *skb, return NF_ACCEPT; out_invalid: - if (LOG_INVALID(IPPROTO_DCCP)) + if (LOG_INVALID(net, IPPROTO_DCCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 131c9be4470..f947ec41e39 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -488,6 +488,7 @@ static bool tcp_in_window(const struct nf_conn *ct, const struct tcphdr *tcph, u_int8_t pf) { + struct net *net = nf_ct_net(ct); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; @@ -668,7 +669,7 @@ static bool tcp_in_window(const struct nf_conn *ct, if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || nf_ct_tcp_be_liberal) res = true; - if (!res && LOG_INVALID(IPPROTO_TCP)) + if (!res && LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? @@ -761,7 +762,7 @@ static int tcp_error(struct net *net, /* Smaller that minimal TCP header? */ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: short packet "); return -NF_ACCEPT; @@ -769,7 +770,7 @@ static int tcp_error(struct net *net, /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; @@ -782,7 +783,7 @@ static int tcp_error(struct net *net, /* FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; @@ -791,7 +792,7 @@ static int tcp_error(struct net *net, /* Check TCP flags. */ tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH)); if (!tcp_valid_flags[tcpflags]) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; @@ -808,6 +809,7 @@ static int tcp_packet(struct nf_conn *ct, u_int8_t pf, unsigned int hooknum) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; enum ip_conntrack_dir dir; @@ -886,7 +888,7 @@ static int tcp_packet(struct nf_conn *ct, * thus initiate a clean new session. */ write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: killing out of sync session "); nf_ct_kill(ct); @@ -899,7 +901,7 @@ static int tcp_packet(struct nf_conn *ct, segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid packet ignored "); return NF_ACCEPT; @@ -908,7 +910,7 @@ static int tcp_packet(struct nf_conn *ct, pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3d3fffe3f8b..7c2ca48698b 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -101,7 +101,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { - if (LOG_INVALID(IPPROTO_UDP)) + if (LOG_INVALID(net, IPPROTO_UDP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: short packet "); return -NF_ACCEPT; @@ -109,7 +109,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { - if (LOG_INVALID(IPPROTO_UDP)) + if (LOG_INVALID(net, IPPROTO_UDP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: truncated/malformed packet "); return -NF_ACCEPT; @@ -125,7 +125,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, * FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { - if (LOG_INVALID(IPPROTO_UDP)) + if (LOG_INVALID(net, IPPROTO_UDP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: bad UDP checksum "); return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 3d1697c4f91..d22d839e4f9 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -104,7 +104,7 @@ static int udplite_error(struct net *net, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { - if (LOG_INVALID(IPPROTO_UDPLITE)) + if (LOG_INVALID(net, IPPROTO_UDPLITE)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: short packet "); return -NF_ACCEPT; @@ -114,7 +114,7 @@ static int udplite_error(struct net *net, if (cscov == 0) cscov = udplen; else if (cscov < sizeof(*hdr) || cscov > udplen) { - if (LOG_INVALID(IPPROTO_UDPLITE)) + if (LOG_INVALID(net, IPPROTO_UDPLITE)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: invalid checksum coverage "); return -NF_ACCEPT; @@ -122,7 +122,7 @@ static int udplite_error(struct net *net, /* UDPLITE mandates checksums */ if (!hdr->check) { - if (LOG_INVALID(IPPROTO_UDPLITE)) + if (LOG_INVALID(net, IPPROTO_UDPLITE)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: checksum missing "); return -NF_ACCEPT; @@ -132,7 +132,7 @@ static int udplite_error(struct net *net, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, pf)) { - if (LOG_INVALID(IPPROTO_UDPLITE)) + if (LOG_INVALID(net, IPPROTO_UDPLITE)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: bad UDPLite checksum "); return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 5cd06637977..98106d4e89f 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -365,7 +365,7 @@ static ctl_table nf_ct_sysctl_table[] = { { .ctl_name = NET_NF_CONNTRACK_LOG_INVALID, .procname = "nf_conntrack_log_invalid", - .data = &nf_ct_log_invalid, + .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, @@ -403,8 +403,6 @@ static struct ctl_path nf_ct_path[] = { { } }; -EXPORT_SYMBOL_GPL(nf_ct_log_invalid); - static int nf_conntrack_standalone_init_sysctl(struct net *net) { struct ctl_table *table; @@ -423,6 +421,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[1].data = &net->ct.count; table[3].data = &net->ct.sysctl_checksum; + table[4].data = &net->ct.sysctl_log_invalid; net->ct.sysctl_header = register_net_sysctl_table(net, nf_net_netfilter_sysctl_path, table); @@ -473,6 +472,7 @@ static int nf_conntrack_net_init(struct net *net) if (ret < 0) goto out_proc; net->ct.sysctl_checksum = 1; + net->ct.sysctl_log_invalid = 0; ret = nf_conntrack_standalone_init_sysctl(net); if (ret < 0) goto out_sysctl; -- cgit v1.2.3-70-g09d2 From b8b8063e0d0835fb44c88d9fded2be31c9a1757e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:10 +0200 Subject: netfilter: netns nat: fix ipt_MASQUERADE in netns First, allow entry in notifier hook. Second, start conntrack cleanup in netns to which netdevice belongs. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/ipt_MASQUERADE.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 5e1c81791e5..65c811b27b7 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -119,9 +119,7 @@ static int masq_device_event(struct notifier_block *this, void *ptr) { const struct net_device *dev = ptr; - - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; + struct net *net = dev_net(dev); if (event == NETDEV_DOWN) { /* Device was downed. Search entire table for @@ -129,7 +127,7 @@ static int masq_device_event(struct notifier_block *this, and forget them. */ NF_CT_ASSERT(dev->ifindex != 0); - nf_ct_iterate_cleanup(&init_net, device_cmp, + nf_ct_iterate_cleanup(net, device_cmp, (void *)(long)dev->ifindex); } -- cgit v1.2.3-70-g09d2 From e099a173573ce1ba171092aee7bb3c72ea686e59 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:10 +0200 Subject: netfilter: netns nat: per-netns NAT table Same story as with iptable_filter, iptables_raw tables. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/net/netns/ipv4.h | 1 + net/ipv4/netfilter/nf_nat_rule.c | 40 +++++++++++++++++++++++++++++----------- 2 files changed, 30 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a6ed83853dc..b286b840493 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -38,6 +38,7 @@ struct netns_ipv4 { struct xt_table *iptable_raw; struct xt_table *arptable_filter; struct xt_table *iptable_security; + struct xt_table *nat_table; #endif int sysctl_icmp_echo_ignore_all; diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index e8b4d0d4439..0a02a8caf3b 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -33,7 +33,7 @@ static struct struct ipt_replace repl; struct ipt_standard entries[3]; struct ipt_error term; -} nat_initial_table __initdata = { +} nat_initial_table __net_initdata = { .repl = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, @@ -58,14 +58,13 @@ static struct .term = IPT_ERROR_INIT, /* ERROR */ }; -static struct xt_table __nat_table = { +static struct xt_table nat_table = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, .lock = __RW_LOCK_UNLOCKED(__nat_table.lock), .me = THIS_MODULE, .af = AF_INET, }; -static struct xt_table *nat_table; /* Source NAT */ static unsigned int ipt_snat_target(struct sk_buff *skb, @@ -194,9 +193,10 @@ int nf_nat_rule_find(struct sk_buff *skb, const struct net_device *out, struct nf_conn *ct) { + struct net *net = nf_ct_net(ct); int ret; - ret = ipt_do_table(skb, hooknum, in, out, nat_table); + ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); if (ret == NF_ACCEPT) { if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) @@ -226,14 +226,32 @@ static struct xt_target ipt_dnat_reg __read_mostly = { .family = AF_INET, }; +static int __net_init nf_nat_rule_net_init(struct net *net) +{ + net->ipv4.nat_table = ipt_register_table(net, &nat_table, + &nat_initial_table.repl); + if (IS_ERR(net->ipv4.nat_table)) + return PTR_ERR(net->ipv4.nat_table); + return 0; +} + +static void __net_exit nf_nat_rule_net_exit(struct net *net) +{ + ipt_unregister_table(net->ipv4.nat_table); +} + +static struct pernet_operations nf_nat_rule_net_ops = { + .init = nf_nat_rule_net_init, + .exit = nf_nat_rule_net_exit, +}; + int __init nf_nat_rule_init(void) { int ret; - nat_table = ipt_register_table(&init_net, &__nat_table, - &nat_initial_table.repl); - if (IS_ERR(nat_table)) - return PTR_ERR(nat_table); + ret = register_pernet_subsys(&nf_nat_rule_net_ops); + if (ret != 0) + goto out; ret = xt_register_target(&ipt_snat_reg); if (ret != 0) goto unregister_table; @@ -247,8 +265,8 @@ int __init nf_nat_rule_init(void) unregister_snat: xt_unregister_target(&ipt_snat_reg); unregister_table: - ipt_unregister_table(nat_table); - + unregister_pernet_subsys(&nf_nat_rule_net_ops); + out: return ret; } @@ -256,5 +274,5 @@ void nf_nat_rule_cleanup(void) { xt_unregister_target(&ipt_dnat_reg); xt_unregister_target(&ipt_snat_reg); - ipt_unregister_table(nat_table); + unregister_pernet_subsys(&nf_nat_rule_net_ops); } -- cgit v1.2.3-70-g09d2 From 0c4c9288ada0e6642d511ef872f10a4781a896ff Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:11 +0200 Subject: netfilter: netns nat: per-netns bysource hash Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- include/net/netns/ipv4.h | 2 ++ net/ipv4/netfilter/nf_nat_core.c | 72 +++++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 27 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index b286b840493..ece1c926b5d 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -39,6 +39,8 @@ struct netns_ipv4 { struct xt_table *arptable_filter; struct xt_table *iptable_security; struct xt_table *nat_table; + struct hlist_head *nat_bysource; + int nat_vmalloced; #endif int sysctl_icmp_echo_ignore_all; diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 5d4a5b70da2..2ac9eaf1a8c 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -37,9 +37,6 @@ static struct nf_conntrack_l3proto *l3proto __read_mostly; /* Calculated at init based on memory size */ static unsigned int nf_nat_htable_size __read_mostly; -static int nf_nat_vmalloced; - -static struct hlist_head *bysource __read_mostly; #define MAX_IP_NAT_PROTO 256 static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] @@ -145,7 +142,8 @@ same_src(const struct nf_conn *ct, /* Only called for SRC manip */ static int -find_appropriate_src(const struct nf_conntrack_tuple *tuple, +find_appropriate_src(struct net *net, + const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { @@ -155,7 +153,7 @@ find_appropriate_src(const struct nf_conntrack_tuple *tuple, const struct hlist_node *n; rcu_read_lock(); - hlist_for_each_entry_rcu(nat, n, &bysource[h], bysource) { + hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) { ct = nat->ct; if (same_src(ct, tuple)) { /* Copy source part from reply tuple. */ @@ -231,6 +229,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { + struct net *net = nf_ct_net(ct); const struct nf_nat_protocol *proto; /* 1) If this srcip/proto/src-proto-part is currently mapped, @@ -242,7 +241,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, manips not an issue. */ if (maniptype == IP_NAT_MANIP_SRC && !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { - if (find_appropriate_src(orig_tuple, tuple, range)) { + if (find_appropriate_src(net, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; @@ -283,6 +282,7 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); @@ -334,7 +334,8 @@ nf_nat_setup_info(struct nf_conn *ct, /* nf_conntrack_alter_reply might re-allocate exntension aera */ nat = nfct_nat(ct); nat->ct = ct; - hlist_add_head_rcu(&nat->bysource, &bysource[srchash]); + hlist_add_head_rcu(&nat->bysource, + &net->ipv4.nat_bysource[srchash]); spin_unlock_bh(&nf_nat_lock); } @@ -583,6 +584,40 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { .flags = NF_CT_EXT_F_PREALLOC, }; +static int __net_init nf_nat_net_init(struct net *net) +{ + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, + &net->ipv4.nat_vmalloced); + if (!net->ipv4.nat_bysource) + return -ENOMEM; + return 0; +} + +/* Clear NAT section of all conntracks, in case we're loaded again. */ +static int clean_nat(struct nf_conn *i, void *data) +{ + struct nf_conn_nat *nat = nfct_nat(i); + + if (!nat) + return 0; + memset(nat, 0, sizeof(*nat)); + i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); + return 0; +} + +static void __net_exit nf_nat_net_exit(struct net *net) +{ + nf_ct_iterate_cleanup(net, &clean_nat, NULL); + synchronize_rcu(); + nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, + nf_nat_htable_size); +} + +static struct pernet_operations nf_nat_net_ops = { + .init = nf_nat_net_init, + .exit = nf_nat_net_exit, +}; + static int __init nf_nat_init(void) { size_t i; @@ -599,12 +634,9 @@ static int __init nf_nat_init(void) /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; - bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, - &nf_nat_vmalloced); - if (!bysource) { - ret = -ENOMEM; + ret = register_pernet_subsys(&nf_nat_net_ops); + if (ret < 0) goto cleanup_extend; - } /* Sew in builtin protocols. */ spin_lock_bh(&nf_nat_lock); @@ -629,23 +661,9 @@ static int __init nf_nat_init(void) return ret; } -/* Clear NAT section of all conntracks, in case we're loaded again. */ -static int clean_nat(struct nf_conn *i, void *data) -{ - struct nf_conn_nat *nat = nfct_nat(i); - - if (!nat) - return 0; - memset(nat, 0, sizeof(*nat)); - i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); - return 0; -} - static void __exit nf_nat_cleanup(void) { - nf_ct_iterate_cleanup(&init_net, &clean_nat, NULL); - synchronize_rcu(); - nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size); + unregister_pernet_subsys(&nf_nat_net_ops); nf_ct_l3proto_put(l3proto); nf_ct_extend_unregister(&nat_extend); rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); -- cgit v1.2.3-70-g09d2 From 9174c1538fffbb5dddab99563eac6b3d8b212277 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:11 +0200 Subject: netfilter: netns nf_conntrack: fixup DNAT in netns Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/nf_nat_rule.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 0a02a8caf3b..f929352ec0e 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -91,13 +91,13 @@ static unsigned int ipt_snat_target(struct sk_buff *skb, } /* Before 2.6.11 we did implicit source NAT if required. Warn about change. */ -static void warn_if_extra_mangle(__be32 dstip, __be32 srcip) +static void warn_if_extra_mangle(struct net *net, __be32 dstip, __be32 srcip) { static int warned = 0; struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; struct rtable *rt; - if (ip_route_output_key(&init_net, &rt, &fl) != 0) + if (ip_route_output_key(net, &rt, &fl) != 0) return; if (rt->rt_src != srcip && !warned) { @@ -130,7 +130,7 @@ static unsigned int ipt_dnat_target(struct sk_buff *skb, if (hooknum == NF_INET_LOCAL_OUT && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) - warn_if_extra_mangle(ip_hdr(skb)->daddr, + warn_if_extra_mangle(dev_net(out), ip_hdr(skb)->daddr, mr->range[0].min_ip); return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); -- cgit v1.2.3-70-g09d2 From cfd6e3d74751b62b6d0844e24c911776e40a0135 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:11 +0200 Subject: netfilter: netns nat: PPTP NAT in netns Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/nf_nat_pptp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index e4bdddc6034..9eb171056c6 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -40,6 +40,7 @@ MODULE_ALIAS("ip_nat_pptp"); static void pptp_nat_expected(struct nf_conn *ct, struct nf_conntrack_expect *exp) { + struct net *net = nf_ct_net(ct); const struct nf_conn *master = ct->master; struct nf_conntrack_expect *other_exp; struct nf_conntrack_tuple t; @@ -73,7 +74,7 @@ static void pptp_nat_expected(struct nf_conn *ct, pr_debug("trying to unexpect other dir: "); nf_ct_dump_tuple_ip(&t); - other_exp = nf_ct_expect_find_get(&init_net, &t); + other_exp = nf_ct_expect_find_get(net, &t); if (other_exp) { nf_ct_unexpect_related(other_exp); nf_ct_expect_put(other_exp); -- cgit v1.2.3-70-g09d2 From 73e4022f78acdbe420e8c24a7afbd90f4c8f5077 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 8 Oct 2008 11:35:12 +0200 Subject: netfilter: split netfilter IPv4 defragmentation into a separate module Netfilter connection tracking requires all IPv4 packets to be defragmented. Both the socket match and the TPROXY target depend on this functionality, so this patch separates the Netfilter IPv4 defrag hooks into a separate module. Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- include/net/netfilter/ipv4/nf_defrag_ipv4.h | 6 ++ net/ipv4/netfilter/Kconfig | 5 ++ net/ipv4/netfilter/Makefile | 3 + net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 56 +-------------- net/ipv4/netfilter/nf_defrag_ipv4.c | 96 ++++++++++++++++++++++++++ 5 files changed, 113 insertions(+), 53 deletions(-) create mode 100644 include/net/netfilter/ipv4/nf_defrag_ipv4.h create mode 100644 net/ipv4/netfilter/nf_defrag_ipv4.c (limited to 'net/ipv4') diff --git a/include/net/netfilter/ipv4/nf_defrag_ipv4.h b/include/net/netfilter/ipv4/nf_defrag_ipv4.h new file mode 100644 index 00000000000..6b00ea38546 --- /dev/null +++ b/include/net/netfilter/ipv4/nf_defrag_ipv4.h @@ -0,0 +1,6 @@ +#ifndef _NF_DEFRAG_IPV4_H +#define _NF_DEFRAG_IPV4_H + +extern void nf_defrag_ipv4_enable(void); + +#endif /* _NF_DEFRAG_IPV4_H */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4e842d56642..07757ac8d5d 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -5,10 +5,15 @@ menu "IP: Netfilter Configuration" depends on INET && NETFILTER +config NF_DEFRAG_IPV4 + tristate + default n + config NF_CONNTRACK_IPV4 tristate "IPv4 connection tracking support (required for NAT)" depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV4 ---help--- Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 1107edbe478..5f9b650d90f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -18,6 +18,9 @@ obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o obj-$(CONFIG_NF_NAT) += nf_nat.o +# defrag +obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o + # NAT helpers (nf_conntrack) obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index af69acc1d0f..4a7c3527539 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -1,3 +1,4 @@ + /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team * @@ -24,6 +25,7 @@ #include #include #include +#include int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, struct nf_conn *ct, @@ -63,23 +65,6 @@ static int ipv4_print_tuple(struct seq_file *s, NIPQUAD(tuple->dst.u3.ip)); } -/* Returns new sk_buff, or NULL */ -static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) -{ - int err; - - skb_orphan(skb); - - local_bh_disable(); - err = ip_defrag(skb, user); - local_bh_enable(); - - if (!err) - ip_send_check(ip_hdr(skb)); - - return err; -} - static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -144,28 +129,6 @@ out: return nf_conntrack_confirm(skb); } -static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* Previously seen (loopback)? Ignore. Do this before - fragment check. */ - if (skb->nfct) - return NF_ACCEPT; - - /* Gather fragments. */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (nf_ct_ipv4_gather_frags(skb, - hooknum == NF_INET_PRE_ROUTING ? - IP_DEFRAG_CONNTRACK_IN : - IP_DEFRAG_CONNTRACK_OUT)) - return NF_STOLEN; - } - return NF_ACCEPT; -} - static unsigned int ipv4_conntrack_in(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, @@ -194,13 +157,6 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum, /* Connection tracking may drop packets, but never alters them, so make it the first hook. */ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { - { - .hook = ipv4_conntrack_defrag, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_DEFRAG, - }, { .hook = ipv4_conntrack_in, .owner = THIS_MODULE, @@ -208,13 +164,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, - { - .hook = ipv4_conntrack_defrag, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_CONNTRACK_DEFRAG, - }, { .hook = ipv4_conntrack_local, .owner = THIS_MODULE, @@ -422,6 +371,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) int ret = 0; need_conntrack(); + nf_defrag_ipv4_enable(); ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) { diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c new file mode 100644 index 00000000000..aa2c50a180f --- /dev/null +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -0,0 +1,96 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* Returns new sk_buff, or NULL */ +static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + int err; + + skb_orphan(skb); + + local_bh_disable(); + err = ip_defrag(skb, user); + local_bh_enable(); + + if (!err) + ip_send_check(ip_hdr(skb)); + + return err; +} + +static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if (skb->nfct) + return NF_ACCEPT; +#endif + + /* Gather fragments. */ + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (nf_ct_ipv4_gather_frags(skb, + hooknum == NF_INET_PRE_ROUTING ? + IP_DEFRAG_CONNTRACK_IN : + IP_DEFRAG_CONNTRACK_OUT)) + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static struct nf_hook_ops ipv4_defrag_ops[] = { + { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, + }, + { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, + }, +}; + +static int __init nf_defrag_init(void) +{ + return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); +} + +static void __exit nf_defrag_fini(void) +{ + nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); +} + +void nf_defrag_ipv4_enable(void) +{ +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); + +module_init(nf_defrag_init); +module_exit(nf_defrag_fini); + +MODULE_LICENSE("GPL"); -- cgit v1.2.3-70-g09d2 From 367c679007fa4f990eb7ee381326ec59d8148b0e Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:17 +0200 Subject: netfilter: xtables: do centralized checkentry call (1/2) It used to be that {ip,ip6,etc}_tables called extension->checkentry themselves, but this can be moved into the xtables core. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 6 ++++-- net/bridge/netfilter/ebtables.c | 24 ++++++------------------ net/ipv4/netfilter/arp_tables.c | 10 ++++------ net/ipv4/netfilter/ip_tables.c | 23 +++++++++-------------- net/ipv6/netfilter/ip6_tables.c | 23 +++++++++-------------- net/netfilter/x_tables.c | 12 ++++++++++-- net/sched/act_ipt.c | 14 +++----------- 7 files changed, 45 insertions(+), 67 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 6989b22716e..85aa42785a5 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -330,10 +330,12 @@ extern void xt_unregister_matches(struct xt_match *match, unsigned int n); extern int xt_check_match(const struct xt_match *match, unsigned short family, unsigned int size, const char *table, unsigned int hook, - unsigned short proto, int inv_proto); + unsigned short proto, int inv_proto, + const void *entry, void *matchinfo); extern int xt_check_target(const struct xt_target *target, unsigned short family, unsigned int size, const char *table, unsigned int hook, - unsigned short proto, int inv_proto); + unsigned short proto, int inv_proto, + const void *entry, void *targinfo); extern struct xt_table *xt_register_table(struct net *net, struct xt_table *table, diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 7d8ead52d25..7ee72b71d3c 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -340,15 +340,11 @@ ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e, m->u.match = match; ret = xt_check_match(match, NFPROTO_BRIDGE, m->match_size, - name, hookmask, e->ethproto, e->invflags & EBT_IPROTO); + name, hookmask, e->ethproto, e->invflags & EBT_IPROTO, + e, m->data); if (ret < 0) { module_put(match->me); return ret; - } else if (match->checkentry != NULL && - !match->checkentry(name, e, NULL, m->data, hookmask)) { - module_put(match->me); - BUGPRINT("match->check failed\n"); - return -EINVAL; } (*cnt)++; @@ -377,15 +373,11 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e, w->u.watcher = watcher; ret = xt_check_target(watcher, NFPROTO_BRIDGE, w->watcher_size, - name, hookmask, e->ethproto, e->invflags & EBT_IPROTO); + name, hookmask, e->ethproto, e->invflags & EBT_IPROTO, + e, w->data); if (ret < 0) { module_put(watcher->me); return ret; - } else if (watcher->checkentry != NULL && - !watcher->checkentry(name, e, NULL, w->data, hookmask)) { - module_put(watcher->me); - BUGPRINT("watcher->check failed\n"); - return -EINVAL; } (*cnt)++; @@ -692,15 +684,11 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, } ret = xt_check_target(target, NFPROTO_BRIDGE, t->target_size, - name, hookmask, e->ethproto, e->invflags & EBT_IPROTO); + name, hookmask, e->ethproto, e->invflags & EBT_IPROTO, + e, t->data); if (ret < 0) { module_put(target->me); goto cleanup_watchers; - } else if (t->u.target->checkentry && - !t->u.target->checkentry(name, e, NULL, t->data, hookmask)) { - module_put(t->u.target->me); - ret = -EINVAL; - goto cleanup_watchers; } (*cnt)++; return 0; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b4a9a1799c9..ae525a9afbe 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -465,15 +465,13 @@ static inline int check_target(struct arpt_entry *e, const char *name) ret = xt_check_target(target, NFPROTO_ARP, t->u.target_size - sizeof(*t), - name, e->comefrom, 0, 0); - if (!ret && t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, t->data, - e->comefrom)) { + name, e->comefrom, 0, 0, e, t->data); + if (ret < 0) { duprintf("arp_tables: check failed for `%s'.\n", t->u.kernel.target->name); - ret = -EINVAL; + return ret; } - return ret; + return 0; } static inline int diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4e7c719445c..b4c74a7a807 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -616,17 +616,14 @@ check_match(struct ipt_entry_match *m, const char *name, match = m->u.kernel.match; ret = xt_check_match(match, AF_INET, m->u.match_size - sizeof(*m), name, hookmask, ip->proto, - ip->invflags & IPT_INV_PROTO); - if (!ret && m->u.kernel.match->checkentry - && !m->u.kernel.match->checkentry(name, ip, match, m->data, - hookmask)) { + ip->invflags & IPT_INV_PROTO, ip, m->data); + if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", m->u.kernel.match->name); - ret = -EINVAL; + return ret; } - if (!ret) - (*i)++; - return ret; + ++*i; + return 0; } static int @@ -668,15 +665,13 @@ static int check_target(struct ipt_entry *e, const char *name) target = t->u.kernel.target; ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t), name, e->comefrom, e->ip.proto, - e->ip.invflags & IPT_INV_PROTO); - if (!ret && t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, t->data, - e->comefrom)) { + e->ip.invflags & IPT_INV_PROTO, e, t->data); + if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", t->u.kernel.target->name); - ret = -EINVAL; + return ret; } - return ret; + return 0; } static int diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 0b4557e0343..12c41b8d365 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -642,17 +642,14 @@ static int check_match(struct ip6t_entry_match *m, const char *name, match = m->u.kernel.match; ret = xt_check_match(match, AF_INET6, m->u.match_size - sizeof(*m), name, hookmask, ipv6->proto, - ipv6->invflags & IP6T_INV_PROTO); - if (!ret && m->u.kernel.match->checkentry - && !m->u.kernel.match->checkentry(name, ipv6, match, m->data, - hookmask)) { + ipv6->invflags & IP6T_INV_PROTO, ipv6, m->data); + if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", m->u.kernel.match->name); - ret = -EINVAL; + return ret; } - if (!ret) - (*i)++; - return ret; + ++*i; + return 0; } static int @@ -694,15 +691,13 @@ static int check_target(struct ip6t_entry *e, const char *name) target = t->u.kernel.target; ret = xt_check_target(target, AF_INET6, t->u.target_size - sizeof(*t), name, e->comefrom, e->ipv6.proto, - e->ipv6.invflags & IP6T_INV_PROTO); - if (!ret && t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, t->data, - e->comefrom)) { + e->ipv6.invflags & IP6T_INV_PROTO, e, t->data); + if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", t->u.kernel.target->name); - ret = -EINVAL; + return ret; } - return ret; + return 0; } static int diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 3b1fc40cc27..d1f2fb3e8f2 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -323,7 +323,8 @@ EXPORT_SYMBOL_GPL(xt_find_revision); int xt_check_match(const struct xt_match *match, unsigned short family, unsigned int size, const char *table, unsigned int hook_mask, - unsigned short proto, int inv_proto) + unsigned short proto, int inv_proto, const void *entry, + void *matchinfo) { if (XT_ALIGN(match->matchsize) != size && match->matchsize != -1) { @@ -351,6 +352,9 @@ int xt_check_match(const struct xt_match *match, unsigned short family, xt_prefix[family], match->name, match->proto); return -EINVAL; } + if (match->checkentry != NULL && + !match->checkentry(table, entry, match, matchinfo, hook_mask)) + return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(xt_check_match); @@ -469,7 +473,8 @@ EXPORT_SYMBOL_GPL(xt_compat_match_to_user); int xt_check_target(const struct xt_target *target, unsigned short family, unsigned int size, const char *table, unsigned int hook_mask, - unsigned short proto, int inv_proto) + unsigned short proto, int inv_proto, const void *entry, + void *targinfo) { if (XT_ALIGN(target->targetsize) != size) { printk("%s_tables: %s target: invalid size %Zu != %u\n", @@ -493,6 +498,9 @@ int xt_check_target(const struct xt_target *target, unsigned short family, xt_prefix[family], target->name, target->proto); return -EINVAL; } + if (target->checkentry != NULL && + !target->checkentry(table, entry, target, targinfo, hook_mask)) + return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(xt_check_target); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d1263b3c96c..79ea19375ca 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -51,20 +51,12 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int t->u.kernel.target = target; ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t), - table, hook, 0, 0); - if (ret) { + table, hook, 0, 0, NULL, t->data); + if (ret < 0) { module_put(t->u.kernel.target->me); return ret; } - if (t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(table, NULL, - t->u.kernel.target, t->data, - hook)) { - module_put(t->u.kernel.target->me); - ret = -EINVAL; - } - - return ret; + return 0; } static void ipt_destroy_target(struct ipt_entry_target *t) -- cgit v1.2.3-70-g09d2 From aba0d34800d7f56493b4d5548cc06498a4d69124 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:17 +0200 Subject: netfilter: xtables: sort extensions alphabetically in Kconfig Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/Kconfig | 78 +++++++++++++++++------------------ net/ipv6/netfilter/Kconfig | 44 ++++++++++---------- net/netfilter/Kconfig | 100 ++++++++++++++++++++++----------------------- 3 files changed, 111 insertions(+), 111 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 07757ac8d5d..087b8290684 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -62,15 +62,16 @@ config IP_NF_IPTABLES To compile it as a module, choose M here. If unsure, say N. # The matches. -config IP_NF_MATCH_ECN - tristate '"ecn" match support' +config IP_NF_MATCH_ADDRTYPE + tristate '"addrtype" address type match support' depends on IP_NF_IPTABLES depends on NETFILTER_ADVANCED help - This option adds a `ECN' match, which allows you to match against - the IPv4 and TCP header ECN fields. + This option allows you to match what routing thinks of an address, + eg. UNICAST, LOCAL, BROADCAST, ... - To compile it as a module, choose M here. If unsure, say N. + If you want to compile it as a module, say M here and read + . If unsure, say `N'. config IP_NF_MATCH_AH tristate '"ah" match support' @@ -82,26 +83,25 @@ config IP_NF_MATCH_AH To compile it as a module, choose M here. If unsure, say N. -config IP_NF_MATCH_TTL - tristate '"ttl" match support' +config IP_NF_MATCH_ECN + tristate '"ecn" match support' depends on IP_NF_IPTABLES depends on NETFILTER_ADVANCED help - This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user - to match packets by their TTL value. + This option adds a `ECN' match, which allows you to match against + the IPv4 and TCP header ECN fields. To compile it as a module, choose M here. If unsure, say N. -config IP_NF_MATCH_ADDRTYPE - tristate '"addrtype" address type match support' +config IP_NF_MATCH_TTL + tristate '"ttl" match support' depends on IP_NF_IPTABLES depends on NETFILTER_ADVANCED help - This option allows you to match what routing thinks of an address, - eg. UNICAST, LOCAL, BROADCAST, ... + This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user + to match packets by their TTL value. - If you want to compile it as a module, say M here and read - . If unsure, say `N'. + To compile it as a module, choose M here. If unsure, say N. # `filter', generic and specific targets config IP_NF_FILTER @@ -186,26 +186,26 @@ config IP_NF_TARGET_MASQUERADE To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_REDIRECT - tristate "REDIRECT target support" +config IP_NF_TARGET_NETMAP + tristate "NETMAP target support" depends on NF_NAT depends on NETFILTER_ADVANCED help - REDIRECT is a special case of NAT: all incoming connections are - mapped onto the incoming interface's address, causing the packets to - come to the local machine instead of passing through. This is - useful for transparent proxies. + NETMAP is an implementation of static 1:1 NAT mapping of network + addresses. It maps the network address part, while keeping the host + address part intact. To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_NETMAP - tristate "NETMAP target support" +config IP_NF_TARGET_REDIRECT + tristate "REDIRECT target support" depends on NF_NAT depends on NETFILTER_ADVANCED help - NETMAP is an implementation of static 1:1 NAT mapping of network - addresses. It maps the network address part, while keeping the host - address part intact. + REDIRECT is a special case of NAT: all incoming connections are + mapped onto the incoming interface's address, causing the packets to + come to the local machine instead of passing through. This is + useful for transparent proxies. To compile it as a module, choose M here. If unsure, say N. @@ -300,6 +300,19 @@ config IP_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. +config IP_NF_TARGET_CLUSTERIP + tristate "CLUSTERIP target support (EXPERIMENTAL)" + depends on IP_NF_MANGLE && EXPERIMENTAL + depends on NF_CONNTRACK_IPV4 + depends on NETFILTER_ADVANCED + select NF_CONNTRACK_MARK + help + The CLUSTERIP target allows you to build load-balancing clusters of + network servers without having a dedicated load-balancing + router/server/switch. + + To compile it as a module, choose M here. If unsure, say N. + config IP_NF_TARGET_ECN tristate "ECN target support" depends on IP_NF_MANGLE @@ -330,19 +343,6 @@ config IP_NF_TARGET_TTL To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_CLUSTERIP - tristate "CLUSTERIP target support (EXPERIMENTAL)" - depends on IP_NF_MANGLE && EXPERIMENTAL - depends on NF_CONNTRACK_IPV4 - depends on NETFILTER_ADVANCED - select NF_CONNTRACK_MARK - help - The CLUSTERIP target allows you to build load-balancing clusters of - network servers without having a dedicated load-balancing - router/server/switch. - - To compile it as a module, choose M here. If unsure, say N. - # raw + specific targets config IP_NF_RAW tristate 'raw table support (required for NOTRACK/TRACE)' diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index fee881bd31f..91ffba08c29 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -56,23 +56,23 @@ config IP6_NF_IPTABLES To compile it as a module, choose M here. If unsure, say N. # The simple matches. -config IP6_NF_MATCH_RT - tristate '"rt" Routing header match support' +config IP6_NF_MATCH_AH + tristate '"ah" match support' depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help - rt matching allows you to match packets based on the routing - header of the packet. + This module allows one to match AH packets. To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_MATCH_OPTS - tristate '"hbh" hop-by-hop and "dst" opts header match support' +config IP6_NF_MATCH_EUI64 + tristate '"eui64" address check' depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help - This allows one to match packets based on the hop-by-hop - and destination options headers of a packet. + This module performs checking on the IPv6 source address + Compares the last 64 bits with the EUI64 (delivered + from the MAC address) address To compile it as a module, choose M here. If unsure, say N. @@ -86,6 +86,16 @@ config IP6_NF_MATCH_FRAG To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_MATCH_OPTS + tristate '"hbh" hop-by-hop and "dst" opts header match support' + depends on IP6_NF_IPTABLES + depends on NETFILTER_ADVANCED + help + This allows one to match packets based on the hop-by-hop + and destination options headers of a packet. + + To compile it as a module, choose M here. If unsure, say N. + config IP6_NF_MATCH_HL tristate '"hl" match support' depends on IP6_NF_IPTABLES @@ -106,15 +116,6 @@ config IP6_NF_MATCH_IPV6HEADER To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_MATCH_AH - tristate '"ah" match support' - depends on IP6_NF_IPTABLES - depends on NETFILTER_ADVANCED - help - This module allows one to match AH packets. - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_MATCH_MH tristate '"mh" match support' depends on IP6_NF_IPTABLES @@ -124,14 +125,13 @@ config IP6_NF_MATCH_MH To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_MATCH_EUI64 - tristate '"eui64" address check' +config IP6_NF_MATCH_RT + tristate '"rt" Routing header match support' depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help - This module performs checking on the IPv6 source address - Compares the last 64 bits with the EUI64 (delivered - from the MAC address) address + rt matching allows you to match packets based on the routing + header of the packet. To compile it as a module, choose M here. If unsure, say N. diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index de18bba619f..9ad74e8bc5b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -340,6 +340,18 @@ config NETFILTER_XT_TARGET_CONNMARK . The module will be called ipt_CONNMARK.ko. If unsure, say `N'. +config NETFILTER_XT_TARGET_CONNSECMARK + tristate '"CONNSECMARK" target support' + depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK + default m if NETFILTER_ADVANCED=n + help + The CONNSECMARK target copies security markings from packets + to connections, and restores security markings from connections + to packets (if the packets are not already marked). This would + normally be used in conjunction with the SECMARK target. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_DSCP tristate '"DSCP" and "TOS" target support' depends on NETFILTER_XTABLES @@ -371,18 +383,6 @@ config NETFILTER_XT_TARGET_MARK To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_TARGET_NFQUEUE - tristate '"NFQUEUE" target Support' - depends on NETFILTER_XTABLES - depends on NETFILTER_ADVANCED - help - This target replaced the old obsolete QUEUE target. - - As opposed to QUEUE, it supports 65535 different queues, - not just one. - - To compile it as a module, choose M here. If unsure, say N. - config NETFILTER_XT_TARGET_NFLOG tristate '"NFLOG" target support' depends on NETFILTER_XTABLES @@ -395,6 +395,18 @@ config NETFILTER_XT_TARGET_NFLOG To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_NFQUEUE + tristate '"NFQUEUE" target Support' + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + help + This target replaced the old obsolete QUEUE target. + + As opposed to QUEUE, it supports 65535 different queues, + not just one. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_NOTRACK tristate '"NOTRACK" target support' depends on NETFILTER_XTABLES @@ -459,18 +471,6 @@ config NETFILTER_XT_TARGET_SECMARK To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_TARGET_CONNSECMARK - tristate '"CONNSECMARK" target support' - depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK - default m if NETFILTER_ADVANCED=n - help - The CONNSECMARK target copies security markings from packets - to connections, and restores security markings from connections - to packets (if the packets are not already marked). This would - normally be used in conjunction with the SECMARK target. - - To compile it as a module, choose M here. If unsure, say N. - config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' depends on NETFILTER_XTABLES && (IPV6 || IPV6=n) @@ -607,6 +607,21 @@ config NETFILTER_XT_MATCH_ESP To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_HASHLIMIT + tristate '"hashlimit" match support' + depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on NETFILTER_ADVANCED + help + This option adds a `hashlimit' match. + + As opposed to `limit', this match dynamically creates a hash table + of limit buckets, based on your selection of source/destination + addresses and/or ports. + + It enables you to express policies like `10kpps for any given + destination address' or `500pps from any given source address' + with a single rule. + config NETFILTER_XT_MATCH_HELPER tristate '"helper" match support' depends on NETFILTER_XTABLES @@ -671,6 +686,17 @@ config NETFILTER_XT_MATCH_MARK To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_MULTIPORT + tristate '"multiport" Multiple port match support' + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + help + Multiport matching allows you to match TCP or UDP packets based on + a series of source or destination ports: normally a rule can only + match a single range of ports. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_OWNER tristate '"owner" match support' depends on NETFILTER_XTABLES @@ -691,17 +717,6 @@ config NETFILTER_XT_MATCH_POLICY To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_MATCH_MULTIPORT - tristate '"multiport" Multiple port match support' - depends on NETFILTER_XTABLES - depends on NETFILTER_ADVANCED - help - Multiport matching allows you to match TCP or UDP packets based on - a series of source or destination ports: normally a rule can only - match a single range of ports. - - To compile it as a module, choose M here. If unsure, say N. - config NETFILTER_XT_MATCH_PHYSDEV tristate '"physdev" match support' depends on NETFILTER_XTABLES && BRIDGE && BRIDGE_NETFILTER @@ -884,20 +899,5 @@ config NETFILTER_XT_MATCH_U32 Details and examples are in the kernel module source. -config NETFILTER_XT_MATCH_HASHLIMIT - tristate '"hashlimit" match support' - depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) - depends on NETFILTER_ADVANCED - help - This option adds a `hashlimit' match. - - As opposed to `limit', this match dynamically creates a hash table - of limit buckets, based on your selection of source/destination - addresses and/or ports. - - It enables you to express policies like `10kpps for any given - destination address' or `500pps from any given source address' - with a single rule. - endmenu -- cgit v1.2.3-70-g09d2 From c2df73de246ae75705af8ceed4f385b261dea108 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:18 +0200 Subject: netfilter: xtables: use "if" blocks in Kconfig Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/Kconfig | 36 +++++++++----------- net/ipv6/netfilter/Kconfig | 17 +++------- net/netfilter/Kconfig | 84 ++++++++++++---------------------------------- 3 files changed, 41 insertions(+), 96 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 087b8290684..3816e1dc929 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -61,10 +61,11 @@ config IP_NF_IPTABLES To compile it as a module, choose M here. If unsure, say N. +if IP_NF_IPTABLES + # The matches. config IP_NF_MATCH_ADDRTYPE tristate '"addrtype" address type match support' - depends on IP_NF_IPTABLES depends on NETFILTER_ADVANCED help This option allows you to match what routing thinks of an address, @@ -75,7 +76,6 @@ config IP_NF_MATCH_ADDRTYPE config IP_NF_MATCH_AH tristate '"ah" match support' - depends on IP_NF_IPTABLES depends on NETFILTER_ADVANCED help This match extension allows you to match a range of SPIs @@ -85,7 +85,6 @@ config IP_NF_MATCH_AH config IP_NF_MATCH_ECN tristate '"ecn" match support' - depends on IP_NF_IPTABLES depends on NETFILTER_ADVANCED help This option adds a `ECN' match, which allows you to match against @@ -95,7 +94,6 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_TTL tristate '"ttl" match support' - depends on IP_NF_IPTABLES depends on NETFILTER_ADVANCED help This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user @@ -106,7 +104,6 @@ config IP_NF_MATCH_TTL # `filter', generic and specific targets config IP_NF_FILTER tristate "Packet filtering" - depends on IP_NF_IPTABLES default m if NETFILTER_ADVANCED=n help Packet filtering defines a table `filter', which has a series of @@ -128,7 +125,6 @@ config IP_NF_TARGET_REJECT config IP_NF_TARGET_LOG tristate "LOG target support" - depends on IP_NF_IPTABLES default m if NETFILTER_ADVANCED=n help This option adds a `LOG' target, which allows you to create rules in @@ -138,7 +134,6 @@ config IP_NF_TARGET_LOG config IP_NF_TARGET_ULOG tristate "ULOG target support" - depends on IP_NF_IPTABLES default m if NETFILTER_ADVANCED=n ---help--- @@ -159,7 +154,7 @@ config IP_NF_TARGET_ULOG # NAT + specific targets: nf_conntrack config NF_NAT tristate "Full NAT" - depends on IP_NF_IPTABLES && NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK_IPV4 default m if NETFILTER_ADVANCED=n help The Full NAT option allows masquerading, port forwarding and other @@ -254,44 +249,43 @@ config NF_NAT_PROTO_SCTP config NF_NAT_FTP tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_FTP config NF_NAT_IRC tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_IRC config NF_NAT_TFTP tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_TFTP config NF_NAT_AMANDA tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_AMANDA config NF_NAT_PPTP tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_PPTP select NF_NAT_PROTO_GRE config NF_NAT_H323 tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_H323 config NF_NAT_SIP tristate - depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT + depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_SIP # mangle + specific targets config IP_NF_MANGLE tristate "Packet mangling" - depends on IP_NF_IPTABLES default m if NETFILTER_ADVANCED=n help This option adds a `mangle' table to iptables: see the man page for @@ -346,7 +340,6 @@ config IP_NF_TARGET_TTL # raw + specific targets config IP_NF_RAW tristate 'raw table support (required for NOTRACK/TRACE)' - depends on IP_NF_IPTABLES depends on NETFILTER_ADVANCED help This option adds a `raw' table to iptables. This table is the very @@ -359,7 +352,6 @@ config IP_NF_RAW # security table for MAC policy config IP_NF_SECURITY tristate "Security table" - depends on IP_NF_IPTABLES depends on SECURITY depends on NETFILTER_ADVANCED help @@ -368,6 +360,8 @@ config IP_NF_SECURITY If unsure, say N. +endif # IP_NF_IPTABLES + # ARP tables config IP_NF_ARPTABLES tristate "ARP tables support" @@ -380,9 +374,10 @@ config IP_NF_ARPTABLES To compile it as a module, choose M here. If unsure, say N. +if IP_NF_ARPTABLES + config IP_NF_ARPFILTER tristate "ARP packet filtering" - depends on IP_NF_ARPTABLES help ARP packet filtering defines a table `filter', which has a series of rules for simple ARP packet filtering at local input and @@ -393,10 +388,11 @@ config IP_NF_ARPFILTER config IP_NF_ARP_MANGLE tristate "ARP payload mangling" - depends on IP_NF_ARPTABLES help Allows altering the ARP packet payload: source and destination hardware and network addresses. +endif # IP_NF_ARPTABLES + endmenu diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 91ffba08c29..53ea512c460 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -55,10 +55,11 @@ config IP6_NF_IPTABLES To compile it as a module, choose M here. If unsure, say N. +if IP6_NF_IPTABLES + # The simple matches. config IP6_NF_MATCH_AH tristate '"ah" match support' - depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help This module allows one to match AH packets. @@ -67,7 +68,6 @@ config IP6_NF_MATCH_AH config IP6_NF_MATCH_EUI64 tristate '"eui64" address check' - depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help This module performs checking on the IPv6 source address @@ -78,7 +78,6 @@ config IP6_NF_MATCH_EUI64 config IP6_NF_MATCH_FRAG tristate '"frag" Fragmentation header match support' - depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help frag matching allows you to match packets based on the fragmentation @@ -88,7 +87,6 @@ config IP6_NF_MATCH_FRAG config IP6_NF_MATCH_OPTS tristate '"hbh" hop-by-hop and "dst" opts header match support' - depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help This allows one to match packets based on the hop-by-hop @@ -98,7 +96,6 @@ config IP6_NF_MATCH_OPTS config IP6_NF_MATCH_HL tristate '"hl" match support' - depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help HL matching allows you to match packets based on the hop @@ -108,7 +105,6 @@ config IP6_NF_MATCH_HL config IP6_NF_MATCH_IPV6HEADER tristate '"ipv6header" IPv6 Extension Headers Match' - depends on IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n help This module allows one to match packets based upon @@ -118,7 +114,6 @@ config IP6_NF_MATCH_IPV6HEADER config IP6_NF_MATCH_MH tristate '"mh" match support' - depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help This module allows one to match MH packets. @@ -127,7 +122,6 @@ config IP6_NF_MATCH_MH config IP6_NF_MATCH_RT tristate '"rt" Routing header match support' - depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help rt matching allows you to match packets based on the routing @@ -138,7 +132,6 @@ config IP6_NF_MATCH_RT # The targets config IP6_NF_TARGET_LOG tristate "LOG target support" - depends on IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n help This option adds a `LOG' target, which allows you to create rules in @@ -148,7 +141,6 @@ config IP6_NF_TARGET_LOG config IP6_NF_FILTER tristate "Packet filtering" - depends on IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n help Packet filtering defines a table `filter', which has a series of @@ -170,7 +162,6 @@ config IP6_NF_TARGET_REJECT config IP6_NF_MANGLE tristate "Packet mangling" - depends on IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n help This option adds a `mangle' table to iptables: see the man page for @@ -198,7 +189,6 @@ config IP6_NF_TARGET_HL config IP6_NF_RAW tristate 'raw table support (required for TRACE)' - depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help This option adds a `raw' table to ip6tables. This table is the very @@ -211,7 +201,6 @@ config IP6_NF_RAW # security table for MAC policy config IP6_NF_SECURITY tristate "Security table" - depends on IP6_NF_IPTABLES depends on SECURITY depends on NETFILTER_ADVANCED help @@ -220,5 +209,7 @@ config IP6_NF_SECURITY If unsure, say N. +endif # IP6_NF_IPTABLES + endmenu diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 9ad74e8bc5b..899e78051d8 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -38,10 +38,11 @@ config NF_CONNTRACK To compile it as a module, choose M here. If unsure, say N. +if NF_CONNTRACK + config NF_CT_ACCT bool "Connection tracking flow accounting" depends on NETFILTER_ADVANCED - depends on NF_CONNTRACK help If this option is enabled, the connection tracking code will keep per-flow packet and byte counters. @@ -63,7 +64,6 @@ config NF_CT_ACCT config NF_CONNTRACK_MARK bool 'Connection mark tracking support' depends on NETFILTER_ADVANCED - depends on NF_CONNTRACK help This option enables support for connection marks, used by the `CONNMARK' target and `connmark' match. Similar to the mark value @@ -72,7 +72,7 @@ config NF_CONNTRACK_MARK config NF_CONNTRACK_SECMARK bool 'Connection tracking security mark support' - depends on NF_CONNTRACK && NETWORK_SECMARK + depends on NETWORK_SECMARK default m if NETFILTER_ADVANCED=n help This option enables security markings to be applied to @@ -85,7 +85,6 @@ config NF_CONNTRACK_SECMARK config NF_CONNTRACK_EVENTS bool "Connection tracking events" - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED help If this option is enabled, the connection tracking code will @@ -96,7 +95,7 @@ config NF_CONNTRACK_EVENTS config NF_CT_PROTO_DCCP tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)' - depends on EXPERIMENTAL && NF_CONNTRACK + depends on EXPERIMENTAL depends on NETFILTER_ADVANCED default IP_DCCP help @@ -107,11 +106,10 @@ config NF_CT_PROTO_DCCP config NF_CT_PROTO_GRE tristate - depends on NF_CONNTRACK config NF_CT_PROTO_SCTP tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' - depends on EXPERIMENTAL && NF_CONNTRACK + depends on EXPERIMENTAL depends on NETFILTER_ADVANCED default IP_SCTP help @@ -123,7 +121,6 @@ config NF_CT_PROTO_SCTP config NF_CT_PROTO_UDPLITE tristate 'UDP-Lite protocol connection tracking support' - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED help With this option enabled, the layer 3 independent connection @@ -134,7 +131,6 @@ config NF_CT_PROTO_UDPLITE config NF_CONNTRACK_AMANDA tristate "Amanda backup protocol support" - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select TEXTSEARCH select TEXTSEARCH_KMP @@ -150,7 +146,6 @@ config NF_CONNTRACK_AMANDA config NF_CONNTRACK_FTP tristate "FTP protocol support" - depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n help Tracking FTP connections is problematic: special helpers are @@ -165,7 +160,7 @@ config NF_CONNTRACK_FTP config NF_CONNTRACK_H323 tristate "H.323 protocol support" - depends on NF_CONNTRACK && (IPV6 || IPV6=n) + depends on (IPV6 || IPV6=n) depends on NETFILTER_ADVANCED help H.323 is a VoIP signalling protocol from ITU-T. As one of the most @@ -185,7 +180,6 @@ config NF_CONNTRACK_H323 config NF_CONNTRACK_IRC tristate "IRC protocol support" - depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n help There is a commonly-used extension to IRC called @@ -201,7 +195,6 @@ config NF_CONNTRACK_IRC config NF_CONNTRACK_NETBIOS_NS tristate "NetBIOS name service protocol support" - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED help NetBIOS name service requests are sent as broadcast messages from an @@ -221,7 +214,6 @@ config NF_CONNTRACK_NETBIOS_NS config NF_CONNTRACK_PPTP tristate "PPtP protocol support" - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CT_PROTO_GRE help @@ -241,7 +233,7 @@ config NF_CONNTRACK_PPTP config NF_CONNTRACK_SANE tristate "SANE protocol support (EXPERIMENTAL)" - depends on EXPERIMENTAL && NF_CONNTRACK + depends on EXPERIMENTAL depends on NETFILTER_ADVANCED help SANE is a protocol for remote access to scanners as implemented @@ -255,7 +247,6 @@ config NF_CONNTRACK_SANE config NF_CONNTRACK_SIP tristate "SIP protocol support" - depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n help SIP is an application-layer control protocol that can establish, @@ -268,7 +259,6 @@ config NF_CONNTRACK_SIP config NF_CONNTRACK_TFTP tristate "TFTP protocol support" - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED help TFTP connection tracking helper, this is required depending @@ -280,7 +270,6 @@ config NF_CONNTRACK_TFTP config NF_CT_NETLINK tristate 'Connection tracking netlink interface' - depends on NF_CONNTRACK select NETFILTER_NETLINK depends on NF_NAT=n || NF_NAT default m if NETFILTER_ADVANCED=n @@ -302,6 +291,8 @@ config NETFILTER_TPROXY To compile it as a module, choose M here. If unsure, say N. +endif # NF_CONNTRACK + config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n @@ -309,11 +300,12 @@ config NETFILTER_XTABLES This is required if you intend to use any of ip_tables, ip6_tables or arp_tables. +if NETFILTER_XTABLES + # alphabetically ordered list of targets config NETFILTER_XT_TARGET_CLASSIFY tristate '"CLASSIFY" target support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `CLASSIFY' target, which enables the user to set @@ -326,7 +318,6 @@ config NETFILTER_XT_TARGET_CLASSIFY config NETFILTER_XT_TARGET_CONNMARK tristate '"CONNMARK" target support' - depends on NETFILTER_XTABLES depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NF_CONNTRACK depends on NETFILTER_ADVANCED @@ -342,7 +333,7 @@ config NETFILTER_XT_TARGET_CONNMARK config NETFILTER_XT_TARGET_CONNSECMARK tristate '"CONNSECMARK" target support' - depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK + depends on NF_CONNTRACK && NF_CONNTRACK_SECMARK default m if NETFILTER_ADVANCED=n help The CONNSECMARK target copies security markings from packets @@ -354,7 +345,6 @@ config NETFILTER_XT_TARGET_CONNSECMARK config NETFILTER_XT_TARGET_DSCP tristate '"DSCP" and "TOS" target support' - depends on NETFILTER_XTABLES depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NETFILTER_ADVANCED help @@ -371,7 +361,6 @@ config NETFILTER_XT_TARGET_DSCP config NETFILTER_XT_TARGET_MARK tristate '"MARK" target support' - depends on NETFILTER_XTABLES default m if NETFILTER_ADVANCED=n help This option adds a `MARK' target, which allows you to create rules @@ -385,7 +374,6 @@ config NETFILTER_XT_TARGET_MARK config NETFILTER_XT_TARGET_NFLOG tristate '"NFLOG" target support' - depends on NETFILTER_XTABLES default m if NETFILTER_ADVANCED=n help This option enables the NFLOG target, which allows to LOG @@ -397,7 +385,6 @@ config NETFILTER_XT_TARGET_NFLOG config NETFILTER_XT_TARGET_NFQUEUE tristate '"NFQUEUE" target Support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This target replaced the old obsolete QUEUE target. @@ -409,7 +396,6 @@ config NETFILTER_XT_TARGET_NFQUEUE config NETFILTER_XT_TARGET_NOTRACK tristate '"NOTRACK" target support' - depends on NETFILTER_XTABLES depends on IP_NF_RAW || IP6_NF_RAW depends on NF_CONNTRACK depends on NETFILTER_ADVANCED @@ -424,7 +410,6 @@ config NETFILTER_XT_TARGET_NOTRACK config NETFILTER_XT_TARGET_RATEEST tristate '"RATEEST" target support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `RATEEST' target, which allows to measure @@ -450,7 +435,6 @@ config NETFILTER_XT_TARGET_TPROXY config NETFILTER_XT_TARGET_TRACE tristate '"TRACE" target support' - depends on NETFILTER_XTABLES depends on IP_NF_RAW || IP6_NF_RAW depends on NETFILTER_ADVANCED help @@ -463,7 +447,7 @@ config NETFILTER_XT_TARGET_TRACE config NETFILTER_XT_TARGET_SECMARK tristate '"SECMARK" target support' - depends on NETFILTER_XTABLES && NETWORK_SECMARK + depends on NETWORK_SECMARK default m if NETFILTER_ADVANCED=n help The SECMARK target allows security marking of network @@ -473,7 +457,7 @@ config NETFILTER_XT_TARGET_SECMARK config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' - depends on NETFILTER_XTABLES && (IPV6 || IPV6=n) + depends on (IPV6 || IPV6=n) default m if NETFILTER_ADVANCED=n ---help--- This option adds a `TCPMSS' target, which allows you to alter the @@ -500,7 +484,7 @@ config NETFILTER_XT_TARGET_TCPMSS config NETFILTER_XT_TARGET_TCPOPTSTRIP tristate '"TCPOPTSTRIP" target support (EXPERIMENTAL)' - depends on EXPERIMENTAL && NETFILTER_XTABLES + depends on EXPERIMENTAL depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NETFILTER_ADVANCED help @@ -509,7 +493,6 @@ config NETFILTER_XT_TARGET_TCPOPTSTRIP config NETFILTER_XT_MATCH_COMMENT tristate '"comment" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `comment' dummy-match, which allows you to put @@ -520,7 +503,6 @@ config NETFILTER_XT_MATCH_COMMENT config NETFILTER_XT_MATCH_CONNBYTES tristate '"connbytes" per-connection counter match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CT_ACCT @@ -533,7 +515,6 @@ config NETFILTER_XT_MATCH_CONNBYTES config NETFILTER_XT_MATCH_CONNLIMIT tristate '"connlimit" match support"' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK depends on NETFILTER_ADVANCED ---help--- @@ -542,7 +523,6 @@ config NETFILTER_XT_MATCH_CONNLIMIT config NETFILTER_XT_MATCH_CONNMARK tristate '"connmark" connection mark match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK @@ -556,7 +536,6 @@ config NETFILTER_XT_MATCH_CONNMARK config NETFILTER_XT_MATCH_CONNTRACK tristate '"conntrack" connection tracking match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n help @@ -570,7 +549,6 @@ config NETFILTER_XT_MATCH_CONNTRACK config NETFILTER_XT_MATCH_DCCP tristate '"dccp" protocol match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED default IP_DCCP help @@ -583,7 +561,6 @@ config NETFILTER_XT_MATCH_DCCP config NETFILTER_XT_MATCH_DSCP tristate '"dscp" and "tos" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `DSCP' match, which allows you to match against @@ -599,7 +576,6 @@ config NETFILTER_XT_MATCH_DSCP config NETFILTER_XT_MATCH_ESP tristate '"esp" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This match extension allows you to match a range of SPIs @@ -609,7 +585,7 @@ config NETFILTER_XT_MATCH_ESP config NETFILTER_XT_MATCH_HASHLIMIT tristate '"hashlimit" match support' - depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) depends on NETFILTER_ADVANCED help This option adds a `hashlimit' match. @@ -624,7 +600,6 @@ config NETFILTER_XT_MATCH_HASHLIMIT config NETFILTER_XT_MATCH_HELPER tristate '"helper" match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK depends on NETFILTER_ADVANCED help @@ -635,7 +610,6 @@ config NETFILTER_XT_MATCH_HELPER config NETFILTER_XT_MATCH_IPRANGE tristate '"iprange" address range match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED ---help--- This option adds a "iprange" match, which allows you to match based on @@ -646,7 +620,6 @@ config NETFILTER_XT_MATCH_IPRANGE config NETFILTER_XT_MATCH_LENGTH tristate '"length" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option allows you to match the length of a packet against a @@ -656,7 +629,6 @@ config NETFILTER_XT_MATCH_LENGTH config NETFILTER_XT_MATCH_LIMIT tristate '"limit" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help limit matching allows you to control the rate at which a rule can be @@ -667,7 +639,6 @@ config NETFILTER_XT_MATCH_LIMIT config NETFILTER_XT_MATCH_MAC tristate '"mac" address match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help MAC matching allows you to match packets based on the source @@ -677,7 +648,6 @@ config NETFILTER_XT_MATCH_MAC config NETFILTER_XT_MATCH_MARK tristate '"mark" match support' - depends on NETFILTER_XTABLES default m if NETFILTER_ADVANCED=n help Netfilter mark matching allows you to match packets based on the @@ -688,7 +658,6 @@ config NETFILTER_XT_MATCH_MARK config NETFILTER_XT_MATCH_MULTIPORT tristate '"multiport" Multiple port match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help Multiport matching allows you to match TCP or UDP packets based on @@ -699,7 +668,6 @@ config NETFILTER_XT_MATCH_MULTIPORT config NETFILTER_XT_MATCH_OWNER tristate '"owner" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED ---help--- Socket owner matching allows you to match locally-generated packets @@ -708,7 +676,7 @@ config NETFILTER_XT_MATCH_OWNER config NETFILTER_XT_MATCH_POLICY tristate 'IPsec "policy" match support' - depends on NETFILTER_XTABLES && XFRM + depends on XFRM default m if NETFILTER_ADVANCED=n help Policy matching allows you to match packets based on the @@ -719,7 +687,7 @@ config NETFILTER_XT_MATCH_POLICY config NETFILTER_XT_MATCH_PHYSDEV tristate '"physdev" match support' - depends on NETFILTER_XTABLES && BRIDGE && BRIDGE_NETFILTER + depends on BRIDGE && BRIDGE_NETFILTER depends on NETFILTER_ADVANCED help Physdev packet matching matches against the physical bridge ports @@ -729,7 +697,6 @@ config NETFILTER_XT_MATCH_PHYSDEV config NETFILTER_XT_MATCH_PKTTYPE tristate '"pkttype" packet type match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help Packet type matching allows you to match a packet by @@ -742,7 +709,6 @@ config NETFILTER_XT_MATCH_PKTTYPE config NETFILTER_XT_MATCH_QUOTA tristate '"quota" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `quota' match, which allows to match on a @@ -753,7 +719,6 @@ config NETFILTER_XT_MATCH_QUOTA config NETFILTER_XT_MATCH_RATEEST tristate '"rateest" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED select NETFILTER_XT_TARGET_RATEEST help @@ -764,7 +729,6 @@ config NETFILTER_XT_MATCH_RATEEST config NETFILTER_XT_MATCH_REALM tristate '"realm" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED select NET_CLS_ROUTE help @@ -779,7 +743,6 @@ config NETFILTER_XT_MATCH_REALM config NETFILTER_XT_MATCH_RECENT tristate '"recent" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED ---help--- This match is used for creating one or many lists of recently @@ -797,7 +760,7 @@ config NETFILTER_XT_MATCH_RECENT_PROC_COMPAT config NETFILTER_XT_MATCH_SCTP tristate '"sctp" protocol match support (EXPERIMENTAL)' - depends on NETFILTER_XTABLES && EXPERIMENTAL + depends on EXPERIMENTAL depends on NETFILTER_ADVANCED default IP_SCTP help @@ -825,7 +788,6 @@ config NETFILTER_XT_MATCH_SOCKET config NETFILTER_XT_MATCH_STATE tristate '"state" match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n help @@ -837,7 +799,6 @@ config NETFILTER_XT_MATCH_STATE config NETFILTER_XT_MATCH_STATISTIC tristate '"statistic" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `statistic' match, which allows you to match @@ -847,7 +808,6 @@ config NETFILTER_XT_MATCH_STATISTIC config NETFILTER_XT_MATCH_STRING tristate '"string" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED select TEXTSEARCH select TEXTSEARCH_KMP @@ -861,7 +821,6 @@ config NETFILTER_XT_MATCH_STRING config NETFILTER_XT_MATCH_TCPMSS tristate '"tcpmss" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `tcpmss' match, which allows you to examine the @@ -872,7 +831,6 @@ config NETFILTER_XT_MATCH_TCPMSS config NETFILTER_XT_MATCH_TIME tristate '"time" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED ---help--- This option adds a "time" match, which allows you to match based on @@ -887,7 +845,6 @@ config NETFILTER_XT_MATCH_TIME config NETFILTER_XT_MATCH_U32 tristate '"u32" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED ---help--- u32 allows you to extract quantities of up to 4 bytes from a packet, @@ -899,5 +856,6 @@ config NETFILTER_XT_MATCH_U32 Details and examples are in the kernel module source. -endmenu +endif # NETFILTER_XTABLES +endmenu -- cgit v1.2.3-70-g09d2 From f7108a20dee44e5bb037f9e48f6a207b42e6ae1c Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:18 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (1/6) The function signatures for Xtables extensions have grown over time. It involves a lot of typing/replication, and also a bit of stack space even if they are not used. Realize an NFWS2008 idea and pack them into structs. The skb remains outside of the struct so gcc can continue to apply its optimizations. This patch does this for match extensions' match functions. A few ambiguities have also been addressed. The "offset" parameter for example has been renamed to "fragoff" (there are so many different offsets already) and "protoff" to "thoff" (there is more than just one protocol here, so clarify). Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 28 ++++++++++++++++------ net/bridge/netfilter/ebt_802_3.c | 6 ++--- net/bridge/netfilter/ebt_among.c | 6 ++--- net/bridge/netfilter/ebt_arp.c | 6 ++--- net/bridge/netfilter/ebt_ip.c | 6 ++--- net/bridge/netfilter/ebt_ip6.c | 6 ++--- net/bridge/netfilter/ebt_limit.c | 6 ++--- net/bridge/netfilter/ebt_mark_m.c | 6 ++--- net/bridge/netfilter/ebt_pkttype.c | 7 ++---- net/bridge/netfilter/ebt_stp.c | 6 ++--- net/bridge/netfilter/ebt_vlan.c | 6 ++--- net/bridge/netfilter/ebtables.c | 16 ++++++++----- net/ipv4/netfilter/ip_tables.c | 46 ++++++++++++++++-------------------- net/ipv4/netfilter/ipt_addrtype.c | 18 +++++--------- net/ipv4/netfilter/ipt_ah.c | 14 ++++------- net/ipv4/netfilter/ipt_ecn.c | 9 +++---- net/ipv4/netfilter/ipt_ttl.c | 7 ++---- net/ipv6/netfilter/ip6_tables.c | 44 +++++++++++++--------------------- net/ipv6/netfilter/ip6t_ah.c | 11 ++++----- net/ipv6/netfilter/ip6t_eui64.c | 9 +++---- net/ipv6/netfilter/ip6t_frag.c | 11 ++++----- net/ipv6/netfilter/ip6t_hbh.c | 13 ++++------ net/ipv6/netfilter/ip6t_hl.c | 7 ++---- net/ipv6/netfilter/ip6t_ipv6header.c | 7 ++---- net/ipv6/netfilter/ip6t_mh.c | 15 +++++------- net/ipv6/netfilter/ip6t_rt.c | 11 ++++----- net/netfilter/xt_comment.c | 5 +--- net/netfilter/xt_connbytes.c | 7 ++---- net/netfilter/xt_connlimit.c | 17 ++++++------- net/netfilter/xt_connmark.c | 14 ++++------- net/netfilter/xt_conntrack.c | 22 +++++++---------- net/netfilter/xt_dccp.c | 16 ++++++------- net/netfilter/xt_dscp.c | 30 ++++++++--------------- net/netfilter/xt_esp.c | 13 ++++------ net/netfilter/xt_hashlimit.c | 22 +++++++---------- net/netfilter/xt_helper.c | 7 ++---- net/netfilter/xt_iprange.c | 21 +++++----------- net/netfilter/xt_length.c | 14 ++++------- net/netfilter/xt_limit.c | 7 ++---- net/netfilter/xt_mac.c | 7 ++---- net/netfilter/xt_mark.c | 13 ++++------ net/netfilter/xt_multiport.c | 26 ++++++++------------ net/netfilter/xt_owner.c | 21 +++++----------- net/netfilter/xt_physdev.c | 7 ++---- net/netfilter/xt_pkttype.c | 11 ++++----- net/netfilter/xt_policy.c | 11 ++++----- net/netfilter/xt_quota.c | 7 ++---- net/netfilter/xt_rateest.c | 12 +++------- net/netfilter/xt_realm.c | 7 ++---- net/netfilter/xt_recent.c | 17 ++++++------- net/netfilter/xt_sctp.c | 16 ++++++------- net/netfilter/xt_socket.c | 11 ++------- net/netfilter/xt_state.c | 7 ++---- net/netfilter/xt_statistic.c | 7 ++---- net/netfilter/xt_string.c | 9 +++---- net/netfilter/xt_tcpmss.c | 13 ++++------ net/netfilter/xt_tcpudp.c | 36 ++++++++++++---------------- net/netfilter/xt_time.c | 6 ++--- net/netfilter/xt_u32.c | 7 ++---- 59 files changed, 286 insertions(+), 487 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 85aa42785a5..bcd40ec8325 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -173,6 +173,26 @@ struct xt_counters_info #include +/** + * struct xt_match_param - parameters for match extensions' match functions + * + * @in: input netdevice + * @out: output netdevice + * @match: struct xt_match through which this function was invoked + * @matchinfo: per-match data + * @fragoff: packet is a fragment, this is the data offset + * @thoff: position of transport header relative to skb->data + * @hotdrop: drop packet if we had inspection problems + */ +struct xt_match_param { + const struct net_device *in, *out; + const struct xt_match *match; + const void *matchinfo; + int fragoff; + unsigned int thoff; + bool *hotdrop; +}; + struct xt_match { struct list_head list; @@ -185,13 +205,7 @@ struct xt_match non-linear skb, using skb_header_pointer and skb_ip_make_writable. */ bool (*match)(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop); + const struct xt_match_param *); /* Called when user tries to insert an entry of this type. */ /* Should return true or false. */ diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c index 6fc2a59e09a..c9e1bc14951 100644 --- a/net/bridge/netfilter/ebt_802_3.c +++ b/net/bridge/netfilter/ebt_802_3.c @@ -13,11 +13,9 @@ #include static bool -ebt_802_3_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *data, int offset, unsigned int protoff, bool *hotdrop) +ebt_802_3_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ebt_802_3_info *info = data; + const struct ebt_802_3_info *info = par->matchinfo; const struct ebt_802_3_hdr *hdr = ebt_802_3_hdr(skb); __be16 type = hdr->llc.ui.ctrl & IS_UI ? hdr->llc.ui.type : hdr->llc.ni.type; diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c index 084559e1840..0ad0db3e815 100644 --- a/net/bridge/netfilter/ebt_among.c +++ b/net/bridge/netfilter/ebt_among.c @@ -128,11 +128,9 @@ static int get_ip_src(const struct sk_buff *skb, __be32 *addr) } static bool -ebt_among_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *data, int offset, unsigned int protoff, bool *hotdrop) +ebt_among_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ebt_among_info *info = data; + const struct ebt_among_info *info = par->matchinfo; const char *dmac, *smac; const struct ebt_mac_wormhash *wh_dst, *wh_src; __be32 dip = 0, sip = 0; diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c index a073dffe7a1..1ff8fa3a9e7 100644 --- a/net/bridge/netfilter/ebt_arp.c +++ b/net/bridge/netfilter/ebt_arp.c @@ -16,11 +16,9 @@ #include static bool -ebt_arp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *data, int offset, unsigned int protoff, bool *hotdrop) +ebt_arp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ebt_arp_info *info = data; + const struct ebt_arp_info *info = par->matchinfo; const struct arphdr *ah; struct arphdr _arph; diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index b42c7ce799b..c70ea39840b 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -25,11 +25,9 @@ struct tcpudphdr { }; static bool -ebt_ip_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *data, int offset, unsigned int protoff, bool *hotdrop) +ebt_ip_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ebt_ip_info *info = data; + const struct ebt_ip_info *info = par->matchinfo; const struct iphdr *ih; struct iphdr _iph; const struct tcpudphdr *pptr; diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c index 7bd98312967..5acee02de72 100644 --- a/net/bridge/netfilter/ebt_ip6.c +++ b/net/bridge/netfilter/ebt_ip6.c @@ -28,11 +28,9 @@ struct tcpudphdr { }; static bool -ebt_ip6_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *data, int offset, unsigned int protoff, bool *hotdrop) +ebt_ip6_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ebt_ip6_info *info = data; + const struct ebt_ip6_info *info = par->matchinfo; const struct ipv6hdr *ih6; struct ipv6hdr _ip6h; const struct tcpudphdr *pptr; diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c index 58aaaa14906..9a3ec8cadaa 100644 --- a/net/bridge/netfilter/ebt_limit.c +++ b/net/bridge/netfilter/ebt_limit.c @@ -31,11 +31,9 @@ static DEFINE_SPINLOCK(limit_lock); #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) static bool -ebt_limit_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *data, int offset, unsigned int protoff, bool *hotdrop) +ebt_limit_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - struct ebt_limit_info *info = (void *)data; + struct ebt_limit_info *info = (void *)par->matchinfo; unsigned long now = jiffies; spin_lock_bh(&limit_lock); diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c index aa6781c7f98..5b22ef96127 100644 --- a/net/bridge/netfilter/ebt_mark_m.c +++ b/net/bridge/netfilter/ebt_mark_m.c @@ -13,11 +13,9 @@ #include static bool -ebt_mark_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *data, int offset, unsigned int protoff, bool *hotdrop) +ebt_mark_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ebt_mark_m_info *info = data; + const struct ebt_mark_m_info *info = par->matchinfo; if (info->bitmask & EBT_MARK_OR) return !!(skb->mark & info->mask) ^ info->invert; diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c index 1c04ce5a52c..b756f88fb10 100644 --- a/net/bridge/netfilter/ebt_pkttype.c +++ b/net/bridge/netfilter/ebt_pkttype.c @@ -13,12 +13,9 @@ #include static bool -ebt_pkttype_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *data, int offset, unsigned int protoff, - bool *hotdrop) +ebt_pkttype_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ebt_pkttype_info *info = data; + const struct ebt_pkttype_info *info = par->matchinfo; return (skb->pkt_type == info->pkt_type) ^ info->invert; } diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 28bb48b67a8..06d777c62c3 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -120,11 +120,9 @@ static bool ebt_filter_config(const struct ebt_stp_info *info, } static bool -ebt_stp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *data, int offset, unsigned int protoff, bool *hotdrop) +ebt_stp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ebt_stp_info *info = data; + const struct ebt_stp_info *info = par->matchinfo; const struct stp_header *sp; struct stp_header _stph; const uint8_t header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00}; diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c index 5addef6d62f..b05b4a81834 100644 --- a/net/bridge/netfilter/ebt_vlan.c +++ b/net/bridge/netfilter/ebt_vlan.c @@ -41,11 +41,9 @@ MODULE_LICENSE("GPL"); #define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return false; } static bool -ebt_vlan_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *data, int offset, unsigned int protoff, bool *hotdrop) +ebt_vlan_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ebt_vlan_info *info = data; + const struct ebt_vlan_info *info = par->matchinfo; const struct vlan_hdr *fp; struct vlan_hdr _frame; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 7ee72b71d3c..f8e1822f38d 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -74,11 +74,11 @@ static inline int ebt_do_watcher (struct ebt_entry_watcher *w, } static inline int ebt_do_match (struct ebt_entry_match *m, - const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, bool *hotdrop) + const struct sk_buff *skb, struct xt_match_param *par) { - return m->u.match->match(skb, in, out, m->u.match, - m->data, 0, 0, hotdrop); + par->match = m->u.match; + par->matchinfo = m->data; + return m->u.match->match(skb, par); } static inline int ebt_dev_check(char *entry, const struct net_device *device) @@ -155,6 +155,11 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, char *base; struct ebt_table_info *private; bool hotdrop = false; + struct xt_match_param mtpar; + + mtpar.in = in; + mtpar.out = out; + mtpar.hotdrop = &hotdrop; read_lock_bh(&table->lock); private = table->private; @@ -175,8 +180,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, if (ebt_basic_match(point, eth_hdr(skb), in, out)) goto letscontinue; - if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, - in, out, &hotdrop) != 0) + if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &mtpar) != 0) goto letscontinue; if (hotdrop) { read_unlock_bh(&table->lock); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b4c74a7a807..99fdb59454f 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -186,16 +186,14 @@ ipt_error(struct sk_buff *skb, /* Performance critical - called for every packet */ static inline bool -do_match(struct ipt_entry_match *m, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int offset, - bool *hotdrop) +do_match(struct ipt_entry_match *m, const struct sk_buff *skb, + struct xt_match_param *par) { + par->match = m->u.kernel.match; + par->matchinfo = m->data; + /* Stop iteration if it doesn't match */ - if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data, - offset, ip_hdrlen(skb), hotdrop)) + if (!m->u.kernel.match->match(skb, par)) return true; else return false; @@ -326,7 +324,6 @@ ipt_do_table(struct sk_buff *skb, struct xt_table *table) { static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); - u_int16_t offset; const struct iphdr *ip; u_int16_t datalen; bool hotdrop = false; @@ -336,6 +333,7 @@ ipt_do_table(struct sk_buff *skb, void *table_base; struct ipt_entry *e, *back; struct xt_table_info *private; + struct xt_match_param mtpar; /* Initialization */ ip = ip_hdr(skb); @@ -348,7 +346,11 @@ ipt_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ - offset = ntohs(ip->frag_off) & IP_OFFSET; + mtpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; + mtpar.thoff = ip_hdrlen(skb); + mtpar.hotdrop = &hotdrop; + mtpar.in = in; + mtpar.out = out; read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); @@ -362,12 +364,11 @@ ipt_do_table(struct sk_buff *skb, do { IP_NF_ASSERT(e); IP_NF_ASSERT(back); - if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { + if (ip_packet_match(ip, indev, outdev, + &e->ip, mtpar.fragoff)) { struct ipt_entry_target *t; - if (IPT_MATCH_ITERATE(e, do_match, - skb, in, out, - offset, &hotdrop) != 0) + if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0) goto no_match; ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); @@ -2116,30 +2117,23 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, } static bool -icmp_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +icmp_match(const struct sk_buff *skb, const struct xt_match_param *par) { const struct icmphdr *ic; struct icmphdr _icmph; - const struct ipt_icmp *icmpinfo = matchinfo; + const struct ipt_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph); + ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (ic == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("Dropping evil ICMP tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index 2c9d88a6c83..e60995e4c20 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -30,12 +30,9 @@ static inline bool match_type(const struct net_device *dev, __be32 addr, } static bool -addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_addrtype_info *info = matchinfo; + const struct ipt_addrtype_info *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); bool ret = true; @@ -50,20 +47,17 @@ addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in, } static bool -addrtype_mt_v1(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_addrtype_info_v1 *info = matchinfo; + const struct ipt_addrtype_info_v1 *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); const struct net_device *dev = NULL; bool ret = true; if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) - dev = in; + dev = par->in; else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) - dev = out; + dev = par->out; if (info->source) ret &= match_type(dev, iph->saddr, info->source) ^ diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index e2e993edd66..2fce19ef4f3 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -36,27 +36,23 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) return r; } -static bool -ah_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par) { struct ip_auth_hdr _ahdr; const struct ip_auth_hdr *ah; - const struct ipt_ah *ahinfo = matchinfo; + const struct ipt_ah *ahinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - ah = skb_header_pointer(skb, protoff, - sizeof(_ahdr), &_ahdr); + ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr); if (ah == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("Dropping evil AH tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return 0; } diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index 2c45b4be7c3..06915463150 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -67,12 +67,9 @@ static inline bool match_tcp(const struct sk_buff *skb, return true; } -static bool -ecn_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_ecn_info *info = matchinfo; + const struct ipt_ecn_info *info = par->matchinfo; if (info->operation & IPT_ECN_OP_MATCH_IP) if (!match_ip(skb, info)) @@ -81,7 +78,7 @@ ecn_mt(const struct sk_buff *skb, const struct net_device *in, if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { if (ip_hdr(skb)->protocol != IPPROTO_TCP) return false; - if (!match_tcp(skb, info, hotdrop)) + if (!match_tcp(skb, info, par->hotdrop)) return false; } diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c index d4c3fdc2a79..297f1cbf4ff 100644 --- a/net/ipv4/netfilter/ipt_ttl.c +++ b/net/ipv4/netfilter/ipt_ttl.c @@ -18,12 +18,9 @@ MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("Xtables: IPv4 TTL field match"); MODULE_LICENSE("GPL"); -static bool -ttl_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool ttl_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_ttl_info *info = matchinfo; + const struct ipt_ttl_info *info = par->matchinfo; const u8 ttl = ip_hdr(skb)->ttl; switch (info->mode) { diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 12c41b8d365..cf2c5370a4e 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -215,17 +215,14 @@ ip6t_error(struct sk_buff *skb, /* Performance critical - called for every packet */ static inline bool -do_match(struct ip6t_entry_match *m, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int offset, - unsigned int protoff, - bool *hotdrop) +do_match(struct ip6t_entry_match *m, const struct sk_buff *skb, + struct xt_match_param *par) { + par->match = m->u.kernel.match; + par->matchinfo = m->data; + /* Stop iteration if it doesn't match */ - if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data, - offset, protoff, hotdrop)) + if (!m->u.kernel.match->match(skb, par)) return true; else return false; @@ -355,8 +352,6 @@ ip6t_do_table(struct sk_buff *skb, struct xt_table *table) { static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); - int offset = 0; - unsigned int protoff = 0; bool hotdrop = false; /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; @@ -364,6 +359,7 @@ ip6t_do_table(struct sk_buff *skb, void *table_base; struct ip6t_entry *e, *back; struct xt_table_info *private; + struct xt_match_param mtpar; /* Initialization */ indev = in ? in->name : nulldevname; @@ -374,6 +370,9 @@ ip6t_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ + mtpar.hotdrop = &hotdrop; + mtpar.in = in; + mtpar.out = out; read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); @@ -388,12 +387,10 @@ ip6t_do_table(struct sk_buff *skb, IP_NF_ASSERT(e); IP_NF_ASSERT(back); if (ip6_packet_match(skb, indev, outdev, &e->ipv6, - &protoff, &offset, &hotdrop)) { + &mtpar.thoff, &mtpar.fragoff, &hotdrop)) { struct ip6t_entry_target *t; - if (IP6T_MATCH_ITERATE(e, do_match, - skb, in, out, - offset, protoff, &hotdrop) != 0) + if (IP6T_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0) goto no_match; ADD_COUNTER(e->counters, @@ -2141,30 +2138,23 @@ icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, } static bool -icmp6_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +icmp6_match(const struct sk_buff *skb, const struct xt_match_param *par) { const struct icmp6hdr *ic; struct icmp6hdr _icmph; - const struct ip6t_icmp *icmpinfo = matchinfo; + const struct ip6t_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph); + ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (ic == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("Dropping evil ICMP tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index 061f89beeb6..a04f2b8396e 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -36,14 +36,11 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) return r; } -static bool -ah_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool ah_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { struct ip_auth_hdr _ah; const struct ip_auth_hdr *ah; - const struct ip6t_ah *ahinfo = matchinfo; + const struct ip6t_ah *ahinfo = par->matchinfo; unsigned int ptr; unsigned int hdrlen = 0; int err; @@ -51,13 +48,13 @@ ah_mt6(const struct sk_buff *skb, const struct net_device *in, err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL); if (err < 0) { if (err != -ENOENT) - *hotdrop = true; + *par->hotdrop = true; return false; } ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah); if (ah == NULL) { - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c index ba38df1116f..db610bacbcc 100644 --- a/net/ipv6/netfilter/ip6t_eui64.c +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -20,18 +20,15 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Andras Kis-Szabo "); static bool -eui64_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +eui64_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { unsigned char eui64[8]; int i = 0; if (!(skb_mac_header(skb) >= skb->head && skb_mac_header(skb) + ETH_HLEN <= skb->data) && - offset != 0) { - *hotdrop = true; + par->fragoff != 0) { + *par->hotdrop = true; return false; } diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c index 972f699af22..6951d0dacf4 100644 --- a/net/ipv6/netfilter/ip6t_frag.c +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -35,27 +35,24 @@ id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) } static bool -frag_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +frag_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { struct frag_hdr _frag; const struct frag_hdr *fh; - const struct ip6t_frag *fraginfo = matchinfo; + const struct ip6t_frag *fraginfo = par->matchinfo; unsigned int ptr; int err; err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL); if (err < 0) { if (err != -ENOENT) - *hotdrop = true; + *par->hotdrop = true; return false; } fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag); if (fh == NULL) { - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c index d5edb51a595..d3351978819 100644 --- a/net/ipv6/netfilter/ip6t_hbh.c +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -42,14 +42,11 @@ MODULE_ALIAS("ip6t_dst"); */ static bool -hbh_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +hbh_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { struct ipv6_opt_hdr _optsh; const struct ipv6_opt_hdr *oh; - const struct ip6t_opts *optinfo = matchinfo; + const struct ip6t_opts *optinfo = par->matchinfo; unsigned int temp; unsigned int ptr; unsigned int hdrlen = 0; @@ -61,16 +58,16 @@ hbh_mt6(const struct sk_buff *skb, const struct net_device *in, unsigned int optlen; int err; - err = ipv6_find_hdr(skb, &ptr, match->data, NULL); + err = ipv6_find_hdr(skb, &ptr, par->match->data, NULL); if (err < 0) { if (err != -ENOENT) - *hotdrop = true; + *par->hotdrop = true; return false; } oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); if (oh == NULL) { - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c index 25c1eb92fac..c964dca1132 100644 --- a/net/ipv6/netfilter/ip6t_hl.c +++ b/net/ipv6/netfilter/ip6t_hl.c @@ -19,12 +19,9 @@ MODULE_AUTHOR("Maciej Soltysiak "); MODULE_DESCRIPTION("Xtables: IPv6 Hop Limit field match"); MODULE_LICENSE("GPL"); -static bool -hl_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool hl_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ip6t_hl_info *info = matchinfo; + const struct ip6t_hl_info *info = par->matchinfo; const struct ipv6hdr *ip6h = ipv6_hdr(skb); switch (info->mode) { diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index ef0661aacea..6aaca511d47 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -27,12 +27,9 @@ MODULE_DESCRIPTION("Xtables: IPv6 header types match"); MODULE_AUTHOR("Andras Kis-Szabo "); static bool -ipv6header_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +ipv6header_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ip6t_ipv6header_info *info = matchinfo; + const struct ip6t_ipv6header_info *info = par->matchinfo; unsigned int temp; int len; u8 nexthdr; diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c index dd876274ff6..2803258b6d0 100644 --- a/net/ipv6/netfilter/ip6t_mh.c +++ b/net/ipv6/netfilter/ip6t_mh.c @@ -37,32 +37,29 @@ type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert) return (type >= min && type <= max) ^ invert; } -static bool -mh_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool mh_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { struct ip6_mh _mh; const struct ip6_mh *mh; - const struct ip6t_mh *mhinfo = matchinfo; + const struct ip6t_mh *mhinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - mh = skb_header_pointer(skb, protoff, sizeof(_mh), &_mh); + mh = skb_header_pointer(skb, par->thoff, sizeof(_mh), &_mh); if (mh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ duprintf("Dropping evil MH tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } if (mh->ip6mh_proto != IPPROTO_NONE) { duprintf("Dropping invalid MH Payload Proto: %u\n", mh->ip6mh_proto); - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index 7c544ae591d..9cf4b8a37af 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -36,14 +36,11 @@ segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) return r; } -static bool -rt_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool rt_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { struct ipv6_rt_hdr _route; const struct ipv6_rt_hdr *rh; - const struct ip6t_rt *rtinfo = matchinfo; + const struct ip6t_rt *rtinfo = par->matchinfo; unsigned int temp; unsigned int ptr; unsigned int hdrlen = 0; @@ -55,13 +52,13 @@ rt_mt6(const struct sk_buff *skb, const struct net_device *in, err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL); if (err < 0) { if (err != -ENOENT) - *hotdrop = true; + *par->hotdrop = true; return false; } rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route); if (rh == NULL) { - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c index fa211b2ab87..bd7aa57af42 100644 --- a/net/netfilter/xt_comment.c +++ b/net/netfilter/xt_comment.c @@ -16,10 +16,7 @@ MODULE_ALIAS("ipt_comment"); MODULE_ALIAS("ip6t_comment"); static bool -comment_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protooff, - bool *hotdrop) +comment_mt(const struct sk_buff *skb, const struct xt_match_param *par) { /* We always match */ return true; diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index d2cd22a49c9..30c19b5fe90 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -17,12 +17,9 @@ MODULE_ALIAS("ipt_connbytes"); MODULE_ALIAS("ip6t_connbytes"); static bool -connbytes_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +connbytes_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_connbytes_info *sinfo = matchinfo; + const struct xt_connbytes_info *sinfo = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int64_t what = 0; /* initialize to make gcc happy */ diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index bd00830ff69..8b8f70e7664 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -178,12 +178,9 @@ static int count_them(struct xt_connlimit_data *data, } static bool -connlimit_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +connlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_connlimit_info *info = matchinfo; + const struct xt_connlimit_info *info = par->matchinfo; union nf_inet_addr addr; struct nf_conntrack_tuple tuple; const struct nf_conntrack_tuple *tuple_ptr = &tuple; @@ -195,10 +192,10 @@ connlimit_mt(const struct sk_buff *skb, const struct net_device *in, if (ct != NULL) tuple_ptr = &ct->tuplehash[0].tuple; else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - match->family, &tuple)) + par->match->family, &tuple)) goto hotdrop; - if (match->family == NFPROTO_IPV6) { + if (par->match->family == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr)); } else { @@ -208,19 +205,19 @@ connlimit_mt(const struct sk_buff *skb, const struct net_device *in, spin_lock_bh(&info->data->lock); connections = count_them(info->data, tuple_ptr, &addr, - &info->mask, match); + &info->mask, par->match); spin_unlock_bh(&info->data->lock); if (connections < 0) { /* kmalloc failed, drop it entirely */ - *hotdrop = true; + *par->hotdrop = true; return false; } return (connections > info->limit) ^ info->inverse; hotdrop: - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 0577b8ff4e1..df4f4a865a5 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -34,12 +34,9 @@ MODULE_ALIAS("ipt_connmark"); MODULE_ALIAS("ip6t_connmark"); static bool -connmark_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +connmark_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_connmark_mtinfo1 *info = matchinfo; + const struct xt_connmark_mtinfo1 *info = par->matchinfo; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; @@ -51,12 +48,9 @@ connmark_mt(const struct sk_buff *skb, const struct net_device *in, } static bool -connmark_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +connmark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_connmark_info *info = matchinfo; + const struct xt_connmark_info *info = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 392b457f9c2..13a7e4eacdf 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -25,12 +25,9 @@ MODULE_ALIAS("ipt_conntrack"); MODULE_ALIAS("ip6t_conntrack"); static bool -conntrack_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +conntrack_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_conntrack_info *sinfo = matchinfo; + const struct xt_conntrack_info *sinfo = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int statebit; @@ -205,12 +202,9 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo1 *info, } static bool -conntrack_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_conntrack_mtinfo1 *info = matchinfo; + const struct xt_conntrack_mtinfo1 *info = par->matchinfo; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int statebit; @@ -244,22 +238,22 @@ conntrack_mt(const struct sk_buff *skb, const struct net_device *in, return false; if (info->match_flags & XT_CONNTRACK_ORIGSRC) - if (conntrack_mt_origsrc(ct, info, match->family) ^ + if (conntrack_mt_origsrc(ct, info, par->match->family) ^ !(info->invert_flags & XT_CONNTRACK_ORIGSRC)) return false; if (info->match_flags & XT_CONNTRACK_ORIGDST) - if (conntrack_mt_origdst(ct, info, match->family) ^ + if (conntrack_mt_origdst(ct, info, par->match->family) ^ !(info->invert_flags & XT_CONNTRACK_ORIGDST)) return false; if (info->match_flags & XT_CONNTRACK_REPLSRC) - if (conntrack_mt_replsrc(ct, info, match->family) ^ + if (conntrack_mt_replsrc(ct, info, par->match->family) ^ !(info->invert_flags & XT_CONNTRACK_REPLSRC)) return false; if (info->match_flags & XT_CONNTRACK_REPLDST) - if (conntrack_mt_repldst(ct, info, match->family) ^ + if (conntrack_mt_repldst(ct, info, par->match->family) ^ !(info->invert_flags & XT_CONNTRACK_REPLDST)) return false; diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index 87971f47132..7aa30bb9105 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -93,20 +93,18 @@ match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, } static bool -dccp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +dccp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_dccp_info *info = matchinfo; + const struct xt_dccp_info *info = par->matchinfo; const struct dccp_hdr *dh; struct dccp_hdr _dh; - if (offset) + if (par->fragoff != 0) return false; - dh = skb_header_pointer(skb, protoff, sizeof(_dh), &_dh); + dh = skb_header_pointer(skb, par->thoff, sizeof(_dh), &_dh); if (dh == NULL) { - *hotdrop = true; + *par->hotdrop = true; return false; } @@ -118,8 +116,8 @@ dccp_mt(const struct sk_buff *skb, const struct net_device *in, XT_DCCP_DEST_PORTS, info->flags, info->invflags) && DCCHECK(match_types(dh, info->typemask), XT_DCCP_TYPE, info->flags, info->invflags) - && DCCHECK(match_option(info->option, skb, protoff, dh, - hotdrop), + && DCCHECK(match_option(info->option, skb, par->thoff, dh, + par->hotdrop), XT_DCCP_OPTION, info->flags, info->invflags); } diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c index 7f03aa13a95..57d61206135 100644 --- a/net/netfilter/xt_dscp.c +++ b/net/netfilter/xt_dscp.c @@ -26,23 +26,18 @@ MODULE_ALIAS("ipt_tos"); MODULE_ALIAS("ip6t_tos"); static bool -dscp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +dscp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_dscp_info *info = matchinfo; + const struct xt_dscp_info *info = par->matchinfo; u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; return (dscp == info->dscp) ^ !!info->invert; } static bool -dscp_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +dscp_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_dscp_info *info = matchinfo; + const struct xt_dscp_info *info = par->matchinfo; u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; return (dscp == info->dscp) ^ !!info->invert; @@ -63,24 +58,19 @@ dscp_mt_check(const char *tablename, const void *info, return true; } -static bool tos_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +static bool +tos_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_tos_info *info = matchinfo; + const struct ipt_tos_info *info = par->matchinfo; return (ip_hdr(skb)->tos == info->tos) ^ info->invert; } -static bool tos_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +static bool tos_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_tos_match_info *info = matchinfo; + const struct xt_tos_match_info *info = par->matchinfo; - if (match->family == NFPROTO_IPV4) + if (par->match->family == NFPROTO_IPV4) return ((ip_hdr(skb)->tos & info->tos_mask) == info->tos_value) ^ !!info->invert; else diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c index 045c4deecaf..6d59f2e7c1c 100644 --- a/net/netfilter/xt_esp.c +++ b/net/netfilter/xt_esp.c @@ -42,26 +42,23 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) return r; } -static bool -esp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool esp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const struct ip_esp_hdr *eh; struct ip_esp_hdr _esp; - const struct xt_esp *espinfo = matchinfo; + const struct xt_esp *espinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - eh = skb_header_pointer(skb, protoff, sizeof(_esp), &_esp); + eh = skb_header_pointer(skb, par->thoff, sizeof(_esp), &_esp); if (eh == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("Dropping evil ESP tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 7bae369603d..22a60a728cf 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -563,19 +563,16 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, } static bool -hashlimit_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +hashlimit_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { const struct xt_hashlimit_info *r = - ((const struct xt_hashlimit_info *)matchinfo)->u.master; + ((const struct xt_hashlimit_info *)par->matchinfo)->u.master; struct xt_hashlimit_htable *hinfo = r->hinfo; unsigned long now = jiffies; struct dsthash_ent *dh; struct dsthash_dst dst; - if (hashlimit_init_dst(hinfo, &dst, skb, protoff) < 0) + if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) goto hotdrop; spin_lock_bh(&hinfo->lock); @@ -613,23 +610,20 @@ hashlimit_mt_v0(const struct sk_buff *skb, const struct net_device *in, return false; hotdrop: - *hotdrop = true; + *par->hotdrop = true; return false; } static bool -hashlimit_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_hashlimit_mtinfo1 *info = matchinfo; + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; struct xt_hashlimit_htable *hinfo = info->hinfo; unsigned long now = jiffies; struct dsthash_ent *dh; struct dsthash_dst dst; - if (hashlimit_init_dst(hinfo, &dst, skb, protoff) < 0) + if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) goto hotdrop; spin_lock_bh(&hinfo->lock); @@ -666,7 +660,7 @@ hashlimit_mt(const struct sk_buff *skb, const struct net_device *in, return info->cfg.mode & XT_HASHLIMIT_INVERT; hotdrop: - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 134d94324eb..73bdc3ba13f 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -24,12 +24,9 @@ MODULE_ALIAS("ip6t_helper"); static bool -helper_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +helper_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_helper_info *info = matchinfo; + const struct xt_helper_info *info = par->matchinfo; const struct nf_conn *ct; const struct nf_conn_help *master_help; const struct nf_conntrack_helper *helper; diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c index a7498cc48dc..6f62c36948d 100644 --- a/net/netfilter/xt_iprange.c +++ b/net/netfilter/xt_iprange.c @@ -17,12 +17,9 @@ #include static bool -iprange_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +iprange_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_iprange_info *info = matchinfo; + const struct ipt_iprange_info *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); if (info->flags & IPRANGE_SRC) { @@ -55,12 +52,9 @@ iprange_mt_v0(const struct sk_buff *skb, const struct net_device *in, } static bool -iprange_mt4(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +iprange_mt4(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_iprange_mtinfo *info = matchinfo; + const struct xt_iprange_mtinfo *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); bool m; @@ -111,12 +105,9 @@ iprange_ipv6_sub(const struct in6_addr *a, const struct in6_addr *b) } static bool -iprange_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +iprange_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_iprange_mtinfo *info = matchinfo; + const struct xt_iprange_mtinfo *info = par->matchinfo; const struct ipv6hdr *iph = ipv6_hdr(skb); bool m; diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index b8612d1914b..c4871ca6c86 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -21,24 +21,18 @@ MODULE_ALIAS("ipt_length"); MODULE_ALIAS("ip6t_length"); static bool -length_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +length_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_length_info *info = matchinfo; + const struct xt_length_info *info = par->matchinfo; u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } static bool -length_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +length_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_length_info *info = matchinfo; + const struct xt_length_info *info = par->matchinfo; const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 00247bd1095..c475eac5dbe 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -58,13 +58,10 @@ static DEFINE_SPINLOCK(limit_lock); #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) static bool -limit_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +limit_mt(const struct sk_buff *skb, const struct xt_match_param *par) { struct xt_rateinfo *r = - ((const struct xt_rateinfo *)matchinfo)->master; + ((const struct xt_rateinfo *)par->matchinfo)->master; unsigned long now = jiffies; spin_lock_bh(&limit_lock); diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index 60db240098a..269f9d8aef5 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -24,12 +24,9 @@ MODULE_DESCRIPTION("Xtables: MAC address match"); MODULE_ALIAS("ipt_mac"); MODULE_ALIAS("ip6t_mac"); -static bool -mac_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool mac_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_mac_info *info = matchinfo; + const struct xt_mac_info *info = par->matchinfo; /* Is mac pointer valid? */ return skb_mac_header(skb) >= skb->head && diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 96dd2b63b6b..88547614653 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -23,22 +23,17 @@ MODULE_ALIAS("ipt_mark"); MODULE_ALIAS("ip6t_mark"); static bool -mark_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +mark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_mark_info *info = matchinfo; + const struct xt_mark_info *info = par->matchinfo; return ((skb->mark & info->mask) == info->mark) ^ info->invert; } static bool -mark_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +mark_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_mark_mtinfo1 *info = matchinfo; + const struct xt_mark_mtinfo1 *info = par->matchinfo; return ((skb->mark & info->mask) == info->mark) ^ info->invert; } diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index f6fe008ab8c..7087e291528 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -95,25 +95,22 @@ ports_match_v1(const struct xt_multiport_v1 *minfo, } static bool -multiport_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +multiport_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { const __be16 *pptr; __be16 _ports[2]; - const struct xt_multiport *multiinfo = matchinfo; + const struct xt_multiport *multiinfo = par->matchinfo; - if (offset) + if (par->fragoff != 0) return false; - pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports); + pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports); if (pptr == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } @@ -122,25 +119,22 @@ multiport_mt_v0(const struct sk_buff *skb, const struct net_device *in, } static bool -multiport_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +multiport_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const __be16 *pptr; __be16 _ports[2]; - const struct xt_multiport_v1 *multiinfo = matchinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; - if (offset) + if (par->fragoff != 0) return false; - pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports); + pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports); if (pptr == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index d1c3b7ae9b4..493b5eb8d14 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -21,12 +21,9 @@ #include static bool -owner_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +owner_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_owner_info *info = matchinfo; + const struct ipt_owner_info *info = par->matchinfo; const struct file *filp; if (skb->sk == NULL || skb->sk->sk_socket == NULL) @@ -50,12 +47,9 @@ owner_mt_v0(const struct sk_buff *skb, const struct net_device *in, } static bool -owner_mt6_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +owner_mt6_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ip6t_owner_info *info = matchinfo; + const struct ip6t_owner_info *info = par->matchinfo; const struct file *filp; if (skb->sk == NULL || skb->sk->sk_socket == NULL) @@ -79,12 +73,9 @@ owner_mt6_v0(const struct sk_buff *skb, const struct net_device *in, } static bool -owner_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +owner_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_owner_match_info *info = matchinfo; + const struct xt_owner_match_info *info = par->matchinfo; const struct file *filp; if (skb->sk == NULL || skb->sk->sk_socket == NULL) diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 72a0bdd53fa..e980e179d4f 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -21,14 +21,11 @@ MODULE_ALIAS("ipt_physdev"); MODULE_ALIAS("ip6t_physdev"); static bool -physdev_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +physdev_mt(const struct sk_buff *skb, const struct xt_match_param *par) { int i; static const char nulldevname[IFNAMSIZ]; - const struct xt_physdev_info *info = matchinfo; + const struct xt_physdev_info *info = par->matchinfo; bool ret; const char *indev, *outdev; const struct nf_bridge_info *nf_bridge; diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c index 81e86d319a8..37753a37760 100644 --- a/net/netfilter/xt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -23,20 +23,17 @@ MODULE_ALIAS("ipt_pkttype"); MODULE_ALIAS("ip6t_pkttype"); static bool -pkttype_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +pkttype_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_pkttype_info *info = matchinfo; + const struct xt_pkttype_info *info = par->matchinfo; u_int8_t type; if (skb->pkt_type != PACKET_LOOPBACK) type = skb->pkt_type; - else if (match->family == NFPROTO_IPV4 && + else if (par->match->family == NFPROTO_IPV4 && ipv4_is_multicast(ip_hdr(skb)->daddr)) type = PACKET_MULTICAST; - else if (match->family == NFPROTO_IPV6 && + else if (par->match->family == NFPROTO_IPV6 && ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) type = PACKET_MULTICAST; else diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index f1d514e9d0a..b0a00fb0511 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -110,18 +110,15 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, } static bool -policy_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +policy_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_policy_info *info = matchinfo; + const struct xt_policy_info *info = par->matchinfo; int ret; if (info->flags & XT_POLICY_MATCH_IN) - ret = match_policy_in(skb, info, match->family); + ret = match_policy_in(skb, info, par->match->family); else - ret = match_policy_out(skb, info, match->family); + ret = match_policy_out(skb, info, par->match->family); if (ret < 0) ret = info->flags & XT_POLICY_MATCH_NONE ? true : false; diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index a3c8798f0cc..3ab92666c14 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -18,13 +18,10 @@ MODULE_ALIAS("ip6t_quota"); static DEFINE_SPINLOCK(quota_lock); static bool -quota_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +quota_mt(const struct sk_buff *skb, const struct xt_match_param *par) { struct xt_quota_info *q = - ((const struct xt_quota_info *)matchinfo)->master; + ((const struct xt_quota_info *)par->matchinfo)->master; bool ret = q->flags & XT_QUOTA_INVERT; spin_lock_bh("a_lock); diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index 4dcfd7353db..e9f64ef4565 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -14,16 +14,10 @@ #include -static bool xt_rateest_mt(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +static bool +xt_rateest_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_rateest_match_info *info = matchinfo; + const struct xt_rateest_match_info *info = par->matchinfo; struct gnet_stats_rate_est *r; u_int32_t bps1, bps2, pps1, pps2; bool ret = true; diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c index ef65756d489..b25942110ed 100644 --- a/net/netfilter/xt_realm.c +++ b/net/netfilter/xt_realm.c @@ -22,12 +22,9 @@ MODULE_DESCRIPTION("Xtables: Routing realm match"); MODULE_ALIAS("ipt_realm"); static bool -realm_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +realm_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_realm_info *info = matchinfo; + const struct xt_realm_info *info = par->matchinfo; const struct dst_entry *dst = skb->dst; return (info->id == (dst->tclassid & info->mask)) ^ info->invert; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 4a916e2624d..baeb90a5623 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -204,19 +204,16 @@ static void recent_table_flush(struct recent_table *t) } static bool -recent_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +recent_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_recent_mtinfo *info = matchinfo; + const struct xt_recent_mtinfo *info = par->matchinfo; struct recent_table *t; struct recent_entry *e; union nf_inet_addr addr = {}; u_int8_t ttl; bool ret = info->invert; - if (match->family == NFPROTO_IPV4) { + if (par->match->family == NFPROTO_IPV4) { const struct iphdr *iph = ip_hdr(skb); if (info->side == XT_RECENT_DEST) @@ -237,19 +234,19 @@ recent_mt(const struct sk_buff *skb, const struct net_device *in, } /* use TTL as seen before forwarding */ - if (out && !skb->sk) + if (par->out != NULL && skb->sk == NULL) ttl++; spin_lock_bh(&recent_lock); t = recent_table_lookup(info->name); - e = recent_entry_lookup(t, &addr, match->family, + e = recent_entry_lookup(t, &addr, par->match->family, (info->check_set & XT_RECENT_TTL) ? ttl : 0); if (e == NULL) { if (!(info->check_set & XT_RECENT_SET)) goto out; - e = recent_entry_init(t, &addr, match->family, ttl); + e = recent_entry_init(t, &addr, par->match->family, ttl); if (e == NULL) - *hotdrop = true; + *par->hotdrop = true; ret = !ret; goto out; } diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index ab67aca4d8f..b0014ab65da 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -117,23 +117,21 @@ match_packet(const struct sk_buff *skb, } static bool -sctp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +sctp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_sctp_info *info = matchinfo; + const struct xt_sctp_info *info = par->matchinfo; const sctp_sctphdr_t *sh; sctp_sctphdr_t _sh; - if (offset) { + if (par->fragoff != 0) { duprintf("Dropping non-first fragment.. FIXME\n"); return false; } - sh = skb_header_pointer(skb, protoff, sizeof(_sh), &_sh); + sh = skb_header_pointer(skb, par->thoff, sizeof(_sh), &_sh); if (sh == NULL) { duprintf("Dropping evil TCP offset=0 tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest)); @@ -144,8 +142,8 @@ sctp_mt(const struct sk_buff *skb, const struct net_device *in, && SCCHECK(ntohs(sh->dest) >= info->dpts[0] && ntohs(sh->dest) <= info->dpts[1], XT_SCTP_DEST_PORTS, info->flags, info->invflags) - && SCCHECK(match_packet(skb, protoff + sizeof (sctp_sctphdr_t), - info, hotdrop), + && SCCHECK(match_packet(skb, par->thoff + sizeof(sctp_sctphdr_t), + info, par->hotdrop), XT_SCTP_CHUNK_TYPES, info->flags, info->invflags); } diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index ac9db17c7b9..02a8fed2108 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -86,14 +86,7 @@ extract_icmp_fields(const struct sk_buff *skb, static bool -socket_mt(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +socket_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const struct iphdr *iph = ip_hdr(skb); struct udphdr _hdr, *hp = NULL; @@ -146,7 +139,7 @@ socket_mt(const struct sk_buff *skb, #endif sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, - saddr, daddr, sport, dport, in, false); + saddr, daddr, sport, dport, par->in, false); if (sk != NULL) { bool wildcard = (inet_sk(sk)->rcv_saddr == 0); diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index f92f8bcc1e3..29f5a8a1b02 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -21,12 +21,9 @@ MODULE_ALIAS("ipt_state"); MODULE_ALIAS("ip6t_state"); static bool -state_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +state_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_state_info *sinfo = matchinfo; + const struct xt_state_info *sinfo = par->matchinfo; enum ip_conntrack_info ctinfo; unsigned int statebit; diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index f41a92322e6..dcadc491db2 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -25,12 +25,9 @@ MODULE_ALIAS("ip6t_statistic"); static DEFINE_SPINLOCK(nth_lock); static bool -statistic_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo; + struct xt_statistic_info *info = (void *)par->matchinfo; bool ret = info->flags & XT_STATISTIC_INVERT; switch (info->mode) { diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 18d8884e737..33f2d29ca4f 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -22,18 +22,15 @@ MODULE_ALIAS("ipt_string"); MODULE_ALIAS("ip6t_string"); static bool -string_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +string_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_string_info *conf = matchinfo; + const struct xt_string_info *conf = par->matchinfo; struct ts_state state; int invert; memset(&state, 0, sizeof(struct ts_state)); - invert = (match->revision == 0 ? conf->u.v0.invert : + invert = (par->match->revision == 0 ? conf->u.v0.invert : conf->u.v1.flags & XT_STRING_FLAG_INVERT); return (skb_find_text((struct sk_buff *)skb, conf->from_offset, diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 4791c7cbe5a..4809b34b10f 100644 --- a/net/netfilter/xt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -25,12 +25,9 @@ MODULE_ALIAS("ipt_tcpmss"); MODULE_ALIAS("ip6t_tcpmss"); static bool -tcpmss_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +tcpmss_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_tcpmss_match_info *info = matchinfo; + const struct xt_tcpmss_match_info *info = par->matchinfo; const struct tcphdr *th; struct tcphdr _tcph; /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ @@ -39,7 +36,7 @@ tcpmss_mt(const struct sk_buff *skb, const struct net_device *in, unsigned int i, optlen; /* If we don't have the whole header, drop packet. */ - th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) goto dropit; @@ -52,7 +49,7 @@ tcpmss_mt(const struct sk_buff *skb, const struct net_device *in, goto out; /* Truncated options. */ - op = skb_header_pointer(skb, protoff + sizeof(*th), optlen, _opt); + op = skb_header_pointer(skb, par->thoff + sizeof(*th), optlen, _opt); if (op == NULL) goto dropit; @@ -76,7 +73,7 @@ out: return info->invert; dropit: - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index 5a6268cbb9f..66cf71b1d59 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -68,25 +68,22 @@ tcp_find_option(u_int8_t option, return invert; } -static bool -tcp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool tcp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const struct tcphdr *th; struct tcphdr _tcph; - const struct xt_tcp *tcpinfo = matchinfo; + const struct xt_tcp *tcpinfo = par->matchinfo; - if (offset) { + if (par->fragoff != 0) { /* To quote Alan: Don't allow a fragment of TCP 8 bytes in. Nobody normal causes this. Its a cracker trying to break in by doing a flag overwrite to pass the direction checks. */ - if (offset == 1) { + if (par->fragoff == 1) { duprintf("Dropping evil TCP offset=1 frag.\n"); - *hotdrop = true; + *par->hotdrop = true; } /* Must not be a fragment. */ return false; @@ -94,12 +91,12 @@ tcp_mt(const struct sk_buff *skb, const struct net_device *in, #define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg))) - th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ duprintf("Dropping evil TCP offset=0 tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } @@ -117,13 +114,13 @@ tcp_mt(const struct sk_buff *skb, const struct net_device *in, return false; if (tcpinfo->option) { if (th->doff * 4 < sizeof(_tcph)) { - *hotdrop = true; + *par->hotdrop = true; return false; } - if (!tcp_find_option(tcpinfo->option, skb, protoff, + if (!tcp_find_option(tcpinfo->option, skb, par->thoff, th->doff*4 - sizeof(_tcph), tcpinfo->invflags & XT_TCP_INV_OPTION, - hotdrop)) + par->hotdrop)) return false; } return true; @@ -141,25 +138,22 @@ tcp_mt_check(const char *tablename, const void *info, return !(tcpinfo->invflags & ~XT_TCP_INV_MASK); } -static bool -udp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool udp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const struct udphdr *uh; struct udphdr _udph; - const struct xt_udp *udpinfo = matchinfo; + const struct xt_udp *udpinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - uh = skb_header_pointer(skb, protoff, sizeof(_udph), &_udph); + uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph); if (uh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ duprintf("Dropping evil UDP tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 32d4c769caa..28599d3979c 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -153,11 +153,9 @@ static void localtime_3(struct xtm *r, time_t time) } static bool -time_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +time_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_time_info *info = matchinfo; + const struct xt_time_info *info = par->matchinfo; unsigned int packet_time; struct xtm current_time; s64 stamp; diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c index a6b971dc5d3..24a52762450 100644 --- a/net/netfilter/xt_u32.c +++ b/net/netfilter/xt_u32.c @@ -87,12 +87,9 @@ static bool u32_match_it(const struct xt_u32 *data, return true; } -static bool -u32_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool u32_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_u32 *data = matchinfo; + const struct xt_u32 *data = par->matchinfo; bool ret; ret = u32_match_it(data, skb); -- cgit v1.2.3-70-g09d2 From 9b4fce7a3508a9776534188b6065b206a9608ccf Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:18 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (2/6) This patch does this for match extensions' checkentry functions. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 32 +++++++++++++++-------- net/bridge/netfilter/ebt_802_3.c | 7 ++---- net/bridge/netfilter/ebt_among.c | 9 +++---- net/bridge/netfilter/ebt_arp.c | 9 +++---- net/bridge/netfilter/ebt_ip.c | 9 +++---- net/bridge/netfilter/ebt_ip6.c | 9 +++---- net/bridge/netfilter/ebt_limit.c | 7 ++---- net/bridge/netfilter/ebt_mark_m.c | 7 ++---- net/bridge/netfilter/ebt_pkttype.c | 7 ++---- net/bridge/netfilter/ebt_stp.c | 9 +++---- net/bridge/netfilter/ebt_vlan.c | 9 +++---- net/bridge/netfilter/ebtables.c | 19 +++++++++----- net/ipv4/netfilter/ip_tables.c | 49 +++++++++++++++++------------------- net/ipv4/netfilter/ipt_addrtype.c | 13 +++++----- net/ipv4/netfilter/ipt_ah.c | 8 ++---- net/ipv4/netfilter/ipt_ecn.c | 9 +++---- net/ipv6/netfilter/ip6_tables.c | 48 +++++++++++++++++------------------ net/ipv6/netfilter/ip6t_ah.c | 8 ++---- net/ipv6/netfilter/ip6t_frag.c | 8 ++---- net/ipv6/netfilter/ip6t_hbh.c | 8 ++---- net/ipv6/netfilter/ip6t_ipv6header.c | 7 ++---- net/ipv6/netfilter/ip6t_mh.c | 8 ++---- net/ipv6/netfilter/ip6t_rt.c | 8 ++---- net/netfilter/x_tables.c | 32 +++++++++++------------ net/netfilter/xt_connbytes.c | 14 ++++------- net/netfilter/xt_connlimit.c | 13 ++++------ net/netfilter/xt_connmark.c | 20 ++++++--------- net/netfilter/xt_conntrack.c | 9 +++---- net/netfilter/xt_dccp.c | 7 ++---- net/netfilter/xt_dscp.c | 11 +++----- net/netfilter/xt_esp.c | 8 ++---- net/netfilter/xt_hashlimit.c | 24 +++++++----------- net/netfilter/xt_helper.c | 11 +++----- net/netfilter/xt_limit.c | 7 ++---- net/netfilter/xt_mark.c | 7 ++---- net/netfilter/xt_multiport.c | 37 +++++++++------------------ net/netfilter/xt_owner.c | 14 +++-------- net/netfilter/xt_physdev.c | 13 ++++------ net/netfilter/xt_policy.c | 15 +++++------ net/netfilter/xt_quota.c | 7 ++---- net/netfilter/xt_rateest.c | 8 ++---- net/netfilter/xt_recent.c | 7 ++---- net/netfilter/xt_sctp.c | 7 ++---- net/netfilter/xt_state.c | 9 +++---- net/netfilter/xt_statistic.c | 7 ++---- net/netfilter/xt_string.c | 9 +++---- net/netfilter/xt_tcpudp.c | 16 +++--------- net/netfilter/xt_time.c | 7 ++---- 48 files changed, 240 insertions(+), 386 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index bcd40ec8325..763a704ce83 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -193,6 +193,25 @@ struct xt_match_param { bool *hotdrop; }; +/** + * struct xt_mtchk_param - parameters for match extensions' + * checkentry functions + * + * @table: table the rule is tried to be inserted into + * @entryinfo: the family-specific rule data + * (struct ipt_ip, ip6t_ip, ebt_entry) + * @match: struct xt_match through which this function was invoked + * @matchinfo: per-match data + * @hook_mask: via which hooks the new rule is reachable + */ +struct xt_mtchk_param { + const char *table; + const void *entryinfo; + const struct xt_match *match; + void *matchinfo; + unsigned int hook_mask; +}; + struct xt_match { struct list_head list; @@ -208,12 +227,7 @@ struct xt_match const struct xt_match_param *); /* Called when user tries to insert an entry of this type. */ - /* Should return true or false. */ - bool (*checkentry)(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask); + bool (*checkentry)(const struct xt_mtchk_param *); /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_match *match, void *matchinfo); @@ -342,10 +356,8 @@ extern void xt_unregister_match(struct xt_match *target); extern int xt_register_matches(struct xt_match *match, unsigned int n); extern void xt_unregister_matches(struct xt_match *match, unsigned int n); -extern int xt_check_match(const struct xt_match *match, unsigned short family, - unsigned int size, const char *table, unsigned int hook, - unsigned short proto, int inv_proto, - const void *entry, void *matchinfo); +extern int xt_check_match(struct xt_mtchk_param *, u_int8_t family, + unsigned int size, u_int8_t proto, bool inv_proto); extern int xt_check_target(const struct xt_target *target, unsigned short family, unsigned int size, const char *table, unsigned int hook, unsigned short proto, int inv_proto, diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c index c9e1bc14951..bd91dc58d49 100644 --- a/net/bridge/netfilter/ebt_802_3.c +++ b/net/bridge/netfilter/ebt_802_3.c @@ -36,12 +36,9 @@ ebt_802_3_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -ebt_802_3_mt_check(const char *table, const void *entry, - const struct xt_match *match, void *data, - unsigned int hook_mask) +static bool ebt_802_3_mt_check(const struct xt_mtchk_param *par) { - const struct ebt_802_3_info *info = data; + const struct ebt_802_3_info *info = par->matchinfo; if (info->bitmask & ~EBT_802_3_MASK || info->invflags & ~EBT_802_3_MASK) return false; diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c index 0ad0db3e815..b595f091f35 100644 --- a/net/bridge/netfilter/ebt_among.c +++ b/net/bridge/netfilter/ebt_among.c @@ -171,14 +171,11 @@ ebt_among_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -ebt_among_mt_check(const char *table, const void *entry, - const struct xt_match *match, void *data, - unsigned int hook_mask) +static bool ebt_among_mt_check(const struct xt_mtchk_param *par) { + const struct ebt_among_info *info = par->matchinfo; const struct ebt_entry_match *em = - container_of(data, const struct ebt_entry_match, data); - const struct ebt_among_info *info = data; + container_of(par->matchinfo, const struct ebt_entry_match, data); int expected_length = sizeof(struct ebt_among_info); const struct ebt_mac_wormhash *wh_dst, *wh_src; int err; diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c index 1ff8fa3a9e7..b7ad60419f9 100644 --- a/net/bridge/netfilter/ebt_arp.c +++ b/net/bridge/netfilter/ebt_arp.c @@ -100,13 +100,10 @@ ebt_arp_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -ebt_arp_mt_check(const char *table, const void *entry, - const struct xt_match *match, void *data, - unsigned int hook_mask) +static bool ebt_arp_mt_check(const struct xt_mtchk_param *par) { - const struct ebt_arp_info *info = data; - const struct ebt_entry *e = entry; + const struct ebt_arp_info *info = par->matchinfo; + const struct ebt_entry *e = par->entryinfo; if ((e->ethproto != htons(ETH_P_ARP) && e->ethproto != htons(ETH_P_RARP)) || diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index c70ea39840b..d771bbfbcbe 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -77,13 +77,10 @@ ebt_ip_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -ebt_ip_mt_check(const char *table, const void *entry, - const struct xt_match *match, void *data, - unsigned int hook_mask) +static bool ebt_ip_mt_check(const struct xt_mtchk_param *par) { - const struct ebt_ip_info *info = data; - const struct ebt_entry *e = entry; + const struct ebt_ip_info *info = par->matchinfo; + const struct ebt_entry *e = par->entryinfo; if (e->ethproto != htons(ETH_P_IP) || e->invflags & EBT_IPROTO) diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c index 5acee02de72..784a6573876 100644 --- a/net/bridge/netfilter/ebt_ip6.c +++ b/net/bridge/netfilter/ebt_ip6.c @@ -90,13 +90,10 @@ ebt_ip6_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -ebt_ip6_mt_check(const char *table, const void *entry, - const struct xt_match *match, void *data, - unsigned int hook_mask) +static bool ebt_ip6_mt_check(const struct xt_mtchk_param *par) { - const struct ebt_entry *e = entry; - struct ebt_ip6_info *info = data; + const struct ebt_entry *e = par->entryinfo; + struct ebt_ip6_info *info = par->matchinfo; if (e->ethproto != htons(ETH_P_IPV6) || e->invflags & EBT_IPROTO) return false; diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c index 9a3ec8cadaa..f7bd9192ff0 100644 --- a/net/bridge/netfilter/ebt_limit.c +++ b/net/bridge/netfilter/ebt_limit.c @@ -64,12 +64,9 @@ user2credits(u_int32_t user) return (user * HZ * CREDITS_PER_JIFFY) / EBT_LIMIT_SCALE; } -static bool -ebt_limit_mt_check(const char *table, const void *e, - const struct xt_match *match, void *data, - unsigned int hook_mask) +static bool ebt_limit_mt_check(const struct xt_mtchk_param *par) { - struct ebt_limit_info *info = data; + struct ebt_limit_info *info = par->matchinfo; /* Check for overflow. */ if (info->burst == 0 || diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c index 5b22ef96127..ea570f214b1 100644 --- a/net/bridge/netfilter/ebt_mark_m.c +++ b/net/bridge/netfilter/ebt_mark_m.c @@ -22,12 +22,9 @@ ebt_mark_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ((skb->mark & info->mask) == info->mark) ^ info->invert; } -static bool -ebt_mark_mt_check(const char *table, const void *e, - const struct xt_match *match, void *data, - unsigned int hook_mask) +static bool ebt_mark_mt_check(const struct xt_mtchk_param *par) { - const struct ebt_mark_m_info *info = data; + const struct ebt_mark_m_info *info = par->matchinfo; if (info->bitmask & ~EBT_MARK_MASK) return false; diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c index b756f88fb10..883e96e2a54 100644 --- a/net/bridge/netfilter/ebt_pkttype.c +++ b/net/bridge/netfilter/ebt_pkttype.c @@ -20,12 +20,9 @@ ebt_pkttype_mt(const struct sk_buff *skb, const struct xt_match_param *par) return (skb->pkt_type == info->pkt_type) ^ info->invert; } -static bool -ebt_pkttype_mt_check(const char *table, const void *e, - const struct xt_match *match, void *data, - unsigned int hook_mask) +static bool ebt_pkttype_mt_check(const struct xt_mtchk_param *par) { - const struct ebt_pkttype_info *info = data; + const struct ebt_pkttype_info *info = par->matchinfo; if (info->invert != 0 && info->invert != 1) return false; diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 06d777c62c3..48527e62162 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -153,15 +153,12 @@ ebt_stp_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -ebt_stp_mt_check(const char *table, const void *entry, - const struct xt_match *match, void *data, - unsigned int hook_mask) +static bool ebt_stp_mt_check(const struct xt_mtchk_param *par) { - const struct ebt_stp_info *info = data; + const struct ebt_stp_info *info = par->matchinfo; const uint8_t bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00}; const uint8_t msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - const struct ebt_entry *e = entry; + const struct ebt_entry *e = par->entryinfo; if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK || !(info->bitmask & EBT_STP_MASK)) diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c index b05b4a81834..3dddd489328 100644 --- a/net/bridge/netfilter/ebt_vlan.c +++ b/net/bridge/netfilter/ebt_vlan.c @@ -84,13 +84,10 @@ ebt_vlan_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -ebt_vlan_mt_check(const char *table, const void *entry, - const struct xt_match *match, void *data, - unsigned int hook_mask) +static bool ebt_vlan_mt_check(const struct xt_mtchk_param *par) { - struct ebt_vlan_info *info = data; - const struct ebt_entry *e = entry; + struct ebt_vlan_info *info = par->matchinfo; + const struct ebt_entry *e = par->entryinfo; /* Is it 802.1Q frame checked? */ if (e->ethproto != htons(ETH_P_8021Q)) { diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index f8e1822f38d..5ce37b2f5b8 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -324,9 +324,10 @@ find_table_lock(const char *name, int *error, struct mutex *mutex) } static inline int -ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e, - const char *name, unsigned int hookmask, unsigned int *cnt) +ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, + unsigned int *cnt) { + const struct ebt_entry *e = par->entryinfo; struct xt_match *match; size_t left = ((char *)e + e->watchers_offset) - (char *)m; int ret; @@ -343,9 +344,10 @@ ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e, return -ENOENT; m->u.match = match; - ret = xt_check_match(match, NFPROTO_BRIDGE, m->match_size, - name, hookmask, e->ethproto, e->invflags & EBT_IPROTO, - e, m->data); + par->match = match; + par->matchinfo = m->data; + ret = xt_check_match(par, NFPROTO_BRIDGE, m->match_size, + e->ethproto, e->invflags & EBT_IPROTO); if (ret < 0) { module_put(match->me); return ret; @@ -607,6 +609,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, unsigned int i, j, hook = 0, hookmask = 0; size_t gap; int ret; + struct xt_mtchk_param par; /* don't mess with the struct ebt_entries */ if (e->bitmask == 0) @@ -647,7 +650,11 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, hookmask = cl_s[i - 1].hookmask; } i = 0; - ret = EBT_MATCH_ITERATE(e, ebt_check_match, e, name, hookmask, &i); + + par.table = name; + par.entryinfo = e; + par.hook_mask = hookmask; + ret = EBT_MATCH_ITERATE(e, ebt_check_match, &par, &i); if (ret != 0) goto cleanup_matches; j = 0; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 99fdb59454f..4147298a6a8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -607,20 +607,20 @@ check_entry(struct ipt_entry *e, const char *name) } static int -check_match(struct ipt_entry_match *m, const char *name, - const struct ipt_ip *ip, - unsigned int hookmask, unsigned int *i) +check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, + unsigned int *i) { - struct xt_match *match; + const struct ipt_ip *ip = par->entryinfo; int ret; - match = m->u.kernel.match; - ret = xt_check_match(match, AF_INET, m->u.match_size - sizeof(*m), - name, hookmask, ip->proto, - ip->invflags & IPT_INV_PROTO, ip, m->data); + par->match = m->u.kernel.match; + par->matchinfo = m->data; + + ret = xt_check_match(par, NFPROTO_IPV4, m->u.match_size - sizeof(*m), + ip->proto, ip->invflags & IPT_INV_PROTO); if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", - m->u.kernel.match->name); + par.match->name); return ret; } ++*i; @@ -628,10 +628,7 @@ check_match(struct ipt_entry_match *m, const char *name, } static int -find_check_match(struct ipt_entry_match *m, - const char *name, - const struct ipt_ip *ip, - unsigned int hookmask, +find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, unsigned int *i) { struct xt_match *match; @@ -646,7 +643,7 @@ find_check_match(struct ipt_entry_match *m, } m->u.kernel.match = match; - ret = check_match(m, name, ip, hookmask, i); + ret = check_match(m, par, i); if (ret) goto err; @@ -683,14 +680,17 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, struct xt_target *target; int ret; unsigned int j; + struct xt_mtchk_param mtpar; ret = check_entry(e, name); if (ret) return ret; j = 0; - ret = IPT_MATCH_ITERATE(e, find_check_match, name, &e->ip, - e->comefrom, &j); + mtpar.table = name; + mtpar.entryinfo = &e->ip; + mtpar.hook_mask = e->comefrom; + ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j); if (ret != 0) goto cleanup_matches; @@ -1644,12 +1644,15 @@ static int compat_check_entry(struct ipt_entry *e, const char *name, unsigned int *i) { + struct xt_mtchk_param mtpar; unsigned int j; int ret; j = 0; - ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, - e->comefrom, &j); + mtpar.table = name; + mtpar.entryinfo = &e->ip; + mtpar.hook_mask = e->comefrom; + ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j); if (ret) goto cleanup_matches; @@ -2144,15 +2147,9 @@ icmp_match(const struct sk_buff *skb, const struct xt_match_param *par) !!(icmpinfo->invflags&IPT_ICMP_INV)); } -/* Called when user tries to insert an entry of this type. */ -static bool -icmp_checkentry(const char *tablename, - const void *entry, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool icmp_checkentry(const struct xt_mtchk_param *par) { - const struct ipt_icmp *icmpinfo = matchinfo; + const struct ipt_icmp *icmpinfo = par->matchinfo; /* Must specify no unknown invflags */ return !(icmpinfo->invflags & ~IPT_ICMP_INV); diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index e60995e4c20..88762f02779 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -68,12 +68,9 @@ addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool -addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) { - struct ipt_addrtype_info_v1 *info = matchinfo; + struct ipt_addrtype_info_v1 *info = par->matchinfo; if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { @@ -82,14 +79,16 @@ addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void, return false; } - if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN) && + if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN)) && info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { printk(KERN_ERR "ipt_addrtype: output interface limitation " "not valid in PRE_ROUTING and INPUT\n"); return false; } - if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT) && + if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT)) && info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { printk(KERN_ERR "ipt_addrtype: input interface limitation " "not valid in POST_ROUTING and OUTPUT\n"); diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index 2fce19ef4f3..0104c0b399d 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -61,13 +61,9 @@ static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par) !!(ahinfo->invflags & IPT_AH_INV_SPI)); } -/* Called when user tries to insert an entry of this type. */ -static bool -ah_mt_check(const char *tablename, const void *ip_void, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool ah_mt_check(const struct xt_mtchk_param *par) { - const struct ipt_ah *ahinfo = matchinfo; + const struct ipt_ah *ahinfo = par->matchinfo; /* Must specify no unknown invflags */ if (ahinfo->invflags & ~IPT_AH_INV_MASK) { diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index 06915463150..6289b64144c 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -85,13 +85,10 @@ static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -ecn_mt_check(const char *tablename, const void *ip_void, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool ecn_mt_check(const struct xt_mtchk_param *par) { - const struct ipt_ecn_info *info = matchinfo; - const struct ipt_ip *ip = ip_void; + const struct ipt_ecn_info *info = par->matchinfo; + const struct ipt_ip *ip = par->entryinfo; if (info->operation & IPT_ECN_OP_MATCH_MASK) return false; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index cf2c5370a4e..9c843e3777b 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -629,20 +629,20 @@ check_entry(struct ip6t_entry *e, const char *name) return 0; } -static int check_match(struct ip6t_entry_match *m, const char *name, - const struct ip6t_ip6 *ipv6, - unsigned int hookmask, unsigned int *i) +static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par, + unsigned int *i) { - struct xt_match *match; + const struct ip6t_ip6 *ipv6 = par->entryinfo; int ret; - match = m->u.kernel.match; - ret = xt_check_match(match, AF_INET6, m->u.match_size - sizeof(*m), - name, hookmask, ipv6->proto, - ipv6->invflags & IP6T_INV_PROTO, ipv6, m->data); + par->match = m->u.kernel.match; + par->matchinfo = m->data; + + ret = xt_check_match(par, NFPROTO_IPV6, m->u.match_size - sizeof(*m), + ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", - m->u.kernel.match->name); + par.match->name); return ret; } ++*i; @@ -650,10 +650,7 @@ static int check_match(struct ip6t_entry_match *m, const char *name, } static int -find_check_match(struct ip6t_entry_match *m, - const char *name, - const struct ip6t_ip6 *ipv6, - unsigned int hookmask, +find_check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par, unsigned int *i) { struct xt_match *match; @@ -668,7 +665,7 @@ find_check_match(struct ip6t_entry_match *m, } m->u.kernel.match = match; - ret = check_match(m, name, ipv6, hookmask, i); + ret = check_match(m, par, i); if (ret) goto err; @@ -705,14 +702,17 @@ find_check_entry(struct ip6t_entry *e, const char *name, unsigned int size, struct xt_target *target; int ret; unsigned int j; + struct xt_mtchk_param mtpar; ret = check_entry(e, name); if (ret) return ret; j = 0; - ret = IP6T_MATCH_ITERATE(e, find_check_match, name, &e->ipv6, - e->comefrom, &j); + mtpar.table = name; + mtpar.entryinfo = &e->ipv6; + mtpar.hook_mask = e->comefrom; + ret = IP6T_MATCH_ITERATE(e, find_check_match, &mtpar, &j); if (ret != 0) goto cleanup_matches; @@ -1669,10 +1669,13 @@ static int compat_check_entry(struct ip6t_entry *e, const char *name, { unsigned int j; int ret; + struct xt_mtchk_param mtpar; j = 0; - ret = IP6T_MATCH_ITERATE(e, check_match, name, &e->ipv6, - e->comefrom, &j); + mtpar.table = name; + mtpar.entryinfo = &e->ipv6; + mtpar.hook_mask = e->comefrom; + ret = IP6T_MATCH_ITERATE(e, check_match, &mtpar, &j); if (ret) goto cleanup_matches; @@ -2166,14 +2169,9 @@ icmp6_match(const struct sk_buff *skb, const struct xt_match_param *par) } /* Called when user tries to insert an entry of this type. */ -static bool -icmp6_checkentry(const char *tablename, - const void *entry, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool icmp6_checkentry(const struct xt_mtchk_param *par) { - const struct ip6t_icmp *icmpinfo = matchinfo; + const struct ip6t_icmp *icmpinfo = par->matchinfo; /* Must specify no unknown invflags */ return !(icmpinfo->invflags & ~IP6T_ICMP_INV); diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index a04f2b8396e..3a82f24746b 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -90,13 +90,9 @@ static bool ah_mt6(const struct sk_buff *skb, const struct xt_match_param *par) !(ahinfo->hdrres && ah->reserved); } -/* Called when user tries to insert an entry of this type. */ -static bool -ah_mt6_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool ah_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_ah *ahinfo = matchinfo; + const struct ip6t_ah *ahinfo = par->matchinfo; if (ahinfo->invflags & ~IP6T_AH_INV_MASK) { pr_debug("ip6t_ah: unknown flags %X\n", ahinfo->invflags); diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c index 6951d0dacf4..673aa0a5084 100644 --- a/net/ipv6/netfilter/ip6t_frag.c +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -107,13 +107,9 @@ frag_mt6(const struct sk_buff *skb, const struct xt_match_param *par) && (ntohs(fh->frag_off) & IP6_MF)); } -/* Called when user tries to insert an entry of this type. */ -static bool -frag_mt6_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool frag_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_frag *fraginfo = matchinfo; + const struct ip6t_frag *fraginfo = par->matchinfo; if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) { pr_debug("ip6t_frag: unknown flags %X\n", fraginfo->invflags); diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c index d3351978819..cbe8dec9744 100644 --- a/net/ipv6/netfilter/ip6t_hbh.c +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -160,13 +160,9 @@ hbh_mt6(const struct sk_buff *skb, const struct xt_match_param *par) return false; } -/* Called when user tries to insert an entry of this type. */ -static bool -hbh_mt6_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool hbh_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_opts *optsinfo = matchinfo; + const struct ip6t_opts *optsinfo = par->matchinfo; if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { pr_debug("ip6t_opts: unknown flags %X\n", optsinfo->invflags); diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 6aaca511d47..14e6724d567 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -118,12 +118,9 @@ ipv6header_mt6(const struct sk_buff *skb, const struct xt_match_param *par) } } -static bool -ipv6header_mt6_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool ipv6header_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_ipv6header_info *info = matchinfo; + const struct ip6t_ipv6header_info *info = par->matchinfo; /* invflags is 0 or 0xff in hard mode */ if ((!info->modeflag) && info->invflags != 0x00 && diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c index 2803258b6d0..aafe4e66577 100644 --- a/net/ipv6/netfilter/ip6t_mh.c +++ b/net/ipv6/netfilter/ip6t_mh.c @@ -67,13 +67,9 @@ static bool mh_mt6(const struct sk_buff *skb, const struct xt_match_param *par) !!(mhinfo->invflags & IP6T_MH_INV_TYPE)); } -/* Called when user tries to insert an entry of this type. */ -static bool -mh_mt6_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool mh_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_mh *mhinfo = matchinfo; + const struct ip6t_mh *mhinfo = par->matchinfo; /* Must specify no unknown invflags */ return !(mhinfo->invflags & ~IP6T_MH_INV_MASK); diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index 9cf4b8a37af..356b8d6f6ba 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -186,13 +186,9 @@ static bool rt_mt6(const struct sk_buff *skb, const struct xt_match_param *par) return false; } -/* Called when user tries to insert an entry of this type. */ -static bool -rt_mt6_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool rt_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_rt *rtinfo = matchinfo; + const struct ip6t_rt *rtinfo = par->matchinfo; if (rtinfo->invflags & ~IP6T_RT_INV_MASK) { pr_debug("ip6t_rt: unknown flags %X\n", rtinfo->invflags); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index d1f2fb3e8f2..817ab14f7cd 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -321,39 +321,39 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target, } EXPORT_SYMBOL_GPL(xt_find_revision); -int xt_check_match(const struct xt_match *match, unsigned short family, - unsigned int size, const char *table, unsigned int hook_mask, - unsigned short proto, int inv_proto, const void *entry, - void *matchinfo) +int xt_check_match(struct xt_mtchk_param *par, u_int8_t family, + unsigned int size, u_int8_t proto, bool inv_proto) { - if (XT_ALIGN(match->matchsize) != size && - match->matchsize != -1) { + if (XT_ALIGN(par->match->matchsize) != size && + par->match->matchsize != -1) { /* * ebt_among is exempt from centralized matchsize checking * because it uses a dynamic-size data set. */ printk("%s_tables: %s match: invalid size %Zu != %u\n", - xt_prefix[family], match->name, - XT_ALIGN(match->matchsize), size); + xt_prefix[family], par->match->name, + XT_ALIGN(par->match->matchsize), size); return -EINVAL; } - if (match->table && strcmp(match->table, table)) { + if (par->match->table != NULL && + strcmp(par->match->table, par->table) != 0) { printk("%s_tables: %s match: only valid in %s table, not %s\n", - xt_prefix[family], match->name, match->table, table); + xt_prefix[family], par->match->name, + par->match->table, par->table); return -EINVAL; } - if (match->hooks && (hook_mask & ~match->hooks) != 0) { + if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { printk("%s_tables: %s match: bad hook_mask %#x/%#x\n", - xt_prefix[family], match->name, hook_mask, match->hooks); + xt_prefix[family], par->match->name, + par->hook_mask, par->match->hooks); return -EINVAL; } - if (match->proto && (match->proto != proto || inv_proto)) { + if (par->match->proto && (par->match->proto != proto || inv_proto)) { printk("%s_tables: %s match: only valid for protocol %u\n", - xt_prefix[family], match->name, match->proto); + xt_prefix[family], par->match->name, par->match->proto); return -EINVAL; } - if (match->checkentry != NULL && - !match->checkentry(table, entry, match, matchinfo, hook_mask)) + if (par->match->checkentry != NULL && !par->match->checkentry(par)) return -EINVAL; return 0; } diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 30c19b5fe90..43a36c728e5 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -92,12 +92,9 @@ connbytes_mt(const struct sk_buff *skb, const struct xt_match_param *par) return what >= sinfo->count.from; } -static bool -connbytes_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool connbytes_mt_check(const struct xt_mtchk_param *par) { - const struct xt_connbytes_info *sinfo = matchinfo; + const struct xt_connbytes_info *sinfo = par->matchinfo; if (sinfo->what != XT_CONNBYTES_PKTS && sinfo->what != XT_CONNBYTES_BYTES && @@ -109,17 +106,16 @@ connbytes_mt_check(const char *tablename, const void *ip, sinfo->direction != XT_CONNBYTES_DIR_BOTH) return false; - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } return true; } -static void -connbytes_mt_destroy(const struct xt_match *match, void *matchinfo) +static void connbytes_mt_destroy(const struct xt_match *match, void *matchinfo) { nf_ct_l3proto_module_put(match->family); } diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 8b8f70e7664..1361e9919cf 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -221,24 +221,21 @@ connlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par) return false; } -static bool -connlimit_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool connlimit_mt_check(const struct xt_mtchk_param *par) { - struct xt_connlimit_info *info = matchinfo; + struct xt_connlimit_info *info = par->matchinfo; unsigned int i; - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "cannot load conntrack support for " - "address family %u\n", match->family); + "address family %u\n", par->match->family); return false; } /* init private data */ info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL); if (info->data == NULL) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); return false; } diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index df4f4a865a5..b935b7888a9 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -61,33 +61,27 @@ connmark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) return ((ct->mark & info->mask) == info->mark) ^ info->invert; } -static bool -connmark_mt_check_v0(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool connmark_mt_check_v0(const struct xt_mtchk_param *par) { - const struct xt_connmark_info *cm = matchinfo; + const struct xt_connmark_info *cm = par->matchinfo; if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { printk(KERN_WARNING "connmark: only support 32bit mark\n"); return false; } - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } return true; } -static bool -connmark_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool connmark_mt_check(const struct xt_mtchk_param *par) { - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "cannot load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } return true; diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 13a7e4eacdf..f04c46a02ce 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -278,14 +278,11 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -conntrack_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool conntrack_mt_check(const struct xt_mtchk_param *par) { - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } return true; diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index 7aa30bb9105..e5d3e867328 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -121,12 +121,9 @@ dccp_mt(const struct sk_buff *skb, const struct xt_match_param *par) XT_DCCP_OPTION, info->flags, info->invflags); } -static bool -dccp_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool dccp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_dccp_info *info = matchinfo; + const struct xt_dccp_info *info = par->matchinfo; return !(info->flags & ~XT_DCCP_VALID_FLAGS) && !(info->invflags & ~XT_DCCP_VALID_FLAGS) diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c index 57d61206135..c3f8085460d 100644 --- a/net/netfilter/xt_dscp.c +++ b/net/netfilter/xt_dscp.c @@ -43,15 +43,12 @@ dscp_mt6(const struct sk_buff *skb, const struct xt_match_param *par) return (dscp == info->dscp) ^ !!info->invert; } -static bool -dscp_mt_check(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool dscp_mt_check(const struct xt_mtchk_param *par) { - const u_int8_t dscp = ((struct xt_dscp_info *)matchinfo)->dscp; + const struct xt_dscp_info *info = par->matchinfo; - if (dscp > XT_DSCP_MAX) { - printk(KERN_ERR "xt_dscp: dscp %x out of range\n", dscp); + if (info->dscp > XT_DSCP_MAX) { + printk(KERN_ERR "xt_dscp: dscp %x out of range\n", info->dscp); return false; } diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c index 6d59f2e7c1c..609439967c2 100644 --- a/net/netfilter/xt_esp.c +++ b/net/netfilter/xt_esp.c @@ -66,13 +66,9 @@ static bool esp_mt(const struct sk_buff *skb, const struct xt_match_param *par) !!(espinfo->invflags & XT_ESP_INV_SPI)); } -/* Called when user tries to insert an entry of this type. */ -static bool -esp_mt_check(const char *tablename, const void *ip_void, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool esp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_esp *espinfo = matchinfo; + const struct xt_esp *espinfo = par->matchinfo; if (espinfo->invflags & ~XT_ESP_INV_MASK) { duprintf("xt_esp: unknown flags %X\n", espinfo->invflags); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 22a60a728cf..2f73820e46d 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -664,12 +664,9 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par) return false; } -static bool -hashlimit_mt_check_v0(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool hashlimit_mt_check_v0(const struct xt_mtchk_param *par) { - struct xt_hashlimit_info *r = matchinfo; + struct xt_hashlimit_info *r = par->matchinfo; /* Check for overflow. */ if (r->cfg.burst == 0 || @@ -698,8 +695,8 @@ hashlimit_mt_check_v0(const char *tablename, const void *inf, * the list of htable's in htable_create(), since then we would * create duplicate proc files. -HW */ mutex_lock(&hlimit_mutex); - r->hinfo = htable_find_get(r->name, match->family); - if (!r->hinfo && htable_create_v0(r, match->family) != 0) { + r->hinfo = htable_find_get(r->name, par->match->family); + if (!r->hinfo && htable_create_v0(r, par->match->family) != 0) { mutex_unlock(&hlimit_mutex); return false; } @@ -710,12 +707,9 @@ hashlimit_mt_check_v0(const char *tablename, const void *inf, return true; } -static bool -hashlimit_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool hashlimit_mt_check(const struct xt_mtchk_param *par) { - struct xt_hashlimit_mtinfo1 *info = matchinfo; + struct xt_hashlimit_mtinfo1 *info = par->matchinfo; /* Check for overflow. */ if (info->cfg.burst == 0 || @@ -729,7 +723,7 @@ hashlimit_mt_check(const char *tablename, const void *inf, return false; if (info->name[sizeof(info->name)-1] != '\0') return false; - if (match->family == NFPROTO_IPV4) { + if (par->match->family == NFPROTO_IPV4) { if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32) return false; } else { @@ -744,8 +738,8 @@ hashlimit_mt_check(const char *tablename, const void *inf, * the list of htable's in htable_create(), since then we would * create duplicate proc files. -HW */ mutex_lock(&hlimit_mutex); - info->hinfo = htable_find_get(info->name, match->family); - if (!info->hinfo && htable_create(info, match->family) != 0) { + info->hinfo = htable_find_get(info->name, par->match->family); + if (!info->hinfo && htable_create(info, par->match->family) != 0) { mutex_unlock(&hlimit_mutex); return false; } diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 73bdc3ba13f..86d3c332fcb 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -54,16 +54,13 @@ helper_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool -helper_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool helper_mt_check(const struct xt_mtchk_param *par) { - struct xt_helper_info *info = matchinfo; + struct xt_helper_info *info = par->matchinfo; - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } info->name[29] = '\0'; diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index c475eac5dbe..c908d69a559 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -92,12 +92,9 @@ user2credits(u_int32_t user) return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE; } -static bool -limit_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool limit_mt_check(const struct xt_mtchk_param *par) { - struct xt_rateinfo *r = matchinfo; + struct xt_rateinfo *r = par->matchinfo; /* Check for overflow. */ if (r->burst == 0 diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 88547614653..10b9e34bbc5 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -38,12 +38,9 @@ mark_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ((skb->mark & info->mask) == info->mark) ^ info->invert; } -static bool -mark_mt_check_v0(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool mark_mt_check_v0(const struct xt_mtchk_param *par) { - const struct xt_mark_info *minfo = matchinfo; + const struct xt_mark_info *minfo = par->matchinfo; if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { printk(KERN_WARNING "mark: only supports 32bit mark\n"); diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index 7087e291528..d06bb2dd390 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -158,50 +158,37 @@ check(u_int16_t proto, && count <= XT_MULTI_PORTS; } -/* Called when user tries to insert an entry of this type. */ -static bool -multiport_mt_check_v0(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool multiport_mt_check_v0(const struct xt_mtchk_param *par) { - const struct ipt_ip *ip = info; - const struct xt_multiport *multiinfo = matchinfo; + const struct ipt_ip *ip = par->entryinfo; + const struct xt_multiport *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count); } -static bool -multiport_mt_check(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool multiport_mt_check(const struct xt_mtchk_param *par) { - const struct ipt_ip *ip = info; - const struct xt_multiport_v1 *multiinfo = matchinfo; + const struct ipt_ip *ip = par->entryinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count); } -static bool -multiport_mt6_check_v0(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool multiport_mt6_check_v0(const struct xt_mtchk_param *par) { - const struct ip6t_ip6 *ip = info; - const struct xt_multiport *multiinfo = matchinfo; + const struct ip6t_ip6 *ip = par->entryinfo; + const struct xt_multiport *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count); } -static bool -multiport_mt6_check(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool multiport_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_ip6 *ip = info; - const struct xt_multiport_v1 *multiinfo = matchinfo; + const struct ip6t_ip6 *ip = par->entryinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 493b5eb8d14..32f84e84d9e 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -107,12 +107,9 @@ owner_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -owner_mt_check_v0(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool owner_mt_check_v0(const struct xt_mtchk_param *par) { - const struct ipt_owner_info *info = matchinfo; + const struct ipt_owner_info *info = par->matchinfo; if (info->match & (IPT_OWNER_PID | IPT_OWNER_SID | IPT_OWNER_COMM)) { printk(KERN_WARNING KBUILD_MODNAME @@ -124,12 +121,9 @@ owner_mt_check_v0(const char *tablename, const void *ip, return true; } -static bool -owner_mt6_check_v0(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool owner_mt6_check_v0(const struct xt_mtchk_param *par) { - const struct ip6t_owner_info *info = matchinfo; + const struct ip6t_owner_info *info = par->matchinfo; if (info->match & (IP6T_OWNER_PID | IP6T_OWNER_SID)) { printk(KERN_WARNING KBUILD_MODNAME diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index e980e179d4f..b01786d2dd9 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -91,12 +91,9 @@ match_outdev: return ret ^ !(info->invert & XT_PHYSDEV_OP_OUT); } -static bool -physdev_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool physdev_mt_check(const struct xt_mtchk_param *par) { - const struct xt_physdev_info *info = matchinfo; + const struct xt_physdev_info *info = par->matchinfo; if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || info->bitmask & ~XT_PHYSDEV_OP_MASK) @@ -104,12 +101,12 @@ physdev_mt_check(const char *tablename, const void *ip, if (info->bitmask & XT_PHYSDEV_OP_OUT && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && - hook_mask & ((1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_POST_ROUTING))) { + par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { printk(KERN_WARNING "physdev match: using --physdev-out in the " "OUTPUT, FORWARD and POSTROUTING chains for non-bridged " "traffic is not supported anymore.\n"); - if (hook_mask & (1 << NF_INET_LOCAL_OUT)) + if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) return false; } return true; diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index b0a00fb0511..328bd20ddd2 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -128,26 +128,23 @@ policy_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool -policy_mt_check(const char *tablename, const void *ip_void, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool policy_mt_check(const struct xt_mtchk_param *par) { - const struct xt_policy_info *info = matchinfo; + const struct xt_policy_info *info = par->matchinfo; if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) { printk(KERN_ERR "xt_policy: neither incoming nor " "outgoing policy selected\n"); return false; } - if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN) - && info->flags & XT_POLICY_MATCH_OUT) { + if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { printk(KERN_ERR "xt_policy: output policy not valid in " "PRE_ROUTING and INPUT\n"); return false; } - if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT) - && info->flags & XT_POLICY_MATCH_IN) { + if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) { printk(KERN_ERR "xt_policy: input policy not valid in " "POST_ROUTING and OUTPUT\n"); return false; diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index 3ab92666c14..c84fce5e0f3 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -37,12 +37,9 @@ quota_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool -quota_mt_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool quota_mt_check(const struct xt_mtchk_param *par) { - struct xt_quota_info *q = matchinfo; + struct xt_quota_info *q = par->matchinfo; if (q->flags & ~XT_QUOTA_MASK) return false; diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index e9f64ef4565..4b05ce168a7 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -74,13 +74,9 @@ xt_rateest_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool xt_rateest_mt_checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) { - struct xt_rateest_match_info *info = matchinfo; + struct xt_rateest_match_info *info = par->matchinfo; struct xt_rateest *est1, *est2; if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS | diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index baeb90a5623..a512b49f3fe 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -280,12 +280,9 @@ out: return ret; } -static bool -recent_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool recent_mt_check(const struct xt_mtchk_param *par) { - const struct xt_recent_mtinfo *info = matchinfo; + const struct xt_recent_mtinfo *info = par->matchinfo; struct recent_table *t; unsigned i; bool ret = false; diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index b0014ab65da..e223cb43ae8 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -147,12 +147,9 @@ sctp_mt(const struct sk_buff *skb, const struct xt_match_param *par) XT_SCTP_CHUNK_TYPES, info->flags, info->invflags); } -static bool -sctp_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool sctp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_sctp_info *info = matchinfo; + const struct xt_sctp_info *info = par->matchinfo; return !(info->flags & ~XT_SCTP_VALID_FLAGS) && !(info->invflags & ~XT_SCTP_VALID_FLAGS) diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index 29f5a8a1b02..88b1235519d 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -37,14 +37,11 @@ state_mt(const struct sk_buff *skb, const struct xt_match_param *par) return (sinfo->statemask & statebit); } -static bool -state_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool state_mt_check(const struct xt_mtchk_param *par) { - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } return true; diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index dcadc491db2..0d75141139d 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -49,12 +49,9 @@ statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool -statistic_mt_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool statistic_mt_check(const struct xt_mtchk_param *par) { - struct xt_statistic_info *info = matchinfo; + struct xt_statistic_info *info = par->matchinfo; if (info->mode > XT_STATISTIC_MODE_MAX || info->flags & ~XT_STATISTIC_MASK) diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 33f2d29ca4f..c9407aa78f7 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -40,12 +40,9 @@ string_mt(const struct sk_buff *skb, const struct xt_match_param *par) #define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m)) -static bool -string_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool string_mt_check(const struct xt_mtchk_param *par) { - struct xt_string_info *conf = matchinfo; + struct xt_string_info *conf = par->matchinfo; struct ts_config *ts_conf; int flags = TS_AUTOLOAD; @@ -56,7 +53,7 @@ string_mt_check(const char *tablename, const void *ip, return false; if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE) return false; - if (match->revision == 1) { + if (par->match->revision == 1) { if (conf->u.v1.flags & ~(XT_STRING_FLAG_IGNORECASE | XT_STRING_FLAG_INVERT)) return false; diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index 66cf71b1d59..1ebdc4934ee 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -126,13 +126,9 @@ static bool tcp_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -/* Called when user tries to insert an entry of this type. */ -static bool -tcp_mt_check(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool tcp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_tcp *tcpinfo = matchinfo; + const struct xt_tcp *tcpinfo = par->matchinfo; /* Must specify no unknown invflags */ return !(tcpinfo->invflags & ~XT_TCP_INV_MASK); @@ -165,13 +161,9 @@ static bool udp_mt(const struct sk_buff *skb, const struct xt_match_param *par) !!(udpinfo->invflags & XT_UDP_INV_DSTPT)); } -/* Called when user tries to insert an entry of this type. */ -static bool -udp_mt_check(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool udp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_udp *udpinfo = matchinfo; + const struct xt_udp *udpinfo = par->matchinfo; /* Must specify no unknown invflags */ return !(udpinfo->invflags & ~XT_UDP_INV_MASK); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 28599d3979c..29375ba8db7 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -218,12 +218,9 @@ time_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -time_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool time_mt_check(const struct xt_mtchk_param *par) { - const struct xt_time_info *info = matchinfo; + const struct xt_time_info *info = par->matchinfo; if (info->daytime_start > XT_TIME_MAX_DAYTIME || info->daytime_stop > XT_TIME_MAX_DAYTIME) { -- cgit v1.2.3-70-g09d2 From 6be3d8598e883fb632edf059ba2f8d1b9f4da138 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:19 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (3/6) This patch does this for match extensions' destroy functions. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 8 +++++++- net/bridge/netfilter/ebtables.c | 20 ++++++++++++-------- net/ipv4/netfilter/ip_tables.c | 10 +++++++--- net/ipv6/netfilter/ip6_tables.c | 10 +++++++--- net/netfilter/xt_connbytes.c | 4 ++-- net/netfilter/xt_connlimit.c | 7 +++---- net/netfilter/xt_connmark.c | 5 ++--- net/netfilter/xt_conntrack.c | 5 ++--- net/netfilter/xt_hashlimit.c | 9 ++++----- net/netfilter/xt_helper.c | 4 ++-- net/netfilter/xt_rateest.c | 5 ++--- net/netfilter/xt_recent.c | 4 ++-- net/netfilter/xt_state.c | 4 ++-- net/netfilter/xt_string.c | 4 ++-- 14 files changed, 56 insertions(+), 43 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 763a704ce83..c79c8838014 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -212,6 +212,12 @@ struct xt_mtchk_param { unsigned int hook_mask; }; +/* Match destructor parameters */ +struct xt_mtdtor_param { + const struct xt_match *match; + void *matchinfo; +}; + struct xt_match { struct list_head list; @@ -230,7 +236,7 @@ struct xt_match bool (*checkentry)(const struct xt_mtchk_param *); /* Called when entry of this type deleted. */ - void (*destroy)(const struct xt_match *match, void *matchinfo); + void (*destroy)(const struct xt_mtdtor_param *); /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, void *src); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5ce37b2f5b8..0320b520362 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -558,12 +558,16 @@ ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo, static inline int ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i) { + struct xt_mtdtor_param par; + if (i && (*i)-- == 0) return 1; - if (m->u.match->destroy) - m->u.match->destroy(m->u.match, m->data); - module_put(m->u.match->me); + par.match = m->u.match; + par.matchinfo = m->data; + if (par.match->destroy != NULL) + par.match->destroy(&par); + module_put(par.match->me); return 0; } @@ -609,7 +613,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, unsigned int i, j, hook = 0, hookmask = 0; size_t gap; int ret; - struct xt_mtchk_param par; + struct xt_mtchk_param mtpar; /* don't mess with the struct ebt_entries */ if (e->bitmask == 0) @@ -651,10 +655,10 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, } i = 0; - par.table = name; - par.entryinfo = e; - par.hook_mask = hookmask; - ret = EBT_MATCH_ITERATE(e, ebt_check_match, &par, &i); + mtpar.table = name; + mtpar.entryinfo = e; + mtpar.hook_mask = hookmask; + ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i); if (ret != 0) goto cleanup_matches; j = 0; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4147298a6a8..12ad4d5c55d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -576,12 +576,16 @@ mark_source_chains(struct xt_table_info *newinfo, static int cleanup_match(struct ipt_entry_match *m, unsigned int *i) { + struct xt_mtdtor_param par; + if (i && (*i)-- == 0) return 1; - if (m->u.kernel.match->destroy) - m->u.kernel.match->destroy(m->u.kernel.match, m->data); - module_put(m->u.kernel.match->me); + par.match = m->u.kernel.match; + par.matchinfo = m->data; + if (par.match->destroy != NULL) + par.match->destroy(&par); + module_put(par.match->me); return 0; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 9c843e3777b..891358e89a2 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -599,12 +599,16 @@ mark_source_chains(struct xt_table_info *newinfo, static int cleanup_match(struct ip6t_entry_match *m, unsigned int *i) { + struct xt_mtdtor_param par; + if (i && (*i)-- == 0) return 1; - if (m->u.kernel.match->destroy) - m->u.kernel.match->destroy(m->u.kernel.match, m->data); - module_put(m->u.kernel.match->me); + par.match = m->u.kernel.match; + par.matchinfo = m->data; + if (par.match->destroy != NULL) + par.match->destroy(&par); + module_put(par.match->me); return 0; } diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 43a36c728e5..5bf4aa08b0f 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -115,9 +115,9 @@ static bool connbytes_mt_check(const struct xt_mtchk_param *par) return true; } -static void connbytes_mt_destroy(const struct xt_match *match, void *matchinfo) +static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); } static struct xt_match connbytes_mt_reg[] __read_mostly = { diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 1361e9919cf..bfb3ee6c712 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -246,16 +246,15 @@ static bool connlimit_mt_check(const struct xt_mtchk_param *par) return true; } -static void -connlimit_mt_destroy(const struct xt_match *match, void *matchinfo) +static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) { - const struct xt_connlimit_info *info = matchinfo; + const struct xt_connlimit_info *info = par->matchinfo; struct xt_connlimit_conn *conn; struct xt_connlimit_conn *tmp; struct list_head *hash = info->data->iphash; unsigned int i; - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { list_for_each_entry_safe(conn, tmp, &hash[i], list) { diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index b935b7888a9..c708577ea1b 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -87,10 +87,9 @@ static bool connmark_mt_check(const struct xt_mtchk_param *par) return true; } -static void -connmark_mt_destroy(const struct xt_match *match, void *matchinfo) +static void connmark_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); } #ifdef CONFIG_COMPAT diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index f04c46a02ce..5cd58d7fcb1 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -288,10 +288,9 @@ static bool conntrack_mt_check(const struct xt_mtchk_param *par) return true; } -static void -conntrack_mt_destroy(const struct xt_match *match, void *matchinfo) +static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); } #ifdef CONFIG_COMPAT diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 2f73820e46d..6fc4292d46e 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -748,17 +748,16 @@ static bool hashlimit_mt_check(const struct xt_mtchk_param *par) } static void -hashlimit_mt_destroy_v0(const struct xt_match *match, void *matchinfo) +hashlimit_mt_destroy_v0(const struct xt_mtdtor_param *par) { - const struct xt_hashlimit_info *r = matchinfo; + const struct xt_hashlimit_info *r = par->matchinfo; htable_put(r->hinfo); } -static void -hashlimit_mt_destroy(const struct xt_match *match, void *matchinfo) +static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) { - const struct xt_hashlimit_mtinfo1 *info = matchinfo; + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; htable_put(info->hinfo); } diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 86d3c332fcb..280c984349f 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -67,9 +67,9 @@ static bool helper_mt_check(const struct xt_mtchk_param *par) return true; } -static void helper_mt_destroy(const struct xt_match *match, void *matchinfo) +static void helper_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); } static struct xt_match helper_mt_reg[] __read_mostly = { diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index 4b05ce168a7..220a1d588ee 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -117,10 +117,9 @@ err1: return false; } -static void xt_rateest_mt_destroy(const struct xt_match *match, - void *matchinfo) +static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par) { - struct xt_rateest_match_info *info = matchinfo; + struct xt_rateest_match_info *info = par->matchinfo; xt_rateest_put(info->est1); if (info->est2) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index a512b49f3fe..4ebd4ca9a99 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -349,9 +349,9 @@ out: return ret; } -static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) +static void recent_mt_destroy(const struct xt_mtdtor_param *par) { - const struct xt_recent_mtinfo *info = matchinfo; + const struct xt_recent_mtinfo *info = par->matchinfo; struct recent_table *t; mutex_lock(&recent_mutex); diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index 88b1235519d..4c946cbd731 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -47,9 +47,9 @@ static bool state_mt_check(const struct xt_mtchk_param *par) return true; } -static void state_mt_destroy(const struct xt_match *match, void *matchinfo) +static void state_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); } static struct xt_match state_mt_reg[] __read_mostly = { diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index c9407aa78f7..b4d77411131 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -70,9 +70,9 @@ static bool string_mt_check(const struct xt_mtchk_param *par) return true; } -static void string_mt_destroy(const struct xt_match *match, void *matchinfo) +static void string_mt_destroy(const struct xt_mtdtor_param *par) { - textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config); + textsearch_destroy(STRING_TEXT_PRIV(par->matchinfo)->config); } static struct xt_match xt_string_mt_reg[] __read_mostly = { -- cgit v1.2.3-70-g09d2 From 7eb3558655aaa87a3e71a0c065dfaddda521fa6d Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:19 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (4/6) This patch does this for target extensions' target functions. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 22 +++++++++++++++++----- net/bridge/netfilter/ebt_arpreply.c | 8 +++----- net/bridge/netfilter/ebt_dnat.c | 6 ++---- net/bridge/netfilter/ebt_log.c | 14 ++++++-------- net/bridge/netfilter/ebt_mark.c | 6 ++---- net/bridge/netfilter/ebt_nflog.c | 9 ++++----- net/bridge/netfilter/ebt_redirect.c | 12 +++++------- net/bridge/netfilter/ebt_snat.c | 6 ++---- net/bridge/netfilter/ebt_ulog.c | 9 +++------ net/bridge/netfilter/ebtables.c | 27 ++++++++++++++++----------- net/ipv4/netfilter/arp_tables.c | 23 ++++++++++++----------- net/ipv4/netfilter/arpt_mangle.c | 7 ++----- net/ipv4/netfilter/ip_tables.c | 24 ++++++++++-------------- net/ipv4/netfilter/ipt_CLUSTERIP.c | 6 ++---- net/ipv4/netfilter/ipt_ECN.c | 6 ++---- net/ipv4/netfilter/ipt_LOG.c | 8 +++----- net/ipv4/netfilter/ipt_MASQUERADE.c | 14 ++++++-------- net/ipv4/netfilter/ipt_NETMAP.c | 17 ++++++++--------- net/ipv4/netfilter/ipt_REDIRECT.c | 12 +++++------- net/ipv4/netfilter/ipt_REJECT.c | 8 +++----- net/ipv4/netfilter/ipt_TTL.c | 6 ++---- net/ipv4/netfilter/ipt_ULOG.c | 10 +++------- net/ipv4/netfilter/nf_nat_rule.c | 32 ++++++++++++-------------------- net/ipv6/netfilter/ip6_tables.c | 24 +++++++++++------------- net/ipv6/netfilter/ip6t_HL.c | 6 ++---- net/ipv6/netfilter/ip6t_LOG.c | 8 +++----- net/ipv6/netfilter/ip6t_REJECT.c | 18 ++++++++---------- net/netfilter/xt_CLASSIFY.c | 6 ++---- net/netfilter/xt_CONNMARK.c | 12 ++++-------- net/netfilter/xt_CONNSECMARK.c | 6 ++---- net/netfilter/xt_DSCP.c | 30 ++++++++++-------------------- net/netfilter/xt_MARK.c | 18 ++++++------------ net/netfilter/xt_NFLOG.c | 10 ++++------ net/netfilter/xt_NFQUEUE.c | 6 ++---- net/netfilter/xt_NOTRACK.c | 4 +--- net/netfilter/xt_RATEEST.c | 9 ++------- net/netfilter/xt_SECMARK.c | 6 ++---- net/netfilter/xt_TCPMSS.c | 12 ++++-------- net/netfilter/xt_TCPOPTSTRIP.c | 12 ++++-------- net/netfilter/xt_TPROXY.c | 11 +++-------- net/netfilter/xt_TRACE.c | 4 +--- net/sched/act_ipt.c | 12 ++++++++---- 42 files changed, 209 insertions(+), 297 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index c79c8838014..46d0cb1ad34 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -218,6 +218,22 @@ struct xt_mtdtor_param { void *matchinfo; }; +/** + * struct xt_target_param - parameters for target extensions' target functions + * + * @hooknum: hook through which this target was invoked + * @target: struct xt_target through which this function was invoked + * @targinfo: per-target data + * + * Other fields see above. + */ +struct xt_target_param { + const struct net_device *in, *out; + unsigned int hooknum; + const struct xt_target *target; + const void *targinfo; +}; + struct xt_match { struct list_head list; @@ -269,11 +285,7 @@ struct xt_target must now handle non-linear skbs, using skb_copy_bits and skb_ip_make_writable. */ unsigned int (*target)(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo); + const struct xt_target_param *); /* Called when user tries to insert an entry of this type: hook_mask is a bitmask of hooks from which it can be diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c index baf5510d044..fc94699f719 100644 --- a/net/bridge/netfilter/ebt_arpreply.c +++ b/net/bridge/netfilter/ebt_arpreply.c @@ -16,11 +16,9 @@ #include static unsigned int -ebt_arpreply_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hook_nr, - const struct xt_target *target, const void *data) +ebt_arpreply_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ebt_arpreply_info *info = data; + const struct ebt_arpreply_info *info = par->targinfo; const __be32 *siptr, *diptr; __be32 _sip, _dip; const struct arphdr *ap; @@ -53,7 +51,7 @@ ebt_arpreply_tg(struct sk_buff *skb, const struct net_device *in, if (diptr == NULL) return EBT_DROP; - arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)in, + arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)par->in, *diptr, shp, info->mac, shp); return info->target; diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index cb80101e412..bb5d79e0bee 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -15,11 +15,9 @@ #include static unsigned int -ebt_dnat_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hook_nr, - const struct xt_target *target, const void *data) +ebt_dnat_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ebt_nat_info *info = data; + const struct ebt_nat_info *info = par->targinfo; if (!skb_make_writable(skb, 0)) return EBT_DROP; diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index b40f9ed4c34..87de5fccb2f 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -195,11 +195,9 @@ out: } static unsigned int -ebt_log_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknr, - const struct xt_target *target, const void *data) +ebt_log_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ebt_log_info *info = data; + const struct ebt_log_info *info = par->targinfo; struct nf_loginfo li; li.type = NF_LOG_TYPE_LOG; @@ -207,11 +205,11 @@ ebt_log_tg(struct sk_buff *skb, const struct net_device *in, li.u.log.logflags = info->bitmask; if (info->bitmask & EBT_LOG_NFLOG) - nf_log_packet(NFPROTO_BRIDGE, hooknr, skb, in, out, &li, - "%s", info->prefix); + nf_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in, + par->out, &li, "%s", info->prefix); else - ebt_log_packet(NFPROTO_BRIDGE, hooknr, skb, in, out, &li, - info->prefix); + ebt_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in, + par->out, &li, info->prefix); return EBT_CONTINUE; } diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c index dff19fc91cf..aafc456c3c3 100644 --- a/net/bridge/netfilter/ebt_mark.c +++ b/net/bridge/netfilter/ebt_mark.c @@ -19,11 +19,9 @@ #include static unsigned int -ebt_mark_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hook_nr, - const struct xt_target *target, const void *data) +ebt_mark_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ebt_mark_t_info *info = data; + const struct ebt_mark_t_info *info = par->targinfo; int action = info->target & -16; if (action == MARK_SET_VALUE) diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c index 74b4fa0aabc..6a28d994cf7 100644 --- a/net/bridge/netfilter/ebt_nflog.c +++ b/net/bridge/netfilter/ebt_nflog.c @@ -20,11 +20,9 @@ #include static unsigned int -ebt_nflog_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknr, - const struct xt_target *target, const void *data) +ebt_nflog_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ebt_nflog_info *info = data; + const struct ebt_nflog_info *info = par->targinfo; struct nf_loginfo li; li.type = NF_LOG_TYPE_ULOG; @@ -32,7 +30,8 @@ ebt_nflog_tg(struct sk_buff *skb, const struct net_device *in, li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; - nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, "%s", info->prefix); + nf_log_packet(PF_BRIDGE, par->hooknum, skb, par->in, par->out, + &li, "%s", info->prefix); return EBT_CONTINUE; } diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index a50ffbe0e4f..0cfe2fad940 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -16,20 +16,18 @@ #include static unsigned int -ebt_redirect_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknr, - const struct xt_target *target, const void *data) +ebt_redirect_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ebt_redirect_info *info = data; + const struct ebt_redirect_info *info = par->targinfo; if (!skb_make_writable(skb, 0)) return EBT_DROP; - if (hooknr != NF_BR_BROUTING) + if (par->hooknum != NF_BR_BROUTING) memcpy(eth_hdr(skb)->h_dest, - in->br_port->br->dev->dev_addr, ETH_ALEN); + par->in->br_port->br->dev->dev_addr, ETH_ALEN); else - memcpy(eth_hdr(skb)->h_dest, in->dev_addr, ETH_ALEN); + memcpy(eth_hdr(skb)->h_dest, par->in->dev_addr, ETH_ALEN); skb->pkt_type = PACKET_HOST; return info->target; } diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c index 8a55c7d49b5..f55960eee99 100644 --- a/net/bridge/netfilter/ebt_snat.c +++ b/net/bridge/netfilter/ebt_snat.c @@ -17,11 +17,9 @@ #include static unsigned int -ebt_snat_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hook_nr, - const struct xt_target *target, const void *data) +ebt_snat_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ebt_nat_info *info = data; + const struct ebt_nat_info *info = par->targinfo; if (!skb_make_writable(skb, 0)) return EBT_DROP; diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 25ca6467349..bfedf12cbf4 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -247,13 +247,10 @@ static void ebt_log_packet(u_int8_t pf, unsigned int hooknum, } static unsigned int -ebt_ulog_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknr, - const struct xt_target *target, const void *data) +ebt_ulog_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ebt_ulog_info *uloginfo = data; - - ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL); + ebt_ulog_packet(par->hooknum, skb, par->in, par->out, + par->targinfo, NULL); return EBT_CONTINUE; } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 0320b520362..a1156bab4a0 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -64,11 +64,13 @@ static struct xt_target ebt_standard_target = { .targetsize = sizeof(int), }; -static inline int ebt_do_watcher (struct ebt_entry_watcher *w, - struct sk_buff *skb, unsigned int hooknr, const struct net_device *in, - const struct net_device *out) +static inline int +ebt_do_watcher(const struct ebt_entry_watcher *w, struct sk_buff *skb, + struct xt_target_param *par) { - w->u.watcher->target(skb, in, out, hooknr, w->u.watcher, w->data); + par->target = w->u.watcher; + par->targinfo = w->data; + w->u.watcher->target(skb, par); /* watchers don't give a verdict */ return 0; } @@ -156,10 +158,12 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, struct ebt_table_info *private; bool hotdrop = false; struct xt_match_param mtpar; + struct xt_target_param tgpar; - mtpar.in = in; - mtpar.out = out; + mtpar.in = tgpar.in = in; + mtpar.out = tgpar.out = out; mtpar.hotdrop = &hotdrop; + tgpar.hooknum = hook; read_lock_bh(&table->lock); private = table->private; @@ -193,17 +197,18 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, /* these should only watch: not modify, nor tell us what to do with the packet */ - EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, hook, in, - out); + EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &tgpar); t = (struct ebt_entry_target *) (((char *)point) + point->target_offset); /* standard target */ if (!t->u.target->target) verdict = ((struct ebt_standard_target *)t)->verdict; - else - verdict = t->u.target->target(skb, in, out, hook, - t->u.target, t->data); + else { + tgpar.target = t->u.target; + tgpar.targinfo = t->data; + verdict = t->u.target->target(skb, &tgpar); + } if (verdict == EBT_ACCEPT) { read_unlock_bh(&table->lock); return NF_ACCEPT; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index ae525a9afbe..5b631ad74b5 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -200,15 +200,12 @@ static inline int arp_checkentry(const struct arpt_arp *arp) return 1; } -static unsigned int arpt_error(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +arpt_error(struct sk_buff *skb, const struct xt_target_param *par) { if (net_ratelimit()) - printk("arp_tables: error: '%s'\n", (char *)targinfo); + printk("arp_tables: error: '%s'\n", + (const char *)par->targinfo); return NF_DROP; } @@ -232,6 +229,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, const char *indev, *outdev; void *table_base; const struct xt_table_info *private; + struct xt_target_param tgpar; if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) return NF_DROP; @@ -245,6 +243,10 @@ unsigned int arpt_do_table(struct sk_buff *skb, e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); + tgpar.in = in; + tgpar.out = out; + tgpar.hooknum = hook; + arp = arp_hdr(skb); do { if (arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { @@ -290,11 +292,10 @@ unsigned int arpt_do_table(struct sk_buff *skb, /* Targets which reenter must return * abs. verdicts */ + tgpar.target = t->u.kernel.target; + tgpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, - in, out, - hook, - t->u.kernel.target, - t->data); + &tgpar); /* Target might have changed stuff. */ arp = arp_hdr(skb); diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index 3f9e4ccd616..0bf81b35369 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -9,12 +9,9 @@ MODULE_AUTHOR("Bart De Schuymer "); MODULE_DESCRIPTION("arptables arp payload mangle target"); static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - unsigned int hooknum, const struct xt_target *target, - const void *targinfo) +target(struct sk_buff *skb, const struct xt_target_param *par) { - const struct arpt_mangle *mangle = targinfo; + const struct arpt_mangle *mangle = par->targinfo; const struct arphdr *arp; unsigned char *arpptr; int pln, hln; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 12ad4d5c55d..0f8ecf39022 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -171,15 +171,11 @@ ip_checkentry(const struct ipt_ip *ip) } static unsigned int -ipt_error(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +ipt_error(struct sk_buff *skb, const struct xt_target_param *par) { if (net_ratelimit()) - printk("ip_tables: error: `%s'\n", (char *)targinfo); + printk("ip_tables: error: `%s'\n", + (const char *)par->targinfo); return NF_DROP; } @@ -334,6 +330,7 @@ ipt_do_table(struct sk_buff *skb, struct ipt_entry *e, *back; struct xt_table_info *private; struct xt_match_param mtpar; + struct xt_target_param tgpar; /* Initialization */ ip = ip_hdr(skb); @@ -349,8 +346,9 @@ ipt_do_table(struct sk_buff *skb, mtpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; mtpar.thoff = ip_hdrlen(skb); mtpar.hotdrop = &hotdrop; - mtpar.in = in; - mtpar.out = out; + mtpar.in = tgpar.in = in; + mtpar.out = tgpar.out = out; + tgpar.hooknum = hook; read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); @@ -414,16 +412,14 @@ ipt_do_table(struct sk_buff *skb, } else { /* Targets which reenter must return abs. verdicts */ + tgpar.target = t->u.kernel.target; + tgpar.targinfo = t->data; #ifdef CONFIG_NETFILTER_DEBUG ((struct ipt_entry *)table_base)->comefrom = 0xeeeeeeec; #endif verdict = t->u.kernel.target->target(skb, - in, out, - hook, - t->u.kernel.target, - t->data); - + &tgpar); #ifdef CONFIG_NETFILTER_DEBUG if (((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 63faddc18a1..67e8aa8f34f 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -281,11 +281,9 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) ***********************************************************************/ static unsigned int -clusterip_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ipt_clusterip_tgt_info *cipinfo = targinfo; + const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int32_t hash; diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index aee2364afff..e37f181e829 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -77,11 +77,9 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) } static unsigned int -ecn_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +ecn_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ipt_ECN_info *einfo = targinfo; + const struct ipt_ECN_info *einfo = par->targinfo; if (einfo->operation & IPT_ECN_OP_SET_IP) if (!set_ect_ip(skb, einfo)) diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 1c9785df4df..e9942aed35a 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -426,18 +426,16 @@ ipt_log_packet(u_int8_t pf, } static unsigned int -log_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +log_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ipt_log_info *loginfo = targinfo; + const struct ipt_log_info *loginfo = par->targinfo; struct nf_loginfo li; li.type = NF_LOG_TYPE_LOG; li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; - ipt_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, &li, + ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li, loginfo->prefix); return XT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 65c811b27b7..e0d9d49b79e 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -50,9 +50,7 @@ masquerade_tg_check(const char *tablename, const void *e, } static unsigned int -masquerade_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par) { struct nf_conn *ct; struct nf_conn_nat *nat; @@ -62,7 +60,7 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in, const struct rtable *rt; __be32 newsrc; - NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); + NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); nat = nfct_nat(ct); @@ -76,16 +74,16 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in, if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) return NF_ACCEPT; - mr = targinfo; + mr = par->targinfo; rt = skb->rtable; - newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE); + newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); if (!newsrc) { - printk("MASQUERADE: %s ate my IP address\n", out->name); + printk("MASQUERADE: %s ate my IP address\n", par->out->name); return NF_DROP; } write_lock_bh(&masq_lock); - nat->masq_index = out->ifindex; + nat->masq_index = par->out->ifindex; write_unlock_bh(&masq_lock); /* Transfer from original range. */ diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index f281500bd7f..cf18f23b346 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -41,24 +41,23 @@ netmap_tg_check(const char *tablename, const void *e, } static unsigned int -netmap_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +netmap_tg(struct sk_buff *skb, const struct xt_target_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 new_ip, netmask; - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; struct nf_nat_range newrange; - NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING - || hooknum == NF_INET_POST_ROUTING - || hooknum == NF_INET_LOCAL_OUT); + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_POST_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); - if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_OUT) + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT) new_ip = ip_hdr(skb)->daddr & ~netmask; else new_ip = ip_hdr(skb)->saddr & ~netmask; @@ -70,7 +69,7 @@ netmap_tg(struct sk_buff *skb, const struct net_device *in, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(hooknum)); + return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); } static struct xt_target netmap_tg_reg __read_mostly = { diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index ef496105eae..23adb09ddfb 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -45,24 +45,22 @@ redirect_tg_check(const char *tablename, const void *e, } static unsigned int -redirect_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +redirect_tg(struct sk_buff *skb, const struct xt_target_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 newdst; - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; struct nf_nat_range newrange; - NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING - || hooknum == NF_INET_LOCAL_OUT); + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); /* Local packets: make them go to loopback */ - if (hooknum == NF_INET_LOCAL_OUT) + if (par->hooknum == NF_INET_LOCAL_OUT) newdst = htonl(0x7F000001); else { struct in_device *indev; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 9f5da0c2cae..b36071bb107 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -136,11 +136,9 @@ static inline void send_unreach(struct sk_buff *skb_in, int code) } static unsigned int -reject_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +reject_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ipt_reject_info *reject = targinfo; + const struct ipt_reject_info *reject = par->targinfo; /* WARNING: This code causes reentry within iptables. This means that the iptables jump stack is now crap. We @@ -168,7 +166,7 @@ reject_tg(struct sk_buff *skb, const struct net_device *in, send_unreach(skb, ICMP_PKT_FILTERED); break; case IPT_TCP_RESET: - send_reset(skb, hooknum); + send_reset(skb, par->hooknum); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c index 7d01d424a71..05cbfd2f747 100644 --- a/net/ipv4/netfilter/ipt_TTL.c +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -20,12 +20,10 @@ MODULE_DESCRIPTION("Xtables: IPv4 TTL field modification target"); MODULE_LICENSE("GPL"); static unsigned int -ttl_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +ttl_tg(struct sk_buff *skb, const struct xt_target_param *par) { struct iphdr *iph; - const struct ipt_TTL_info *info = targinfo; + const struct ipt_TTL_info *info = par->targinfo; int new_ttl; if (!skb_make_writable(skb, skb->len)) diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 9065e4a34fb..46c0df0dc2d 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -281,14 +281,10 @@ alloc_failure: } static unsigned int -ulog_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +ulog_tg(struct sk_buff *skb, const struct xt_target_param *par) { - struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo; - - ipt_ulog_packet(hooknum, skb, in, out, loginfo, NULL); - + ipt_ulog_packet(par->hooknum, skb, par->in, par->out, + par->targinfo, NULL); return XT_CONTINUE; } diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index f929352ec0e..83170ff131f 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -67,25 +67,21 @@ static struct xt_table nat_table = { }; /* Source NAT */ -static unsigned int ipt_snat_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; - NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); + NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); /* Connection must be valid and new. */ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); - NF_CT_ASSERT(out); + NF_CT_ASSERT(par->out != NULL); return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); } @@ -109,28 +105,24 @@ static void warn_if_extra_mangle(struct net *net, __be32 dstip, __be32 srcip) ip_rt_put(rt); } -static unsigned int ipt_dnat_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; - NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING || - hooknum == NF_INET_LOCAL_OUT); + NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); /* Connection must be valid and new. */ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - if (hooknum == NF_INET_LOCAL_OUT && + if (par->hooknum == NF_INET_LOCAL_OUT && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) - warn_if_extra_mangle(dev_net(out), ip_hdr(skb)->daddr, + warn_if_extra_mangle(dev_net(par->out), ip_hdr(skb)->daddr, mr->range[0].min_ip); return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 891358e89a2..ee0986cdbd6 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -200,15 +200,11 @@ ip6_checkentry(const struct ip6t_ip6 *ipv6) } static unsigned int -ip6t_error(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +ip6t_error(struct sk_buff *skb, const struct xt_target_param *par) { if (net_ratelimit()) - printk("ip6_tables: error: `%s'\n", (char *)targinfo); + printk("ip6_tables: error: `%s'\n", + (const char *)par->targinfo); return NF_DROP; } @@ -360,6 +356,7 @@ ip6t_do_table(struct sk_buff *skb, struct ip6t_entry *e, *back; struct xt_table_info *private; struct xt_match_param mtpar; + struct xt_target_param tgpar; /* Initialization */ indev = in ? in->name : nulldevname; @@ -371,8 +368,9 @@ ip6t_do_table(struct sk_buff *skb, * rule is also a fragment-specific rule, non-fragments won't * match it. */ mtpar.hotdrop = &hotdrop; - mtpar.in = in; - mtpar.out = out; + mtpar.in = tgpar.in = in; + mtpar.out = tgpar.out = out; + tgpar.hooknum = hook; read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); @@ -438,15 +436,15 @@ ip6t_do_table(struct sk_buff *skb, } else { /* Targets which reenter must return abs. verdicts */ + tgpar.target = t->u.kernel.target; + tgpar.targinfo = t->data; + #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = 0xeeeeeeec; #endif verdict = t->u.kernel.target->target(skb, - in, out, - hook, - t->u.kernel.target, - t->data); + &tgpar); #ifdef CONFIG_NETFILTER_DEBUG if (((struct ip6t_entry *)table_base)->comefrom diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c index 7eebd350916..ac759a54f2c 100644 --- a/net/ipv6/netfilter/ip6t_HL.c +++ b/net/ipv6/netfilter/ip6t_HL.c @@ -19,12 +19,10 @@ MODULE_DESCRIPTION("Xtables: IPv6 Hop Limit field modification target"); MODULE_LICENSE("GPL"); static unsigned int -hl_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +hl_tg6(struct sk_buff *skb, const struct xt_target_param *par) { struct ipv6hdr *ip6h; - const struct ip6t_HL_info *info = targinfo; + const struct ip6t_HL_info *info = par->targinfo; int new_hl; if (!skb_make_writable(skb, skb->len)) diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index fd148f3d842..a31d3ecd1fc 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -438,18 +438,16 @@ ip6t_log_packet(u_int8_t pf, } static unsigned int -log_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +log_tg6(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ip6t_log_info *loginfo = targinfo; + const struct ip6t_log_info *loginfo = par->targinfo; struct nf_loginfo li; li.type = NF_LOG_TYPE_LOG; li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; - ip6t_log_packet(NFPROTO_IPV6, hooknum, skb, in, out, + ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in, par->out, &li, loginfo->prefix); return XT_CONTINUE; } diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index f1a9fce1ec9..1d5f3a70ed0 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -173,12 +173,10 @@ send_unreach(struct net *net, struct sk_buff *skb_in, unsigned char code, } static unsigned int -reject_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +reject_tg6(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ip6t_reject_info *reject = targinfo; - struct net *net = dev_net(in ? in : out); + const struct ip6t_reject_info *reject = par->targinfo; + struct net *net = dev_net((par->in != NULL) ? par->in : par->out); pr_debug("%s: medium point\n", __func__); /* WARNING: This code causes reentry within ip6tables. @@ -186,19 +184,19 @@ reject_tg6(struct sk_buff *skb, const struct net_device *in, must return an absolute verdict. --RR */ switch (reject->with) { case IP6T_ICMP6_NO_ROUTE: - send_unreach(net, skb, ICMPV6_NOROUTE, hooknum); + send_unreach(net, skb, ICMPV6_NOROUTE, par->hooknum); break; case IP6T_ICMP6_ADM_PROHIBITED: - send_unreach(net, skb, ICMPV6_ADM_PROHIBITED, hooknum); + send_unreach(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum); break; case IP6T_ICMP6_NOT_NEIGHBOUR: - send_unreach(net, skb, ICMPV6_NOT_NEIGHBOUR, hooknum); + send_unreach(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum); break; case IP6T_ICMP6_ADDR_UNREACH: - send_unreach(net, skb, ICMPV6_ADDR_UNREACH, hooknum); + send_unreach(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum); break; case IP6T_ICMP6_PORT_UNREACH: - send_unreach(net, skb, ICMPV6_PORT_UNREACH, hooknum); + send_unreach(net, skb, ICMPV6_PORT_UNREACH, par->hooknum); break; case IP6T_ICMP6_ECHOREPLY: /* Do nothing */ diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index 8cffa295dd3..011bc80dd2a 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -27,11 +27,9 @@ MODULE_ALIAS("ipt_CLASSIFY"); MODULE_ALIAS("ip6t_CLASSIFY"); static unsigned int -classify_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +classify_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_classify_target_info *clinfo = targinfo; + const struct xt_classify_target_info *clinfo = par->targinfo; skb->priority = clinfo->priority; return XT_CONTINUE; diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index e1415c3f5c9..95ed267328a 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -36,11 +36,9 @@ MODULE_ALIAS("ip6t_CONNMARK"); #include static unsigned int -connmark_tg_v0(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +connmark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_connmark_target_info *markinfo = targinfo; + const struct xt_connmark_target_info *markinfo = par->targinfo; struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int32_t diff; @@ -77,11 +75,9 @@ connmark_tg_v0(struct sk_buff *skb, const struct net_device *in, } static unsigned int -connmark_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +connmark_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_connmark_tginfo1 *info = targinfo; + const struct xt_connmark_tginfo1 *info = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; u_int32_t newmark; diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 5f221c3bd35..2211a2cef28 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -65,11 +65,9 @@ static void secmark_restore(struct sk_buff *skb) } static unsigned int -connsecmark_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +connsecmark_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_connsecmark_target_info *info = targinfo; + const struct xt_connsecmark_target_info *info = par->targinfo; switch (info->mode) { case CONNSECMARK_SAVE: diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index f0b4958528e..c78e80afdf3 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -29,11 +29,9 @@ MODULE_ALIAS("ipt_TOS"); MODULE_ALIAS("ip6t_TOS"); static unsigned int -dscp_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +dscp_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_DSCP_info *dinfo = targinfo; + const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { @@ -48,11 +46,9 @@ dscp_tg(struct sk_buff *skb, const struct net_device *in, } static unsigned int -dscp_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +dscp_tg6(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_DSCP_info *dinfo = targinfo; + const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { @@ -80,11 +76,9 @@ dscp_tg_check(const char *tablename, const void *e_void, } static unsigned int -tos_tg_v0(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tos_tg_v0(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ipt_tos_target_info *info = targinfo; + const struct ipt_tos_target_info *info = par->targinfo; struct iphdr *iph = ip_hdr(skb); u_int8_t oldtos; @@ -119,11 +113,9 @@ tos_tg_check_v0(const char *tablename, const void *e_void, } static unsigned int -tos_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tos_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_tos_target_info *info = targinfo; + const struct xt_tos_target_info *info = par->targinfo; struct iphdr *iph = ip_hdr(skb); u_int8_t orig, nv; @@ -141,11 +133,9 @@ tos_tg(struct sk_buff *skb, const struct net_device *in, } static unsigned int -tos_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tos_tg6(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_tos_target_info *info = targinfo; + const struct xt_tos_target_info *info = par->targinfo; struct ipv6hdr *iph = ipv6_hdr(skb); u_int8_t orig, nv; diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c index c8ea7a80970..27d03f39611 100644 --- a/net/netfilter/xt_MARK.c +++ b/net/netfilter/xt_MARK.c @@ -25,22 +25,18 @@ MODULE_ALIAS("ipt_MARK"); MODULE_ALIAS("ip6t_MARK"); static unsigned int -mark_tg_v0(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +mark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_mark_target_info *markinfo = targinfo; + const struct xt_mark_target_info *markinfo = par->targinfo; skb->mark = markinfo->mark; return XT_CONTINUE; } static unsigned int -mark_tg_v1(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +mark_tg_v1(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_mark_target_info_v1 *markinfo = targinfo; + const struct xt_mark_target_info_v1 *markinfo = par->targinfo; int mark = 0; switch (markinfo->mode) { @@ -62,11 +58,9 @@ mark_tg_v1(struct sk_buff *skb, const struct net_device *in, } static unsigned int -mark_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +mark_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_mark_tginfo2 *info = targinfo; + const struct xt_mark_tginfo2 *info = par->targinfo; skb->mark = (skb->mark & ~info->mask) ^ info->mark; return XT_CONTINUE; diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index 9b095520176..3218ad63bd1 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -21,11 +21,9 @@ MODULE_ALIAS("ipt_NFLOG"); MODULE_ALIAS("ip6t_NFLOG"); static unsigned int -nflog_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +nflog_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_nflog_info *info = targinfo; + const struct xt_nflog_info *info = par->targinfo; struct nf_loginfo li; li.type = NF_LOG_TYPE_ULOG; @@ -33,8 +31,8 @@ nflog_tg(struct sk_buff *skb, const struct net_device *in, li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; - nf_log_packet(target->family, hooknum, skb, in, out, &li, - "%s", info->prefix); + nf_log_packet(par->target->family, par->hooknum, skb, par->in, + par->out, &li, "%s", info->prefix); return XT_CONTINUE; } diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index c03c2e8d06f..2cc1fff4930 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -24,11 +24,9 @@ MODULE_ALIAS("ip6t_NFQUEUE"); MODULE_ALIAS("arpt_NFQUEUE"); static unsigned int -nfqueue_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +nfqueue_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_NFQ_info *tinfo = targinfo; + const struct xt_NFQ_info *tinfo = par->targinfo; return NF_QUEUE_NR(tinfo->queuenum); } diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c index b9ee268b37c..cc50295cd11 100644 --- a/net/netfilter/xt_NOTRACK.c +++ b/net/netfilter/xt_NOTRACK.c @@ -13,9 +13,7 @@ MODULE_ALIAS("ipt_NOTRACK"); MODULE_ALIAS("ip6t_NOTRACK"); static unsigned int -notrack_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +notrack_tg(struct sk_buff *skb, const struct xt_target_param *par) { /* Previously seen (loopback)? Ignore. */ if (skb->nfct != NULL) diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index da7946e6ecb..92e33524f78 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -71,14 +71,9 @@ void xt_rateest_put(struct xt_rateest *est) EXPORT_SYMBOL_GPL(xt_rateest_put); static unsigned int -xt_rateest_tg(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +xt_rateest_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_rateest_target_info *info = targinfo; + const struct xt_rateest_target_info *info = par->targinfo; struct gnet_stats_basic *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 2a2ab833481..ad05214e380 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -29,12 +29,10 @@ MODULE_ALIAS("ip6t_SECMARK"); static u8 mode; static unsigned int -secmark_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +secmark_tg(struct sk_buff *skb, const struct xt_target_param *par) { u32 secmark = 0; - const struct xt_secmark_target_info *info = targinfo; + const struct xt_secmark_target_info *info = par->targinfo; BUG_ON(info->mode != mode); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index b868f995239..e08762d9b0f 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -174,15 +174,13 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, } static unsigned int -tcpmss_tg4(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tcpmss_tg4(struct sk_buff *skb, const struct xt_target_param *par) { struct iphdr *iph = ip_hdr(skb); __be16 newlen; int ret; - ret = tcpmss_mangle_packet(skb, targinfo, + ret = tcpmss_mangle_packet(skb, par->targinfo, tcpmss_reverse_mtu(skb, PF_INET), iph->ihl * 4, sizeof(*iph) + sizeof(struct tcphdr)); @@ -199,9 +197,7 @@ tcpmss_tg4(struct sk_buff *skb, const struct net_device *in, #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) static unsigned int -tcpmss_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tcpmss_tg6(struct sk_buff *skb, const struct xt_target_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; @@ -212,7 +208,7 @@ tcpmss_tg6(struct sk_buff *skb, const struct net_device *in, tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr); if (tcphoff < 0) return NF_DROP; - ret = tcpmss_mangle_packet(skb, targinfo, + ret = tcpmss_mangle_packet(skb, par->targinfo, tcpmss_reverse_mtu(skb, PF_INET6), tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 2e0ae6cc5d9..9dd8c8ef63e 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -75,19 +75,15 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, } static unsigned int -tcpoptstrip_tg4(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_target_param *par) { - return tcpoptstrip_mangle_packet(skb, targinfo, ip_hdrlen(skb), + return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb), sizeof(struct iphdr) + sizeof(struct tcphdr)); } #if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE) static unsigned int -tcpoptstrip_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_target_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); int tcphoff; @@ -98,7 +94,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct net_device *in, if (tcphoff < 0) return NF_DROP; - return tcpoptstrip_mangle_packet(skb, targinfo, tcphoff, + return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); } #endif diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 183f251d2f0..f08c49ea4bd 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -25,15 +25,10 @@ #include static unsigned int -tproxy_tg(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +tproxy_tg(struct sk_buff *skb, const struct xt_target_param *par) { const struct iphdr *iph = ip_hdr(skb); - const struct xt_tproxy_target_info *tgi = targinfo; + const struct xt_tproxy_target_info *tgi = par->targinfo; struct udphdr _hdr, *hp; struct sock *sk; @@ -44,7 +39,7 @@ tproxy_tg(struct sk_buff *skb, sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr, hp->source, tgi->lport ? tgi->lport : hp->dest, - in, true); + par->in, true); /* NOTE: assign_sock consumes our sk reference */ if (sk && nf_tproxy_assign_sock(skb, sk)) { diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index da35f9f1cd7..fbb04b86c46 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -11,9 +11,7 @@ MODULE_ALIAS("ipt_TRACE"); MODULE_ALIAS("ip6t_TRACE"); static unsigned int -trace_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +trace_tg(struct sk_buff *skb, const struct xt_target_param *par) { skb->nf_trace = 1; return XT_CONTINUE; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 79ea19375ca..89791a56429 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -188,6 +188,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a, { int ret = 0, result = 0; struct tcf_ipt *ipt = a->priv; + struct xt_target_param par; if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) @@ -203,10 +204,13 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a, /* yes, we have to worry about both in and out dev worry later - danger - this API seems to have changed from earlier kernels */ - ret = ipt->tcfi_t->u.kernel.target->target(skb, skb->dev, NULL, - ipt->tcfi_hook, - ipt->tcfi_t->u.kernel.target, - ipt->tcfi_t->data); + par.in = skb->dev; + par.out = NULL; + par.hooknum = ipt->tcfi_hook; + par.target = ipt->tcfi_t->u.kernel.target; + par.targinfo = ipt->tcfi_t->data; + ret = par.target->target(skb, &par); + switch (ret) { case NF_ACCEPT: result = TC_ACT_OK; -- cgit v1.2.3-70-g09d2 From af5d6dc200eb0fcc6fbd3df1ab4d8969004cb37f Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:19 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (5/6) This patch does this for target extensions' checkentry functions. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 29 +++++++++++++++++++--------- include/linux/netfilter_bridge/ebtables.h | 4 ++-- net/bridge/netfilter/ebt_arpreply.c | 10 +++------- net/bridge/netfilter/ebt_dnat.c | 19 +++++++++--------- net/bridge/netfilter/ebt_log.c | 7 ++----- net/bridge/netfilter/ebt_mark.c | 8 ++------ net/bridge/netfilter/ebt_nflog.c | 7 ++----- net/bridge/netfilter/ebt_redirect.c | 17 ++++++++-------- net/bridge/netfilter/ebt_snat.c | 8 ++------ net/bridge/netfilter/ebt_ulog.c | 7 ++----- net/bridge/netfilter/ebtables.c | 28 +++++++++++++++------------ net/ipv4/netfilter/arp_tables.c | 20 ++++++++++--------- net/ipv4/netfilter/arpt_mangle.c | 6 ++---- net/ipv4/netfilter/ip_tables.c | 17 +++++++++------- net/ipv4/netfilter/ipt_CLUSTERIP.c | 13 +++++-------- net/ipv4/netfilter/ipt_ECN.c | 9 +++------ net/ipv4/netfilter/ipt_LOG.c | 7 ++----- net/ipv4/netfilter/ipt_MASQUERADE.c | 7 ++----- net/ipv4/netfilter/ipt_NETMAP.c | 7 ++----- net/ipv4/netfilter/ipt_REDIRECT.c | 7 ++----- net/ipv4/netfilter/ipt_REJECT.c | 9 +++------ net/ipv4/netfilter/ipt_TTL.c | 7 ++----- net/ipv4/netfilter/ipt_ULOG.c | 7 ++----- net/ipv4/netfilter/nf_nat_rule.c | 16 ++++------------ net/ipv6/netfilter/ip6_tables.c | 16 ++++++++++------ net/ipv6/netfilter/ip6t_HL.c | 7 ++----- net/ipv6/netfilter/ip6t_LOG.c | 7 ++----- net/ipv6/netfilter/ip6t_REJECT.c | 9 +++------ net/netfilter/x_tables.c | 32 +++++++++++++++---------------- net/netfilter/xt_CONNMARK.c | 24 +++++++++-------------- net/netfilter/xt_CONNSECMARK.c | 16 +++++++--------- net/netfilter/xt_DSCP.c | 19 +++++++----------- net/netfilter/xt_MARK.c | 14 ++++---------- net/netfilter/xt_NFLOG.c | 7 ++----- net/netfilter/xt_RATEEST.c | 9 ++------- net/netfilter/xt_SECMARK.c | 12 +++++------- net/netfilter/xt_TCPMSS.c | 22 ++++++++------------- net/netfilter/xt_TPROXY.c | 9 ++------- net/sched/act_ipt.c | 12 +++++++++--- 39 files changed, 208 insertions(+), 283 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 46d0cb1ad34..8daeb496ba7 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -234,6 +234,23 @@ struct xt_target_param { const void *targinfo; }; +/** + * struct xt_tgchk_param - parameters for target extensions' + * checkentry functions + * + * @entryinfo: the family-specific rule data + * (struct ipt_entry, ip6t_entry, arpt_entry, ebt_entry) + * + * Other fields see above. + */ +struct xt_tgchk_param { + const char *table; + void *entryinfo; + const struct xt_target *target; + void *targinfo; + unsigned int hook_mask; +}; + struct xt_match { struct list_head list; @@ -291,11 +308,7 @@ struct xt_target hook_mask is a bitmask of hooks from which it can be called. */ /* Should return true or false. */ - bool (*checkentry)(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask); + bool (*checkentry)(const struct xt_tgchk_param *); /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_target *target, void *targinfo); @@ -376,10 +389,8 @@ extern void xt_unregister_matches(struct xt_match *match, unsigned int n); extern int xt_check_match(struct xt_mtchk_param *, u_int8_t family, unsigned int size, u_int8_t proto, bool inv_proto); -extern int xt_check_target(const struct xt_target *target, unsigned short family, - unsigned int size, const char *table, unsigned int hook, - unsigned short proto, int inv_proto, - const void *entry, void *targinfo); +extern int xt_check_target(struct xt_tgchk_param *, u_int8_t family, + unsigned int size, u_int8_t proto, bool inv_proto); extern struct xt_table *xt_register_table(struct net *net, struct xt_table *table, diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 568a690f6a6..d45e29cd1cf 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -310,9 +310,9 @@ extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb, #define FWINV(bool,invflg) ((bool) ^ !!(info->invflags & invflg)) /* True if the hook mask denotes that the rule is in a base chain, * used in the check() functions */ -#define BASE_CHAIN (hookmask & (1 << NF_BR_NUMHOOKS)) +#define BASE_CHAIN (par->hook_mask & (1 << NF_BR_NUMHOOKS)) /* Clear the bit in the hook mask that tells if the rule is on a base chain */ -#define CLEAR_BASE_CHAIN_BIT (hookmask &= ~(1 << NF_BR_NUMHOOKS)) +#define CLEAR_BASE_CHAIN_BIT (par->hook_mask &= ~(1 << NF_BR_NUMHOOKS)) /* True if the target is not a standard target */ #define INVALID_TARGET (info->target < -NUM_STANDARD_TARGETS || info->target >= 0) diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c index fc94699f719..76584cd72e5 100644 --- a/net/bridge/netfilter/ebt_arpreply.c +++ b/net/bridge/netfilter/ebt_arpreply.c @@ -57,20 +57,16 @@ ebt_arpreply_tg(struct sk_buff *skb, const struct xt_target_param *par) return info->target; } -static bool -ebt_arpreply_tg_check(const char *tablename, const void *entry, - const struct xt_target *target, void *data, - unsigned int hookmask) +static bool ebt_arpreply_tg_check(const struct xt_tgchk_param *par) { - const struct ebt_arpreply_info *info = data; - const struct ebt_entry *e = entry; + const struct ebt_arpreply_info *info = par->targinfo; + const struct ebt_entry *e = par->entryinfo; if (BASE_CHAIN && info->target == EBT_RETURN) return false; if (e->ethproto != htons(ETH_P_ARP) || e->invflags & EBT_IPROTO) return false; - CLEAR_BASE_CHAIN_BIT; return true; } diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index bb5d79e0bee..6b49ea9e31f 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -26,19 +26,20 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_target_param *par) return info->target; } -static bool -ebt_dnat_tg_check(const char *tablename, const void *entry, - const struct xt_target *target, void *data, - unsigned int hookmask) +static bool ebt_dnat_tg_check(const struct xt_tgchk_param *par) { - const struct ebt_nat_info *info = data; + const struct ebt_nat_info *info = par->targinfo; + unsigned int hook_mask; if (BASE_CHAIN && info->target == EBT_RETURN) return false; - CLEAR_BASE_CHAIN_BIT; - if ( (strcmp(tablename, "nat") || - (hookmask & ~((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT)))) && - (strcmp(tablename, "broute") || hookmask & ~(1 << NF_BR_BROUTING)) ) + + hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS); + if ((strcmp(par->table, "nat") != 0 || + (hook_mask & ~((1 << NF_BR_PRE_ROUTING) | + (1 << NF_BR_LOCAL_OUT)))) && + (strcmp(par->table, "broute") != 0 || + hook_mask & ~(1 << NF_BR_BROUTING))) return false; if (INVALID_TARGET) return false; diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 87de5fccb2f..3d33c608906 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -24,12 +24,9 @@ static DEFINE_SPINLOCK(ebt_log_lock); -static bool -ebt_log_tg_check(const char *table, const void *entry, - const struct xt_target *target, void *data, - unsigned int hook_mask) +static bool ebt_log_tg_check(const struct xt_tgchk_param *par) { - struct ebt_log_info *info = data; + struct ebt_log_info *info = par->targinfo; if (info->bitmask & ~EBT_LOG_MASK) return false; diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c index aafc456c3c3..2fee7e8e2e9 100644 --- a/net/bridge/netfilter/ebt_mark.c +++ b/net/bridge/netfilter/ebt_mark.c @@ -36,18 +36,14 @@ ebt_mark_tg(struct sk_buff *skb, const struct xt_target_param *par) return info->target | ~EBT_VERDICT_BITS; } -static bool -ebt_mark_tg_check(const char *table, const void *e, - const struct xt_target *target, void *data, - unsigned int hookmask) +static bool ebt_mark_tg_check(const struct xt_tgchk_param *par) { - const struct ebt_mark_t_info *info = data; + const struct ebt_mark_t_info *info = par->targinfo; int tmp; tmp = info->target | ~EBT_VERDICT_BITS; if (BASE_CHAIN && tmp == EBT_RETURN) return false; - CLEAR_BASE_CHAIN_BIT; if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0) return false; tmp = info->target & ~EBT_VERDICT_BITS; diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c index 6a28d994cf7..2a63d996dd4 100644 --- a/net/bridge/netfilter/ebt_nflog.c +++ b/net/bridge/netfilter/ebt_nflog.c @@ -35,12 +35,9 @@ ebt_nflog_tg(struct sk_buff *skb, const struct xt_target_param *par) return EBT_CONTINUE; } -static bool -ebt_nflog_tg_check(const char *table, const void *e, - const struct xt_target *target, void *data, - unsigned int hookmask) +static bool ebt_nflog_tg_check(const struct xt_tgchk_param *par) { - struct ebt_nflog_info *info = data; + struct ebt_nflog_info *info = par->targinfo; if (info->flags & ~EBT_NFLOG_MASK) return false; diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 0cfe2fad940..c8a49f7a57b 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -32,18 +32,19 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_target_param *par) return info->target; } -static bool -ebt_redirect_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *data, - unsigned int hookmask) +static bool ebt_redirect_tg_check(const struct xt_tgchk_param *par) { - const struct ebt_redirect_info *info = data; + const struct ebt_redirect_info *info = par->targinfo; + unsigned int hook_mask; if (BASE_CHAIN && info->target == EBT_RETURN) return false; - CLEAR_BASE_CHAIN_BIT; - if ( (strcmp(tablename, "nat") || hookmask & ~(1 << NF_BR_PRE_ROUTING)) && - (strcmp(tablename, "broute") || hookmask & ~(1 << NF_BR_BROUTING)) ) + + hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS); + if ((strcmp(par->table, "nat") != 0 || + hook_mask & ~(1 << NF_BR_PRE_ROUTING)) && + (strcmp(par->table, "broute") != 0 || + hook_mask & ~(1 << NF_BR_BROUTING))) return false; if (INVALID_TARGET) return false; diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c index f55960eee99..8d04d4c302b 100644 --- a/net/bridge/netfilter/ebt_snat.c +++ b/net/bridge/netfilter/ebt_snat.c @@ -42,18 +42,14 @@ out: return info->target | ~EBT_VERDICT_BITS; } -static bool -ebt_snat_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *data, - unsigned int hookmask) +static bool ebt_snat_tg_check(const struct xt_tgchk_param *par) { - const struct ebt_nat_info *info = data; + const struct ebt_nat_info *info = par->targinfo; int tmp; tmp = info->target | ~EBT_VERDICT_BITS; if (BASE_CHAIN && tmp == EBT_RETURN) return false; - CLEAR_BASE_CHAIN_BIT; if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0) return false; diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index bfedf12cbf4..2c6d6823e70 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -254,12 +254,9 @@ ebt_ulog_tg(struct sk_buff *skb, const struct xt_target_param *par) return EBT_CONTINUE; } -static bool -ebt_ulog_tg_check(const char *table, const void *entry, - const struct xt_target *target, void *data, - unsigned int hookmask) +static bool ebt_ulog_tg_check(const struct xt_tgchk_param *par) { - struct ebt_ulog_info *uloginfo = data; + struct ebt_ulog_info *uloginfo = par->targinfo; if (uloginfo->nlgroup > 31) return false; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index a1156bab4a0..cf823c21c16 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -363,9 +363,10 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, } static inline int -ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e, - const char *name, unsigned int hookmask, unsigned int *cnt) +ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par, + unsigned int *cnt) { + const struct ebt_entry *e = par->entryinfo; struct xt_target *watcher; size_t left = ((char *)e + e->target_offset) - (char *)w; int ret; @@ -383,9 +384,10 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e, return -ENOENT; w->u.watcher = watcher; - ret = xt_check_target(watcher, NFPROTO_BRIDGE, w->watcher_size, - name, hookmask, e->ethproto, e->invflags & EBT_IPROTO, - e, w->data); + par->target = watcher; + par->targinfo = w->data; + ret = xt_check_target(par, NFPROTO_BRIDGE, w->watcher_size, + e->ethproto, e->invflags & EBT_IPROTO); if (ret < 0) { module_put(watcher->me); return ret; @@ -619,6 +621,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, size_t gap; int ret; struct xt_mtchk_param mtpar; + struct xt_tgchk_param tgpar; /* don't mess with the struct ebt_entries */ if (e->bitmask == 0) @@ -660,14 +663,14 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, } i = 0; - mtpar.table = name; - mtpar.entryinfo = e; - mtpar.hook_mask = hookmask; + mtpar.table = tgpar.table = name; + mtpar.entryinfo = tgpar.entryinfo = e; + mtpar.hook_mask = tgpar.hook_mask = hookmask; ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i); if (ret != 0) goto cleanup_matches; j = 0; - ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, e, name, hookmask, &j); + ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j); if (ret != 0) goto cleanup_watchers; t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); @@ -703,9 +706,10 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, goto cleanup_watchers; } - ret = xt_check_target(target, NFPROTO_BRIDGE, t->target_size, - name, hookmask, e->ethproto, e->invflags & EBT_IPROTO, - e, t->data); + tgpar.target = target; + tgpar.targinfo = t->data; + ret = xt_check_target(&tgpar, NFPROTO_BRIDGE, t->target_size, + e->ethproto, e->invflags & EBT_IPROTO); if (ret < 0) { module_put(target->me); goto cleanup_watchers; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 5b631ad74b5..b3238d0101c 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -457,16 +457,18 @@ static inline int check_entry(struct arpt_entry *e, const char *name) static inline int check_target(struct arpt_entry *e, const char *name) { - struct arpt_entry_target *t; - struct xt_target *target; + struct arpt_entry_target *t = arpt_get_target(e); int ret; - - t = arpt_get_target(e); - target = t->u.kernel.target; - - ret = xt_check_target(target, NFPROTO_ARP, - t->u.target_size - sizeof(*t), - name, e->comefrom, 0, 0, e, t->data); + struct xt_tgchk_param par = { + .table = name, + .entryinfo = e, + .target = t->u.kernel.target, + .targinfo = t->data, + .hook_mask = e->comefrom, + }; + + ret = xt_check_target(&par, NFPROTO_ARP, + t->u.target_size - sizeof(*t), 0, false); if (ret < 0) { duprintf("arp_tables: check failed for `%s'.\n", t->u.kernel.target->name); diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index 0bf81b35369..b0d5b1d0a76 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -54,11 +54,9 @@ target(struct sk_buff *skb, const struct xt_target_param *par) return mangle->target; } -static bool -checkentry(const char *tablename, const void *e, const struct xt_target *target, - void *targinfo, unsigned int hook_mask) +static bool checkentry(const struct xt_tgchk_param *par) { - const struct arpt_mangle *mangle = targinfo; + const struct arpt_mangle *mangle = par->targinfo; if (mangle->flags & ~ARPT_MANGLE_MASK || !(mangle->flags & ARPT_MANGLE_MASK)) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 0f8ecf39022..e592c54d499 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -655,15 +655,18 @@ err: static int check_target(struct ipt_entry *e, const char *name) { - struct ipt_entry_target *t; - struct xt_target *target; + struct ipt_entry_target *t = ipt_get_target(e); + struct xt_tgchk_param par = { + .table = name, + .entryinfo = e, + .target = t->u.kernel.target, + .targinfo = t->data, + .hook_mask = e->comefrom, + }; int ret; - t = ipt_get_target(e); - target = t->u.kernel.target; - ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t), - name, e->comefrom, e->ip.proto, - e->ip.invflags & IPT_INV_PROTO, e, t->data); + ret = xt_check_target(&par, NFPROTO_IPV4, t->u.target_size - sizeof(*t), + e->ip.proto, e->ip.invflags & IPT_INV_PROTO); if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", t->u.kernel.target->name); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 67e8aa8f34f..6c7254e0256 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -347,13 +347,10 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -clusterip_tg_check(const char *tablename, const void *e_void, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool clusterip_tg_check(const struct xt_tgchk_param *par) { - struct ipt_clusterip_tgt_info *cipinfo = targinfo; - const struct ipt_entry *e = e_void; + struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; + const struct ipt_entry *e = par->entryinfo; struct clusterip_config *config; @@ -404,9 +401,9 @@ clusterip_tg_check(const char *tablename, const void *e_void, } cipinfo->config = config; - if (nf_ct_l3proto_try_module_get(target->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", target->family); + "proto=%u\n", par->target->family); return false; } diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index e37f181e829..f7e2fa0974d 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -93,13 +93,10 @@ ecn_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -ecn_tg_check(const char *tablename, const void *e_void, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool ecn_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_ECN_info *einfo = targinfo; - const struct ipt_entry *e = e_void; + const struct ipt_ECN_info *einfo = par->targinfo; + const struct ipt_entry *e = par->entryinfo; if (einfo->operation & IPT_ECN_OP_MASK) { printk(KERN_WARNING "ECN: unsupported ECN operation %x\n", diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index e9942aed35a..fc6ce04a3e3 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -440,12 +440,9 @@ log_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -log_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool log_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_log_info *loginfo = targinfo; + const struct ipt_log_info *loginfo = par->targinfo; if (loginfo->level >= 8) { pr_debug("LOG: level %u >= 8\n", loginfo->level); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index e0d9d49b79e..f389f60cb10 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -31,12 +31,9 @@ MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); static DEFINE_RWLOCK(masq_lock); /* FIXME: Multiple targets. --RR */ -static bool -masquerade_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool masquerade_tg_check(const struct xt_tgchk_param *par) { - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { pr_debug("masquerade_check: bad MAP_IPS.\n"); diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index cf18f23b346..7c29582d4ec 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -22,12 +22,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Svenning Soerensen "); MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); -static bool -netmap_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool netmap_tg_check(const struct xt_tgchk_param *par) { - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { pr_debug("NETMAP:check: bad MAP_IPS.\n"); diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index 23adb09ddfb..698e5e78685 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -26,12 +26,9 @@ MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); /* FIXME: Take multiple ranges --RR */ -static bool -redirect_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool redirect_tg_check(const struct xt_tgchk_param *par) { - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { pr_debug("redirect_check: bad MAP_IPS.\n"); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index b36071bb107..0b4b6e0ff2b 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -175,13 +175,10 @@ reject_tg(struct sk_buff *skb, const struct xt_target_param *par) return NF_DROP; } -static bool -reject_tg_check(const char *tablename, const void *e_void, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool reject_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_reject_info *rejinfo = targinfo; - const struct ipt_entry *e = e_void; + const struct ipt_reject_info *rejinfo = par->targinfo; + const struct ipt_entry *e = par->entryinfo; if (rejinfo->with == IPT_ICMP_ECHOREPLY) { printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c index 05cbfd2f747..6d76aae90cc 100644 --- a/net/ipv4/netfilter/ipt_TTL.c +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -59,12 +59,9 @@ ttl_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -ttl_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool ttl_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_TTL_info *info = targinfo; + const struct ipt_TTL_info *info = par->targinfo; if (info->mode > IPT_TTL_MAXMODE) { printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 46c0df0dc2d..18a2826b57c 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -313,12 +313,9 @@ static void ipt_logfn(u_int8_t pf, ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); } -static bool -ulog_tg_check(const char *tablename, const void *e, - const struct xt_target *target, void *targinfo, - unsigned int hookmask) +static bool ulog_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_ulog_info *loginfo = targinfo; + const struct ipt_ulog_info *loginfo = par->targinfo; if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { pr_debug("ipt_ULOG: prefix term %i\n", diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 83170ff131f..bea54a68510 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -128,13 +128,9 @@ ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par) return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); } -static bool ipt_snat_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool ipt_snat_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; /* Must be a valid range */ if (mr->rangesize != 1) { @@ -144,13 +140,9 @@ static bool ipt_snat_checkentry(const char *tablename, return true; } -static bool ipt_dnat_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool ipt_dnat_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = par->targinfo; /* Must be a valid range */ if (mr->rangesize != 1) { diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index ee0986cdbd6..ca14fb8bd36 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -679,15 +679,19 @@ err: static int check_target(struct ip6t_entry *e, const char *name) { - struct ip6t_entry_target *t; - struct xt_target *target; + struct ip6t_entry_target *t = ip6t_get_target(e); + struct xt_tgchk_param par = { + .table = name, + .entryinfo = e, + .target = t->u.kernel.target, + .targinfo = t->data, + .hook_mask = e->comefrom, + }; int ret; t = ip6t_get_target(e); - target = t->u.kernel.target; - ret = xt_check_target(target, AF_INET6, t->u.target_size - sizeof(*t), - name, e->comefrom, e->ipv6.proto, - e->ipv6.invflags & IP6T_INV_PROTO, e, t->data); + ret = xt_check_target(&par, NFPROTO_IPV6, t->u.target_size - sizeof(*t), + e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", t->u.kernel.target->name); diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c index ac759a54f2c..27b5adf670a 100644 --- a/net/ipv6/netfilter/ip6t_HL.c +++ b/net/ipv6/netfilter/ip6t_HL.c @@ -54,12 +54,9 @@ hl_tg6(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -hl_tg6_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool hl_tg6_check(const struct xt_tgchk_param *par) { - const struct ip6t_HL_info *info = targinfo; + const struct ip6t_HL_info *info = par->targinfo; if (info->mode > IP6T_HL_MAXMODE) { printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n", diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index a31d3ecd1fc..caa441d0956 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -453,12 +453,9 @@ log_tg6(struct sk_buff *skb, const struct xt_target_param *par) } -static bool -log_tg6_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool log_tg6_check(const struct xt_tgchk_param *par) { - const struct ip6t_log_info *loginfo = targinfo; + const struct ip6t_log_info *loginfo = par->targinfo; if (loginfo->level >= 8) { pr_debug("LOG: level %u >= 8\n", loginfo->level); diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 1d5f3a70ed0..0981b4ccb8b 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -213,13 +213,10 @@ reject_tg6(struct sk_buff *skb, const struct xt_target_param *par) return NF_DROP; } -static bool -reject_tg6_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool reject_tg6_check(const struct xt_tgchk_param *par) { - const struct ip6t_reject_info *rejinfo = targinfo; - const struct ip6t_entry *e = entry; + const struct ip6t_reject_info *rejinfo = par->targinfo; + const struct ip6t_entry *e = par->entryinfo; if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { printk("ip6t_REJECT: ECHOREPLY is not supported.\n"); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 817ab14f7cd..f29513cd139 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -471,35 +471,35 @@ int xt_compat_match_to_user(struct xt_entry_match *m, void __user **dstptr, EXPORT_SYMBOL_GPL(xt_compat_match_to_user); #endif /* CONFIG_COMPAT */ -int xt_check_target(const struct xt_target *target, unsigned short family, - unsigned int size, const char *table, unsigned int hook_mask, - unsigned short proto, int inv_proto, const void *entry, - void *targinfo) +int xt_check_target(struct xt_tgchk_param *par, u_int8_t family, + unsigned int size, u_int8_t proto, bool inv_proto) { - if (XT_ALIGN(target->targetsize) != size) { + if (XT_ALIGN(par->target->targetsize) != size) { printk("%s_tables: %s target: invalid size %Zu != %u\n", - xt_prefix[family], target->name, - XT_ALIGN(target->targetsize), size); + xt_prefix[family], par->target->name, + XT_ALIGN(par->target->targetsize), size); return -EINVAL; } - if (target->table && strcmp(target->table, table)) { + if (par->target->table != NULL && + strcmp(par->target->table, par->table) != 0) { printk("%s_tables: %s target: only valid in %s table, not %s\n", - xt_prefix[family], target->name, target->table, table); + xt_prefix[family], par->target->name, + par->target->table, par->table); return -EINVAL; } - if (target->hooks && (hook_mask & ~target->hooks) != 0) { + if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { printk("%s_tables: %s target: bad hook_mask %#x/%#x\n", - xt_prefix[family], target->name, hook_mask, - target->hooks); + xt_prefix[family], par->target->name, par->hook_mask, + par->target->hooks); return -EINVAL; } - if (target->proto && (target->proto != proto || inv_proto)) { + if (par->target->proto && (par->target->proto != proto || inv_proto)) { printk("%s_tables: %s target: only valid for protocol %u\n", - xt_prefix[family], target->name, target->proto); + xt_prefix[family], par->target->name, + par->target->proto); return -EINVAL; } - if (target->checkentry != NULL && - !target->checkentry(table, entry, target, targinfo, hook_mask)) + if (par->target->checkentry != NULL && !par->target->checkentry(par)) return -EINVAL; return 0; } diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index 95ed267328a..8fc9f35e67d 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -112,18 +112,15 @@ connmark_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -connmark_tg_check_v0(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool connmark_tg_check_v0(const struct xt_tgchk_param *par) { - const struct xt_connmark_target_info *matchinfo = targinfo; + const struct xt_connmark_target_info *matchinfo = par->targinfo; if (matchinfo->mode == XT_CONNMARK_RESTORE) { - if (strcmp(tablename, "mangle") != 0) { + if (strcmp(par->table, "mangle") != 0) { printk(KERN_WARNING "CONNMARK: restore can only be " "called from \"mangle\" table, not \"%s\"\n", - tablename); + par->table); return false; } } @@ -131,22 +128,19 @@ connmark_tg_check_v0(const char *tablename, const void *entry, printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n"); return false; } - if (nf_ct_l3proto_try_module_get(target->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", target->family); + "proto=%u\n", par->target->family); return false; } return true; } -static bool -connmark_tg_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool connmark_tg_check(const struct xt_tgchk_param *par) { - if (nf_ct_l3proto_try_module_get(target->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { printk(KERN_WARNING "cannot load conntrack support for " - "proto=%u\n", target->family); + "proto=%u\n", par->target->family); return false; } return true; diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 2211a2cef28..2041a3d4b4d 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -85,16 +85,14 @@ connsecmark_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -connsecmark_tg_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool connsecmark_tg_check(const struct xt_tgchk_param *par) { - const struct xt_connsecmark_target_info *info = targinfo; + const struct xt_connsecmark_target_info *info = par->targinfo; - if (strcmp(tablename, "mangle") && strcmp(tablename, "security")) { + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "security") != 0) { printk(KERN_INFO PFX "target only valid in the \'mangle\' " - "or \'security\' tables, not \'%s\'.\n", tablename); + "or \'security\' tables, not \'%s\'.\n", par->table); return false; } @@ -108,9 +106,9 @@ connsecmark_tg_check(const char *tablename, const void *entry, return false; } - if (nf_ct_l3proto_try_module_get(target->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", target->family); + "proto=%u\n", par->target->family); return false; } return true; diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index c78e80afdf3..6a347e768f8 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -61,15 +61,12 @@ dscp_tg6(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -dscp_tg_check(const char *tablename, const void *e_void, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool dscp_tg_check(const struct xt_tgchk_param *par) { - const u_int8_t dscp = ((struct xt_DSCP_info *)targinfo)->dscp; + const struct xt_DSCP_info *info = par->targinfo; - if (dscp > XT_DSCP_MAX) { - printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp); + if (info->dscp > XT_DSCP_MAX) { + printk(KERN_WARNING "DSCP: dscp %x out of range\n", info->dscp); return false; } return true; @@ -95,12 +92,10 @@ tos_tg_v0(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -tos_tg_check_v0(const char *tablename, const void *e_void, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool tos_tg_check_v0(const struct xt_tgchk_param *par) { - const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos; + const struct ipt_tos_target_info *info = par->targinfo; + const uint8_t tos = info->tos; if (tos != IPTOS_LOWDELAY && tos != IPTOS_THROUGHPUT && tos != IPTOS_RELIABILITY && tos != IPTOS_MINCOST && diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c index 27d03f39611..123ee0ba78c 100644 --- a/net/netfilter/xt_MARK.c +++ b/net/netfilter/xt_MARK.c @@ -66,12 +66,9 @@ mark_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -mark_tg_check_v0(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool mark_tg_check_v0(const struct xt_tgchk_param *par) { - const struct xt_mark_target_info *markinfo = targinfo; + const struct xt_mark_target_info *markinfo = par->targinfo; if (markinfo->mark > 0xffffffff) { printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); @@ -80,12 +77,9 @@ mark_tg_check_v0(const char *tablename, const void *entry, return true; } -static bool -mark_tg_check_v1(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool mark_tg_check_v1(const struct xt_tgchk_param *par) { - const struct xt_mark_target_info_v1 *markinfo = targinfo; + const struct xt_mark_target_info_v1 *markinfo = par->targinfo; if (markinfo->mode != XT_MARK_SET && markinfo->mode != XT_MARK_AND diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index 3218ad63bd1..56ee4f118b5 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -36,12 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -nflog_tg_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targetinfo, - unsigned int hookmask) +static bool nflog_tg_check(const struct xt_tgchk_param *par) { - const struct xt_nflog_info *info = targetinfo; + const struct xt_nflog_info *info = par->targinfo; if (info->flags & ~XT_NFLOG_MASK) return false; diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 92e33524f78..edf4ab1f30f 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -84,14 +84,9 @@ xt_rateest_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -xt_rateest_tg_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { - struct xt_rateest_target_info *info = targinfo; + struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct { struct nlattr opt; diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index ad05214e380..e5777227192 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -80,16 +80,14 @@ static bool checkentry_selinux(struct xt_secmark_target_info *info) return true; } -static bool -secmark_tg_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool secmark_tg_check(const struct xt_tgchk_param *par) { - struct xt_secmark_target_info *info = targinfo; + struct xt_secmark_target_info *info = par->targinfo; - if (strcmp(tablename, "mangle") && strcmp(tablename, "security")) { + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "security") != 0) { printk(KERN_INFO PFX "target only valid in the \'mangle\' " - "or \'security\' tables, not \'%s\'.\n", tablename); + "or \'security\' tables, not \'%s\'.\n", par->table); return false; } diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index e08762d9b0f..4f3b1f80879 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -237,16 +237,13 @@ static inline bool find_syn_match(const struct xt_entry_match *m) return false; } -static bool -tcpmss_tg4_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool tcpmss_tg4_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = targinfo; - const struct ipt_entry *e = entry; + const struct xt_tcpmss_info *info = par->targinfo; + const struct ipt_entry *e = par->entryinfo; if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (hook_mask & ~((1 << NF_INET_FORWARD) | + (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { printk("xt_TCPMSS: path-MTU clamping only supported in " @@ -260,16 +257,13 @@ tcpmss_tg4_check(const char *tablename, const void *entry, } #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) -static bool -tcpmss_tg6_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool tcpmss_tg6_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = targinfo; - const struct ip6t_entry *e = entry; + const struct xt_tcpmss_info *info = par->targinfo; + const struct ip6t_entry *e = par->entryinfo; if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (hook_mask & ~((1 << NF_INET_FORWARD) | + (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { printk("xt_TCPMSS: path-MTU clamping only supported in " diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index f08c49ea4bd..1340c2fa362 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -59,14 +59,9 @@ tproxy_tg(struct sk_buff *skb, const struct xt_target_param *par) return NF_DROP; } -static bool -tproxy_tg_check(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targetinfo, - unsigned int hook_mask) +static bool tproxy_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_ip *i = entry; + const struct ipt_ip *i = par->entryinfo; if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) && !(i->invflags & IPT_INV_PROTO)) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 89791a56429..a54dc3f8234 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -40,6 +40,7 @@ static struct tcf_hashinfo ipt_hash_info = { static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook) { + struct xt_tgchk_param par; struct xt_target *target; int ret = 0; @@ -49,9 +50,14 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int return -ENOENT; t->u.kernel.target = target; - - ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t), - table, hook, 0, 0, NULL, t->data); + par.table = table; + par.entryinfo = NULL; + par.target = target; + par.targinfo = t->data; + par.hook_mask = hook; + + ret = xt_check_target(&par, NFPROTO_IPV4, + t->u.target_size - sizeof(*t), 0, false); if (ret < 0) { module_put(t->u.kernel.target->me); return ret; -- cgit v1.2.3-70-g09d2 From a2df1648ba615dd5908e9a1fa7b2f133fa302487 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:19 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (6/6) This patch does this for target extensions' destroy functions. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 8 +++++++- net/bridge/netfilter/ebtables.c | 19 +++++++++++++------ net/ipv4/netfilter/arp_tables.c | 9 ++++++--- net/ipv4/netfilter/ip_tables.c | 10 +++++++--- net/ipv4/netfilter/ipt_CLUSTERIP.c | 6 +++--- net/ipv6/netfilter/ip6_tables.c | 10 +++++++--- net/netfilter/xt_CONNMARK.c | 5 ++--- net/netfilter/xt_CONNSECMARK.c | 5 ++--- net/netfilter/xt_RATEEST.c | 5 ++--- net/netfilter/xt_SECMARK.c | 2 +- net/sched/act_ipt.c | 10 +++++++--- 11 files changed, 57 insertions(+), 32 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 8daeb496ba7..e3b3b669a14 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -251,6 +251,12 @@ struct xt_tgchk_param { unsigned int hook_mask; }; +/* Target destructor parameters */ +struct xt_tgdtor_param { + const struct xt_target *target; + void *targinfo; +}; + struct xt_match { struct list_head list; @@ -311,7 +317,7 @@ struct xt_target bool (*checkentry)(const struct xt_tgchk_param *); /* Called when entry of this type deleted. */ - void (*destroy)(const struct xt_target *target, void *targinfo); + void (*destroy)(const struct xt_tgdtor_param *); /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, void *src); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index cf823c21c16..29d8061fa15 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -581,18 +581,23 @@ ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i) static inline int ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i) { + struct xt_tgdtor_param par; + if (i && (*i)-- == 0) return 1; - if (w->u.watcher->destroy) - w->u.watcher->destroy(w->u.watcher, w->data); - module_put(w->u.watcher->me); + par.target = w->u.watcher; + par.targinfo = w->data; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); return 0; } static inline int ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt) { + struct xt_tgdtor_param par; struct ebt_entry_target *t; if (e->bitmask == 0) @@ -603,10 +608,12 @@ ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt) EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, NULL); EBT_MATCH_ITERATE(e, ebt_cleanup_match, NULL); t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); - if (t->u.target->destroy) - t->u.target->destroy(t->u.target, t->data); - module_put(t->u.target->me); + par.target = t->u.target; + par.targinfo = t->data; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); return 0; } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b3238d0101c..3bab78330cf 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -557,15 +557,18 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) { + struct xt_tgdtor_param par; struct arpt_entry_target *t; if (i && (*i)-- == 0) return 1; t = arpt_get_target(e); - if (t->u.kernel.target->destroy) - t->u.kernel.target->destroy(t->u.kernel.target, t->data); - module_put(t->u.kernel.target->me); + par.target = t->u.kernel.target; + par.targinfo = t->data; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); return 0; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e592c54d499..50b9a6c34c3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -768,6 +768,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, static int cleanup_entry(struct ipt_entry *e, unsigned int *i) { + struct xt_tgdtor_param par; struct ipt_entry_target *t; if (i && (*i)-- == 0) @@ -776,9 +777,12 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i) /* Cleanup all matches */ IPT_MATCH_ITERATE(e, cleanup_match, NULL); t = ipt_get_target(e); - if (t->u.kernel.target->destroy) - t->u.kernel.target->destroy(t->u.kernel.target, t->data); - module_put(t->u.kernel.target->me); + + par.target = t->u.kernel.target; + par.targinfo = t->data; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); return 0; } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 6c7254e0256..7ac1677419a 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -411,9 +411,9 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par) } /* drop reference count of cluster config when rule is deleted */ -static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo) +static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) { - const struct ipt_clusterip_tgt_info *cipinfo = targinfo; + const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; /* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ @@ -421,7 +421,7 @@ static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo) clusterip_config_put(cipinfo->config); - nf_ct_l3proto_module_put(target->family); + nf_ct_l3proto_module_put(par->target->family); } #ifdef CONFIG_COMPAT diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index ca14fb8bd36..d934a699463 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -793,6 +793,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, static int cleanup_entry(struct ip6t_entry *e, unsigned int *i) { + struct xt_tgdtor_param par; struct ip6t_entry_target *t; if (i && (*i)-- == 0) @@ -801,9 +802,12 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i) /* Cleanup all matches */ IP6T_MATCH_ITERATE(e, cleanup_match, NULL); t = ip6t_get_target(e); - if (t->u.kernel.target->destroy) - t->u.kernel.target->destroy(t->u.kernel.target, t->data); - module_put(t->u.kernel.target->me); + + par.target = t->u.kernel.target; + par.targinfo = t->data; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); return 0; } diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index 8fc9f35e67d..c5a5072e005 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -146,10 +146,9 @@ static bool connmark_tg_check(const struct xt_tgchk_param *par) return true; } -static void -connmark_tg_destroy(const struct xt_target *target, void *targinfo) +static void connmark_tg_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(target->family); + nf_ct_l3proto_module_put(par->target->family); } #ifdef CONFIG_COMPAT diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 2041a3d4b4d..b6e3f3f125f 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -114,10 +114,9 @@ static bool connsecmark_tg_check(const struct xt_tgchk_param *par) return true; } -static void -connsecmark_tg_destroy(const struct xt_target *target, void *targinfo) +static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(target->family); + nf_ct_l3proto_module_put(par->target->family); } static struct xt_target connsecmark_tg_reg[] __read_mostly = { diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index edf4ab1f30f..43f5676b1af 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -139,10 +139,9 @@ err1: return false; } -static void xt_rateest_tg_destroy(const struct xt_target *target, - void *targinfo) +static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { - struct xt_rateest_target_info *info = targinfo; + struct xt_rateest_target_info *info = par->targinfo; xt_rateest_put(info->est); } diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index e5777227192..7a6f9e6f5df 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -113,7 +113,7 @@ static bool secmark_tg_check(const struct xt_tgchk_param *par) return true; } -static void secmark_tg_destroy(const struct xt_target *target, void *targinfo) +static void secmark_tg_destroy(const struct xt_tgdtor_param *par) { switch (mode) { case SECMARK_MODE_SEL: diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index a54dc3f8234..b951d422db9 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -67,9 +67,13 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int static void ipt_destroy_target(struct ipt_entry_target *t) { - if (t->u.kernel.target->destroy) - t->u.kernel.target->destroy(t->u.kernel.target, t->data); - module_put(t->u.kernel.target->me); + struct xt_tgdtor_param par = { + .target = t->u.kernel.target, + .targinfo = t->data, + }; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); } static int tcf_ipt_release(struct tcf_ipt *ipt, int bind) -- cgit v1.2.3-70-g09d2 From 916a917dfec18535ff9e2afdafba82e6279eb4f4 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:20 +0200 Subject: netfilter: xtables: provide invoked family value to extensions By passing in the family through which extensions were invoked, a bit of data space can be reclaimed. The "family" member will be added to the parameter structures and the check functions be adjusted. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- include/linux/netfilter/x_tables.h | 12 ++++++++++-- net/bridge/netfilter/ebtables.c | 11 ++++++++--- net/ipv4/netfilter/arp_tables.c | 6 ++++-- net/ipv4/netfilter/ip_tables.c | 10 ++++++++-- net/ipv6/netfilter/ip6_tables.c | 10 ++++++++-- net/netfilter/x_tables.c | 23 ++++++++++++----------- net/sched/act_ipt.c | 4 ++-- 7 files changed, 52 insertions(+), 24 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index e3b3b669a14..be41b609c88 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -183,6 +183,8 @@ struct xt_counters_info * @fragoff: packet is a fragment, this is the data offset * @thoff: position of transport header relative to skb->data * @hotdrop: drop packet if we had inspection problems + * @family: Actual NFPROTO_* through which the function is invoked + * (helpful when match->family == NFPROTO_UNSPEC) */ struct xt_match_param { const struct net_device *in, *out; @@ -191,6 +193,7 @@ struct xt_match_param { int fragoff; unsigned int thoff; bool *hotdrop; + u_int8_t family; }; /** @@ -210,12 +213,14 @@ struct xt_mtchk_param { const struct xt_match *match; void *matchinfo; unsigned int hook_mask; + u_int8_t family; }; /* Match destructor parameters */ struct xt_mtdtor_param { const struct xt_match *match; void *matchinfo; + u_int8_t family; }; /** @@ -232,6 +237,7 @@ struct xt_target_param { unsigned int hooknum; const struct xt_target *target; const void *targinfo; + u_int8_t family; }; /** @@ -249,12 +255,14 @@ struct xt_tgchk_param { const struct xt_target *target; void *targinfo; unsigned int hook_mask; + u_int8_t family; }; /* Target destructor parameters */ struct xt_tgdtor_param { const struct xt_target *target; void *targinfo; + u_int8_t family; }; struct xt_match @@ -393,9 +401,9 @@ extern void xt_unregister_match(struct xt_match *target); extern int xt_register_matches(struct xt_match *match, unsigned int n); extern void xt_unregister_matches(struct xt_match *match, unsigned int n); -extern int xt_check_match(struct xt_mtchk_param *, u_int8_t family, +extern int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto, bool inv_proto); -extern int xt_check_target(struct xt_tgchk_param *, u_int8_t family, +extern int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto, bool inv_proto); extern struct xt_table *xt_register_table(struct net *net, diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 29d8061fa15..5bb88eb0aad 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -160,6 +160,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, struct xt_match_param mtpar; struct xt_target_param tgpar; + mtpar.family = tgpar.family = NFPROTO_BRIDGE; mtpar.in = tgpar.in = in; mtpar.out = tgpar.out = out; mtpar.hotdrop = &hotdrop; @@ -351,7 +352,7 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, par->match = match; par->matchinfo = m->data; - ret = xt_check_match(par, NFPROTO_BRIDGE, m->match_size, + ret = xt_check_match(par, m->match_size, e->ethproto, e->invflags & EBT_IPROTO); if (ret < 0) { module_put(match->me); @@ -386,7 +387,7 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par, par->target = watcher; par->targinfo = w->data; - ret = xt_check_target(par, NFPROTO_BRIDGE, w->watcher_size, + ret = xt_check_target(par, w->watcher_size, e->ethproto, e->invflags & EBT_IPROTO); if (ret < 0) { module_put(watcher->me); @@ -572,6 +573,7 @@ ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i) par.match = m->u.match; par.matchinfo = m->data; + par.family = NFPROTO_BRIDGE; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); @@ -588,6 +590,7 @@ ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i) par.target = w->u.watcher; par.targinfo = w->data; + par.family = NFPROTO_BRIDGE; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); @@ -611,6 +614,7 @@ ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt) par.target = t->u.target; par.targinfo = t->data; + par.family = NFPROTO_BRIDGE; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); @@ -673,6 +677,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, mtpar.table = tgpar.table = name; mtpar.entryinfo = tgpar.entryinfo = e; mtpar.hook_mask = tgpar.hook_mask = hookmask; + mtpar.family = tgpar.family = NFPROTO_BRIDGE; ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i); if (ret != 0) goto cleanup_matches; @@ -715,7 +720,7 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, tgpar.target = target; tgpar.targinfo = t->data; - ret = xt_check_target(&tgpar, NFPROTO_BRIDGE, t->target_size, + ret = xt_check_target(&tgpar, t->target_size, e->ethproto, e->invflags & EBT_IPROTO); if (ret < 0) { module_put(target->me); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3bab78330cf..8d70d29f1cc 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -246,6 +246,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, tgpar.in = in; tgpar.out = out; tgpar.hooknum = hook; + tgpar.family = NFPROTO_ARP; arp = arp_hdr(skb); do { @@ -465,10 +466,10 @@ static inline int check_target(struct arpt_entry *e, const char *name) .target = t->u.kernel.target, .targinfo = t->data, .hook_mask = e->comefrom, + .family = NFPROTO_ARP, }; - ret = xt_check_target(&par, NFPROTO_ARP, - t->u.target_size - sizeof(*t), 0, false); + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); if (ret < 0) { duprintf("arp_tables: check failed for `%s'.\n", t->u.kernel.target->name); @@ -566,6 +567,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) t = arpt_get_target(e); par.target = t->u.kernel.target; par.targinfo = t->data; + par.family = NFPROTO_ARP; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 50b9a6c34c3..213fb27debc 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -348,6 +348,7 @@ ipt_do_table(struct sk_buff *skb, mtpar.hotdrop = &hotdrop; mtpar.in = tgpar.in = in; mtpar.out = tgpar.out = out; + mtpar.family = tgpar.family = NFPROTO_IPV4; tgpar.hooknum = hook; read_lock_bh(&table->lock); @@ -579,6 +580,7 @@ cleanup_match(struct ipt_entry_match *m, unsigned int *i) par.match = m->u.kernel.match; par.matchinfo = m->data; + par.family = NFPROTO_IPV4; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); @@ -616,7 +618,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, par->match = m->u.kernel.match; par->matchinfo = m->data; - ret = xt_check_match(par, NFPROTO_IPV4, m->u.match_size - sizeof(*m), + ret = xt_check_match(par, m->u.match_size - sizeof(*m), ip->proto, ip->invflags & IPT_INV_PROTO); if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", @@ -662,10 +664,11 @@ static int check_target(struct ipt_entry *e, const char *name) .target = t->u.kernel.target, .targinfo = t->data, .hook_mask = e->comefrom, + .family = NFPROTO_IPV4, }; int ret; - ret = xt_check_target(&par, NFPROTO_IPV4, t->u.target_size - sizeof(*t), + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), e->ip.proto, e->ip.invflags & IPT_INV_PROTO); if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", @@ -693,6 +696,7 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV4; ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j); if (ret != 0) goto cleanup_matches; @@ -780,6 +784,7 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i) par.target = t->u.kernel.target; par.targinfo = t->data; + par.family = NFPROTO_IPV4; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); @@ -1659,6 +1664,7 @@ compat_check_entry(struct ipt_entry *e, const char *name, mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV4; ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j); if (ret) goto cleanup_matches; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index d934a699463..a33485dc81c 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -370,6 +370,7 @@ ip6t_do_table(struct sk_buff *skb, mtpar.hotdrop = &hotdrop; mtpar.in = tgpar.in = in; mtpar.out = tgpar.out = out; + mtpar.family = tgpar.family = NFPROTO_IPV6; tgpar.hooknum = hook; read_lock_bh(&table->lock); @@ -604,6 +605,7 @@ cleanup_match(struct ip6t_entry_match *m, unsigned int *i) par.match = m->u.kernel.match; par.matchinfo = m->data; + par.family = NFPROTO_IPV6; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); @@ -640,7 +642,7 @@ static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par, par->match = m->u.kernel.match; par->matchinfo = m->data; - ret = xt_check_match(par, NFPROTO_IPV6, m->u.match_size - sizeof(*m), + ret = xt_check_match(par, m->u.match_size - sizeof(*m), ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", @@ -686,11 +688,12 @@ static int check_target(struct ip6t_entry *e, const char *name) .target = t->u.kernel.target, .targinfo = t->data, .hook_mask = e->comefrom, + .family = NFPROTO_IPV6, }; int ret; t = ip6t_get_target(e); - ret = xt_check_target(&par, NFPROTO_IPV6, t->u.target_size - sizeof(*t), + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", @@ -718,6 +721,7 @@ find_check_entry(struct ip6t_entry *e, const char *name, unsigned int size, mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV6; ret = IP6T_MATCH_ITERATE(e, find_check_match, &mtpar, &j); if (ret != 0) goto cleanup_matches; @@ -805,6 +809,7 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i) par.target = t->u.kernel.target; par.targinfo = t->data; + par.family = NFPROTO_IPV6; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); @@ -1685,6 +1690,7 @@ static int compat_check_entry(struct ip6t_entry *e, const char *name, mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV6; ret = IP6T_MATCH_ITERATE(e, check_match, &mtpar, &j); if (ret) goto cleanup_matches; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index f29513cd139..89837a4eef7 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -321,7 +321,7 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target, } EXPORT_SYMBOL_GPL(xt_find_revision); -int xt_check_match(struct xt_mtchk_param *par, u_int8_t family, +int xt_check_match(struct xt_mtchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { if (XT_ALIGN(par->match->matchsize) != size && @@ -331,26 +331,27 @@ int xt_check_match(struct xt_mtchk_param *par, u_int8_t family, * because it uses a dynamic-size data set. */ printk("%s_tables: %s match: invalid size %Zu != %u\n", - xt_prefix[family], par->match->name, + xt_prefix[par->family], par->match->name, XT_ALIGN(par->match->matchsize), size); return -EINVAL; } if (par->match->table != NULL && strcmp(par->match->table, par->table) != 0) { printk("%s_tables: %s match: only valid in %s table, not %s\n", - xt_prefix[family], par->match->name, + xt_prefix[par->family], par->match->name, par->match->table, par->table); return -EINVAL; } if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { printk("%s_tables: %s match: bad hook_mask %#x/%#x\n", - xt_prefix[family], par->match->name, + xt_prefix[par->family], par->match->name, par->hook_mask, par->match->hooks); return -EINVAL; } if (par->match->proto && (par->match->proto != proto || inv_proto)) { printk("%s_tables: %s match: only valid for protocol %u\n", - xt_prefix[family], par->match->name, par->match->proto); + xt_prefix[par->family], par->match->name, + par->match->proto); return -EINVAL; } if (par->match->checkentry != NULL && !par->match->checkentry(par)) @@ -471,31 +472,31 @@ int xt_compat_match_to_user(struct xt_entry_match *m, void __user **dstptr, EXPORT_SYMBOL_GPL(xt_compat_match_to_user); #endif /* CONFIG_COMPAT */ -int xt_check_target(struct xt_tgchk_param *par, u_int8_t family, +int xt_check_target(struct xt_tgchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { if (XT_ALIGN(par->target->targetsize) != size) { printk("%s_tables: %s target: invalid size %Zu != %u\n", - xt_prefix[family], par->target->name, + xt_prefix[par->family], par->target->name, XT_ALIGN(par->target->targetsize), size); return -EINVAL; } if (par->target->table != NULL && strcmp(par->target->table, par->table) != 0) { printk("%s_tables: %s target: only valid in %s table, not %s\n", - xt_prefix[family], par->target->name, + xt_prefix[par->family], par->target->name, par->target->table, par->table); return -EINVAL; } if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { printk("%s_tables: %s target: bad hook_mask %#x/%#x\n", - xt_prefix[family], par->target->name, par->hook_mask, - par->target->hooks); + xt_prefix[par->family], par->target->name, + par->hook_mask, par->target->hooks); return -EINVAL; } if (par->target->proto && (par->target->proto != proto || inv_proto)) { printk("%s_tables: %s target: only valid for protocol %u\n", - xt_prefix[family], par->target->name, + xt_prefix[par->family], par->target->name, par->target->proto); return -EINVAL; } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index b951d422db9..0453d79ebf5 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -55,9 +55,9 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int par.target = target; par.targinfo = t->data; par.hook_mask = hook; + par.family = NFPROTO_IPV4; - ret = xt_check_target(&par, NFPROTO_IPV4, - t->u.target_size - sizeof(*t), 0, false); + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); if (ret < 0) { module_put(t->u.kernel.target->me); return ret; -- cgit v1.2.3-70-g09d2 From 52cd5750e81ec8d213949fa7c0d2e08907bf498b Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 8 Oct 2008 11:34:06 -0700 Subject: tcp: fix length used for checksum in a reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While looking for some common code I came across difference in checksum calculation between tcp_v6_send_(reset|ack) I couldn't explain. I checked both v4 and v6 and found out that both seem to have the same "feature". I couldn't find anything in rfc nor anywhere else which would state that md5 option should be ignored like it was in case of reset so I came to a conclusion that this is probably a genuine bug. I suspect that addition of md5 just was fooled by the excessive copy-paste code in those functions and the reset part was never tested well enough to find out the problem. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 24ffc5e1d3d..ba46769c6e9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -589,7 +589,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ - sizeof(struct tcphdr), IPPROTO_TCP, 0); + arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 424d9c4a67a..e8b0fdd9edb 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1012,14 +1012,14 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) } #endif - buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); + buff->csum = csum_partial((char *)t1, tot_len, 0); memset(&fl, 0, sizeof(fl)); ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr); ipv6_addr_copy(&fl.fl6_src, &ipv6_hdr(skb)->daddr); t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, - sizeof(*t1), IPPROTO_TCP, + tot_len, IPPROTO_TCP, buff->csum); fl.proto = IPPROTO_TCP; -- cgit v1.2.3-70-g09d2 From 9088c5609584684149f3fb5b065aa7f18dcb03ff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Oct 2008 11:44:17 -0700 Subject: udp: Improve port randomization Current UDP port allocation is suboptimal. We select the shortest chain to chose a port (out of 512) that will hash in this shortest chain. First, it can lead to give not so ramdom ports and ease give attackers more opportunities to break the system. Second, it can consume a lot of CPU to scan all table in order to find the shortest chain. Third, in some pathological cases we can fail to find a free port even if they are plenty of them. This patch zap the search for a short chain and only use one random seed. Problem of getting long chains should be addressed in another way, since we can obtain long chains with non random ports. Based on a report and patch from Vitaly Mayatskikh Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 56 ++++++++++++-------------------------------------------- 1 file changed, 12 insertions(+), 44 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 85f8e8e10b1..67d8430b4a2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -155,55 +155,23 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, write_lock_bh(&udp_hash_lock); if (!snum) { - int i, low, high, remaining; - unsigned rover, best, best_size_so_far; + int low, high, remaining; + unsigned rand; + unsigned short first; inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; - best_size_so_far = UINT_MAX; - best = rover = net_random() % remaining + low; - - /* 1st pass: look for empty (or shortest) hash chain */ - for (i = 0; i < UDP_HTABLE_SIZE; i++) { - int size = 0; - - head = &udptable[udp_hashfn(net, rover)]; - if (hlist_empty(head)) - goto gotit; - - sk_for_each(sk2, node, head) { - if (++size >= best_size_so_far) - goto next; - } - best_size_so_far = size; - best = rover; - next: - /* fold back if end of range */ - if (++rover > high) - rover = low + ((rover - low) - & (UDP_HTABLE_SIZE - 1)); - - - } - - /* 2nd pass: find hole in shortest hash chain */ - rover = best; - for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) { - if (! __udp_lib_lport_inuse(net, rover, udptable)) - goto gotit; - rover += UDP_HTABLE_SIZE; - if (rover > high) - rover = low + ((rover - low) - & (UDP_HTABLE_SIZE - 1)); + rand = net_random(); + snum = first = rand % remaining + low; + rand |= 1; + while (__udp_lib_lport_inuse(net, snum, udptable)) { + do { + snum = snum + rand; + } while (snum < low || snum > high); + if (snum == first) + goto fail; } - - - /* All ports in use! */ - goto fail; - -gotit: - snum = rover; } else { head = &udptable[udp_hashfn(net, snum)]; -- cgit v1.2.3-70-g09d2 From 3c689b7320ae6f20dba6a8b71806a6c6fd604ee8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Oct 2008 14:18:04 -0700 Subject: inet: cleanup of local_port_range I noticed sysctl_local_port_range[] and its associated seqlock sysctl_local_port_range_lock were on separate cache lines. Moreover, sysctl_local_port_range[] was close to unrelated variables, highly modified, leading to cache misses. Moving these two variables in a structure can help data locality and moving this structure to read_mostly section helps sharing of this data among cpus. Cleanup of extern declarations (moved in include file where they belong), and use of inet_get_local_port_range() accessor instead of direct access to ports values. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 4 ++++ net/ipv4/inet_connection_sock.c | 16 +++++++++------- net/ipv4/sysctl_net_ipv4.c | 23 ++++++++++------------- 3 files changed, 23 insertions(+), 20 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index d678ea3d474..1cbccaf0de3 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -178,6 +178,10 @@ extern unsigned long snmp_fold_field(void *mib[], int offt); extern int snmp_mib_init(void *ptr[2], size_t mibsize); extern void snmp_mib_free(void *ptr[2]); +extern struct local_ports { + seqlock_t lock; + int range[2]; +} sysctl_local_ports; extern void inet_get_local_port_range(int *low, int *high); extern int sysctl_ip_default_ttl; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 21fcc5a9045..bd1278a2d82 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -30,20 +30,22 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif /* - * This array holds the first and last local port number. + * This struct holds the first and last local port number. */ -int sysctl_local_port_range[2] = { 32768, 61000 }; -DEFINE_SEQLOCK(sysctl_port_range_lock); +struct local_ports sysctl_local_ports __read_mostly = { + .lock = SEQLOCK_UNLOCKED, + .range = { 32768, 61000 }, +}; void inet_get_local_port_range(int *low, int *high) { unsigned seq; do { - seq = read_seqbegin(&sysctl_port_range_lock); + seq = read_seqbegin(&sysctl_local_ports.lock); - *low = sysctl_local_port_range[0]; - *high = sysctl_local_port_range[1]; - } while (read_seqretry(&sysctl_port_range_lock, seq)); + *low = sysctl_local_ports.range[0]; + *high = sysctl_local_ports.range[1]; + } while (read_seqretry(&sysctl_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e0689fd7b79..276d047fb85 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -26,16 +26,13 @@ static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; -extern seqlock_t sysctl_port_range_lock; -extern int sysctl_local_port_range[2]; - /* Update system visible IP port range */ static void set_local_port_range(int range[2]) { - write_seqlock(&sysctl_port_range_lock); - sysctl_local_port_range[0] = range[0]; - sysctl_local_port_range[1] = range[1]; - write_sequnlock(&sysctl_port_range_lock); + write_seqlock(&sysctl_local_ports.lock); + sysctl_local_ports.range[0] = range[0]; + sysctl_local_ports.range[1] = range[1]; + write_sequnlock(&sysctl_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -44,8 +41,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, size_t *lenp, loff_t *ppos) { int ret; - int range[2] = { sysctl_local_port_range[0], - sysctl_local_port_range[1] }; + int range[2]; ctl_table tmp = { .data = &range, .maxlen = sizeof(range), @@ -54,6 +50,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, .extra2 = &ip_local_port_range_max, }; + inet_get_local_port_range(range, range + 1); ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); if (write && ret == 0) { @@ -73,8 +70,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, void __user *newval, size_t newlen) { int ret; - int range[2] = { sysctl_local_port_range[0], - sysctl_local_port_range[1] }; + int range[2]; ctl_table tmp = { .data = &range, .maxlen = sizeof(range), @@ -83,6 +79,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, .extra2 = &ip_local_port_range_max, }; + inet_get_local_port_range(range, range + 1); ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen); if (ret == 0 && newval && newlen) { if (range[1] < range[0]) @@ -396,8 +393,8 @@ static struct ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_LOCAL_PORT_RANGE, .procname = "ip_local_port_range", - .data = &sysctl_local_port_range, - .maxlen = sizeof(sysctl_local_port_range), + .data = &sysctl_local_ports.range, + .maxlen = sizeof(sysctl_local_ports.range), .mode = 0644, .proc_handler = &ipv4_local_port_range, .strategy = &ipv4_sysctl_local_port_range, -- cgit v1.2.3-70-g09d2 From 071d7ab6649eb34a873a53e71635186e9117101d Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 8 Oct 2008 14:41:35 -0700 Subject: ipvs: Remove stray file left over from ipvs move Commit cb7f6a7b716e801097b564dec3ccb58d330aef56 ("IPVS: Move IPVS to net/netfilter/ipvs") has left a stray file in the old location of ipvs. Signed-off-by: Sven Wegener Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_dh.c | 261 ----------------------------------------------- 1 file changed, 261 deletions(-) delete mode 100644 net/ipv4/ipvs/ip_vs_dh.c (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c deleted file mode 100644 index a16943fd72f..00000000000 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ /dev/null @@ -1,261 +0,0 @@ -/* - * IPVS: Destination Hashing scheduling module - * - * Authors: Wensong Zhang - * - * Inspired by the consistent hashing scheduler patch from - * Thomas Proell - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The dh algorithm is to select server by the hash key of destination IP - * address. The pseudo code is as follows: - * - * n <- servernode[dest_ip]; - * if (n is dead) OR - * (n is overloaded) OR (n.weight <= 0) then - * return NULL; - * - * return n; - * - * Notes that servernode is a 256-bucket hash table that maps the hash - * index derived from packet destination IP address to the current server - * array. If the dh scheduler is used in cache cluster, it is good to - * combine it with cache_bypass feature. When the statically assigned - * server is dead or overloaded, the load balancer can bypass the cache - * server and send requests to the original server directly. - * - */ - -#include -#include -#include -#include - -#include - - -/* - * IPVS DH bucket - */ -struct ip_vs_dh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ -}; - -/* - * for IPVS DH entry hash table - */ -#ifndef CONFIG_IP_VS_DH_TAB_BITS -#define CONFIG_IP_VS_DH_TAB_BITS 8 -#endif -#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS -#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) -#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) - - -/* - * Returns hash value for IPVS DH entry - */ -static inline unsigned ip_vs_dh_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; -} - - -/* - * Get ip_vs_dest associated with supplied parameters. - */ -static inline struct ip_vs_dest * -ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr) -{ - return (tbl[ip_vs_dh_hashkey(addr)]).dest; -} - - -/* - * Assign all the hash buckets of the specified table with the service. - */ -static int -ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) -{ - int i; - struct ip_vs_dh_bucket *b; - struct list_head *p; - struct ip_vs_dest *dest; - - b = tbl; - p = &svc->destinations; - for (i=0; idest = NULL; - } else { - if (p == &svc->destinations) - p = p->next; - - dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; - - p = p->next; - } - b++; - } - return 0; -} - - -/* - * Flush all the hash buckets of the specified table. - */ -static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) -{ - int i; - struct ip_vs_dh_bucket *b; - - b = tbl; - for (i=0; idest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; - } - b++; - } -} - - -static int ip_vs_dh_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl; - - /* allocate the DH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); - - return 0; -} - - -static int ip_vs_dh_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - return 0; -} - - -static int ip_vs_dh_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); - - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); - - return 0; -} - - -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. - */ -static inline int is_overloaded(struct ip_vs_dest *dest) -{ - return dest->flags & IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Destination hashing scheduling - */ -static struct ip_vs_dest * -ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_dh_bucket *tbl; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_dh_bucket *)svc->sched_data; - dest = ip_vs_dh_get(tbl, iph->daddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { - return NULL; - } - - IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->daddr), - NIPQUAD(dest->addr.ip), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS DH Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_dh_scheduler = -{ - .name = "dh", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), -#ifdef CONFIG_IP_VS_IPV6 - .supports_ipv6 = 0, -#endif - .init_service = ip_vs_dh_init_svc, - .done_service = ip_vs_dh_done_svc, - .update_service = ip_vs_dh_update_svc, - .schedule = ip_vs_dh_schedule, -}; - - -static int __init ip_vs_dh_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_dh_scheduler); -} - - -static void __exit ip_vs_dh_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); -} - - -module_init(ip_vs_dh_init); -module_exit(ip_vs_dh_cleanup); -MODULE_LICENSE("GPL"); -- cgit v1.2.3-70-g09d2 From c95b819ad75b13102139aad0e7062d927be23cc6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 9 Oct 2008 11:58:54 -0700 Subject: gre: Use needed_headroom Now that we have dev->needed_headroom, we can use it instead of having a bogus dev->hard_header_len. This also allows us to include dev->hard_header_len in the MTU computation so that when we do have a meaningful hard_harder_len in future it is included automatically in figuring out the MTU. Incidentally, this fixes a bug where we ignored the needed_headroom field of the underlying device in calculating our own hard_header_len. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2a61158ea72..fd192d67695 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -637,7 +637,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) df = tiph->frag_off; if (df) - mtu = dst_mtu(&rt->u.dst) - tunnel->hlen; + mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen; else mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; @@ -785,7 +785,7 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev) tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; - /* Guess output device to choose reasonable mtu and hard_header_len */ + /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { struct flowi fl = { .oif = tunnel->parms.link, @@ -806,7 +806,7 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev) tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); if (tdev) { - hlen = tdev->hard_header_len; + hlen = tdev->hard_header_len + tdev->needed_headroom; mtu = tdev->mtu; } dev->iflink = tunnel->parms.link; @@ -820,8 +820,8 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev) if (tunnel->parms.o_flags&GRE_SEQ) addend += 4; } - dev->hard_header_len = hlen + addend; - dev->mtu = mtu - addend; + dev->needed_headroom = addend + hlen; + dev->mtu = mtu - dev->hard_header_len - addend; tunnel->hlen = addend; } @@ -959,7 +959,8 @@ done: static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) { struct ip_tunnel *tunnel = netdev_priv(dev); - if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen) + if (new_mtu < 68 || + new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen) return -EINVAL; dev->mtu = new_mtu; return 0; @@ -1085,7 +1086,7 @@ static void ipgre_tunnel_setup(struct net_device *dev) dev->change_mtu = ipgre_tunnel_change_mtu; dev->type = ARPHRD_IPGRE; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4; + dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; dev->flags = IFF_NOARP; dev->iflink = 0; -- cgit v1.2.3-70-g09d2 From 42aa916265d740d66ac1f17290366e9494c884c2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 9 Oct 2008 11:59:32 -0700 Subject: gre: Move MTU setting out of ipgre_tunnel_bind_dev This patch moves the dev->mtu setting out of ipgre_tunnel_bind_dev. This is in prepartion of using rtnl_link where we'll need to make the MTU setting conditional on whether the user has supplied an MTU. This also requires the move of the ipgre_tunnel_bind_dev call out of the dev->init function so that we can access the user parameters later. This patch also adds a check to prevent setting the MTU below the minimum of 68. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fd192d67695..80622dd6fb3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -119,6 +119,7 @@ static int ipgre_tunnel_init(struct net_device *dev); static void ipgre_tunnel_setup(struct net_device *dev); +static int ipgre_tunnel_bind_dev(struct net_device *dev); /* Fallback tunnel: no source, no destination, no key, no options */ @@ -289,6 +290,8 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, nt = netdev_priv(dev); nt->parms = *parms; + dev->mtu = ipgre_tunnel_bind_dev(dev); + if (register_netdevice(dev) < 0) goto failed_free; @@ -773,7 +776,7 @@ tx_error: return 0; } -static void ipgre_tunnel_bind_dev(struct net_device *dev) +static int ipgre_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel; @@ -821,9 +824,14 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev) addend += 4; } dev->needed_headroom = addend + hlen; - dev->mtu = mtu - dev->hard_header_len - addend; + mtu -= dev->hard_header_len - addend; + + if (mtu < 68) + mtu = 68; + tunnel->hlen = addend; + return mtu; } static int @@ -917,7 +925,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) t->parms.iph.frag_off = p.iph.frag_off; if (t->parms.link != p.link) { t->parms.link = p.link; - ipgre_tunnel_bind_dev(dev); + dev->mtu = ipgre_tunnel_bind_dev(dev); netdev_state_change(dev); } } @@ -1108,8 +1116,6 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); - ipgre_tunnel_bind_dev(dev); - if (iph->daddr) { #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { -- cgit v1.2.3-70-g09d2 From c19e654ddbe3831252f61e76a74d661e1a755530 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 9 Oct 2008 11:59:55 -0700 Subject: gre: Add netlink interface This patch adds a netlink interface that will eventually displace the existing ioctl interface. It utilises the elegant rtnl_link_ops mechanism. This also means that user-space no longer needs to rely on the tunnel interface being of type GRE to identify GRE tunnels. The identification can now occur using rtnl_link_ops. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/if_tunnel.h | 19 ++++ net/ipv4/ip_gre.c | 247 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 262 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h index d4efe401470..aeab2cb32a9 100644 --- a/include/linux/if_tunnel.h +++ b/include/linux/if_tunnel.h @@ -2,6 +2,7 @@ #define _IF_TUNNEL_H_ #include +#include #define SIOCGETTUNNEL (SIOCDEVPRIVATE + 0) #define SIOCADDTUNNEL (SIOCDEVPRIVATE + 1) @@ -47,4 +48,22 @@ struct ip_tunnel_prl { /* PRL flags */ #define PRL_DEFAULT 0x0001 +enum +{ + IFLA_GRE_UNSPEC, + IFLA_GRE_LINK, + IFLA_GRE_IFLAGS, + IFLA_GRE_OFLAGS, + IFLA_GRE_IKEY, + IFLA_GRE_OKEY, + IFLA_GRE_LOCAL, + IFLA_GRE_REMOTE, + IFLA_GRE_TTL, + IFLA_GRE_TOS, + IFLA_GRE_PMTUDISC, + __IFLA_GRE_MAX, +}; + +#define IFLA_GRE_MAX (__IFLA_GRE_MAX - 1) + #endif /* _IF_TUNNEL_H_ */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 80622dd6fb3..25d2c77a7f3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -41,6 +41,7 @@ #include #include #include +#include #ifdef CONFIG_IPV6 #include @@ -117,6 +118,7 @@ Alexey Kuznetsov. */ +static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); static void ipgre_tunnel_setup(struct net_device *dev); static int ipgre_tunnel_bind_dev(struct net_device *dev); @@ -286,9 +288,9 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, goto failed_free; } - dev->init = ipgre_tunnel_init; nt = netdev_priv(dev); nt->parms = *parms; + dev->rtnl_link_ops = &ipgre_link_ops; dev->mtu = ipgre_tunnel_bind_dev(dev); @@ -1087,6 +1089,7 @@ static int ipgre_close(struct net_device *dev) static void ipgre_tunnel_setup(struct net_device *dev) { + dev->init = ipgre_tunnel_init; dev->uninit = ipgre_tunnel_uninit; dev->destructor = free_netdev; dev->hard_start_xmit = ipgre_tunnel_xmit; @@ -1196,6 +1199,7 @@ static int ipgre_init_net(struct net *net) ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init; dev_net_set(ign->fb_tunnel_dev, net); + ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops; if ((err = register_netdev(ign->fb_tunnel_dev))) goto err_reg_dev; @@ -1228,6 +1232,229 @@ static struct pernet_operations ipgre_net_ops = { .exit = ipgre_exit_net, }; +static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + __be16 flags; + + if (!data) + return 0; + + flags = 0; + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (flags & (GRE_VERSION|GRE_ROUTING)) + return -EINVAL; + + return 0; +} + +static void ipgre_netlink_parms(struct nlattr *data[], + struct ip_tunnel_parm *parms) +{ + memset(parms, 0, sizeof(parms)); + + parms->iph.protocol = IPPROTO_GRE; + + if (!data) + return; + + if (data[IFLA_GRE_LINK]) + parms->link = nla_get_u32(data[IFLA_GRE_LINK]); + + if (data[IFLA_GRE_IFLAGS]) + parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); + + if (data[IFLA_GRE_OFLAGS]) + parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); + + if (data[IFLA_GRE_IKEY]) + parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); + + if (data[IFLA_GRE_OKEY]) + parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); + + if (data[IFLA_GRE_LOCAL]) + memcpy(&parms->iph.saddr, nla_data(data[IFLA_GRE_LOCAL]), 4); + + if (data[IFLA_GRE_REMOTE]) + memcpy(&parms->iph.daddr, nla_data(data[IFLA_GRE_REMOTE]), 4); + + if (data[IFLA_GRE_TTL]) + parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); + + if (data[IFLA_GRE_TOS]) + parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); + + if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) + parms->iph.frag_off = htons(IP_DF); +} + +static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip_tunnel *nt; + struct net *net = dev_net(dev); + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + int mtu; + int err; + + nt = netdev_priv(dev); + ipgre_netlink_parms(data, &nt->parms); + + if (ipgre_tunnel_locate(net, &nt->parms, 0)) + return -EEXIST; + + mtu = ipgre_tunnel_bind_dev(dev); + if (!tb[IFLA_MTU]) + dev->mtu = mtu; + + err = register_netdevice(dev); + if (err) + goto out; + + dev_hold(dev); + ipgre_tunnel_link(ign, nt); + +out: + return err; +} + +static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip_tunnel *t, *nt; + struct net *net = dev_net(dev); + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + struct ip_tunnel_parm p; + int mtu; + + if (dev == ign->fb_tunnel_dev) + return -EINVAL; + + nt = netdev_priv(dev); + ipgre_netlink_parms(data, &p); + + t = ipgre_tunnel_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else { + unsigned nflags = 0; + + t = nt; + + if (ipv4_is_multicast(p.iph.daddr)) + nflags = IFF_BROADCAST; + else if (p.iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags ^ nflags) & + (IFF_POINTOPOINT | IFF_BROADCAST)) + return -EINVAL; + + ipgre_tunnel_unlink(ign, t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + t->parms.i_key = p.i_key; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipgre_tunnel_link(ign, t); + netdev_state_change(dev); + } + + t->parms.o_key = p.o_key; + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + + if (t->parms.link != p.link) { + t->parms.link = p.link; + mtu = ipgre_tunnel_bind_dev(dev); + if (!tb[IFLA_MTU]) + dev->mtu = mtu; + netdev_state_change(dev); + } + + return 0; +} + +static size_t ipgre_get_size(const struct net_device *dev) +{ + return + /* IFLA_GRE_LINK */ + nla_total_size(4) + + /* IFLA_GRE_IFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_OFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_IKEY */ + nla_total_size(4) + + /* IFLA_GRE_OKEY */ + nla_total_size(4) + + /* IFLA_GRE_LOCAL */ + nla_total_size(4) + + /* IFLA_GRE_REMOTE */ + nla_total_size(4) + + /* IFLA_GRE_TTL */ + nla_total_size(1) + + /* IFLA_GRE_TOS */ + nla_total_size(1) + + /* IFLA_GRE_PMTUDISC */ + nla_total_size(1) + + 0; +} + +static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm *p = &t->parms; + + NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link); + NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags); + NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags); + NLA_PUT_BE32(skb, IFLA_GRE_IFLAGS, p->i_flags); + NLA_PUT_BE32(skb, IFLA_GRE_OFLAGS, p->o_flags); + NLA_PUT(skb, IFLA_GRE_LOCAL, 4, &p->iph.saddr); + NLA_PUT(skb, IFLA_GRE_REMOTE, 4, &p->iph.daddr); + NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl); + NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos); + NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF))); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { + [IFLA_GRE_LINK] = { .type = NLA_U32 }, + [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_IKEY] = { .type = NLA_U32 }, + [IFLA_GRE_OKEY] = { .type = NLA_U32 }, + [IFLA_GRE_LOCAL] = { .len = 4 }, + [IFLA_GRE_REMOTE] = { .len = 4 }, + [IFLA_GRE_TTL] = { .type = NLA_U8 }, + [IFLA_GRE_TOS] = { .type = NLA_U8 }, + [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, +}; + +static struct rtnl_link_ops ipgre_link_ops __read_mostly = { + .kind = "gre", + .maxtype = IFLA_GRE_MAX, + .policy = ipgre_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = ipgre_tunnel_setup, + .validate = ipgre_tunnel_validate, + .newlink = ipgre_newlink, + .changelink = ipgre_changelink, + .get_size = ipgre_get_size, + .fill_info = ipgre_fill_info, +}; + /* * And now the modules code and kernel interface. */ @@ -1245,19 +1472,31 @@ static int __init ipgre_init(void) err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops); if (err < 0) - inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + goto gen_device_failed; + err = rtnl_link_register(&ipgre_link_ops); + if (err < 0) + goto rtnl_link_failed; + +out: return err; + +rtnl_link_failed: + unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); +gen_device_failed: + inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + goto out; } static void __exit ipgre_fini(void) { + rtnl_link_unregister(&ipgre_link_ops); + unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); - - unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); } module_init(ipgre_init); module_exit(ipgre_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS("rtnl-link-gre"); -- cgit v1.2.3-70-g09d2 From e1a8000228e16212c93b23cfbed4d622e2ec7a6b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 9 Oct 2008 12:00:17 -0700 Subject: gre: Add Transparent Ethernet Bridging This patch adds support for Ethernet over GRE encapsulation. This is exposed to user-space with a new link type of "gretap" instead of "gre". It will create an ARPHRD_ETHER device in lieu of the usual ARPHRD_IPGRE. Note that to preserver backwards compatibility all Transparent Ethernet Bridging packets are passed to an ARPHRD_IPGRE tunnel if its key matches and there is no ARPHRD_ETHER device whose key matches more closely. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/if_ether.h | 1 + net/ipv4/ip_gre.c | 206 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 175 insertions(+), 32 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index a0099e98b5c..bf1a53b2682 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -56,6 +56,7 @@ #define ETH_P_DIAG 0x6005 /* DEC Diagnostics */ #define ETH_P_CUST 0x6006 /* DEC Customer use */ #define ETH_P_SCA 0x6007 /* DEC Systems Comms Arch */ +#define ETH_P_TEB 0x6558 /* Trans Ether Bridging */ #define ETH_P_RARP 0x8035 /* Reverse Addr Res packet */ #define ETH_P_ATALK 0x809B /* Appletalk DDP */ #define ETH_P_AARP 0x80F3 /* Appletalk AARP */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 25d2c77a7f3..44ed9487fa1 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -166,38 +167,64 @@ static DEFINE_RWLOCK(ipgre_lock); /* Given src, dst and key, find appropriate for input tunnel. */ static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net, - __be32 remote, __be32 local, __be32 key) + __be32 remote, __be32 local, + __be32 key, __be16 gre_proto) { unsigned h0 = HASH(remote); unsigned h1 = HASH(key); struct ip_tunnel *t; + struct ip_tunnel *t2 = NULL; struct ipgre_net *ign = net_generic(net, ipgre_net_id); + int dev_type = (gre_proto == htons(ETH_P_TEB)) ? + ARPHRD_ETHER : ARPHRD_IPGRE; for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { - if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) - return t; + if (t->parms.i_key == key && t->dev->flags & IFF_UP) { + if (t->dev->type == dev_type) + return t; + if (t->dev->type == ARPHRD_IPGRE && !t2) + t2 = t; + } } } + for (t = ign->tunnels_r[h0^h1]; t; t = t->next) { if (remote == t->parms.iph.daddr) { - if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) - return t; + if (t->parms.i_key == key && t->dev->flags & IFF_UP) { + if (t->dev->type == dev_type) + return t; + if (t->dev->type == ARPHRD_IPGRE && !t2) + t2 = t; + } } } + for (t = ign->tunnels_l[h1]; t; t = t->next) { if (local == t->parms.iph.saddr || (local == t->parms.iph.daddr && ipv4_is_multicast(local))) { - if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) - return t; + if (t->parms.i_key == key && t->dev->flags & IFF_UP) { + if (t->dev->type == dev_type) + return t; + if (t->dev->type == ARPHRD_IPGRE && !t2) + t2 = t; + } } } + for (t = ign->tunnels_wc[h1]; t; t = t->next) { - if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) - return t; + if (t->parms.i_key == key && t->dev->flags & IFF_UP) { + if (t->dev->type == dev_type) + return t; + if (t->dev->type == ARPHRD_IPGRE && !t2) + t2 = t; + } } + if (t2) + return t2; + if (ign->fb_tunnel_dev->flags&IFF_UP) return netdev_priv(ign->fb_tunnel_dev); return NULL; @@ -252,25 +279,37 @@ static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) } } -static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) +static struct ip_tunnel *ipgre_tunnel_find(struct net *net, + struct ip_tunnel_parm *parms, + int type) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; __be32 key = parms->i_key; - struct ip_tunnel *t, **tp, *nt; + struct ip_tunnel *t, **tp; + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + + for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && + key == t->parms.i_key && + type == t->dev->type) + break; + + return t; +} + +static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, + struct ip_tunnel_parm *parms, int create) +{ + struct ip_tunnel *t, *nt; struct net_device *dev; char name[IFNAMSIZ]; struct ipgre_net *ign = net_generic(net, ipgre_net_id); - for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) { - if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { - if (key == t->parms.i_key) - return t; - } - } - if (!create) - return NULL; + t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE); + if (t || !create) + return t; if (parms->name[0]) strlcpy(name, parms->name, IFNAMSIZ); @@ -385,8 +424,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info) read_lock(&ipgre_lock); t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr, - (flags&GRE_KEY) ? - *(((__be32*)p) + (grehlen>>2) - 1) : 0); + flags & GRE_KEY ? + *(((__be32 *)p) + (grehlen / 4) - 1) : 0, + p[1]); if (t == NULL || t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) goto out; @@ -436,6 +476,7 @@ static int ipgre_rcv(struct sk_buff *skb) u32 seqno = 0; struct ip_tunnel *tunnel; int offset = 4; + __be16 gre_proto; if (!pskb_may_pull(skb, 16)) goto drop_nolock; @@ -475,20 +516,22 @@ static int ipgre_rcv(struct sk_buff *skb) } } + gre_proto = *(__be16 *)(h + 2); + read_lock(&ipgre_lock); if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev), - iph->saddr, iph->daddr, key)) != NULL) { + iph->saddr, iph->daddr, key, + gre_proto))) { struct net_device_stats *stats = &tunnel->dev->stats; secpath_reset(skb); - skb->protocol = *(__be16*)(h + 2); + skb->protocol = gre_proto; /* WCCP version 1 and 2 protocol decoding. * - Change protocol to IP * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header */ - if (flags == 0 && - skb->protocol == htons(ETH_P_WCCP)) { + if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { skb->protocol = htons(ETH_P_IP); if ((*(h + offset) & 0xF0) != 0x40) offset += 4; @@ -496,7 +539,6 @@ static int ipgre_rcv(struct sk_buff *skb) skb->mac_header = skb->network_header; __pskb_pull(skb, offset); - skb_reset_network_header(skb); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb->pkt_type = PACKET_HOST; #ifdef CONFIG_NET_IPGRE_BROADCAST @@ -524,13 +566,30 @@ static int ipgre_rcv(struct sk_buff *skb) } tunnel->i_seqno = seqno + 1; } + + /* Warning: All skb pointers will be invalidated! */ + if (tunnel->dev->type == ARPHRD_ETHER) { + if (!pskb_may_pull(skb, ETH_HLEN)) { + stats->rx_length_errors++; + stats->rx_errors++; + goto drop; + } + + iph = ip_hdr(skb); + skb->protocol = eth_type_trans(skb, tunnel->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } + stats->rx_packets++; stats->rx_bytes += skb->len; skb->dev = tunnel->dev; dst_release(skb->dst); skb->dst = NULL; nf_reset(skb); + + skb_reset_network_header(skb); ipgre_ecn_decapsulate(iph, skb); + netif_rx(skb); read_unlock(&ipgre_lock); return(0); @@ -565,7 +624,10 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error; } - if (dev->header_ops) { + if (dev->type == ARPHRD_ETHER) + IPCB(skb)->flags = 0; + + if (dev->header_ops && dev->type == ARPHRD_IPGRE) { gre_hlen = 0; tiph = (struct iphdr*)skb->data; } else { @@ -741,8 +803,9 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); } - ((__be16*)(iph+1))[0] = tunnel->parms.o_flags; - ((__be16*)(iph+1))[1] = skb->protocol; + ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; + ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ? + htons(ETH_P_TEB) : skb->protocol; if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4); @@ -804,7 +867,9 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) tdev = rt->u.dst.dev; ip_rt_put(rt); } - dev->flags |= IFF_POINTOPOINT; + + if (dev->type != ARPHRD_ETHER) + dev->flags |= IFF_POINTOPOINT; } if (!tdev && tunnel->parms.link) @@ -1250,6 +1315,30 @@ static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } +static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + __be32 daddr; + + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + + if (!data) + goto out; + + if (data[IFLA_GRE_REMOTE]) { + memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4); + if (!daddr) + return -EINVAL; + } + +out: + return ipgre_tunnel_validate(tb, data); +} + static void ipgre_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm *parms) { @@ -1291,6 +1380,35 @@ static void ipgre_netlink_parms(struct nlattr *data[], parms->iph.frag_off = htons(IP_DF); } +static int ipgre_tap_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + ipgre_tunnel_bind_dev(dev); + + return 0; +} + +static void ipgre_tap_setup(struct net_device *dev) +{ + + ether_setup(dev); + + dev->init = ipgre_tap_init; + dev->uninit = ipgre_tunnel_uninit; + dev->destructor = free_netdev; + dev->hard_start_xmit = ipgre_tunnel_xmit; + dev->change_mtu = ipgre_tunnel_change_mtu; + + dev->iflink = 0; + dev->features |= NETIF_F_NETNS_LOCAL; +} + static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { @@ -1303,9 +1421,12 @@ static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[], nt = netdev_priv(dev); ipgre_netlink_parms(data, &nt->parms); - if (ipgre_tunnel_locate(net, &nt->parms, 0)) + if (ipgre_tunnel_find(net, &nt->parms, dev->type)) return -EEXIST; + if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) + random_ether_addr(dev->dev_addr); + mtu = ipgre_tunnel_bind_dev(dev); if (!tb[IFLA_MTU]) dev->mtu = mtu; @@ -1455,6 +1576,19 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = { .fill_info = ipgre_fill_info, }; +static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { + .kind = "gretap", + .maxtype = IFLA_GRE_MAX, + .policy = ipgre_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = ipgre_tap_setup, + .validate = ipgre_tap_validate, + .newlink = ipgre_newlink, + .changelink = ipgre_changelink, + .get_size = ipgre_get_size, + .fill_info = ipgre_fill_info, +}; + /* * And now the modules code and kernel interface. */ @@ -1478,9 +1612,15 @@ static int __init ipgre_init(void) if (err < 0) goto rtnl_link_failed; + err = rtnl_link_register(&ipgre_tap_ops); + if (err < 0) + goto tap_ops_failed; + out: return err; +tap_ops_failed: + rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); gen_device_failed: @@ -1490,6 +1630,7 @@ gen_device_failed: static void __exit ipgre_fini(void) { + rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) @@ -1500,3 +1641,4 @@ module_init(ipgre_init); module_exit(ipgre_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS("rtnl-link-gre"); +MODULE_ALIAS("rtnl-link-gretap"); -- cgit v1.2.3-70-g09d2 From 64194c31a0b6f5d84703b772113aafc400eeaad6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 9 Oct 2008 12:03:17 -0700 Subject: inet: Make tunnel RX/TX byte counters more consistent This patch makes the RX/TX byte counters for IPIP, GRE and SIT more consistent. Previously we included the external IP headers on the way out but not when the packet is inbound. The new scheme is to count payload only in both directions. For IPIP and SIT this simply means the exclusion of the external IP header. For GRE this means that we exclude the GRE header as well. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/ipip.h | 2 +- net/ipv4/ip_gre.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ipip.h b/include/net/ipip.h index a85bda64b85..fdf9bd74370 100644 --- a/include/net/ipip.h +++ b/include/net/ipip.h @@ -37,7 +37,7 @@ struct ip_tunnel_prl_entry #define IPTUNNEL_XMIT() do { \ int err; \ - int pkt_len = skb->len; \ + int pkt_len = skb->len - skb_transport_offset(skb); \ \ skb->ip_summed = CHECKSUM_NONE; \ ip_select_ident(iph, &rt->u.dst, NULL); \ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 44ed9487fa1..0d5e35b0ed5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -477,6 +477,7 @@ static int ipgre_rcv(struct sk_buff *skb) struct ip_tunnel *tunnel; int offset = 4; __be16 gre_proto; + unsigned int len; if (!pskb_may_pull(skb, 16)) goto drop_nolock; @@ -567,6 +568,8 @@ static int ipgre_rcv(struct sk_buff *skb) tunnel->i_seqno = seqno + 1; } + len = skb->len; + /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { @@ -581,7 +584,7 @@ static int ipgre_rcv(struct sk_buff *skb) } stats->rx_packets++; - stats->rx_bytes += skb->len; + stats->rx_bytes += len; skb->dev = tunnel->dev; dst_release(skb->dst); skb->dst = NULL; @@ -770,7 +773,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) old_iph = ip_hdr(skb); } - skb->transport_header = skb->network_header; + skb_reset_transport_header(skb); skb_push(skb, gre_hlen); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); -- cgit v1.2.3-70-g09d2 From 78e645cb890b0f32ea81a974e29427d9cd2f64f0 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 9 Oct 2008 14:37:47 -0700 Subject: tcpv[46]: fix md5 pseudoheader address field ordering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maybe it's just me but I guess those md5 people made a mess out of it by having *_md5_hash_* to use daddr, saddr order instead of the one that is natural (and equal to what csum functions use). For the segment were sending, the original addresses are reversed so buff's saddr == skb's daddr and vice-versa. Maybe I can finally proceed with unification of some code after fixing it first... :-) Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ba46769c6e9..5c8fa7f1e32 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -583,8 +583,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) rep.th.doff = arg.iov[0].iov_len / 4; tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], - key, ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, &rep.th); + key, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &rep.th); } #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index dd7bdde7bdd..eab10bc7ff8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1007,8 +1007,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); tcp_v6_md5_hash_hdr((__u8 *)&opt[1], key, - &ipv6_hdr(skb)->daddr, - &ipv6_hdr(skb)->saddr, t1); + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, t1); } #endif -- cgit v1.2.3-70-g09d2 From f24d43c07e208372aa3d3bff419afbf43ba87698 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Oct 2008 14:51:27 -0700 Subject: udp: complete port availability checking While looking at UDP port randomization, I noticed it was litle bit pessimistic, not looking at type of sockets (IPV6/IPV4) and not looking at bound addresses if any. We should perform same tests than when binding to a specific port. This permits a cleanup of udp_lib_get_port() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 67d8430b4a2..eacf4cfef14 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -122,14 +122,23 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min); atomic_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); -static inline int __udp_lib_lport_inuse(struct net *net, __u16 num, - const struct hlist_head udptable[]) +static int udp_lib_lport_inuse(struct net *net, __u16 num, + const struct hlist_head udptable[], + struct sock *sk, + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2)) { - struct sock *sk; + struct sock *sk2; struct hlist_node *node; - sk_for_each(sk, node, &udptable[udp_hashfn(net, num)]) - if (net_eq(sock_net(sk), net) && sk->sk_hash == num) + sk_for_each(sk2, node, &udptable[udp_hashfn(net, num)]) + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + sk2->sk_hash == num && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if + || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) return 1; return 0; } @@ -146,9 +155,6 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, const struct sock *sk2 ) ) { struct hlist_head *udptable = sk->sk_prot->h.udp_hash; - struct hlist_node *node; - struct hlist_head *head; - struct sock *sk2; int error = 1; struct net *net = sock_net(sk); @@ -165,32 +171,21 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, rand = net_random(); snum = first = rand % remaining + low; rand |= 1; - while (__udp_lib_lport_inuse(net, snum, udptable)) { + while (udp_lib_lport_inuse(net, snum, udptable, sk, + saddr_comp)) { do { snum = snum + rand; } while (snum < low || snum > high); if (snum == first) goto fail; } - } else { - head = &udptable[udp_hashfn(net, snum)]; - - sk_for_each(sk2, node, head) - if (sk2->sk_hash == snum && - sk2 != sk && - net_eq(sock_net(sk2), net) && - (!sk2->sk_reuse || !sk->sk_reuse) && - (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if - || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - (*saddr_comp)(sk, sk2) ) - goto fail; - } + } else if (udp_lib_lport_inuse(net, snum, udptable, sk, saddr_comp)) + goto fail; inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { - head = &udptable[udp_hashfn(net, snum)]; - sk_add_node(sk, head); + sk_add_node(sk, &udptable[udp_hashfn(net, snum)]); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } error = 0; -- cgit v1.2.3-70-g09d2 From ba9e64b1c23f1dd22fea14c310f739d84ac8b748 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 10 Oct 2008 12:10:30 -0700 Subject: gre: fix copy and paste error The flags are dumped twice, the keys not at all. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0d5e35b0ed5..c0755e98b81 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1539,8 +1539,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link); NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags); NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags); - NLA_PUT_BE32(skb, IFLA_GRE_IFLAGS, p->i_flags); - NLA_PUT_BE32(skb, IFLA_GRE_OFLAGS, p->o_flags); + NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key); + NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key); NLA_PUT(skb, IFLA_GRE_LOCAL, 4, &p->iph.saddr); NLA_PUT(skb, IFLA_GRE_REMOTE, 4, &p->iph.daddr); NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl); -- cgit v1.2.3-70-g09d2 From 4d74f8ba1fb152ae07eb858abb713e094e77b7d5 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 10 Oct 2008 12:11:06 -0700 Subject: gre: minor cleanups in netlink interface - use typeful helpers for IFLA_GRE_LOCAL/IFLA_GRE_REMOTE - replace magic value by FIELD_SIZEOF - use MODULE_ALIAS_RTNL_LINK macro Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c0755e98b81..05ebce2881e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1368,10 +1368,10 @@ static void ipgre_netlink_parms(struct nlattr *data[], parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); if (data[IFLA_GRE_LOCAL]) - memcpy(&parms->iph.saddr, nla_data(data[IFLA_GRE_LOCAL]), 4); + parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]); if (data[IFLA_GRE_REMOTE]) - memcpy(&parms->iph.daddr, nla_data(data[IFLA_GRE_REMOTE]), 4); + parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]); if (data[IFLA_GRE_TTL]) parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); @@ -1541,8 +1541,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags); NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key); NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key); - NLA_PUT(skb, IFLA_GRE_LOCAL, 4, &p->iph.saddr); - NLA_PUT(skb, IFLA_GRE_REMOTE, 4, &p->iph.daddr); + NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr); + NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr); NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl); NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos); NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF))); @@ -1559,8 +1559,8 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_IKEY] = { .type = NLA_U32 }, [IFLA_GRE_OKEY] = { .type = NLA_U32 }, - [IFLA_GRE_LOCAL] = { .len = 4 }, - [IFLA_GRE_REMOTE] = { .len = 4 }, + [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_TOS] = { .type = NLA_U8 }, [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, @@ -1643,5 +1643,5 @@ static void __exit ipgre_fini(void) module_init(ipgre_init); module_exit(ipgre_fini); MODULE_LICENSE("GPL"); -MODULE_ALIAS("rtnl-link-gre"); -MODULE_ALIAS("rtnl-link-gretap"); +MODULE_ALIAS_RTNL_LINK("gre"); +MODULE_ALIAS_RTNL_LINK("gretap"); -- cgit v1.2.3-70-g09d2