diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/datagram.c | 72 | ||||
-rw-r--r-- | net/core/dev.c | 420 | ||||
-rw-r--r-- | net/core/fib_rules.c | 25 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 21 | ||||
-rw-r--r-- | net/core/iovec.c | 24 | ||||
-rw-r--r-- | net/core/neighbour.c | 2 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 165 | ||||
-rw-r--r-- | net/core/net_namespace.c | 2 | ||||
-rw-r--r-- | net/core/netpoll.c | 11 | ||||
-rw-r--r-- | net/core/netprio_cgroup.c | 72 | ||||
-rw-r--r-- | net/core/pktgen.c | 61 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 29 | ||||
-rw-r--r-- | net/core/scm.c | 6 | ||||
-rw-r--r-- | net/core/secure_seq.c | 27 | ||||
-rw-r--r-- | net/core/skbuff.c | 19 | ||||
-rw-r--r-- | net/core/sock.c | 166 | ||||
-rw-r--r-- | net/core/stream.c | 2 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 30 |
18 files changed, 896 insertions, 258 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c index 8ab48cd8955..af814e76420 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -48,6 +48,7 @@ #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/slab.h> +#include <linux/pagemap.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -573,6 +574,77 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iovec); +/** + * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec + * @skb: buffer to copy + * @from: io vector to copy to + * @offset: offset in the io vector to start copying from + * @count: amount of vectors to copy to buffer from + * + * The function will first copy up to headlen, and then pin the userspace + * pages and build frags through them. + * + * Returns 0, -EFAULT or -EMSGSIZE. + * Note: the iovec is not modified during the copy + */ +int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, + int offset, size_t count) +{ + int len = iov_length(from, count) - offset; + int copy = min_t(int, skb_headlen(skb), len); + int size; + int i = 0; + + /* copy up to skb headlen */ + if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy)) + return -EFAULT; + + if (len == copy) + return 0; + + offset += copy; + while (count--) { + struct page *page[MAX_SKB_FRAGS]; + int num_pages; + unsigned long base; + unsigned long truesize; + + /* Skip over from offset and copied */ + if (offset >= from->iov_len) { + offset -= from->iov_len; + ++from; + continue; + } + len = from->iov_len - offset; + base = (unsigned long)from->iov_base + offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + if (i + size > MAX_SKB_FRAGS) + return -EMSGSIZE; + num_pages = get_user_pages_fast(base, size, 0, &page[i]); + if (num_pages != size) { + release_pages(&page[i], num_pages, 0); + return -EFAULT; + } + truesize = size * PAGE_SIZE; + skb->data_len += len; + skb->len += len; + skb->truesize += truesize; + atomic_add(truesize, &skb->sk->sk_wmem_alloc); + while (len) { + int off = base & ~PAGE_MASK; + int size = min_t(int, len, PAGE_SIZE - off); + skb_fill_page_desc(skb, i, page[i], off, size); + base += size; + len -= size; + i++; + } + offset = 0; + ++from; + } + return 0; +} +EXPORT_SYMBOL(zerocopy_sg_from_iovec); + static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 __user *to, int len, __wsum *csump) diff --git a/net/core/dev.c b/net/core/dev.c index 26755dd40da..65f829cfd92 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -174,7 +174,7 @@ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id; static DEFINE_HASHTABLE(napi_hash, 8); -seqcount_t devnet_rename_seq; +static seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) { @@ -1691,13 +1691,13 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) kfree_skb(skb); return NET_RX_DROP; } - skb_scrub_packet(skb); skb->protocol = eth_type_trans(skb, dev); /* eth_type_trans() can set pkt_type. - * clear pkt_type _after_ calling eth_type_trans() + * call skb_scrub_packet() after it to clear pkt_type _after_ calling + * eth_type_trans(). */ - skb->pkt_type = PACKET_HOST; + skb_scrub_packet(skb, true); return netif_rx(skb); } @@ -4367,57 +4367,48 @@ softnet_break: goto out; } -struct netdev_upper { +struct netdev_adjacent { struct net_device *dev; + + /* upper master flag, there can only be one master device per list */ bool master; + + /* indicates that this dev is our first-level lower/upper device */ + bool neighbour; + + /* counter for the number of times this device was added to us */ + u16 ref_nr; + struct list_head list; struct rcu_head rcu; - struct list_head search_list; }; -static void __append_search_uppers(struct list_head *search_list, - struct net_device *dev) +static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, + struct net_device *adj_dev, + bool upper) { - struct netdev_upper *upper; + struct netdev_adjacent *adj; + struct list_head *dev_list; - list_for_each_entry(upper, &dev->upper_dev_list, list) { - /* check if this upper is not already in search list */ - if (list_empty(&upper->search_list)) - list_add_tail(&upper->search_list, search_list); + dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list; + + list_for_each_entry(adj, dev_list, list) { + if (adj->dev == adj_dev) + return adj; } + return NULL; } -static bool __netdev_search_upper_dev(struct net_device *dev, - struct net_device *upper_dev) +static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, + struct net_device *udev) { - LIST_HEAD(search_list); - struct netdev_upper *upper; - struct netdev_upper *tmp; - bool ret = false; - - __append_search_uppers(&search_list, dev); - list_for_each_entry(upper, &search_list, search_list) { - if (upper->dev == upper_dev) { - ret = true; - break; - } - __append_search_uppers(&search_list, upper->dev); - } - list_for_each_entry_safe(upper, tmp, &search_list, search_list) - INIT_LIST_HEAD(&upper->search_list); - return ret; + return __netdev_find_adj(dev, udev, true); } -static struct netdev_upper *__netdev_find_upper(struct net_device *dev, - struct net_device *upper_dev) +static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, + struct net_device *ldev) { - struct netdev_upper *upper; - - list_for_each_entry(upper, &dev->upper_dev_list, list) { - if (upper->dev == upper_dev) - return upper; - } - return NULL; + return __netdev_find_adj(dev, ldev, false); } /** @@ -4462,7 +4453,7 @@ EXPORT_SYMBOL(netdev_has_any_upper_dev); */ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; ASSERT_RTNL(); @@ -4470,13 +4461,38 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) return NULL; upper = list_first_entry(&dev->upper_dev_list, - struct netdev_upper, list); + struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get); +/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->upper_dev_list) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device @@ -4486,20 +4502,158 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get); */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { - struct netdev_upper *upper; + struct netdev_adjacent *upper; upper = list_first_or_null_rcu(&dev->upper_dev_list, - struct netdev_upper, list); + struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + bool neighbour, bool master, + bool upper) +{ + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(dev, adj_dev, upper); + + if (adj) { + BUG_ON(neighbour); + adj->ref_nr++; + return 0; + } + + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; + + adj->dev = adj_dev; + adj->master = master; + adj->neighbour = neighbour; + adj->ref_nr = 1; + + dev_hold(adj_dev); + pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", + adj_dev->name, upper ? "upper" : "lower", dev->name, + adj_dev->name); + + if (!upper) { + list_add_tail_rcu(&adj->list, &dev->lower_dev_list); + return 0; + } + + /* Ensure that master upper link is always the first item in list. */ + if (master) + list_add_rcu(&adj->list, &dev->upper_dev_list); + else + list_add_tail_rcu(&adj->list, &dev->upper_dev_list); + + return 0; +} + +static inline int __netdev_upper_dev_insert(struct net_device *dev, + struct net_device *udev, + bool master, bool neighbour) +{ + return __netdev_adjacent_dev_insert(dev, udev, neighbour, master, + true); +} + +static inline int __netdev_lower_dev_insert(struct net_device *dev, + struct net_device *ldev, + bool neighbour) +{ + return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false, + false); +} + +void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, bool upper) +{ + struct netdev_adjacent *adj; + + if (upper) + adj = __netdev_find_upper(dev, adj_dev); + else + adj = __netdev_find_lower(dev, adj_dev); + + if (!adj) + BUG(); + + if (adj->ref_nr > 1) { + adj->ref_nr--; + return; + } + + list_del_rcu(&adj->list); + pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", + adj_dev->name, upper ? "upper" : "lower", dev->name, + adj_dev->name); + dev_put(adj_dev); + kfree_rcu(adj, rcu); +} + +static inline void __netdev_upper_dev_remove(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_adjacent_dev_remove(dev, udev, true); +} + +static inline void __netdev_lower_dev_remove(struct net_device *dev, + struct net_device *ldev) +{ + return __netdev_adjacent_dev_remove(dev, ldev, false); +} + +int __netdev_adjacent_dev_insert_link(struct net_device *dev, + struct net_device *upper_dev, + bool master, bool neighbour) +{ + int ret; + + ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour); + if (ret) + return ret; + + ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour); + if (ret) { + __netdev_upper_dev_remove(dev, upper_dev); + return ret; + } + + return 0; +} + +static inline int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_adjacent_dev_insert_link(dev, udev, false, false); +} + +static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *udev, + bool master) +{ + return __netdev_adjacent_dev_insert_link(dev, udev, master, true); +} + +void __netdev_adjacent_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_upper_dev_remove(dev, upper_dev); + __netdev_lower_dev_remove(upper_dev, dev); +} + + static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master) { - struct netdev_upper *upper; + struct netdev_adjacent *i, *j, *to_i, *to_j; + int ret = 0; ASSERT_RTNL(); @@ -4507,7 +4661,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (__netdev_search_upper_dev(upper_dev, dev)) + if (__netdev_find_upper(upper_dev, dev)) return -EBUSY; if (__netdev_find_upper(dev, upper_dev)) @@ -4516,22 +4670,76 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - upper = kmalloc(sizeof(*upper), GFP_KERNEL); - if (!upper) - return -ENOMEM; + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master); + if (ret) + return ret; - upper->dev = upper_dev; - upper->master = master; - INIT_LIST_HEAD(&upper->search_list); + /* Now that we linked these devs, make all the upper_dev's + * upper_dev_list visible to every dev's lower_dev_list and vice + * versa, and don't forget the devices itself. All of these + * links are non-neighbours. + */ + list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, j->dev); + if (ret) + goto rollback_mesh; + } + } + + /* add dev to every upper_dev's upper device */ + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + ret = __netdev_adjacent_dev_link(dev, i->dev); + if (ret) + goto rollback_upper_mesh; + } + + /* add upper_dev to every dev's lower device */ + list_for_each_entry(i, &dev->lower_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, upper_dev); + if (ret) + goto rollback_lower_mesh; + } - /* Ensure that master upper link is always the first item in list. */ - if (master) - list_add_rcu(&upper->list, &dev->upper_dev_list); - else - list_add_tail_rcu(&upper->list, &dev->upper_dev_list); - dev_hold(upper_dev); call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); return 0; + +rollback_lower_mesh: + to_i = i; + list_for_each_entry(i, &dev->lower_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + } + + i = NULL; + +rollback_upper_mesh: + to_i = i; + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(dev, i->dev); + } + + i = j = NULL; + +rollback_mesh: + to_i = i; + to_j = j; + list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + if (i == to_i && j == to_j) + break; + __netdev_adjacent_dev_unlink(i->dev, j->dev); + } + if (i == to_i) + break; + } + + __netdev_adjacent_dev_unlink(dev, upper_dev); + + return ret; } /** @@ -4580,16 +4788,28 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_upper *upper; - + struct netdev_adjacent *i, *j; ASSERT_RTNL(); - upper = __netdev_find_upper(dev, upper_dev); - if (!upper) - return; - list_del_rcu(&upper->list); - dev_put(upper_dev); - kfree_rcu(upper, rcu); + __netdev_adjacent_dev_unlink(dev, upper_dev); + + /* Here is the tricky part. We must remove all dev's lower + * devices from all upper_dev's upper devices and vice + * versa, to maintain the graph relationship. + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + list_for_each_entry(j, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, j->dev); + + /* remove also the devices itself from lower/upper device + * list + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + + list_for_each_entry(i, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(dev, i->dev); + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -4989,6 +5209,24 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier) EXPORT_SYMBOL(dev_change_carrier); /** + * dev_get_phys_port_id - Get device physical port ID + * @dev: device + * @ppid: port ID + * + * Get device physical port ID + */ +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_port_id *ppid) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_id(dev, ppid); +} +EXPORT_SYMBOL(dev_get_phys_port_id); + +/** * dev_new_index - allocate an ifindex * @net: the applicable net namespace * @@ -5009,10 +5247,12 @@ static int dev_new_index(struct net *net) /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); +static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); + dev_net(dev)->dev_unreg_count++; } static void rollback_registered_many(struct list_head *head) @@ -5680,6 +5920,12 @@ void netdev_run_todo(void) if (dev->destructor) dev->destructor(dev); + /* Report a network device has been unregistered */ + rtnl_lock(); + dev_net(dev)->dev_unreg_count--; + __rtnl_unlock(); + wake_up(&netdev_unregistering_wq); + /* Free network device */ kobject_put(&dev->dev.kobj); } @@ -5832,6 +6078,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->upper_dev_list); + INIT_LIST_HEAD(&dev->lower_dev_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); @@ -6364,6 +6611,34 @@ static void __net_exit default_device_exit(struct net *net) rtnl_unlock(); } +static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) +{ + /* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace in net_list. + */ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network @@ -6375,7 +6650,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) struct net *net; LIST_HEAD(dev_kill_list); - rtnl_lock(); + /* To prevent network device cleanup code from dereferencing + * loopback devices or network devices that have been freed + * wait here for all pending unregistrations to complete, + * before unregistring the loopback device and allowing the + * network namespace be freed. + * + * The netdev todo list containing all network devices + * unregistrations that happen in default_device_exit_batch + * will run in the rtnl_unlock() at the end of + * default_device_exit_batch. + */ + rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 21735440c44..2e654138433 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -33,6 +33,9 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->flags = flags; r->fr_net = hold_net(ops->fro_net); + r->suppress_prefixlen = -1; + r->suppress_ifgroup = -1; + /* The lock is not required here, the list in unreacheable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); @@ -226,6 +229,9 @@ jumped: else err = ops->action(rule, fl, flags, arg); + if (!err && ops->suppress && ops->suppress(rule, arg)) + continue; + if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(atomic_inc_not_zero(&rule->refcnt))) { @@ -337,6 +343,15 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); + if (tb[FRA_SUPPRESS_PREFIXLEN]) + rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); + else + rule->suppress_prefixlen = -1; + + if (tb[FRA_SUPPRESS_IFGROUP]) + rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + else + rule->suppress_ifgroup = -1; if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); @@ -523,6 +538,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4); /* FRA_FWMASK */ @@ -548,6 +565,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->table = rule->table; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; + if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) + goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; @@ -580,6 +599,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target))) goto nla_put_failure; + + if (rule->suppress_ifgroup != -1) { + if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) + goto nla_put_failure; + } + if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b84a1b155bc..8d7d0dd72db 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -140,7 +140,11 @@ ipv6: break; } case IPPROTO_IPIP: - goto again; + proto = htons(ETH_P_IP); + goto ip; + case IPPROTO_IPV6: + proto = htons(ETH_P_IPV6); + goto ipv6; default: break; } @@ -150,8 +154,8 @@ ipv6: if (poff >= 0) { __be32 *ports, _ports; - nhoff += poff; - ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports); + ports = skb_header_pointer(skb, nhoff + poff, + sizeof(_ports), &_ports); if (ports) flow->ports = *ports; } @@ -346,14 +350,9 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) if (new_index < 0) new_index = skb_tx_hash(dev, skb); - if (queue_index != new_index && sk) { - struct dst_entry *dst = - rcu_dereference_check(sk->sk_dst_cache, 1); - - if (dst && skb_dst(skb) == dst) - sk_tx_queue_set(sk, queue_index); - - } + if (queue_index != new_index && sk && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, new_index); queue_index = new_index; } diff --git a/net/core/iovec.c b/net/core/iovec.c index de178e46268..b77eeecc001 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -212,3 +212,27 @@ out_fault: goto out; } EXPORT_SYMBOL(csum_partial_copy_fromiovecend); + +unsigned long iov_pages(const struct iovec *iov, int offset, + unsigned long nr_segs) +{ + unsigned long seg, base; + int pages = 0, len, size; + + while (nr_segs && (offset >= iov->iov_len)) { + offset -= iov->iov_len; + ++iov; + --nr_segs; + } + + for (seg = 0; seg < nr_segs; seg++) { + base = (unsigned long)iov[seg].iov_base + offset; + len = iov[seg].iov_len - offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + pages += size; + offset = 0; + } + + return pages; +} +EXPORT_SYMBOL(iov_pages); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 60533db8b72..6072610a867 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2759,13 +2759,11 @@ errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } -#ifdef CONFIG_ARPD void neigh_app_ns(struct neighbour *n) { __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST); } EXPORT_SYMBOL(neigh_app_ns); -#endif /* CONFIG_ARPD */ #ifdef CONFIG_SYSCTL static int zero; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 981fed397d1..d954b56b4e4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -60,12 +60,19 @@ static ssize_t format_##field(const struct net_device *net, char *buf) \ { \ return sprintf(buf, format_string, net->field); \ } \ -static ssize_t show_##field(struct device *dev, \ +static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ return netdev_show(dev, attr, buf, format_##field); \ -} +} \ + +#define NETDEVICE_SHOW_RO(field, format_string) \ +NETDEVICE_SHOW(field, format_string); \ +static DEVICE_ATTR_RO(field) +#define NETDEVICE_SHOW_RW(field, format_string) \ +NETDEVICE_SHOW(field, format_string); \ +static DEVICE_ATTR_RW(field) /* use same locking and permission rules as SIF* ioctl's */ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, @@ -96,16 +103,16 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, return ret; } -NETDEVICE_SHOW(dev_id, fmt_hex); -NETDEVICE_SHOW(addr_assign_type, fmt_dec); -NETDEVICE_SHOW(addr_len, fmt_dec); -NETDEVICE_SHOW(iflink, fmt_dec); -NETDEVICE_SHOW(ifindex, fmt_dec); -NETDEVICE_SHOW(type, fmt_dec); -NETDEVICE_SHOW(link_mode, fmt_dec); +NETDEVICE_SHOW_RO(dev_id, fmt_hex); +NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); +NETDEVICE_SHOW_RO(addr_len, fmt_dec); +NETDEVICE_SHOW_RO(iflink, fmt_dec); +NETDEVICE_SHOW_RO(ifindex, fmt_dec); +NETDEVICE_SHOW_RO(type, fmt_dec); +NETDEVICE_SHOW_RO(link_mode, fmt_dec); /* use same locking rules as GIFHWADDR ioctl's */ -static ssize_t show_address(struct device *dev, struct device_attribute *attr, +static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *net = to_net_dev(dev); @@ -117,15 +124,17 @@ static ssize_t show_address(struct device *dev, struct device_attribute *attr, read_unlock(&dev_base_lock); return ret; } +static DEVICE_ATTR_RO(address); -static ssize_t show_broadcast(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t broadcast_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct net_device *net = to_net_dev(dev); if (dev_isalive(net)) return sysfs_format_mac(buf, net->broadcast, net->addr_len); return -EINVAL; } +static DEVICE_ATTR_RO(broadcast); static int change_carrier(struct net_device *net, unsigned long new_carrier) { @@ -134,13 +143,13 @@ static int change_carrier(struct net_device *net, unsigned long new_carrier) return dev_change_carrier(net, (bool) new_carrier); } -static ssize_t store_carrier(struct device *dev, struct device_attribute *attr, - const char *buf, size_t len) +static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_carrier); } -static ssize_t show_carrier(struct device *dev, +static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); @@ -149,8 +158,9 @@ static ssize_t show_carrier(struct device *dev, } return -EINVAL; } +static DEVICE_ATTR_RW(carrier); -static ssize_t show_speed(struct device *dev, +static ssize_t speed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); @@ -167,8 +177,9 @@ static ssize_t show_speed(struct device *dev, rtnl_unlock(); return ret; } +static DEVICE_ATTR_RO(speed); -static ssize_t show_duplex(struct device *dev, +static ssize_t duplex_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); @@ -198,8 +209,9 @@ static ssize_t show_duplex(struct device *dev, rtnl_unlock(); return ret; } +static DEVICE_ATTR_RO(duplex); -static ssize_t show_dormant(struct device *dev, +static ssize_t dormant_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); @@ -209,6 +221,7 @@ static ssize_t show_dormant(struct device *dev, return -EINVAL; } +static DEVICE_ATTR_RO(dormant); static const char *const operstates[] = { "unknown", @@ -220,7 +233,7 @@ static const char *const operstates[] = { "up" }; -static ssize_t show_operstate(struct device *dev, +static ssize_t operstate_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); @@ -237,35 +250,33 @@ static ssize_t show_operstate(struct device *dev, return sprintf(buf, "%s\n", operstates[operstate]); } +static DEVICE_ATTR_RO(operstate); /* read-write attributes */ -NETDEVICE_SHOW(mtu, fmt_dec); static int change_mtu(struct net_device *net, unsigned long new_mtu) { return dev_set_mtu(net, (int) new_mtu); } -static ssize_t store_mtu(struct device *dev, struct device_attribute *attr, +static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_mtu); } - -NETDEVICE_SHOW(flags, fmt_hex); +NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *net, unsigned long new_flags) { return dev_change_flags(net, (unsigned int) new_flags); } -static ssize_t store_flags(struct device *dev, struct device_attribute *attr, +static ssize_t flags_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_flags); } - -NETDEVICE_SHOW(tx_queue_len, fmt_ulong); +NETDEVICE_SHOW_RW(flags, fmt_hex); static int change_tx_queue_len(struct net_device *net, unsigned long new_len) { @@ -273,7 +284,7 @@ static int change_tx_queue_len(struct net_device *net, unsigned long new_len) return 0; } -static ssize_t store_tx_queue_len(struct device *dev, +static ssize_t tx_queue_len_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -282,8 +293,9 @@ static ssize_t store_tx_queue_len(struct device *dev, return netdev_store(dev, attr, buf, len, change_tx_queue_len); } +NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); -static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, +static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct net_device *netdev = to_net_dev(dev); @@ -306,7 +318,7 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, return ret < 0 ? ret : len; } -static ssize_t show_ifalias(struct device *dev, +static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); @@ -319,8 +331,7 @@ static ssize_t show_ifalias(struct device *dev, rtnl_unlock(); return ret; } - -NETDEVICE_SHOW(group, fmt_dec); +static DEVICE_ATTR_RW(ifalias); static int change_group(struct net_device *net, unsigned long new_group) { @@ -328,35 +339,60 @@ static int change_group(struct net_device *net, unsigned long new_group) return 0; } -static ssize_t store_group(struct device *dev, struct device_attribute *attr, - const char *buf, size_t len) +static ssize_t group_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_group); } +NETDEVICE_SHOW(group, fmt_dec); +static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); + +static ssize_t phys_port_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; -static struct device_attribute net_class_attributes[] = { - __ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL), - __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), - __ATTR(dev_id, S_IRUGO, show_dev_id, NULL), - __ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias), - __ATTR(iflink, S_IRUGO, show_iflink, NULL), - __ATTR(ifindex, S_IRUGO, show_ifindex, NULL), - __ATTR(type, S_IRUGO, show_type, NULL), - __ATTR(link_mode, S_IRUGO, show_link_mode, NULL), - __ATTR(address, S_IRUGO, show_address, NULL), - __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), - __ATTR(carrier, S_IRUGO | S_IWUSR, show_carrier, store_carrier), - __ATTR(speed, S_IRUGO, show_speed, NULL), - __ATTR(duplex, S_IRUGO, show_duplex, NULL), - __ATTR(dormant, S_IRUGO, show_dormant, NULL), - __ATTR(operstate, S_IRUGO, show_operstate, NULL), - __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu), - __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), - __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, - store_tx_queue_len), - __ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group), - {} + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + struct netdev_phys_port_id ppid; + + ret = dev_get_phys_port_id(netdev, &ppid); + if (!ret) + ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_port_id); + +static struct attribute *net_class_attrs[] = { + &dev_attr_netdev_group.attr, + &dev_attr_type.attr, + &dev_attr_dev_id.attr, + &dev_attr_iflink.attr, + &dev_attr_ifindex.attr, + &dev_attr_addr_assign_type.attr, + &dev_attr_addr_len.attr, + &dev_attr_link_mode.attr, + &dev_attr_address.attr, + &dev_attr_broadcast.attr, + &dev_attr_speed.attr, + &dev_attr_duplex.attr, + &dev_attr_dormant.attr, + &dev_attr_operstate.attr, + &dev_attr_ifalias.attr, + &dev_attr_carrier.attr, + &dev_attr_mtu.attr, + &dev_attr_flags.attr, + &dev_attr_tx_queue_len.attr, + &dev_attr_phys_port_id.attr, + NULL, }; +ATTRIBUTE_GROUPS(net_class); /* Show a given an attribute in the statistics group */ static ssize_t netstat_show(const struct device *d, @@ -382,13 +418,13 @@ static ssize_t netstat_show(const struct device *d, /* generate a read-only statistics attribute */ #define NETSTAT_ENTRY(name) \ -static ssize_t show_##name(struct device *d, \ +static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *buf) \ { \ return netstat_show(d, attr, buf, \ offsetof(struct rtnl_link_stats64, name)); \ } \ -static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) +static DEVICE_ATTR_RO(name) NETSTAT_ENTRY(rx_packets); NETSTAT_ENTRY(tx_packets); @@ -457,6 +493,9 @@ static struct attribute_group wireless_group = { .attrs = wireless_attrs, }; #endif + +#else /* CONFIG_SYSFS */ +#define net_class_groups NULL #endif /* CONFIG_SYSFS */ #ifdef CONFIG_RPS @@ -1157,6 +1196,13 @@ static void remove_queue_kobjects(struct net_device *net) #endif } +static bool net_current_may_mount(void) +{ + struct net *net = current->nsproxy->net_ns; + + return ns_capable(net->user_ns, CAP_SYS_ADMIN); +} + static void *net_grab_current_ns(void) { struct net *ns = current->nsproxy->net_ns; @@ -1179,6 +1225,7 @@ static const void *net_netlink_ns(struct sock *sk) struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, + .current_may_mount = net_current_may_mount, .grab_current_ns = net_grab_current_ns, .netlink_ns = net_netlink_ns, .initial_ns = net_initial_ns, @@ -1229,9 +1276,7 @@ static const void *net_namespace(struct device *d) static struct class net_class = { .name = "net", .dev_release = netdev_release, -#ifdef CONFIG_SYSFS - .dev_attrs = net_class_attributes, -#endif /* CONFIG_SYSFS */ + .dev_groups = net_class_groups, .dev_uevent = netdev_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index f9765203675..81d3a9a0845 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -651,7 +651,7 @@ static int netns_install(struct nsproxy *nsproxy, void *ns) struct net *net = ns; if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; put_net(nsproxy->net_ns); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2c637e9a0b2..fc75c9e461b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -550,7 +550,7 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo return; proto = ntohs(eth_hdr(skb)->h_proto); - if (proto == ETH_P_IP) { + if (proto == ETH_P_ARP) { struct arphdr *arp; unsigned char *arp_ptr; /* No arp on this interface */ @@ -1284,15 +1284,14 @@ EXPORT_SYMBOL_GPL(__netpoll_free_async); void netpoll_cleanup(struct netpoll *np) { - if (!np->dev) - return; - rtnl_lock(); + if (!np->dev) + goto out; __netpoll_cleanup(np); - rtnl_unlock(); - dev_put(np->dev); np->dev = NULL; +out: + rtnl_unlock(); } EXPORT_SYMBOL(netpoll_cleanup); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index e533259dce3..d9cd627e6a1 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -29,12 +29,6 @@ #define PRIOMAP_MIN_SZ 128 -static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), - struct cgroup_netprio_state, css); -} - /* * Extend @dev->priomap so that it's large enough to accomodate * @target_idx. @dev->priomap.priomap_len > @target_idx after successful @@ -87,67 +81,70 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) /** * netprio_prio - return the effective netprio of a cgroup-net_device pair - * @cgrp: cgroup part of the target pair + * @css: css part of the target pair * @dev: net_device part of the target pair * * Should be called under RCU read or rtnl lock. */ -static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev) +static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); + int id = css->cgroup->id; - if (map && cgrp->id < map->priomap_len) - return map->priomap[cgrp->id]; + if (map && id < map->priomap_len) + return map->priomap[id]; return 0; } /** * netprio_set_prio - set netprio on a cgroup-net_device pair - * @cgrp: cgroup part of the target pair + * @css: css part of the target pair * @dev: net_device part of the target pair * @prio: prio to set * - * Set netprio to @prio on @cgrp-@dev pair. Should be called under rtnl + * Set netprio to @prio on @css-@dev pair. Should be called under rtnl * lock and may fail under memory pressure for non-zero @prio. */ -static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev, - u32 prio) +static int netprio_set_prio(struct cgroup_subsys_state *css, + struct net_device *dev, u32 prio) { struct netprio_map *map; + int id = css->cgroup->id; int ret; /* avoid extending priomap for zero writes */ map = rtnl_dereference(dev->priomap); - if (!prio && (!map || map->priomap_len <= cgrp->id)) + if (!prio && (!map || map->priomap_len <= id)) return 0; - ret = extend_netdev_table(dev, cgrp->id); + ret = extend_netdev_table(dev, id); if (ret) return ret; map = rtnl_dereference(dev->priomap); - map->priomap[cgrp->id] = prio; + map->priomap[id] = prio; return 0; } -static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { - struct cgroup_netprio_state *cs; + struct cgroup_subsys_state *css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); - if (!cs) + css = kzalloc(sizeof(*css), GFP_KERNEL); + if (!css) return ERR_PTR(-ENOMEM); - return &cs->css; + return css; } -static int cgrp_css_online(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup *parent = cgrp->parent; + struct cgroup_subsys_state *parent_css = css_parent(css); struct net_device *dev; int ret = 0; - if (!parent) + if (!parent_css) return 0; rtnl_lock(); @@ -156,9 +153,9 @@ static int cgrp_css_online(struct cgroup *cgrp) * onlining, there is no need to clear them on offline. */ for_each_netdev(&init_net, dev) { - u32 prio = netprio_prio(parent, dev); + u32 prio = netprio_prio(parent_css, dev); - ret = netprio_set_prio(cgrp, dev, prio); + ret = netprio_set_prio(css, dev, prio); if (ret) break; } @@ -166,29 +163,29 @@ static int cgrp_css_online(struct cgroup *cgrp) return ret; } -static void cgrp_css_free(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp_netprio_state(cgrp)); + kfree(css); } -static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) +static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { - return cgrp->id; + return css->cgroup->id; } -static int read_priomap(struct cgroup *cont, struct cftype *cft, +static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) - cb->fill(cb, dev->name, netprio_prio(cont, dev)); + cb->fill(cb, dev->name, netprio_prio(css, dev)); rcu_read_unlock(); return 0; } -static int write_priomap(struct cgroup *cgrp, struct cftype *cft, +static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { char devname[IFNAMSIZ + 1]; @@ -205,7 +202,7 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft, rtnl_lock(); - ret = netprio_set_prio(cgrp, dev, prio); + ret = netprio_set_prio(css, dev, prio); rtnl_unlock(); dev_put(dev); @@ -221,12 +218,13 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *p; void *v; - cgroup_taskset_for_each(p, cgrp, tset) { + cgroup_taskset_for_each(p, css, tset) { task_lock(p); v = (void *)(unsigned long)task_netprioidx(p); iterate_fd(p->files, 0, update_netprio, v); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 9640972ec50..261357a6630 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -160,6 +160,8 @@ #include <net/net_namespace.h> #include <net/checksum.h> #include <net/ipv6.h> +#include <net/udp.h> +#include <net/ip6_checksum.h> #include <net/addrconf.h> #ifdef CONFIG_XFRM #include <net/xfrm.h> @@ -198,6 +200,7 @@ #define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ #define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ #define F_NODE (1<<15) /* Node memory alloc*/ +#define F_UDPCSUM (1<<16) /* Include UDP checksum */ /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -631,6 +634,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->flags & F_UDPDST_RND) seq_printf(seq, "UDPDST_RND "); + if (pkt_dev->flags & F_UDPCSUM) + seq_printf(seq, "UDPCSUM "); + if (pkt_dev->flags & F_MPLS_RND) seq_printf(seq, "MPLS_RND "); @@ -1228,6 +1234,12 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "!NODE_ALLOC") == 0) pkt_dev->flags &= ~F_NODE; + else if (strcmp(f, "UDPCSUM") == 0) + pkt_dev->flags |= F_UDPCSUM; + + else if (strcmp(f, "!UDPCSUM") == 0) + pkt_dev->flags &= ~F_UDPCSUM; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", @@ -2733,7 +2745,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, udph->source = htons(pkt_dev->cur_udp_src); udph->dest = htons(pkt_dev->cur_udp_dst); udph->len = htons(datalen + 8); /* DATA + udphdr */ - udph->check = 0; /* No checksum */ + udph->check = 0; iph->ihl = 5; iph->version = 4; @@ -2747,11 +2759,28 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, iph->frag_off = 0; iplen = 20 + 8 + datalen; iph->tot_len = htons(iplen); - iph->check = 0; - iph->check = ip_fast_csum((void *)iph, iph->ihl); + ip_send_check(iph); skb->protocol = protocol; skb->dev = odev; skb->pkt_type = PACKET_HOST; + + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & NETIF_F_V4_CSUM) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + udp4_hwcsum(skb, udph->source, udph->dest); + } else { + __wsum csum = udp_csum(skb); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_tcpudp_magic(udph->source, udph->dest, + datalen + 8, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + pktgen_finalize_skb(pkt_dev, skb, datalen); #ifdef CONFIG_XFRM @@ -2768,7 +2797,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, struct sk_buff *skb = NULL; __u8 *eth; struct udphdr *udph; - int datalen; + int datalen, udplen; struct ipv6hdr *iph; __be16 protocol = htons(ETH_P_IPV6); __be32 *mpls; @@ -2844,10 +2873,11 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, net_info_ratelimited("increased datalen to %d\n", datalen); } + udplen = datalen + sizeof(struct udphdr); udph->source = htons(pkt_dev->cur_udp_src); udph->dest = htons(pkt_dev->cur_udp_dst); - udph->len = htons(datalen + sizeof(struct udphdr)); - udph->check = 0; /* No checksum */ + udph->len = htons(udplen); + udph->check = 0; *(__be32 *) iph = htonl(0x60000000); /* Version + flow */ @@ -2858,7 +2888,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, iph->hop_limit = 32; - iph->payload_len = htons(sizeof(struct udphdr) + datalen); + iph->payload_len = htons(udplen); iph->nexthdr = IPPROTO_UDP; iph->daddr = pkt_dev->cur_in6_daddr; @@ -2868,6 +2898,23 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb->dev = odev; skb->pkt_type = PACKET_HOST; + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & NETIF_F_V6_CSUM) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0); + } else { + __wsum csum = udp_csum(skb); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + pktgen_finalize_skb(pkt_dev, skb, datalen); return skb; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ca198c1d1d3..2a0e21de306 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -767,7 +767,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ - + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ + + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -846,6 +847,24 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + struct netdev_phys_port_id ppid; + + err = dev_get_phys_port_id(dev, &ppid); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -913,6 +932,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } + if (rtnl_phys_port_id_fill(skb, dev)) + goto nla_put_failure; + attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (attr == NULL) @@ -1113,6 +1135,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, }; EXPORT_SYMBOL(ifla_policy); @@ -1844,10 +1867,10 @@ replay: else err = register_netdevice(dev); - if (err < 0 && !IS_ERR(dev)) + if (err < 0) { free_netdev(dev); - if (err < 0) goto out; + } err = rtnl_configure_link(dev, ifm); if (err < 0) diff --git a/net/core/scm.c b/net/core/scm.c index 03795d0147f..b442e7e25e6 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -54,11 +54,11 @@ static __inline__ int scm_check_creds(struct ucred *creds) return -EINVAL; if ((creds->pid == task_tgid_vnr(current) || - ns_capable(current->nsproxy->pid_ns->user_ns, CAP_SYS_ADMIN)) && + ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) && ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || - uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) && + uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) && ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || - gid_eq(gid, cred->sgid)) || nsown_capable(CAP_SETGID))) { + gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) { return 0; } return -EPERM; diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 6a2f13cee86..3f1ec1586ae 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -10,11 +10,24 @@ #include <net/secure_seq.h> -static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; +#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) -void net_secret_init(void) +static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; + +static void net_secret_init(void) { - get_random_bytes(net_secret, sizeof(net_secret)); + u32 tmp; + int i; + + if (likely(net_secret[0])) + return; + + for (i = NET_SECRET_SIZE; i > 0;) { + do { + get_random_bytes(&tmp, sizeof(tmp)); + } while (!tmp); + cmpxchg(&net_secret[--i], 0, tmp); + } } #ifdef CONFIG_INET @@ -42,6 +55,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, u32 hash[MD5_DIGEST_WORDS]; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + (__force u32)daddr[i]; @@ -63,6 +77,7 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, u32 hash[MD5_DIGEST_WORDS]; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + (__force u32) daddr[i]; @@ -82,6 +97,7 @@ __u32 secure_ip_id(__be32 daddr) { u32 hash[MD5_DIGEST_WORDS]; + net_secret_init(); hash[0] = (__force __u32) daddr; hash[1] = net_secret[13]; hash[2] = net_secret[14]; @@ -96,6 +112,7 @@ __u32 secure_ipv6_id(const __be32 daddr[4]) { __u32 hash[4]; + net_secret_init(); memcpy(hash, daddr, 16); md5_transform(hash, net_secret); @@ -107,6 +124,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, { u32 hash[MD5_DIGEST_WORDS]; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = ((__force u16)sport << 16) + (__force u16)dport; @@ -121,6 +139,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { u32 hash[MD5_DIGEST_WORDS]; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = (__force u32)dport ^ net_secret[14]; @@ -140,6 +159,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, u32 hash[MD5_DIGEST_WORDS]; u64 seq; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = ((__force u16)sport << 16) + (__force u16)dport; @@ -164,6 +184,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, u64 seq; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + daddr[i]; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2c3d0f53d19..d81cff119f7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3500,17 +3500,22 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, EXPORT_SYMBOL(skb_try_coalesce); /** - * skb_scrub_packet - scrub an skb before sending it to another netns + * skb_scrub_packet - scrub an skb * * @skb: buffer to clean - * - * skb_scrub_packet can be used to clean an skb before injecting it in - * another namespace. We have to clear all information in the skb that - * could impact namespace isolation. + * @xnet: packet is crossing netns + * + * skb_scrub_packet can be used after encapsulating or decapsulting a packet + * into/from a tunnel. Some information have to be cleared during these + * operations. + * skb_scrub_packet can also be used to clean a skb before injecting it in + * another namespace (@xnet == true). We have to clear all information in the + * skb that could impact namespace isolation. */ -void skb_scrub_packet(struct sk_buff *skb) +void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb_orphan(skb); + if (xnet) + skb_orphan(skb); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; diff --git a/net/core/sock.c b/net/core/sock.c index 2c097c5a35d..5b6beba494a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -93,6 +93,7 @@ #include <linux/capability.h> #include <linux/errno.h> +#include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> @@ -1575,6 +1576,25 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +void skb_orphan_partial(struct sk_buff *skb) +{ + /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, + * so we do not completely orphan skb, but transfert all + * accounted bytes but one, to avoid unexpected reorders. + */ + if (skb->destructor == sock_wfree +#ifdef CONFIG_INET + || skb->destructor == tcp_wfree +#endif + ) { + atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); + skb->truesize = 1; + } else { + skb_orphan(skb); + } +} +EXPORT_SYMBOL(skb_orphan_partial); + /* * Read buffer destructor automatically called from kfree_skb. */ @@ -1721,24 +1741,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, - int *errcode) + int *errcode, int max_page_order) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; + unsigned long chunk; gfp_t gfp_mask; long timeo; int err; int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + struct page *page; + int i; err = -EMSGSIZE; if (npages > MAX_SKB_FRAGS) goto failure; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; - timeo = sock_sndtimeo(sk, noblock); - while (1) { + while (!skb) { err = sock_error(sk); if (err != 0) goto failure; @@ -1747,50 +1766,52 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - __skb_fill_page_desc(skb, i, - page, 0, - (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len)); - data_len -= PAGE_SIZE; - } + if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + continue; + } - /* Full success... */ - break; - } - err = -ENOBUFS; + err = -ENOBUFS; + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + skb = alloc_skb(header_len, gfp_mask); + if (!skb) goto failure; + + skb->truesize += data_len; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(sk->sk_allocation | + __GFP_COMP | __GFP_NOWARN, + order); + if (page) + goto fill_page; + } + order--; + } + page = alloc_page(sk->sk_allocation); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; } - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); } skb_set_owner_w(skb, sk); @@ -1799,6 +1820,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, interrupted: err = sock_intr_errno(timeo); failure: + kfree_skb(skb); *errcode = err; return NULL; } @@ -1807,7 +1829,7 @@ EXPORT_SYMBOL(sock_alloc_send_pskb); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } EXPORT_SYMBOL(sock_alloc_send_skb); @@ -2425,6 +2447,52 @@ void sock_enable_timestamp(struct sock *sk, int flag) } } +int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, + int level, int type) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + int copied, err; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else + spin_unlock_bh(&sk->sk_error_queue.lock); + +out_free_skb: + kfree_skb(skb); +out: + return err; +} +EXPORT_SYMBOL(sock_recv_errqueue); + /* * Get a socket option on an socket. * diff --git a/net/core/stream.c b/net/core/stream.c index f5df85dcd20..512f0a24269 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -30,7 +30,7 @@ void sk_stream_write_space(struct sock *sk) struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { + if (sk_stream_is_writeable(sk) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 31107abd278..cca44419090 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -20,6 +20,7 @@ #include <net/sock.h> #include <net/net_ratelimit.h> #include <net/busy_poll.h> +#include <net/pkt_sched.h> static int zero = 0; static int one = 1; @@ -193,6 +194,26 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, } #endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_SCHED +static int set_default_qdisc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char id[IFNAMSIZ]; + struct ctl_table tbl = { + .data = id, + .maxlen = IFNAMSIZ, + }; + int ret; + + qdisc_get_default(id, IFNAMSIZ); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = qdisc_set_default(id); + return ret; +} +#endif + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -315,7 +336,14 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -# +#endif +#ifdef CONFIG_NET_SCHED + { + .procname = "default_qdisc", + .mode = 0644, + .maxlen = IFNAMSIZ, + .proc_handler = set_default_qdisc + }, #endif #endif /* CONFIG_NET */ { |