diff options
Diffstat (limited to 'net')
193 files changed, 3592 insertions, 2008 deletions
diff --git a/net/802/mrp.c b/net/802/mrp.c index e085bcc754f..1eb05d80b07 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -871,10 +871,10 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) */ del_timer_sync(&app->join_timer); - spin_lock(&app->lock); + spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_pdu_queue(app); - spin_unlock(&app->lock); + spin_unlock_bh(&app->lock); mrp_queue_xmit(app); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 9424f3718ea..2fb2d88e8c2 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -341,7 +341,7 @@ static void __vlan_device_event(struct net_device *dev, unsigned long event) static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; diff --git a/net/Kconfig b/net/Kconfig index 2ddc9046868..d6a9ce6e180 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -218,6 +218,7 @@ source "net/batman-adv/Kconfig" source "net/openvswitch/Kconfig" source "net/vmw_vsock/Kconfig" source "net/netlink/Kconfig" +source "net/mpls/Kconfig" config RPS boolean @@ -242,6 +243,18 @@ config NETPRIO_CGROUP Cgroup subsystem for use in assigning processes to network priorities on a per-interface basis +config NET_LL_RX_POLL + bool "Low Latency Receive Poll" + depends on X86_TSC + default n + ---help--- + Support Low Latency Receive Queue Poll. + (For network card drivers which support this option.) + When waiting for data in read or poll call directly into the the device driver + to flush packets which may be pending on the device queues into the stack. + + If unsure, say N. + config BQL boolean depends on SYSFS @@ -259,6 +272,18 @@ config BPF_JIT packet sniffing (libpcap/tcpdump). Note : Admin should enable this feature changing /proc/sys/net/core/bpf_jit_enable +config NET_FLOW_LIMIT + boolean + depends on RPS + default y + ---help--- + The network stack has to drop packets when a receive processing CPU's + backlog reaches netdev_max_backlog. If a few out of many active flows + generate the vast majority of load, drop their traffic earlier to + maintain capacity for the other flows. This feature provides servers + with many clients some protection against DoS by a single (spoofed) + flow that greatly exceeds average workload. + menu "Network testing" config NET_PKTGEN diff --git a/net/Makefile b/net/Makefile index 091e7b04f30..9492e8cb64e 100644 --- a/net/Makefile +++ b/net/Makefile @@ -70,3 +70,4 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ obj-$(CONFIG_NFC) += nfc/ obj-$(CONFIG_OPENVSWITCH) += openvswitch/ obj-$(CONFIG_VSOCKETS) += vmw_vsock/ +obj-$(CONFIG_NET_MPLS_GSO) += mpls/ diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 173a2e82f48..690356fa52b 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -332,7 +332,7 @@ static void aarp_expire_timeout(unsigned long unused) static int aarp_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); int ct; if (!net_eq(dev_net(dev), &init_net)) diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index ef12839a7cf..7fee50d637f 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -644,7 +644,7 @@ static inline void atalk_dev_down(struct net_device *dev) static int ddp_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/atm/clip.c b/net/atm/clip.c index 8ae3a787933..8215f7cb170 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -539,9 +539,9 @@ static int clip_create(int number) } static int clip_device_event(struct notifier_block *this, unsigned long event, - void *arg) + void *ptr) { - struct net_device *dev = arg; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; @@ -575,6 +575,7 @@ static int clip_inet_event(struct notifier_block *this, unsigned long event, void *ifa) { struct in_device *in_dev; + struct netdev_notifier_info info; in_dev = ((struct in_ifaddr *)ifa)->ifa_dev; /* @@ -583,7 +584,8 @@ static int clip_inet_event(struct notifier_block *this, unsigned long event, */ if (event != NETDEV_UP) return NOTIFY_DONE; - return clip_device_event(this, NETDEV_CHANGE, in_dev->dev); + netdev_notifier_info_init(&info, in_dev->dev); + return clip_device_event(this, NETDEV_CHANGE, &info); } static struct notifier_block clip_dev_notifier = { diff --git a/net/atm/mpc.c b/net/atm/mpc.c index d4cc1be5c36..3af12755cd0 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -998,14 +998,12 @@ int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc) } static int mpoa_event_listener(struct notifier_block *mpoa_notifier, - unsigned long event, void *dev_ptr) + unsigned long event, void *ptr) { - struct net_device *dev; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct mpoa_client *mpc; struct lec_priv *priv; - dev = dev_ptr; - if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index e277e38f736..4b4d2b779ec 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -111,9 +111,9 @@ again: * Handle device status changes. */ static int ax25_device_event(struct notifier_block *this, unsigned long event, - void *ptr) + void *ptr) { - struct net_device *dev = (struct net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; @@ -1974,7 +1974,7 @@ static struct packet_type ax25_packet_type __read_mostly = { }; static struct notifier_block ax25_dev_notifier = { - .notifier_call =ax25_device_event, + .notifier_call = ax25_device_event, }; static int __init ax25_init(void) diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index acbac2a9c62..489bb36f1b9 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -32,7 +32,6 @@ batman-adv-y += icmp_socket.o batman-adv-y += main.o batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o -batman-adv-y += ring_buffer.o batman-adv-y += routing.o batman-adv-y += send.o batman-adv-y += soft-interface.o diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 071f288b77a..d07323b3e9b 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -19,7 +19,6 @@ #include "main.h" #include "translation-table.h" -#include "ring_buffer.h" #include "originator.h" #include "routing.h" #include "gateway_common.h" @@ -29,16 +28,57 @@ #include "bat_algo.h" #include "network-coding.h" +/** + * batadv_ring_buffer_set - update the ring buffer with the given value + * @lq_recv: pointer to the ring buffer + * @lq_index: index to store the value at + * @value: value to store in the ring buffer + */ +static void batadv_ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, + uint8_t value) +{ + lq_recv[*lq_index] = value; + *lq_index = (*lq_index + 1) % BATADV_TQ_GLOBAL_WINDOW_SIZE; +} + +/** + * batadv_ring_buffer_set - compute the average of all non-zero values stored + * in the given ring buffer + * @lq_recv: pointer to the ring buffer + * + * Returns computed average value. + */ +static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[]) +{ + const uint8_t *ptr; + uint16_t count = 0, i = 0, sum = 0; + + ptr = lq_recv; + + while (i < BATADV_TQ_GLOBAL_WINDOW_SIZE) { + if (*ptr != 0) { + count++; + sum += *ptr; + } + + i++; + ptr++; + } + + if (count == 0) + return 0; + + return (uint8_t)(sum / count); +} static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, const uint8_t *neigh_addr, struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh, __be32 seqno) + struct batadv_orig_node *orig_neigh) { struct batadv_neigh_node *neigh_node; - neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, - ntohl(seqno)); + neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr); if (!neigh_node) goto out; @@ -413,18 +453,16 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, else skb_size = packet_len; - skb_size += ETH_HLEN + NET_IP_ALIGN; + skb_size += ETH_HLEN; - forw_packet_aggr->skb = dev_alloc_skb(skb_size); + forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size); if (!forw_packet_aggr->skb) { if (!own_packet) atomic_inc(&bat_priv->batman_queue_left); kfree(forw_packet_aggr); goto out; } - skb_reserve(forw_packet_aggr->skb, ETH_HLEN + NET_IP_ALIGN); - - INIT_HLIST_NODE(&forw_packet_aggr->list); + skb_reserve(forw_packet_aggr->skb, ETH_HLEN); skb_buff = skb_put(forw_packet_aggr->skb, packet_len); forw_packet_aggr->packet_len = packet_len; @@ -590,6 +628,41 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, if_incoming, 0, batadv_iv_ogm_fwd_send_time()); } +/** + * batadv_iv_ogm_slide_own_bcast_window - bitshift own OGM broadcast windows for + * the given interface + * @hard_iface: the interface for which the windows have to be shifted + */ +static void +batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct hlist_head *head; + struct batadv_orig_node *orig_node; + unsigned long *word; + uint32_t i; + size_t word_index; + uint8_t *w; + + for (i = 0; i < hash->size; i++) { + head = &hash->table[i]; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + spin_lock_bh(&orig_node->ogm_cnt_lock); + word_index = hard_iface->if_num * BATADV_NUM_WORDS; + word = &(orig_node->bcast_own[word_index]); + + batadv_bit_get_packet(bat_priv, word, 1, 0); + w = &orig_node->bcast_own_sum[hard_iface->if_num]; + *w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE); + spin_unlock_bh(&orig_node->ogm_cnt_lock); + } + rcu_read_unlock(); + } +} + static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); @@ -634,7 +707,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) batadv_ogm_packet->gw_flags = BATADV_NO_FLAGS; } - batadv_slide_own_bcast_window(hard_iface); + batadv_iv_ogm_slide_own_bcast_window(hard_iface); batadv_iv_ogm_queue_add(bat_priv, hard_iface->bat_iv.ogm_buff, hard_iface->bat_iv.ogm_buff_len, hard_iface, 1, batadv_iv_ogm_emit_send_time(bat_priv)); @@ -670,7 +743,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && tmp_neigh_node->if_incoming == if_incoming && atomic_inc_not_zero(&tmp_neigh_node->refcount)) { - if (neigh_node) + if (WARN(neigh_node, "too many matching neigh_nodes")) batadv_neigh_node_free_ref(neigh_node); neigh_node = tmp_neigh_node; continue; @@ -696,8 +769,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, neigh_node = batadv_iv_ogm_neigh_new(if_incoming, ethhdr->h_source, - orig_node, orig_tmp, - batadv_ogm_packet->seqno); + orig_node, orig_tmp); batadv_orig_node_free_ref(orig_tmp); if (!neigh_node) @@ -829,8 +901,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, neigh_node = batadv_iv_ogm_neigh_new(if_incoming, orig_neigh_node->orig, orig_neigh_node, - orig_neigh_node, - batadv_ogm_packet->seqno); + orig_neigh_node); if (!neigh_node) goto out; @@ -991,7 +1062,7 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, struct batadv_neigh_node *orig_neigh_router = NULL; int has_directlink_flag; int is_my_addr = 0, is_my_orig = 0, is_my_oldorig = 0; - int is_broadcast = 0, is_bidirect; + int is_bidirect; bool is_single_hop_neigh = false; bool is_from_best_next_hop = false; int is_duplicate, sameseq, simlar_ttl; @@ -1054,19 +1125,9 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, if (batadv_compare_eth(batadv_ogm_packet->prev_sender, hard_iface->net_dev->dev_addr)) is_my_oldorig = 1; - - if (is_broadcast_ether_addr(ethhdr->h_source)) - is_broadcast = 1; } rcu_read_unlock(); - if (batadv_ogm_packet->header.version != BATADV_COMPAT_VERSION) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Drop packet: incompatible batman version (%i)\n", - batadv_ogm_packet->header.version); - return; - } - if (is_my_addr) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: received my own broadcast (sender: %pM)\n", @@ -1074,13 +1135,6 @@ static void batadv_iv_ogm_process(const struct ethhdr *ethhdr, return; } - if (is_broadcast) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Drop packet: ignoring all packets with broadcast source addr (sender: %pM)\n", - ethhdr->h_source); - return; - } - if (is_my_orig) { unsigned long *word; int offset; @@ -1288,7 +1342,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, skb->len + ETH_HLEN); packet_len = skb_headlen(skb); - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); packet_buff = skb->data; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff; diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 379061c7254..e9d8e0b3c3d 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -180,7 +180,7 @@ static struct batadv_bla_claim */ static struct batadv_bla_backbone_gw * batadv_backbone_hash_find(struct batadv_priv *bat_priv, - uint8_t *addr, short vid) + uint8_t *addr, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; struct hlist_head *head; @@ -257,7 +257,7 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...) */ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, - short vid, int claimtype) + unsigned short vid, int claimtype) { struct sk_buff *skb; struct ethhdr *ethhdr; @@ -307,7 +307,8 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, */ memcpy(ethhdr->h_source, mac, ETH_ALEN); batadv_dbg(BATADV_DBG_BLA, bat_priv, - "bla_send_claim(): CLAIM %pM on vid %d\n", mac, vid); + "bla_send_claim(): CLAIM %pM on vid %d\n", mac, + BATADV_PRINT_VID(vid)); break; case BATADV_CLAIM_TYPE_UNCLAIM: /* unclaim frame @@ -316,7 +317,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, memcpy(hw_src, mac, ETH_ALEN); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_send_claim(): UNCLAIM %pM on vid %d\n", mac, - vid); + BATADV_PRINT_VID(vid)); break; case BATADV_CLAIM_TYPE_ANNOUNCE: /* announcement frame @@ -325,7 +326,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, memcpy(hw_src, mac, ETH_ALEN); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_send_claim(): ANNOUNCE of %pM on vid %d\n", - ethhdr->h_source, vid); + ethhdr->h_source, BATADV_PRINT_VID(vid)); break; case BATADV_CLAIM_TYPE_REQUEST: /* request frame @@ -335,13 +336,15 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, memcpy(hw_src, mac, ETH_ALEN); memcpy(ethhdr->h_dest, mac, ETH_ALEN); batadv_dbg(BATADV_DBG_BLA, bat_priv, - "bla_send_claim(): REQUEST of %pM to %pMon vid %d\n", - ethhdr->h_source, ethhdr->h_dest, vid); + "bla_send_claim(): REQUEST of %pM to %pM on vid %d\n", + ethhdr->h_source, ethhdr->h_dest, + BATADV_PRINT_VID(vid)); break; } - if (vid != -1) - skb = vlan_insert_tag(skb, htons(ETH_P_8021Q), vid); + if (vid & BATADV_VLAN_HAS_TAG) + skb = vlan_insert_tag(skb, htons(ETH_P_8021Q), + vid & VLAN_VID_MASK); skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, soft_iface); @@ -367,7 +370,7 @@ out: */ static struct batadv_bla_backbone_gw * batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, uint8_t *orig, - short vid, bool own_backbone) + unsigned short vid, bool own_backbone) { struct batadv_bla_backbone_gw *entry; struct batadv_orig_node *orig_node; @@ -380,7 +383,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, uint8_t *orig, batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_get_backbone_gw(): not found (%pM, %d), creating new entry\n", - orig, vid); + orig, BATADV_PRINT_VID(vid)); entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) @@ -434,7 +437,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, uint8_t *orig, static void batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - short vid) + unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -456,7 +459,7 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, */ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - short vid) + unsigned short vid) { struct hlist_head *head; struct batadv_hashtable *hash; @@ -547,7 +550,7 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, * @backbone_gw: the backbone gateway which claims it */ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, - const uint8_t *mac, const short vid, + const uint8_t *mac, const unsigned short vid, struct batadv_bla_backbone_gw *backbone_gw) { struct batadv_bla_claim *claim; @@ -572,7 +575,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, atomic_set(&claim->refcount, 2); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_add_claim(): adding new entry %pM, vid %d to hash ...\n", - mac, vid); + mac, BATADV_PRINT_VID(vid)); hash_added = batadv_hash_add(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim, @@ -591,7 +594,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_add_claim(): changing ownership for %pM, vid %d\n", - mac, vid); + mac, BATADV_PRINT_VID(vid)); claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); batadv_backbone_gw_free_ref(claim->backbone_gw); @@ -611,7 +614,7 @@ claim_free_ref: * given mac address and vid. */ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, - const uint8_t *mac, const short vid) + const uint8_t *mac, const unsigned short vid) { struct batadv_bla_claim search_claim, *claim; @@ -622,7 +625,7 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, return; batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_del_claim(): %pM, vid %d\n", - mac, vid); + mac, BATADV_PRINT_VID(vid)); batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim); @@ -637,7 +640,7 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, /* check for ANNOUNCE frame, return 1 if handled */ static int batadv_handle_announce(struct batadv_priv *bat_priv, uint8_t *an_addr, uint8_t *backbone_addr, - short vid) + unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; uint16_t crc; @@ -658,12 +661,13 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_BLA, bat_priv, "handle_announce(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n", - vid, backbone_gw->orig, crc); + BATADV_PRINT_VID(vid), backbone_gw->orig, crc); if (backbone_gw->crc != crc) { batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "handle_announce(): CRC FAILED for %pM/%d (my = %#.4x, sent = %#.4x)\n", - backbone_gw->orig, backbone_gw->vid, + backbone_gw->orig, + BATADV_PRINT_VID(backbone_gw->vid), backbone_gw->crc, crc); batadv_bla_send_request(backbone_gw); @@ -685,7 +689,7 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, static int batadv_handle_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, uint8_t *backbone_addr, - struct ethhdr *ethhdr, short vid) + struct ethhdr *ethhdr, unsigned short vid) { /* check for REQUEST frame */ if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest)) @@ -699,7 +703,7 @@ static int batadv_handle_request(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_BLA, bat_priv, "handle_request(): REQUEST vid %d (sent by %pM)...\n", - vid, ethhdr->h_source); + BATADV_PRINT_VID(vid), ethhdr->h_source); batadv_bla_answer_request(bat_priv, primary_if, vid); return 1; @@ -709,7 +713,7 @@ static int batadv_handle_request(struct batadv_priv *bat_priv, static int batadv_handle_unclaim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, uint8_t *backbone_addr, - uint8_t *claim_addr, short vid) + uint8_t *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -727,7 +731,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, /* this must be an UNCLAIM frame */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "handle_unclaim(): UNCLAIM %pM on vid %d (sent by %pM)...\n", - claim_addr, vid, backbone_gw->orig); + claim_addr, BATADV_PRINT_VID(vid), backbone_gw->orig); batadv_bla_del_claim(bat_priv, claim_addr, vid); batadv_backbone_gw_free_ref(backbone_gw); @@ -738,7 +742,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, static int batadv_handle_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, uint8_t *backbone_addr, uint8_t *claim_addr, - short vid) + unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -861,14 +865,15 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, struct batadv_bla_claim_dst *bla_dst; uint16_t proto; int headlen; - short vid = -1; + unsigned short vid = BATADV_NO_FLAGS; int ret; - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); if (ntohs(ethhdr->h_proto) == ETH_P_8021Q) { vhdr = (struct vlan_ethhdr *)ethhdr; vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK; + vid |= BATADV_VLAN_HAS_TAG; proto = ntohs(vhdr->h_vlan_encapsulated_proto); headlen = sizeof(*vhdr); } else { @@ -885,7 +890,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, return 0; /* pskb_may_pull() may have modified the pointers, get ethhdr again */ - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); arphdr = (struct arphdr *)((uint8_t *)ethhdr + headlen); /* Check whether the ARP frame carries a valid @@ -910,7 +915,8 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, if (ret == 1) batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_process_claim(): received a claim frame from another group. From: %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", - ethhdr->h_source, vid, hw_src, hw_dst); + ethhdr->h_source, BATADV_PRINT_VID(vid), hw_src, + hw_dst); if (ret < 2) return ret; @@ -945,7 +951,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_process_claim(): ERROR - this looks like a claim frame, but is useless. eth src %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", - ethhdr->h_source, vid, hw_src, hw_dst); + ethhdr->h_source, BATADV_PRINT_VID(vid), hw_src, hw_dst); return 1; } @@ -1358,7 +1364,7 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb, struct ethhdr *ethhdr; struct vlan_ethhdr *vhdr; struct batadv_bla_backbone_gw *backbone_gw; - short vid = -1; + unsigned short vid = BATADV_NO_FLAGS; if (!atomic_read(&orig_node->bat_priv->bridge_loop_avoidance)) return 0; @@ -1375,6 +1381,7 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb, vhdr = (struct vlan_ethhdr *)(skb->data + hdr_size); vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK; + vid |= BATADV_VLAN_HAS_TAG; } /* see if this originator is a backbone gw for this VLAN */ @@ -1424,15 +1431,15 @@ void batadv_bla_free(struct batadv_priv *bat_priv) * returns 1, otherwise it returns 0 and the caller shall further * process the skb. */ -int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, short vid, - bool is_bcast) +int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid, bool is_bcast) { struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; int ret; - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) @@ -1519,7 +1526,8 @@ out: * returns 1, otherwise it returns 0 and the caller shall further * process the skb. */ -int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, short vid) +int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid) { struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; @@ -1539,7 +1547,7 @@ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, short vid) if (batadv_bla_process_claim(bat_priv, primary_if, skb)) goto handled; - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); if (unlikely(atomic_read(&bat_priv->bla.num_requests))) /* don't allow broadcasts while requests are in flight */ @@ -1623,8 +1631,8 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) hlist_for_each_entry_rcu(claim, head, hash_entry) { is_own = batadv_compare_eth(claim->backbone_gw->orig, primary_addr); - seq_printf(seq, " * %pM on % 5d by %pM [%c] (%#.4x)\n", - claim->addr, claim->vid, + seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n", + claim->addr, BATADV_PRINT_VID(claim->vid), claim->backbone_gw->orig, (is_own ? 'x' : ' '), claim->backbone_gw->crc); @@ -1676,10 +1684,10 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) if (is_own) continue; - seq_printf(seq, - " * %pM on % 5d % 4i.%03is (%#.4x)\n", - backbone_gw->orig, backbone_gw->vid, - secs, msecs, backbone_gw->crc); + seq_printf(seq, " * %pM on %5d %4i.%03is (%#.4x)\n", + backbone_gw->orig, + BATADV_PRINT_VID(backbone_gw->vid), secs, + msecs, backbone_gw->crc); } rcu_read_unlock(); } diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index dea2fbc5d98..4b102e71e5b 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -21,9 +21,10 @@ #define _NET_BATMAN_ADV_BLA_H_ #ifdef CONFIG_BATMAN_ADV_BLA -int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, short vid, - bool is_bcast); -int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, short vid); +int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid, bool is_bcast); +int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid); int batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size); int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset); @@ -42,13 +43,14 @@ void batadv_bla_free(struct batadv_priv *bat_priv); #else /* ifdef CONFIG_BATMAN_ADV_BLA */ static inline int batadv_bla_rx(struct batadv_priv *bat_priv, - struct sk_buff *skb, short vid, bool is_bcast) + struct sk_buff *skb, unsigned short vid, + bool is_bcast) { return 0; } static inline int batadv_bla_tx(struct batadv_priv *bat_priv, - struct sk_buff *skb, short vid) + struct sk_buff *skb, unsigned short vid) { return 0; } diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 8e15d966d9b..06345d40158 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -45,9 +45,9 @@ static void batadv_dat_start_timer(struct batadv_priv *bat_priv) } /** - * batadv_dat_entry_free_ref - decrements the dat_entry refcounter and possibly + * batadv_dat_entry_free_ref - decrement the dat_entry refcounter and possibly * free it - * @dat_entry: the oentry to free + * @dat_entry: the entry to free */ static void batadv_dat_entry_free_ref(struct batadv_dat_entry *dat_entry) { @@ -56,10 +56,10 @@ static void batadv_dat_entry_free_ref(struct batadv_dat_entry *dat_entry) } /** - * batadv_dat_to_purge - checks whether a dat_entry has to be purged or not + * batadv_dat_to_purge - check whether a dat_entry has to be purged or not * @dat_entry: the entry to check * - * Returns true if the entry has to be purged now, false otherwise + * Returns true if the entry has to be purged now, false otherwise. */ static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry) { @@ -75,8 +75,8 @@ static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry) * returns a boolean value: true is the entry has to be deleted, * false otherwise * - * Loops over each entry in the DAT local storage and delete it if and only if - * the to_purge function passed as argument returns true + * Loops over each entry in the DAT local storage and deletes it if and only if + * the to_purge function passed as argument returns true. */ static void __batadv_dat_purge(struct batadv_priv *bat_priv, bool (*to_purge)(struct batadv_dat_entry *)) @@ -97,7 +97,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv, spin_lock_bh(list_lock); hlist_for_each_entry_safe(dat_entry, node_tmp, head, hash_entry) { - /* if an helper function has been passed as parameter, + /* if a helper function has been passed as parameter, * ask it if the entry has to be purged or not */ if (to_purge && !to_purge(dat_entry)) @@ -134,7 +134,7 @@ static void batadv_dat_purge(struct work_struct *work) * @node: node in the local table * @data2: second object to compare the node to * - * Returns 1 if the two entry are the same, 0 otherwise + * Returns 1 if the two entries are the same, 0 otherwise. */ static int batadv_compare_dat(const struct hlist_node *node, const void *data2) { @@ -149,7 +149,7 @@ static int batadv_compare_dat(const struct hlist_node *node, const void *data2) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the hw_src field in the ARP packet + * Returns the value of the hw_src field in the ARP packet. */ static uint8_t *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) { @@ -166,7 +166,7 @@ static uint8_t *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the ip_src field in the ARP packet + * Returns the value of the ip_src field in the ARP packet. */ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) { @@ -178,7 +178,7 @@ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the hw_dst field in the ARP packet + * Returns the value of the hw_dst field in the ARP packet. */ static uint8_t *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) { @@ -190,7 +190,7 @@ static uint8_t *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * - * Returns the value of the ip_dst field in the ARP packet + * Returns the value of the ip_dst field in the ARP packet. */ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) { @@ -202,7 +202,7 @@ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) * @data: data to hash * @size: size of the hash table * - * Returns the selected index in the hash table for the given data + * Returns the selected index in the hash table for the given data. */ static uint32_t batadv_hash_dat(const void *data, uint32_t size) { @@ -224,12 +224,12 @@ static uint32_t batadv_hash_dat(const void *data, uint32_t size) } /** - * batadv_dat_entry_hash_find - looks for a given dat_entry in the local hash + * batadv_dat_entry_hash_find - look for a given dat_entry in the local hash * table * @bat_priv: the bat priv with all the soft interface information * @ip: search key * - * Returns the dat_entry if found, NULL otherwise + * Returns the dat_entry if found, NULL otherwise. */ static struct batadv_dat_entry * batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip) @@ -343,9 +343,6 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, if (hdr_size == 0) return; - /* if the ARP packet is encapsulated in a batman packet, let's print - * some debug messages - */ unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data; switch (unicast_4addr_packet->u.header.packet_type) { @@ -409,7 +406,8 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, * @candidate: orig_node under evaluation * @max_orig_node: last selected candidate * - * Returns true if the node has been elected as next candidate or false othrwise + * Returns true if the node has been elected as next candidate or false + * otherwise. */ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, int select, batadv_dat_addr_t tmp_max, @@ -472,7 +470,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, */ cands[select].type = BATADV_DAT_CANDIDATE_NOT_FOUND; - /* iterate over the originator list and find the node with closest + /* iterate over the originator list and find the node with the closest * dat_address which has not been selected yet */ for (i = 0; i < hash->size; i++) { @@ -480,7 +478,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - /* the dht space is a ring and addresses are unsigned */ + /* the dht space is a ring using unsigned addresses */ tmp_max = BATADV_DAT_ADDR_MAX - orig_node->dat_addr + ip_key; @@ -512,7 +510,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, } /** - * batadv_dat_select_candidates - selects the nodes which the DHT message has to + * batadv_dat_select_candidates - select the nodes which the DHT message has to * be sent to * @bat_priv: the bat priv with all the soft interface information * @ip_dst: ipv4 to look up in the DHT @@ -521,7 +519,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, * closest values (from the LEFT, with wrap around if needed) then the hash * value of the key. ip_dst is the key. * - * Returns the candidate array of size BATADV_DAT_CANDIDATE_NUM + * Returns the candidate array of size BATADV_DAT_CANDIDATE_NUM. */ static struct batadv_dat_candidate * batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) @@ -558,10 +556,11 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) * @ip: the DHT key * @packet_subtype: unicast4addr packet subtype to use * - * In this function the skb is copied by means of pskb_copy() and is sent as - * unicast packet to each of the selected candidates + * This function copies the skb with pskb_copy() and is sent as unicast packet + * to each of the selected candidates. * - * Returns true if the packet is sent to at least one candidate, false otherwise + * Returns true if the packet is sent to at least one candidate, false + * otherwise. */ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, struct sk_buff *skb, __be32 ip, @@ -727,7 +726,7 @@ out: * @skb: packet to analyse * @hdr_size: size of the possible header before the ARP packet in the skb * - * Returns the ARP type if the skb contains a valid ARP packet, 0 otherwise + * Returns the ARP type if the skb contains a valid ARP packet, 0 otherwise. */ static uint16_t batadv_arp_get_type(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -754,9 +753,7 @@ static uint16_t batadv_arp_get_type(struct batadv_priv *bat_priv, arphdr = (struct arphdr *)(skb->data + hdr_size + ETH_HLEN); - /* Check whether the ARP packet carries a valid - * IP information - */ + /* check whether the ARP packet carries a valid IP information */ if (arphdr->ar_hrd != htons(ARPHRD_ETHER)) goto out; @@ -784,7 +781,7 @@ static uint16_t batadv_arp_get_type(struct batadv_priv *bat_priv, if (is_zero_ether_addr(hw_src) || is_multicast_ether_addr(hw_src)) goto out; - /* we don't care about the destination MAC address in ARP requests */ + /* don't care about the destination MAC address in ARP requests */ if (arphdr->ar_op != htons(ARPOP_REQUEST)) { hw_dst = batadv_arp_hw_dst(skb, hdr_size); if (is_zero_ether_addr(hw_dst) || @@ -804,8 +801,8 @@ out: * @skb: packet to check * * Returns true if the message has been sent to the dht candidates, false - * otherwise. In case of true the message has to be enqueued to permit the - * fallback + * otherwise. In case of a positive return value the message has to be enqueued + * to permit the fallback. */ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, struct sk_buff *skb) @@ -837,6 +834,19 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst); if (dat_entry) { + /* If the ARP request is destined for a local client the local + * client will answer itself. DAT would only generate a + * duplicate packet. + * + * Moreover, if the soft-interface is enslaved into a bridge, an + * additional DAT answer may trigger kernel warnings about + * a packet coming from the wrong port. + */ + if (batadv_is_my_client(bat_priv, dat_entry->mac_addr)) { + ret = true; + goto out; + } + skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src, bat_priv->soft_iface, ip_dst, hw_src, dat_entry->mac_addr, hw_src); @@ -854,7 +864,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n"); ret = true; } else { - /* Send the request on the DHT */ + /* Send the request to the DHT */ ret = batadv_dat_send_data(bat_priv, skb, ip_dst, BATADV_P_DAT_DHT_GET); } @@ -871,7 +881,7 @@ out: * @skb: packet to check * @hdr_size: size of the encapsulation header * - * Returns true if the request has been answered, false otherwise + * Returns true if the request has been answered, false otherwise. */ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -911,10 +921,9 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, if (!skb_new) goto out; - /* to preserve backwards compatibility, here the node has to answer - * using the same packet type it received for the request. This is due - * to that if a node is not using the 4addr packet format it may not - * support it. + /* To preserve backwards compatibility, the node has choose the outgoing + * format based on the incoming request packet type. The assumption is + * that a node not using the 4addr packet format doesn't support it. */ if (hdr_size == sizeof(struct batadv_unicast_4addr_packet)) err = batadv_unicast_4addr_send_skb(bat_priv, skb_new, @@ -964,7 +973,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, batadv_dat_entry_add(bat_priv, ip_dst, hw_dst); /* Send the ARP reply to the candidates for both the IP addresses that - * the node got within the ARP reply + * the node obtained from the ARP reply */ batadv_dat_send_data(bat_priv, skb, ip_src, BATADV_P_DAT_DHT_PUT); batadv_dat_send_data(bat_priv, skb, ip_dst, BATADV_P_DAT_DHT_PUT); @@ -974,7 +983,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, * DAT storage only * @bat_priv: the bat priv with all the soft interface information * @skb: packet to check - * @hdr_size: siaze of the encapsulation header + * @hdr_size: size of the encapsulation header */ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) @@ -1018,11 +1027,11 @@ out: /** * batadv_dat_drop_broadcast_packet - check if an ARP request has to be dropped - * (because the node has already got the reply via DAT) or not + * (because the node has already obtained the reply via DAT) or not * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the broadcast packet * - * Returns true if the node can drop the packet, false otherwise + * Returns true if the node can drop the packet, false otherwise. */ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 522243aff2f..c478e6bcf89 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -117,6 +117,58 @@ static int batadv_is_valid_iface(const struct net_device *net_dev) return 1; } +/** + * batadv_is_wifi_netdev - check if the given net_device struct is a wifi + * interface + * @net_device: the device to check + * + * Returns true if the net device is a 802.11 wireless device, false otherwise. + */ +static bool batadv_is_wifi_netdev(struct net_device *net_device) +{ +#ifdef CONFIG_WIRELESS_EXT + /* pre-cfg80211 drivers have to implement WEXT, so it is possible to + * check for wireless_handlers != NULL + */ + if (net_device->wireless_handlers) + return true; +#endif + + /* cfg80211 drivers have to set ieee80211_ptr */ + if (net_device->ieee80211_ptr) + return true; + + return false; +} + +/** + * batadv_is_wifi_iface - check if the given interface represented by ifindex + * is a wifi interface + * @ifindex: interface index to check + * + * Returns true if the interface represented by ifindex is a 802.11 wireless + * device, false otherwise. + */ +bool batadv_is_wifi_iface(int ifindex) +{ + struct net_device *net_device = NULL; + bool ret = false; + + if (ifindex == BATADV_NULL_IFINDEX) + goto out; + + net_device = dev_get_by_index(&init_net, ifindex); + if (!net_device) + goto out; + + ret = batadv_is_wifi_netdev(net_device); + +out: + if (net_device) + dev_put(net_device); + return ret; +} + static struct batadv_hard_iface * batadv_hardif_get_active(const struct net_device *soft_iface) { @@ -525,7 +577,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) dev_hold(net_dev); - hard_iface = kmalloc(sizeof(*hard_iface), GFP_ATOMIC); + hard_iface = kzalloc(sizeof(*hard_iface), GFP_ATOMIC); if (!hard_iface) goto release_dev; @@ -541,18 +593,16 @@ batadv_hardif_add_interface(struct net_device *net_dev) INIT_WORK(&hard_iface->cleanup_work, batadv_hardif_remove_interface_finish); + hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT; + if (batadv_is_wifi_netdev(net_dev)) + hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; + /* extra reference for return */ atomic_set(&hard_iface->refcount, 2); batadv_check_known_mac_addr(hard_iface->net_dev); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); - /* This can't be called via a bat_priv callback because - * we have no bat_priv yet. - */ - atomic_set(&hard_iface->bat_iv.ogm_seqno, 1); - hard_iface->bat_iv.ogm_buff = NULL; - return hard_iface; free_if: @@ -595,7 +645,7 @@ void batadv_hardif_remove_interfaces(void) static int batadv_hard_if_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *net_dev = ptr; + struct net_device *net_dev = netdev_notifier_info_to_dev(ptr); struct batadv_hard_iface *hard_iface; struct batadv_hard_iface *primary_if = NULL; struct batadv_priv *bat_priv; @@ -657,38 +707,6 @@ out: return NOTIFY_DONE; } -/* This function returns true if the interface represented by ifindex is a - * 802.11 wireless device - */ -bool batadv_is_wifi_iface(int ifindex) -{ - struct net_device *net_device = NULL; - bool ret = false; - - if (ifindex == BATADV_NULL_IFINDEX) - goto out; - - net_device = dev_get_by_index(&init_net, ifindex); - if (!net_device) - goto out; - -#ifdef CONFIG_WIRELESS_EXT - /* pre-cfg80211 drivers have to implement WEXT, so it is possible to - * check for wireless_handlers != NULL - */ - if (net_device->wireless_handlers) - ret = true; - else -#endif - /* cfg80211 drivers have to set ieee80211_ptr */ - if (net_device->ieee80211_ptr) - ret = true; -out: - if (net_device) - dev_put(net_device); - return ret; -} - struct notifier_block batadv_hard_if_notifier = { .notifier_call = batadv_hard_if_event, }; diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 0ba6c899b2d..b27508b8085 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -177,13 +177,13 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff, if (len >= sizeof(struct batadv_icmp_packet_rr)) packet_len = sizeof(struct batadv_icmp_packet_rr); - skb = dev_alloc_skb(packet_len + ETH_HLEN + NET_IP_ALIGN); + skb = netdev_alloc_skb_ip_align(NULL, packet_len + ETH_HLEN); if (!skb) { len = -ENOMEM; goto out; } - skb_reserve(skb, ETH_HLEN + NET_IP_ALIGN); + skb_reserve(skb, ETH_HLEN); icmp_packet = (struct batadv_icmp_packet_rr *)skb_put(skb, packet_len); if (copy_from_user(icmp_packet, buff, packet_len)) { diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 3e30a0f1b90..51aafd669cb 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -163,16 +163,25 @@ void batadv_mesh_free(struct net_device *soft_iface) batadv_vis_quit(bat_priv); batadv_gw_node_purge(bat_priv); - batadv_originator_free(bat_priv); batadv_nc_free(bat_priv); + batadv_dat_free(bat_priv); + batadv_bla_free(bat_priv); + /* Free the TT and the originator tables only after having terminated + * all the other depending components which may use these structures for + * their purposes. + */ batadv_tt_free(bat_priv); - batadv_bla_free(bat_priv); - - batadv_dat_free(bat_priv); + /* Since the originator table clean up routine is accessing the TT + * tables as well, it has to be invoked after the TT tables have been + * freed and marked as empty. This ensures that no cleanup RCU callbacks + * accessing the TT data are scheduled for later execution. + */ + batadv_originator_free(bat_priv); free_percpu(bat_priv->bat_counters); + bat_priv->bat_counters = NULL; atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); } @@ -475,7 +484,7 @@ static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) char *algo_name = (char *)val; size_t name_len = strlen(algo_name); - if (algo_name[name_len - 1] == '\n') + if (name_len > 0 && algo_name[name_len - 1] == '\n') algo_name[name_len - 1] = '\0'; bat_algo_ops = batadv_algo_get(algo_name); diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 59a0d6af15c..5e9aebb7d56 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -26,7 +26,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2013.2.0" +#define BATADV_SOURCE_VERSION "2013.3.0" #endif /* B.A.T.M.A.N. parameters */ @@ -76,6 +76,11 @@ #define BATADV_LOG_BUF_LEN 8192 /* has to be a power of 2 */ +/* number of packets to send for broadcasts on different interface types */ +#define BATADV_NUM_BCASTS_DEFAULT 1 +#define BATADV_NUM_BCASTS_WIRELESS 3 +#define BATADV_NUM_BCASTS_MAX 3 + /* msecs after which an ARP_REQUEST is sent in broadcast as fallback */ #define ARP_REQ_DELAY 250 /* numbers of originator to contact for any PUT/GET DHT operation */ @@ -157,6 +162,17 @@ enum batadv_uev_type { #include <linux/seq_file.h> #include "types.h" +/** + * batadv_vlan_flags - flags for the four MSB of any vlan ID field + * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not + */ +enum batadv_vlan_flags { + BATADV_VLAN_HAS_TAG = BIT(15), +}; + +#define BATADV_PRINT_VID(vid) (vid & BATADV_VLAN_HAS_TAG ? \ + (int)(vid & VLAN_VID_MASK) : -1) + extern char batadv_routing_algo[]; extern struct list_head batadv_hardif_list; diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index f7c54305a91..a487d46e0ae 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1245,7 +1245,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, return; /* Set the mac header as if we actually sent the packet uncoded */ - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); memcpy(ethhdr->h_source, ethhdr->h_dest, ETH_ALEN); memcpy(ethhdr->h_dest, eth_dst_new, ETH_ALEN); @@ -1359,18 +1359,17 @@ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, * buffer * @skb: data skb to forward * @neigh_node: next hop to forward packet to - * @ethhdr: pointer to the ethernet header inside the skb * * Returns true if the skb was consumed (encoded packet sent) or false otherwise */ bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node, - struct ethhdr *ethhdr) + struct batadv_neigh_node *neigh_node) { const struct net_device *netdev = neigh_node->if_incoming->soft_iface; struct batadv_priv *bat_priv = netdev_priv(netdev); struct batadv_unicast_packet *packet; struct batadv_nc_path *nc_path; + struct ethhdr *ethhdr = eth_hdr(skb); __be32 packet_id; u8 *payload; @@ -1423,7 +1422,7 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, { struct batadv_unicast_packet *packet; struct batadv_nc_path *nc_path; - struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb); + struct ethhdr *ethhdr = eth_hdr(skb); __be32 packet_id; u8 *payload; @@ -1482,7 +1481,7 @@ out: void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb) { - struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb); + struct ethhdr *ethhdr = eth_hdr(skb); if (batadv_is_my_mac(bat_priv, ethhdr->h_dest)) return; @@ -1514,6 +1513,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, struct ethhdr *ethhdr, ethhdr_tmp; uint8_t *orig_dest, ttl, ttvn; unsigned int coding_len; + int err; /* Save headers temporarily */ memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp)); @@ -1532,7 +1532,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, skb_reset_network_header(skb); /* Reconstruct original mac header */ - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); memcpy(ethhdr, ðhdr_tmp, sizeof(*ethhdr)); /* Select the correct unicast header information based on the location @@ -1568,8 +1568,11 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, coding_len); /* Resize decoded skb if decoded with larger packet */ - if (nc_packet->skb->len > coding_len + h_size) - pskb_trim_rcsum(skb, coding_len + h_size); + if (nc_packet->skb->len > coding_len + h_size) { + err = pskb_trim_rcsum(skb, coding_len + h_size); + if (err) + return NULL; + } /* Create decoded unicast packet */ unicast_packet = (struct batadv_unicast_packet *)skb->data; @@ -1673,7 +1676,7 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb, return NET_RX_DROP; coded_packet = (struct batadv_coded_packet *)skb->data; - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); /* Verify frame is destined for us */ if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) && @@ -1759,6 +1762,13 @@ int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset) /* For each orig_node in this bin */ rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + /* no need to print the orig node if it does not have + * network coding neighbors + */ + if (list_empty(&orig_node->in_coding_list) && + list_empty(&orig_node->out_coding_list)) + continue; + seq_printf(seq, "Node: %pM\n", orig_node->orig); seq_puts(seq, " Ingoing: "); diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 4fa6d0caddb..85a4ec81ad5 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -36,8 +36,7 @@ void batadv_nc_purge_orig(struct batadv_priv *bat_priv, void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv); void batadv_nc_init_orig(struct batadv_orig_node *orig_node); bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node, - struct ethhdr *ethhdr); + struct batadv_neigh_node *neigh_node); void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, struct sk_buff *skb); void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, @@ -87,8 +86,7 @@ static inline void batadv_nc_init_orig(struct batadv_orig_node *orig_node) } static inline bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node, - struct ethhdr *ethhdr) + struct batadv_neigh_node *neigh_node) { return false; } diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 2f345254663..f50553a7de6 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -92,7 +92,7 @@ batadv_orig_node_get_router(struct batadv_orig_node *orig_node) struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, - const uint8_t *neigh_addr, uint32_t seqno) + const uint8_t *neigh_addr) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_neigh_node *neigh_node; @@ -110,8 +110,8 @@ batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, atomic_set(&neigh_node->refcount, 2); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Creating new neighbor %pM, initial seqno %d\n", - neigh_addr, seqno); + "Creating new neighbor %pM on interface %s\n", neigh_addr, + hard_iface->net_dev->name); out: return neigh_node; @@ -156,12 +156,28 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) kfree(orig_node); } +/** + * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly + * schedule an rcu callback for freeing it + * @orig_node: the orig node to free + */ void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node) { if (atomic_dec_and_test(&orig_node->refcount)) call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } +/** + * batadv_orig_node_free_ref_now - decrement the orig node refcounter and + * possibly free it (without rcu callback) + * @orig_node: the orig node to free + */ +void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node) +{ + if (atomic_dec_and_test(&orig_node->refcount)) + batadv_orig_node_free_rcu(&orig_node->rcu); +} + void batadv_originator_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 7df48fa7669..7887b84a9af 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -26,11 +26,12 @@ int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node); +void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_get_orig_node(struct batadv_priv *bat_priv, const uint8_t *addr); struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, - const uint8_t *neigh_addr, uint32_t seqno); + const uint8_t *neigh_addr); void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node); struct batadv_neigh_node * batadv_orig_node_get_router(struct batadv_orig_node *orig_node); diff --git a/net/batman-adv/ring_buffer.c b/net/batman-adv/ring_buffer.c deleted file mode 100644 index ccab0bbdbb5..00000000000 --- a/net/batman-adv/ring_buffer.c +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: - * - * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#include "main.h" -#include "ring_buffer.h" - -void batadv_ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, - uint8_t value) -{ - lq_recv[*lq_index] = value; - *lq_index = (*lq_index + 1) % BATADV_TQ_GLOBAL_WINDOW_SIZE; -} - -uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[]) -{ - const uint8_t *ptr; - uint16_t count = 0, i = 0, sum = 0; - - ptr = lq_recv; - - while (i < BATADV_TQ_GLOBAL_WINDOW_SIZE) { - if (*ptr != 0) { - count++; - sum += *ptr; - } - - i++; - ptr++; - } - - if (count == 0) - return 0; - - return (uint8_t)(sum / count); -} diff --git a/net/batman-adv/ring_buffer.h b/net/batman-adv/ring_buffer.h deleted file mode 100644 index 3f92ae248e8..00000000000 --- a/net/batman-adv/ring_buffer.h +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (C) 2007-2013 B.A.T.M.A.N. contributors: - * - * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#ifndef _NET_BATMAN_ADV_RING_BUFFER_H_ -#define _NET_BATMAN_ADV_RING_BUFFER_H_ - -void batadv_ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, - uint8_t value); -uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[]); - -#endif /* _NET_BATMAN_ADV_RING_BUFFER_H_ */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index b27a4d792d1..2f0bd3ffe6e 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -34,35 +34,6 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); -void batadv_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) -{ - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_head *head; - struct batadv_orig_node *orig_node; - unsigned long *word; - uint32_t i; - size_t word_index; - uint8_t *w; - - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) { - spin_lock_bh(&orig_node->ogm_cnt_lock); - word_index = hard_iface->if_num * BATADV_NUM_WORDS; - word = &(orig_node->bcast_own[word_index]); - - batadv_bit_get_packet(bat_priv, word, 1, 0); - w = &orig_node->bcast_own_sum[hard_iface->if_num]; - *w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE); - spin_unlock_bh(&orig_node->ogm_cnt_lock); - } - rcu_read_unlock(); - } -} - static void _batadv_update_route(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node) @@ -256,7 +227,7 @@ bool batadv_check_management_packet(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, header_len))) return false; - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); /* packet with broadcast indication but unicast recipient */ if (!is_broadcast_ether_addr(ethhdr->h_dest)) @@ -314,7 +285,7 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, icmp_packet->msg_type = BATADV_ECHO_REPLY; icmp_packet->header.ttl = BATADV_TTL; - if (batadv_send_skb_to_orig(skb, orig_node, NULL)) + if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP) ret = NET_RX_SUCCESS; out: @@ -362,7 +333,7 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, icmp_packet->msg_type = BATADV_TTL_EXCEEDED; icmp_packet->header.ttl = BATADV_TTL; - if (batadv_send_skb_to_orig(skb, orig_node, NULL)) + if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP) ret = NET_RX_SUCCESS; out: @@ -392,7 +363,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, hdr_size))) goto out; - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); /* packet with unicast indication but broadcast recipient */ if (is_broadcast_ether_addr(ethhdr->h_dest)) @@ -439,7 +410,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, icmp_packet->header.ttl--; /* route it */ - if (batadv_send_skb_to_orig(skb, orig_node, recv_if)) + if (batadv_send_skb_to_orig(skb, orig_node, recv_if) != NET_XMIT_DROP) ret = NET_RX_SUCCESS; out: @@ -569,7 +540,7 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, if (unlikely(!pskb_may_pull(skb, hdr_size))) return -ENODATA; - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); /* packet with unicast indication but broadcast recipient */ if (is_broadcast_ether_addr(ethhdr->h_dest)) @@ -803,8 +774,8 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_orig_node *orig_node = NULL; struct batadv_neigh_node *neigh_node = NULL; struct batadv_unicast_packet *unicast_packet; - struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb); - int ret = NET_RX_DROP; + struct ethhdr *ethhdr = eth_hdr(skb); + int res, ret = NET_RX_DROP; struct sk_buff *new_skb; unicast_packet = (struct batadv_unicast_packet *)skb->data; @@ -864,16 +835,19 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, /* decrement ttl */ unicast_packet->header.ttl--; - /* network code packet if possible */ - if (batadv_nc_skb_forward(skb, neigh_node, ethhdr)) { - ret = NET_RX_SUCCESS; - } else if (batadv_send_skb_to_orig(skb, orig_node, recv_if)) { - ret = NET_RX_SUCCESS; + res = batadv_send_skb_to_orig(skb, orig_node, recv_if); - /* Update stats counter */ + /* translate transmit result into receive result */ + if (res == NET_XMIT_SUCCESS) { + /* skb was transmitted and consumed */ batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, skb->len + ETH_HLEN); + + ret = NET_RX_SUCCESS; + } else if (res == NET_XMIT_POLICED) { + /* skb was buffered and consumed */ + ret = NET_RX_SUCCESS; } out: @@ -1165,7 +1139,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, hdr_size))) goto out; - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); /* packet with broadcast indication but unicast recipient */ if (!is_broadcast_ether_addr(ethhdr->h_dest)) @@ -1265,7 +1239,7 @@ int batadv_recv_vis_packet(struct sk_buff *skb, return NET_RX_DROP; vis_packet = (struct batadv_vis_packet *)skb->data; - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); /* not for me */ if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest)) diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 99eeafaba40..72a29bde201 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -20,7 +20,6 @@ #ifndef _NET_BATMAN_ADV_ROUTING_H_ #define _NET_BATMAN_ADV_ROUTING_H_ -void batadv_slide_own_bcast_window(struct batadv_hard_iface *hard_iface); bool batadv_check_management_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, int header_len); diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 263cfd1ccee..e9ff8d80120 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -61,7 +61,7 @@ int batadv_send_skb_packet(struct sk_buff *skb, skb_reset_mac_header(skb); - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); memcpy(ethhdr->h_source, hard_iface->net_dev->dev_addr, ETH_ALEN); memcpy(ethhdr->h_dest, dst_addr, ETH_ALEN); ethhdr->h_proto = __constant_htons(ETH_P_BATMAN); @@ -96,26 +96,37 @@ send_skb_err: * host, NULL can be passed as recv_if and no interface alternating is * attempted. * - * Returns TRUE on success; FALSE otherwise. + * Returns NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or + * NET_XMIT_POLICED if the skb is buffered for later transmit. */ -bool batadv_send_skb_to_orig(struct sk_buff *skb, - struct batadv_orig_node *orig_node, - struct batadv_hard_iface *recv_if) +int batadv_send_skb_to_orig(struct sk_buff *skb, + struct batadv_orig_node *orig_node, + struct batadv_hard_iface *recv_if) { struct batadv_priv *bat_priv = orig_node->bat_priv; struct batadv_neigh_node *neigh_node; + int ret = NET_XMIT_DROP; /* batadv_find_router() increases neigh_nodes refcount if found. */ neigh_node = batadv_find_router(bat_priv, orig_node, recv_if); if (!neigh_node) - return false; + return ret; - /* route it */ - batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr); + /* try to network code the packet, if it is received on an interface + * (i.e. being forwarded). If the packet originates from this node or if + * network coding fails, then send the packet as usual. + */ + if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) { + ret = NET_XMIT_POLICED; + } else { + batadv_send_skb_packet(skb, neigh_node->if_incoming, + neigh_node->addr); + ret = NET_XMIT_SUCCESS; + } batadv_neigh_node_free_ref(neigh_node); - return true; + return ret; } void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface) @@ -152,8 +163,6 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet, unsigned long send_time) { - INIT_HLIST_NODE(&forw_packet->list); - /* add new packet to packet list */ spin_lock_bh(&bat_priv->forw_bcast_list_lock); hlist_add_head(&forw_packet->list, &bat_priv->forw_bcast_list); @@ -260,6 +269,9 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) if (hard_iface->soft_iface != soft_iface) continue; + if (forw_packet->num_packets >= hard_iface->num_bcasts) + continue; + /* send a copy of the saved skb */ skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); if (skb1) @@ -271,7 +283,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) forw_packet->num_packets++; /* if we still have some more bcasts to send */ - if (forw_packet->num_packets < 3) { + if (forw_packet->num_packets < BATADV_NUM_BCASTS_MAX) { _batadv_add_bcast_packet_to_list(bat_priv, forw_packet, msecs_to_jiffies(5)); return; diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 38e662f619a..e7b17880fca 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -23,9 +23,9 @@ int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, const uint8_t *dst_addr); -bool batadv_send_skb_to_orig(struct sk_buff *skb, - struct batadv_orig_node *orig_node, - struct batadv_hard_iface *recv_if); +int batadv_send_skb_to_orig(struct sk_buff *skb, + struct batadv_orig_node *orig_node, + struct batadv_hard_iface *recv_if); void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface); int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, const struct sk_buff *skb, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 6f20d339e33..700d0b49742 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -154,7 +154,7 @@ static int batadv_interface_tx(struct sk_buff *skb, 0x00, 0x00}; unsigned int header_len = 0; int data_len = skb->len, ret; - short vid __maybe_unused = -1; + unsigned short vid __maybe_unused = BATADV_NO_FLAGS; bool do_bcast = false; uint32_t seqno; unsigned long brd_delay = 1; @@ -303,7 +303,7 @@ void batadv_interface_rx(struct net_device *soft_iface, struct ethhdr *ethhdr; struct vlan_ethhdr *vhdr; struct batadv_header *batadv_header = (struct batadv_header *)skb->data; - short vid __maybe_unused = -1; + unsigned short vid __maybe_unused = BATADV_NO_FLAGS; __be16 ethertype = __constant_htons(ETH_P_BATMAN); bool is_bcast; @@ -316,7 +316,7 @@ void batadv_interface_rx(struct net_device *soft_iface, skb_pull_rcsum(skb, hdr_size); skb_reset_mac_header(skb); - ethhdr = (struct ethhdr *)skb_mac_header(skb); + ethhdr = eth_hdr(skb); switch (ntohs(ethhdr->h_proto)) { case ETH_P_8021Q: @@ -505,6 +505,7 @@ unreg_debugfs: batadv_debugfs_del_meshif(dev); free_bat_counters: free_percpu(bat_priv->bat_counters); + bat_priv->bat_counters = NULL; return ret; } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 5e89deeb954..429aeef3d8b 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -144,7 +144,12 @@ static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) struct batadv_tt_orig_list_entry *orig_entry; orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); - batadv_orig_node_free_ref(orig_entry->orig_node); + + /* We are in an rcu callback here, therefore we cannot use + * batadv_orig_node_free_ref() and its call_rcu(): + * An rcu_barrier() wouldn't wait for that to finish + */ + batadv_orig_node_free_ref_now(orig_entry->orig_node); kfree(orig_entry); } @@ -158,10 +163,19 @@ batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry) call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); } +/** + * batadv_tt_local_event - store a local TT event (ADD/DEL) + * @bat_priv: the bat priv with all the soft interface information + * @tt_local_entry: the TT entry involved in the event + * @event_flags: flags to store in the event structure + */ static void batadv_tt_local_event(struct batadv_priv *bat_priv, - const uint8_t *addr, uint8_t flags) + struct batadv_tt_local_entry *tt_local_entry, + uint8_t event_flags) { struct batadv_tt_change_node *tt_change_node, *entry, *safe; + struct batadv_tt_common_entry *common = &tt_local_entry->common; + uint8_t flags = common->flags | event_flags; bool event_removed = false; bool del_op_requested, del_op_entry; @@ -171,7 +185,7 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv, return; tt_change_node->change.flags = flags; - memcpy(tt_change_node->change.addr, addr, ETH_ALEN); + memcpy(tt_change_node->change.addr, common->addr, ETH_ALEN); del_op_requested = flags & BATADV_TT_CLIENT_DEL; @@ -179,7 +193,7 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->tt.changes_list_lock); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { - if (!batadv_compare_eth(entry->change.addr, addr)) + if (!batadv_compare_eth(entry->change.addr, common->addr)) continue; /* DEL+ADD in the same orig interval have no effect and can be @@ -327,7 +341,7 @@ void batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, } add_event: - batadv_tt_local_event(bat_priv, addr, tt_local->common.flags); + batadv_tt_local_event(bat_priv, tt_local, BATADV_NO_FLAGS); check_roaming: /* Check whether it is a roaming, but don't do anything if the roaming @@ -524,8 +538,7 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, uint16_t flags, const char *message) { - batadv_tt_local_event(bat_priv, tt_local_entry->common.addr, - tt_local_entry->common.flags | flags); + batadv_tt_local_event(bat_priv, tt_local_entry, flags); /* The local client has to be marked as "pending to be removed" but has * to be kept in the table in order to send it in a full table @@ -579,8 +592,7 @@ uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, /* if this client has been added right now, it is possible to * immediately purge it */ - batadv_tt_local_event(bat_priv, tt_local_entry->common.addr, - curr_flags | BATADV_TT_CLIENT_DEL); + batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL); hlist_del_rcu(&tt_local_entry->common.hash_entry); batadv_tt_local_entry_free_ref(tt_local_entry); @@ -786,10 +798,25 @@ out: batadv_tt_orig_list_entry_free_ref(orig_entry); } -/* caller must hold orig_node refcount */ +/** + * batadv_tt_global_add - add a new TT global entry or update an existing one + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: the originator announcing the client + * @tt_addr: the mac address of the non-mesh client + * @flags: TT flags that have to be set for this non-mesh client + * @ttvn: the tt version number ever announcing this non-mesh client + * + * Add a new TT global entry for the given originator. If the entry already + * exists add a new reference to the given originator (a global entry can have + * references to multiple originators) and adjust the flags attribute to reflect + * the function argument. + * If a TT local entry exists for this non-mesh client remove it. + * + * The caller must hold orig_node refcount. + */ int batadv_tt_global_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - const unsigned char *tt_addr, uint8_t flags, + const unsigned char *tt_addr, uint16_t flags, uint8_t ttvn) { struct batadv_tt_global_entry *tt_global_entry; @@ -1595,11 +1622,11 @@ batadv_tt_response_fill_table(uint16_t tt_len, uint8_t ttvn, tt_tot = tt_len / sizeof(struct batadv_tt_change); len = tt_query_size + tt_len; - skb = dev_alloc_skb(len + ETH_HLEN + NET_IP_ALIGN); + skb = netdev_alloc_skb_ip_align(NULL, len + ETH_HLEN); if (!skb) goto out; - skb_reserve(skb, ETH_HLEN + NET_IP_ALIGN); + skb_reserve(skb, ETH_HLEN); tt_response = (struct batadv_tt_query_packet *)skb_put(skb, len); tt_response->ttvn = ttvn; @@ -1660,11 +1687,11 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv, if (!tt_req_node) goto out; - skb = dev_alloc_skb(sizeof(*tt_request) + ETH_HLEN + NET_IP_ALIGN); + skb = netdev_alloc_skb_ip_align(NULL, sizeof(*tt_request) + ETH_HLEN); if (!skb) goto out; - skb_reserve(skb, ETH_HLEN + NET_IP_ALIGN); + skb_reserve(skb, ETH_HLEN); tt_req_len = sizeof(*tt_request); tt_request = (struct batadv_tt_query_packet *)skb_put(skb, tt_req_len); @@ -1686,7 +1713,7 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv, batadv_inc_counter(bat_priv, BATADV_CNT_TT_REQUEST_TX); - if (batadv_send_skb_to_orig(skb, dst_orig_node, NULL)) + if (batadv_send_skb_to_orig(skb, dst_orig_node, NULL) != NET_XMIT_DROP) ret = 0; out: @@ -1710,7 +1737,7 @@ batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_orig_node *req_dst_orig_node; struct batadv_orig_node *res_dst_orig_node = NULL; uint8_t orig_ttvn, req_ttvn, ttvn; - int ret = false; + int res, ret = false; unsigned char *tt_buff; bool full_table; uint16_t tt_len, tt_tot; @@ -1757,11 +1784,11 @@ batadv_send_other_tt_response(struct batadv_priv *bat_priv, tt_tot = tt_len / sizeof(struct batadv_tt_change); len = sizeof(*tt_response) + tt_len; - skb = dev_alloc_skb(len + ETH_HLEN + NET_IP_ALIGN); + skb = netdev_alloc_skb_ip_align(NULL, len + ETH_HLEN); if (!skb) goto unlock; - skb_reserve(skb, ETH_HLEN + NET_IP_ALIGN); + skb_reserve(skb, ETH_HLEN); packet_pos = skb_put(skb, len); tt_response = (struct batadv_tt_query_packet *)packet_pos; tt_response->ttvn = req_ttvn; @@ -1805,8 +1832,10 @@ batadv_send_other_tt_response(struct batadv_priv *bat_priv, batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX); - if (batadv_send_skb_to_orig(skb, res_dst_orig_node, NULL)) + res = batadv_send_skb_to_orig(skb, res_dst_orig_node, NULL); + if (res != NET_XMIT_DROP) ret = true; + goto out; unlock: @@ -1873,11 +1902,11 @@ batadv_send_my_tt_response(struct batadv_priv *bat_priv, tt_tot = tt_len / sizeof(struct batadv_tt_change); len = sizeof(*tt_response) + tt_len; - skb = dev_alloc_skb(len + ETH_HLEN + NET_IP_ALIGN); + skb = netdev_alloc_skb_ip_align(NULL, len + ETH_HLEN); if (!skb) goto unlock; - skb_reserve(skb, ETH_HLEN + NET_IP_ALIGN); + skb_reserve(skb, ETH_HLEN); packet_pos = skb_put(skb, len); tt_response = (struct batadv_tt_query_packet *)packet_pos; tt_response->ttvn = req_ttvn; @@ -1920,7 +1949,7 @@ batadv_send_my_tt_response(struct batadv_priv *bat_priv, batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX); - if (batadv_send_skb_to_orig(skb, orig_node, NULL)) + if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP) ret = true; goto out; @@ -2207,11 +2236,11 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client, if (!batadv_tt_check_roam_count(bat_priv, client)) goto out; - skb = dev_alloc_skb(sizeof(*roam_adv_packet) + ETH_HLEN + NET_IP_ALIGN); + skb = netdev_alloc_skb_ip_align(NULL, len + ETH_HLEN); if (!skb) goto out; - skb_reserve(skb, ETH_HLEN + NET_IP_ALIGN); + skb_reserve(skb, ETH_HLEN); roam_adv_packet = (struct batadv_roam_adv_packet *)skb_put(skb, len); @@ -2233,7 +2262,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client, batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_TX); - if (batadv_send_skb_to_orig(skb, orig_node, NULL)) + if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP) ret = 0; out: diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index ab8e683b402..659a3bb759c 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -33,7 +33,7 @@ void batadv_tt_global_add_orig(struct batadv_priv *bat_priv, const unsigned char *tt_buff, int tt_buff_len); int batadv_tt_global_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - const unsigned char *addr, uint8_t flags, + const unsigned char *addr, uint16_t flags, uint8_t ttvn); int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index aba8364c368..b2c94e13931 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -61,6 +61,7 @@ struct batadv_hard_iface_bat_iv { * @if_status: status of the interface for batman-adv * @net_dev: pointer to the net_device * @frag_seqno: last fragment sequence number sent by this interface + * @num_bcasts: number of payload re-broadcasts on this interface (ARQ) * @hardif_obj: kobject of the per interface sysfs "mesh" directory * @refcount: number of contexts the object is used * @batman_adv_ptype: packet type describing packets that should be processed by @@ -76,6 +77,7 @@ struct batadv_hard_iface { char if_status; struct net_device *net_dev; atomic_t frag_seqno; + uint8_t num_bcasts; struct kobject *hardif_obj; atomic_t refcount; struct packet_type batman_adv_ptype; @@ -640,7 +642,7 @@ struct batadv_socket_packet { #ifdef CONFIG_BATMAN_ADV_BLA struct batadv_bla_backbone_gw { uint8_t orig[ETH_ALEN]; - short vid; + unsigned short vid; struct hlist_node hash_entry; struct batadv_priv *bat_priv; unsigned long lasttime; @@ -663,7 +665,7 @@ struct batadv_bla_backbone_gw { */ struct batadv_bla_claim { uint8_t addr[ETH_ALEN]; - short vid; + unsigned short vid; struct batadv_bla_backbone_gw *backbone_gw; unsigned long lasttime; struct hlist_node hash_entry; diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c index 0bb3b5982f9..dc8b5d4dd63 100644 --- a/net/batman-adv/unicast.c +++ b/net/batman-adv/unicast.c @@ -464,7 +464,7 @@ find_router: goto out; } - if (batadv_send_skb_to_orig(skb, orig_node, NULL)) + if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP) ret = 0; out: diff --git a/net/batman-adv/vis.c b/net/batman-adv/vis.c index 1625e5793a8..4983340f194 100644 --- a/net/batman-adv/vis.c +++ b/net/batman-adv/vis.c @@ -392,12 +392,12 @@ batadv_add_packet(struct batadv_priv *bat_priv, return NULL; len = sizeof(*packet) + vis_info_len; - info->skb_packet = dev_alloc_skb(len + ETH_HLEN + NET_IP_ALIGN); + info->skb_packet = netdev_alloc_skb_ip_align(NULL, len + ETH_HLEN); if (!info->skb_packet) { kfree(info); return NULL; } - skb_reserve(info->skb_packet, ETH_HLEN + NET_IP_ALIGN); + skb_reserve(info->skb_packet, ETH_HLEN); packet = (struct batadv_vis_packet *)skb_put(info->skb_packet, len); kref_init(&info->refcount); @@ -697,7 +697,7 @@ static void batadv_broadcast_vis_packet(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node; struct batadv_vis_packet *packet; struct sk_buff *skb; - uint32_t i; + uint32_t i, res; packet = (struct batadv_vis_packet *)info->skb_packet->data; @@ -724,7 +724,8 @@ static void batadv_broadcast_vis_packet(struct batadv_priv *bat_priv, if (!skb) continue; - if (!batadv_send_skb_to_orig(skb, orig_node, NULL)) + res = batadv_send_skb_to_orig(skb, orig_node, NULL); + if (res == NET_XMIT_DROP) kfree_skb(skb); } rcu_read_unlock(); @@ -748,7 +749,7 @@ static void batadv_unicast_vis_packet(struct batadv_priv *bat_priv, if (!skb) goto out; - if (!batadv_send_skb_to_orig(skb, orig_node, NULL)) + if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP) kfree_skb(skb); out: @@ -854,13 +855,13 @@ int batadv_vis_init(struct batadv_priv *bat_priv) if (!bat_priv->vis.my_info) goto err; - len = sizeof(*packet) + BATADV_MAX_VIS_PACKET_SIZE; - len += ETH_HLEN + NET_IP_ALIGN; - bat_priv->vis.my_info->skb_packet = dev_alloc_skb(len); + len = sizeof(*packet) + BATADV_MAX_VIS_PACKET_SIZE + ETH_HLEN; + bat_priv->vis.my_info->skb_packet = netdev_alloc_skb_ip_align(NULL, + len); if (!bat_priv->vis.my_info->skb_packet) goto free_info; - skb_reserve(bat_priv->vis.my_info->skb_packet, ETH_HLEN + NET_IP_ALIGN); + skb_reserve(bat_priv->vis.my_info->skb_packet, ETH_HLEN); tmp_skb = bat_priv->vis.my_info->skb_packet; packet = (struct batadv_vis_packet *)skb_put(tmp_skb, sizeof(*packet)); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 967312803e4..2ef66781fed 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -22,6 +22,9 @@ #include <asm/uaccess.h> #include "br_private.h" +#define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \ + NETIF_F_GSO_MASK | NETIF_F_HW_CSUM) + /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -55,10 +58,10 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_pull(skb, ETH_HLEN); if (is_broadcast_ether_addr(dest)) - br_flood_deliver(br, skb); + br_flood_deliver(br, skb, false); else if (is_multicast_ether_addr(dest)) { if (unlikely(netpoll_tx_running(dev))) { - br_flood_deliver(br, skb); + br_flood_deliver(br, skb, false); goto out; } if (br_multicast_rcv(br, NULL, skb)) { @@ -70,11 +73,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) br_multicast_deliver(mdst, skb); else - br_flood_deliver(br, skb); + br_flood_deliver(br, skb, false); } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) br_deliver(dst->dst, skb); else - br_flood_deliver(br, skb); + br_flood_deliver(br, skb, true); out: rcu_read_unlock(); @@ -346,12 +349,10 @@ void br_dev_setup(struct net_device *dev) dev->tx_queue_len = 0; dev->priv_flags = IFF_EBRIDGE; - dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | - NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | NETIF_F_LLTX | - NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_CTAG_TX; - dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | - NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | - NETIF_F_HW_VLAN_CTAG_TX; + dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL | + NETIF_F_HW_VLAN_CTAG_TX; + dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX; + dev->vlan_features = COMMON_FEATURES; br->dev = dev; spin_lock_init(&br->lock); diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 092b20e4ee4..4b81b147178 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -174,7 +174,8 @@ out: static void br_flood(struct net_bridge *br, struct sk_buff *skb, struct sk_buff *skb0, void (*__packet_hook)(const struct net_bridge_port *p, - struct sk_buff *skb)) + struct sk_buff *skb), + bool unicast) { struct net_bridge_port *p; struct net_bridge_port *prev; @@ -182,6 +183,9 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, prev = NULL; list_for_each_entry_rcu(p, &br->port_list, list) { + /* Do not flood unicast traffic to ports that turn it off */ + if (unicast && !(p->flags & BR_FLOOD)) + continue; prev = maybe_deliver(prev, p, skb, __packet_hook); if (IS_ERR(prev)) goto out; @@ -203,16 +207,16 @@ out: /* called with rcu_read_lock */ -void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb) +void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast) { - br_flood(br, skb, NULL, __br_deliver); + br_flood(br, skb, NULL, __br_deliver, unicast); } /* called under bridge lock */ void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, - struct sk_buff *skb2) + struct sk_buff *skb2, bool unicast) { - br_flood(br, skb, skb2, __br_forward); + br_flood(br, skb, skb2, __br_forward, unicast); } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 4cdba60926f..5623be6b9ec 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -221,7 +221,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; - p->flags = 0; + p->flags = BR_LEARNING | BR_FLOOD; br_init_port(p); p->state = BR_STATE_DISABLED; br_stp_port_timer_init(p); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 828e2bcc1f5..1b8b8b824cd 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -65,6 +65,7 @@ int br_handle_frame_finish(struct sk_buff *skb) struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; struct sk_buff *skb2; + bool unicast = true; u16 vid = 0; if (!p || p->state == BR_STATE_DISABLED) @@ -75,7 +76,8 @@ int br_handle_frame_finish(struct sk_buff *skb) /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; - br_fdb_update(br, p, eth_hdr(skb)->h_source, vid); + if (p->flags & BR_LEARNING) + br_fdb_update(br, p, eth_hdr(skb)->h_source, vid); if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) && br_multicast_rcv(br, p, skb)) @@ -94,9 +96,10 @@ int br_handle_frame_finish(struct sk_buff *skb) dst = NULL; - if (is_broadcast_ether_addr(dest)) + if (is_broadcast_ether_addr(dest)) { skb2 = skb; - else if (is_multicast_ether_addr(dest)) { + unicast = false; + } else if (is_multicast_ether_addr(dest)) { mdst = br_mdb_get(br, skb, vid); if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) { if ((mdst && mdst->mglist) || @@ -109,6 +112,7 @@ int br_handle_frame_finish(struct sk_buff *skb) } else skb2 = skb; + unicast = false; br->dev->stats.multicast++; } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) { @@ -122,7 +126,7 @@ int br_handle_frame_finish(struct sk_buff *skb) dst->used = jiffies; br_forward(dst->dst, skb, skb2); } else - br_flood_forward(br, skb, skb2); + br_flood_forward(br, skb, skb2, unicast); } if (skb2) @@ -142,7 +146,8 @@ static int br_handle_local_finish(struct sk_buff *skb) u16 vid = 0; br_vlan_get_tag(skb, &vid); - br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid); + if (p->flags & BR_LEARNING) + br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid); return 0; /* process further */ } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 81f2389f78e..37a46769796 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -23,6 +23,7 @@ #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/timer.h> +#include <linux/inetdevice.h> #include <net/ip.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> @@ -381,7 +382,8 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->protocol = IPPROTO_IGMP; - iph->saddr = 0; + iph->saddr = br->multicast_query_use_ifaddr ? + inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0; iph->daddr = htonl(INADDR_ALLHOSTS_GROUP); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; @@ -615,8 +617,6 @@ rehash: mp->br = br; mp->addr = *group; - setup_timer(&mp->timer, br_multicast_group_expired, - (unsigned long)mp); hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]); mdb->size++; @@ -654,7 +654,6 @@ static int br_multicast_add_group(struct net_bridge *br, struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - unsigned long now = jiffies; int err; spin_lock(&br->multicast_lock); @@ -669,7 +668,6 @@ static int br_multicast_add_group(struct net_bridge *br, if (!port) { mp->mglist = true; - mod_timer(&mp->timer, now + br->multicast_membership_interval); goto out; } @@ -677,7 +675,7 @@ static int br_multicast_add_group(struct net_bridge *br, (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p->port == port) - goto found; + goto out; if ((unsigned long)p->port < (unsigned long)port) break; } @@ -688,8 +686,6 @@ static int br_multicast_add_group(struct net_bridge *br, rcu_assign_pointer(*pp, p); br_mdb_notify(br->dev, port, group, RTM_NEWMDB); -found: - mod_timer(&p->timer, now + br->multicast_membership_interval); out: err = 0; @@ -1129,6 +1125,10 @@ static int br_ip4_multicast_query(struct net_bridge *br, if (!mp) goto out; + setup_timer(&mp->timer, br_multicast_group_expired, (unsigned long)mp); + mod_timer(&mp->timer, now + br->multicast_membership_interval); + mp->timer_armed = true; + max_delay *= br->multicast_last_member_count; if (mp->mglist && @@ -1203,6 +1203,10 @@ static int br_ip6_multicast_query(struct net_bridge *br, if (!mp) goto out; + setup_timer(&mp->timer, br_multicast_group_expired, (unsigned long)mp); + mod_timer(&mp->timer, now + br->multicast_membership_interval); + mp->timer_armed = true; + max_delay *= br->multicast_last_member_count; if (mp->mglist && (timer_pending(&mp->timer) ? @@ -1246,6 +1250,32 @@ static void br_multicast_leave_group(struct net_bridge *br, if (!mp) goto out; + if (br->multicast_querier && + !timer_pending(&br->multicast_querier_timer)) { + __br_multicast_send_query(br, port, &mp->addr); + + time = jiffies + br->multicast_last_member_count * + br->multicast_last_member_interval; + mod_timer(port ? &port->multicast_query_timer : + &br->multicast_query_timer, time); + + for (p = mlock_dereference(mp->ports, br); + p != NULL; + p = mlock_dereference(p->next, br)) { + if (p->port != port) + continue; + + if (!hlist_unhashed(&p->mglist) && + (timer_pending(&p->timer) ? + time_after(p->timer.expires, time) : + try_to_del_timer_sync(&p->timer) >= 0)) { + mod_timer(&p->timer, time); + } + + break; + } + } + if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) { struct net_bridge_port_group __rcu **pp; @@ -1261,7 +1291,7 @@ static void br_multicast_leave_group(struct net_bridge *br, call_rcu_bh(&p->rcu, br_multicast_free_pg); br_mdb_notify(br->dev, port, group, RTM_DELMDB); - if (!mp->ports && !mp->mglist && + if (!mp->ports && !mp->mglist && mp->timer_armed && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); } @@ -1273,30 +1303,12 @@ static void br_multicast_leave_group(struct net_bridge *br, br->multicast_last_member_interval; if (!port) { - if (mp->mglist && + if (mp->mglist && mp->timer_armed && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, time) : try_to_del_timer_sync(&mp->timer) >= 0)) { mod_timer(&mp->timer, time); } - - goto out; - } - - for (p = mlock_dereference(mp->ports, br); - p != NULL; - p = mlock_dereference(p->next, br)) { - if (p->port != port) - continue; - - if (!hlist_unhashed(&p->mglist) && - (timer_pending(&p->timer) ? - time_after(p->timer.expires, time) : - try_to_del_timer_sync(&p->timer) >= 0)) { - mod_timer(&p->timer, time); - } - - break; } out: @@ -1618,6 +1630,7 @@ void br_multicast_init(struct net_bridge *br) br->multicast_router = 1; br->multicast_querier = 0; + br->multicast_query_use_ifaddr = 0; br->multicast_last_member_count = 2; br->multicast_startup_query_count = 2; @@ -1671,6 +1684,7 @@ void br_multicast_stop(struct net_bridge *br) hlist_for_each_entry_safe(mp, n, &mdb->mhash[i], hlist[ver]) { del_timer(&mp->timer); + mp->timer_armed = false; call_rcu_bh(&mp->rcu, br_multicast_free_group); } } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 8e3abf56479..1fc30abd3a5 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -30,6 +30,8 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_GUARD */ + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ + + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ + + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + 0; } @@ -56,7 +58,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_MODE, mode) || nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) || nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) || - nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE))) + nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || + nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || + nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD))) return -EMSGSIZE; return 0; @@ -281,6 +285,8 @@ static const struct nla_policy ifla_brport_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MODE] = { .type = NLA_U8 }, [IFLA_BRPORT_GUARD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 }, + [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, + [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ @@ -328,6 +334,8 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE); br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); + br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); + br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index 1644b3e1f94..3a3f371b284 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -31,7 +31,7 @@ struct notifier_block br_device_notifier = { */ static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; bool changed_addr; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d2c043a857b..3be89b3ce17 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -112,6 +112,7 @@ struct net_bridge_mdb_entry struct timer_list timer; struct br_ip addr; bool mglist; + bool timer_armed; }; struct net_bridge_mdb_htable @@ -157,6 +158,8 @@ struct net_bridge_port #define BR_ROOT_BLOCK 0x00000004 #define BR_MULTICAST_FAST_LEAVE 0x00000008 #define BR_ADMIN_COST 0x00000010 +#define BR_LEARNING 0x00000020 +#define BR_FLOOD 0x00000040 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING u32 multicast_startup_queries_sent; @@ -249,6 +252,7 @@ struct net_bridge u8 multicast_disabled:1; u8 multicast_querier:1; + u8 multicast_query_use_ifaddr:1; u32 hash_elasticity; u32 hash_max; @@ -411,9 +415,10 @@ extern int br_dev_queue_push_xmit(struct sk_buff *skb); extern void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0); extern int br_forward_finish(struct sk_buff *skb); -extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb); +extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, + bool unicast); extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, - struct sk_buff *skb2); + struct sk_buff *skb2, bool unicast); /* br_if.c */ extern void br_port_carrier_check(struct net_bridge_port *p); diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 8baa9c08e1a..394bb96b608 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -375,6 +375,31 @@ static ssize_t store_multicast_snooping(struct device *d, static DEVICE_ATTR(multicast_snooping, S_IRUGO | S_IWUSR, show_multicast_snooping, store_multicast_snooping); +static ssize_t show_multicast_query_use_ifaddr(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", br->multicast_query_use_ifaddr); +} + +static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val) +{ + br->multicast_query_use_ifaddr = !!val; + return 0; +} + +static ssize_t +store_multicast_query_use_ifaddr(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_query_use_ifaddr); +} +static DEVICE_ATTR(multicast_query_use_ifaddr, S_IRUGO | S_IWUSR, + show_multicast_query_use_ifaddr, + store_multicast_query_use_ifaddr); + static ssize_t show_multicast_querier(struct device *d, struct device_attribute *attr, char *buf) @@ -734,6 +759,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_multicast_router.attr, &dev_attr_multicast_snooping.attr, &dev_attr_multicast_querier.attr, + &dev_attr_multicast_query_use_ifaddr.attr, &dev_attr_hash_elasticity.attr, &dev_attr_hash_max.attr, &dev_attr_multicast_last_member_count.attr, diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index a1ef1b6e14d..2a2cdb756d5 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -158,6 +158,8 @@ static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); +BRPORT_ATTR_FLAG(learning, BR_LEARNING); +BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -195,6 +197,8 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_hairpin_mode, &brport_attr_bpdu_guard, &brport_attr_root_block, + &brport_attr_learning, + &brport_attr_unicast_flood, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 9878eb8204c..19c37a4929b 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -72,13 +72,12 @@ print_ports(const struct sk_buff *skb, uint8_t protocol, int offset) } static void -ebt_log_packet(u_int8_t pf, unsigned int hooknum, - const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct nf_loginfo *loginfo, - const char *prefix) +ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *loginfo, + const char *prefix) { unsigned int bitmask; - struct net *net = dev_net(in ? in : out); /* FIXME: Disabled from containers until syslog ns is supported */ if (!net_eq(net, &init_net)) @@ -191,7 +190,7 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par) nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in, par->out, &li, "%s", info->prefix); else - ebt_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in, + ebt_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in, par->out, &li, info->prefix); return EBT_CONTINUE; } diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index fc1905c5141..518093802d1 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -131,14 +131,16 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) return skb; } -static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct ebt_ulog_info *uloginfo, const char *prefix) +static void ebt_ulog_packet(struct net *net, unsigned int hooknr, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct ebt_ulog_info *uloginfo, + const char *prefix) { ebt_ulog_packet_msg_t *pm; size_t size, copy_len; struct nlmsghdr *nlh; - struct net *net = dev_net(in ? in : out); struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); unsigned int group = uloginfo->nlgroup; ebt_ulog_buff_t *ub = &ebt->ulog_buffers[group]; @@ -233,7 +235,7 @@ unlock: } /* this function is registered with the netfilter core */ -static void ebt_log_packet(u_int8_t pf, unsigned int hooknum, +static void ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *li, const char *prefix) @@ -252,13 +254,15 @@ static void ebt_log_packet(u_int8_t pf, unsigned int hooknum, strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); } - ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); + ebt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); } static unsigned int ebt_ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) { - ebt_ulog_packet(par->hooknum, skb, par->in, par->out, + struct net *net = dev_net(par->in ? par->in : par->out); + + ebt_ulog_packet(net, par->hooknum, skb, par->in, par->out, par->targinfo, NULL); return EBT_CONTINUE; } @@ -267,6 +271,12 @@ static int ebt_ulog_tg_check(const struct xt_tgchk_param *par) { struct ebt_ulog_info *uloginfo = par->targinfo; + if (!par->net->xt.ebt_ulog_warn_deprecated) { + pr_info("ebt_ulog is deprecated and it will be removed soon, " + "use ebt_nflog instead\n"); + par->net->xt.ebt_ulog_warn_deprecated = true; + } + if (uloginfo->nlgroup > 31) return -EINVAL; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 3d110c4fc78..ac780242838 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1339,7 +1339,7 @@ static inline int ebt_make_matchname(const struct ebt_entry_match *m, /* ebtables expects 32 bytes long names but xt_match names are 29 bytes long. Copy 29 bytes and fill remaining bytes with zeroes. */ - strncpy(name, m->u.match->name, sizeof(name)); + strlcpy(name, m->u.match->name, sizeof(name)); if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; return 0; @@ -1351,7 +1351,7 @@ static inline int ebt_make_watchername(const struct ebt_entry_watcher *w, char __user *hlp = ubase + ((char *)w - base); char name[EBT_FUNCTION_MAXNAMELEN] = {}; - strncpy(name, w->u.watcher->name, sizeof(name)); + strlcpy(name, w->u.watcher->name, sizeof(name)); if (copy_to_user(hlp , name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; return 0; @@ -1377,7 +1377,7 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase) ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase); if (ret != 0) return ret; - strncpy(name, t->u.target->name, sizeof(name)); + strlcpy(name, t->u.target->name, sizeof(name)); if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; return 0; diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 1f9ece1a9c3..4dca159435c 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -352,9 +352,9 @@ EXPORT_SYMBOL(caif_enroll_dev); /* notify Caif of device events */ static int caif_device_notify(struct notifier_block *me, unsigned long what, - void *arg) + void *ptr) { - struct net_device *dev = arg; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct caif_device_entry *caifd = NULL; struct caif_dev_common *caifdev; struct cfcnfg *cfg; diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 942e00a425f..75ed04b78fa 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -121,9 +121,9 @@ static struct packet_type caif_usb_type __read_mostly = { }; static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, - void *arg) + void *ptr) { - struct net_device *dev = arg; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct caif_dev_common common; struct cflayer *layer, *link_support; struct usbnet *usbnet; diff --git a/net/can/af_can.c b/net/can/af_can.c index c4e50852c9f..3ab8dd2e128 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -794,9 +794,9 @@ EXPORT_SYMBOL(can_proto_unregister); * af_can notifier to create/remove CAN netdevice specific structs */ static int can_notifier(struct notifier_block *nb, unsigned long msg, - void *data) + void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dev_rcv_lists *d; if (!net_eq(dev_net(dev), &init_net)) diff --git a/net/can/bcm.c b/net/can/bcm.c index 8f113e6ff32..46f20bfafc0 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1350,9 +1350,9 @@ static int bcm_sendmsg(struct kiocb *iocb, struct socket *sock, * notification handler for netdevice status changes */ static int bcm_notifier(struct notifier_block *nb, unsigned long msg, - void *data) + void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct bcm_sock *bo = container_of(nb, struct bcm_sock, notifier); struct sock *sk = &bo->sk; struct bcm_op *op; diff --git a/net/can/gw.c b/net/can/gw.c index 3ee690e8c7d..2f291f961a1 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -445,9 +445,9 @@ static inline void cgw_unregister_filter(struct cgw_job *gwj) } static int cgw_notifier(struct notifier_block *nb, - unsigned long msg, void *data) + unsigned long msg, void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/can/raw.c b/net/can/raw.c index 1085e65f848..641e1c89512 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -239,9 +239,9 @@ static int raw_enable_allfilters(struct net_device *dev, struct sock *sk) } static int raw_notifier(struct notifier_block *nb, - unsigned long msg, void *data) + unsigned long msg, void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct raw_sock *ro = container_of(nb, struct raw_sock, notifier); struct sock *sk = &ro->sk; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a3395fdfbd4..d5953b87918 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1204,6 +1204,7 @@ void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, mutex_lock(&osdc->request_mutex); if (req->r_linger) { __unregister_linger_request(osdc, req); + req->r_linger = 0; ceph_osdc_put_request(req); } mutex_unlock(&osdc->request_mutex); @@ -2120,7 +2121,9 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); __register_request(osdc, req); - WARN_ON(req->r_sent); + req->r_sent = 0; + req->r_got_reply = 0; + req->r_completed = 0; rc = __map_request(osdc, req, 0); if (rc < 0) { if (nofail) { diff --git a/net/compat.c b/net/compat.c index 79ae8848500..f0a1ba6c808 100644 --- a/net/compat.c +++ b/net/compat.c @@ -734,19 +734,25 @@ static unsigned char nas[21] = { asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags) { - return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags) { + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT); } asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags) { - return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, unsigned int flags) @@ -768,6 +774,9 @@ asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, int datagrams; struct timespec ktspec; + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + if (COMPAT_USE_64BIT_TIME) return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, diff --git a/net/core/datagram.c b/net/core/datagram.c index b71423db778..9cbaba98ce4 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -56,6 +56,7 @@ #include <net/sock.h> #include <net/tcp_states.h> #include <trace/events/skb.h> +#include <net/ll_poll.h> /* * Is a socket 'connection oriented' ? @@ -207,6 +208,9 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, } spin_unlock_irqrestore(&queue->lock, cpu_flags); + if (sk_valid_ll(sk) && sk_poll_ll(sk, flags & MSG_DONTWAIT)) + continue; + /* User doesn't want to wait */ error = -EAGAIN; if (!timeo) diff --git a/net/core/dev.c b/net/core/dev.c index fc1e289397f..fa007dba6be 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -129,6 +129,7 @@ #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> +#include <linux/hashtable.h> #include "net-sysfs.h" @@ -166,6 +167,12 @@ static struct list_head offload_base __read_mostly; DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +/* protects napi_hash addition/deletion and napi_gen_id */ +static DEFINE_SPINLOCK(napi_hash_lock); + +static unsigned int napi_gen_id; +static DEFINE_HASHTABLE(napi_hash, 8); + seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) @@ -1198,9 +1205,7 @@ static int __dev_open(struct net_device *dev) * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ - ret = netpoll_rx_disable(dev); - if (ret) - return ret; + netpoll_rx_disable(dev); ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); ret = notifier_to_errno(ret); @@ -1309,9 +1314,7 @@ static int __dev_close(struct net_device *dev) LIST_HEAD(single); /* Temporarily disable netpoll until the interface is down */ - retval = netpoll_rx_disable(dev); - if (retval) - return retval; + netpoll_rx_disable(dev); list_add(&dev->unreg_list, &single); retval = __dev_close_many(&single); @@ -1353,14 +1356,11 @@ static int dev_close_many(struct list_head *head) */ int dev_close(struct net_device *dev) { - int ret = 0; if (dev->flags & IFF_UP) { LIST_HEAD(single); /* Block netpoll rx while the interface is going down */ - ret = netpoll_rx_disable(dev); - if (ret) - return ret; + netpoll_rx_disable(dev); list_add(&dev->unreg_list, &single); dev_close_many(&single); @@ -1368,7 +1368,7 @@ int dev_close(struct net_device *dev) netpoll_rx_enable(dev); } - return ret; + return 0; } EXPORT_SYMBOL(dev_close); @@ -1398,6 +1398,14 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); +static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, + struct net_device *dev) +{ + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return nb->notifier_call(nb, val, &info); +} static int dev_boot_phase = 1; @@ -1430,7 +1438,7 @@ int register_netdevice_notifier(struct notifier_block *nb) goto unlock; for_each_net(net) { for_each_netdev(net, dev) { - err = nb->notifier_call(nb, NETDEV_REGISTER, dev); + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) goto rollback; @@ -1438,7 +1446,7 @@ int register_netdevice_notifier(struct notifier_block *nb) if (!(dev->flags & IFF_UP)) continue; - nb->notifier_call(nb, NETDEV_UP, dev); + call_netdevice_notifier(nb, NETDEV_UP, dev); } } @@ -1454,10 +1462,11 @@ rollback: goto outroll; if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } @@ -1495,10 +1504,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb) for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } unlock: @@ -1508,6 +1518,25 @@ unlock: EXPORT_SYMBOL(unregister_netdevice_notifier); /** + * call_netdevice_notifiers_info - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @info: notifier information data + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ + +int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, + struct netdev_notifier_info *info) +{ + ASSERT_RTNL(); + netdev_notifier_info_init(info, dev); + return raw_notifier_call_chain(&netdev_chain, val, info); +} +EXPORT_SYMBOL(call_netdevice_notifiers_info); + +/** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function @@ -1518,8 +1547,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - ASSERT_RTNL(); - return raw_notifier_call_chain(&netdev_chain, val, dev); + struct netdev_notifier_info info; + + return call_netdevice_notifiers_info(val, dev, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -1629,7 +1659,6 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) return NET_RX_DROP; } skb->skb_iif = 0; - skb->dev = dev; skb_dst_drop(skb); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; @@ -1702,7 +1731,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || - skb2->network_header > skb2->tail) { + skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); @@ -3065,6 +3094,46 @@ static int rps_ipi_queued(struct softnet_data *sd) return 0; } +#ifdef CONFIG_NET_FLOW_LIMIT +int netdev_flow_limit_table_len __read_mostly = (1 << 12); +#endif + +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +{ +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + struct softnet_data *sd; + unsigned int old_flow, new_flow; + + if (qlen < (netdev_max_backlog >> 1)) + return false; + + sd = &__get_cpu_var(softnet_data); + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) { + new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1); + old_flow = fl->history[fl->history_head]; + fl->history[fl->history_head] = new_flow; + + fl->history_head++; + fl->history_head &= FLOW_LIMIT_HISTORY - 1; + + if (likely(fl->buckets[old_flow])) + fl->buckets[old_flow]--; + + if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { + fl->count++; + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); +#endif + return false; +} + /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). @@ -3074,13 +3143,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, { struct softnet_data *sd; unsigned long flags; + unsigned int qlen; sd = &per_cpu(softnet_data, cpu); local_irq_save(flags); rps_lock(sd); - if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { if (skb_queue_len(&sd->input_pkt_queue)) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); @@ -3828,7 +3899,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; - if (skb->mac_header == skb->tail && + if (skb_mac_header(skb) == skb_tail_pointer(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); @@ -4072,6 +4143,58 @@ void napi_complete(struct napi_struct *n) } EXPORT_SYMBOL(napi_complete); +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} +EXPORT_SYMBOL_GPL(napi_by_id); + +void napi_hash_add(struct napi_struct *napi) +{ + if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + + spin_lock(&napi_hash_lock); + + /* 0 is not a valid id, we also skip an id that is taken + * we expect both events to be extremely rare + */ + napi->napi_id = 0; + while (!napi->napi_id) { + napi->napi_id = ++napi_gen_id; + if (napi_by_id(napi->napi_id)) + napi->napi_id = 0; + } + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); + } +} +EXPORT_SYMBOL_GPL(napi_hash_add); + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +void napi_hash_del(struct napi_struct *napi) +{ + spin_lock(&napi_hash_lock); + + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + hlist_del_rcu(&napi->napi_hash_node); + + spin_unlock(&napi_hash_lock); +} +EXPORT_SYMBOL_GPL(napi_hash_del); + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -4370,7 +4493,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, else list_add_tail_rcu(&upper->list, &dev->upper_dev_list); dev_hold(upper_dev); - + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); return 0; } @@ -4430,6 +4553,7 @@ void netdev_upper_dev_unlink(struct net_device *dev, list_del_rcu(&upper->list); dev_put(upper_dev); kfree_rcu(upper, rcu); + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -4700,8 +4824,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) } if (dev->flags & IFF_UP && - (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) - call_netdevice_notifiers(NETDEV_CHANGE, dev); + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = changes; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); + } } /** @@ -5235,6 +5364,10 @@ int register_netdevice(struct net_device *dev) */ dev->hw_enc_features |= NETIF_F_SG; + /* Make NETIF_F_SG inheritable to MPLS. + */ + dev->mpls_features |= NETIF_F_SG; + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) @@ -6014,7 +6147,7 @@ netdev_features_t netdev_increment_features(netdev_features_t all, } EXPORT_SYMBOL(netdev_increment_features); -static struct hlist_head *netdev_create_hash(void) +static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; @@ -6270,6 +6403,10 @@ static int __init net_dev_init(void) sd->backlog.weight = weight_p; sd->backlog.gro_list = NULL; sd->backlog.gro_count = 0; + +#ifdef CONFIG_NET_FLOW_LIMIT + sd->flow_limit = NULL; +#endif } dev_boot_phase = 0; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index c013f38482a..6cda4e2c213 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -39,6 +39,7 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, ha->refcount = 1; ha->global_use = global; ha->synced = sync; + ha->sync_cnt = 0; list_add_tail_rcu(&ha->list, &list->list); list->count++; @@ -66,7 +67,7 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, } if (sync) { if (ha->synced) - return 0; + return -EEXIST; else ha->synced = true; } @@ -139,10 +140,13 @@ static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, false, true); - if (err) + if (err && err != -EEXIST) return err; - ha->sync_cnt++; - ha->refcount++; + + if (!err) { + ha->sync_cnt++; + ha->refcount++; + } return 0; } @@ -159,7 +163,8 @@ static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, if (err) return; ha->sync_cnt--; - __hw_addr_del_entry(from_list, ha, false, true); + /* address on from list is not marked synced */ + __hw_addr_del_entry(from_list, ha, false, false); } static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, @@ -796,7 +801,7 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) return -EINVAL; netif_addr_lock_nested(to); - err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); + err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index d23b6682f4e..5e78d44333b 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -295,9 +295,9 @@ static int net_dm_cmd_trace(struct sk_buff *skb, } static int dropmon_net_event(struct notifier_block *ev_block, - unsigned long event, void *ptr) + unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dm_hw_stat_delta *new_stat = NULL; struct dm_hw_stat_delta *tmp; diff --git a/net/core/dst.c b/net/core/dst.c index df9cc810ec8..ca4231ec734 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -372,7 +372,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dst_entry *dst, *last = NULL; switch (event) { diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 22efdaa76eb..cd23d314d68 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -82,6 +82,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", + [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", @@ -1413,7 +1414,7 @@ static int ethtool_get_module_eeprom(struct net_device *dev, modinfo.eeprom_len); } -/* The main entry point in this file. Called from net/core/dev.c */ +/* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) { diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index d5a9f8ead0d..21735440c44 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -705,9 +705,9 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) static int fib_rules_event(struct notifier_block *this, unsigned long event, - void *ptr) + void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct fib_rules_ops *ops; diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index d9d198aa9fe..6b5b6e7013c 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -82,7 +82,7 @@ struct gen_estimator { struct list_head list; struct gnet_stats_basic_packed *bstats; - struct gnet_stats_rate_est *rate_est; + struct gnet_stats_rate_est64 *rate_est; spinlock_t *stats_lock; int ewma_log; u64 last_bytes; @@ -167,7 +167,7 @@ static void gen_add_node(struct gen_estimator *est) static struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est *rate_est) + const struct gnet_stats_rate_est64 *rate_est) { struct rb_node *p = est_root.rb_node; @@ -203,7 +203,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * */ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { @@ -258,7 +258,7 @@ EXPORT_SYMBOL(gen_new_estimator); * Note : Caller should respect an RCU grace period before freeing stats_lock */ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est) + struct gnet_stats_rate_est64 *rate_est) { struct gen_estimator *e; @@ -290,7 +290,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * Returns 0 on success or a negative error code. */ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { gen_kill_estimator(bstats, rate_est); @@ -306,7 +306,7 @@ EXPORT_SYMBOL(gen_replace_estimator); * Returns true if estimator is active, and false if not. */ bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est *rate_est) + const struct gnet_stats_rate_est64 *rate_est) { bool res; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index ddedf211e58..9d3d9e78397 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -143,18 +143,30 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, - struct gnet_stats_rate_est *r) + struct gnet_stats_rate_est64 *r) { + struct gnet_stats_rate_est est; + int res; + if (b && !gen_estimator_active(b, r)) return 0; + est.bps = min_t(u64, UINT_MAX, r->bps); + /* we have some time before reaching 2^32 packets per second */ + est.pps = r->pps; + if (d->compat_tc_stats) { - d->tc_stats.bps = r->bps; - d->tc_stats.pps = r->pps; + d->tc_stats.bps = est.bps; + d->tc_stats.pps = est.pps; } - if (d->tail) - return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r)); + if (d->tail) { + res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est)); + if (res < 0 || est.bps == r->bps) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r)); + } return 0; } diff --git a/net/core/iovec.c b/net/core/iovec.c index 7e7aeb01de4..de178e46268 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -75,31 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a /* * Copy kernel to iovec. Returns -EFAULT on error. - * - * Note: this modifies the original iovec. - */ - -int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) -{ - while (len > 0) { - if (iov->iov_len) { - int copy = min_t(unsigned int, iov->iov_len, len); - if (copy_to_user(iov->iov_base, kdata, copy)) - return -EFAULT; - kdata += copy; - len -= copy; - iov->iov_len -= copy; - iov->iov_base += copy; - } - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_toiovec); - -/* - * Copy kernel to iovec. Returns -EFAULT on error. */ int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, @@ -125,31 +100,6 @@ int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, EXPORT_SYMBOL(memcpy_toiovecend); /* - * Copy iovec to kernel. Returns -EFAULT on error. - * - * Note: this modifies the original iovec. - */ - -int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) -{ - while (len > 0) { - if (iov->iov_len) { - int copy = min_t(unsigned int, len, iov->iov_len); - if (copy_from_user(kdata, iov->iov_base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovec); - -/* * Copy iovec from kernel. Returns -EFAULT on error. */ diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 569d355fec3..2bf83299600 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v) static int softnet_seq_show(struct seq_file *seq, void *v) { struct softnet_data *sd = v; + unsigned int flow_limit_count = 0; - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) + flow_limit_count = fl->count; + rcu_read_unlock(); +#endif + + seq_printf(seq, + "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", sd->processed, sd->dropped, sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ - sd->cpu_collision, sd->received_rps); + sd->cpu_collision, sd->received_rps, flow_limit_count); return 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index cec074be8c4..03c8ec3edc7 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -247,7 +247,7 @@ static void netpoll_poll_dev(struct net_device *dev) zap_completion_queue(); } -int netpoll_rx_disable(struct net_device *dev) +void netpoll_rx_disable(struct net_device *dev) { struct netpoll_info *ni; int idx; @@ -257,7 +257,6 @@ int netpoll_rx_disable(struct net_device *dev) if (ni) down(&ni->dev_lock); srcu_read_unlock(&netpoll_srcu, idx); - return 0; } EXPORT_SYMBOL(netpoll_rx_disable); @@ -690,25 +689,20 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo send_skb->dev = skb->dev; skb_reset_network_header(send_skb); - skb_put(send_skb, sizeof(struct ipv6hdr)); - hdr = ipv6_hdr(send_skb); - + hdr = (struct ipv6hdr *) skb_put(send_skb, sizeof(struct ipv6hdr)); *(__be32*)hdr = htonl(0x60000000); - hdr->payload_len = htons(size); hdr->nexthdr = IPPROTO_ICMPV6; hdr->hop_limit = 255; hdr->saddr = *saddr; hdr->daddr = *daddr; - send_skb->transport_header = send_skb->tail; - skb_put(send_skb, size); - - icmp6h = (struct icmp6hdr *)skb_transport_header(skb); + icmp6h = (struct icmp6hdr *) skb_put(send_skb, sizeof(struct icmp6hdr)); icmp6h->icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; icmp6h->icmp6_router = 0; icmp6h->icmp6_solicited = 1; - target = (struct in6_addr *)(skb_transport_header(send_skb) + sizeof(struct icmp6hdr)); + + target = (struct in6_addr *) skb_put(send_skb, sizeof(struct in6_addr)); *target = msg->target; icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, size, IPPROTO_ICMPV6, diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 0777d0aa18c..e533259dce3 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -261,7 +261,7 @@ struct cgroup_subsys net_prio_subsys = { static int netprio_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netprio_map *old; /* diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 11f2704c381..303412d8332 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1921,7 +1921,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d static int pktgen_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id); if (pn->pktgen_exiting) @@ -2708,15 +2708,15 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IP); } - skb->network_header = skb->tail; - skb->transport_header = skb->network_header + sizeof(struct iphdr); - skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); + skb_set_mac_header(skb, 0); + skb_set_network_header(skb, skb->len); + iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr)); + + skb_set_transport_header(skb, skb->len); + udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); skb->priority = pkt_dev->skb_priority; - iph = ip_hdr(skb); - udph = udp_hdr(skb); - memcpy(eth, pkt_dev->hh, 12); *(__be16 *) & eth[12] = protocol; @@ -2746,8 +2746,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, iph->check = 0; iph->check = ip_fast_csum((void *)iph, iph->ihl); skb->protocol = protocol; - skb->mac_header = (skb->network_header - ETH_HLEN - - pkt_dev->pkt_overhead); skb->dev = odev; skb->pkt_type = PACKET_HOST; pktgen_finalize_skb(pkt_dev, skb, datalen); @@ -2822,13 +2820,14 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IPV6); } - skb->network_header = skb->tail; - skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); - skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); + skb_set_mac_header(skb, 0); + skb_set_network_header(skb, skb->len); + iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); + + skb_set_transport_header(skb, skb->len); + udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); skb->priority = pkt_dev->skb_priority; - iph = ipv6_hdr(skb); - udph = udp_hdr(skb); memcpy(eth, pkt_dev->hh, 12); *(__be16 *) ð[12] = protocol; @@ -2863,8 +2862,6 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, iph->daddr = pkt_dev->cur_in6_daddr; iph->saddr = pkt_dev->cur_in6_saddr; - skb->mac_header = (skb->network_header - ETH_HLEN - - pkt_dev->pkt_overhead); skb->protocol = protocol; skb->dev = odev; skb->pkt_type = PACKET_HOST; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a08bd2b7fe3..49c14451d8a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2667,7 +2667,7 @@ static void rtnetlink_rcv(struct sk_buff *skb) static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_UP: diff --git a/net/core/skbuff.c b/net/core/skbuff.c index af9185d0be6..edf37578e21 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -195,13 +195,11 @@ struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) * the tail pointer in struct sk_buff! */ memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->data = NULL; + skb->head = NULL; skb->truesize = sizeof(struct sk_buff); atomic_set(&skb->users, 1); -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = ~0U; -#endif + skb->mac_header = (typeof(skb->mac_header))~0U; out: return skb; } @@ -275,10 +273,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = ~0U; - skb->transport_header = ~0U; -#endif + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); @@ -344,10 +340,8 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = ~0U; - skb->transport_header = ~0U; -#endif + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); @@ -611,7 +605,7 @@ static void skb_release_head_state(struct sk_buff *skb) static void skb_release_all(struct sk_buff *skb) { skb_release_head_state(skb); - if (likely(skb->data)) + if (likely(skb->head)) skb_release_data(skb); } @@ -739,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->vlan_tci = old->vlan_tci; skb_copy_secmark(new, old); + +#ifdef CONFIG_NET_LL_RX_POLL + new->napi_id = old->napi_id; +#endif } /* @@ -911,18 +909,8 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { -#ifndef NET_SKBUFF_DATA_USES_OFFSET - /* - * Shift between the two data areas in bytes - */ - unsigned long offset = new->data - old->data; -#endif - __copy_skb_header(new, old); -#ifndef NET_SKBUFF_DATA_USES_OFFSET - skb_headers_offset_update(new, offset); -#endif skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; @@ -1114,7 +1102,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->end = skb->head + size; #endif skb->tail += off; - skb_headers_offset_update(skb, off); + skb_headers_offset_update(skb, nhead); /* Only adjust this if it actually is csum_start rather than csum */ if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start += nhead; @@ -1209,9 +1197,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, off = newheadroom - oldheadroom; if (n->ip_summed == CHECKSUM_PARTIAL) n->csum_start += off; -#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb_headers_offset_update(n, off); -#endif return n; } @@ -2853,7 +2840,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) doffset + tnl_hlen); if (fskb != skb_shinfo(skb)->frag_list) - continue; + goto perform_csum_check; if (!sg) { nskb->ip_summed = CHECKSUM_NONE; @@ -2917,6 +2904,7 @@ skip_fraglist: nskb->len += nskb->data_len; nskb->truesize += nskb->data_len; +perform_csum_check: if (!csum) { nskb->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); diff --git a/net/core/sock.c b/net/core/sock.c index d4f4cea726e..788c0da5eed 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -139,6 +139,8 @@ #include <net/tcp.h> #endif +#include <net/ll_poll.h> + static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); @@ -210,7 +212,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , - "sk_lock-AF_NFC" , "sk_lock-AF_MAX" + "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -226,7 +228,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , - "slock-AF_NFC" , "slock-AF_MAX" + "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , @@ -242,7 +244,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , - "clock-AF_NFC" , "clock-AF_MAX" + "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX" }; /* @@ -1217,18 +1219,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) #endif } -/* - * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes - * un-modified. Special care is taken when initializing object to zero. - */ -static inline void sk_prot_clear_nulls(struct sock *sk, int size) -{ - if (offsetof(struct sock, sk_node.next) != 0) - memset(sk, 0, offsetof(struct sock, sk_node.next)); - memset(&sk->sk_node.pprev, 0, - size - offsetof(struct sock, sk_node.pprev)); -} - void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) { unsigned long nulls1, nulls2; @@ -2296,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, 0); +#ifdef CONFIG_NET_LL_RX_POLL + sk->sk_napi_id = 0; +#endif + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cfdb46ab3a7..4b48f39582b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -19,6 +19,7 @@ #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> +#include <net/ll_poll.h> static int one = 1; @@ -87,6 +88,96 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, } #endif /* CONFIG_RPS */ +#ifdef CONFIG_NET_FLOW_LIMIT +static DEFINE_MUTEX(flow_limit_update_mutex); + +static int flow_limit_cpu_sysctl(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct sd_flow_limit *cur; + struct softnet_data *sd; + cpumask_var_t mask; + int i, len, ret = 0; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + if (write) { + ret = cpumask_parse_user(buffer, *lenp, mask); + if (ret) + goto done; + + mutex_lock(&flow_limit_update_mutex); + len = sizeof(*cur) + netdev_flow_limit_table_len; + for_each_possible_cpu(i) { + sd = &per_cpu(softnet_data, i); + cur = rcu_dereference_protected(sd->flow_limit, + lockdep_is_held(&flow_limit_update_mutex)); + if (cur && !cpumask_test_cpu(i, mask)) { + RCU_INIT_POINTER(sd->flow_limit, NULL); + synchronize_rcu(); + kfree(cur); + } else if (!cur && cpumask_test_cpu(i, mask)) { + cur = kzalloc(len, GFP_KERNEL); + if (!cur) { + /* not unwinding previous changes */ + ret = -ENOMEM; + goto write_unlock; + } + cur->num_buckets = netdev_flow_limit_table_len; + rcu_assign_pointer(sd->flow_limit, cur); + } + } +write_unlock: + mutex_unlock(&flow_limit_update_mutex); + } else { + if (*ppos || !*lenp) { + *lenp = 0; + goto done; + } + + cpumask_clear(mask); + rcu_read_lock(); + for_each_possible_cpu(i) { + sd = &per_cpu(softnet_data, i); + if (rcu_dereference(sd->flow_limit)) + cpumask_set_cpu(i, mask); + } + rcu_read_unlock(); + + len = cpumask_scnprintf(buffer, *lenp, mask); + *lenp = len + 1; + *ppos += len + 1; + } + +done: + free_cpumask_var(mask); + return ret; +} + +static int flow_limit_table_len_sysctl(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + unsigned int old, *ptr; + int ret; + + mutex_lock(&flow_limit_update_mutex); + + ptr = table->data; + old = *ptr; + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (!ret && write && !is_power_of_2(*ptr)) { + *ptr = old; + ret = -EINVAL; + } + + mutex_unlock(&flow_limit_update_mutex); + return ret; +} +#endif /* CONFIG_NET_FLOW_LIMIT */ + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -180,6 +271,29 @@ static struct ctl_table net_core_table[] = { .proc_handler = rps_sock_flow_sysctl }, #endif +#ifdef CONFIG_NET_FLOW_LIMIT + { + .procname = "flow_limit_cpu_bitmap", + .mode = 0644, + .proc_handler = flow_limit_cpu_sysctl + }, + { + .procname = "flow_limit_table_len", + .data = &netdev_flow_limit_table_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = flow_limit_table_len_sysctl + }, +#endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_LL_RX_POLL + { + .procname = "low_latency_poll", + .data = &sysctl_net_ll_poll, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax + }, +#endif #endif /* CONFIG_NET */ { .procname = "netdev_budget", diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index c21f200eed9..dd4d506ef92 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2078,9 +2078,9 @@ out_err: } static int dn_device_event(struct notifier_block *this, unsigned long event, - void *ptr) + void *ptr) { - struct net_device *dev = (struct net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c index 55e1fd5b3e5..3b9d5f20bd1 100644 --- a/net/ieee802154/6lowpan.c +++ b/net/ieee802154/6lowpan.c @@ -1352,10 +1352,9 @@ static inline void lowpan_netlink_fini(void) } static int lowpan_device_event(struct notifier_block *unused, - unsigned long event, - void *ptr) + unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); LIST_HEAD(del_list); struct lowpan_dev_record *entry, *tmp; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 8603ca82710..37cf1a6ea3a 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -9,10 +9,7 @@ config IP_MULTICAST intend to participate in the MBONE, a high bandwidth network on top of the Internet which carries audio and video broadcasts. More information about the MBONE is on the WWW at - <http://www.savetz.com/mbone/>. Information about the multicast - capabilities of the various network cards is contained in - <file:Documentation/networking/multicast.txt>. For most people, it's - safe to say N. + <http://www.savetz.com/mbone/>. For most people, it's safe to say N. config IP_ADVANCED_ROUTER bool "IP: advanced router" @@ -223,10 +220,8 @@ config IP_MROUTE packets that have several destination addresses. It is needed on the MBONE, a high bandwidth network on top of the Internet which carries audio and video broadcasts. In order to do that, you would most - likely run the program mrouted. Information about the multicast - capabilities of the various network cards is contained in - <file:Documentation/networking/multicast.txt>. If you haven't heard - about it, you don't need it. + likely run the program mrouted. If you haven't heard about it, you + don't need it. config IP_MROUTE_MULTIPLE_TABLES bool "IP: multicast policy routing" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 089cb9f3638..4d3e138c564 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ - datagram.o raw.o udp.o udplite.o \ + tcp_offload.o datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ inet_fragment.o ping.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d01be2a3ae5..7b514290efc 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1295,6 +1295,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_GRE | SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | + SKB_GSO_MPLS | 0))) goto out; @@ -1384,7 +1385,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, goto out_unlock; id = ntohl(*(__be32 *)&iph->id); - flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); + flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; for (p = *head; p; p = p->next) { @@ -1406,6 +1407,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, NAPI_GRO_CB(p)->flush |= (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | + ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) | ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); NAPI_GRO_CB(p)->flush |= flush; @@ -1557,15 +1559,6 @@ static const struct net_protocol tcp_protocol = { .netns_ok = 1, }; -static const struct net_offload tcp_offload = { - .callbacks = { - .gso_send_check = tcp_v4_gso_send_check, - .gso_segment = tcp_tso_segment, - .gro_receive = tcp4_gro_receive, - .gro_complete = tcp4_gro_complete, - }, -}; - static const struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, @@ -1681,8 +1674,8 @@ static int __init ipv4_offload_init(void) */ if (inet_add_offload(&udp_offload, IPPROTO_UDP) < 0) pr_crit("%s: Cannot add UDP protocol offload\n", __func__); - if (inet_add_offload(&tcp_offload, IPPROTO_TCP) < 0) - pr_crit("%s: Cannot add TCP protocol offlaod\n", __func__); + if (tcpv4_offload_init() < 0) + pr_crit("%s: Cannot add TCP protocol offload\n", __func__); dev_add_offload(&ip_packet_offload); return 0; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 2e7f1948216..717902669d2 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -419,12 +419,9 @@ static void ah4_err(struct sk_buff *skb, u32 info) if (!x) return; - if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) { - atomic_inc(&flow_cache_genid); - rt_genid_bump(net); - + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); - } else + else ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); xfrm_state_put(x); } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 247ec1951c3..4429b013f26 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1234,13 +1234,19 @@ out: static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_change_info *change_info; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); rt_cache_flush(dev_net(dev)); break; + case NETDEV_CHANGE: + change_info = ptr; + if (change_info->flags_changed & IFF_NOARP) + neigh_changeaddr(&arp_tbl, dev); + break; default: break; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index dfc39d4d48b..b047e2d8a61 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1333,7 +1333,7 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev, static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 4cfe34d4cc9..ab3d814bc80 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -502,12 +502,9 @@ static void esp4_err(struct sk_buff *skb, u32 info) if (!x) return; - if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) { - atomic_inc(&flow_cache_genid); - rt_genid_bump(net); - + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); - } else + else ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); xfrm_state_put(x); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c7629a209f9..05a4888dede 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1038,7 +1038,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; struct net *net = dev_net(dev); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 76e10b47e05..5f7d11a4587 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -482,7 +482,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct iphdr *iph; int room; - struct icmp_bxm icmp_param; + struct icmp_bxm *icmp_param; struct rtable *rt = skb_rtable(skb_in); struct ipcm_cookie ipc; struct flowi4 fl4; @@ -503,7 +503,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph = ip_hdr(skb_in); if ((u8 *)iph < skb_in->head || - (skb_in->network_header + sizeof(*iph)) > skb_in->tail) + (skb_network_header(skb_in) + sizeof(*iph)) > + skb_tail_pointer(skb_in)) goto out; /* @@ -557,9 +558,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } } + icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC); + if (!icmp_param) + return; + sk = icmp_xmit_lock(net); if (sk == NULL) - return; + goto out_free; /* * Construct source address and options. @@ -585,7 +590,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) IPTOS_PREC_INTERNETCONTROL) : iph->tos; - if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) + if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) goto out_unlock; @@ -593,19 +598,19 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) * Prepare data for ICMP header. */ - icmp_param.data.icmph.type = type; - icmp_param.data.icmph.code = code; - icmp_param.data.icmph.un.gateway = info; - icmp_param.data.icmph.checksum = 0; - icmp_param.skb = skb_in; - icmp_param.offset = skb_network_offset(skb_in); + icmp_param->data.icmph.type = type; + icmp_param->data.icmph.code = code; + icmp_param->data.icmph.un.gateway = info; + icmp_param->data.icmph.checksum = 0; + icmp_param->skb = skb_in; + icmp_param->offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; ipc.addr = iph->saddr; - ipc.opt = &icmp_param.replyopts.opt; + ipc.opt = &icmp_param->replyopts.opt; ipc.tx_flags = 0; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, - type, code, &icmp_param); + type, code, icmp_param); if (IS_ERR(rt)) goto out_unlock; @@ -617,19 +622,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) room = dst_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; + room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); - icmp_param.data_len = skb_in->len - icmp_param.offset; - if (icmp_param.data_len > room) - icmp_param.data_len = room; - icmp_param.head_len = sizeof(struct icmphdr); + icmp_param->data_len = skb_in->len - icmp_param->offset; + if (icmp_param->data_len > room) + icmp_param->data_len = room; + icmp_param->head_len = sizeof(struct icmphdr); - icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); + icmp_push_reply(icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); +out_free: + kfree(icmp_param); out:; } EXPORT_SYMBOL(icmp_send); @@ -657,7 +664,8 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) } /* - * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and + * ICMP_PARAMETERPROB. */ static void icmp_unreach(struct sk_buff *skb) @@ -939,7 +947,8 @@ error: void icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; - struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); + int offset = iph->ihl<<2; + struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); int type = icmp_hdr(skb)->type; int code = icmp_hdr(skb)->code; struct net *net = dev_net(skb->dev); @@ -949,7 +958,7 @@ void icmp_err(struct sk_buff *skb, u32 info) * triggered by ICMP_ECHOREPLY which sent from kernel. */ if (icmph->type != ICMP_ECHOREPLY) { - ping_err(skb, info); + ping_err(skb, offset, info); return; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d8c232794bc..450f625361e 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -363,7 +363,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) static int igmpv3_sendpack(struct sk_buff *skb) { struct igmphdr *pig = igmp_hdr(skb); - const int igmplen = skb->tail - skb->transport_header; + const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb); pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c625e4dad4b..a982657d05e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -235,7 +235,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn; - const struct iphdr *iph = (const struct iphdr *)skb->data; + const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; @@ -281,6 +281,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) else itn = net_generic(net, ipgre_net_id); + iph = (const struct iphdr *)skb->data; t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags, iph->daddr, iph->saddr, tpi.key); @@ -428,7 +429,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, return; } - ip_tunnel_xmit(skb, dev, tnl_params); + ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } static netdev_tx_t ipgre_xmit(struct sk_buff *skb, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 147abf5275a..4bcabf3ab4c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -84,7 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); /* Generate a checksum for an outgoing IP datagram. */ -__inline__ void ip_send_check(struct iphdr *iph) +void ip_send_check(struct iphdr *iph) { iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e4147ec1665..7c79cf8ad44 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -487,7 +487,7 @@ drop: EXPORT_SYMBOL_GPL(ip_tunnel_rcv); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, - const struct iphdr *tnl_params) + const struct iphdr *tnl_params, const u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *inner_iph; @@ -503,6 +503,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, inner_iph = (const struct iphdr *)skb_inner_network_header(skb); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ @@ -658,7 +659,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); /* Push down and install the IP header. */ skb_push(skb, sizeof(struct iphdr)); @@ -670,7 +670,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; iph->frag_off = df; - iph->protocol = tnl_params->protocol; + iph->protocol = protocol; iph->tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); iph->daddr = fl4.daddr; iph->saddr = fl4.saddr; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 59cb8c76905..826be4cb482 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -47,12 +47,9 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) if (!x) return; - if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) { - atomic_inc(&flow_cache_genid); - rt_genid_bump(net); - + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); - } else + else ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); xfrm_state_put(x); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 77bfcce64fe..9df7ecd393f 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -222,7 +222,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->encapsulation = 1; } - ip_tunnel_xmit(skb, dev, tiph); + ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); return NETDEV_TX_OK; tx_error: diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9d9610ae785..132a0966470 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -980,7 +980,7 @@ static int ipmr_cache_report(struct mr_table *mrt, /* Copy the IP header */ - skb->network_header = skb->tail; + skb_set_network_header(skb, skb->len); skb_put(skb, ihl); skb_copy_to_linear_data(skb, pkt->data, ihl); ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ @@ -1609,7 +1609,7 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index e7916c19393..4e902801742 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -111,7 +111,7 @@ config IP_NF_TARGET_REJECT To compile it as a module, choose M here. If unsure, say N. config IP_NF_TARGET_ULOG - tristate "ULOG target support" + tristate "ULOG target support (obsolete)" default m if NETFILTER_ADVANCED=n ---help--- diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 5d5d4d1be9c..30e4de94056 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -108,7 +108,7 @@ static int masq_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - const struct net_device *dev = ptr; + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_DOWN) { @@ -129,7 +129,10 @@ static int masq_inet_event(struct notifier_block *this, void *ptr) { struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; - return masq_device_event(this, event, dev); + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return masq_device_event(this, event, &info); } static struct notifier_block masq_dev_notifier = { diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index f8a222cb644..57c671152c4 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -162,7 +162,8 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) return skb; } -static void ipt_ulog_packet(unsigned int hooknum, +static void ipt_ulog_packet(struct net *net, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -174,7 +175,6 @@ static void ipt_ulog_packet(unsigned int hooknum, size_t size, copy_len; struct nlmsghdr *nlh; struct timeval tv; - struct net *net = dev_net(in ? in : out); struct ulog_net *ulog = ulog_pernet(net); /* ffs == find first bit set, necessary because userspace @@ -231,8 +231,10 @@ static void ipt_ulog_packet(unsigned int hooknum, put_unaligned(tv.tv_usec, &pm->timestamp_usec); put_unaligned(skb->mark, &pm->mark); pm->hook = hooknum; - if (prefix != NULL) - strncpy(pm->prefix, prefix, sizeof(pm->prefix)); + if (prefix != NULL) { + strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1); + pm->prefix[sizeof(pm->prefix) - 1] = '\0'; + } else if (loginfo->prefix[0] != '\0') strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); else @@ -291,12 +293,15 @@ alloc_failure: static unsigned int ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) { - ipt_ulog_packet(par->hooknum, skb, par->in, par->out, + struct net *net = dev_net(par->in ? par->in : par->out); + + ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out, par->targinfo, NULL); return XT_CONTINUE; } -static void ipt_logfn(u_int8_t pf, +static void ipt_logfn(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -318,13 +323,19 @@ static void ipt_logfn(u_int8_t pf, strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); } - ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); + ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); } static int ulog_tg_check(const struct xt_tgchk_param *par) { const struct ipt_ulog_info *loginfo = par->targinfo; + if (!par->net->xt.ulog_warn_deprecated) { + pr_info("ULOG is deprecated and it will be removed soon, " + "use NFLOG instead\n"); + par->net->xt.ulog_warn_deprecated = true; + } + if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { pr_debug("prefix not null-terminated\n"); return -EINVAL; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 7d93d62cd5f..1f1b2dd9027 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -33,7 +33,6 @@ #include <linux/netdevice.h> #include <net/snmp.h> #include <net/ip.h> -#include <net/ipv6.h> #include <net/icmp.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -46,8 +45,18 @@ #include <net/inet_common.h> #include <net/checksum.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <linux/in6.h> +#include <linux/icmpv6.h> +#include <net/addrconf.h> +#include <net/ipv6.h> +#include <net/transp_v6.h> +#endif -static struct ping_table ping_table; + +struct ping_table ping_table; +struct pingv6_ops pingv6_ops; +EXPORT_SYMBOL_GPL(pingv6_ops); static u16 ping_port_rover; @@ -58,6 +67,7 @@ static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int ma pr_debug("hash(%d) = %d\n", num, res); return res; } +EXPORT_SYMBOL_GPL(ping_hash); static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned int num) @@ -65,7 +75,7 @@ static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; } -static int ping_v4_get_port(struct sock *sk, unsigned short ident) +int ping_get_port(struct sock *sk, unsigned short ident) { struct hlist_nulls_node *node; struct hlist_nulls_head *hlist; @@ -103,6 +113,10 @@ next_port: ping_portaddr_for_each_entry(sk2, node, hlist) { isk2 = inet_sk(sk2); + /* BUG? Why is this reuse and not reuseaddr? ping.c + * doesn't turn off SO_REUSEADDR, and it doesn't expect + * that other ping processes can steal its packets. + */ if ((isk2->inet_num == ident) && (sk2 != sk) && (!sk2->sk_reuse || !sk->sk_reuse)) @@ -125,17 +139,18 @@ fail: write_unlock_bh(&ping_table.lock); return 1; } +EXPORT_SYMBOL_GPL(ping_get_port); -static void ping_v4_hash(struct sock *sk) +void ping_hash(struct sock *sk) { - pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); + pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); BUG(); /* "Please do not press this button again." */ } -static void ping_v4_unhash(struct sock *sk) +void ping_unhash(struct sock *sk) { struct inet_sock *isk = inet_sk(sk); - pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); + pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); if (sk_hashed(sk)) { write_lock_bh(&ping_table.lock); hlist_nulls_del(&sk->sk_nulls_node); @@ -146,31 +161,61 @@ static void ping_v4_unhash(struct sock *sk) write_unlock_bh(&ping_table.lock); } } +EXPORT_SYMBOL_GPL(ping_unhash); -static struct sock *ping_v4_lookup(struct net *net, __be32 saddr, __be32 daddr, - u16 ident, int dif) +static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) { struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); struct sock *sk = NULL; struct inet_sock *isk; struct hlist_nulls_node *hnode; + int dif = skb->dev->ifindex; + + if (skb->protocol == htons(ETH_P_IP)) { + pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", + (int)ident, &ip_hdr(skb)->daddr, dif); +#if IS_ENABLED(CONFIG_IPV6) + } else if (skb->protocol == htons(ETH_P_IPV6)) { + pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n", + (int)ident, &ipv6_hdr(skb)->daddr, dif); +#endif + } - pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", - (int)ident, &daddr, dif); read_lock_bh(&ping_table.lock); ping_portaddr_for_each_entry(sk, hnode, hslot) { isk = inet_sk(sk); - pr_debug("found: %p: num = %d, daddr = %pI4, dif = %d\n", sk, - (int)isk->inet_num, &isk->inet_rcv_saddr, - sk->sk_bound_dev_if); - pr_debug("iterate\n"); if (isk->inet_num != ident) continue; - if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr) - continue; + + if (skb->protocol == htons(ETH_P_IP) && + sk->sk_family == AF_INET) { + pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk, + (int) isk->inet_num, &isk->inet_rcv_saddr, + sk->sk_bound_dev_if); + + if (isk->inet_rcv_saddr && + isk->inet_rcv_saddr != ip_hdr(skb)->daddr) + continue; +#if IS_ENABLED(CONFIG_IPV6) + } else if (skb->protocol == htons(ETH_P_IPV6) && + sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk, + (int) isk->inet_num, + &inet6_sk(sk)->rcv_saddr, + sk->sk_bound_dev_if); + + if (!ipv6_addr_any(&np->rcv_saddr) && + !ipv6_addr_equal(&np->rcv_saddr, + &ipv6_hdr(skb)->daddr)) + continue; +#endif + } + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) continue; @@ -200,7 +245,7 @@ static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, } -static int ping_init_sock(struct sock *sk) +int ping_init_sock(struct sock *sk) { struct net *net = sock_net(sk); kgid_t group = current_egid(); @@ -225,8 +270,9 @@ static int ping_init_sock(struct sock *sk) return -EACCES; } +EXPORT_SYMBOL_GPL(ping_init_sock); -static void ping_close(struct sock *sk, long timeout) +void ping_close(struct sock *sk, long timeout) { pr_debug("ping_close(sk=%p,sk->num=%u)\n", inet_sk(sk), inet_sk(sk)->inet_num); @@ -234,36 +280,122 @@ static void ping_close(struct sock *sk, long timeout) sk_common_release(sk); } +EXPORT_SYMBOL_GPL(ping_close); + +/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ +int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, + struct sockaddr *uaddr, int addr_len) { + struct net *net = sock_net(sk); + if (sk->sk_family == AF_INET) { + struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; + int chk_addr_ret; + + if (addr_len < sizeof(*addr)) + return -EINVAL; + + pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", + sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); + + chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); + + if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) + chk_addr_ret = RTN_LOCAL; + + if ((sysctl_ip_nonlocal_bind == 0 && + isk->freebind == 0 && isk->transparent == 0 && + chk_addr_ret != RTN_LOCAL) || + chk_addr_ret == RTN_MULTICAST || + chk_addr_ret == RTN_BROADCAST) + return -EADDRNOTAVAIL; + +#if IS_ENABLED(CONFIG_IPV6) + } else if (sk->sk_family == AF_INET6) { + struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; + int addr_type, scoped, has_addr; + struct net_device *dev = NULL; + + if (addr_len < sizeof(*addr)) + return -EINVAL; + + pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", + sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); + + addr_type = ipv6_addr_type(&addr->sin6_addr); + scoped = __ipv6_addr_needs_scope_id(addr_type); + if ((addr_type != IPV6_ADDR_ANY && + !(addr_type & IPV6_ADDR_UNICAST)) || + (scoped && !addr->sin6_scope_id)) + return -EINVAL; + + rcu_read_lock(); + if (addr->sin6_scope_id) { + dev = dev_get_by_index_rcu(net, addr->sin6_scope_id); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + } + has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev, + scoped); + rcu_read_unlock(); + + if (!(isk->freebind || isk->transparent || has_addr || + addr_type == IPV6_ADDR_ANY)) + return -EADDRNOTAVAIL; + + if (scoped) + sk->sk_bound_dev_if = addr->sin6_scope_id; +#endif + } else { + return -EAFNOSUPPORT; + } + return 0; +} + +void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) +{ + if (saddr->sa_family == AF_INET) { + struct inet_sock *isk = inet_sk(sk); + struct sockaddr_in *addr = (struct sockaddr_in *) saddr; + isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; +#if IS_ENABLED(CONFIG_IPV6) + } else if (saddr->sa_family == AF_INET6) { + struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr; + struct ipv6_pinfo *np = inet6_sk(sk); + np->rcv_saddr = np->saddr = addr->sin6_addr; +#endif + } +} +void ping_clear_saddr(struct sock *sk, int dif) +{ + sk->sk_bound_dev_if = dif; + if (sk->sk_family == AF_INET) { + struct inet_sock *isk = inet_sk(sk); + isk->inet_rcv_saddr = isk->inet_saddr = 0; +#if IS_ENABLED(CONFIG_IPV6) + } else if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + memset(&np->rcv_saddr, 0, sizeof(np->rcv_saddr)); + memset(&np->saddr, 0, sizeof(np->saddr)); +#endif + } +} /* * We need our own bind because there are no privileged id's == local ports. * Moreover, we don't allow binding to multi- and broadcast addresses. */ -static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *isk = inet_sk(sk); unsigned short snum; - int chk_addr_ret; int err; + int dif = sk->sk_bound_dev_if; - if (addr_len < sizeof(struct sockaddr_in)) - return -EINVAL; - - pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n", - sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); - - chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); - if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) - chk_addr_ret = RTN_LOCAL; - - if ((sysctl_ip_nonlocal_bind == 0 && - isk->freebind == 0 && isk->transparent == 0 && - chk_addr_ret != RTN_LOCAL) || - chk_addr_ret == RTN_MULTICAST || - chk_addr_ret == RTN_BROADCAST) - return -EADDRNOTAVAIL; + err = ping_check_bind_addr(sk, isk, uaddr, addr_len); + if (err) + return err; lock_sock(sk); @@ -272,42 +404,50 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto out; err = -EADDRINUSE; - isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; - snum = ntohs(addr->sin_port); - if (ping_v4_get_port(sk, snum) != 0) { - isk->inet_saddr = isk->inet_rcv_saddr = 0; + ping_set_saddr(sk, uaddr); + snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); + if (ping_get_port(sk, snum) != 0) { + ping_clear_saddr(sk, dif); goto out; } - pr_debug("after bind(): num = %d, daddr = %pI4, dif = %d\n", + pr_debug("after bind(): num = %d, dif = %d\n", (int)isk->inet_num, - &isk->inet_rcv_saddr, (int)sk->sk_bound_dev_if); err = 0; - if (isk->inet_rcv_saddr) + if ((sk->sk_family == AF_INET && isk->inet_rcv_saddr) || + (sk->sk_family == AF_INET6 && + !ipv6_addr_any(&inet6_sk(sk)->rcv_saddr))) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; isk->inet_sport = htons(isk->inet_num); isk->inet_daddr = 0; isk->inet_dport = 0; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + memset(&inet6_sk(sk)->daddr, 0, sizeof(inet6_sk(sk)->daddr)); +#endif + sk_dst_reset(sk); out: release_sock(sk); pr_debug("ping_v4_bind -> %d\n", err); return err; } +EXPORT_SYMBOL_GPL(ping_bind); /* * Is this a supported type of ICMP message? */ -static inline int ping_supported(int type, int code) +static inline int ping_supported(int family, int type, int code) { - if (type == ICMP_ECHO && code == 0) - return 1; - return 0; + return (family == AF_INET && type == ICMP_ECHO && code == 0) || + (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0); } /* @@ -315,30 +455,42 @@ static inline int ping_supported(int type, int code) * sort of error condition. */ -static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); - -void ping_err(struct sk_buff *skb, u32 info) +void ping_err(struct sk_buff *skb, int offset, u32 info) { - struct iphdr *iph = (struct iphdr *)skb->data; - struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); + int family; + struct icmphdr *icmph; struct inet_sock *inet_sock; - int type = icmp_hdr(skb)->type; - int code = icmp_hdr(skb)->code; + int type; + int code; struct net *net = dev_net(skb->dev); struct sock *sk; int harderr; int err; + if (skb->protocol == htons(ETH_P_IP)) { + family = AF_INET; + type = icmp_hdr(skb)->type; + code = icmp_hdr(skb)->code; + icmph = (struct icmphdr *)(skb->data + offset); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + family = AF_INET6; + type = icmp6_hdr(skb)->icmp6_type; + code = icmp6_hdr(skb)->icmp6_code; + icmph = (struct icmphdr *) (skb->data + offset); + } else { + BUG(); + } + /* We assume the packet has already been checked by icmp_unreach */ - if (!ping_supported(icmph->type, icmph->code)) + if (!ping_supported(family, icmph->type, icmph->code)) return; - pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type, - code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); + pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n", + skb->protocol, type, code, ntohs(icmph->un.echo.id), + ntohs(icmph->un.echo.sequence)); - sk = ping_v4_lookup(net, iph->daddr, iph->saddr, - ntohs(icmph->un.echo.id), skb->dev->ifindex); + sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (sk == NULL) { pr_debug("no socket, dropping\n"); return; /* No socket for error */ @@ -349,72 +501,83 @@ void ping_err(struct sk_buff *skb, u32 info) harderr = 0; inet_sock = inet_sk(sk); - switch (type) { - default: - case ICMP_TIME_EXCEEDED: - err = EHOSTUNREACH; - break; - case ICMP_SOURCE_QUENCH: - /* This is not a real error but ping wants to see it. - * Report it with some fake errno. */ - err = EREMOTEIO; - break; - case ICMP_PARAMETERPROB: - err = EPROTO; - harderr = 1; - break; - case ICMP_DEST_UNREACH: - if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ - ipv4_sk_update_pmtu(skb, sk, info); - if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { - err = EMSGSIZE; - harderr = 1; - break; + if (skb->protocol == htons(ETH_P_IP)) { + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + /* This is not a real error but ping wants to see it. + * Report it with some fake errno. + */ + err = EREMOTEIO; + break; + case ICMP_PARAMETERPROB: + err = EPROTO; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + ipv4_sk_update_pmtu(skb, sk, info); + if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { + err = EMSGSIZE; + harderr = 1; + break; + } + goto out; } - goto out; - } - err = EHOSTUNREACH; - if (code <= NR_ICMP_UNREACH) { - harderr = icmp_err_convert[code].fatal; - err = icmp_err_convert[code].errno; + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + case ICMP_REDIRECT: + /* See ICMP_SOURCE_QUENCH */ + ipv4_sk_redirect(skb, sk); + err = EREMOTEIO; + break; } - break; - case ICMP_REDIRECT: - /* See ICMP_SOURCE_QUENCH */ - ipv4_sk_redirect(skb, sk); - err = EREMOTEIO; - break; +#if IS_ENABLED(CONFIG_IPV6) + } else if (skb->protocol == htons(ETH_P_IPV6)) { + harderr = pingv6_ops.icmpv6_err_convert(type, code, &err); +#endif } /* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ - if (!inet_sock->recverr) { + if ((family == AF_INET && !inet_sock->recverr) || + (family == AF_INET6 && !inet6_sk(sk)->recverr)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { - ip_icmp_error(sk, skb, err, 0 /* no remote port */, - info, (u8 *)icmph); + if (family == AF_INET) { + ip_icmp_error(sk, skb, err, 0 /* no remote port */, + info, (u8 *)icmph); +#if IS_ENABLED(CONFIG_IPV6) + } else if (family == AF_INET6) { + pingv6_ops.ipv6_icmp_error(sk, skb, err, 0, + info, (u8 *)icmph); +#endif + } } sk->sk_err = err; sk->sk_error_report(sk); out: sock_put(sk); } +EXPORT_SYMBOL_GPL(ping_err); /* - * Copy and checksum an ICMP Echo packet from user space into a buffer. + * Copy and checksum an ICMP Echo packet from user space into a buffer + * starting from the payload. */ -struct pingfakehdr { - struct icmphdr icmph; - struct iovec *iov; - __wsum wcheck; -}; - -static int ping_getfrag(void *from, char *to, - int offset, int fraglen, int odd, struct sk_buff *skb) +int ping_getfrag(void *from, char *to, + int offset, int fraglen, int odd, struct sk_buff *skb) { struct pingfakehdr *pfh = (struct pingfakehdr *)from; @@ -425,20 +588,33 @@ static int ping_getfrag(void *from, char *to, pfh->iov, 0, fraglen - sizeof(struct icmphdr), &pfh->wcheck)) return -EFAULT; + } else if (offset < sizeof(struct icmphdr)) { + BUG(); + } else { + if (csum_partial_copy_fromiovecend + (to, pfh->iov, offset - sizeof(struct icmphdr), + fraglen, &pfh->wcheck)) + return -EFAULT; + } - return 0; +#if IS_ENABLED(CONFIG_IPV6) + /* For IPv6, checksum each skb as we go along, as expected by + * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in + * wcheck, it will be finalized in ping_v4_push_pending_frames. + */ + if (pfh->family == AF_INET6) { + skb->csum = pfh->wcheck; + skb->ip_summed = CHECKSUM_NONE; + pfh->wcheck = 0; } - if (offset < sizeof(struct icmphdr)) - BUG(); - if (csum_partial_copy_fromiovecend - (to, pfh->iov, offset - sizeof(struct icmphdr), - fraglen, &pfh->wcheck)) - return -EFAULT; +#endif + return 0; } +EXPORT_SYMBOL_GPL(ping_getfrag); -static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, - struct flowi4 *fl4) +static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, + struct flowi4 *fl4) { struct sk_buff *skb = skb_peek(&sk->sk_write_queue); @@ -450,24 +626,9 @@ static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, return ip_push_pending_frames(sk, fl4); } -static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) -{ - struct net *net = sock_net(sk); - struct flowi4 fl4; - struct inet_sock *inet = inet_sk(sk); - struct ipcm_cookie ipc; - struct icmphdr user_icmph; - struct pingfakehdr pfh; - struct rtable *rt = NULL; - struct ip_options_data opt_copy; - int free = 0; - __be32 saddr, daddr, faddr; - u8 tos; - int err; - - pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); - +int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, + void *user_icmph, size_t icmph_len) { + u8 type, code; if (len > 0xFFFF) return -EMSGSIZE; @@ -482,15 +643,53 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* * Fetch the ICMP header provided by the userland. - * iovec is modified! + * iovec is modified! The ICMP header is consumed. */ - - if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov, - sizeof(struct icmphdr))) + if (memcpy_fromiovec(user_icmph, msg->msg_iov, icmph_len)) return -EFAULT; - if (!ping_supported(user_icmph.type, user_icmph.code)) + + if (family == AF_INET) { + type = ((struct icmphdr *) user_icmph)->type; + code = ((struct icmphdr *) user_icmph)->code; +#if IS_ENABLED(CONFIG_IPV6) + } else if (family == AF_INET6) { + type = ((struct icmp6hdr *) user_icmph)->icmp6_type; + code = ((struct icmp6hdr *) user_icmph)->icmp6_code; +#endif + } else { + BUG(); + } + + if (!ping_supported(family, type, code)) return -EINVAL; + return 0; +} +EXPORT_SYMBOL_GPL(ping_common_sendmsg); + +int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct net *net = sock_net(sk); + struct flowi4 fl4; + struct inet_sock *inet = inet_sk(sk); + struct ipcm_cookie ipc; + struct icmphdr user_icmph; + struct pingfakehdr pfh; + struct rtable *rt = NULL; + struct ip_options_data opt_copy; + int free = 0; + __be32 saddr, daddr, faddr; + u8 tos; + int err; + + pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); + + err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph, + sizeof(user_icmph)); + if (err) + return err; + /* * Get and verify the address. */ @@ -595,13 +794,14 @@ back_from_confirm: pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; pfh.iov = msg->msg_iov; pfh.wcheck = 0; + pfh.family = AF_INET; err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else - err = ping_push_pending_frames(sk, &pfh, &fl4); + err = ping_v4_push_pending_frames(sk, &pfh, &fl4); release_sock(sk); out: @@ -622,11 +822,13 @@ do_confirm: goto out; } -static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len) +int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) { struct inet_sock *isk = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + int family = sk->sk_family; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; struct sk_buff *skb; int copied, err; @@ -636,11 +838,22 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (flags & MSG_OOB) goto out; - if (addr_len) - *addr_len = sizeof(*sin); + if (addr_len) { + if (family == AF_INET) + *addr_len = sizeof(*sin); + else if (family == AF_INET6 && addr_len) + *addr_len = sizeof(*sin6); + } - if (flags & MSG_ERRQUEUE) - return ip_recv_error(sk, msg, len); + if (flags & MSG_ERRQUEUE) { + if (family == AF_INET) { + return ip_recv_error(sk, msg, len); +#if IS_ENABLED(CONFIG_IPV6) + } else if (family == AF_INET6) { + return pingv6_ops.ipv6_recv_error(sk, msg, len); +#endif + } + } skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) @@ -659,15 +872,40 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sock_recv_timestamp(msg, sk, skb); - /* Copy the address. */ - if (sin) { + /* Copy the address and add cmsg data. */ + if (family == AF_INET) { + sin = (struct sockaddr_in *) msg->msg_name; sin->sin_family = AF_INET; sin->sin_port = 0 /* skb->h.uh->source */; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + + if (isk->cmsg_flags) + ip_cmsg_recv(msg, skb); + +#if IS_ENABLED(CONFIG_IPV6) + } else if (family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *ip6 = ipv6_hdr(skb); + sin6 = (struct sockaddr_in6 *) msg->msg_name; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_addr = ip6->saddr; + + sin6->sin6_flowinfo = 0; + if (np->sndflow) + sin6->sin6_flowinfo = ip6_flowinfo(ip6); + + sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, + IP6CB(skb)->iif); + + if (inet6_sk(sk)->rxopt.all) + pingv6_ops.ip6_datagram_recv_ctl(sk, msg, skb); +#endif + } else { + BUG(); } - if (isk->cmsg_flags) - ip_cmsg_recv(msg, skb); + err = copied; done: @@ -676,8 +914,9 @@ out: pr_debug("ping_recvmsg -> %d\n", err); return err; } +EXPORT_SYMBOL_GPL(ping_recvmsg); -static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", inet_sk(sk), inet_sk(sk)->inet_num, skb); @@ -688,6 +927,7 @@ static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } return 0; } +EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); /* @@ -698,10 +938,7 @@ void ping_rcv(struct sk_buff *skb) { struct sock *sk; struct net *net = dev_net(skb->dev); - struct iphdr *iph = ip_hdr(skb); struct icmphdr *icmph = icmp_hdr(skb); - __be32 saddr = iph->saddr; - __be32 daddr = iph->daddr; /* We assume the packet has already been checked by icmp_rcv */ @@ -711,8 +948,7 @@ void ping_rcv(struct sk_buff *skb) /* Push ICMP header back */ skb_push(skb, skb->data - (u8 *)icmph); - sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id), - skb->dev->ifindex); + sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (sk != NULL) { pr_debug("rcv on socket %p\n", sk); ping_queue_rcv_skb(sk, skb_get(skb)); @@ -723,6 +959,7 @@ void ping_rcv(struct sk_buff *skb) /* We're called from icmp_rcv(). kfree_skb() is done there. */ } +EXPORT_SYMBOL_GPL(ping_rcv); struct proto ping_prot = { .name = "PING", @@ -733,14 +970,14 @@ struct proto ping_prot = { .disconnect = udp_disconnect, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .sendmsg = ping_sendmsg, + .sendmsg = ping_v4_sendmsg, .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, - .hash = ping_v4_hash, - .unhash = ping_v4_unhash, - .get_port = ping_v4_get_port, + .hash = ping_hash, + .unhash = ping_unhash, + .get_port = ping_get_port, .obj_size = sizeof(struct inet_sock), }; EXPORT_SYMBOL(ping_prot); @@ -764,7 +1001,8 @@ static struct sock *ping_get_first(struct seq_file *seq, int start) continue; sk_nulls_for_each(sk, node, hslot) { - if (net_eq(sock_net(sk), net)) + if (net_eq(sock_net(sk), net) && + sk->sk_family == state->family) goto found; } } @@ -797,17 +1035,24 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) return pos ? NULL : sk; } -static void *ping_seq_start(struct seq_file *seq, loff_t *pos) +void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) { struct ping_iter_state *state = seq->private; state->bucket = 0; + state->family = family; read_lock_bh(&ping_table.lock); return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } +EXPORT_SYMBOL_GPL(ping_seq_start); + +static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos) +{ + return ping_seq_start(seq, pos, AF_INET); +} -static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) +void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; @@ -819,13 +1064,15 @@ static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; return sk; } +EXPORT_SYMBOL_GPL(ping_seq_next); -static void ping_seq_stop(struct seq_file *seq, void *v) +void ping_seq_stop(struct seq_file *seq, void *v) { read_unlock_bh(&ping_table.lock); } +EXPORT_SYMBOL_GPL(ping_seq_stop); -static void ping_format_sock(struct sock *sp, struct seq_file *f, +static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, int bucket, int *len) { struct inet_sock *inet = inet_sk(sp); @@ -846,7 +1093,7 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f, atomic_read(&sp->sk_drops), len); } -static int ping_seq_show(struct seq_file *seq, void *v) +static int ping_v4_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%-127s\n", @@ -857,72 +1104,86 @@ static int ping_seq_show(struct seq_file *seq, void *v) struct ping_iter_state *state = seq->private; int len; - ping_format_sock(v, seq, state->bucket, &len); + ping_v4_format_sock(v, seq, state->bucket, &len); seq_printf(seq, "%*s\n", 127 - len, ""); } return 0; } -static const struct seq_operations ping_seq_ops = { - .show = ping_seq_show, - .start = ping_seq_start, +static const struct seq_operations ping_v4_seq_ops = { + .show = ping_v4_seq_show, + .start = ping_v4_seq_start, .next = ping_seq_next, .stop = ping_seq_stop, }; static int ping_seq_open(struct inode *inode, struct file *file) { - return seq_open_net(inode, file, &ping_seq_ops, + struct ping_seq_afinfo *afinfo = PDE_DATA(inode); + return seq_open_net(inode, file, &afinfo->seq_ops, sizeof(struct ping_iter_state)); } -static const struct file_operations ping_seq_fops = { +const struct file_operations ping_seq_fops = { .open = ping_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, }; +EXPORT_SYMBOL_GPL(ping_seq_fops); + +static struct ping_seq_afinfo ping_v4_seq_afinfo = { + .name = "icmp", + .family = AF_INET, + .seq_fops = &ping_seq_fops, + .seq_ops = { + .start = ping_v4_seq_start, + .show = ping_v4_seq_show, + .next = ping_seq_next, + .stop = ping_seq_stop, + }, +}; -static int ping_proc_register(struct net *net) +int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo) { struct proc_dir_entry *p; - int rc = 0; - - p = proc_create("icmp", S_IRUGO, net->proc_net, &ping_seq_fops); + p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + afinfo->seq_fops, afinfo); if (!p) - rc = -ENOMEM; - return rc; + return -ENOMEM; + return 0; } +EXPORT_SYMBOL_GPL(ping_proc_register); -static void ping_proc_unregister(struct net *net) +void ping_proc_unregister(struct net *net, struct ping_seq_afinfo *afinfo) { - remove_proc_entry("icmp", net->proc_net); + remove_proc_entry(afinfo->name, net->proc_net); } +EXPORT_SYMBOL_GPL(ping_proc_unregister); - -static int __net_init ping_proc_init_net(struct net *net) +static int __net_init ping_v4_proc_init_net(struct net *net) { - return ping_proc_register(net); + return ping_proc_register(net, &ping_v4_seq_afinfo); } -static void __net_exit ping_proc_exit_net(struct net *net) +static void __net_exit ping_v4_proc_exit_net(struct net *net) { - ping_proc_unregister(net); + ping_proc_unregister(net, &ping_v4_seq_afinfo); } -static struct pernet_operations ping_net_ops = { - .init = ping_proc_init_net, - .exit = ping_proc_exit_net, +static struct pernet_operations ping_v4_net_ops = { + .init = ping_v4_proc_init_net, + .exit = ping_v4_proc_exit_net, }; int __init ping_proc_init(void) { - return register_pernet_subsys(&ping_net_ops); + return register_pernet_subsys(&ping_v4_net_ops); } void ping_proc_exit(void) { - unregister_pernet_subsys(&ping_net_ops); + unregister_pernet_subsys(&ping_v4_net_ops); } #endif diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2a5bf86d241..6577a1149a4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), + SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 550781a17b3..198ea596f2d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -594,11 +594,25 @@ static inline u32 fnhe_hashfun(__be32 daddr) return hval & (FNHE_HASH_SIZE - 1); } +static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) +{ + rt->rt_pmtu = fnhe->fnhe_pmtu; + rt->dst.expires = fnhe->fnhe_expires; + + if (fnhe->fnhe_gw) { + rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_gateway = fnhe->fnhe_gw; + rt->rt_uses_gateway = 1; + } +} + static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, u32 pmtu, unsigned long expires) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; + struct rtable *rt; + unsigned int i; int depth; u32 hval = fnhe_hashfun(daddr); @@ -627,8 +641,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_gw = gw; if (pmtu) { fnhe->fnhe_pmtu = pmtu; - fnhe->fnhe_expires = expires; + fnhe->fnhe_expires = max(1UL, expires); } + /* Update all cached dsts too */ + rt = rcu_dereference(fnhe->fnhe_rth); + if (rt) + fill_route_from_fnhe(rt, fnhe); } else { if (depth > FNHE_RECLAIM_DEPTH) fnhe = fnhe_oldest(hash); @@ -640,10 +658,23 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_next = hash->chain; rcu_assign_pointer(hash->chain, fnhe); } + fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev)); fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_expires = expires; + + /* Exception created; mark the cached routes for the nexthop + * stale, so anyone caching it rechecks if this exception + * applies to them. + */ + for_each_possible_cpu(i) { + struct rtable __rcu **prt; + prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i); + rt = rcu_dereference(*prt); + if (rt) + rt->dst.obsolete = DST_OBSOLETE_KILL; + } } fnhe->fnhe_stamp = jiffies; @@ -737,10 +768,15 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf { struct rtable *rt; struct flowi4 fl4; + const struct iphdr *iph = (const struct iphdr *) skb->data; + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; rt = (struct rtable *) dst; - ip_rt_build_flow_key(&fl4, sk, skb); + __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } @@ -917,12 +953,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; - if (!rt->rt_pmtu) { - dst->obsolete = DST_OBSOLETE_KILL; - } else { - rt->rt_pmtu = mtu; - dst->expires = max(1UL, jiffies + ip_rt_mtu_expires); - } + if (rt->rt_pmtu == mtu && + time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2)) + return; rcu_read_lock(); if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { @@ -1063,11 +1096,11 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. * - * When a PMTU/redirect information update invalidates a - * route, this is indicated by setting obsolete to - * DST_OBSOLETE_KILL. + * When a PMTU/redirect information update invalidates a route, + * this is indicated by setting obsolete to DST_OBSOLETE_KILL or + * DST_OBSOLETE_DEAD by dst_free(). */ - if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt)) + if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) return NULL; return dst; } @@ -1209,26 +1242,17 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, spin_lock_bh(&fnhe_lock); if (daddr == fnhe->fnhe_daddr) { + int genid = fnhe_genid(dev_net(rt->dst.dev)); struct rtable *orig = rcu_dereference(fnhe->fnhe_rth); - if (orig && rt_is_expired(orig)) { + + if (fnhe->fnhe_genid != genid) { + fnhe->fnhe_genid = genid; fnhe->fnhe_gw = 0; fnhe->fnhe_pmtu = 0; fnhe->fnhe_expires = 0; } - if (fnhe->fnhe_pmtu) { - unsigned long expires = fnhe->fnhe_expires; - unsigned long diff = expires - jiffies; - - if (time_before(jiffies, expires)) { - rt->rt_pmtu = fnhe->fnhe_pmtu; - dst_set_expires(&rt->dst, diff); - } - } - if (fnhe->fnhe_gw) { - rt->rt_flags |= RTCF_REDIRECTED; - rt->rt_gateway = fnhe->fnhe_gw; - rt->rt_uses_gateway = 1; - } else if (!rt->rt_gateway) + fill_route_from_fnhe(rt, fnhe); + if (!rt->rt_gateway) rt->rt_gateway = daddr; rcu_assign_pointer(fnhe->fnhe_rth, rt); @@ -2428,8 +2452,11 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = (struct net *)__ctl->extra1; + if (write) { - rt_cache_flush((struct net *)__ctl->extra1); + rt_cache_flush(net); + fnhe_genid_bump(net); return 0; } @@ -2604,6 +2631,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { static __net_init int rt_genid_init(struct net *net) { atomic_set(&net->rt_genid, 0); + atomic_set(&net->fnhe_genid, 0); get_random_bytes(&net->ipv4.dev_addr_genid, sizeof(net->ipv4.dev_addr_genid)); return 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dcb116dde21..46ed9afd1f5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,6 +279,7 @@ #include <asm/uaccess.h> #include <asm/ioctls.h> +#include <net/ll_poll.h> int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; @@ -436,6 +437,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + sock_rps_record_flow(sk); + sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == TCP_LISTEN) return inet_csk_listen_poll(sk); @@ -1551,6 +1554,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct sk_buff *skb; u32 urg_hole = 0; + if (sk_valid_ll(sk) && skb_queue_empty(&sk->sk_receive_queue) + && (sk->sk_state == TCP_ESTABLISHED)) + sk_poll_ll(sk, nonblock); + lock_sock(sk); err = -ENOTCONN; @@ -2875,229 +2882,9 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - struct tcphdr *th; - unsigned int thlen; - unsigned int seq; - __be32 delta; - unsigned int oldlen; - unsigned int mss; - struct sk_buff *gso_skb = skb; - __sum16 newcheck; - - if (!pskb_may_pull(skb, sizeof(*th))) - goto out; - - th = tcp_hdr(skb); - thlen = th->doff * 4; - if (thlen < sizeof(*th)) - goto out; - - if (!pskb_may_pull(skb, thlen)) - goto out; - - oldlen = (u16)~skb->len; - __skb_pull(skb, thlen); - - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; - - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { - /* Packet is from an untrusted source, reset gso_segs. */ - int type = skb_shinfo(skb)->gso_type; - - if (unlikely(type & - ~(SKB_GSO_TCPV4 | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_TCPV6 | - SKB_GSO_GRE | - SKB_GSO_UDP_TUNNEL | - 0) || - !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) - goto out; - - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); - - segs = NULL; - goto out; - } - - segs = skb_segment(skb, features); - if (IS_ERR(segs)) - goto out; - - delta = htonl(oldlen + (thlen + mss)); - - skb = segs; - th = tcp_hdr(skb); - seq = ntohl(th->seq); - - newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + - (__force u32)delta)); - - do { - th->fin = th->psh = 0; - th->check = newcheck; - - if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = - csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); - - seq += mss; - skb = skb->next; - th = tcp_hdr(skb); - - th->seq = htonl(seq); - th->cwr = 0; - } while (skb->next); - - /* Following permits TCP Small Queues to work well with GSO : - * The callback to TCP stack will be called at the time last frag - * is freed at TX completion, and not right now when gso_skb - * is freed by GSO engine - */ - if (gso_skb->destructor == tcp_wfree) { - swap(gso_skb->sk, skb->sk); - swap(gso_skb->destructor, skb->destructor); - swap(gso_skb->truesize, skb->truesize); - } - - delta = htonl(oldlen + (skb->tail - skb->transport_header) + - skb->data_len); - th->check = ~csum_fold((__force __wsum)((__force u32)th->check + - (__force u32)delta)); - if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb_transport_header(skb), - thlen, skb->csum)); - -out: - return segs; -} -EXPORT_SYMBOL(tcp_tso_segment); - -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) -{ - struct sk_buff **pp = NULL; - struct sk_buff *p; - struct tcphdr *th; - struct tcphdr *th2; - unsigned int len; - unsigned int thlen; - __be32 flags; - unsigned int mss = 1; - unsigned int hlen; - unsigned int off; - int flush = 1; - int i; - - off = skb_gro_offset(skb); - hlen = off + sizeof(*th); - th = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - th = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!th)) - goto out; - } - - thlen = th->doff * 4; - if (thlen < sizeof(*th)) - goto out; - - hlen = off + thlen; - if (skb_gro_header_hard(skb, hlen)) { - th = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!th)) - goto out; - } - - skb_gro_pull(skb, thlen); - - len = skb_gro_len(skb); - flags = tcp_flag_word(th); - - for (; (p = *head); head = &p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - th2 = tcp_hdr(p); - - if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - - goto found; - } - - goto out_check_final; - -found: - flush = NAPI_GRO_CB(p)->flush; - flush |= (__force int)(flags & TCP_FLAG_CWR); - flush |= (__force int)((flags ^ tcp_flag_word(th2)) & - ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); - flush |= (__force int)(th->ack_seq ^ th2->ack_seq); - for (i = sizeof(*th); i < thlen; i += 4) - flush |= *(u32 *)((u8 *)th + i) ^ - *(u32 *)((u8 *)th2 + i); - - mss = skb_shinfo(p)->gso_size; - - flush |= (len - 1) >= mss; - flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); - - if (flush || skb_gro_receive(head, skb)) { - mss = 1; - goto out_check_final; - } - - p = *head; - th2 = tcp_hdr(p); - tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); - -out_check_final: - flush = len < mss; - flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | - TCP_FLAG_RST | TCP_FLAG_SYN | - TCP_FLAG_FIN)); - - if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) - pp = head; - -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} -EXPORT_SYMBOL(tcp_gro_receive); - -int tcp_gro_complete(struct sk_buff *skb) -{ - struct tcphdr *th = tcp_hdr(skb); - - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - - skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - - if (th->cwr) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; - - return 0; -} -EXPORT_SYMBOL(tcp_gro_complete); - #ifdef CONFIG_TCP_MD5SIG -static unsigned long tcp_md5sig_users; -static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool; -static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); +static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly; +static DEFINE_MUTEX(tcp_md5sig_mutex); static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) { @@ -3112,30 +2899,14 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) free_percpu(pool); } -void tcp_free_md5sig_pool(void) -{ - struct tcp_md5sig_pool __percpu *pool = NULL; - - spin_lock_bh(&tcp_md5sig_pool_lock); - if (--tcp_md5sig_users == 0) { - pool = tcp_md5sig_pool; - tcp_md5sig_pool = NULL; - } - spin_unlock_bh(&tcp_md5sig_pool_lock); - if (pool) - __tcp_free_md5sig_pool(pool); -} -EXPORT_SYMBOL(tcp_free_md5sig_pool); - -static struct tcp_md5sig_pool __percpu * -__tcp_alloc_md5sig_pool(struct sock *sk) +static void __tcp_alloc_md5sig_pool(void) { int cpu; struct tcp_md5sig_pool __percpu *pool; pool = alloc_percpu(struct tcp_md5sig_pool); if (!pool) - return NULL; + return; for_each_possible_cpu(cpu) { struct crypto_hash *hash; @@ -3146,53 +2917,27 @@ __tcp_alloc_md5sig_pool(struct sock *sk) per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; } - return pool; + /* before setting tcp_md5sig_pool, we must commit all writes + * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool() + */ + smp_wmb(); + tcp_md5sig_pool = pool; + return; out_free: __tcp_free_md5sig_pool(pool); - return NULL; } -struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk) +bool tcp_alloc_md5sig_pool(void) { - struct tcp_md5sig_pool __percpu *pool; - bool alloc = false; - -retry: - spin_lock_bh(&tcp_md5sig_pool_lock); - pool = tcp_md5sig_pool; - if (tcp_md5sig_users++ == 0) { - alloc = true; - spin_unlock_bh(&tcp_md5sig_pool_lock); - } else if (!pool) { - tcp_md5sig_users--; - spin_unlock_bh(&tcp_md5sig_pool_lock); - cpu_relax(); - goto retry; - } else - spin_unlock_bh(&tcp_md5sig_pool_lock); - - if (alloc) { - /* we cannot hold spinlock here because this may sleep. */ - struct tcp_md5sig_pool __percpu *p; - - p = __tcp_alloc_md5sig_pool(sk); - spin_lock_bh(&tcp_md5sig_pool_lock); - if (!p) { - tcp_md5sig_users--; - spin_unlock_bh(&tcp_md5sig_pool_lock); - return NULL; - } - pool = tcp_md5sig_pool; - if (pool) { - /* oops, it has already been assigned. */ - spin_unlock_bh(&tcp_md5sig_pool_lock); - __tcp_free_md5sig_pool(p); - } else { - tcp_md5sig_pool = pool = p; - spin_unlock_bh(&tcp_md5sig_pool_lock); - } + if (unlikely(!tcp_md5sig_pool)) { + mutex_lock(&tcp_md5sig_mutex); + + if (!tcp_md5sig_pool) + __tcp_alloc_md5sig_pool(); + + mutex_unlock(&tcp_md5sig_mutex); } - return pool; + return tcp_md5sig_pool != NULL; } EXPORT_SYMBOL(tcp_alloc_md5sig_pool); @@ -3209,28 +2954,15 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) struct tcp_md5sig_pool __percpu *p; local_bh_disable(); - - spin_lock(&tcp_md5sig_pool_lock); - p = tcp_md5sig_pool; - if (p) - tcp_md5sig_users++; - spin_unlock(&tcp_md5sig_pool_lock); - + p = ACCESS_ONCE(tcp_md5sig_pool); if (p) - return this_cpu_ptr(p); + return __this_cpu_ptr(p); local_bh_enable(); return NULL; } EXPORT_SYMBOL(tcp_get_md5sig_pool); -void tcp_put_md5sig_pool(void) -{ - local_bh_enable(); - tcp_free_md5sig_pool(); -} -EXPORT_SYMBOL(tcp_put_md5sig_pool); - int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, const struct tcphdr *th) { @@ -3269,8 +3001,11 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, for (i = 0; i < shi->nr_frags; ++i) { const struct skb_frag_struct *f = &shi->frags[i]; - struct page *page = skb_frag_page(f); - sg_set_page(&sg, page, skb_frag_size(f), f->page_offset); + unsigned int offset = f->page_offset; + struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); + + sg_set_page(&sg, page, skb_frag_size(f), + offset_in_page(offset)); if (crypto_hash_update(desc, &sg, skb_frag_size(f))) return 1; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 08bbe609652..907311c9a01 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -360,9 +360,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) if (mss > 1460) icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); - rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < mss) - rcvmem += 128; + rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER); rcvmem *= icwnd; @@ -1257,8 +1255,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = prev; - if (skb == tp->scoreboard_skb_hint) - tp->scoreboard_skb_hint = prev; if (skb == tp->lost_skb_hint) { tp->lost_skb_hint = prev; tp->lost_cnt_hint -= tcp_skb_pcount(prev); @@ -1966,20 +1962,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) return true; } -static inline int tcp_skb_timedout(const struct sock *sk, - const struct sk_buff *skb) -{ - return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; -} - -static inline int tcp_head_timedout(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - return tp->packets_out && - tcp_skb_timedout(sk, tcp_write_queue_head(sk)); -} - /* Linux NewReno/SACK/FACK/ECN state machine. * -------------------------------------- * @@ -2086,12 +2068,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) if (tcp_dupack_heuristics(tp) > tp->reordering) return true; - /* Trick#3 : when we use RFC2988 timer restart, fast - * retransmit can be triggered by timeout of queue head. - */ - if (tcp_is_fack(tp) && tcp_head_timedout(sk)) - return true; - /* Trick#4: It is still not OK... But will it be useful to delay * recovery more? */ @@ -2128,44 +2104,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) return false; } -/* New heuristics: it is possible only after we switched to restart timer - * each time when something is ACKed. Hence, we can detect timed out packets - * during fast retransmit without falling to slow start. - * - * Usefulness of this as is very questionable, since we should know which of - * the segments is the next to timeout which is relatively expensive to find - * in general case unless we add some data structure just for that. The - * current approach certainly won't find the right one too often and when it - * finally does find _something_ it usually marks large part of the window - * right away (because a retransmission with a larger timestamp blocks the - * loop from advancing). -ij - */ -static void tcp_timeout_skbs(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if (!tcp_is_fack(tp) || !tcp_head_timedout(sk)) - return; - - skb = tp->scoreboard_skb_hint; - if (tp->scoreboard_skb_hint == NULL) - skb = tcp_write_queue_head(sk); - - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - if (!tcp_skb_timedout(sk, skb)) - break; - - tcp_skb_mark_lost(tp, skb); - } - - tp->scoreboard_skb_hint = skb; - - tcp_verify_left_out(tp); -} - /* Detect loss in event "A" above by marking head of queue up as lost. * For FACK or non-SACK(Reno) senders, the first "packets" number of segments * are considered lost. For RFC3517 SACK, a segment is considered lost if it @@ -2251,8 +2189,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) else if (fast_rexmit) tcp_mark_head_lost(sk, 1, 1); } - - tcp_timeout_skbs(sk); } /* CWND moderation, preventing bursts due to too big ACKs @@ -2307,10 +2243,22 @@ static void DBGUNDO(struct sock *sk, const char *msg) #define DBGUNDO(x...) do { } while (0) #endif -static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) +static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) { struct tcp_sock *tp = tcp_sk(sk); + if (unmark_loss) { + struct sk_buff *skb; + + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + } + tp->lost_out = 0; + tcp_clear_all_retrans_hints(tp); + } + if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2319,7 +2267,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); - if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) { + if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; TCP_ECN_withdraw_cwr(tp); } @@ -2327,6 +2275,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } tp->snd_cwnd_stamp = tcp_time_stamp; + tp->undo_marker = 0; } static inline bool tcp_may_undo(const struct tcp_sock *tp) @@ -2346,14 +2295,13 @@ static bool tcp_try_undo_recovery(struct sock *sk) * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwr(sk, true); + tcp_undo_cwnd_reduction(sk, false); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) mib_idx = LINUX_MIB_TCPLOSSUNDO; else mib_idx = LINUX_MIB_TCPFULLUNDO; NET_INC_STATS_BH(sock_net(sk), mib_idx); - tp->undo_marker = 0; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq @@ -2367,16 +2315,17 @@ static bool tcp_try_undo_recovery(struct sock *sk) } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ -static void tcp_try_undo_dsack(struct sock *sk) +static bool tcp_try_undo_dsack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); - tcp_undo_cwr(sk, true); - tp->undo_marker = 0; + tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); + return true; } + return false; } /* We can clear retrans_stamp when there are no retransmissions in the @@ -2408,60 +2357,20 @@ static bool tcp_any_retrans_done(const struct sock *sk) return false; } -/* Undo during fast recovery after partial ACK. */ - -static int tcp_try_undo_partial(struct sock *sk, int acked) -{ - struct tcp_sock *tp = tcp_sk(sk); - /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); - - if (tcp_may_undo(tp)) { - /* Plain luck! Hole if filled with delayed - * packet, rather than with a retransmit. - */ - if (!tcp_any_retrans_done(sk)) - tp->retrans_stamp = 0; - - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); - - DBGUNDO(sk, "Hoe"); - tcp_undo_cwr(sk, false); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); - - /* So... Do not make Hoe's retransmit yet. - * If the first packet was delayed, the rest - * ones are most probably delayed as well. - */ - failed = 0; - } - return failed; -} - /* Undo during loss recovery after partial ACK or using F-RTO. */ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { struct tcp_sock *tp = tcp_sk(sk); if (frto_undo || tcp_may_undo(tp)) { - struct sk_buff *skb; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - } - - tcp_clear_all_retrans_hints(tp); + tcp_undo_cwnd_reduction(sk, true); DBGUNDO(sk, "partial loss"); - tp->lost_out = 0; - tcp_undo_cwr(sk, true); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); if (frto_undo) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; - tp->undo_marker = 0; if (frto_undo || tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); return true; @@ -2494,12 +2403,14 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) TCP_ECN_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, +static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); + int newly_acked_sacked = prior_unsacked - + (tp->packets_out - tp->sacked_out); tp->prr_delivered += newly_acked_sacked; if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { @@ -2556,7 +2467,7 @@ static void tcp_try_keep_open(struct sock *sk) } } -static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) +static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) { struct tcp_sock *tp = tcp_sk(sk); @@ -2573,7 +2484,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) tcp_moderate_cwnd(tp); } else { - tcp_cwnd_reduction(sk, newly_acked_sacked, 0); + tcp_cwnd_reduction(sk, prior_unsacked, 0); } } @@ -2731,6 +2642,40 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) tcp_xmit_retransmit_queue(sk); } +/* Undo during fast recovery after partial ACK. */ +static bool tcp_try_undo_partial(struct sock *sk, const int acked, + const int prior_unsacked) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->undo_marker && tcp_packet_delayed(tp)) { + /* Plain luck! Hole if filled with delayed + * packet, rather than with a retransmit. + */ + tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + + /* We are getting evidence that the reordering degree is higher + * than we realized. If there are no retransmits out then we + * can undo. Otherwise we clock out new packets but do not + * mark more packets lost or retransmit more. + */ + if (tp->retrans_out) { + tcp_cwnd_reduction(sk, prior_unsacked, 0); + return true; + } + + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; + + DBGUNDO(sk, "partial recovery"); + tcp_undo_cwnd_reduction(sk, true); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); + tcp_try_keep_open(sk); + return true; + } + return false; +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2742,15 +2687,14 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ -static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, - int prior_sacked, bool is_dupack, - int flag) +static void tcp_fastretrans_alert(struct sock *sk, const int acked, + const int prior_unsacked, + bool is_dupack, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && + bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int newly_acked_sacked = 0; int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) @@ -2802,9 +2746,17 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); - } else - do_lost = tcp_try_undo_partial(sk, pkts_acked); - newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; + } else { + if (tcp_try_undo_partial(sk, acked, prior_unsacked)) + return; + /* Partial ACK arrived. Force fast retransmit. */ + do_lost = tcp_is_reno(tp) || + tcp_fackets_out(tp) > tp->reordering; + } + if (tcp_try_undo_dsack(sk)) { + tcp_try_keep_open(sk); + return; + } break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack); @@ -2818,13 +2770,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (is_dupack) tcp_add_reno_sack(sk); } - newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); if (!tcp_time_to_recover(sk, flag)) { - tcp_try_to_open(sk, flag, newly_acked_sacked); + tcp_try_to_open(sk, flag, prior_unsacked); return; } @@ -2844,9 +2795,9 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, fast_rexmit = 1; } - if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) + if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit); + tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit); tcp_xmit_retransmit_queue(sk); } @@ -3077,7 +3028,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - tp->scoreboard_skb_hint = NULL; if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = NULL; if (skb == tp->lost_skb_hint) @@ -3330,9 +3280,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) bool is_dupack = false; u32 prior_in_flight; u32 prior_fackets; - int prior_packets; - int prior_sacked = tp->sacked_out; - int pkts_acked = 0; + int prior_packets = tp->packets_out; + const int prior_unsacked = tp->packets_out - tp->sacked_out; + int acked = 0; /* Number of packets newly acked */ /* If the ack is older than previous acks * then we can probably ignore it. @@ -3403,21 +3353,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sk->sk_err_soft = 0; icsk->icsk_probes_out = 0; tp->rcv_tstamp = tcp_time_stamp; - prior_packets = tp->packets_out; if (!prior_packets) goto no_queue; /* See if we can take anything off of the retransmit queue. */ + acked = tp->packets_out; flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); - - pkts_acked = prior_packets - tp->packets_out; + acked -= tp->packets_out; if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, + tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } else { if (flag & FLAG_DATA_ACKED) @@ -3440,7 +3389,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, + tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3463,7 +3412,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, + tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } @@ -5598,6 +5547,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *req; int queued = 0; + bool acceptable; tp->rx_opt.saw_tstamp = 0; @@ -5668,157 +5618,147 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; /* step 5: check the ACK field */ - if (true) { - int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT) > 0; - - switch (sk->sk_state) { - case TCP_SYN_RECV: - if (acceptable) { - /* Once we leave TCP_SYN_RECV, we no longer - * need req so release it. - */ - if (req) { - tcp_synack_rtt_meas(sk, req); - tp->total_retrans = req->num_retrans; - - reqsk_fastopen_remove(sk, req, false); - } else { - /* Make sure socket is routed, for - * correct metrics. - */ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_init_congestion_control(sk); + acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | + FLAG_UPDATE_TS_RECENT) > 0; - tcp_mtup_init(sk); - tcp_init_buffer_space(sk); - tp->copied_seq = tp->rcv_nxt; - } - smp_mb(); - tcp_set_state(sk, TCP_ESTABLISHED); - sk->sk_state_change(sk); - - /* Note, that this wakeup is only for marginal - * crossed SYN case. Passively open sockets - * are not waked up, because sk->sk_sleep == - * NULL and sk->sk_socket == NULL. - */ - if (sk->sk_socket) - sk_wake_async(sk, - SOCK_WAKE_IO, POLL_OUT); - - tp->snd_una = TCP_SKB_CB(skb)->ack_seq; - tp->snd_wnd = ntohs(th->window) << - tp->rx_opt.snd_wscale; - tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - - if (tp->rx_opt.tstamp_ok) - tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - - if (req) { - /* Re-arm the timer because data may - * have been sent out. This is similar - * to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more - * aggressive and retranmitting any data - * sooner based on when they were sent - * out. - */ - tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); + switch (sk->sk_state) { + case TCP_SYN_RECV: + if (!acceptable) + return 1; - /* Prevent spurious tcp_cwnd_restart() on - * first data packet. - */ - tp->lsndtime = tcp_time_stamp; + /* Once we leave TCP_SYN_RECV, we no longer need req + * so release it. + */ + if (req) { + tcp_synack_rtt_meas(sk, req); + tp->total_retrans = req->num_retrans; - tcp_initialize_rcv_mss(sk); - tcp_fast_path_on(tp); - } else { - return 1; - } - break; + reqsk_fastopen_remove(sk, req, false); + } else { + /* Make sure socket is routed, for correct metrics. */ + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_congestion_control(sk); + + tcp_mtup_init(sk); + tcp_init_buffer_space(sk); + tp->copied_seq = tp->rcv_nxt; + } + smp_mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); + + /* Note, that this wakeup is only for marginal crossed SYN case. + * Passively open sockets are not waked up, because + * sk->sk_sleep == NULL and sk->sk_socket == NULL. + */ + if (sk->sk_socket) + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - case TCP_FIN_WAIT1: - /* If we enter the TCP_FIN_WAIT1 state and we are a - * Fast Open socket and this is the first acceptable - * ACK we have received, this would have acknowledged - * our SYNACK so stop the SYNACK timer. + if (req) { + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. */ - if (req != NULL) { - /* Return RST if ack_seq is invalid. - * Note that RFC793 only says to generate a - * DUPACK for it but for TCP Fast Open it seems - * better to treat this case like TCP_SYN_RECV - * above. - */ - if (!acceptable) - return 1; - /* We no longer need the request sock. */ - reqsk_fastopen_remove(sk, req, false); - tcp_rearm_rto(sk); - } - if (tp->snd_una == tp->write_seq) { - struct dst_entry *dst; - - tcp_set_state(sk, TCP_FIN_WAIT2); - sk->sk_shutdown |= SEND_SHUTDOWN; - - dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); - - if (!sock_flag(sk, SOCK_DEAD)) - /* Wake up lingering close() */ - sk->sk_state_change(sk); - else { - int tmo; - - if (tp->linger2 < 0 || - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { - tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - return 1; - } + tcp_rearm_rto(sk); + } else + tcp_init_metrics(sk); - tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); - } else if (th->fin || sock_owned_by_user(sk)) { - /* Bad case. We could lose such FIN otherwise. - * It is not a big problem, but it looks confusing - * and not so rare event. We still can lose it now, - * if it spins in bh_lock_sock(), but it is really - * marginal case. - */ - inet_csk_reset_keepalive_timer(sk, tmo); - } else { - tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); - goto discard; - } - } - } - break; + /* Prevent spurious tcp_cwnd_restart() on first data packet */ + tp->lsndtime = tcp_time_stamp; - case TCP_CLOSING: - if (tp->snd_una == tp->write_seq) { - tcp_time_wait(sk, TCP_TIME_WAIT, 0); - goto discard; - } + tcp_initialize_rcv_mss(sk); + tcp_fast_path_on(tp); + break; + + case TCP_FIN_WAIT1: { + struct dst_entry *dst; + int tmo; + + /* If we enter the TCP_FIN_WAIT1 state and we are a + * Fast Open socket and this is the first acceptable + * ACK we have received, this would have acknowledged + * our SYNACK so stop the SYNACK timer. + */ + if (req != NULL) { + /* Return RST if ack_seq is invalid. + * Note that RFC793 only says to generate a + * DUPACK for it but for TCP Fast Open it seems + * better to treat this case like TCP_SYN_RECV + * above. + */ + if (!acceptable) + return 1; + /* We no longer need the request sock. */ + reqsk_fastopen_remove(sk, req, false); + tcp_rearm_rto(sk); + } + if (tp->snd_una != tp->write_seq) break; - case TCP_LAST_ACK: - if (tp->snd_una == tp->write_seq) { - tcp_update_metrics(sk); - tcp_done(sk); - goto discard; - } + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; + + dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); + + if (!sock_flag(sk, SOCK_DEAD)) { + /* Wake up lingering close() */ + sk->sk_state_change(sk); break; } + + if (tp->linger2 < 0 || + (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + tcp_done(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + return 1; + } + + tmo = tcp_fin_time(sk); + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sock_owned_by_user(sk)) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + inet_csk_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + break; + } + + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + goto discard; + } + break; + + case TCP_LAST_ACK: + if (tp->snd_una == tp->write_seq) { + tcp_update_metrics(sk); + tcp_done(sk); + goto discard; + } + break; } /* step 6: check the URG bit */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 719652305a2..1063bb83e34 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -75,6 +75,7 @@ #include <net/netdma.h> #include <net/secure_seq.h> #include <net/tcp_memcontrol.h> +#include <net/ll_poll.h> #include <linux/inet.h> #include <linux/ipv6.h> @@ -545,8 +546,7 @@ out: sock_put(sk); } -static void __tcp_v4_send_check(struct sk_buff *skb, - __be32 saddr, __be32 daddr) +void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct tcphdr *th = tcp_hdr(skb); @@ -571,23 +571,6 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_v4_send_check); -int tcp_v4_gso_send_check(struct sk_buff *skb) -{ - const struct iphdr *iph; - struct tcphdr *th; - - if (!pskb_may_pull(skb, sizeof(*th))) - return -EINVAL; - - iph = ip_hdr(skb); - th = tcp_hdr(skb); - - th->check = 0; - skb->ip_summed = CHECKSUM_PARTIAL; - __tcp_v4_send_check(skb, iph->saddr, iph->daddr); - return 0; -} - /* * This routine will send an RST to the other tcp. * @@ -1026,7 +1009,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key = sock_kmalloc(sk, sizeof(*key), gfp); if (!key) return -ENOMEM; - if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) { + if (!tcp_alloc_md5sig_pool()) { sock_kfree_s(sk, key, sizeof(*key)); return -ENOMEM; } @@ -1044,9 +1027,7 @@ EXPORT_SYMBOL(tcp_md5_do_add); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family) { - struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; - struct tcp_md5sig_info *md5sig; key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET); if (!key) @@ -1054,10 +1035,6 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family) hlist_del_rcu(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); kfree_rcu(key, rcu); - md5sig = rcu_dereference_protected(tp->md5sig_info, - sock_owned_by_user(sk)); - if (hlist_empty(&md5sig->head)) - tcp_free_md5sig_pool(); return 0; } EXPORT_SYMBOL(tcp_md5_do_del); @@ -1071,8 +1048,6 @@ static void tcp_clear_md5_list(struct sock *sk) md5sig = rcu_dereference_protected(tp->md5sig_info, 1); - if (!hlist_empty(&md5sig->head)) - tcp_free_md5sig_pool(); hlist_for_each_entry_safe(key, n, &md5sig->head, node) { hlist_del_rcu(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); @@ -2019,6 +1994,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; + sk_mark_ll(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); @@ -2803,52 +2779,6 @@ void tcp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) -{ - const struct iphdr *iph = skb_gro_network_header(skb); - __wsum wsum; - __sum16 sum; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, - skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } -flush: - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - - case CHECKSUM_NONE: - wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb_gro_len(skb), IPPROTO_TCP, 0); - sum = csum_fold(skb_checksum(skb, - skb_gro_offset(skb), - skb_gro_len(skb), - wsum)); - if (sum) - goto flush; - - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - - return tcp_gro_receive(head, skb); -} - -int tcp4_gro_complete(struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - struct tcphdr *th = tcp_hdr(skb); - - th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), - iph->saddr, iph->daddr, 0); - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; - - return tcp_gro_complete(skb); -} - struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0f017882725..ab1c0865852 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -317,7 +317,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) key = tp->af_specific->md5_lookup(sk, sk); if (key != NULL) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); - if (tcptw->tw_md5_key && tcp_alloc_md5sig_pool(sk) == NULL) + if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()) BUG(); } } while (0); @@ -358,10 +358,8 @@ void tcp_twsk_destructor(struct sock *sk) #ifdef CONFIG_TCP_MD5SIG struct tcp_timewait_sock *twsk = tcp_twsk(sk); - if (twsk->tw_md5_key) { - tcp_free_md5sig_pool(); + if (twsk->tw_md5_key) kfree_rcu(twsk->tw_md5_key, rcu); - } #endif } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c new file mode 100644 index 00000000000..3a7525e6c08 --- /dev/null +++ b/net/ipv4/tcp_offload.c @@ -0,0 +1,332 @@ +/* + * IPV4 GSO/GRO offload support + * Linux INET implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * TCPv4 GSO/GRO support + */ + +#include <linux/skbuff.h> +#include <net/tcp.h> +#include <net/protocol.h> + +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct tcphdr *th; + unsigned int thlen; + unsigned int seq; + __be32 delta; + unsigned int oldlen; + unsigned int mss; + struct sk_buff *gso_skb = skb; + __sum16 newcheck; + bool ooo_okay, copy_destructor; + + if (!pskb_may_pull(skb, sizeof(*th))) + goto out; + + th = tcp_hdr(skb); + thlen = th->doff * 4; + if (thlen < sizeof(*th)) + goto out; + + if (!pskb_may_pull(skb, thlen)) + goto out; + + oldlen = (u16)~skb->len; + __skb_pull(skb, thlen); + + mss = tcp_skb_mss(skb); + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & + ~(SKB_GSO_TCPV4 | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_TCPV6 | + SKB_GSO_GRE | + SKB_GSO_MPLS | + SKB_GSO_UDP_TUNNEL | + 0) || + !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + copy_destructor = gso_skb->destructor == tcp_wfree; + ooo_okay = gso_skb->ooo_okay; + /* All segments but the first should have ooo_okay cleared */ + skb->ooo_okay = 0; + + segs = skb_segment(skb, features); + if (IS_ERR(segs)) + goto out; + + /* Only first segment might have ooo_okay set */ + segs->ooo_okay = ooo_okay; + + delta = htonl(oldlen + (thlen + mss)); + + skb = segs; + th = tcp_hdr(skb); + seq = ntohl(th->seq); + + newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + + (__force u32)delta)); + + do { + th->fin = th->psh = 0; + th->check = newcheck; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + th->check = + csum_fold(csum_partial(skb_transport_header(skb), + thlen, skb->csum)); + + seq += mss; + if (copy_destructor) { + skb->destructor = gso_skb->destructor; + skb->sk = gso_skb->sk; + /* {tcp|sock}_wfree() use exact truesize accounting : + * sum(skb->truesize) MUST be exactly be gso_skb->truesize + * So we account mss bytes of 'true size' for each segment. + * The last segment will contain the remaining. + */ + skb->truesize = mss; + gso_skb->truesize -= mss; + } + skb = skb->next; + th = tcp_hdr(skb); + + th->seq = htonl(seq); + th->cwr = 0; + } while (skb->next); + + /* Following permits TCP Small Queues to work well with GSO : + * The callback to TCP stack will be called at the time last frag + * is freed at TX completion, and not right now when gso_skb + * is freed by GSO engine + */ + if (copy_destructor) { + swap(gso_skb->sk, skb->sk); + swap(gso_skb->destructor, skb->destructor); + swap(gso_skb->truesize, skb->truesize); + } + + delta = htonl(oldlen + (skb_tail_pointer(skb) - + skb_transport_header(skb)) + + skb->data_len); + th->check = ~csum_fold((__force __wsum)((__force u32)th->check + + (__force u32)delta)); + if (skb->ip_summed != CHECKSUM_PARTIAL) + th->check = csum_fold(csum_partial(skb_transport_header(skb), + thlen, skb->csum)); +out: + return segs; +} +EXPORT_SYMBOL(tcp_tso_segment); + +struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct sk_buff *p; + struct tcphdr *th; + struct tcphdr *th2; + unsigned int len; + unsigned int thlen; + __be32 flags; + unsigned int mss = 1; + unsigned int hlen; + unsigned int off; + int flush = 1; + int i; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*th); + th = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + goto out; + } + + thlen = th->doff * 4; + if (thlen < sizeof(*th)) + goto out; + + hlen = off + thlen; + if (skb_gro_header_hard(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + goto out; + } + + skb_gro_pull(skb, thlen); + + len = skb_gro_len(skb); + flags = tcp_flag_word(th); + + for (; (p = *head); head = &p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + th2 = tcp_hdr(p); + + if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + goto found; + } + + goto out_check_final; + +found: + flush = NAPI_GRO_CB(p)->flush; + flush |= (__force int)(flags & TCP_FLAG_CWR); + flush |= (__force int)((flags ^ tcp_flag_word(th2)) & + ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); + flush |= (__force int)(th->ack_seq ^ th2->ack_seq); + for (i = sizeof(*th); i < thlen; i += 4) + flush |= *(u32 *)((u8 *)th + i) ^ + *(u32 *)((u8 *)th2 + i); + + mss = tcp_skb_mss(p); + + flush |= (len - 1) >= mss; + flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); + + if (flush || skb_gro_receive(head, skb)) { + mss = 1; + goto out_check_final; + } + + p = *head; + th2 = tcp_hdr(p); + tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); + +out_check_final: + flush = len < mss; + flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | + TCP_FLAG_RST | TCP_FLAG_SYN | + TCP_FLAG_FIN)); + + if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) + pp = head; + +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} +EXPORT_SYMBOL(tcp_gro_receive); + +int tcp_gro_complete(struct sk_buff *skb) +{ + struct tcphdr *th = tcp_hdr(skb); + + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + if (th->cwr) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + + return 0; +} +EXPORT_SYMBOL(tcp_gro_complete); + +static int tcp_v4_gso_send_check(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct tcphdr *th; + + if (!pskb_may_pull(skb, sizeof(*th))) + return -EINVAL; + + iph = ip_hdr(skb); + th = tcp_hdr(skb); + + th->check = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v4_send_check(skb, iph->saddr, iph->daddr); + return 0; +} + +static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + const struct iphdr *iph = skb_gro_network_header(skb); + __wsum wsum; + __sum16 sum; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, + skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + + case CHECKSUM_NONE: + wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb_gro_len(skb), IPPROTO_TCP, 0); + sum = csum_fold(skb_checksum(skb, + skb_gro_offset(skb), + skb_gro_len(skb), + wsum)); + if (sum) + goto flush; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + + return tcp_gro_receive(head, skb); +} + +static int tcp4_gro_complete(struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + + th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), + iph->saddr, iph->daddr, 0); + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + + return tcp_gro_complete(skb); +} + +static const struct net_offload tcpv4_offload = { + .callbacks = { + .gso_send_check = tcp_v4_gso_send_check, + .gso_segment = tcp_tso_segment, + .gro_receive = tcp4_gro_receive, + .gro_complete = tcp4_gro_complete, + }, +}; + +int __init tcpv4_offload_init(void) +{ + return inet_add_offload(&tcpv4_offload, IPPROTO_TCP); +} diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 536d40929ba..ec335fabd5c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -874,11 +874,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - if (tcp_packets_in_flight(tp) == 0) { + if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); - skb->ooo_okay = 1; - } else - skb->ooo_okay = 0; + + /* if no packet is in qdisc/device queue, then allow XPS to select + * another queue. + */ + skb->ooo_okay = sk_wmem_alloc_get(sk) == 0; skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0bf5d399a03..2955b25aee6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -109,6 +109,7 @@ #include <trace/events/udp.h> #include <linux/static_key.h> #include <trace/events/skb.h> +#include <net/ll_poll.h> #include "udp_impl.h" struct udp_table udp_table __read_mostly; @@ -1709,7 +1710,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { - int ret = udp_queue_rcv_skb(sk, skb); + int ret; + + sk_mark_ll(sk, skb); + ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but @@ -1967,6 +1971,8 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; + sock_rps_record_flow(sk); + /* Check for false positives due to checksum errors */ if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk)) @@ -2381,7 +2387,7 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | - SKB_GSO_GRE) || + SKB_GSO_GRE | SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 05a5df2febc..06347dbd32c 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -63,7 +63,7 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info) static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, - .priority = 2, + .priority = 3, }; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 9af088d2cda..470a9c008e9 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_IPV6) += ipv6.o ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ addrlabel.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ - raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ + raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o ipv6-offload := ip6_offload.o tcpv6_offload.o udp_offload.o exthdrs_offload.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d1ab6ab29a5..21010fddb20 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1126,8 +1126,7 @@ retry: ift = !max_addresses || ipv6_count_addresses(idev) < max_addresses ? - ipv6_add_addr(idev, &addr, tmp_plen, - ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, + ipv6_add_addr(idev, &addr, tmp_plen, ipv6_addr_scope(&addr), addr_flags) : NULL; if (IS_ERR_OR_NULL(ift)) { in6_ifa_put(ifp); @@ -1487,7 +1486,7 @@ static int ipv6_count_addresses(struct inet6_dev *idev) } int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev, int strict) + const struct net_device *dev, int strict) { struct inet6_ifaddr *ifp; unsigned int hash = inet6_addr_hash(addr); @@ -2402,6 +2401,7 @@ err_exit: * Manual configuration of address on an interface */ static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *pfx, + const struct in6_addr *peer_pfx, unsigned int plen, __u8 ifa_flags, __u32 prefered_lft, __u32 valid_lft) { @@ -2457,6 +2457,8 @@ static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *p ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; ifp->tstamp = jiffies; + if (peer_pfx) + ifp->peer_addr = *peer_pfx; spin_unlock_bh(&ifp->lock); addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, @@ -2526,7 +2528,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) return -EFAULT; rtnl_lock(); - err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, + err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL, ireq.ifr6_prefixlen, IFA_F_PERMANENT, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); rtnl_unlock(); @@ -2658,8 +2660,10 @@ static void init_loopback(struct net_device *dev) sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, 0); /* Failure cases are ignored */ - if (!IS_ERR(sp_rt)) + if (!IS_ERR(sp_rt)) { + sp_ifa->rt = sp_rt; ip6_ins_rt(sp_rt); + } } read_unlock_bh(&idev->lock); } @@ -2824,9 +2828,9 @@ static void addrconf_ip6_tnl_config(struct net_device *dev) } static int addrconf_notify(struct notifier_block *this, unsigned long event, - void *data) + void *ptr) { - struct net_device *dev = (struct net_device *) data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev = __in6_dev_get(dev); int run_pending = 0; int err; @@ -3610,18 +3614,20 @@ restart: rcu_read_unlock_bh(); } -static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local) +static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local, + struct in6_addr **peer_pfx) { struct in6_addr *pfx = NULL; + *peer_pfx = NULL; + if (addr) pfx = nla_data(addr); if (local) { if (pfx && nla_memcmp(local, pfx, sizeof(*pfx))) - pfx = NULL; - else - pfx = nla_data(local); + *peer_pfx = pfx; + pfx = nla_data(local); } return pfx; @@ -3639,7 +3645,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) struct net *net = sock_net(skb->sk); struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; - struct in6_addr *pfx; + struct in6_addr *pfx, *peer_pfx; int err; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); @@ -3647,7 +3653,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) return err; ifm = nlmsg_data(nlh); - pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx); if (pfx == NULL) return -EINVAL; @@ -3705,7 +3711,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) struct net *net = sock_net(skb->sk); struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; - struct in6_addr *pfx; + struct in6_addr *pfx, *peer_pfx; struct inet6_ifaddr *ifa; struct net_device *dev; u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME; @@ -3717,7 +3723,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) return err; ifm = nlmsg_data(nlh); - pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx); if (pfx == NULL) return -EINVAL; @@ -3745,7 +3751,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) * It would be best to check for !NLM_F_CREATE here but * userspace alreay relies on not having to provide this. */ - return inet6_addr_add(net, ifm->ifa_index, pfx, + return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx, ifm->ifa_prefixlen, ifa_flags, preferred_lft, valid_lft); } @@ -3802,6 +3808,7 @@ static inline int rt_scope(int ifa_scope) static inline int inet6_ifaddr_msgsize(void) { return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(16) /* IFA_LOCAL */ + nla_total_size(16) /* IFA_ADDRESS */ + nla_total_size(sizeof(struct ifa_cacheinfo)); } @@ -3840,13 +3847,22 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, valid = INFINITY_LIFE_TIME; } - if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0 || - put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } + if (!ipv6_addr_any(&ifa->peer_addr)) { + if (nla_put(skb, IFA_LOCAL, 16, &ifa->addr) < 0 || + nla_put(skb, IFA_ADDRESS, 16, &ifa->peer_addr) < 0) + goto error; + } else + if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0) + goto error; + + if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) + goto error; return nlmsg_end(skb, nlh); + +error: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; } static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, @@ -4046,7 +4062,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh) struct net *net = sock_net(in_skb->sk); struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; - struct in6_addr *addr = NULL; + struct in6_addr *addr = NULL, *peer; struct net_device *dev = NULL; struct inet6_ifaddr *ifa; struct sk_buff *skb; @@ -4056,7 +4072,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (err < 0) goto errout; - addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); if (addr == NULL) { err = -EINVAL; goto errout; @@ -4564,11 +4580,26 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ip6_ins_rt(ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); + if (!ipv6_addr_any(&ifp->peer_addr)) + addrconf_prefix_route(&ifp->peer_addr, 128, + ifp->idev->dev, 0, 0); break; case RTM_DELADDR: if (ifp->idev->cnf.forwarding) addrconf_leave_anycast(ifp); addrconf_leave_solict(ifp->idev, &ifp->addr); + if (!ipv6_addr_any(&ifp->peer_addr)) { + struct rt6_info *rt; + struct net_device *dev = ifp->idev->dev; + + rt = rt6_lookup(dev_net(dev), &ifp->peer_addr, NULL, + dev->ifindex, 1); + if (rt) { + dst_hold(&rt->dst); + if (ip6_del_rt(rt)) + dst_free(&rt->dst); + } + } dst_hold(&ifp->rt->dst); if (ip6_del_rt(ifp->rt)) @@ -4616,13 +4647,16 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, static void dev_disable_change(struct inet6_dev *idev) { + struct netdev_notifier_info info; + if (!idev || !idev->dev) return; + netdev_notifier_info_init(&info, idev->dev); if (idev->cnf.disable_ipv6) - addrconf_notify(NULL, NETDEV_DOWN, idev->dev); + addrconf_notify(NULL, NETDEV_DOWN, &info); else - addrconf_notify(NULL, NETDEV_UP, idev->dev); + addrconf_notify(NULL, NETDEV_UP, &info); } static void addrconf_disable_change(struct net *net, __s32 newf) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index ab5c7ad482c..a5ac969aeef 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -49,6 +49,7 @@ #include <net/udp.h> #include <net/udplite.h> #include <net/tcp.h> +#include <net/ping.h> #include <net/protocol.h> #include <net/inet_common.h> #include <net/route.h> @@ -840,6 +841,9 @@ static int __init inet6_init(void) if (err) goto out_unregister_udplite_proto; + err = proto_register(&pingv6_prot, 1); + if (err) + goto out_unregister_ping_proto; /* We MUST register RAW sockets before we create the ICMP6, * IGMP6, or NDISC control sockets. @@ -930,6 +934,10 @@ static int __init inet6_init(void) if (err) goto ipv6_packet_fail; + err = pingv6_init(); + if (err) + goto pingv6_fail; + #ifdef CONFIG_SYSCTL err = ipv6_sysctl_register(); if (err) @@ -942,6 +950,8 @@ out: sysctl_fail: ipv6_packet_cleanup(); #endif +pingv6_fail: + pingv6_exit(); ipv6_packet_fail: tcpv6_exit(); tcpv6_fail: @@ -985,6 +995,8 @@ register_pernet_fail: rtnl_unregister_all(PF_INET6); out_sock_register_fail: rawv6_exit(); +out_unregister_ping_proto: + proto_unregister(&pingv6_prot); out_unregister_raw_proto: proto_unregister(&rawv6_prot); out_unregister_udplite_proto: diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 4b56cbbc789..197e6f4a2b7 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -879,3 +879,30 @@ exit_f: return err; } EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl); + +void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, + __u16 srcp, __u16 destp, int bucket) +{ + struct ipv6_pinfo *np = inet6_sk(sp); + const struct in6_addr *dest, *src; + + dest = &np->daddr; + src = &np->rcv_saddr; + seq_printf(seq, + "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", + bucket, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + 0, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops)); +} diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index c5e83fae4df..140748debc4 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -115,7 +115,7 @@ EXPORT_SYMBOL(ipv6_skip_exthdr); int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); - int packet_len = skb->tail - skb->network_header; + int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); struct ipv6_opt_hdr *hdr; int len; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index b4ff0a42b8c..4b4890bbe16 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -57,6 +57,7 @@ #include <net/ipv6.h> #include <net/ip6_checksum.h> +#include <net/ping.h> #include <net/protocol.h> #include <net/raw.h> #include <net/rawv6.h> @@ -84,12 +85,18 @@ static inline struct sock *icmpv6_sk(struct net *net) static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { + /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ + struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); struct net *net = dev_net(skb->dev); if (type == ICMPV6_PKT_TOOBIG) ip6_update_pmtu(skb, net, info, 0, 0); else if (type == NDISC_REDIRECT) ip6_redirect(skb, net, 0, 0); + + if (!(type & ICMPV6_INFOMSG_MASK)) + if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) + ping_err(skb, offset, info); } static int icmpv6_rcv(struct sk_buff *skb); @@ -224,7 +231,8 @@ static bool opt_unrec(struct sk_buff *skb, __u32 offset) return (*op & 0xC0) == 0x80; } -static int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len) +int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len) { struct sk_buff *skb; struct icmp6hdr *icmp6h; @@ -307,8 +315,8 @@ static void mip6_addr_swap(struct sk_buff *skb) static inline void mip6_addr_swap(struct sk_buff *skb) {} #endif -static struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *skb, - struct sock *sk, struct flowi6 *fl6) +struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *skb, + struct sock *sk, struct flowi6 *fl6) { struct dst_entry *dst, *dst2; struct flowi6 fl2; @@ -391,7 +399,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) int err = 0; if ((u8 *)hdr < skb->head || - (skb->network_header + sizeof(*hdr)) > skb->tail) + (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb)) return; /* @@ -697,7 +705,8 @@ static int icmpv6_rcv(struct sk_buff *skb) skb->csum = ~csum_unfold(csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, 0)); if (__skb_checksum_complete(skb)) { - LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%pI6 > %pI6]\n", + LIMIT_NETDEBUG(KERN_DEBUG + "ICMPv6 checksum failed [%pI6c > %pI6c]\n", saddr, daddr); goto csum_error; } @@ -718,7 +727,7 @@ static int icmpv6_rcv(struct sk_buff *skb) break; case ICMPV6_ECHO_REPLY: - /* we couldn't care less */ + ping_rcv(skb); break; case ICMPV6_PKT_TOOBIG: diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index d3ddd840035..ecd60733e5e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1081,6 +1081,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, } if (t == NULL) t = netdev_priv(dev); + memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; @@ -1128,6 +1129,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, if (t) { err = 0; + memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 71b766ee821..a263b990ee1 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -98,6 +98,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_GRE | SKB_GSO_UDP_TUNNEL | + SKB_GSO_MPLS | SKB_GSO_TCPV6 | 0))) goto out; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d2eedf19233..dae1949019d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1147,7 +1147,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, if (WARN_ON(np->cork.opt)) return -EINVAL; - np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation); + np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation); if (unlikely(np->cork.opt == NULL)) return -ENOBUFS; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 241fb8ad9fc..583e8d435f9 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1319,7 +1319,7 @@ static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc, static int ip6mr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr6_table *mrt; struct mif_device *v; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index bfa6cc36ef2..72c8bfe06bb 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1409,8 +1409,9 @@ static void mld_sendpack(struct sk_buff *skb) idev = __in6_dev_get(skb->dev); IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - payload_len = (skb->tail - skb->network_header) - sizeof(*pip6); - mldlen = skb->tail - skb->transport_header; + payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) - + sizeof(*pip6); + mldlen = skb_tail_pointer(skb) - skb_transport_header(skb); pip6->payload_len = htons(payload_len); pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 0f9bdc5ee9f..9ac01dc9402 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -268,7 +268,8 @@ static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb->tail - skb->network_header; + unsigned int packet_len = skb_tail_pointer(skb) - + skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; @@ -404,7 +405,8 @@ static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb->tail - skb->network_header; + unsigned int packet_len = skb_tail_pointer(skb) - + skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 2712ab22a17..781dd3c9968 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -693,7 +693,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; - u32 ndoptlen = skb->tail - (skb->transport_header + + u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; @@ -853,7 +853,7 @@ static void ndisc_recv_na(struct sk_buff *skb) const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; - u32 ndoptlen = skb->tail - (skb->transport_header + + u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; @@ -1069,7 +1069,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) __u8 * opt = (__u8 *)(ra_msg + 1); - optlen = (skb->tail - skb->transport_header) - sizeof(struct ra_msg); + optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - + sizeof(struct ra_msg); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "RA: source address is not link-local\n"); @@ -1346,7 +1347,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) u8 *hdr; struct ndisc_options ndopts; struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); - u32 ndoptlen = skb->tail - (skb->transport_header + + u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct rd_msg, opt)); #ifdef CONFIG_IPV6_NDISC_NODETYPE @@ -1568,7 +1569,7 @@ int ndisc_rcv(struct sk_buff *skb) static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct inet6_dev *idev; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 72836f40b73..95f3f1da0d7 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -10,6 +10,7 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/export.h> +#include <net/addrconf.h> #include <net/dst.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -186,6 +187,10 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, return csum; }; +static const struct nf_ipv6_ops ipv6ops = { + .chk_addr = ipv6_chk_addr, +}; + static const struct nf_afinfo nf_ip6_afinfo = { .family = AF_INET6, .checksum = nf_ip6_checksum, @@ -198,6 +203,7 @@ static const struct nf_afinfo nf_ip6_afinfo = { int __init ipv6_netfilter_init(void) { + RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops); return nf_register_afinfo(&nf_ip6_afinfo); } @@ -206,5 +212,6 @@ int __init ipv6_netfilter_init(void) */ void ipv6_netfilter_fini(void) { + RCU_INIT_POINTER(nf_ipv6_ops, NULL); nf_unregister_afinfo(&nf_ip6_afinfo); } diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 60e9053bab0..47bff610751 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -71,7 +71,7 @@ static int device_cmp(struct nf_conn *ct, void *ifindex) static int masq_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - const struct net_device *dev = ptr; + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_DOWN) @@ -89,8 +89,10 @@ static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; + struct netdev_notifier_info info; - return masq_device_event(this, event, ifa->idev->dev); + netdev_notifier_info_init(&info, ifa->idev->dev); + return masq_device_event(this, event, &info); } static struct notifier_block masq_inet_notifier = { diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index c2e73e647e4..ab92a3673fb 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -40,7 +40,8 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) u16 offset = sizeof(struct ipv6hdr); struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); - unsigned int packet_len = skb->tail - skb->network_header; + unsigned int packet_len = skb_tail_pointer(skb) - + skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c new file mode 100644 index 00000000000..a4311038591 --- /dev/null +++ b/net/ipv6/ping.c @@ -0,0 +1,272 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * "Ping" sockets + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Based on ipv4/ping.c code. + * + * Authors: Lorenzo Colitti (IPv6 support) + * Vasiliy Kulikov / Openwall (IPv4 implementation, for Linux 2.6), + * Pavel Kankovsky (IPv4 implementation, for Linux 2.4.32) + * + */ + +#include <net/addrconf.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/protocol.h> +#include <net/udp.h> +#include <net/transp_v6.h> +#include <net/ping.h> + +struct proto pingv6_prot = { + .name = "PINGv6", + .owner = THIS_MODULE, + .init = ping_init_sock, + .close = ping_close, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .sendmsg = ping_v6_sendmsg, + .recvmsg = ping_recvmsg, + .bind = ping_bind, + .backlog_rcv = ping_queue_rcv_skb, + .hash = ping_hash, + .unhash = ping_unhash, + .get_port = ping_get_port, + .obj_size = sizeof(struct raw6_sock), +}; +EXPORT_SYMBOL_GPL(pingv6_prot); + +static struct inet_protosw pingv6_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_ICMPV6, + .prot = &pingv6_prot, + .ops = &inet6_dgram_ops, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, +}; + + +/* Compatibility glue so we can support IPv6 when it's compiled as a module */ +int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + return -EAFNOSUPPORT; +} +int dummy_ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + return -EAFNOSUPPORT; +} +int dummy_icmpv6_err_convert(u8 type, u8 code, int *err) +{ + return -EAFNOSUPPORT; +} +void dummy_ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) {} +int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, + const struct net_device *dev, int strict) +{ + return 0; +} + +int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct icmp6hdr user_icmph; + int addr_type; + struct in6_addr *daddr; + int iif = 0; + struct flowi6 fl6; + int err; + int hlimit; + struct dst_entry *dst; + struct rt6_info *rt; + struct pingfakehdr pfh; + + pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); + + err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph, + sizeof(user_icmph)); + if (err) + return err; + + if (msg->msg_name) { + struct sockaddr_in6 *u = (struct sockaddr_in6 *) msg->msg_name; + if (msg->msg_namelen < sizeof(struct sockaddr_in6) || + u->sin6_family != AF_INET6) { + return -EINVAL; + } + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != u->sin6_scope_id) { + return -EINVAL; + } + daddr = &(u->sin6_addr); + iif = u->sin6_scope_id; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = &np->daddr; + } + + if (!iif) + iif = sk->sk_bound_dev_if; + + addr_type = ipv6_addr_type(daddr); + if (__ipv6_addr_needs_scope_id(addr_type) && !iif) + return -EINVAL; + if (addr_type & IPV6_ADDR_MAPPED) + return -EINVAL; + + /* TODO: use ip6_datagram_send_ctl to get options from cmsg */ + + memset(&fl6, 0, sizeof(fl6)); + + fl6.flowi6_proto = IPPROTO_ICMPV6; + fl6.saddr = np->saddr; + fl6.daddr = *daddr; + fl6.fl6_icmp_type = user_icmph.icmp6_type; + fl6.fl6_icmp_code = user_icmph.icmp6_code; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + + dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, 1); + if (IS_ERR(dst)) + return PTR_ERR(dst); + rt = (struct rt6_info *) dst; + + np = inet6_sk(sk); + if (!np) + return -EBADF; + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + + pfh.icmph.type = user_icmph.icmp6_type; + pfh.icmph.code = user_icmph.icmp6_code; + pfh.icmph.checksum = 0; + pfh.icmph.un.echo.id = inet->inet_sport; + pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence; + pfh.iov = msg->msg_iov; + pfh.wcheck = 0; + pfh.family = AF_INET6; + + if (ipv6_addr_is_multicast(&fl6.daddr)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = ip6_dst_hoplimit(dst); + + err = ip6_append_data(sk, ping_getfrag, &pfh, len, + 0, hlimit, + np->tclass, NULL, &fl6, rt, + MSG_DONTWAIT, np->dontfrag); + + if (err) { + ICMP6_INC_STATS_BH(sock_net(sk), rt->rt6i_idev, + ICMP6_MIB_OUTERRORS); + ip6_flush_pending_frames(sk); + } else { + err = icmpv6_push_pending_frames(sk, &fl6, + (struct icmp6hdr *) &pfh.icmph, + len); + } + + return err; +} + +#ifdef CONFIG_PROC_FS +static void *ping_v6_seq_start(struct seq_file *seq, loff_t *pos) +{ + return ping_seq_start(seq, pos, AF_INET6); +} + +int ping_v6_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); + } else { + int bucket = ((struct ping_iter_state *) seq->private)->bucket; + struct inet_sock *inet = inet_sk(v); + __u16 srcp = ntohs(inet->inet_sport); + __u16 destp = ntohs(inet->inet_dport); + ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket); + } + return 0; +} + +static struct ping_seq_afinfo ping_v6_seq_afinfo = { + .name = "icmp6", + .family = AF_INET6, + .seq_fops = &ping_seq_fops, + .seq_ops = { + .start = ping_v6_seq_start, + .show = ping_v6_seq_show, + .next = ping_seq_next, + .stop = ping_seq_stop, + }, +}; + +static int __net_init ping_v6_proc_init_net(struct net *net) +{ + return ping_proc_register(net, &ping_v6_seq_afinfo); +} + +static void __net_init ping_v6_proc_exit_net(struct net *net) +{ + return ping_proc_unregister(net, &ping_v6_seq_afinfo); +} + +static struct pernet_operations ping_v6_net_ops = { + .init = ping_v6_proc_init_net, + .exit = ping_v6_proc_exit_net, +}; +#endif + +int __init pingv6_init(void) +{ +#ifdef CONFIG_PROC_FS + int ret = register_pernet_subsys(&ping_v6_net_ops); + if (ret) + return ret; +#endif + pingv6_ops.ipv6_recv_error = ipv6_recv_error; + pingv6_ops.ip6_datagram_recv_ctl = ip6_datagram_recv_ctl; + pingv6_ops.icmpv6_err_convert = icmpv6_err_convert; + pingv6_ops.ipv6_icmp_error = ipv6_icmp_error; + pingv6_ops.ipv6_chk_addr = ipv6_chk_addr; + return inet6_register_protosw(&pingv6_protosw); +} + +/* This never gets called because it's not possible to unload the ipv6 module, + * but just in case. + */ +void pingv6_exit(void) +{ + pingv6_ops.ipv6_recv_error = dummy_ipv6_recv_error; + pingv6_ops.ip6_datagram_recv_ctl = dummy_ip6_datagram_recv_ctl; + pingv6_ops.icmpv6_err_convert = dummy_icmpv6_err_convert; + pingv6_ops.ipv6_icmp_error = dummy_ipv6_icmp_error; + pingv6_ops.ipv6_chk_addr = dummy_ipv6_chk_addr; +#ifdef CONFIG_PROC_FS + unregister_pernet_subsys(&ping_v6_net_ops); +#endif + inet6_unregister_protosw(&pingv6_protosw); +} diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index f3c1ff4357f..51c3285b5d9 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -90,7 +90,7 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), - SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), + /* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */ SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index eedff8ccded..c45f7a5c36e 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1132,7 +1132,8 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) - amount = skb->tail - skb->transport_header; + amount = skb_tail_pointer(skb) - + skb_transport_header(skb); spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } @@ -1226,45 +1227,16 @@ struct proto rawv6_prot = { }; #ifdef CONFIG_PROC_FS -static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) -{ - struct ipv6_pinfo *np = inet6_sk(sp); - const struct in6_addr *dest, *src; - __u16 destp, srcp; - - dest = &np->daddr; - src = &np->rcv_saddr; - destp = 0; - srcp = inet_sk(sp)->inet_num; - seq_printf(seq, - "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", - i, - src->s6_addr32[0], src->s6_addr32[1], - src->s6_addr32[2], src->s6_addr32[3], srcp, - dest->s6_addr32[0], dest->s6_addr32[1], - dest->s6_addr32[2], dest->s6_addr32[3], destp, - sp->sk_state, - sk_wmem_alloc_get(sp), - sk_rmem_alloc_get(sp), - 0, 0L, 0, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), - 0, - sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); -} - static int raw6_seq_show(struct seq_file *seq, void *v) { - if (v == SEQ_START_TOKEN) - seq_printf(seq, - " sl " - "local_address " - "remote_address " - "st tx_queue rx_queue tr tm->when retrnsmt" - " uid timeout inode ref pointer drops\n"); - else - raw6_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); + if (v == SEQ_START_TOKEN) { + seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); + } else { + struct sock *sp = v; + __u16 srcp = inet_sk(sp)->inet_num; + ip6_dgram_sock_seq_show(seq, v, srcp, 0, + raw_seq_private(seq)->bucket); + } return 0; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad0aa6b0b86..2b874185ebb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1649,7 +1649,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu int optlen, on_link; u8 *lladdr; - optlen = skb->tail - skb->transport_header; + optlen = skb_tail_pointer(skb) - skb_transport_header(skb); optlen -= sizeof(*msg); if (optlen < 0) { @@ -2681,9 +2681,9 @@ errout: } static int ip6_route_dev_notify(struct notifier_block *this, - unsigned long event, void *data) + unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) { diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 335363478bb..6b9c1f128ea 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -577,6 +577,10 @@ static int ipip6_rcv(struct sk_buff *skb) if (tunnel != NULL) { struct pcpu_tstats *tstats; + if (tunnel->parms.iph.protocol != IPPROTO_IPV6 && + tunnel->parms.iph.protocol != 0) + goto out; + secpath_reset(skb); skb->mac_header = skb->network_header; skb_reset_network_header(skb); @@ -629,6 +633,35 @@ out: return 0; } +static const struct tnl_ptk_info tpi = { + /* no tunnel info required for ipip. */ + .proto = htons(ETH_P_IP), +}; + +static int ipip_rcv(struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + struct ip_tunnel *tunnel; + + tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, + iph->saddr, iph->daddr); + if (tunnel != NULL) { + if (tunnel->parms.iph.protocol != IPPROTO_IPIP && + tunnel->parms.iph.protocol != 0) + goto drop; + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); + } + + return 1; + +drop: + kfree_skb(skb); + return 0; +} + /* * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function * stores the embedded IPv4 address in v4dst and returns true. @@ -877,6 +910,43 @@ tx_error: return NETDEV_TX_OK; } +static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + const struct iphdr *tiph = &tunnel->parms.iph; + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP); + return NETDEV_TX_OK; +} + +static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + ipip_tunnel_xmit(skb, dev); + break; + case htons(ETH_P_IPV6): + ipip6_tunnel_xmit(skb, dev); + break; + default: + goto tx_err; + } + + return NETDEV_TX_OK; + +tx_err: + dev->stats.tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + +} + static void ipip6_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; @@ -1027,7 +1097,11 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) goto done; err = -EINVAL; - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPV6 || + if (p.iph.protocol != IPPROTO_IPV6 && + p.iph.protocol != IPPROTO_IPIP && + p.iph.protocol != 0) + goto done; + if (p.iph.version != 4 || p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) goto done; if (p.iph.ttl) @@ -1164,7 +1238,7 @@ static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) static const struct net_device_ops ipip6_netdev_ops = { .ndo_uninit = ipip6_tunnel_uninit, - .ndo_start_xmit = ipip6_tunnel_xmit, + .ndo_start_xmit = sit_tunnel_xmit, .ndo_do_ioctl = ipip6_tunnel_ioctl, .ndo_change_mtu = ipip6_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, @@ -1232,6 +1306,22 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) return 0; } +static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + u8 proto; + + if (!data) + return 0; + + proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); + if (proto != IPPROTO_IPV6 && + proto != IPPROTO_IPIP && + proto != 0) + return -EINVAL; + + return 0; +} + static void ipip6_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm *parms) { @@ -1268,6 +1358,10 @@ static void ipip6_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_FLAGS]) parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); + + if (data[IFLA_IPTUN_PROTO]) + parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); + } #ifdef CONFIG_IPV6_SIT_6RD @@ -1391,6 +1485,8 @@ static size_t ipip6_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_IPTUN_FLAGS */ nla_total_size(2) + + /* IFLA_IPTUN_PROTO */ + nla_total_size(1) + #ifdef CONFIG_IPV6_SIT_6RD /* IFLA_IPTUN_6RD_PREFIX */ nla_total_size(sizeof(struct in6_addr)) + @@ -1416,6 +1512,7 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || + nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags)) goto nla_put_failure; @@ -1445,6 +1542,7 @@ static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U16 }, + [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, #ifdef CONFIG_IPV6_SIT_6RD [IFLA_IPTUN_6RD_PREFIX] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_6RD_RELAY_PREFIX] = { .type = NLA_U32 }, @@ -1459,6 +1557,7 @@ static struct rtnl_link_ops sit_link_ops __read_mostly = { .policy = ipip6_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipip6_tunnel_setup, + .validate = ipip6_validate, .newlink = ipip6_newlink, .changelink = ipip6_changelink, .get_size = ipip6_get_size, @@ -1471,6 +1570,12 @@ static struct xfrm_tunnel sit_handler __read_mostly = { .priority = 1, }; +static struct xfrm_tunnel ipip_handler __read_mostly = { + .handler = ipip_rcv, + .err_handler = ipip6_err, + .priority = 2, +}; + static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_head *head) { int prio; @@ -1553,6 +1658,7 @@ static void __exit sit_cleanup(void) { rtnl_link_unregister(&sit_link_ops); xfrm4_tunnel_deregister(&sit_handler, AF_INET6); + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); unregister_pernet_device(&sit_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ @@ -1569,9 +1675,14 @@ static int __init sit_init(void) return err; err = xfrm4_tunnel_register(&sit_handler, AF_INET6); if (err < 0) { - pr_info("%s: can't add protocol\n", __func__); + pr_info("%s: can't register ip6ip4\n", __func__); goto xfrm_tunnel_failed; } + err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + if (err < 0) { + pr_info("%s: can't register ip4ip4\n", __func__); + goto xfrm_tunnel4_failed; + } err = rtnl_link_register(&sit_link_ops); if (err < 0) goto rtnl_link_failed; @@ -1580,6 +1691,8 @@ out: return err; rtnl_link_failed: + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +xfrm_tunnel4_failed: xfrm4_tunnel_deregister(&sit_handler, AF_INET6); xfrm_tunnel_failed: unregister_pernet_device(&sit_net_ops); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 71167069b39..5cffa5c3e6b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -63,6 +63,7 @@ #include <net/inet_common.h> #include <net/secure_seq.h> #include <net/tcp_memcontrol.h> +#include <net/ll_poll.h> #include <asm/uaccess.h> @@ -1498,6 +1499,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; + sk_mark_ll(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); @@ -1890,6 +1892,17 @@ void tcp6_proc_exit(struct net *net) } #endif +static void tcp_v6_clear_sk(struct sock *sk, int size) +{ + struct inet_sock *inet = inet_sk(sk); + + /* we do not want to clear pinet6 field, because of RCU lookups */ + sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6)); + + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); + memset(&inet->pinet6 + 1, 0, size); +} + struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, @@ -1933,6 +1946,7 @@ struct proto tcpv6_prot = { #ifdef CONFIG_MEMCG_KMEM .proto_cgroup = tcp_proto_cgroup, #endif + .clear_sk = tcp_v6_clear_sk, }; static const struct inet6_protocol tcpv6_protocol = { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d4defdd4493..f77e34c5a0e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/inet6_hashtables.h> +#include <net/ll_poll.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -841,7 +842,10 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { - int ret = udpv6_queue_rcv_skb(sk, skb); + int ret; + + sk_mark_ll(sk, skb); + ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but @@ -1359,48 +1363,17 @@ static const struct inet6_protocol udpv6_protocol = { /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS - -static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket) -{ - struct inet_sock *inet = inet_sk(sp); - struct ipv6_pinfo *np = inet6_sk(sp); - const struct in6_addr *dest, *src; - __u16 destp, srcp; - - dest = &np->daddr; - src = &np->rcv_saddr; - destp = ntohs(inet->inet_dport); - srcp = ntohs(inet->inet_sport); - seq_printf(seq, - "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", - bucket, - src->s6_addr32[0], src->s6_addr32[1], - src->s6_addr32[2], src->s6_addr32[3], srcp, - dest->s6_addr32[0], dest->s6_addr32[1], - dest->s6_addr32[2], dest->s6_addr32[3], destp, - sp->sk_state, - sk_wmem_alloc_get(sp), - sk_rmem_alloc_get(sp), - 0, 0L, 0, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), - 0, - sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); -} - int udp6_seq_show(struct seq_file *seq, void *v) { - if (v == SEQ_START_TOKEN) - seq_printf(seq, - " sl " - "local_address " - "remote_address " - "st tx_queue rx_queue tr tm->when retrnsmt" - " uid timeout inode ref pointer drops\n"); - else - udp6_sock_seq_show(seq, v, ((struct udp_iter_state *)seq->private)->bucket); + if (v == SEQ_START_TOKEN) { + seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); + } else { + int bucket = ((struct udp_iter_state *)seq->private)->bucket; + struct inet_sock *inet = inet_sk(v); + __u16 srcp = ntohs(inet->inet_sport); + __u16 destp = ntohs(inet->inet_dport); + ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket); + } return 0; } @@ -1432,6 +1405,17 @@ void udp6_proc_exit(struct net *net) { } #endif /* CONFIG_PROC_FS */ +void udp_v6_clear_sk(struct sock *sk, int size) +{ + struct inet_sock *inet = inet_sk(sk); + + /* we do not want to clear pinet6 field, because of RCU lookups */ + sk_prot_clear_portaddr_nulls(sk, offsetof(struct inet_sock, pinet6)); + + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); + memset(&inet->pinet6 + 1, 0, size); +} + /* ------------------------------------------------------------------------ */ struct proto udpv6_prot = { @@ -1462,7 +1446,7 @@ struct proto udpv6_prot = { .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif - .clear_sk = sk_prot_clear_portaddr_nulls, + .clear_sk = udp_v6_clear_sk, }; static struct inet_protosw udpv6_protosw = { diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index d7571046bfc..4691ed50a92 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -31,6 +31,8 @@ extern int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, extern int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); extern void udpv6_destroy_sock(struct sock *sk); +extern void udp_v6_clear_sk(struct sock *sk, int size); + #ifdef CONFIG_PROC_FS extern int udp6_seq_show(struct seq_file *seq, void *v); #endif diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 3bb3a891a42..5d1b8d7ac99 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -46,11 +46,12 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, unsigned int mss; unsigned int unfrag_ip6hlen, unfrag_len; struct frag_hdr *fptr; - u8 *mac_start, *prevhdr; + u8 *packet_start, *prevhdr; u8 nexthdr; u8 frag_hdr_sz = sizeof(struct frag_hdr); int offset; __wsum csum; + int tnl_hlen; mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) @@ -63,7 +64,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | - SKB_GSO_GRE) || + SKB_GSO_GRE | + SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; @@ -83,9 +85,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; /* Check if there is enough headroom to insert fragment header. */ - if ((skb_mac_header(skb) < skb->head + frag_hdr_sz) && - pskb_expand_head(skb, frag_hdr_sz, 0, GFP_ATOMIC)) - goto out; + tnl_hlen = skb_tnl_header_len(skb); + if (skb_headroom(skb) < (tnl_hlen + frag_hdr_sz)) { + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) + goto out; + } /* Find the unfragmentable header and shift it left by frag_hdr_sz * bytes to insert fragment header. @@ -93,11 +97,12 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); nexthdr = *prevhdr; *prevhdr = NEXTHDR_FRAGMENT; - unfrag_len = skb_network_header(skb) - skb_mac_header(skb) + - unfrag_ip6hlen; - mac_start = skb_mac_header(skb); - memmove(mac_start-frag_hdr_sz, mac_start, unfrag_len); + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + + unfrag_ip6hlen + tnl_hlen; + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; skb->mac_header -= frag_hdr_sz; skb->network_header -= frag_hdr_sz; diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 1d08e21d9f6..dfcc4be4689 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -56,7 +56,7 @@ struct proto udplitev6_prot = { .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif - .clear_sk = sk_prot_clear_portaddr_nulls, + .clear_sk = udp_v6_clear_sk, }; static struct inet_protosw udplite6_protosw = { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 4ef7bdb6544..23ed03d786c 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -103,8 +103,10 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, dev_hold(dev); xdst->u.rt6.rt6i_idev = in6_dev_get(dev); - if (!xdst->u.rt6.rt6i_idev) + if (!xdst->u.rt6.rt6i_idev) { + dev_put(dev); return -ENODEV; + } rt6_transfer_peer(&xdst->u.rt6, rt); diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index f547a47d381..7a1e0fc1bd4 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -330,7 +330,7 @@ static __inline__ void __ipxitf_put(struct ipx_interface *intrfc) static int ipxitf_device_event(struct notifier_block *notifier, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ipx_interface *i, *tmp; if (!net_eq(dev_net(dev), &init_net)) diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c index 8c004161a84..9ea0c933b9f 100644 --- a/net/irda/irlap_frame.c +++ b/net/irda/irlap_frame.c @@ -544,7 +544,7 @@ static void irlap_recv_discovery_xid_cmd(struct irlap_cb *self, /* * We now have some discovery info to deliver! */ - discovery = kmalloc(sizeof(discovery_t), GFP_ATOMIC); + discovery = kzalloc(sizeof(discovery_t), GFP_ATOMIC); if (!discovery) { IRDA_WARNING("%s: unable to malloc!\n", __func__); return; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index ae691651b72..168aff5e60d 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -2293,7 +2293,7 @@ out_unlock: static int afiucv_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *event_dev = (struct net_device *)ptr; + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); struct sock *sk; struct iucv_sock *iucv; diff --git a/net/key/af_key.c b/net/key/af_key.c index 5b1e5af2571..c5fbd758968 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2366,6 +2366,8 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa out: xfrm_pol_put(xp); + if (err == 0) + xfrm_garbage_collect(net); return err; } @@ -2615,6 +2617,8 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_ out: xfrm_pol_put(xp); + if (delete && err == 0) + xfrm_garbage_collect(net); return err; } diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7cabaf261fe..cc117591f67 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -159,10 +159,11 @@ static int ieee80211_change_mtu(struct net_device *dev, int new_mtu) return 0; } -static int ieee80211_verify_mac(struct ieee80211_local *local, u8 *addr, +static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr, bool check_dup) { - struct ieee80211_sub_if_data *sdata; + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *iter; u64 new, mask, tmp; u8 *m; int ret = 0; @@ -184,12 +185,15 @@ static int ieee80211_verify_mac(struct ieee80211_local *local, u8 *addr, return ret; mutex_lock(&local->iflist_mtx); - list_for_each_entry(sdata, &local->interfaces, list) { - if (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)) + list_for_each_entry(iter, &local->interfaces, list) { + if (iter == sdata) continue; - m = sdata->vif.addr; + if (iter->vif.type == NL80211_IFTYPE_MONITOR && + !(iter->u.mntr_flags & MONITOR_FLAG_ACTIVE)) + continue; + + m = iter->vif.addr; tmp = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) | ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) | ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8); @@ -218,7 +222,7 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr) !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)) check_dup = false; - ret = ieee80211_verify_mac(sdata->local, sa->sa_data, check_dup); + ret = ieee80211_verify_mac(sdata, sa->sa_data, check_dup); if (ret) return ret; @@ -1503,7 +1507,17 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, break; } + /* + * Pick address of existing interface in case user changed + * MAC address manually, default to perm_addr. + */ m = local->hw.wiphy->perm_addr; + list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) + continue; + m = sdata->vif.addr; + break; + } start = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) | ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) | ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8); @@ -1750,10 +1764,9 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) } static int netdev_notify(struct notifier_block *nb, - unsigned long state, - void *ndev) + unsigned long state, void *ptr) { - struct net_device *dev = ndev; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ieee80211_sub_if_data *sdata; if (state != NETDEV_CHANGENAME) diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig new file mode 100644 index 00000000000..37421db8896 --- /dev/null +++ b/net/mpls/Kconfig @@ -0,0 +1,9 @@ +# +# MPLS configuration +# +config NET_MPLS_GSO + tristate "MPLS: GSO support" + help + This is helper module to allow segmentation of non-MPLS GSO packets + that have had MPLS stack entries pushed onto them and thus + become MPLS GSO packets. diff --git a/net/mpls/Makefile b/net/mpls/Makefile new file mode 100644 index 00000000000..0a3c171be53 --- /dev/null +++ b/net/mpls/Makefile @@ -0,0 +1,4 @@ +# +# Makefile for MPLS. +# +obj-y += mpls_gso.o diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c new file mode 100644 index 00000000000..1bec1219ab8 --- /dev/null +++ b/net/mpls/mpls_gso.c @@ -0,0 +1,108 @@ +/* + * MPLS GSO Support + * + * Authors: Simon Horman (horms@verge.net.au) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Based on: GSO portions of net/ipv4/gre.c + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/err.h> +#include <linux/module.h> +#include <linux/netdev_features.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> + +static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + netdev_features_t mpls_features; + __be16 mpls_protocol; + + if (unlikely(skb_shinfo(skb)->gso_type & + ~(SKB_GSO_TCPV4 | + SKB_GSO_TCPV6 | + SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_GRE | + SKB_GSO_MPLS))) + goto out; + + /* Setup inner SKB. */ + mpls_protocol = skb->protocol; + skb->protocol = skb->inner_protocol; + + /* Push back the mac header that skb_mac_gso_segment() has pulled. + * It will be re-pulled by the call to skb_mac_gso_segment() below + */ + __skb_push(skb, skb->mac_len); + + /* Segment inner packet. */ + mpls_features = skb->dev->mpls_features & netif_skb_features(skb); + segs = skb_mac_gso_segment(skb, mpls_features); + + + /* Restore outer protocol. */ + skb->protocol = mpls_protocol; + + /* Re-pull the mac header that the call to skb_mac_gso_segment() + * above pulled. It will be re-pushed after returning + * skb_mac_gso_segment(), an indirect caller of this function. + */ + __skb_push(skb, skb->data - skb_mac_header(skb)); + +out: + return segs; +} + +static int mpls_gso_send_check(struct sk_buff *skb) +{ + return 0; +} + +static struct packet_offload mpls_mc_offload = { + .type = cpu_to_be16(ETH_P_MPLS_MC), + .callbacks = { + .gso_send_check = mpls_gso_send_check, + .gso_segment = mpls_gso_segment, + }, +}; + +static struct packet_offload mpls_uc_offload = { + .type = cpu_to_be16(ETH_P_MPLS_UC), + .callbacks = { + .gso_send_check = mpls_gso_send_check, + .gso_segment = mpls_gso_segment, + }, +}; + +static int __init mpls_gso_init(void) +{ + pr_info("MPLS GSO support\n"); + + dev_add_offload(&mpls_uc_offload); + dev_add_offload(&mpls_mc_offload); + + return 0; +} + +static void __exit mpls_gso_exit(void) +{ + dev_remove_offload(&mpls_uc_offload); + dev_remove_offload(&mpls_mc_offload); +} + +module_init(mpls_gso_init); +module_exit(mpls_gso_exit); + +MODULE_DESCRIPTION("MPLS GSO support"); +MODULE_AUTHOR("Simon Horman (horms@verge.net.au)"); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 07c865a31a3..2217363ab42 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -30,6 +30,8 @@ static DEFINE_MUTEX(afinfo_mutex); const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; EXPORT_SYMBOL(nf_afinfo); +const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; +EXPORT_SYMBOL_GPL(nf_ipv6_ops); int nf_register_afinfo(const struct nf_afinfo *afinfo) { @@ -302,17 +304,26 @@ static struct pernet_operations netfilter_net_ops = { .exit = netfilter_net_exit, }; -void __init netfilter_init(void) +int __init netfilter_init(void) { - int i, h; + int i, h, ret; + for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) { for (h = 0; h < NF_MAX_HOOKS; h++) INIT_LIST_HEAD(&nf_hooks[i][h]); } - if (register_pernet_subsys(&netfilter_net_ops) < 0) - panic("cannot create netfilter proc entry"); + ret = register_pernet_subsys(&netfilter_net_ops); + if (ret < 0) + goto err; + + ret = netfilter_log_init(); + if (ret < 0) + goto err_pernet; - if (netfilter_log_init() < 0) - panic("cannot initialize nf_log"); + return 0; +err_pernet: + unregister_pernet_subsys(&netfilter_net_ops); +err: + return ret; } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index a083bda322b..c8c52a98590 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -975,8 +975,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) return cp; } } - rcu_read_unlock(); - rcu_read_lock(); + cond_resched_rcu(); } return NULL; @@ -1015,8 +1014,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) iter->l = &ip_vs_conn_tab[idx]; return cp; } - rcu_read_unlock(); - rcu_read_lock(); + cond_resched_rcu(); } iter->l = NULL; return NULL; @@ -1206,17 +1204,13 @@ void ip_vs_random_dropentry(struct net *net) int idx; struct ip_vs_conn *cp, *cp_c; + rcu_read_lock(); /* * Randomly scan 1/32 of the whole table every second */ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { unsigned int hash = net_random() & ip_vs_conn_tab_mask; - /* - * Lock is actually needed in this loop. - */ - rcu_read_lock(); - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ @@ -1252,8 +1246,9 @@ void ip_vs_random_dropentry(struct net *net) __ip_vs_conn_put(cp); } } - rcu_read_unlock(); + cond_resched_rcu(); } + rcu_read_unlock(); } @@ -1267,11 +1262,8 @@ static void ip_vs_conn_flush(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); flush_again: + rcu_read_lock(); for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - /* - * Lock is actually needed in this loop. - */ - rcu_read_lock(); hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { if (!ip_vs_conn_net_eq(cp, net)) @@ -1286,8 +1278,9 @@ flush_again: __ip_vs_conn_put(cp); } } - rcu_read_unlock(); + cond_resched_rcu(); } + rcu_read_unlock(); /* the counter may be not NULL, because maybe some conn entries are run by slow timer handler or unhashed but still referred */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 085b5880ab0..05565d2b3a6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1001,6 +1001,32 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) return th->rst; } +static inline bool is_new_conn(const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + switch (iph->protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (th == NULL) + return false; + return th->syn; + } + case IPPROTO_SCTP: { + sctp_chunkhdr_t *sch, schunk; + + sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), + sizeof(schunk), &schunk); + if (sch == NULL) + return false; + return sch->type == SCTP_CID_INIT; + } + default: + return false; + } +} + /* Handle response packets: rewrite addresses and send away... */ static unsigned int @@ -1612,6 +1638,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) * Check if the packet belongs to an existing connection entry */ cp = pp->conn_in_get(af, skb, &iph, 0); + + if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && + unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && + is_new_conn(skb, &iph)) { + ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + cp = NULL; + } + if (unlikely(!cp) && !iph.fragoffs) { /* No (second) fragments need to enter here, as nf_defrag_ipv6 * replayed fragment zero will already have created the cp diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5b142fb1648..df05c1c276f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1487,9 +1487,9 @@ ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) * Currently only NETDEV_DOWN is handled to release refs to cached dsts */ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, - void *ptr) + void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_service *svc; @@ -1716,9 +1716,9 @@ static struct ctl_table vs_vars[] = { }, { .procname = "sync_qlen_max", - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "sync_sock_size", diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 0df269d7c99..a65edfe4b16 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -67,8 +67,8 @@ struct ip_vs_sh_bucket { #define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) struct ip_vs_sh_state { - struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; struct rcu_head rcu_head; + struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; }; /* diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 6b217074237..b8a0924064e 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -55,10 +55,14 @@ unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb, struct nf_conntrack_expect *exp); EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); -static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); -static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); +static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, + char, unsigned int *); +static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *, + char, unsigned int *); +static int try_eprt(const char *, size_t, struct nf_conntrack_man *, + char, unsigned int *); static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, - char); + char, unsigned int *); static struct ftp_search { const char *pattern; @@ -66,7 +70,7 @@ static struct ftp_search { char skip; char term; enum nf_ct_ftp_type ftptype; - int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); + int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *); } search[IP_CT_DIR_MAX][2] = { [IP_CT_DIR_ORIGINAL] = { { @@ -90,10 +94,8 @@ static struct ftp_search { { .pattern = "227 ", .plen = sizeof("227 ") - 1, - .skip = '(', - .term = ')', .ftptype = NF_CT_FTP_PASV, - .getnum = try_rfc959, + .getnum = try_rfc1123, }, { .pattern = "229 ", @@ -132,8 +134,9 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[], i++; else { /* Unexpected character; true if it's the - terminator and we're finished. */ - if (*data == term && i == array_size - 1) + terminator (or we don't care about one) + and we're finished. */ + if ((*data == term || !term) && i == array_size - 1) return len; pr_debug("Char %u (got %u nums) `%u' unexpected\n", @@ -148,7 +151,8 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[], /* Returns 0, or length of numbers: 192,168,1,1,5,6 */ static int try_rfc959(const char *data, size_t dlen, - struct nf_conntrack_man *cmd, char term) + struct nf_conntrack_man *cmd, char term, + unsigned int *offset) { int length; u_int32_t array[6]; @@ -163,6 +167,33 @@ static int try_rfc959(const char *data, size_t dlen, return length; } +/* + * From RFC 1123: + * The format of the 227 reply to a PASV command is not + * well standardized. In particular, an FTP client cannot + * assume that the parentheses shown on page 40 of RFC-959 + * will be present (and in fact, Figure 3 on page 43 omits + * them). Therefore, a User-FTP program that interprets + * the PASV reply must scan the reply for the first digit + * of the host and port numbers. + */ +static int try_rfc1123(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term, + unsigned int *offset) +{ + int i; + for (i = 0; i < dlen; i++) + if (isdigit(data[i])) + break; + + if (i == dlen) + return 0; + + *offset += i; + + return try_rfc959(data + i, dlen - i, cmd, 0, offset); +} + /* Grab port: number up to delimiter */ static int get_port(const char *data, int start, size_t dlen, char delim, __be16 *port) @@ -191,7 +222,7 @@ static int get_port(const char *data, int start, size_t dlen, char delim, /* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, - char term) + char term, unsigned int *offset) { char delim; int length; @@ -239,7 +270,8 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, /* Returns 0, or length of numbers: |||6446| */ static int try_epsv_response(const char *data, size_t dlen, - struct nf_conntrack_man *cmd, char term) + struct nf_conntrack_man *cmd, char term, + unsigned int *offset) { char delim; @@ -261,9 +293,10 @@ static int find_pattern(const char *data, size_t dlen, unsigned int *numlen, struct nf_conntrack_man *cmd, int (*getnum)(const char *, size_t, - struct nf_conntrack_man *, char)) + struct nf_conntrack_man *, char, + unsigned int *)) { - size_t i; + size_t i = plen; pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); if (dlen == 0) @@ -293,16 +326,18 @@ static int find_pattern(const char *data, size_t dlen, pr_debug("Pattern matches!\n"); /* Now we've found the constant string, try to skip to the 'skip' character */ - for (i = plen; data[i] != skip; i++) - if (i == dlen - 1) return -1; + if (skip) { + for (i = plen; data[i] != skip; i++) + if (i == dlen - 1) return -1; - /* Skip over the last character */ - i++; + /* Skip over the last character */ + i++; + } pr_debug("Skipped up to `%c'!\n", skip); *numoff = i; - *numlen = getnum(data + i, dlen - i, cmd, term); + *numlen = getnum(data + i, dlen - i, cmd, term, numoff); if (!*numlen) return -1; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 388656d5a9e..4b60a87b759 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -148,7 +148,7 @@ void nf_log_packet(struct net *net, va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); va_end(args); - logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); + logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix); } rcu_read_unlock(); } @@ -368,17 +368,18 @@ static int __net_init nf_log_net_init(struct net *net) return 0; out_sysctl: - /* For init_net: errors will trigger panic, don't unroll on error. */ - if (!net_eq(net, &init_net)) - remove_proc_entry("nf_log", net->nf.proc_netfilter); - +#ifdef CONFIG_PROC_FS + remove_proc_entry("nf_log", net->nf.proc_netfilter); +#endif return ret; } static void __net_exit nf_log_net_exit(struct net *net) { netfilter_log_sysctl_exit(net); +#ifdef CONFIG_PROC_FS remove_proc_entry("nf_log", net->nf.proc_netfilter); +#endif } static struct pernet_operations nf_log_net_ops = { diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 5fea563afe3..85e20a91908 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -104,7 +104,7 @@ static void mangle_contents(struct sk_buff *skb, /* move post-replacement */ memmove(data + match_offset + rep_len, data + match_offset + match_len, - skb->tail - (skb->network_header + dataoff + + skb_tail_pointer(skb) - (skb_network_header(skb) + dataoff + match_offset + match_len)); /* insert data from buffer */ diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index faf1e9300d8..962e9792e31 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -602,7 +602,8 @@ static struct nf_loginfo default_loginfo = { /* log handler for internal netfilter logging api */ void -nfulnl_log_packet(u_int8_t pf, +nfulnl_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -615,7 +616,6 @@ nfulnl_log_packet(u_int8_t pf, const struct nf_loginfo *li; unsigned int qthreshold; unsigned int plen; - struct net *net = dev_net(in ? in : out); struct nfnl_log_net *log = nfnl_log_pernet(net); if (li_user && li_user->type == NF_LOG_TYPE_ULOG) @@ -1045,7 +1045,9 @@ static int __net_init nfnl_log_net_init(struct net *net) static void __net_exit nfnl_log_net_exit(struct net *net) { +#ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); +#endif } static struct pernet_operations nfnl_log_net_ops = { diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 2e0e835baf7..c011543bff5 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -41,6 +41,14 @@ #define NFQNL_QMAX_DEFAULT 1024 +/* We're using struct nlattr which has 16bit nla_len. Note that nla_len + * includes the header length. Thus, the maximum packet length that we + * support is 65531 bytes. We send truncated packets if the specified length + * is larger than that. Userspace can check for presence of NFQA_CAP_LEN + * attribute to detect truncation. + */ +#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) + struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ struct rcu_head rcu; @@ -122,7 +130,7 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, inst->queue_num = queue_num; inst->peer_portid = portid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; - inst->copy_range = 0xffff; + inst->copy_range = NFQNL_MAX_COPY_RANGE; inst->copy_mode = NFQNL_COPY_NONE; spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); @@ -333,10 +341,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, return NULL; data_len = ACCESS_ONCE(queue->copy_range); - if (data_len == 0 || data_len > entskb->len) + if (data_len > entskb->len) data_len = entskb->len; - if (!entskb->head_frag || skb_headlen(entskb) < L1_CACHE_BYTES || skb_shinfo(entskb)->nr_frags >= MAX_SKB_FRAGS) @@ -465,7 +472,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) goto nla_put_failure; - if (cap_len > 0 && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) + if (cap_len > data_len && + nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) goto nla_put_failure; if (nfqnl_put_packet_info(skb, entskb)) @@ -509,10 +517,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, } spin_lock_bh(&queue->lock); - if (!queue->peer_portid) { - err = -EINVAL; - goto err_out_free_nskb; - } if (queue->queue_total >= queue->queue_maxlen) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; @@ -731,13 +735,8 @@ nfqnl_set_mode(struct nfqnl_instance *queue, case NFQNL_COPY_PACKET: queue->copy_mode = mode; - /* We're using struct nlattr which has 16bit nla_len. Note that - * nla_len includes the header length. Thus, the maximum packet - * length that we support is 65531 bytes. We send truncated - * packets if the specified length is larger than that. - */ - if (range > 0xffff - NLA_HDRLEN) - queue->copy_range = 0xffff - NLA_HDRLEN; + if (range == 0 || range > NFQNL_MAX_COPY_RANGE) + queue->copy_range = NFQNL_MAX_COPY_RANGE; else queue->copy_range = range; break; @@ -800,7 +799,7 @@ static int nfqnl_rcv_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) @@ -1285,7 +1284,9 @@ static int __net_init nfnl_queue_net_init(struct net *net) static void __net_exit nfnl_queue_net_exit(struct net *net) { +#ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); +#endif } static struct pernet_operations nfnl_queue_net_ops = { diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index a60261cb0e8..da35ac06a97 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -26,6 +26,9 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) if (skb->nfct != NULL) return XT_CONTINUE; + /* special case the untracked ct : we want the percpu object */ + if (!ct) + ct = nf_ct_untracked_get(); atomic_inc(&ct->ct_general.use); skb->nfct = &ct->ct_general; skb->nfctinfo = IP_CT_NEW; @@ -186,8 +189,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, int ret = -EOPNOTSUPP; if (info->flags & XT_CT_NOTRACK) { - ct = nf_ct_untracked_get(); - atomic_inc(&ct->ct_general.use); + ct = NULL; goto out; } @@ -311,7 +313,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, struct nf_conn *ct = info->ct; struct nf_conn_help *help; - if (!nf_ct_is_untracked(ct)) { + if (ct && !nf_ct_is_untracked(ct)) { help = nfct_help(ct); if (help) module_put(help->helper->me); @@ -319,8 +321,8 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, nf_ct_l3proto_module_put(par->family); xt_ct_destroy_timeout(ct); + nf_ct_put(info->ct); } - nf_ct_put(info->ct); } static void xt_ct_tg_destroy_v0(const struct xt_tgdtor_param *par) diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index fe573f6c9e9..5ab24843370 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -466,7 +466,8 @@ log_packet_common(struct sbuff *m, static void -ipt_log_packet(u_int8_t pf, +ipt_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -475,7 +476,6 @@ ipt_log_packet(u_int8_t pf, const char *prefix) { struct sbuff *m; - struct net *net = dev_net(in ? in : out); /* FIXME: Disabled from containers until syslog ns is supported */ if (!net_eq(net, &init_net)) @@ -737,7 +737,7 @@ static void dump_ipv6_packet(struct sbuff *m, dump_sk_uid_gid(m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ - if (!recurse && skb->mark) + if (recurse && skb->mark) sb_add(m, "MARK=0x%x ", skb->mark); } @@ -797,7 +797,8 @@ fallback: } static void -ip6t_log_packet(u_int8_t pf, +ip6t_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -806,7 +807,6 @@ ip6t_log_packet(u_int8_t pf, const char *prefix) { struct sbuff *m; - struct net *net = dev_net(in ? in : out); /* FIXME: Disabled from containers until syslog ns is supported */ if (!net_eq(net, &init_net)) @@ -833,17 +833,18 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_log_info *loginfo = par->targinfo; struct nf_loginfo li; + struct net *net = dev_net(par->in ? par->in : par->out); li.type = NF_LOG_TYPE_LOG; li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; if (par->family == NFPROTO_IPV4) - ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, + ipt_log_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li, loginfo->prefix); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) else if (par->family == NFPROTO_IPV6) - ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in, + ip6t_log_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in, par->out, &li, loginfo->prefix); #endif else diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index a17dd0f589b..fb7497c928a 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -26,13 +26,14 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_nflog_info *info = par->targinfo; struct nf_loginfo li; + struct net *net = dev_net(par->in ? par->in : par->out); li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; - nfulnl_log_packet(par->family, par->hooknum, skb, par->in, + nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in, par->out, &li, info->prefix); return XT_CONTINUE; } diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 25fd1c4e1ee..1eb1a44bfd3 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -30,17 +30,28 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) static unsigned int tcpoptstrip_mangle_packet(struct sk_buff *skb, - const struct xt_tcpoptstrip_target_info *info, + const struct xt_action_param *par, unsigned int tcphoff, unsigned int minlen) { + const struct xt_tcpoptstrip_target_info *info = par->targinfo; unsigned int optl, i, j; struct tcphdr *tcph; u_int16_t n, o; u_int8_t *opt; + int len; + + /* This is a fragment, no TCP header is available */ + if (par->fragoff != 0) + return XT_CONTINUE; if (!skb_make_writable(skb, skb->len)) return NF_DROP; + len = skb->len - tcphoff; + if (len < (int)sizeof(struct tcphdr) || + tcp_hdr(skb)->doff * 4 > len) + return NF_DROP; + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); opt = (u_int8_t *)tcph; @@ -76,7 +87,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, static unsigned int tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb), + return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb), sizeof(struct iphdr) + sizeof(struct tcphdr)); } @@ -94,7 +105,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) if (tcphoff < 0) return NF_DROP; - return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff, + return tcpoptstrip_mangle_packet(skb, par, tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); } #endif diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index bd93e51d30a..292934d2348 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -200,7 +200,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct xt_tee_priv *priv; priv = container_of(this, struct xt_tee_priv, notifier); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index 49c5ff7f6dd..68ff29f6086 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -22,6 +22,7 @@ #include <net/ip6_fib.h> #endif +#include <linux/netfilter_ipv6.h> #include <linux/netfilter/xt_addrtype.h> #include <linux/netfilter/x_tables.h> @@ -33,12 +34,12 @@ MODULE_ALIAS("ip6t_addrtype"); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, - const struct in6_addr *addr) + const struct in6_addr *addr, u16 mask) { const struct nf_afinfo *afinfo; struct flowi6 flow; struct rt6_info *rt; - u32 ret; + u32 ret = 0; int route_err; memset(&flow, 0, sizeof(flow)); @@ -49,12 +50,19 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, rcu_read_lock(); afinfo = nf_get_afinfo(NFPROTO_IPV6); - if (afinfo != NULL) + if (afinfo != NULL) { + const struct nf_ipv6_ops *v6ops; + + if (dev && (mask & XT_ADDRTYPE_LOCAL)) { + v6ops = nf_get_ipv6_ops(); + if (v6ops && v6ops->chk_addr(net, addr, dev, true)) + ret = XT_ADDRTYPE_LOCAL; + } route_err = afinfo->route(net, (struct dst_entry **)&rt, - flowi6_to_flowi(&flow), !!dev); - else + flowi6_to_flowi(&flow), false); + } else { route_err = 1; - + } rcu_read_unlock(); if (route_err) @@ -62,15 +70,12 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, if (rt->rt6i_flags & RTF_REJECT) ret = XT_ADDRTYPE_UNREACHABLE; - else - ret = 0; - if (rt->rt6i_flags & RTF_LOCAL) + if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) ret |= XT_ADDRTYPE_LOCAL; if (rt->rt6i_flags & RTF_ANYCAST) ret |= XT_ADDRTYPE_ANYCAST; - dst_release(&rt->dst); return ret; } @@ -90,7 +95,7 @@ static bool match_type6(struct net *net, const struct net_device *dev, if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | XT_ADDRTYPE_UNREACHABLE) & mask) - return !!(mask & match_lookup_rt6(net, dev, addr)); + return !!(mask & match_lookup_rt6(net, dev, addr, mask)); return true; } diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index ed0db15ab00..7720b036d76 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -18,7 +18,7 @@ static bool xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateest_match_info *info = par->matchinfo; - struct gnet_stats_rate_est *r; + struct gnet_stats_rate_est64 *r; u_int32_t bps1, bps2, pps1, pps2; bool ret = true; diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 63b2bdb59e9..02704245710 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -107,7 +107,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, { const struct iphdr *iph = ip_hdr(skb); struct udphdr _hdr, *hp = NULL; - struct sock *sk; + struct sock *sk = skb->sk; __be32 uninitialized_var(daddr), uninitialized_var(saddr); __be16 uninitialized_var(dport), uninitialized_var(sport); u8 uninitialized_var(protocol); @@ -155,9 +155,11 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, } #endif - sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, - saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); - if (sk != NULL) { + if (!sk) + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, + saddr, daddr, sport, dport, + par->in, NFT_LOOKUP_ANY); + if (sk) { bool wildcard; bool transparent = true; @@ -173,7 +175,8 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, (sk->sk_state == TCP_TIME_WAIT && inet_twsk(sk)->tw_transparent)); - xt_socket_put_sk(sk); + if (sk != skb->sk) + xt_socket_put_sk(sk); if (wildcard || !transparent) sk = NULL; @@ -260,7 +263,7 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6hdr *iph = ipv6_hdr(skb); struct udphdr _hdr, *hp = NULL; - struct sock *sk; + struct sock *sk = skb->sk; struct in6_addr *daddr = NULL, *saddr = NULL; __be16 uninitialized_var(dport), uninitialized_var(sport); int thoff = 0, uninitialized_var(tproto); @@ -291,9 +294,11 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) return false; } - sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, - saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); - if (sk != NULL) { + if (!sk) + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + saddr, daddr, sport, dport, + par->in, NFT_LOOKUP_ANY); + if (sk) { bool wildcard; bool transparent = true; @@ -309,7 +314,8 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) (sk->sk_state == TCP_TIME_WAIT && inet_twsk(sk)->tw_transparent)); - xt_socket_put_sk(sk); + if (sk != skb->sk) + xt_socket_put_sk(sk); if (wildcard || !transparent) sk = NULL; diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index d8d42433755..6bb1d42f0fa 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -245,6 +245,71 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, } } +/** + * netlbl_domhsh_validate - Validate a new domain mapping entry + * @entry: the entry to validate + * + * This function validates the new domain mapping entry to ensure that it is + * a valid entry. Returns zero on success, negative values on failure. + * + */ +static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry) +{ + struct netlbl_af4list *iter4; + struct netlbl_domaddr4_map *map4; +#if IS_ENABLED(CONFIG_IPV6) + struct netlbl_af6list *iter6; + struct netlbl_domaddr6_map *map6; +#endif /* IPv6 */ + + if (entry == NULL) + return -EINVAL; + + switch (entry->type) { + case NETLBL_NLTYPE_UNLABELED: + if (entry->type_def.cipsov4 != NULL || + entry->type_def.addrsel != NULL) + return -EINVAL; + break; + case NETLBL_NLTYPE_CIPSOV4: + if (entry->type_def.cipsov4 == NULL) + return -EINVAL; + break; + case NETLBL_NLTYPE_ADDRSELECT: + netlbl_af4list_foreach(iter4, &entry->type_def.addrsel->list4) { + map4 = netlbl_domhsh_addr4_entry(iter4); + switch (map4->type) { + case NETLBL_NLTYPE_UNLABELED: + if (map4->type_def.cipsov4 != NULL) + return -EINVAL; + break; + case NETLBL_NLTYPE_CIPSOV4: + if (map4->type_def.cipsov4 == NULL) + return -EINVAL; + break; + default: + return -EINVAL; + } + } +#if IS_ENABLED(CONFIG_IPV6) + netlbl_af6list_foreach(iter6, &entry->type_def.addrsel->list6) { + map6 = netlbl_domhsh_addr6_entry(iter6); + switch (map6->type) { + case NETLBL_NLTYPE_UNLABELED: + break; + default: + return -EINVAL; + } + } +#endif /* IPv6 */ + break; + default: + return -EINVAL; + } + + return 0; +} + /* * Domain Hash Table Functions */ @@ -311,6 +376,10 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, struct netlbl_af6list *tmp6; #endif /* IPv6 */ + ret_val = netlbl_domhsh_validate(entry); + if (ret_val != 0) + return ret_val; + /* XXX - we can remove this RCU read lock as the spinlock protects the * entire function, but before we do we need to fixup the * netlbl_af[4,6]list RCU functions to do "the right thing" with diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 8a6c6ea466d..af3531926ee 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -708,7 +708,7 @@ unlhsh_remove_return: * netlbl_unlhsh_netdev_handler - Network device notification handler * @this: notifier block * @event: the event - * @ptr: the network device (cast to void) + * @ptr: the netdevice notifier info (cast to void) * * Description: * Handle network device events, although at present all we care about is a @@ -717,10 +717,9 @@ unlhsh_remove_return: * */ static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, - unsigned long event, - void *ptr) + unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netlbl_unlhsh_iface *iface = NULL; if (!net_eq(dev_net(dev), &init_net)) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 12ac6b47a35..9b6b115e008 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -747,9 +747,13 @@ static void netlink_skb_destructor(struct sk_buff *skb) atomic_dec(&ring->pending); sock_put(sk); - skb->data = NULL; + skb->head = NULL; } #endif + if (is_vmalloc_addr(skb->head)) { + vfree(skb->head); + skb->head = NULL; + } if (skb->sk != NULL) sock_rfree(skb); } @@ -854,16 +858,23 @@ netlink_unlock_table(void) wake_up(&nl_table_wait); } +static bool netlink_compare(struct net *net, struct sock *sk) +{ + return net_eq(sock_net(sk), net); +} + static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { - struct nl_portid_hash *hash = &nl_table[protocol].hash; + struct netlink_table *table = &nl_table[protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; struct sock *sk; read_lock(&nl_table_lock); head = nl_portid_hashfn(hash, portid); sk_for_each(sk, head) { - if (net_eq(sock_net(sk), net) && (nlk_sk(sk)->portid == portid)) { + if (table->compare(net, sk) && + (nlk_sk(sk)->portid == portid)) { sock_hold(sk); goto found; } @@ -976,7 +987,8 @@ netlink_update_listeners(struct sock *sk) static int netlink_insert(struct sock *sk, struct net *net, u32 portid) { - struct nl_portid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct netlink_table *table = &nl_table[sk->sk_protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; int err = -EADDRINUSE; struct sock *osk; @@ -986,7 +998,8 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid) head = nl_portid_hashfn(hash, portid); len = 0; sk_for_each(osk, head) { - if (net_eq(sock_net(osk), net) && (nlk_sk(osk)->portid == portid)) + if (table->compare(net, osk) && + (nlk_sk(osk)->portid == portid)) break; len++; } @@ -1161,6 +1174,7 @@ static int netlink_release(struct socket *sock) kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; + nl_table[sk->sk_protocol].compare = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } @@ -1183,7 +1197,8 @@ static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); - struct nl_portid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct netlink_table *table = &nl_table[sk->sk_protocol]; + struct nl_portid_hash *hash = &table->hash; struct hlist_head *head; struct sock *osk; s32 portid = task_tgid_vnr(current); @@ -1195,7 +1210,7 @@ retry: netlink_table_grab(); head = nl_portid_hashfn(hash, portid); sk_for_each(osk, head) { - if (!net_eq(sock_net(osk), net)) + if (!table->compare(net, osk)) continue; if (nlk_sk(osk)->portid == portid) { /* Bind collision, search negative portid values. */ @@ -1420,6 +1435,35 @@ struct sock *netlink_getsockbyfilp(struct file *filp) return sock; } +static struct sk_buff *netlink_alloc_large_skb(unsigned int size) +{ + struct sk_buff *skb; + void *data; + + if (size <= NLMSG_GOODSIZE) + return alloc_skb(size, GFP_KERNEL); + + skb = alloc_skb_head(GFP_KERNEL); + if (skb == NULL) + return NULL; + + data = vmalloc(size); + if (data == NULL) + goto err; + + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; + skb->len = 0; + skb->destructor = netlink_skb_destructor; + + return skb; +err: + kfree_skb(skb); + return NULL; +} + /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the @@ -1510,7 +1554,7 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) return skb; delta = skb->end - skb->tail; - if (delta * 2 < skb->truesize) + if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; if (skb_shared(skb)) { @@ -2096,7 +2140,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; - skb = alloc_skb(len, GFP_KERNEL); + skb = netlink_alloc_large_skb(len); if (skb == NULL) goto out; @@ -2282,9 +2326,12 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; + nl_table[unit].compare = netlink_compare; if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].flags = cfg->flags; + if (cfg->compare) + nl_table[unit].compare = cfg->compare; } nl_table[unit].registered = 1; } else { @@ -2707,6 +2754,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *s; struct nl_seq_iter *iter; + struct net *net; int i, j; ++*pos; @@ -2714,11 +2762,12 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (v == SEQ_START_TOKEN) return netlink_seq_socket_idx(seq, 0); + net = seq_file_net(seq); iter = seq->private; s = v; do { s = sk_next(s); - } while (s && sock_net(s) != seq_file_net(seq)); + } while (s && !nl_table[s->sk_protocol].compare(net, s)); if (s) return s; @@ -2730,7 +2779,8 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) for (; j <= hash->mask; j++) { s = sk_head(&hash->table[j]); - while (s && sock_net(s) != seq_file_net(seq)) + + while (s && !nl_table[s->sk_protocol].compare(net, s)) s = sk_next(s); if (s) { iter->link = i; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index ed8522265f4..eaa88d187cd 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -73,6 +73,7 @@ struct netlink_table { struct mutex *cb_mutex; struct module *module; void (*bind)(int group); + bool (*compare)(struct net *net, struct sock *sock); int registered; }; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index ec0c80fde69..698814bfa7a 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -117,7 +117,7 @@ static void nr_kill_by_device(struct net_device *dev) */ static int nr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/nfc/Makefile b/net/nfc/Makefile index fb799deaed4..a76f4533cb6 100644 --- a/net/nfc/Makefile +++ b/net/nfc/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_NFC) += nfc.o obj-$(CONFIG_NFC_NCI) += nci/ obj-$(CONFIG_NFC_HCI) += hci/ -#obj-$(CONFIG_NFC_LLCP) += llcp/ nfc-objs := core.o netlink.o af_nfc.o rawsock.o llcp_core.o llcp_commands.o \ llcp_sock.o diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index ef4feec6cd8..c3235675f35 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -78,7 +78,7 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct ovs_net *ovs_net; - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vport *vport = NULL; if (!ovs_is_internal_dev(dev)) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8ec1bca7f85..79fe63246b2 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3331,10 +3331,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, } -static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data) +static int packet_notifier(struct notifier_block *this, + unsigned long msg, void *ptr) { struct sock *sk; - struct net_device *dev = data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); rcu_read_lock(); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 45a7df6575d..56a6146ac94 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -292,9 +292,9 @@ static void phonet_route_autodel(struct net_device *dev) /* notify Phonet of device events */ static int phonet_device_notify(struct notifier_block *me, unsigned long what, - void *arg) + void *ptr) { - struct net_device *dev = arg; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (what) { case NETDEV_REGISTER: diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 9c834745159..e98fcfbe600 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -202,10 +202,10 @@ static void rose_kill_by_device(struct net_device *dev) /* * Handle device status changes. */ -static int rose_device_event(struct notifier_block *this, unsigned long event, - void *ptr) +static int rose_device_event(struct notifier_block *this, + unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5d676edc22a..977c10e0631 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -243,7 +243,7 @@ nla_put_failure: static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; if (event == NETDEV_UNREGISTER) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 823463adbd2..189e3c5b3d0 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -231,14 +231,14 @@ override: } if (R_tab) { police->rate_present = true; - psched_ratecfg_precompute(&police->rate, R_tab->rate.rate); + psched_ratecfg_precompute(&police->rate, &R_tab->rate); qdisc_put_rtab(R_tab); } else { police->rate_present = false; } if (P_tab) { police->peak_present = true; - psched_ratecfg_precompute(&police->peak, P_tab->rate.rate); + psched_ratecfg_precompute(&police->peak, &P_tab->rate); qdisc_put_rtab(P_tab); } else { police->peak_present = false; @@ -376,9 +376,9 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) }; if (police->rate_present) - opt.rate.rate = psched_ratecfg_getrate(&police->rate); + psched_ratecfg_getrate(&opt.rate, &police->rate); if (police->peak_present) - opt.peakrate.rate = psched_ratecfg_getrate(&police->peak); + psched_ratecfg_getrate(&opt.peakrate, &police->peak); if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; if (police->tcfp_result && diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 1bc210ffcba..71a56886255 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -130,7 +130,7 @@ struct cbq_class { psched_time_t penalized; struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct tc_cbq_xstats xstats; struct tcf_proto *filter_list; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 759b308d1a8..8302717ea30 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -25,7 +25,7 @@ struct drr_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct list_head alist; struct Qdisc *qdisc; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index eac7e0ee23c..20224086cc2 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -898,14 +898,16 @@ void dev_shutdown(struct net_device *dev) WARN_ON(timer_pending(&dev->watchdog_timer)); } -void psched_ratecfg_precompute(struct psched_ratecfg *r, u32 rate) +void psched_ratecfg_precompute(struct psched_ratecfg *r, + const struct tc_ratespec *conf) { u64 factor; u64 mult; int shift; - r->rate_bps = (u64)rate << 3; - r->shift = 0; + memset(r, 0, sizeof(*r)); + r->overhead = conf->overhead; + r->rate_bps = (u64)conf->rate << 3; r->mult = 1; /* * Calibrate mult, shift so that token counting is accurate diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 9facea03fae..c4075610502 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -114,7 +114,7 @@ struct hfsc_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; unsigned int level; /* class level in hierarchy */ struct tcf_proto *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 79b1876b6cd..162fb800754 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -78,7 +78,7 @@ struct htb_class { /* general class parameters */ struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct tc_htb_xstats xstats; /* our special stats */ int refcnt; /* usage count of this class */ @@ -109,7 +109,7 @@ struct htb_class { } un; struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ struct rb_node pq_node; /* node for event queue */ - psched_time_t pq_key; + s64 pq_key; int prio_activity; /* for which prios are we active */ enum htb_cmode cmode; /* current mode of the class */ @@ -121,10 +121,10 @@ struct htb_class { /* token bucket parameters */ struct psched_ratecfg rate; struct psched_ratecfg ceil; - s64 buffer, cbuffer; /* token bucket depth/rate */ - psched_tdiff_t mbuffer; /* max wait time */ - s64 tokens, ctokens; /* current number of tokens */ - psched_time_t t_c; /* checkpoint time */ + s64 buffer, cbuffer; /* token bucket depth/rate */ + s64 mbuffer; /* max wait time */ + s64 tokens, ctokens; /* current number of tokens */ + s64 t_c; /* checkpoint time */ }; struct htb_sched { @@ -141,15 +141,15 @@ struct htb_sched { struct rb_root wait_pq[TC_HTB_MAXDEPTH]; /* time of nearest event per level (row) */ - psched_time_t near_ev_cache[TC_HTB_MAXDEPTH]; + s64 near_ev_cache[TC_HTB_MAXDEPTH]; int defcls; /* class where unclassified flows go to */ /* filters for qdisc itself */ struct tcf_proto *filter_list; - int rate2quantum; /* quant = rate / rate2quantum */ - psched_time_t now; /* cached dequeue time */ + int rate2quantum; /* quant = rate / rate2quantum */ + s64 now; /* cached dequeue time */ struct qdisc_watchdog watchdog; /* non shaped skbs; let them go directly thru */ @@ -664,8 +664,8 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, * next pending event (0 for no event in pq, q->now for too many events). * Note: Applied are events whose have cl->pq_key <= q->now. */ -static psched_time_t htb_do_events(struct htb_sched *q, int level, - unsigned long start) +static s64 htb_do_events(struct htb_sched *q, int level, + unsigned long start) { /* don't run for longer than 2 jiffies; 2 is used instead of * 1 to simplify things when jiffy is going to be incremented @@ -857,7 +857,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) struct sk_buff *skb; struct htb_sched *q = qdisc_priv(sch); int level; - psched_time_t next_event; + s64 next_event; unsigned long start_at; /* try to dequeue direct packets as high prio (!) to minimize cpu work */ @@ -880,7 +880,7 @@ ok: for (level = 0; level < TC_HTB_MAXDEPTH; level++) { /* common case optimization - skip event handler quickly */ int m; - psched_time_t event; + s64 event; if (q->now >= q->near_ev_cache[level]) { event = htb_do_events(q, level, start_at); @@ -1090,9 +1090,9 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, memset(&opt, 0, sizeof(opt)); - opt.rate.rate = psched_ratecfg_getrate(&cl->rate); + psched_ratecfg_getrate(&opt.rate, &cl->rate); opt.buffer = PSCHED_NS2TICKS(cl->buffer); - opt.ceil.rate = psched_ratecfg_getrate(&cl->ceil); + psched_ratecfg_getrate(&opt.ceil, &cl->ceil); opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer); opt.quantum = cl->quantum; opt.prio = cl->prio; @@ -1117,8 +1117,8 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) if (!cl->level && cl->un.leaf.q) cl->qstats.qlen = cl->un.leaf.q->q.qlen; - cl->xstats.tokens = cl->tokens; - cl->xstats.ctokens = cl->ctokens; + cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); + cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || @@ -1200,7 +1200,7 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, parent->un.leaf.q = new_q ? new_q : &noop_qdisc; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; - parent->t_c = psched_get_time(); + parent->t_c = ktime_to_ns(ktime_get()); parent->cmode = HTB_CAN_SEND; } @@ -1417,8 +1417,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* set class to be in HTB_CAN_SEND state */ cl->tokens = PSCHED_TICKS2NS(hopt->buffer); cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer); - cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC; /* 1min */ - cl->t_c = psched_get_time(); + cl->mbuffer = 60ULL * NSEC_PER_SEC; /* 1min */ + cl->t_c = ktime_to_ns(ktime_get()); cl->cmode = HTB_CAN_SEND; /* attach to the hash list and parent's family */ @@ -1459,8 +1459,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->prio = TC_HTB_NUMPRIO - 1; } - psched_ratecfg_precompute(&cl->rate, hopt->rate.rate); - psched_ratecfg_precompute(&cl->ceil, hopt->ceil.rate); + psched_ratecfg_precompute(&cl->rate, &hopt->rate); + psched_ratecfg_precompute(&cl->ceil, &hopt->ceil); cl->buffer = PSCHED_TICKS2NS(hopt->buffer); cl->cbuffer = PSCHED_TICKS2NS(hopt->buffer); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index d51852bba01..7c195d972bf 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -138,7 +138,7 @@ struct qfq_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct Qdisc *qdisc; struct list_head alist; /* Link for active-classes list. */ struct qfq_aggregate *agg; /* Parent aggregate. */ diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index c8388f3c342..1aaf1b6e51a 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -116,14 +116,57 @@ struct tbf_sched_data { struct qdisc_watchdog watchdog; /* Watchdog timer */ }; + +/* GSO packet is too big, segment it so that tbf can transmit + * each segment in time + */ +static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct sk_buff *segs, *nskb; + netdev_features_t features = netif_skb_features(skb); + int ret, nb; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + + if (IS_ERR_OR_NULL(segs)) + return qdisc_reshape_fail(skb, sch); + + nb = 0; + while (segs) { + nskb = segs->next; + segs->next = NULL; + if (likely(segs->len <= q->max_size)) { + qdisc_skb_cb(segs)->pkt_len = segs->len; + ret = qdisc_enqueue(segs, q->qdisc); + } else { + ret = qdisc_reshape_fail(skb, sch); + } + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + } else { + nb++; + } + segs = nskb; + } + sch->q.qlen += nb; + if (nb > 1) + qdisc_tree_decrease_qlen(sch, 1 - nb); + consume_skb(skb); + return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; +} + static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); int ret; - if (qdisc_pkt_len(skb) > q->max_size) + if (qdisc_pkt_len(skb) > q->max_size) { + if (skb_is_gso(skb)) + return tbf_segment(skb, sch); return qdisc_reshape_fail(skb, sch); - + } ret = qdisc_enqueue(skb, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) @@ -298,9 +341,9 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) q->tokens = q->buffer; q->ptokens = q->mtu; - psched_ratecfg_precompute(&q->rate, rtab->rate.rate); + psched_ratecfg_precompute(&q->rate, &rtab->rate); if (ptab) { - psched_ratecfg_precompute(&q->peak, ptab->rate.rate); + psched_ratecfg_precompute(&q->peak, &ptab->rate); q->peak_present = true; } else { q->peak_present = false; @@ -350,9 +393,9 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; opt.limit = q->limit; - opt.rate.rate = psched_ratecfg_getrate(&q->rate); + psched_ratecfg_getrate(&opt.rate, &q->rate); if (q->peak_present) - opt.peakrate.rate = psched_ratecfg_getrate(&q->peak); + psched_ratecfg_getrate(&opt.peakrate, &q->peak); else memset(&opt.peakrate, 0, sizeof(opt.peakrate)); opt.mtu = PSCHED_NS2TICKS(q->mtu); diff --git a/net/sctp/input.c b/net/sctp/input.c index 4b2c83146aa..6533d81a638 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -589,7 +589,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) struct sctp_association *asoc = NULL; struct sctp_transport *transport; struct inet_sock *inet; - sk_buff_data_t saveip, savesctp; + __u16 saveip, savesctp; int err; struct net *net = dev_net(skb->dev); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 391a245d520..8ee553b499c 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -153,7 +153,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct sctp_association *asoc; struct sctp_transport *transport; struct ipv6_pinfo *np; - sk_buff_data_t saveip, savesctp; + __be16 saveip, savesctp; int err; struct net *net = dev_net(skb->dev); diff --git a/net/socket.c b/net/socket.c index 6b94633ca61..21fd29f63ed 100644 --- a/net/socket.c +++ b/net/socket.c @@ -104,6 +104,12 @@ #include <linux/route.h> #include <linux/sockios.h> #include <linux/atalk.h> +#include <net/ll_poll.h> + +#ifdef CONFIG_NET_LL_RX_POLL +unsigned long sysctl_net_ll_poll __read_mostly; +EXPORT_SYMBOL_GPL(sysctl_net_ll_poll); +#endif static int sock_no_open(struct inode *irrelevant, struct file *dontcare); static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, @@ -1956,7 +1962,7 @@ struct used_address { unsigned int name_len; }; -static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg, +static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address) { @@ -2071,22 +2077,30 @@ out: * BSD sendmsg interface */ -SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, flags) +long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags) { int fput_needed, err; struct msghdr msg_sys; - struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed); + struct socket *sock; + sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; - err = __sys_sendmsg(sock, msg, &msg_sys, flags, NULL); + err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL); fput_light(sock->file, fput_needed); out: return err; } +SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, flags) +{ + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_sendmsg(fd, msg, flags); +} + /* * Linux sendmmsg interface */ @@ -2117,15 +2131,16 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, while (datagrams < vlen) { if (MSG_CMSG_COMPAT & flags) { - err = __sys_sendmsg(sock, (struct msghdr __user *)compat_entry, - &msg_sys, flags, &used_address); + err = ___sys_sendmsg(sock, (struct msghdr __user *)compat_entry, + &msg_sys, flags, &used_address); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { - err = __sys_sendmsg(sock, (struct msghdr __user *)entry, - &msg_sys, flags, &used_address); + err = ___sys_sendmsg(sock, + (struct msghdr __user *)entry, + &msg_sys, flags, &used_address); if (err < 0) break; err = put_user(err, &entry->msg_len); @@ -2149,10 +2164,12 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags) { + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; return __sys_sendmmsg(fd, mmsg, vlen, flags); } -static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg, +static int ___sys_recvmsg(struct socket *sock, struct msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, int nosec) { struct compat_msghdr __user *msg_compat = @@ -2244,23 +2261,31 @@ out: * BSD recvmsg interface */ -SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, - unsigned int, flags) +long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags) { int fput_needed, err; struct msghdr msg_sys; - struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed); + struct socket *sock; + sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; - err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0); + err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); fput_light(sock->file, fput_needed); out: return err; } +SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, + unsigned int, flags) +{ + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_recvmsg(fd, msg, flags); +} + /* * Linux recvmmsg interface */ @@ -2298,17 +2323,18 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, * No need to ask LSM for more than the first datagram. */ if (MSG_CMSG_COMPAT & flags) { - err = __sys_recvmsg(sock, (struct msghdr __user *)compat_entry, - &msg_sys, flags & ~MSG_WAITFORONE, - datagrams); + err = ___sys_recvmsg(sock, (struct msghdr __user *)compat_entry, + &msg_sys, flags & ~MSG_WAITFORONE, + datagrams); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { - err = __sys_recvmsg(sock, (struct msghdr __user *)entry, - &msg_sys, flags & ~MSG_WAITFORONE, - datagrams); + err = ___sys_recvmsg(sock, + (struct msghdr __user *)entry, + &msg_sys, flags & ~MSG_WAITFORONE, + datagrams); if (err < 0) break; err = put_user(err, &entry->msg_len); @@ -2375,6 +2401,9 @@ SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, int datagrams; struct timespec timeout_sys; + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + if (!timeout) return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL); @@ -2612,7 +2641,9 @@ static int __init sock_init(void) */ #ifdef CONFIG_NETFILTER - netfilter_init(); + err = netfilter_init(); + if (err) + goto out; #endif #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 7da6b457f66..fc2f78d6a9b 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -52,6 +52,8 @@ #include <linux/sunrpc/gss_api.h> #include <asm/uaccess.h> +#include "../netns.h" + static const struct rpc_authops authgss_ops; static const struct rpc_credops gss_credops; @@ -85,8 +87,6 @@ struct gss_auth { }; /* pipe_version >= 0 if and only if someone has a pipe open. */ -static int pipe_version = -1; -static atomic_t pipe_users = ATOMIC_INIT(0); static DEFINE_SPINLOCK(pipe_version_lock); static struct rpc_wait_queue pipe_version_rpc_waitqueue; static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue); @@ -266,24 +266,27 @@ struct gss_upcall_msg { char databuf[UPCALL_BUF_LEN]; }; -static int get_pipe_version(void) +static int get_pipe_version(struct net *net) { + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret; spin_lock(&pipe_version_lock); - if (pipe_version >= 0) { - atomic_inc(&pipe_users); - ret = pipe_version; + if (sn->pipe_version >= 0) { + atomic_inc(&sn->pipe_users); + ret = sn->pipe_version; } else ret = -EAGAIN; spin_unlock(&pipe_version_lock); return ret; } -static void put_pipe_version(void) +static void put_pipe_version(struct net *net) { - if (atomic_dec_and_lock(&pipe_users, &pipe_version_lock)) { - pipe_version = -1; + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + if (atomic_dec_and_lock(&sn->pipe_users, &pipe_version_lock)) { + sn->pipe_version = -1; spin_unlock(&pipe_version_lock); } } @@ -291,9 +294,10 @@ static void put_pipe_version(void) static void gss_release_msg(struct gss_upcall_msg *gss_msg) { + struct net *net = rpc_net_ns(gss_msg->auth->client); if (!atomic_dec_and_test(&gss_msg->count)) return; - put_pipe_version(); + put_pipe_version(net); BUG_ON(!list_empty(&gss_msg->list)); if (gss_msg->ctx != NULL) gss_put_ctx(gss_msg->ctx); @@ -439,7 +443,10 @@ static void gss_encode_msg(struct gss_upcall_msg *gss_msg, struct rpc_clnt *clnt, const char *service_name) { - if (pipe_version == 0) + struct net *net = rpc_net_ns(clnt); + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + if (sn->pipe_version == 0) gss_encode_v0_msg(gss_msg); else /* pipe_version == 1 */ gss_encode_v1_msg(gss_msg, clnt, service_name); @@ -455,7 +462,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, struct rpc_clnt *clnt, gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS); if (gss_msg == NULL) return ERR_PTR(-ENOMEM); - vers = get_pipe_version(); + vers = get_pipe_version(rpc_net_ns(clnt)); if (vers < 0) { kfree(gss_msg); return ERR_PTR(vers); @@ -559,24 +566,34 @@ out: static inline int gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) { + struct net *net = rpc_net_ns(gss_auth->client); + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe; struct rpc_cred *cred = &gss_cred->gc_base; struct gss_upcall_msg *gss_msg; + unsigned long timeout; DEFINE_WAIT(wait); - int err = 0; + int err; dprintk("RPC: %s for uid %u\n", __func__, from_kuid(&init_user_ns, cred->cr_uid)); retry: + err = 0; + /* Default timeout is 15s unless we know that gssd is not running */ + timeout = 15 * HZ; + if (!sn->gssd_running) + timeout = HZ >> 2; gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { err = wait_event_interruptible_timeout(pipe_version_waitqueue, - pipe_version >= 0, 15*HZ); - if (pipe_version < 0) { + sn->pipe_version >= 0, timeout); + if (sn->pipe_version < 0) { + if (err == 0) + sn->gssd_running = 0; warn_gssd(); err = -EACCES; } - if (err) + if (err < 0) goto out; goto retry; } @@ -707,20 +724,22 @@ out: static int gss_pipe_open(struct inode *inode, int new_version) { + struct net *net = inode->i_sb->s_fs_info; + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret = 0; spin_lock(&pipe_version_lock); - if (pipe_version < 0) { + if (sn->pipe_version < 0) { /* First open of any gss pipe determines the version: */ - pipe_version = new_version; + sn->pipe_version = new_version; rpc_wake_up(&pipe_version_rpc_waitqueue); wake_up(&pipe_version_waitqueue); - } else if (pipe_version != new_version) { + } else if (sn->pipe_version != new_version) { /* Trying to open a pipe of a different version */ ret = -EBUSY; goto out; } - atomic_inc(&pipe_users); + atomic_inc(&sn->pipe_users); out: spin_unlock(&pipe_version_lock); return ret; @@ -740,6 +759,7 @@ static int gss_pipe_open_v1(struct inode *inode) static void gss_pipe_release(struct inode *inode) { + struct net *net = inode->i_sb->s_fs_info; struct rpc_pipe *pipe = RPC_I(inode)->pipe; struct gss_upcall_msg *gss_msg; @@ -758,7 +778,7 @@ restart: } spin_unlock(&pipe->lock); - put_pipe_version(); + put_pipe_version(net); } static void diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 871c73c9216..29b4ba93ab3 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1287,7 +1287,7 @@ static bool use_gss_proxy(struct net *net) #ifdef CONFIG_PROC_FS -static bool set_gss_proxy(struct net *net, int type) +static int set_gss_proxy(struct net *net, int type) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret = 0; @@ -1317,10 +1317,12 @@ static inline bool gssp_ready(struct sunrpc_net *sn) return false; } -static int wait_for_gss_proxy(struct net *net) +static int wait_for_gss_proxy(struct net *net, struct file *file) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + if (file->f_flags & O_NONBLOCK && !gssp_ready(sn)) + return -EAGAIN; return wait_event_interruptible(sn->gssp_wq, gssp_ready(sn)); } @@ -1362,7 +1364,7 @@ static ssize_t read_gssp(struct file *file, char __user *buf, size_t len; int ret; - ret = wait_for_gss_proxy(net); + ret = wait_for_gss_proxy(net, file); if (ret) return ret; diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index 7111a4c9113..74d948f5d5a 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -28,7 +28,11 @@ struct sunrpc_net { wait_queue_head_t gssp_wq; struct rpc_clnt *gssp_clnt; int use_gss_proxy; + int pipe_version; + atomic_t pipe_users; struct proc_dir_entry *use_gssp_proc; + + unsigned int gssd_running; }; extern int sunrpc_net_id; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index a9129f8d707..e7ce4b3eb0b 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -216,11 +216,14 @@ rpc_destroy_inode(struct inode *inode) static int rpc_pipe_open(struct inode *inode, struct file *filp) { + struct net *net = inode->i_sb->s_fs_info; + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe; int first_open; int res = -ENXIO; mutex_lock(&inode->i_mutex); + sn->gssd_running = 1; pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -1069,6 +1072,8 @@ void rpc_pipefs_init_net(struct net *net) struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); mutex_init(&sn->pipefs_sb_lock); + sn->gssd_running = 1; + sn->pipe_version = -1; } /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f8529fc8e54..5356b120dbf 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -324,11 +324,17 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); * Note: If the task is ASYNC, and is being made runnable after sitting on an * rpc_wait_queue, this must be called with the queue spinlock held to protect * the wait queue operation. + * Note the ordering of rpc_test_and_set_running() and rpc_clear_queued(), + * which is needed to ensure that __rpc_execute() doesn't loop (due to the + * lockless RPC_IS_QUEUED() test) before we've had a chance to test + * the RPC_TASK_RUNNING flag. */ static void rpc_make_runnable(struct rpc_task *task) { + bool need_wakeup = !rpc_test_and_set_running(task); + rpc_clear_queued(task); - if (rpc_test_and_set_running(task)) + if (!need_wakeup) return; if (RPC_IS_ASYNC(task)) { INIT_WORK(&task->u.tk_work, rpc_async_schedule); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index c3f9e1ef7f5..06bdf5a1082 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -810,11 +810,15 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) goto badcred; argv->iov_base = (void*)((__be32*)argv->iov_base + slen); /* skip machname */ argv->iov_len -= slen*4; - + /* + * Note: we skip uid_valid()/gid_valid() checks here for + * backwards compatibility with clients that use -1 id's. + * Instead, -1 uid or gid is later mapped to the + * (export-specific) anonymous id by nfsd_setuser. + * Supplementary gid's will be left alone. + */ cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */ cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */ - if (!uid_valid(cred->cr_uid) || !gid_valid(cred->cr_gid)) - goto badcred; slen = svc_getnl(argv); /* gids length */ if (slen > 16 || (len -= (slen + 2)*4) < 0) goto badcred; @@ -823,8 +827,6 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) return SVC_CLOSE; for (i = 0; i < slen; i++) { kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); - if (!gid_valid(kgid)) - goto badcred; GROUP_AT(cred->cr_group_info, i) = kgid; } if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 120a676a336..fc60bea6316 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -251,9 +251,9 @@ static void disable_bearer(struct tipc_bearer *tb_ptr) * specified device. */ static int recv_notification(struct notifier_block *nb, unsigned long evt, - void *dv) + void *ptr) { - struct net_device *dev = (struct net_device *)dv; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct eth_bearer *eb_ptr = ð_bearers[0]; struct eth_bearer *stop = ð_bearers[MAX_ETH_BEARERS]; diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index 2a2864c25e1..baa9df4327d 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -244,9 +244,9 @@ static void disable_bearer(struct tipc_bearer *tb_ptr) * specified device. */ static int recv_notification(struct notifier_block *nb, unsigned long evt, - void *dv) + void *ptr) { - struct net_device *dev = (struct net_device *)dv; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ib_bearer *ib_ptr = &ib_bearers[0]; struct ib_bearer *stop = &ib_bearers[MAX_IB_BEARERS]; diff --git a/net/wireless/core.c b/net/wireless/core.c index 41cec1776f4..e4df7749022 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -775,10 +775,9 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, } static int cfg80211_netdev_notifier_call(struct notifier_block *nb, - unsigned long state, - void *ndev) + unsigned long state, void *ptr) { - struct net_device *dev = ndev; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; int ret; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 37ca9694aab..1d964e23853 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -224,7 +224,7 @@ static void x25_kill_by_device(struct net_device *dev) static int x25_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct x25_neigh *nb; if (!net_eq(dev_net(dev), &init_net)) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index bcfda8921b5..eb4a8428864 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -64,6 +64,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) if (unlikely(x->km.state != XFRM_STATE_VALID)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEINVALID); + err = -EINVAL; goto error; } @@ -88,7 +89,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) err = x->type->output(x, skb); if (err == -EINPROGRESS) - goto out_exit; + goto out; resume: if (err) { @@ -106,15 +107,14 @@ resume: x = dst->xfrm; } while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL)); - err = 0; + return 0; -out_exit: - return err; error: spin_unlock_bh(&x->lock); error_nolock: kfree_skb(skb); - goto out_exit; +out: + return err; } int xfrm_output_resume(struct sk_buff *skb, int err) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 23cea0f7433..e52cab3591d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2557,11 +2557,12 @@ static void __xfrm_garbage_collect(struct net *net) } } -static void xfrm_garbage_collect(struct net *net) +void xfrm_garbage_collect(struct net *net) { flow_cache_flush(); __xfrm_garbage_collect(net); } +EXPORT_SYMBOL(xfrm_garbage_collect); static void xfrm_garbage_collect_deferred(struct net *net) { @@ -2784,7 +2785,7 @@ static void __net_init xfrm_dst_ops_init(struct net *net) static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_DOWN: diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index aa778748c56..3f565e495ac 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1681,6 +1681,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, out: xfrm_pol_put(xp); + if (delete && err == 0) + xfrm_garbage_collect(net); return err; } |