diff options
-rw-r--r-- | Documentation/sysctl/net.txt | 7 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/ixgbe/ixgbe.h | 134 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 40 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 69 | ||||
-rw-r--r-- | include/linux/netdevice.h | 32 | ||||
-rw-r--r-- | include/linux/skbuff.h | 8 | ||||
-rw-r--r-- | include/net/ll_poll.h | 148 | ||||
-rw-r--r-- | include/net/sock.h | 4 | ||||
-rw-r--r-- | include/uapi/linux/snmp.h | 1 | ||||
-rw-r--r-- | net/Kconfig | 12 | ||||
-rw-r--r-- | net/core/datagram.c | 4 | ||||
-rw-r--r-- | net/core/dev.c | 59 | ||||
-rw-r--r-- | net/core/skbuff.c | 4 | ||||
-rw-r--r-- | net/core/sock.c | 6 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 10 | ||||
-rw-r--r-- | net/ipv4/proc.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 5 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 2 | ||||
-rw-r--r-- | net/ipv4/udp.c | 6 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 2 | ||||
-rw-r--r-- | net/ipv6/udp.c | 6 | ||||
-rw-r--r-- | net/socket.c | 6 |
23 files changed, 556 insertions, 12 deletions
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index c1f8640c2fc..85ab72dcdc3 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -50,6 +50,13 @@ The maximum number of packets that kernel can handle on a NAPI interrupt, it's a Per-CPU variable. Default: 64 +low_latency_poll +---------------- +Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL) +Approximate time in us to spin waiting for packets on the device queue. +Recommended value is 50. May increase power usage. +Default: 0 (off) + rmem_default ------------ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index ca932387a80..fb098b46c6a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -52,6 +52,11 @@ #include <linux/dca.h> #endif +#include <net/ll_poll.h> + +#ifdef CONFIG_NET_LL_RX_POLL +#define LL_EXTENDED_STATS +#endif /* common prefix used by pr_<> macros */ #undef pr_fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -182,6 +187,11 @@ struct ixgbe_rx_buffer { struct ixgbe_queue_stats { u64 packets; u64 bytes; +#ifdef LL_EXTENDED_STATS + u64 yields; + u64 misses; + u64 cleaned; +#endif /* LL_EXTENDED_STATS */ }; struct ixgbe_tx_queue_stats { @@ -356,9 +366,133 @@ struct ixgbe_q_vector { struct rcu_head rcu; /* to avoid race with update stats on free */ char name[IFNAMSIZ + 9]; +#ifdef CONFIG_NET_LL_RX_POLL + unsigned int state; +#define IXGBE_QV_STATE_IDLE 0 +#define IXGBE_QV_STATE_NAPI 1 /* NAPI owns this QV */ +#define IXGBE_QV_STATE_POLL 2 /* poll owns this QV */ +#define IXGBE_QV_LOCKED (IXGBE_QV_STATE_NAPI | IXGBE_QV_STATE_POLL) +#define IXGBE_QV_STATE_NAPI_YIELD 4 /* NAPI yielded this QV */ +#define IXGBE_QV_STATE_POLL_YIELD 8 /* poll yielded this QV */ +#define IXGBE_QV_YIELD (IXGBE_QV_STATE_NAPI_YIELD | IXGBE_QV_STATE_POLL_YIELD) +#define IXGBE_QV_USER_PEND (IXGBE_QV_STATE_POLL | IXGBE_QV_STATE_POLL_YIELD) + spinlock_t lock; +#endif /* CONFIG_NET_LL_RX_POLL */ + /* for dynamic allocation of rings associated with this q_vector */ struct ixgbe_ring ring[0] ____cacheline_internodealigned_in_smp; }; +#ifdef CONFIG_NET_LL_RX_POLL +static inline void ixgbe_qv_init_lock(struct ixgbe_q_vector *q_vector) +{ + + spin_lock_init(&q_vector->lock); + q_vector->state = IXGBE_QV_STATE_IDLE; +} + +/* called from the device poll routine to get ownership of a q_vector */ +static inline bool ixgbe_qv_lock_napi(struct ixgbe_q_vector *q_vector) +{ + int rc = true; + spin_lock(&q_vector->lock); + if (q_vector->state & IXGBE_QV_LOCKED) { + WARN_ON(q_vector->state & IXGBE_QV_STATE_NAPI); + q_vector->state |= IXGBE_QV_STATE_NAPI_YIELD; + rc = false; +#ifdef LL_EXTENDED_STATS + q_vector->tx.ring->stats.yields++; +#endif + } else + /* we don't care if someone yielded */ + q_vector->state = IXGBE_QV_STATE_NAPI; + spin_unlock(&q_vector->lock); + return rc; +} + +/* returns true is someone tried to get the qv while napi had it */ +static inline bool ixgbe_qv_unlock_napi(struct ixgbe_q_vector *q_vector) +{ + int rc = false; + spin_lock(&q_vector->lock); + WARN_ON(q_vector->state & (IXGBE_QV_STATE_POLL | + IXGBE_QV_STATE_NAPI_YIELD)); + + if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD) + rc = true; + q_vector->state = IXGBE_QV_STATE_IDLE; + spin_unlock(&q_vector->lock); + return rc; +} + +/* called from ixgbe_low_latency_poll() */ +static inline bool ixgbe_qv_lock_poll(struct ixgbe_q_vector *q_vector) +{ + int rc = true; + spin_lock_bh(&q_vector->lock); + if ((q_vector->state & IXGBE_QV_LOCKED)) { + q_vector->state |= IXGBE_QV_STATE_POLL_YIELD; + rc = false; +#ifdef LL_EXTENDED_STATS + q_vector->rx.ring->stats.yields++; +#endif + } else + /* preserve yield marks */ + q_vector->state |= IXGBE_QV_STATE_POLL; + spin_unlock_bh(&q_vector->lock); + return rc; +} + +/* returns true if someone tried to get the qv while it was locked */ +static inline bool ixgbe_qv_unlock_poll(struct ixgbe_q_vector *q_vector) +{ + int rc = false; + spin_lock_bh(&q_vector->lock); + WARN_ON(q_vector->state & (IXGBE_QV_STATE_NAPI)); + + if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD) + rc = true; + q_vector->state = IXGBE_QV_STATE_IDLE; + spin_unlock_bh(&q_vector->lock); + return rc; +} + +/* true if a socket is polling, even if it did not get the lock */ +static inline bool ixgbe_qv_ll_polling(struct ixgbe_q_vector *q_vector) +{ + WARN_ON(!(q_vector->state & IXGBE_QV_LOCKED)); + return q_vector->state & IXGBE_QV_USER_PEND; +} +#else /* CONFIG_NET_LL_RX_POLL */ +static inline void ixgbe_qv_init_lock(struct ixgbe_q_vector *q_vector) +{ +} + +static inline bool ixgbe_qv_lock_napi(struct ixgbe_q_vector *q_vector) +{ + return true; +} + +static inline bool ixgbe_qv_unlock_napi(struct ixgbe_q_vector *q_vector) +{ + return false; +} + +static inline bool ixgbe_qv_lock_poll(struct ixgbe_q_vector *q_vector) +{ + return false; +} + +static inline bool ixgbe_qv_unlock_poll(struct ixgbe_q_vector *q_vector) +{ + return false; +} + +static inline bool ixgbe_qv_ll_polling(struct ixgbe_q_vector *q_vector) +{ + return false; +} +#endif /* CONFIG_NET_LL_RX_POLL */ + #ifdef CONFIG_IXGBE_HWMON #define IXGBE_HWMON_TYPE_LOC 0 diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index d3754722adb..24e2e7aafda 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -1054,6 +1054,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, data[i] = 0; data[i+1] = 0; i += 2; +#ifdef LL_EXTENDED_STATS + data[i] = 0; + data[i+1] = 0; + data[i+2] = 0; + i += 3; +#endif continue; } @@ -1063,6 +1069,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, data[i+1] = ring->stats.bytes; } while (u64_stats_fetch_retry_bh(&ring->syncp, start)); i += 2; +#ifdef LL_EXTENDED_STATS + data[i] = ring->stats.yields; + data[i+1] = ring->stats.misses; + data[i+2] = ring->stats.cleaned; + i += 3; +#endif } for (j = 0; j < IXGBE_NUM_RX_QUEUES; j++) { ring = adapter->rx_ring[j]; @@ -1070,6 +1082,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, data[i] = 0; data[i+1] = 0; i += 2; +#ifdef LL_EXTENDED_STATS + data[i] = 0; + data[i+1] = 0; + data[i+2] = 0; + i += 3; +#endif continue; } @@ -1079,6 +1097,12 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev, data[i+1] = ring->stats.bytes; } while (u64_stats_fetch_retry_bh(&ring->syncp, start)); i += 2; +#ifdef LL_EXTENDED_STATS + data[i] = ring->stats.yields; + data[i+1] = ring->stats.misses; + data[i+2] = ring->stats.cleaned; + i += 3; +#endif } for (j = 0; j < IXGBE_MAX_PACKET_BUFFERS; j++) { @@ -1115,12 +1139,28 @@ static void ixgbe_get_strings(struct net_device *netdev, u32 stringset, p += ETH_GSTRING_LEN; sprintf(p, "tx_queue_%u_bytes", i); p += ETH_GSTRING_LEN; +#ifdef LL_EXTENDED_STATS + sprintf(p, "tx_q_%u_napi_yield", i); + p += ETH_GSTRING_LEN; + sprintf(p, "tx_q_%u_misses", i); + p += ETH_GSTRING_LEN; + sprintf(p, "tx_q_%u_cleaned", i); + p += ETH_GSTRING_LEN; +#endif /* LL_EXTENDED_STATS */ } for (i = 0; i < IXGBE_NUM_RX_QUEUES; i++) { sprintf(p, "rx_queue_%u_packets", i); p += ETH_GSTRING_LEN; sprintf(p, "rx_queue_%u_bytes", i); p += ETH_GSTRING_LEN; +#ifdef LL_EXTENDED_STATS + sprintf(p, "rx_q_%u_ll_poll_yield", i); + p += ETH_GSTRING_LEN; + sprintf(p, "rx_q_%u_misses", i); + p += ETH_GSTRING_LEN; + sprintf(p, "rx_q_%u_cleaned", i); + p += ETH_GSTRING_LEN; +#endif /* LL_EXTENDED_STATS */ } for (i = 0; i < IXGBE_MAX_PACKET_BUFFERS; i++) { sprintf(p, "tx_pb_%u_pxon", i); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c index ef5f7a678ce..90b4e1089ec 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c @@ -811,6 +811,7 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter, /* initialize NAPI */ netif_napi_add(adapter->netdev, &q_vector->napi, ixgbe_poll, 64); + napi_hash_add(&q_vector->napi); /* tie q_vector and adapter together */ adapter->q_vector[v_idx] = q_vector; @@ -931,6 +932,7 @@ static void ixgbe_free_q_vector(struct ixgbe_adapter *adapter, int v_idx) adapter->rx_ring[ring->queue_index] = NULL; adapter->q_vector[v_idx] = NULL; + napi_hash_del(&q_vector->napi); netif_napi_del(&q_vector->napi); /* diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index d30fbdd81fc..047ebaaf014 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1504,7 +1504,9 @@ static void ixgbe_rx_skb(struct ixgbe_q_vector *q_vector, { struct ixgbe_adapter *adapter = q_vector->adapter; - if (!(adapter->flags & IXGBE_FLAG_IN_NETPOLL)) + if (ixgbe_qv_ll_polling(q_vector)) + netif_receive_skb(skb); + else if (!(adapter->flags & IXGBE_FLAG_IN_NETPOLL)) napi_gro_receive(&q_vector->napi, skb); else netif_rx(skb); @@ -1892,9 +1894,9 @@ dma_sync: * expensive overhead for IOMMU access this provides a means of avoiding * it by maintaining the mapping of the page to the syste. * - * Returns true if all work is completed without reaching budget + * Returns amount of work completed **/ -static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, +static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, struct ixgbe_ring *rx_ring, const int budget) { @@ -1976,6 +1978,7 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, } #endif /* IXGBE_FCOE */ + skb_mark_ll(skb, &q_vector->napi); ixgbe_rx_skb(q_vector, skb); /* update budget accounting */ @@ -1992,9 +1995,43 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, if (cleaned_count) ixgbe_alloc_rx_buffers(rx_ring, cleaned_count); - return (total_rx_packets < budget); + return total_rx_packets; } +#ifdef CONFIG_NET_LL_RX_POLL +/* must be called with local_bh_disable()d */ +static int ixgbe_low_latency_recv(struct napi_struct *napi) +{ + struct ixgbe_q_vector *q_vector = + container_of(napi, struct ixgbe_q_vector, napi); + struct ixgbe_adapter *adapter = q_vector->adapter; + struct ixgbe_ring *ring; + int found = 0; + + if (test_bit(__IXGBE_DOWN, &adapter->state)) + return LL_FLUSH_FAILED; + + if (!ixgbe_qv_lock_poll(q_vector)) + return LL_FLUSH_BUSY; + + ixgbe_for_each_ring(ring, q_vector->rx) { + found = ixgbe_clean_rx_irq(q_vector, ring, 4); +#ifdef LL_EXTENDED_STATS + if (found) + ring->stats.cleaned += found; + else + ring->stats.misses++; +#endif + if (found) + break; + } + + ixgbe_qv_unlock_poll(q_vector); + + return found; +} +#endif /* CONFIG_NET_LL_RX_POLL */ + /** * ixgbe_configure_msix - Configure MSI-X hardware * @adapter: board private structure @@ -2550,6 +2587,9 @@ int ixgbe_poll(struct napi_struct *napi, int budget) ixgbe_for_each_ring(ring, q_vector->tx) clean_complete &= !!ixgbe_clean_tx_irq(q_vector, ring); + if (!ixgbe_qv_lock_napi(q_vector)) + return budget; + /* attempt to distribute budget to each queue fairly, but don't allow * the budget to go below 1 because we'll exit polling */ if (q_vector->rx.count > 1) @@ -2558,9 +2598,10 @@ int ixgbe_poll(struct napi_struct *napi, int budget) per_ring_budget = budget; ixgbe_for_each_ring(ring, q_vector->rx) - clean_complete &= ixgbe_clean_rx_irq(q_vector, ring, - per_ring_budget); + clean_complete &= (ixgbe_clean_rx_irq(q_vector, ring, + per_ring_budget) < per_ring_budget); + ixgbe_qv_unlock_napi(q_vector); /* If all work not completed, return budget and keep polling */ if (!clean_complete) return budget; @@ -3747,16 +3788,25 @@ static void ixgbe_napi_enable_all(struct ixgbe_adapter *adapter) { int q_idx; - for (q_idx = 0; q_idx < adapter->num_q_vectors; q_idx++) + for (q_idx = 0; q_idx < adapter->num_q_vectors; q_idx++) { + ixgbe_qv_init_lock(adapter->q_vector[q_idx]); napi_enable(&adapter->q_vector[q_idx]->napi); + } } static void ixgbe_napi_disable_all(struct ixgbe_adapter *adapter) { int q_idx; - for (q_idx = 0; q_idx < adapter->num_q_vectors; q_idx++) + local_bh_disable(); /* for ixgbe_qv_lock_napi() */ + for (q_idx = 0; q_idx < adapter->num_q_vectors; q_idx++) { napi_disable(&adapter->q_vector[q_idx]->napi); + while (!ixgbe_qv_lock_napi(adapter->q_vector[q_idx])) { + pr_info("QV %d locked\n", q_idx); + mdelay(1); + } + } + local_bh_enable(); } #ifdef CONFIG_IXGBE_DCB @@ -7177,6 +7227,9 @@ static const struct net_device_ops ixgbe_netdev_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = ixgbe_netpoll, #endif +#ifdef CONFIG_NET_LL_RX_POLL + .ndo_ll_poll = ixgbe_low_latency_recv, +#endif #ifdef IXGBE_FCOE .ndo_fcoe_ddp_setup = ixgbe_fcoe_ddp_get, .ndo_fcoe_ddp_target = ixgbe_fcoe_ddp_target, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8f967e34142..2ecb96d9a1e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -324,12 +324,15 @@ struct napi_struct { struct sk_buff *gro_list; struct sk_buff *skb; struct list_head dev_list; + struct hlist_node napi_hash_node; + unsigned int napi_id; }; enum { NAPI_STATE_SCHED, /* Poll is scheduled */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ + NAPI_STATE_HASHED, /* In NAPI hash */ }; enum gro_result { @@ -446,6 +449,32 @@ extern void __napi_complete(struct napi_struct *n); extern void napi_complete(struct napi_struct *n); /** + * napi_by_id - lookup a NAPI by napi_id + * @napi_id: hashed napi_id + * + * lookup @napi_id in napi_hash table + * must be called under rcu_read_lock() + */ +extern struct napi_struct *napi_by_id(unsigned int napi_id); + +/** + * napi_hash_add - add a NAPI to global hashtable + * @napi: napi context + * + * generate a new napi_id and store a @napi under it in napi_hash + */ +extern void napi_hash_add(struct napi_struct *napi); + +/** + * napi_hash_del - remove a NAPI from global table + * @napi: napi context + * + * Warning: caller must observe rcu grace period + * before freeing memory containing @napi + */ +extern void napi_hash_del(struct napi_struct *napi); + +/** * napi_disable - prevent NAPI from scheduling * @n: napi context * @@ -943,6 +972,9 @@ struct net_device_ops { gfp_t gfp); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif +#ifdef CONFIG_NET_LL_RX_POLL + int (*ndo_ll_poll)(struct napi_struct *dev); +#endif int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9995834d2cb..400d82ae2b0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -386,6 +386,7 @@ typedef unsigned char *sk_buff_data_t; * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @dma_cookie: a cookie to one of several possible DMA operations * done by skb DMA functions + * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark * @dropcount: total number of sk_receive_queue overflows @@ -500,8 +501,11 @@ struct sk_buff { /* 7/9 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); -#ifdef CONFIG_NET_DMA - dma_cookie_t dma_cookie; +#if defined CONFIG_NET_DMA || defined CONFIG_NET_LL_RX_POLL + union { + unsigned int napi_id; + dma_cookie_t dma_cookie; + }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h new file mode 100644 index 00000000000..bc262f88173 --- /dev/null +++ b/include/net/ll_poll.h @@ -0,0 +1,148 @@ +/* + * Low Latency Sockets + * Copyright(c) 2013 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * Author: Eliezer Tamir + * + * Contact Information: + * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> + */ + +/* + * For now this depends on CONFIG_X86_TSC + */ + +#ifndef _LINUX_NET_LL_POLL_H +#define _LINUX_NET_LL_POLL_H + +#include <linux/netdevice.h> +#include <net/ip.h> + +#ifdef CONFIG_NET_LL_RX_POLL + +struct napi_struct; +extern unsigned long sysctl_net_ll_poll __read_mostly; + +/* return values from ndo_ll_poll */ +#define LL_FLUSH_FAILED -1 +#define LL_FLUSH_BUSY -2 + +/* we don't mind a ~2.5% imprecision */ +#define TSC_MHZ (tsc_khz >> 10) + +static inline cycles_t ll_end_time(void) +{ + return TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + get_cycles(); +} + +static inline bool sk_valid_ll(struct sock *sk) +{ + return sysctl_net_ll_poll && sk->sk_napi_id && + !need_resched() && !signal_pending(current); +} + +static inline bool can_poll_ll(cycles_t end_time) +{ + return !time_after((unsigned long)get_cycles(), + (unsigned long)end_time); +} + +static inline bool sk_poll_ll(struct sock *sk, int nonblock) +{ + cycles_t end_time = ll_end_time(); + const struct net_device_ops *ops; + struct napi_struct *napi; + int rc = false; + + /* + * rcu read lock for napi hash + * bh so we don't race with net_rx_action + */ + rcu_read_lock_bh(); + + napi = napi_by_id(sk->sk_napi_id); + if (!napi) + goto out; + + ops = napi->dev->netdev_ops; + if (!ops->ndo_ll_poll) + goto out; + + do { + + rc = ops->ndo_ll_poll(napi); + + if (rc == LL_FLUSH_FAILED) + break; /* permanent failure */ + + if (rc > 0) + /* local bh are disabled so it is ok to use _BH */ + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_LOWLATENCYRXPACKETS, rc); + + } while (skb_queue_empty(&sk->sk_receive_queue) + && can_poll_ll(end_time) && !nonblock); + + rc = !skb_queue_empty(&sk->sk_receive_queue); +out: + rcu_read_unlock_bh(); + return rc; +} + +/* used in the NIC receive handler to mark the skb */ +static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi) +{ + skb->napi_id = napi->napi_id; +} + +/* used in the protocol hanlder to propagate the napi_id to the socket */ +static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) +{ + sk->sk_napi_id = skb->napi_id; +} + +#else /* CONFIG_NET_LL_RX_POLL */ + +static inline cycles_t ll_end_time(void) +{ + return 0; +} + +static inline bool sk_valid_ll(struct sock *sk) +{ + return false; +} + +static inline bool sk_poll_ll(struct sock *sk, int nonblock) +{ + return false; +} + +static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi) +{ +} + +static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) +{ +} + +static inline bool can_poll_ll(cycles_t end_time) +{ + return false; +} + +#endif /* CONFIG_NET_LL_RX_POLL */ +#endif /* _LINUX_NET_LL_POLL_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 66772cf8c3c..ac8e1818380 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -229,6 +229,7 @@ struct cg_proto; * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward + * @sk_napi_id: id of the last napi context to receive data for sk * @sk_allocation: allocation mode * @sk_sndbuf: size of send buffer in bytes * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, @@ -325,6 +326,9 @@ struct sock { #ifdef CONFIG_RPS __u32 sk_rxhash; #endif +#ifdef CONFIG_NET_LL_RX_POLL + unsigned int sk_napi_id; +#endif atomic_t sk_drops; int sk_rcvbuf; diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index df2e8b4f9c0..26cbf76f805 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -253,6 +253,7 @@ enum LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ + LINUX_MIB_LOWLATENCYRXPACKETS, /* LowLatencyRxPackets */ __LINUX_MIB_MAX }; diff --git a/net/Kconfig b/net/Kconfig index 523e43e6da1..d6a9ce6e180 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -243,6 +243,18 @@ config NETPRIO_CGROUP Cgroup subsystem for use in assigning processes to network priorities on a per-interface basis +config NET_LL_RX_POLL + bool "Low Latency Receive Poll" + depends on X86_TSC + default n + ---help--- + Support Low Latency Receive Queue Poll. + (For network card drivers which support this option.) + When waiting for data in read or poll call directly into the the device driver + to flush packets which may be pending on the device queues into the stack. + + If unsure, say N. + config BQL boolean depends on SYSFS diff --git a/net/core/datagram.c b/net/core/datagram.c index b71423db778..9cbaba98ce4 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -56,6 +56,7 @@ #include <net/sock.h> #include <net/tcp_states.h> #include <trace/events/skb.h> +#include <net/ll_poll.h> /* * Is a socket 'connection oriented' ? @@ -207,6 +208,9 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, } spin_unlock_irqrestore(&queue->lock, cpu_flags); + if (sk_valid_ll(sk) && sk_poll_ll(sk, flags & MSG_DONTWAIT)) + continue; + /* User doesn't want to wait */ error = -EAGAIN; if (!timeo) diff --git a/net/core/dev.c b/net/core/dev.c index 9c18557f93c..fa007dba6be 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -129,6 +129,7 @@ #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> +#include <linux/hashtable.h> #include "net-sysfs.h" @@ -166,6 +167,12 @@ static struct list_head offload_base __read_mostly; DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +/* protects napi_hash addition/deletion and napi_gen_id */ +static DEFINE_SPINLOCK(napi_hash_lock); + +static unsigned int napi_gen_id; +static DEFINE_HASHTABLE(napi_hash, 8); + seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) @@ -4136,6 +4143,58 @@ void napi_complete(struct napi_struct *n) } EXPORT_SYMBOL(napi_complete); +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} +EXPORT_SYMBOL_GPL(napi_by_id); + +void napi_hash_add(struct napi_struct *napi) +{ + if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + + spin_lock(&napi_hash_lock); + + /* 0 is not a valid id, we also skip an id that is taken + * we expect both events to be extremely rare + */ + napi->napi_id = 0; + while (!napi->napi_id) { + napi->napi_id = ++napi_gen_id; + if (napi_by_id(napi->napi_id)) + napi->napi_id = 0; + } + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); + } +} +EXPORT_SYMBOL_GPL(napi_hash_add); + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +void napi_hash_del(struct napi_struct *napi) +{ + spin_lock(&napi_hash_lock); + + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + hlist_del_rcu(&napi->napi_hash_node); + + spin_unlock(&napi_hash_lock); +} +EXPORT_SYMBOL_GPL(napi_hash_del); + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 73f57a0e152..4a4181e16c1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -733,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->vlan_tci = old->vlan_tci; skb_copy_secmark(new, old); + +#ifdef CONFIG_NET_LL_RX_POLL + new->napi_id = old->napi_id; +#endif } /* diff --git a/net/core/sock.c b/net/core/sock.c index 88868a9d21d..788c0da5eed 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -139,6 +139,8 @@ #include <net/tcp.h> #endif +#include <net/ll_poll.h> + static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); @@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, 0); +#ifdef CONFIG_NET_LL_RX_POLL + sk->sk_napi_id = 0; +#endif + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 741db5fc780..4b48f39582b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -19,6 +19,7 @@ #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> +#include <net/ll_poll.h> static int one = 1; @@ -284,6 +285,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = flow_limit_table_len_sysctl }, #endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_LL_RX_POLL + { + .procname = "low_latency_poll", + .data = &sysctl_net_ll_poll, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax + }, +#endif #endif /* CONFIG_NET */ { .procname = "netdev_budget", diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2a5bf86d241..6577a1149a4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), + SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc4246940f6..46ed9afd1f5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,6 +279,7 @@ #include <asm/uaccess.h> #include <asm/ioctls.h> +#include <net/ll_poll.h> int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; @@ -1553,6 +1554,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct sk_buff *skb; u32 urg_hole = 0; + if (sk_valid_ll(sk) && skb_queue_empty(&sk->sk_receive_queue) + && (sk->sk_state == TCP_ESTABLISHED)) + sk_poll_ll(sk, nonblock); + lock_sock(sk); err = -ENOTCONN; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 289039b4d8d..1063bb83e34 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -75,6 +75,7 @@ #include <net/netdma.h> #include <net/secure_seq.h> #include <net/tcp_memcontrol.h> +#include <net/ll_poll.h> #include <linux/inet.h> #include <linux/ipv6.h> @@ -1993,6 +1994,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; + sk_mark_ll(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c7338ec79cc..2955b25aee6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -109,6 +109,7 @@ #include <trace/events/udp.h> #include <linux/static_key.h> #include <trace/events/skb.h> +#include <net/ll_poll.h> #include "udp_impl.h" struct udp_table udp_table __read_mostly; @@ -1709,7 +1710,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { - int ret = udp_queue_rcv_skb(sk, skb); + int ret; + + sk_mark_ll(sk, skb); + ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0a17ed9eaf3..5cffa5c3e6b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -63,6 +63,7 @@ #include <net/inet_common.h> #include <net/secure_seq.h> #include <net/tcp_memcontrol.h> +#include <net/ll_poll.h> #include <asm/uaccess.h> @@ -1498,6 +1499,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; + sk_mark_ll(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b5808539cd5..f77e34c5a0e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/inet6_hashtables.h> +#include <net/ll_poll.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -841,7 +842,10 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { - int ret = udpv6_queue_rcv_skb(sk, skb); + int ret; + + sk_mark_ll(sk, skb); + ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but diff --git a/net/socket.c b/net/socket.c index 3ebdcb805c5..21fd29f63ed 100644 --- a/net/socket.c +++ b/net/socket.c @@ -104,6 +104,12 @@ #include <linux/route.h> #include <linux/sockios.h> #include <linux/atalk.h> +#include <net/ll_poll.h> + +#ifdef CONFIG_NET_LL_RX_POLL +unsigned long sysctl_net_ll_poll __read_mostly; +EXPORT_SYMBOL_GPL(sysctl_net_ll_poll); +#endif static int sock_no_open(struct inode *irrelevant, struct file *dontcare); static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, |