diff options
89 files changed, 2680 insertions, 1107 deletions
diff --git a/Documentation/devicetree/bindings/net/broadcom-bcmgenet.txt b/Documentation/devicetree/bindings/net/broadcom-bcmgenet.txt index f2febb94550..451fef26b4d 100644 --- a/Documentation/devicetree/bindings/net/broadcom-bcmgenet.txt +++ b/Documentation/devicetree/bindings/net/broadcom-bcmgenet.txt @@ -24,7 +24,7 @@ Optional properties: - fixed-link: When the GENET interface is connected to a MoCA hardware block or when operating in a RGMII to RGMII type of connection, or when the MDIO bus is voluntarily disabled, this property should be used to describe the "fixed link". - See Documentation/devicetree/bindings/net/fsl-tsec-phy.txt for information on + See Documentation/devicetree/bindings/net/fixed-link.txt for information on the property specifics Required child nodes: diff --git a/Documentation/devicetree/bindings/net/broadcom-systemport.txt b/Documentation/devicetree/bindings/net/broadcom-systemport.txt index 1b7600e022d..c183ea90d9b 100644 --- a/Documentation/devicetree/bindings/net/broadcom-systemport.txt +++ b/Documentation/devicetree/bindings/net/broadcom-systemport.txt @@ -8,7 +8,7 @@ Required properties: - local-mac-address: Ethernet MAC address (48 bits) of this adapter - phy-mode: Should be a string describing the PHY interface to the Ethernet switch/PHY, see Documentation/devicetree/bindings/net/ethernet.txt -- fixed-link: see Documentation/devicetree/bindings/net/fsl-tsec-phy.txt for +- fixed-link: see Documentation/devicetree/bindings/net/fixed-link.txt for the property specific details Optional properties: diff --git a/Documentation/devicetree/bindings/net/fixed-link.txt b/Documentation/devicetree/bindings/net/fixed-link.txt index e956de1be93..82bf7e0f47b 100644 --- a/Documentation/devicetree/bindings/net/fixed-link.txt +++ b/Documentation/devicetree/bindings/net/fixed-link.txt @@ -18,6 +18,18 @@ properties: * 'asym-pause' (boolean, optional), to indicate that asym_pause should be enabled. +Old, deprecated 'fixed-link' binding: + +* A 'fixed-link' property in the Ethernet MAC node, with 5 cells, of the + form <a b c d e> with the following accepted values: + - a: emulated PHY ID, choose any but but unique to the all specified + fixed-links, from 0 to 31 + - b: duplex configuration: 0 for half duplex, 1 for full duplex + - c: link speed in Mbits/sec, accepted values are: 10, 100 and 1000 + - d: pause configuration: 0 for no pause, 1 for pause + - e: asymmetric pause configuration: 0 for no asymmetric pause, 1 for + asymmetric pause + Example: ethernet@0 { diff --git a/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt b/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt index 737cdef4f90..be6ea8960f2 100644 --- a/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt +++ b/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt @@ -42,10 +42,7 @@ Properties: interrupt. For TSEC and eTSEC devices, the first interrupt is transmit, the second is receive, and the third is error. - phy-handle : See ethernet.txt file in the same directory. - - fixed-link : <a b c d e> where a is emulated phy id - choose any, - but unique to the all specified fixed-links, b is duplex - 0 half, - 1 full, c is link speed - d#10/d#100/d#1000, d is pause - 0 no - pause, 1 pause, e is asym_pause - 0 no asym_pause, 1 asym_pause. + - fixed-link : See fixed-link.txt in the same directory. - phy-connection-type : See ethernet.txt file in the same directory. This property is only really needed if the connection is of type "rgmii-id", as all other connection types are detected by hardware. diff --git a/MAINTAINERS b/MAINTAINERS index bde15ffcccf..f5de16eb195 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2228,9 +2228,8 @@ F: drivers/platform/chrome/ CISCO VIC ETHERNET NIC DRIVER M: Christian Benvenuti <benve@cisco.com> M: Sujith Sankar <ssujith@cisco.com> -M: Govindarajulu Varadarajan <govindarajulu90@gmail.com> +M: Govindarajulu Varadarajan <_govind@gmx.com> M: Neel Patel <neepatel@cisco.com> -M: Nishank Trivedi <nistrive@cisco.com> S: Supported F: drivers/net/ethernet/cisco/enic/ diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c index 228cf91b91c..ffd1169ebaa 100644 --- a/arch/powerpc/sysdev/fsl_soc.c +++ b/arch/powerpc/sysdev/fsl_soc.c @@ -25,7 +25,6 @@ #include <linux/of.h> #include <linux/of_platform.h> #include <linux/phy.h> -#include <linux/phy_fixed.h> #include <linux/spi/spi.h> #include <linux/fsl_devices.h> #include <linux/fs_enet_pd.h> @@ -178,37 +177,6 @@ u32 get_baudrate(void) EXPORT_SYMBOL(get_baudrate); #endif /* CONFIG_CPM2 */ -#ifdef CONFIG_FIXED_PHY -static int __init of_add_fixed_phys(void) -{ - int ret; - struct device_node *np; - u32 *fixed_link; - struct fixed_phy_status status = {}; - - for_each_node_by_name(np, "ethernet") { - fixed_link = (u32 *)of_get_property(np, "fixed-link", NULL); - if (!fixed_link) - continue; - - status.link = 1; - status.duplex = fixed_link[1]; - status.speed = fixed_link[2]; - status.pause = fixed_link[3]; - status.asym_pause = fixed_link[4]; - - ret = fixed_phy_add(PHY_POLL, fixed_link[0], &status); - if (ret) { - of_node_put(np); - return ret; - } - } - - return 0; -} -arch_initcall(of_add_fixed_phys); -#endif /* CONFIG_FIXED_PHY */ - #if defined(CONFIG_FSL_SOC_BOOKE) || defined(CONFIG_PPC_86xx) static __be32 __iomem *rstcr; diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 1bdf104e90b..b621f56a36b 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -2551,12 +2551,12 @@ done: timeout = 5 * 1000; while (atomic_read(&vc->scq->used) > 0) { timeout = msleep_interruptible(timeout); - if (!timeout) + if (!timeout) { + pr_warn("%s: SCQ drain timeout: %u used\n", + card->name, atomic_read(&vc->scq->used)); break; + } } - if (!timeout) - printk("%s: SCQ drain timeout: %u used\n", - card->name, atomic_read(&vc->scq->used)); writel(TCMDQ_HALT | vc->index, SAR_REG_TCMDQ); clear_scd(card, vc->scq, vc->class); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 71232053210..9071139d287 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1038,6 +1038,7 @@ static void bond_compute_features(struct bonding *bond) if (!bond_has_slaves(bond)) goto done; + vlan_features &= NETIF_F_ALL_FOR_ALL; bond_for_each_slave(bond, slave, iter) { vlan_features = netdev_increment_features(vlan_features, diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index d40c5b969e9..dc708a888f8 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1327,8 +1327,8 @@ static int bcm_sysport_open(struct net_device *dev) /* Read CRC forward */ priv->crc_fwd = !!(umac_readl(priv, UMAC_CMD) & CMD_CRC_FWD); - priv->phydev = of_phy_connect_fixed_link(dev, bcm_sysport_adj_link, - priv->phy_interface); + priv->phydev = of_phy_connect(dev, priv->phy_dn, bcm_sysport_adj_link, + 0, priv->phy_interface); if (!priv->phydev) { netdev_err(dev, "could not attach to PHY\n"); return -ENODEV; @@ -1551,6 +1551,19 @@ static int bcm_sysport_probe(struct platform_device *pdev) if (priv->phy_interface < 0) priv->phy_interface = PHY_INTERFACE_MODE_GMII; + /* In the case of a fixed PHY, the DT node associated + * to the PHY is the Ethernet MAC DT node. + */ + if (of_phy_is_fixed_link(dn)) { + ret = of_phy_register_fixed_link(dn); + if (ret) { + dev_err(&pdev->dev, "failed to register fixed PHY\n"); + goto err; + } + + priv->phy_dn = dn; + } + /* Initialize netdevice members */ macaddr = of_get_mac_address(dn); if (!macaddr || !is_valid_ether_addr(macaddr)) { diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h index abdeb62616d..73fd04a9479 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.h +++ b/drivers/net/ethernet/broadcom/bcmsysport.h @@ -656,6 +656,7 @@ struct bcm_sysport_priv { unsigned int rx_c_index; /* PHY device */ + struct device_node *phy_dn; struct phy_device *phydev; phy_interface_t phy_interface; int old_pause; diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index 4608673beaf..add8d859608 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -298,6 +298,7 @@ int bcmgenet_mii_config(struct net_device *dev) static int bcmgenet_mii_probe(struct net_device *dev) { struct bcmgenet_priv *priv = netdev_priv(dev); + struct device_node *dn = priv->pdev->dev.of_node; struct phy_device *phydev; unsigned int phy_flags; int ret; @@ -307,15 +308,19 @@ static int bcmgenet_mii_probe(struct net_device *dev) return 0; } - if (priv->phy_dn) - phydev = of_phy_connect(dev, priv->phy_dn, - bcmgenet_mii_setup, 0, - priv->phy_interface); - else - phydev = of_phy_connect_fixed_link(dev, - bcmgenet_mii_setup, - priv->phy_interface); + /* In the case of a fixed PHY, the DT node associated + * to the PHY is the Ethernet MAC DT node. + */ + if (of_phy_is_fixed_link(dn)) { + ret = of_phy_register_fixed_link(dn); + if (ret) + return ret; + + priv->phy_dn = dn; + } + phydev = of_phy_connect(dev, priv->phy_dn, bcmgenet_mii_setup, 0, + priv->phy_interface); if (!phydev) { pr_err("could not attach to PHY\n"); return -ENODEV; diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h index e35c8e0202a..f23ef321606 100644 --- a/drivers/net/ethernet/cisco/enic/enic.h +++ b/drivers/net/ethernet/cisco/enic/enic.h @@ -43,6 +43,8 @@ #define ENIC_CQ_MAX (ENIC_WQ_MAX + ENIC_RQ_MAX) #define ENIC_INTR_MAX (ENIC_CQ_MAX + 2) +#define ENIC_AIC_LARGE_PKT_DIFF 3 + struct enic_msix_entry { int requested; char devname[IFNAMSIZ]; @@ -50,6 +52,33 @@ struct enic_msix_entry { void *devid; }; +/* Store only the lower range. Higher range is given by fw. */ +struct enic_intr_mod_range { + u32 small_pkt_range_start; + u32 large_pkt_range_start; +}; + +struct enic_intr_mod_table { + u32 rx_rate; + u32 range_percent; +}; + +#define ENIC_MAX_LINK_SPEEDS 3 +#define ENIC_LINK_SPEED_10G 10000 +#define ENIC_LINK_SPEED_4G 4000 +#define ENIC_LINK_40G_INDEX 2 +#define ENIC_LINK_10G_INDEX 1 +#define ENIC_LINK_4G_INDEX 0 +#define ENIC_RX_COALESCE_RANGE_END 125 +#define ENIC_AIC_TS_BREAK 100 + +struct enic_rx_coal { + u32 small_pkt_range_start; + u32 large_pkt_range_start; + u32 range_end; + u32 use_adaptive_rx_coalesce; +}; + /* priv_flags */ #define ENIC_SRIOV_ENABLED (1 << 0) @@ -92,6 +121,7 @@ struct enic { unsigned int mc_count; unsigned int uc_count; u32 port_mtu; + struct enic_rx_coal rx_coalesce_setting; u32 rx_coalesce_usecs; u32 tx_coalesce_usecs; #ifdef CONFIG_PCI_IOV diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index 58a8c67638e..1882db230e1 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -79,6 +79,17 @@ static const struct enic_stat enic_rx_stats[] = { static const unsigned int enic_n_tx_stats = ARRAY_SIZE(enic_tx_stats); static const unsigned int enic_n_rx_stats = ARRAY_SIZE(enic_rx_stats); +void enic_intr_coal_set_rx(struct enic *enic, u32 timer) +{ + int i; + int intr; + + for (i = 0; i < enic->rq_count; i++) { + intr = enic_msix_rq_intr(enic, i); + vnic_intr_coalescing_timer_set(&enic->intr[intr], timer); + } +} + static int enic_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd) { @@ -178,9 +189,14 @@ static int enic_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ecmd) { struct enic *enic = netdev_priv(netdev); + struct enic_rx_coal *rxcoal = &enic->rx_coalesce_setting; ecmd->tx_coalesce_usecs = enic->tx_coalesce_usecs; ecmd->rx_coalesce_usecs = enic->rx_coalesce_usecs; + if (rxcoal->use_adaptive_rx_coalesce) + ecmd->use_adaptive_rx_coalesce = 1; + ecmd->rx_coalesce_usecs_low = rxcoal->small_pkt_range_start; + ecmd->rx_coalesce_usecs_high = rxcoal->range_end; return 0; } @@ -191,17 +207,31 @@ static int enic_set_coalesce(struct net_device *netdev, struct enic *enic = netdev_priv(netdev); u32 tx_coalesce_usecs; u32 rx_coalesce_usecs; + u32 rx_coalesce_usecs_low; + u32 rx_coalesce_usecs_high; + u32 coalesce_usecs_max; unsigned int i, intr; + struct enic_rx_coal *rxcoal = &enic->rx_coalesce_setting; + coalesce_usecs_max = vnic_dev_get_intr_coal_timer_max(enic->vdev); tx_coalesce_usecs = min_t(u32, ecmd->tx_coalesce_usecs, - vnic_dev_get_intr_coal_timer_max(enic->vdev)); + coalesce_usecs_max); rx_coalesce_usecs = min_t(u32, ecmd->rx_coalesce_usecs, - vnic_dev_get_intr_coal_timer_max(enic->vdev)); + coalesce_usecs_max); + + rx_coalesce_usecs_low = min_t(u32, ecmd->rx_coalesce_usecs_low, + coalesce_usecs_max); + rx_coalesce_usecs_high = min_t(u32, ecmd->rx_coalesce_usecs_high, + coalesce_usecs_max); switch (vnic_dev_get_intr_mode(enic->vdev)) { case VNIC_DEV_INTR_MODE_INTX: if (tx_coalesce_usecs != rx_coalesce_usecs) return -EINVAL; + if (ecmd->use_adaptive_rx_coalesce || + ecmd->rx_coalesce_usecs_low || + ecmd->rx_coalesce_usecs_high) + return -EOPNOTSUPP; intr = enic_legacy_io_intr(); vnic_intr_coalescing_timer_set(&enic->intr[intr], @@ -210,6 +240,10 @@ static int enic_set_coalesce(struct net_device *netdev, case VNIC_DEV_INTR_MODE_MSI: if (tx_coalesce_usecs != rx_coalesce_usecs) return -EINVAL; + if (ecmd->use_adaptive_rx_coalesce || + ecmd->rx_coalesce_usecs_low || + ecmd->rx_coalesce_usecs_high) + return -EOPNOTSUPP; vnic_intr_coalescing_timer_set(&enic->intr[0], tx_coalesce_usecs); @@ -221,12 +255,27 @@ static int enic_set_coalesce(struct net_device *netdev, tx_coalesce_usecs); } - for (i = 0; i < enic->rq_count; i++) { - intr = enic_msix_rq_intr(enic, i); - vnic_intr_coalescing_timer_set(&enic->intr[intr], - rx_coalesce_usecs); + if (rxcoal->use_adaptive_rx_coalesce) { + if (!ecmd->use_adaptive_rx_coalesce) { + rxcoal->use_adaptive_rx_coalesce = 0; + enic_intr_coal_set_rx(enic, rx_coalesce_usecs); + } + } else { + if (ecmd->use_adaptive_rx_coalesce) + rxcoal->use_adaptive_rx_coalesce = 1; + else + enic_intr_coal_set_rx(enic, rx_coalesce_usecs); } + if (ecmd->rx_coalesce_usecs_high) { + if (rx_coalesce_usecs_high < + (rx_coalesce_usecs_low + ENIC_AIC_LARGE_PKT_DIFF)) + return -EINVAL; + rxcoal->range_end = rx_coalesce_usecs_high; + rxcoal->small_pkt_range_start = rx_coalesce_usecs_low; + rxcoal->large_pkt_range_start = rx_coalesce_usecs_low + + ENIC_AIC_LARGE_PKT_DIFF; + } break; default: break; diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index 2945718ce80..0d8995cc92e 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -38,6 +38,7 @@ #include <linux/rtnetlink.h> #include <linux/prefetch.h> #include <net/ip6_checksum.h> +#include <linux/ktime.h> #include "cq_enet_desc.h" #include "vnic_dev.h" @@ -72,6 +73,35 @@ MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); MODULE_DEVICE_TABLE(pci, enic_id_table); +#define ENIC_LARGE_PKT_THRESHOLD 1000 +#define ENIC_MAX_COALESCE_TIMERS 10 +/* Interrupt moderation table, which will be used to decide the + * coalescing timer values + * {rx_rate in Mbps, mapping percentage of the range} + */ +struct enic_intr_mod_table mod_table[ENIC_MAX_COALESCE_TIMERS + 1] = { + {4000, 0}, + {4400, 10}, + {5060, 20}, + {5230, 30}, + {5540, 40}, + {5820, 50}, + {6120, 60}, + {6435, 70}, + {6745, 80}, + {7000, 90}, + {0xFFFFFFFF, 100} +}; + +/* This table helps the driver to pick different ranges for rx coalescing + * timer depending on the link speed. + */ +struct enic_intr_mod_range mod_range[ENIC_MAX_LINK_SPEEDS] = { + {0, 0}, /* 0 - 4 Gbps */ + {0, 3}, /* 4 - 10 Gbps */ + {3, 6}, /* 10 - 40 Gbps */ +}; + int enic_is_dynamic(struct enic *enic) { return enic->pdev->device == PCI_DEVICE_ID_CISCO_VIC_ENET_DYN; @@ -979,6 +1009,15 @@ static int enic_rq_alloc_buf(struct vnic_rq *rq) return 0; } +static void enic_intr_update_pkt_size(struct vnic_rx_bytes_counter *pkt_size, + u32 pkt_len) +{ + if (ENIC_LARGE_PKT_THRESHOLD <= pkt_len) + pkt_size->large_pkt_bytes_cnt += pkt_len; + else + pkt_size->small_pkt_bytes_cnt += pkt_len; +} + static void enic_rq_indicate_buf(struct vnic_rq *rq, struct cq_desc *cq_desc, struct vnic_rq_buf *buf, int skipped, void *opaque) @@ -986,6 +1025,7 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq, struct enic *enic = vnic_dev_priv(rq->vdev); struct net_device *netdev = enic->netdev; struct sk_buff *skb; + struct vnic_cq *cq = &enic->cq[enic_cq_rq(enic, rq->index)]; u8 type, color, eop, sop, ingress_port, vlan_stripped; u8 fcoe, fcoe_sof, fcoe_fc_crc_ok, fcoe_enc_error, fcoe_eof; @@ -1056,6 +1096,9 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq, napi_gro_receive(&enic->napi[q_number], skb); else netif_receive_skb(skb); + if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce) + enic_intr_update_pkt_size(&cq->pkt_size_counter, + bytes_written); } else { /* Buffer overflow @@ -1134,6 +1177,64 @@ static int enic_poll(struct napi_struct *napi, int budget) return rq_work_done; } +static void enic_set_int_moderation(struct enic *enic, struct vnic_rq *rq) +{ + unsigned int intr = enic_msix_rq_intr(enic, rq->index); + struct vnic_cq *cq = &enic->cq[enic_cq_rq(enic, rq->index)]; + u32 timer = cq->tobe_rx_coal_timeval; + + if (cq->tobe_rx_coal_timeval != cq->cur_rx_coal_timeval) { + vnic_intr_coalescing_timer_set(&enic->intr[intr], timer); + cq->cur_rx_coal_timeval = cq->tobe_rx_coal_timeval; + } +} + +static void enic_calc_int_moderation(struct enic *enic, struct vnic_rq *rq) +{ + struct enic_rx_coal *rx_coal = &enic->rx_coalesce_setting; + struct vnic_cq *cq = &enic->cq[enic_cq_rq(enic, rq->index)]; + struct vnic_rx_bytes_counter *pkt_size_counter = &cq->pkt_size_counter; + int index; + u32 timer; + u32 range_start; + u32 traffic; + u64 delta; + ktime_t now = ktime_get(); + + delta = ktime_us_delta(now, cq->prev_ts); + if (delta < ENIC_AIC_TS_BREAK) + return; + cq->prev_ts = now; + + traffic = pkt_size_counter->large_pkt_bytes_cnt + + pkt_size_counter->small_pkt_bytes_cnt; + /* The table takes Mbps + * traffic *= 8 => bits + * traffic *= (10^6 / delta) => bps + * traffic /= 10^6 => Mbps + * + * Combining, traffic *= (8 / delta) + */ + + traffic <<= 3; + traffic /= delta; + + for (index = 0; index < ENIC_MAX_COALESCE_TIMERS; index++) + if (traffic < mod_table[index].rx_rate) + break; + range_start = (pkt_size_counter->small_pkt_bytes_cnt > + pkt_size_counter->large_pkt_bytes_cnt << 1) ? + rx_coal->small_pkt_range_start : + rx_coal->large_pkt_range_start; + timer = range_start + ((rx_coal->range_end - range_start) * + mod_table[index].range_percent / 100); + /* Damping */ + cq->tobe_rx_coal_timeval = (timer + cq->tobe_rx_coal_timeval) >> 1; + + pkt_size_counter->large_pkt_bytes_cnt = 0; + pkt_size_counter->small_pkt_bytes_cnt = 0; +} + static int enic_poll_msix(struct napi_struct *napi, int budget) { struct net_device *netdev = napi->dev; @@ -1171,6 +1272,13 @@ static int enic_poll_msix(struct napi_struct *napi, int budget) if (err) work_done = work_to_do; + if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce) + /* Call the function which refreshes + * the intr coalescing timer value based on + * the traffic. This is supported only in + * the case of MSI-x mode + */ + enic_calc_int_moderation(enic, &enic->rq[rq]); if (work_done < work_to_do) { @@ -1179,6 +1287,8 @@ static int enic_poll_msix(struct napi_struct *napi, int budget) */ napi_complete(napi); + if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce) + enic_set_int_moderation(enic, &enic->rq[rq]); vnic_intr_unmask(&enic->intr[intr]); } @@ -1314,6 +1424,42 @@ static void enic_synchronize_irqs(struct enic *enic) } } +static void enic_set_rx_coal_setting(struct enic *enic) +{ + unsigned int speed; + int index = -1; + struct enic_rx_coal *rx_coal = &enic->rx_coalesce_setting; + + /* If intr mode is not MSIX, do not do adaptive coalescing */ + if (VNIC_DEV_INTR_MODE_MSIX != vnic_dev_get_intr_mode(enic->vdev)) { + netdev_info(enic->netdev, "INTR mode is not MSIX, Not initializing adaptive coalescing"); + return; + } + + /* 1. Read the link speed from fw + * 2. Pick the default range for the speed + * 3. Update it in enic->rx_coalesce_setting + */ + speed = vnic_dev_port_speed(enic->vdev); + if (ENIC_LINK_SPEED_10G < speed) + index = ENIC_LINK_40G_INDEX; + else if (ENIC_LINK_SPEED_4G < speed) + index = ENIC_LINK_10G_INDEX; + else + index = ENIC_LINK_4G_INDEX; + + rx_coal->small_pkt_range_start = mod_range[index].small_pkt_range_start; + rx_coal->large_pkt_range_start = mod_range[index].large_pkt_range_start; + rx_coal->range_end = ENIC_RX_COALESCE_RANGE_END; + + /* Start with the value provided by UCSM */ + for (index = 0; index < enic->rq_count; index++) + enic->cq[index].cur_rx_coal_timeval = + enic->config.intr_timer_usec; + + rx_coal->use_adaptive_rx_coalesce = 1; +} + static int enic_dev_notify_set(struct enic *enic) { int err; @@ -2231,6 +2377,7 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) enic->notify_timer.function = enic_notify_timer; enic->notify_timer.data = (unsigned long)enic; + enic_set_rx_coal_setting(enic); INIT_WORK(&enic->reset, enic_reset); INIT_WORK(&enic->change_mtu_work, enic_change_mtu_work); @@ -2250,6 +2397,9 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } enic->tx_coalesce_usecs = enic->config.intr_timer_usec; + /* rx coalesce time already got initialized. This gets used + * if adaptive coal is turned off + */ enic->rx_coalesce_usecs = enic->tx_coalesce_usecs; if (enic_is_dynamic(enic) || enic_is_sriov_vf(enic)) diff --git a/drivers/net/ethernet/cisco/enic/vnic_cq.h b/drivers/net/ethernet/cisco/enic/vnic_cq.h index 579315cbe80..4e6aa65857f 100644 --- a/drivers/net/ethernet/cisco/enic/vnic_cq.h +++ b/drivers/net/ethernet/cisco/enic/vnic_cq.h @@ -50,6 +50,11 @@ struct vnic_cq_ctrl { u32 pad10; }; +struct vnic_rx_bytes_counter { + unsigned int small_pkt_bytes_cnt; + unsigned int large_pkt_bytes_cnt; +}; + struct vnic_cq { unsigned int index; struct vnic_dev *vdev; @@ -58,6 +63,10 @@ struct vnic_cq { unsigned int to_clean; unsigned int last_color; unsigned int interrupt_offset; + struct vnic_rx_bytes_counter pkt_size_counter; + unsigned int cur_rx_coal_timeval; + unsigned int tobe_rx_coal_timeval; + ktime_t prev_ts; }; static inline unsigned int vnic_cq_service(struct vnic_cq *cq, diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 8d69e439f0c..cb5c987bee3 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1255,6 +1255,49 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum, return 0; } +static int fec_enet_clk_enable(struct net_device *ndev, bool enable) +{ + struct fec_enet_private *fep = netdev_priv(ndev); + int ret; + + if (enable) { + ret = clk_prepare_enable(fep->clk_ahb); + if (ret) + return ret; + ret = clk_prepare_enable(fep->clk_ipg); + if (ret) + goto failed_clk_ipg; + if (fep->clk_enet_out) { + ret = clk_prepare_enable(fep->clk_enet_out); + if (ret) + goto failed_clk_enet_out; + } + if (fep->clk_ptp) { + ret = clk_prepare_enable(fep->clk_ptp); + if (ret) + goto failed_clk_ptp; + } + } else { + clk_disable_unprepare(fep->clk_ahb); + clk_disable_unprepare(fep->clk_ipg); + if (fep->clk_enet_out) + clk_disable_unprepare(fep->clk_enet_out); + if (fep->clk_ptp) + clk_disable_unprepare(fep->clk_ptp); + } + + return 0; +failed_clk_ptp: + if (fep->clk_enet_out) + clk_disable_unprepare(fep->clk_enet_out); +failed_clk_enet_out: + clk_disable_unprepare(fep->clk_ipg); +failed_clk_ipg: + clk_disable_unprepare(fep->clk_ahb); + + return ret; +} + static int fec_enet_mii_probe(struct net_device *ndev) { struct fec_enet_private *fep = netdev_priv(ndev); @@ -1364,7 +1407,7 @@ static int fec_enet_mii_init(struct platform_device *pdev) * Reference Manual has an error on this, and gets fixed on i.MX6Q * document. */ - fep->phy_speed = DIV_ROUND_UP(clk_get_rate(fep->clk_ahb), 5000000); + fep->phy_speed = DIV_ROUND_UP(clk_get_rate(fep->clk_ipg), 5000000); if (id_entry->driver_data & FEC_QUIRK_ENET_MAC) fep->phy_speed--; fep->phy_speed <<= 1; @@ -1773,6 +1816,10 @@ fec_enet_open(struct net_device *ndev) struct fec_enet_private *fep = netdev_priv(ndev); int ret; + ret = fec_enet_clk_enable(ndev, true); + if (ret) + return ret; + /* I should reset the ring buffers here, but I don't yet know * a simple way to do that. */ @@ -1811,6 +1858,7 @@ fec_enet_close(struct net_device *ndev) phy_disconnect(fep->phy_dev); } + fec_enet_clk_enable(ndev, false); fec_enet_free_buffers(ndev); return 0; @@ -2164,26 +2212,10 @@ fec_probe(struct platform_device *pdev) fep->bufdesc_ex = 0; } - ret = clk_prepare_enable(fep->clk_ahb); + ret = fec_enet_clk_enable(ndev, true); if (ret) goto failed_clk; - ret = clk_prepare_enable(fep->clk_ipg); - if (ret) - goto failed_clk_ipg; - - if (fep->clk_enet_out) { - ret = clk_prepare_enable(fep->clk_enet_out); - if (ret) - goto failed_clk_enet_out; - } - - if (fep->clk_ptp) { - ret = clk_prepare_enable(fep->clk_ptp); - if (ret) - goto failed_clk_ptp; - } - fep->reg_phy = devm_regulator_get(&pdev->dev, "phy"); if (!IS_ERR(fep->reg_phy)) { ret = regulator_enable(fep->reg_phy); @@ -2225,6 +2257,7 @@ fec_probe(struct platform_device *pdev) /* Carrier starts down, phylib will bring it up */ netif_carrier_off(ndev); + fec_enet_clk_enable(ndev, false); ret = register_netdev(ndev); if (ret) @@ -2244,15 +2277,7 @@ failed_init: if (fep->reg_phy) regulator_disable(fep->reg_phy); failed_regulator: - if (fep->clk_ptp) - clk_disable_unprepare(fep->clk_ptp); -failed_clk_ptp: - if (fep->clk_enet_out) - clk_disable_unprepare(fep->clk_enet_out); -failed_clk_enet_out: - clk_disable_unprepare(fep->clk_ipg); -failed_clk_ipg: - clk_disable_unprepare(fep->clk_ahb); + fec_enet_clk_enable(ndev, false); failed_clk: failed_ioremap: free_netdev(ndev); @@ -2272,14 +2297,9 @@ fec_drv_remove(struct platform_device *pdev) del_timer_sync(&fep->time_keep); if (fep->reg_phy) regulator_disable(fep->reg_phy); - if (fep->clk_ptp) - clk_disable_unprepare(fep->clk_ptp); if (fep->ptp_clock) ptp_clock_unregister(fep->ptp_clock); - if (fep->clk_enet_out) - clk_disable_unprepare(fep->clk_enet_out); - clk_disable_unprepare(fep->clk_ipg); - clk_disable_unprepare(fep->clk_ahb); + fec_enet_clk_enable(ndev, false); free_netdev(ndev); return 0; @@ -2296,12 +2316,7 @@ fec_suspend(struct device *dev) fec_stop(ndev); netif_device_detach(ndev); } - if (fep->clk_ptp) - clk_disable_unprepare(fep->clk_ptp); - if (fep->clk_enet_out) - clk_disable_unprepare(fep->clk_enet_out); - clk_disable_unprepare(fep->clk_ipg); - clk_disable_unprepare(fep->clk_ahb); + fec_enet_clk_enable(ndev, false); if (fep->reg_phy) regulator_disable(fep->reg_phy); @@ -2322,25 +2337,9 @@ fec_resume(struct device *dev) return ret; } - ret = clk_prepare_enable(fep->clk_ahb); - if (ret) - goto failed_clk_ahb; - - ret = clk_prepare_enable(fep->clk_ipg); + ret = fec_enet_clk_enable(ndev, true); if (ret) - goto failed_clk_ipg; - - if (fep->clk_enet_out) { - ret = clk_prepare_enable(fep->clk_enet_out); - if (ret) - goto failed_clk_enet_out; - } - - if (fep->clk_ptp) { - ret = clk_prepare_enable(fep->clk_ptp); - if (ret) - goto failed_clk_ptp; - } + goto failed_clk; if (netif_running(ndev)) { fec_restart(ndev, fep->full_duplex); @@ -2349,14 +2348,7 @@ fec_resume(struct device *dev) return 0; -failed_clk_ptp: - if (fep->clk_enet_out) - clk_disable_unprepare(fep->clk_enet_out); -failed_clk_enet_out: - clk_disable_unprepare(fep->clk_ipg); -failed_clk_ipg: - clk_disable_unprepare(fep->clk_ahb); -failed_clk_ahb: +failed_clk: if (fep->reg_phy) regulator_disable(fep->reg_phy); return ret; diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index dc80db41d6b..cfaf17b70f3 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -792,10 +792,6 @@ static int fs_init_phy(struct net_device *dev) phydev = of_phy_connect(dev, fep->fpi->phy_node, &fs_adjust_link, 0, iface); if (!phydev) { - phydev = of_phy_connect_fixed_link(dev, &fs_adjust_link, - iface); - } - if (!phydev) { dev_err(&dev->dev, "Could not attach to PHY\n"); return -ENODEV; } @@ -1029,9 +1025,16 @@ static int fs_enet_probe(struct platform_device *ofdev) fpi->use_napi = 1; fpi->napi_weight = 17; fpi->phy_node = of_parse_phandle(ofdev->dev.of_node, "phy-handle", 0); - if ((!fpi->phy_node) && (!of_get_property(ofdev->dev.of_node, "fixed-link", - NULL))) - goto out_free_fpi; + if (!fpi->phy_node && of_phy_is_fixed_link(ofdev->dev.of_node)) { + err = of_phy_register_fixed_link(ofdev->dev.of_node); + if (err) + goto out_free_fpi; + + /* In the case of a fixed PHY, the DT node associated + * to the PHY is the Ethernet MAC DT node. + */ + fpi->phy_node = ofdev->dev.of_node; + } if (of_device_is_compatible(ofdev->dev.of_node, "fsl,mpc5125-fec")) { phy_connection_type = of_get_property(ofdev->dev.of_node, diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index e2d42475b00..282674027c9 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -889,6 +889,17 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev) priv->phy_node = of_parse_phandle(np, "phy-handle", 0); + /* In the case of a fixed PHY, the DT node associated + * to the PHY is the Ethernet MAC DT node. + */ + if (of_phy_is_fixed_link(np)) { + err = of_phy_register_fixed_link(np); + if (err) + goto err_grp_init; + + priv->phy_node = np; + } + /* Find the TBI PHY. If it's not there, we don't support SGMII */ priv->tbi_node = of_parse_phandle(np, "tbi-handle", 0); @@ -1660,9 +1671,6 @@ static int init_phy(struct net_device *dev) priv->phydev = of_phy_connect(dev, priv->phy_node, &adjust_link, 0, interface); - if (!priv->phydev) - priv->phydev = of_phy_connect_fixed_link(dev, &adjust_link, - interface); if (!priv->phydev) { dev_err(&dev->dev, "could not attach to PHY\n"); return -ENODEV; diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index c8299c31b21..fab39e29544 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -1728,9 +1728,6 @@ static int init_phy(struct net_device *dev) phydev = of_phy_connect(dev, ug_info->phy_node, &adjust_link, 0, priv->phy_interface); - if (!phydev) - phydev = of_phy_connect_fixed_link(dev, &adjust_link, - priv->phy_interface); if (!phydev) { dev_err(&dev->dev, "Could not attach to PHY\n"); return -ENODEV; @@ -3790,6 +3787,17 @@ static int ucc_geth_probe(struct platform_device* ofdev) ug_info->uf_info.irq = irq_of_parse_and_map(np, 0); ug_info->phy_node = of_parse_phandle(np, "phy-handle", 0); + if (!ug_info->phy_node) { + /* In the case of a fixed PHY, the DT node associated + * to the PHY is the Ethernet MAC DT node. + */ + if (of_phy_is_fixed_link(np)) { + err = of_phy_register_fixed_link(np); + if (err) + return err; + } + ug_info->phy_node = np; + } /* Find the TBI PHY node. If it's not there, we don't support SGMII */ ug_info->tbi_node = of_parse_phandle(np, "tbi-handle", 0); diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index a46571c1e9f..ef5bb11557e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -97,10 +97,6 @@ #define STRINGIFY(foo) #foo #define XSTRINGIFY(bar) STRINGIFY(bar) -#ifndef ARCH_HAS_PREFETCH -#define prefetch(X) -#endif - #define I40E_RX_DESC(R, i) \ ((ring_is_16byte_desc_enabled(R)) \ ? (union i40e_32byte_rx_desc *) \ diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index f62929419a0..861e1db47a7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -112,7 +112,6 @@ static struct i40e_stats i40e_gstrings_stats[] = { I40E_PF_STAT("rx_oversize", stats.rx_oversize), I40E_PF_STAT("rx_jabber", stats.rx_jabber), I40E_PF_STAT("VF_admin_queue_requests", vf_aq_requests), - I40E_PF_STAT("tx_hwtstamp_timeouts", tx_hwtstamp_timeouts), I40E_PF_STAT("rx_hwtstamp_cleared", rx_hwtstamp_cleared), /* LPI stats */ I40E_PF_STAT("tx_lpi_status", stats.tx_lpi_status), @@ -122,8 +121,9 @@ static struct i40e_stats i40e_gstrings_stats[] = { }; #define I40E_QUEUE_STATS_LEN(n) \ - ((((struct i40e_netdev_priv *)netdev_priv((n)))->vsi->num_queue_pairs + \ - ((struct i40e_netdev_priv *)netdev_priv((n)))->vsi->num_queue_pairs) * 2) + (((struct i40e_netdev_priv *)netdev_priv((n)))->vsi->num_queue_pairs \ + * 2 /* Tx and Rx together */ \ + * (sizeof(struct i40e_queue_stats) / sizeof(u64))) #define I40E_GLOBAL_STATS_LEN ARRAY_SIZE(i40e_gstrings_stats) #define I40E_NETDEV_STATS_LEN ARRAY_SIZE(i40e_gstrings_net_stats) #define I40E_VSI_STATS_LEN(n) (I40E_NETDEV_STATS_LEN + \ @@ -1009,14 +1009,13 @@ static int i40e_get_coalesce(struct net_device *netdev, ec->rx_max_coalesced_frames_irq = vsi->work_limit; if (ITR_IS_DYNAMIC(vsi->rx_itr_setting)) - ec->rx_coalesce_usecs = 1; - else - ec->rx_coalesce_usecs = vsi->rx_itr_setting; + ec->use_adaptive_rx_coalesce = 1; if (ITR_IS_DYNAMIC(vsi->tx_itr_setting)) - ec->tx_coalesce_usecs = 1; - else - ec->tx_coalesce_usecs = vsi->tx_itr_setting; + ec->use_adaptive_tx_coalesce = 1; + + ec->rx_coalesce_usecs = vsi->rx_itr_setting & ~I40E_ITR_DYNAMIC; + ec->tx_coalesce_usecs = vsi->tx_itr_setting & ~I40E_ITR_DYNAMIC; return 0; } @@ -1035,37 +1034,27 @@ static int i40e_set_coalesce(struct net_device *netdev, if (ec->tx_max_coalesced_frames_irq || ec->rx_max_coalesced_frames_irq) vsi->work_limit = ec->tx_max_coalesced_frames_irq; - switch (ec->rx_coalesce_usecs) { - case 0: - vsi->rx_itr_setting = 0; - break; - case 1: - vsi->rx_itr_setting = (I40E_ITR_DYNAMIC | - ITR_REG_TO_USEC(I40E_ITR_RX_DEF)); - break; - default: - if ((ec->rx_coalesce_usecs < (I40E_MIN_ITR << 1)) || - (ec->rx_coalesce_usecs > (I40E_MAX_ITR << 1))) - return -EINVAL; + if ((ec->rx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && + (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) vsi->rx_itr_setting = ec->rx_coalesce_usecs; - break; - } + else + return -EINVAL; - switch (ec->tx_coalesce_usecs) { - case 0: - vsi->tx_itr_setting = 0; - break; - case 1: - vsi->tx_itr_setting = (I40E_ITR_DYNAMIC | - ITR_REG_TO_USEC(I40E_ITR_TX_DEF)); - break; - default: - if ((ec->tx_coalesce_usecs < (I40E_MIN_ITR << 1)) || - (ec->tx_coalesce_usecs > (I40E_MAX_ITR << 1))) - return -EINVAL; + if ((ec->tx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && + (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) vsi->tx_itr_setting = ec->tx_coalesce_usecs; - break; - } + else + return -EINVAL; + + if (ec->use_adaptive_rx_coalesce) + vsi->rx_itr_setting |= I40E_ITR_DYNAMIC; + else + vsi->rx_itr_setting &= ~I40E_ITR_DYNAMIC; + + if (ec->use_adaptive_tx_coalesce) + vsi->tx_itr_setting |= I40E_ITR_DYNAMIC; + else + vsi->tx_itr_setting &= ~I40E_ITR_DYNAMIC; vector = vsi->base_vector; for (i = 0; i < vsi->num_q_vectors; i++, vector++) { diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index e399f9b7077..e0e5c6a867b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -4333,7 +4333,7 @@ int i40e_vsi_open(struct i40e_vsi *vsi) dev_driver_string(&pf->pdev->dev)); err = i40e_vsi_request_irq(vsi, int_name); } else { - err = EINVAL; + err = -EINVAL; goto err_setup_rx; } @@ -6368,6 +6368,10 @@ static int i40e_sw_init(struct i40e_pf *pf) I40E_FLAG_MSIX_ENABLED | I40E_FLAG_RX_1BUF_ENABLED; + /* Set default ITR */ + pf->rx_itr_default = I40E_ITR_DYNAMIC | I40E_ITR_RX_DEF; + pf->tx_itr_default = I40E_ITR_DYNAMIC | I40E_ITR_TX_DEF; + /* Depending on PF configurations, it is possible that the RSS * maximum might end up larger than the available queues */ @@ -8364,6 +8368,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } } +#ifdef CONFIG_PCI_IOV /* prep for VF support */ if ((pf->flags & I40E_FLAG_SRIOV_ENABLED) && (pf->flags & I40E_FLAG_MSIX_ENABLED) && @@ -8386,6 +8391,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err); } } +#endif /* CONFIG_PCI_IOV */ pfs_found++; diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c index 1fedc7a1589..101f439acda 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c @@ -48,7 +48,6 @@ I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT) #define I40E_PRTTSYN_CTL1_TSYNTYPE_V2 (0x2 << \ I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT) -#define I40E_PTP_TX_TIMEOUT (HZ * 15) /** * i40e_ptp_read - Read the PHC time from the device diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index ece7ae99b03..8d0ef445fa4 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -24,6 +24,7 @@ * ******************************************************************************/ +#include <linux/prefetch.h> #include "i40e.h" #include "i40e_prototype.h" diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 82e7abf6430..4d219566a04 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -842,6 +842,10 @@ void i40e_free_vfs(struct i40e_pf *pf) kfree(pf->vf); pf->vf = NULL; + /* This check is for when the driver is unloaded while VFs are + * assigned. Setting the number of VFs to 0 through sysfs is caught + * before this function ever gets called. + */ if (!i40e_vfs_are_assigned(pf)) { pci_disable_sriov(pf->pdev); /* Acknowledge VFLR for all VFS. Without this, VFs will fail to @@ -978,7 +982,12 @@ int i40e_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) if (num_vfs) return i40e_pci_sriov_enable(pdev, num_vfs); - i40e_free_vfs(pf); + if (!i40e_vfs_are_assigned(pf)) { + i40e_free_vfs(pf); + } else { + dev_warn(&pdev->dev, "Unable to free VFs because some are assigned to VMs.\n"); + return -EINVAL; + } return 0; } diff --git a/drivers/net/ethernet/intel/i40evf/Makefile b/drivers/net/ethernet/intel/i40evf/Makefile index e09be37a07a..3a423836a56 100644 --- a/drivers/net/ethernet/intel/i40evf/Makefile +++ b/drivers/net/ethernet/intel/i40evf/Makefile @@ -1,7 +1,7 @@ ################################################################################ # # Intel Ethernet Controller XL710 Family Linux Virtual Function Driver -# Copyright(c) 2013 Intel Corporation. +# Copyright(c) 2013 - 2014 Intel Corporation. # # This program is free software; you can redistribute it and/or modify it # under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# # The full GNU General Public License is included in this distribution in # the file called "COPYING". # diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq.c b/drivers/net/ethernet/intel/i40evf/i40e_adminq.c index c79df257f83..68b4aacd43f 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_adminq.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq.c @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq.h b/drivers/net/ethernet/intel/i40evf/i40e_adminq.h index 7d24be52860..e3472c62e15 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_adminq.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq.h @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h index 6e617669c32..89d9209ff2b 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * @@ -676,7 +679,6 @@ struct i40e_aqc_add_get_update_vsi { #define I40E_AQ_VSI_TYPE_PF 0x2 #define I40E_AQ_VSI_TYPE_EMP_MNG 0x3 #define I40E_AQ_VSI_FLAG_CASCADED_PV 0x4 -#define I40E_AQ_VSI_FLAG_CLOUD_VSI 0x8 __le32 addr_high; __le32 addr_low; }; @@ -1038,7 +1040,9 @@ struct i40e_aqc_set_vsi_promiscuous_modes { #define I40E_AQC_SET_VSI_PROMISC_VLAN 0x10 __le16 seid; #define I40E_AQC_VSI_PROM_CMD_SEID_MASK 0x3FF - u8 reserved[10]; + __le16 vlan_tag; +#define I40E_AQC_SET_VSI_VLAN_VALID 0x8000 + u8 reserved[8]; }; I40E_CHECK_CMD_LENGTH(i40e_aqc_set_vsi_promiscuous_modes); @@ -1931,19 +1935,12 @@ I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_start); /* Add Udp Tunnel command and completion (direct 0x0B00) */ struct i40e_aqc_add_udp_tunnel { __le16 udp_port; - u8 header_len; /* in DWords, 1 to 15 */ + u8 reserved0[3]; u8 protocol_type; -#define I40E_AQC_TUNNEL_TYPE_TEREDO 0x0 -#define I40E_AQC_TUNNEL_TYPE_VXLAN 0x2 -#define I40E_AQC_TUNNEL_TYPE_NGE 0x3 - u8 variable_udp_length; -#define I40E_AQC_TUNNEL_FIXED_UDP_LENGTH 0x0 -#define I40E_AQC_TUNNEL_VARIABLE_UDP_LENGTH 0x1 - u8 udp_key_index; -#define I40E_AQC_TUNNEL_KEY_INDEX_VXLAN 0x0 -#define I40E_AQC_TUNNEL_KEY_INDEX_NGE 0x1 -#define I40E_AQC_TUNNEL_KEY_INDEX_PROPRIETARY_UDP 0x2 - u8 reserved[10]; +#define I40E_AQC_TUNNEL_TYPE_VXLAN 0x00 +#define I40E_AQC_TUNNEL_TYPE_NGE 0x01 +#define I40E_AQC_TUNNEL_TYPE_TEREDO 0x10 + u8 reserved1[10]; }; I40E_CHECK_CMD_LENGTH(i40e_aqc_add_udp_tunnel); diff --git a/drivers/net/ethernet/intel/i40evf/i40e_alloc.h b/drivers/net/ethernet/intel/i40evf/i40e_alloc.h index d8654fb9e52..8e6a6dd9212 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_alloc.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_alloc.h @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_common.c b/drivers/net/ethernet/intel/i40evf/i40e_common.c index ae084378faa..ac660963b8b 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_common.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_common.c @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_hmc.h b/drivers/net/ethernet/intel/i40evf/i40e_hmc.h index cb97b3eed44..9d906514fc3 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_hmc.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_hmc.h @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_lan_hmc.h b/drivers/net/ethernet/intel/i40evf/i40e_lan_hmc.h index 775fcb2463d..d6f76224153 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_lan_hmc.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_lan_hmc.h @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_osdep.h b/drivers/net/ethernet/intel/i40evf/i40e_osdep.h index 622f373b745..21a91b14bf8 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_osdep.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_osdep.h @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h index 97ab8c2b76f..849edcc2e39 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_register.h b/drivers/net/ethernet/intel/i40evf/i40e_register.h index 30af953cf10..aa4a92e3b12 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_register.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_register.h @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_status.h b/drivers/net/ethernet/intel/i40evf/i40e_status.h index 7c08cc2e339..7fa7a41915c 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_status.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_status.h @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index b9f50f40abe..82d6844245b 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h index 10bf49e18d7..d0119d0a9fc 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h index 51a6dee3c7b..fb5371ad1cb 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_type.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * @@ -101,15 +104,6 @@ enum i40e_debug_mask { I40E_DEBUG_ALL = 0xFFFFFFFF }; -/* PCI Bus Info */ -#define I40E_PCI_LINK_WIDTH_1 0x10 -#define I40E_PCI_LINK_WIDTH_2 0x20 -#define I40E_PCI_LINK_WIDTH_4 0x40 -#define I40E_PCI_LINK_WIDTH_8 0x80 -#define I40E_PCI_LINK_SPEED_2500 0x1 -#define I40E_PCI_LINK_SPEED_5000 0x2 -#define I40E_PCI_LINK_SPEED_8000 0x3 - /* These are structs for managing the hardware information and the operations. * The structures of function pointers are filled out at init time when we * know for sure exactly which hardware we're working with. This gives us the diff --git a/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h b/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h index ccf45d04b7e..1ef5b31ece9 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h @@ -1,7 +1,7 @@ /******************************************************************************* * * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver - * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2013 - 2014 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h b/drivers/net/ethernet/intel/i40evf/i40evf.h index 807807d6238..2913bc3332a 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf.h +++ b/drivers/net/ethernet/intel/i40evf/i40evf.h @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c index 77e786d2d0e..df4dcfd364d 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * @@ -56,10 +59,12 @@ static const struct i40evf_stats i40evf_gstrings_stats[] = { }; #define I40EVF_GLOBAL_STATS_LEN ARRAY_SIZE(i40evf_gstrings_stats) -#define I40EVF_QUEUE_STATS_LEN \ +#define I40EVF_QUEUE_STATS_LEN(_dev) \ (((struct i40evf_adapter *) \ - netdev_priv(netdev))->vsi_res->num_queue_pairs * 4) -#define I40EVF_STATS_LEN (I40EVF_GLOBAL_STATS_LEN + I40EVF_QUEUE_STATS_LEN) + netdev_priv(_dev))->vsi_res->num_queue_pairs \ + * 2 * (sizeof(struct i40e_queue_stats) / sizeof(u64))) +#define I40EVF_STATS_LEN(_dev) \ + (I40EVF_GLOBAL_STATS_LEN + I40EVF_QUEUE_STATS_LEN(_dev)) /** * i40evf_get_settings - Get Link Speed and Duplex settings @@ -75,7 +80,7 @@ static int i40evf_get_settings(struct net_device *netdev, /* In the future the VF will be able to query the PF for * some information - for now use a dummy value */ - ecmd->supported = SUPPORTED_10000baseT_Full; + ecmd->supported = 0; ecmd->autoneg = AUTONEG_DISABLE; ecmd->transceiver = XCVR_DUMMY1; ecmd->port = PORT_NONE; @@ -94,9 +99,9 @@ static int i40evf_get_settings(struct net_device *netdev, static int i40evf_get_sset_count(struct net_device *netdev, int sset) { if (sset == ETH_SS_STATS) - return I40EVF_STATS_LEN; + return I40EVF_STATS_LEN(netdev); else - return -ENOTSUPP; + return -EINVAL; } /** @@ -290,14 +295,13 @@ static int i40evf_get_coalesce(struct net_device *netdev, ec->rx_max_coalesced_frames = vsi->work_limit; if (ITR_IS_DYNAMIC(vsi->rx_itr_setting)) - ec->rx_coalesce_usecs = 1; - else - ec->rx_coalesce_usecs = vsi->rx_itr_setting; + ec->use_adaptive_rx_coalesce = 1; if (ITR_IS_DYNAMIC(vsi->tx_itr_setting)) - ec->tx_coalesce_usecs = 1; - else - ec->tx_coalesce_usecs = vsi->tx_itr_setting; + ec->use_adaptive_tx_coalesce = 1; + + ec->rx_coalesce_usecs = vsi->rx_itr_setting & ~I40E_ITR_DYNAMIC; + ec->tx_coalesce_usecs = vsi->tx_itr_setting & ~I40E_ITR_DYNAMIC; return 0; } @@ -318,40 +322,34 @@ static int i40evf_set_coalesce(struct net_device *netdev, struct i40e_q_vector *q_vector; int i; - if (ec->tx_max_coalesced_frames || ec->rx_max_coalesced_frames) - vsi->work_limit = ec->tx_max_coalesced_frames; + if (ec->tx_max_coalesced_frames_irq || ec->rx_max_coalesced_frames_irq) + vsi->work_limit = ec->tx_max_coalesced_frames_irq; - switch (ec->rx_coalesce_usecs) { - case 0: - vsi->rx_itr_setting = 0; - break; - case 1: - vsi->rx_itr_setting = (I40E_ITR_DYNAMIC - | ITR_REG_TO_USEC(I40E_ITR_RX_DEF)); - break; - default: - if ((ec->rx_coalesce_usecs < (I40E_MIN_ITR << 1)) || - (ec->rx_coalesce_usecs > (I40E_MAX_ITR << 1))) - return -EINVAL; + if ((ec->rx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && + (ec->rx_coalesce_usecs <= (I40E_MAX_ITR << 1))) vsi->rx_itr_setting = ec->rx_coalesce_usecs; - break; - } - switch (ec->tx_coalesce_usecs) { - case 0: - vsi->tx_itr_setting = 0; - break; - case 1: - vsi->tx_itr_setting = (I40E_ITR_DYNAMIC - | ITR_REG_TO_USEC(I40E_ITR_TX_DEF)); - break; - default: - if ((ec->tx_coalesce_usecs < (I40E_MIN_ITR << 1)) || - (ec->tx_coalesce_usecs > (I40E_MAX_ITR << 1))) - return -EINVAL; + else + return -EINVAL; + + if ((ec->tx_coalesce_usecs >= (I40E_MIN_ITR << 1)) && + (ec->tx_coalesce_usecs <= (I40E_MAX_ITR << 1))) vsi->tx_itr_setting = ec->tx_coalesce_usecs; - break; - } + else if (ec->use_adaptive_tx_coalesce) + vsi->tx_itr_setting = (I40E_ITR_DYNAMIC | + ITR_REG_TO_USEC(I40E_ITR_RX_DEF)); + else + return -EINVAL; + + if (ec->use_adaptive_rx_coalesce) + vsi->rx_itr_setting |= I40E_ITR_DYNAMIC; + else + vsi->rx_itr_setting &= ~I40E_ITR_DYNAMIC; + + if (ec->use_adaptive_tx_coalesce) + vsi->tx_itr_setting |= I40E_ITR_DYNAMIC; + else + vsi->tx_itr_setting &= ~I40E_ITR_DYNAMIC; for (i = 0; i < adapter->num_msix_vectors - NONQ_VECS; i++) { q_vector = adapter->q_vector[i]; @@ -675,7 +673,7 @@ static int i40evf_set_rxfh_indir(struct net_device *netdev, const u32 *indir) return 0; } -static struct ethtool_ops i40evf_ethtool_ops = { +static const struct ethtool_ops i40evf_ethtool_ops = { .get_settings = i40evf_get_settings, .get_drvinfo = i40evf_get_drvinfo, .get_link = ethtool_op_get_link, diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 6edd581dffa..6f6bd3f0180 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * @@ -1029,30 +1032,21 @@ i40evf_acquire_msix_vectors(struct i40evf_adapter *adapter, int vectors) * Right now, we simply care about how many we'll get; we'll * set them up later while requesting irq's. */ - while (vectors >= vector_threshold) { - err = pci_enable_msix(adapter->pdev, adapter->msix_entries, - vectors); - if (!err) /* Success in acquiring all requested vectors. */ - break; - else if (err < 0) - vectors = 0; /* Nasty failure, quit now */ - else /* err == number of vectors we should try again with */ - vectors = err; - } - - if (vectors < vector_threshold) { + err = pci_enable_msix_range(adapter->pdev, adapter->msix_entries, + vector_threshold, vectors); + if (err < 0) { dev_err(&adapter->pdev->dev, "Unable to allocate MSI-X interrupts.\n"); kfree(adapter->msix_entries); adapter->msix_entries = NULL; - err = -EIO; - } else { - /* Adjust for only the vectors we'll use, which is minimum - * of max_msix_q_vectors + NONQ_VECS, or the number of - * vectors we were allocated. - */ - adapter->num_msix_vectors = vectors; + return err; } - return err; + + /* Adjust for only the vectors we'll use, which is minimum + * of max_msix_q_vectors + NONQ_VECS, or the number of + * vectors we were allocated. + */ + adapter->num_msix_vectors = err; + return 0; } /** @@ -2119,8 +2113,10 @@ static void i40evf_init_task(struct work_struct *work) adapter->vsi.back = adapter; adapter->vsi.base_vector = 1; adapter->vsi.work_limit = I40E_DEFAULT_IRQ_WORK; - adapter->vsi.rx_itr_setting = I40E_ITR_DYNAMIC; - adapter->vsi.tx_itr_setting = I40E_ITR_DYNAMIC; + adapter->vsi.rx_itr_setting = (I40E_ITR_DYNAMIC | + ITR_REG_TO_USEC(I40E_ITR_RX_DEF)); + adapter->vsi.tx_itr_setting = (I40E_ITR_DYNAMIC | + ITR_REG_TO_USEC(I40E_ITR_TX_DEF)); adapter->vsi.netdev = adapter->netdev; if (!adapter->netdev_registered) { diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c index e294f012647..7f80bb41772 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c @@ -12,6 +12,9 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index df1d1b97187..3b0f818a4f5 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -42,6 +42,7 @@ #include <linux/dma-mapping.h> #include <linux/in.h> #include <linux/ip.h> +#include <net/tso.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/etherdevice.h> @@ -179,9 +180,10 @@ static char mv643xx_eth_driver_version[] = "1.4"; * Misc definitions. */ #define DEFAULT_RX_QUEUE_SIZE 128 -#define DEFAULT_TX_QUEUE_SIZE 256 +#define DEFAULT_TX_QUEUE_SIZE 512 #define SKB_DMA_REALIGN ((PAGE_SIZE - NET_SKB_PAD) % SMP_CACHE_BYTES) +#define TSO_HEADER_SIZE 128 /* * RX/TX descriptors. @@ -250,6 +252,7 @@ struct tx_desc { #define GEN_TCP_UDP_CHECKSUM 0x00020000 #define UDP_FRAME 0x00010000 #define MAC_HDR_EXTRA_4_BYTES 0x00008000 +#define GEN_TCP_UDP_CHK_FULL 0x00000400 #define MAC_HDR_EXTRA_8_BYTES 0x00000200 #define TX_IHL_SHIFT 11 @@ -345,6 +348,9 @@ struct tx_queue { int tx_curr_desc; int tx_used_desc; + char *tso_hdrs; + dma_addr_t tso_hdrs_dma; + struct tx_desc *tx_desc_area; dma_addr_t tx_desc_dma; int tx_desc_area_size; @@ -661,6 +667,198 @@ static inline unsigned int has_tiny_unaligned_frags(struct sk_buff *skb) return 0; } +static inline __be16 sum16_as_be(__sum16 sum) +{ + return (__force __be16)sum; +} + +static int skb_tx_csum(struct mv643xx_eth_private *mp, struct sk_buff *skb, + u16 *l4i_chk, u32 *command, int length) +{ + int ret; + u32 cmd = 0; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + int hdr_len; + int tag_bytes; + + BUG_ON(skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_8021Q)); + + hdr_len = (void *)ip_hdr(skb) - (void *)skb->data; + tag_bytes = hdr_len - ETH_HLEN; + + if (length - hdr_len > mp->shared->tx_csum_limit || + unlikely(tag_bytes & ~12)) { + ret = skb_checksum_help(skb); + if (!ret) + goto no_csum; + return ret; + } + + if (tag_bytes & 4) + cmd |= MAC_HDR_EXTRA_4_BYTES; + if (tag_bytes & 8) + cmd |= MAC_HDR_EXTRA_8_BYTES; + + cmd |= GEN_TCP_UDP_CHECKSUM | GEN_TCP_UDP_CHK_FULL | + GEN_IP_V4_CHECKSUM | + ip_hdr(skb)->ihl << TX_IHL_SHIFT; + + /* TODO: Revisit this. With the usage of GEN_TCP_UDP_CHK_FULL + * it seems we don't need to pass the initial checksum. */ + switch (ip_hdr(skb)->protocol) { + case IPPROTO_UDP: + cmd |= UDP_FRAME; + *l4i_chk = 0; + break; + case IPPROTO_TCP: + *l4i_chk = 0; + break; + default: + WARN(1, "protocol not supported"); + } + } else { +no_csum: + /* Errata BTS #50, IHL must be 5 if no HW checksum */ + cmd |= 5 << TX_IHL_SHIFT; + } + *command = cmd; + return 0; +} + +static inline int +txq_put_data_tso(struct net_device *dev, struct tx_queue *txq, + struct sk_buff *skb, char *data, int length, + bool last_tcp, bool is_last) +{ + int tx_index; + u32 cmd_sts; + struct tx_desc *desc; + + tx_index = txq->tx_curr_desc++; + if (txq->tx_curr_desc == txq->tx_ring_size) + txq->tx_curr_desc = 0; + desc = &txq->tx_desc_area[tx_index]; + + desc->l4i_chk = 0; + desc->byte_cnt = length; + desc->buf_ptr = dma_map_single(dev->dev.parent, data, + length, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(dev->dev.parent, desc->buf_ptr))) { + WARN(1, "dma_map_single failed!\n"); + return -ENOMEM; + } + + cmd_sts = BUFFER_OWNED_BY_DMA; + if (last_tcp) { + /* last descriptor in the TCP packet */ + cmd_sts |= ZERO_PADDING | TX_LAST_DESC; + /* last descriptor in SKB */ + if (is_last) + cmd_sts |= TX_ENABLE_INTERRUPT; + } + desc->cmd_sts = cmd_sts; + return 0; +} + +static inline void +txq_put_hdr_tso(struct sk_buff *skb, struct tx_queue *txq, int length) +{ + struct mv643xx_eth_private *mp = txq_to_mp(txq); + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + int tx_index; + struct tx_desc *desc; + int ret; + u32 cmd_csum = 0; + u16 l4i_chk = 0; + + tx_index = txq->tx_curr_desc; + desc = &txq->tx_desc_area[tx_index]; + + ret = skb_tx_csum(mp, skb, &l4i_chk, &cmd_csum, length); + if (ret) + WARN(1, "failed to prepare checksum!"); + + /* Should we set this? Can't use the value from skb_tx_csum() + * as it's not the correct initial L4 checksum to use. */ + desc->l4i_chk = 0; + + desc->byte_cnt = hdr_len; + desc->buf_ptr = txq->tso_hdrs_dma + + txq->tx_curr_desc * TSO_HEADER_SIZE; + desc->cmd_sts = cmd_csum | BUFFER_OWNED_BY_DMA | TX_FIRST_DESC | + GEN_CRC; + + txq->tx_curr_desc++; + if (txq->tx_curr_desc == txq->tx_ring_size) + txq->tx_curr_desc = 0; +} + +static int txq_submit_tso(struct tx_queue *txq, struct sk_buff *skb, + struct net_device *dev) +{ + struct mv643xx_eth_private *mp = txq_to_mp(txq); + int total_len, data_left, ret; + int desc_count = 0; + struct tso_t tso; + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + + /* Count needed descriptors */ + if ((txq->tx_desc_count + tso_count_descs(skb)) >= txq->tx_ring_size) { + netdev_dbg(dev, "not enough descriptors for TSO!\n"); + return -EBUSY; + } + + /* Initialize the TSO handler, and prepare the first payload */ + tso_start(skb, &tso); + + total_len = skb->len - hdr_len; + while (total_len > 0) { + char *hdr; + + data_left = min_t(int, skb_shinfo(skb)->gso_size, total_len); + total_len -= data_left; + desc_count++; + + /* prepare packet headers: MAC + IP + TCP */ + hdr = txq->tso_hdrs + txq->tx_curr_desc * TSO_HEADER_SIZE; + tso_build_hdr(skb, hdr, &tso, data_left, total_len == 0); + txq_put_hdr_tso(skb, txq, data_left); + + while (data_left > 0) { + int size; + desc_count++; + + size = min_t(int, tso.size, data_left); + ret = txq_put_data_tso(dev, txq, skb, tso.data, size, + size == data_left, + total_len == 0); + if (ret) + goto err_release; + data_left -= size; + tso_build_data(skb, &tso, size); + } + } + + __skb_queue_tail(&txq->tx_skb, skb); + skb_tx_timestamp(skb); + + /* clear TX_END status */ + mp->work_tx_end &= ~(1 << txq->index); + + /* ensure all descriptors are written before poking hardware */ + wmb(); + txq_enable(txq); + txq->tx_desc_count += desc_count; + return 0; +err_release: + /* TODO: Release all used data descriptors; header descriptors must not + * be DMA-unmapped. + */ + return ret; +} + static void txq_submit_frag_skb(struct tx_queue *txq, struct sk_buff *skb) { struct mv643xx_eth_private *mp = txq_to_mp(txq); @@ -671,8 +869,10 @@ static void txq_submit_frag_skb(struct tx_queue *txq, struct sk_buff *skb) skb_frag_t *this_frag; int tx_index; struct tx_desc *desc; + void *addr; this_frag = &skb_shinfo(skb)->frags[frag]; + addr = page_address(this_frag->page.p) + this_frag->page_offset; tx_index = txq->tx_curr_desc++; if (txq->tx_curr_desc == txq->tx_ring_size) txq->tx_curr_desc = 0; @@ -692,18 +892,11 @@ static void txq_submit_frag_skb(struct tx_queue *txq, struct sk_buff *skb) desc->l4i_chk = 0; desc->byte_cnt = skb_frag_size(this_frag); - desc->buf_ptr = skb_frag_dma_map(mp->dev->dev.parent, - this_frag, 0, - skb_frag_size(this_frag), - DMA_TO_DEVICE); + desc->buf_ptr = dma_map_single(mp->dev->dev.parent, addr, + desc->byte_cnt, DMA_TO_DEVICE); } } -static inline __be16 sum16_as_be(__sum16 sum) -{ - return (__force __be16)sum; -} - static int txq_submit_skb(struct tx_queue *txq, struct sk_buff *skb) { struct mv643xx_eth_private *mp = txq_to_mp(txq); @@ -712,53 +905,17 @@ static int txq_submit_skb(struct tx_queue *txq, struct sk_buff *skb) struct tx_desc *desc; u32 cmd_sts; u16 l4i_chk; - int length; + int length, ret; - cmd_sts = TX_FIRST_DESC | GEN_CRC | BUFFER_OWNED_BY_DMA; + cmd_sts = 0; l4i_chk = 0; - if (skb->ip_summed == CHECKSUM_PARTIAL) { - int hdr_len; - int tag_bytes; - - BUG_ON(skb->protocol != htons(ETH_P_IP) && - skb->protocol != htons(ETH_P_8021Q)); - - hdr_len = (void *)ip_hdr(skb) - (void *)skb->data; - tag_bytes = hdr_len - ETH_HLEN; - if (skb->len - hdr_len > mp->shared->tx_csum_limit || - unlikely(tag_bytes & ~12)) { - if (skb_checksum_help(skb) == 0) - goto no_csum; - dev_kfree_skb_any(skb); - return 1; - } - - if (tag_bytes & 4) - cmd_sts |= MAC_HDR_EXTRA_4_BYTES; - if (tag_bytes & 8) - cmd_sts |= MAC_HDR_EXTRA_8_BYTES; - - cmd_sts |= GEN_TCP_UDP_CHECKSUM | - GEN_IP_V4_CHECKSUM | - ip_hdr(skb)->ihl << TX_IHL_SHIFT; - - switch (ip_hdr(skb)->protocol) { - case IPPROTO_UDP: - cmd_sts |= UDP_FRAME; - l4i_chk = ntohs(sum16_as_be(udp_hdr(skb)->check)); - break; - case IPPROTO_TCP: - l4i_chk = ntohs(sum16_as_be(tcp_hdr(skb)->check)); - break; - default: - BUG(); - } - } else { -no_csum: - /* Errata BTS #50, IHL must be 5 if no HW checksum */ - cmd_sts |= 5 << TX_IHL_SHIFT; + ret = skb_tx_csum(mp, skb, &l4i_chk, &cmd_sts, skb->len); + if (ret) { + dev_kfree_skb_any(skb); + return ret; } + cmd_sts |= TX_FIRST_DESC | GEN_CRC | BUFFER_OWNED_BY_DMA; tx_index = txq->tx_curr_desc++; if (txq->tx_curr_desc == txq->tx_ring_size) @@ -801,7 +958,7 @@ no_csum: static netdev_tx_t mv643xx_eth_xmit(struct sk_buff *skb, struct net_device *dev) { struct mv643xx_eth_private *mp = netdev_priv(dev); - int length, queue; + int length, queue, ret; struct tx_queue *txq; struct netdev_queue *nq; @@ -825,7 +982,11 @@ static netdev_tx_t mv643xx_eth_xmit(struct sk_buff *skb, struct net_device *dev) length = skb->len; - if (!txq_submit_skb(txq, skb)) { + if (skb_is_gso(skb)) + ret = txq_submit_tso(txq, skb, dev); + else + ret = txq_submit_skb(txq, skb); + if (!ret) { int entries_left; txq->tx_bytes += length; @@ -834,6 +995,8 @@ static netdev_tx_t mv643xx_eth_xmit(struct sk_buff *skb, struct net_device *dev) entries_left = txq->tx_ring_size - txq->tx_desc_count; if (entries_left < MAX_SKB_FRAGS + 1) netif_tx_stop_queue(nq); + } else if (ret == -EBUSY) { + return NETDEV_TX_BUSY; } return NETDEV_TX_OK; @@ -907,14 +1070,8 @@ static int txq_reclaim(struct tx_queue *txq, int budget, int force) mp->dev->stats.tx_errors++; } - if (cmd_sts & TX_FIRST_DESC) { - dma_unmap_single(mp->dev->dev.parent, desc->buf_ptr, - desc->byte_cnt, DMA_TO_DEVICE); - } else { - dma_unmap_page(mp->dev->dev.parent, desc->buf_ptr, - desc->byte_cnt, DMA_TO_DEVICE); - } - + dma_unmap_single(mp->dev->dev.parent, desc->buf_ptr, + desc->byte_cnt, DMA_TO_DEVICE); dev_kfree_skb(skb); } @@ -1871,6 +2028,15 @@ static int txq_init(struct mv643xx_eth_private *mp, int index) nexti * sizeof(struct tx_desc); } + /* Allocate DMA buffers for TSO MAC/IP/TCP headers */ + txq->tso_hdrs = dma_alloc_coherent(mp->dev->dev.parent, + txq->tx_ring_size * TSO_HEADER_SIZE, + &txq->tso_hdrs_dma, GFP_KERNEL); + if (txq->tso_hdrs == NULL) { + dma_free_coherent(mp->dev->dev.parent, txq->tx_desc_area_size, + txq->tx_desc_area, txq->tx_desc_dma); + return -ENOMEM; + } skb_queue_head_init(&txq->tx_skb); return 0; @@ -1891,6 +2057,10 @@ static void txq_deinit(struct tx_queue *txq) else dma_free_coherent(mp->dev->dev.parent, txq->tx_desc_area_size, txq->tx_desc_area, txq->tx_desc_dma); + if (txq->tso_hdrs) + dma_free_coherent(mp->dev->dev.parent, + txq->tx_ring_size * TSO_HEADER_SIZE, + txq->tso_hdrs, txq->tso_hdrs_dma); } @@ -2921,9 +3091,11 @@ static int mv643xx_eth_probe(struct platform_device *pdev) dev->watchdog_timeo = 2 * HZ; dev->base_addr = 0; - dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM; - dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM; - dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM; + dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO; + dev->vlan_features = dev->features; + + dev->features |= NETIF_F_RXCSUM; + dev->hw_features = dev->features; dev->priv_flags |= IFF_UNICAST_FLT; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index c7cbaf3b4af..18c698d9ef9 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -23,6 +23,7 @@ #include <net/ip.h> #include <net/ipv6.h> #include <linux/io.h> +#include <net/tso.h> #include <linux/of.h> #include <linux/of_irq.h> #include <linux/of_mdio.h> @@ -244,6 +245,9 @@ #define MVNETA_TX_MTU_MAX 0x3ffff +/* TSO header size */ +#define TSO_HEADER_SIZE 128 + /* Max number of Rx descriptors */ #define MVNETA_MAX_RXD 128 @@ -413,6 +417,12 @@ struct mvneta_tx_queue { /* Index of the next TX DMA descriptor to process */ int next_desc_to_proc; + + /* DMA buffers for TSO headers */ + char *tso_hdrs; + + /* DMA address of TSO headers */ + dma_addr_t tso_hdrs_phys; }; struct mvneta_rx_queue { @@ -1519,6 +1529,126 @@ static int mvneta_rx(struct mvneta_port *pp, int rx_todo, return rx_done; } +static inline void +mvneta_tso_put_hdr(struct sk_buff *skb, + struct mvneta_port *pp, struct mvneta_tx_queue *txq) +{ + struct mvneta_tx_desc *tx_desc; + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + + txq->tx_skb[txq->txq_put_index] = NULL; + tx_desc = mvneta_txq_next_desc_get(txq); + tx_desc->data_size = hdr_len; + tx_desc->command = mvneta_skb_tx_csum(pp, skb); + tx_desc->command |= MVNETA_TXD_F_DESC; + tx_desc->buf_phys_addr = txq->tso_hdrs_phys + + txq->txq_put_index * TSO_HEADER_SIZE; + mvneta_txq_inc_put(txq); +} + +static inline int +mvneta_tso_put_data(struct net_device *dev, struct mvneta_tx_queue *txq, + struct sk_buff *skb, char *data, int size, + bool last_tcp, bool is_last) +{ + struct mvneta_tx_desc *tx_desc; + + tx_desc = mvneta_txq_next_desc_get(txq); + tx_desc->data_size = size; + tx_desc->buf_phys_addr = dma_map_single(dev->dev.parent, data, + size, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(dev->dev.parent, + tx_desc->buf_phys_addr))) { + mvneta_txq_desc_put(txq); + return -ENOMEM; + } + + tx_desc->command = 0; + txq->tx_skb[txq->txq_put_index] = NULL; + + if (last_tcp) { + /* last descriptor in the TCP packet */ + tx_desc->command = MVNETA_TXD_L_DESC; + + /* last descriptor in SKB */ + if (is_last) + txq->tx_skb[txq->txq_put_index] = skb; + } + mvneta_txq_inc_put(txq); + return 0; +} + +static int mvneta_tx_tso(struct sk_buff *skb, struct net_device *dev, + struct mvneta_tx_queue *txq) +{ + int total_len, data_left; + int desc_count = 0; + struct mvneta_port *pp = netdev_priv(dev); + struct tso_t tso; + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + int i; + + /* Count needed descriptors */ + if ((txq->count + tso_count_descs(skb)) >= txq->size) + return 0; + + if (skb_headlen(skb) < (skb_transport_offset(skb) + tcp_hdrlen(skb))) { + pr_info("*** Is this even possible???!?!?\n"); + return 0; + } + + /* Initialize the TSO handler, and prepare the first payload */ + tso_start(skb, &tso); + + total_len = skb->len - hdr_len; + while (total_len > 0) { + char *hdr; + + data_left = min_t(int, skb_shinfo(skb)->gso_size, total_len); + total_len -= data_left; + desc_count++; + + /* prepare packet headers: MAC + IP + TCP */ + hdr = txq->tso_hdrs + txq->txq_put_index * TSO_HEADER_SIZE; + tso_build_hdr(skb, hdr, &tso, data_left, total_len == 0); + + mvneta_tso_put_hdr(skb, pp, txq); + + while (data_left > 0) { + int size; + desc_count++; + + size = min_t(int, tso.size, data_left); + + if (mvneta_tso_put_data(dev, txq, skb, + tso.data, size, + size == data_left, + total_len == 0)) + goto err_release; + data_left -= size; + + tso_build_data(skb, &tso, size); + } + } + + return desc_count; + +err_release: + /* Release all used data descriptors; header descriptors must not + * be DMA-unmapped. + */ + for (i = desc_count - 1; i >= 0; i--) { + struct mvneta_tx_desc *tx_desc = txq->descs + i; + if (!(tx_desc->command & MVNETA_TXD_F_DESC)) + dma_unmap_single(pp->dev->dev.parent, + tx_desc->buf_phys_addr, + tx_desc->data_size, + DMA_TO_DEVICE); + mvneta_txq_desc_put(txq); + } + return 0; +} + /* Handle tx fragmentation processing */ static int mvneta_tx_frag_process(struct mvneta_port *pp, struct sk_buff *skb, struct mvneta_tx_queue *txq) @@ -1584,15 +1714,18 @@ static int mvneta_tx(struct sk_buff *skb, struct net_device *dev) u16 txq_id = skb_get_queue_mapping(skb); struct mvneta_tx_queue *txq = &pp->txqs[txq_id]; struct mvneta_tx_desc *tx_desc; - struct netdev_queue *nq; int frags = 0; u32 tx_cmd; if (!netif_running(dev)) goto out; + if (skb_is_gso(skb)) { + frags = mvneta_tx_tso(skb, dev, txq); + goto out; + } + frags = skb_shinfo(skb)->nr_frags + 1; - nq = netdev_get_tx_queue(dev, txq_id); /* Get a descriptor for the first part of the packet */ tx_desc = mvneta_txq_next_desc_get(txq); @@ -1635,15 +1768,16 @@ static int mvneta_tx(struct sk_buff *skb, struct net_device *dev) } } - txq->count += frags; - mvneta_txq_pend_desc_add(pp, txq, frags); - - if (txq->size - txq->count < MAX_SKB_FRAGS + 1) - netif_tx_stop_queue(nq); - out: if (frags > 0) { struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats); + struct netdev_queue *nq = netdev_get_tx_queue(dev, txq_id); + + txq->count += frags; + mvneta_txq_pend_desc_add(pp, txq, frags); + + if (txq->size - txq->count < MAX_SKB_FRAGS + 1) + netif_tx_stop_queue(nq); u64_stats_update_begin(&stats->syncp); stats->tx_packets++; @@ -2109,6 +2243,18 @@ static int mvneta_txq_init(struct mvneta_port *pp, txq->descs, txq->descs_phys); return -ENOMEM; } + + /* Allocate DMA buffers for TSO MAC/IP/TCP headers */ + txq->tso_hdrs = dma_alloc_coherent(pp->dev->dev.parent, + txq->size * TSO_HEADER_SIZE, + &txq->tso_hdrs_phys, GFP_KERNEL); + if (txq->tso_hdrs == NULL) { + kfree(txq->tx_skb); + dma_free_coherent(pp->dev->dev.parent, + txq->size * MVNETA_DESC_ALIGNED_SIZE, + txq->descs, txq->descs_phys); + return -ENOMEM; + } mvneta_tx_done_pkts_coal_set(pp, txq, txq->done_pkts_coal); return 0; @@ -2120,6 +2266,10 @@ static void mvneta_txq_deinit(struct mvneta_port *pp, { kfree(txq->tx_skb); + if (txq->tso_hdrs) + dma_free_coherent(pp->dev->dev.parent, + txq->size * TSO_HEADER_SIZE, + txq->tso_hdrs, txq->tso_hdrs_phys); if (txq->descs) dma_free_coherent(pp->dev->dev.parent, txq->size * MVNETA_DESC_ALIGNED_SIZE, @@ -2895,9 +3045,9 @@ static int mvneta_probe(struct platform_device *pdev) netif_napi_add(dev, &pp->napi, mvneta_poll, pp->weight); - dev->features = NETIF_F_SG | NETIF_F_IP_CSUM; - dev->hw_features |= NETIF_F_SG | NETIF_F_IP_CSUM; - dev->vlan_features |= NETIF_F_SG | NETIF_F_IP_CSUM; + dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO; + dev->hw_features |= dev->features; + dev->vlan_features |= dev->features; dev->priv_flags |= IFF_UNICAST_FLT; err = register_netdev(dev); diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c index d107bcbb854..7a0deadd53b 100644 --- a/drivers/net/ethernet/neterion/vxge/vxge-main.c +++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c @@ -2122,7 +2122,7 @@ static int vxge_open_vpaths(struct vxgedev *vdev) static void adaptive_coalesce_tx_interrupts(struct vxge_fifo *fifo) { fifo->interrupt_count++; - if (jiffies > fifo->jiffies + HZ / 100) { + if (time_before(fifo->jiffies + HZ / 100, jiffies)) { struct __vxge_hw_fifo *hw_fifo = fifo->handle; fifo->jiffies = jiffies; @@ -2150,7 +2150,7 @@ static void adaptive_coalesce_tx_interrupts(struct vxge_fifo *fifo) static void adaptive_coalesce_rx_interrupts(struct vxge_ring *ring) { ring->interrupt_count++; - if (jiffies > ring->jiffies + HZ / 100) { + if (time_before(ring->jiffies + HZ / 100, jiffies)) { struct __vxge_hw_ring *hw_ring = ring->handle; ring->jiffies = jiffies; diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index e76eae54115..f32d730f55c 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -1865,7 +1865,6 @@ static int davinci_emac_probe(struct platform_device *pdev) struct emac_priv *priv; unsigned long hw_ram_addr; struct emac_platform_data *pdata; - struct device *emac_dev; struct cpdma_params dma_params; struct clk *emac_clk; unsigned long emac_bus_frequency; @@ -1911,7 +1910,6 @@ static int davinci_emac_probe(struct platform_device *pdev) priv->coal_intvl = 0; priv->bus_freq_mhz = (u32)(emac_bus_frequency / 1000000); - emac_dev = &ndev->dev; /* Get EMAC platform data */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); priv->emac_base_phys = res->start + pdata->ctrl_reg_offset; @@ -1930,7 +1928,7 @@ static int davinci_emac_probe(struct platform_device *pdev) hw_ram_addr = (u32 __force)res->start + pdata->ctrl_ram_offset; memset(&dma_params, 0, sizeof(dma_params)); - dma_params.dev = emac_dev; + dma_params.dev = &pdev->dev; dma_params.dmaregs = priv->emac_base; dma_params.rxthresh = priv->emac_base + 0x120; dma_params.rxfree = priv->emac_base + 0x140; @@ -1994,7 +1992,7 @@ static int davinci_emac_probe(struct platform_device *pdev) if (netif_msg_probe(priv)) { - dev_notice(emac_dev, "DaVinci EMAC Probe found device "\ + dev_notice(&pdev->dev, "DaVinci EMAC Probe found device " "(regs: %p, irq: %d)\n", (void *)priv->emac_base_phys, ndev->irq); } diff --git a/drivers/net/ieee802154/fakelb.c b/drivers/net/ieee802154/fakelb.c index b8d22173925..27d83207d24 100644 --- a/drivers/net/ieee802154/fakelb.c +++ b/drivers/net/ieee802154/fakelb.c @@ -26,6 +26,7 @@ #include <linux/timer.h> #include <linux/platform_device.h> #include <linux/netdevice.h> +#include <linux/device.h> #include <linux/spinlock.h> #include <net/mac802154.h> #include <net/wpan-phy.h> @@ -228,7 +229,8 @@ static int fakelb_probe(struct platform_device *pdev) int err = -ENOMEM; int i; - priv = kzalloc(sizeof(struct fakelb_priv), GFP_KERNEL); + priv = devm_kzalloc(&pdev->dev, sizeof(struct fakelb_priv), + GFP_KERNEL); if (!priv) goto err_alloc; @@ -248,7 +250,6 @@ static int fakelb_probe(struct platform_device *pdev) err_slave: list_for_each_entry(dp, &priv->list, list) fakelb_del(dp); - kfree(priv); err_alloc: return err; } @@ -260,7 +261,6 @@ static int fakelb_remove(struct platform_device *pdev) list_for_each_entry_safe(dp, temp, &priv->list, list) fakelb_del(dp); - kfree(priv); return 0; } diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 767fe61b5ac..9a9ce8debef 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -968,7 +968,7 @@ static void team_port_disable(struct team *team, static void __team_compute_features(struct team *team) { struct team_port *port; - u32 vlan_features = TEAM_VLAN_FEATURES; + u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL; unsigned short max_hard_header_len = ETH_HLEN; unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ee328ba101e..98bad1fb1bf 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -498,12 +498,12 @@ static void tun_detach_all(struct net_device *dev) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); - wake_up_all(&tfile->wq.wait); + tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); --tun->numqueues; } list_for_each_entry(tfile, &tun->disabled, next) { - wake_up_all(&tfile->wq.wait); + tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); } BUG_ON(tun->numqueues != 0); @@ -807,8 +807,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); - wake_up_interruptible_poll(&tfile->wq.wait, POLLIN | - POLLRDNORM | POLLRDBAND); + tfile->socket.sk->sk_data_ready(tfile->socket.sk); rcu_read_unlock(); return NETDEV_TX_OK; @@ -965,7 +964,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) tun_debug(KERN_INFO, tun, "tun_chr_poll\n"); - poll_wait(file, &tfile->wq.wait, wait); + poll_wait(file, sk_sleep(sk), wait); if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; @@ -1330,47 +1329,26 @@ done: static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, const struct iovec *iv, ssize_t len, int noblock) { - DECLARE_WAITQUEUE(wait, current); struct sk_buff *skb; ssize_t ret = 0; + int peeked, err, off = 0; tun_debug(KERN_INFO, tun, "tun_do_read\n"); - if (unlikely(!noblock)) - add_wait_queue(&tfile->wq.wait, &wait); - while (len) { - if (unlikely(!noblock)) - current->state = TASK_INTERRUPTIBLE; + if (!len) + return ret; - /* Read frames from the queue */ - if (!(skb = skb_dequeue(&tfile->socket.sk->sk_receive_queue))) { - if (noblock) { - ret = -EAGAIN; - break; - } - if (signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - if (tun->dev->reg_state != NETREG_REGISTERED) { - ret = -EIO; - break; - } - - /* Nothing to read, let's sleep */ - schedule(); - continue; - } + if (tun->dev->reg_state != NETREG_REGISTERED) + return -EIO; + /* Read frames from queue */ + skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0, + &peeked, &off, &err); + if (skb) { ret = tun_put_user(tun, tfile, skb, iv, len); kfree_skb(skb); - break; - } - - if (unlikely(!noblock)) { - current->state = TASK_RUNNING; - remove_wait_queue(&tfile->wq.wait, &wait); - } + } else + ret = err; return ret; } @@ -2199,8 +2177,8 @@ static int tun_chr_open(struct inode *inode, struct file * file) tfile->flags = 0; tfile->ifindex = 0; - rcu_assign_pointer(tfile->socket.wq, &tfile->wq); init_waitqueue_head(&tfile->wq.wait); + RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq); tfile->socket.file = file; tfile->socket.ops = &tun_socket_ops; diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c index 9c34d2fccfa..9c78090e72f 100644 --- a/drivers/net/wimax/i2400m/driver.c +++ b/drivers/net/wimax/i2400m/driver.c @@ -500,26 +500,23 @@ int i2400m_pm_notifier(struct notifier_block *notifier, */ int i2400m_pre_reset(struct i2400m *i2400m) { - int result; struct device *dev = i2400m_dev(i2400m); d_fnstart(3, dev, "(i2400m %p)\n", i2400m); d_printf(1, dev, "pre-reset shut down\n"); - result = 0; mutex_lock(&i2400m->init_mutex); if (i2400m->updown) { netif_tx_disable(i2400m->wimax_dev.net_dev); __i2400m_dev_stop(i2400m); - result = 0; /* down't set updown to zero -- this way * post_reset can restore properly */ } mutex_unlock(&i2400m->init_mutex); if (i2400m->bus_release) i2400m->bus_release(i2400m); - d_fnend(3, dev, "(i2400m %p) = %d\n", i2400m, result); - return result; + d_fnend(3, dev, "(i2400m %p) = 0\n", i2400m); + return 0; } EXPORT_SYMBOL_GPL(i2400m_pre_reset); diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 1def0bb5cb3..4c1e01ed16d 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -246,44 +246,6 @@ struct phy_device *of_phy_connect(struct net_device *dev, EXPORT_SYMBOL(of_phy_connect); /** - * of_phy_connect_fixed_link - Parse fixed-link property and return a dummy phy - * @dev: pointer to net_device claiming the phy - * @hndlr: Link state callback for the network device - * @iface: PHY data interface type - * - * This function is a temporary stop-gap and will be removed soon. It is - * only to support the fs_enet, ucc_geth and gianfar Ethernet drivers. Do - * not call this function from new drivers. - */ -struct phy_device *of_phy_connect_fixed_link(struct net_device *dev, - void (*hndlr)(struct net_device *), - phy_interface_t iface) -{ - struct device_node *net_np; - char bus_id[MII_BUS_ID_SIZE + 3]; - struct phy_device *phy; - const __be32 *phy_id; - int sz; - - if (!dev->dev.parent) - return NULL; - - net_np = dev->dev.parent->of_node; - if (!net_np) - return NULL; - - phy_id = of_get_property(net_np, "fixed-link", &sz); - if (!phy_id || sz < sizeof(*phy_id)) - return NULL; - - sprintf(bus_id, PHY_ID_FMT, "fixed-0", be32_to_cpu(phy_id[0])); - - phy = phy_connect(dev, bus_id, hndlr, iface); - return IS_ERR(phy) ? NULL : phy; -} -EXPORT_SYMBOL(of_phy_connect_fixed_link); - -/** * of_phy_attach - Attach to a PHY without starting the state machine * @dev: pointer to net_device claiming the phy * @phy_np: Node pointer for the PHY diff --git a/include/linux/filter.h b/include/linux/filter.h index 9d5ae0a2c95..7977b3958e2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -184,10 +184,8 @@ static inline unsigned int sk_filter_size(unsigned int proglen) int sk_filter(struct sock *sk, struct sk_buff *skb); -u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx, - const struct sock_filter_int *insni); -u32 sk_run_filter_int_skb(const struct sk_buff *ctx, - const struct sock_filter_int *insni); +void sk_filter_select_runtime(struct sk_filter *fp); +void sk_filter_free(struct sk_filter *fp); int sk_convert_filter(struct sock_filter *prog, int len, struct sock_filter_int *new_prog, int *new_len); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2dea98cbbdb..f4ad247fd32 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3153,6 +3153,20 @@ const char *netdev_drivername(const struct net_device *dev); void linkwatch_run_queue(void); +static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, + netdev_features_t f2) +{ + if (f1 & NETIF_F_GEN_CSUM) + f1 |= (NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + if (f2 & NETIF_F_GEN_CSUM) + f2 |= (NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + f1 &= f2; + if (f1 & NETIF_F_GEN_CSUM) + f1 &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + + return f1; +} + static inline netdev_features_t netdev_get_wanted_features( struct net_device *dev) { diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 0aa367e316c..d449018d072 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -22,9 +22,6 @@ extern struct phy_device *of_phy_connect(struct net_device *dev, struct phy_device *of_phy_attach(struct net_device *dev, struct device_node *phy_np, u32 flags, phy_interface_t iface); -extern struct phy_device *of_phy_connect_fixed_link(struct net_device *dev, - void (*hndlr)(struct net_device *), - phy_interface_t iface); extern struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np); @@ -59,13 +56,6 @@ static inline struct phy_device *of_phy_attach(struct net_device *dev, return NULL; } -static inline struct phy_device *of_phy_connect_fixed_link(struct net_device *dev, - void (*hndlr)(struct net_device *), - phy_interface_t iface) -{ - return NULL; -} - static inline struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np) { return NULL; diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bc35e4709e8..a0513210798 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -197,7 +197,8 @@ struct tcp_sock { u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ - syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ + syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ + is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ @@ -209,6 +210,8 @@ struct tcp_sock { u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ + u32 max_packets_out; /* max packets_out in last window */ + u32 max_packets_seq; /* right edge of max_packets_out flight */ u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. */ @@ -230,7 +233,6 @@ struct tcp_sock { u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; - u32 lsnd_pending; /* packets inflight or unsent since last xmit */ u32 prior_cwnd; /* Congestion window at start of Recovery. */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e6bc14d8fa9..7ee6ce6564a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -72,21 +72,23 @@ static inline void nft_data_debug(const struct nft_data *data) * struct nft_ctx - nf_tables rule/set context * * @net: net namespace - * @skb: netlink skb - * @nlh: netlink message header * @afi: address family info * @table: the table the chain is contained in * @chain: the chain the rule is contained in * @nla: netlink attributes + * @portid: netlink portID of the original message + * @seq: netlink sequence number + * @report: notify via unicast netlink message */ struct nft_ctx { struct net *net; - const struct sk_buff *skb; - const struct nlmsghdr *nlh; - const struct nft_af_info *afi; - const struct nft_table *table; - const struct nft_chain *chain; + struct nft_af_info *afi; + struct nft_table *table; + struct nft_chain *chain; const struct nlattr * const *nla; + u32 portid; + u32 seq; + bool report; }; struct nft_data_desc { @@ -146,6 +148,44 @@ struct nft_set_iter { }; /** + * struct nft_set_desc - description of set elements + * + * @klen: key length + * @dlen: data length + * @size: number of set elements + */ +struct nft_set_desc { + unsigned int klen; + unsigned int dlen; + unsigned int size; +}; + +/** + * enum nft_set_class - performance class + * + * @NFT_LOOKUP_O_1: constant, O(1) + * @NFT_LOOKUP_O_LOG_N: logarithmic, O(log N) + * @NFT_LOOKUP_O_N: linear, O(N) + */ +enum nft_set_class { + NFT_SET_CLASS_O_1, + NFT_SET_CLASS_O_LOG_N, + NFT_SET_CLASS_O_N, +}; + +/** + * struct nft_set_estimate - estimation of memory and performance + * characteristics + * + * @size: required memory + * @class: lookup performance class + */ +struct nft_set_estimate { + unsigned int size; + enum nft_set_class class; +}; + +/** * struct nft_set_ops - nf_tables set operations * * @lookup: look up an element within the set @@ -174,7 +214,11 @@ struct nft_set_ops { struct nft_set_iter *iter); unsigned int (*privsize)(const struct nlattr * const nla[]); + bool (*estimate)(const struct nft_set_desc *desc, + u32 features, + struct nft_set_estimate *est); int (*init)(const struct nft_set *set, + const struct nft_set_desc *desc, const struct nlattr * const nla[]); void (*destroy)(const struct nft_set *set); @@ -194,6 +238,8 @@ void nft_unregister_set(struct nft_set_ops *ops); * @name: name of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) + * @size: maximum set size + * @nelems: number of elements * @ops: set ops * @flags: set flags * @klen: key length @@ -206,6 +252,8 @@ struct nft_set { char name[IFNAMSIZ]; u32 ktype; u32 dtype; + u32 size; + u32 nelems; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags; @@ -222,6 +270,8 @@ static inline void *nft_set_priv(const struct nft_set *set) struct nft_set *nf_tables_set_lookup(const struct nft_table *table, const struct nlattr *nla); +struct nft_set *nf_tables_set_lookup_byid(const struct net *net, + const struct nlattr *nla); /** * struct nft_set_binding - nf_tables set binding @@ -341,18 +391,75 @@ struct nft_rule { }; /** - * struct nft_rule_trans - nf_tables rule update in transaction + * struct nft_trans - nf_tables object update in transaction * + * @rcu_head: rcu head to defer release of transaction data * @list: used internally - * @ctx: rule context - * @rule: rule that needs to be updated + * @msg_type: message type + * @ctx: transaction context + * @data: internal information related to the transaction */ -struct nft_rule_trans { +struct nft_trans { + struct rcu_head rcu_head; struct list_head list; + int msg_type; struct nft_ctx ctx; + char data[0]; +}; + +struct nft_trans_rule { struct nft_rule *rule; }; +#define nft_trans_rule(trans) \ + (((struct nft_trans_rule *)trans->data)->rule) + +struct nft_trans_set { + struct nft_set *set; + u32 set_id; +}; + +#define nft_trans_set(trans) \ + (((struct nft_trans_set *)trans->data)->set) +#define nft_trans_set_id(trans) \ + (((struct nft_trans_set *)trans->data)->set_id) + +struct nft_trans_chain { + bool update; + char name[NFT_CHAIN_MAXNAMELEN]; + struct nft_stats __percpu *stats; + u8 policy; +}; + +#define nft_trans_chain_update(trans) \ + (((struct nft_trans_chain *)trans->data)->update) +#define nft_trans_chain_name(trans) \ + (((struct nft_trans_chain *)trans->data)->name) +#define nft_trans_chain_stats(trans) \ + (((struct nft_trans_chain *)trans->data)->stats) +#define nft_trans_chain_policy(trans) \ + (((struct nft_trans_chain *)trans->data)->policy) + +struct nft_trans_table { + bool update; + bool enable; +}; + +#define nft_trans_table_update(trans) \ + (((struct nft_trans_table *)trans->data)->update) +#define nft_trans_table_enable(trans) \ + (((struct nft_trans_table *)trans->data)->enable) + +struct nft_trans_elem { + struct nft_set *set; + struct nft_set_elem elem; +}; + +#define nft_trans_elem_set(trans) \ + (((struct nft_trans_elem *)trans->data)->set) +#define nft_trans_elem(trans) \ + (((struct nft_trans_elem *)trans->data)->elem) + static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[0]; @@ -385,6 +492,7 @@ static inline void *nft_userdata(const struct nft_rule *rule) enum nft_chain_flags { NFT_BASE_CHAIN = 0x1, + NFT_CHAIN_INACTIVE = 0x2, }; /** diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h new file mode 100644 index 00000000000..0ee47c3e2e3 --- /dev/null +++ b/include/net/netfilter/nft_meta.h @@ -0,0 +1,36 @@ +#ifndef _NFT_META_H_ +#define _NFT_META_H_ + +struct nft_meta { + enum nft_meta_keys key:8; + union { + enum nft_registers dreg:8; + enum nft_registers sreg:8; + }; +}; + +extern const struct nla_policy nft_meta_policy[]; + +int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]); + +int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]); + +int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr); + +int nft_meta_set_dump(struct sk_buff *skb, + const struct nft_expr *expr); + +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt); + +void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt); + +#endif diff --git a/include/net/tcp.h b/include/net/tcp.h index f5d6ca4a9d2..e80abe4486c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -971,8 +971,9 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) /* We follow the spirit of RFC2861 to validate cwnd but implement a more * flexible approach. The RFC suggests cwnd should not be raised unless - * it was fully used previously. But we allow cwnd to grow as long as the - * application has used half the cwnd. + * it was fully used previously. And that's exactly what we do in + * congestion avoidance mode. But in slow start we allow cwnd to grow + * as long as the application has used half the cwnd. * Example : * cwnd is 10 (IW10), but application sends 9 frames. * We allow cwnd to reach 18 when all frames are ACKed. @@ -985,7 +986,11 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_cwnd < 2 * tp->lsnd_pending; + /* If in slow start, ensure cwnd grows to twice what was ACKed. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + return tp->snd_cwnd < 2 * tp->max_packets_out; + + return tp->is_cwnd_limited; } static inline void tcp_check_probe_timer(struct sock *sk) diff --git a/include/net/tso.h b/include/net/tso.h new file mode 100644 index 00000000000..47e5444f7d1 --- /dev/null +++ b/include/net/tso.h @@ -0,0 +1,20 @@ +#ifndef _TSO_H +#define _TSO_H + +#include <net/ip.h> + +struct tso_t { + int next_frag_idx; + void *data; + size_t size; + u16 ip_id; + u32 tcp_seq; +}; + +int tso_count_descs(struct sk_buff *skb); +void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, + int size, bool is_last); +void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size); +void tso_start(struct sk_buff *skb, struct tso_t *tso); + +#endif /* _TSO_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c88ccbfda5f..2a88f645a5d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -212,6 +212,29 @@ enum nft_set_flags { }; /** + * enum nft_set_policies - set selection policy + * + * @NFT_SET_POL_PERFORMANCE: prefer high performance over low memory use + * @NFT_SET_POL_MEMORY: prefer low memory use over high performance + */ +enum nft_set_policies { + NFT_SET_POL_PERFORMANCE, + NFT_SET_POL_MEMORY, +}; + +/** + * enum nft_set_desc_attributes - set element description + * + * @NFTA_SET_DESC_SIZE: number of elements in set (NLA_U32) + */ +enum nft_set_desc_attributes { + NFTA_SET_DESC_UNSPEC, + NFTA_SET_DESC_SIZE, + __NFTA_SET_DESC_MAX +}; +#define NFTA_SET_DESC_MAX (__NFTA_SET_DESC_MAX - 1) + +/** * enum nft_set_attributes - nf_tables set netlink attributes * * @NFTA_SET_TABLE: table name (NLA_STRING) @@ -221,6 +244,9 @@ enum nft_set_flags { * @NFTA_SET_KEY_LEN: key data length (NLA_U32) * @NFTA_SET_DATA_TYPE: mapping data type (NLA_U32) * @NFTA_SET_DATA_LEN: mapping data length (NLA_U32) + * @NFTA_SET_POLICY: selection policy (NLA_U32) + * @NFTA_SET_DESC: set description (NLA_NESTED) + * @NFTA_SET_ID: uniquely identifies a set in a transaction (NLA_U32) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -231,6 +257,9 @@ enum nft_set_attributes { NFTA_SET_KEY_LEN, NFTA_SET_DATA_TYPE, NFTA_SET_DATA_LEN, + NFTA_SET_POLICY, + NFTA_SET_DESC, + NFTA_SET_ID, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) @@ -266,12 +295,14 @@ enum nft_set_elem_attributes { * @NFTA_SET_ELEM_LIST_TABLE: table of the set to be changed (NLA_STRING) * @NFTA_SET_ELEM_LIST_SET: name of the set to be changed (NLA_STRING) * @NFTA_SET_ELEM_LIST_ELEMENTS: list of set elements (NLA_NESTED: nft_set_elem_attributes) + * @NFTA_SET_ELEM_LIST_SET_ID: uniquely identifies a set in a transaction (NLA_U32) */ enum nft_set_elem_list_attributes { NFTA_SET_ELEM_LIST_UNSPEC, NFTA_SET_ELEM_LIST_TABLE, NFTA_SET_ELEM_LIST_SET, NFTA_SET_ELEM_LIST_ELEMENTS, + NFTA_SET_ELEM_LIST_SET_ID, __NFTA_SET_ELEM_LIST_MAX }; #define NFTA_SET_ELEM_LIST_MAX (__NFTA_SET_ELEM_LIST_MAX - 1) @@ -457,12 +488,14 @@ enum nft_cmp_attributes { * @NFTA_LOOKUP_SET: name of the set where to look for (NLA_STRING) * @NFTA_LOOKUP_SREG: source register of the data to look for (NLA_U32: nft_registers) * @NFTA_LOOKUP_DREG: destination register (NLA_U32: nft_registers) + * @NFTA_LOOKUP_SET_ID: uniquely identifies a set in a transaction (NLA_U32) */ enum nft_lookup_attributes { NFTA_LOOKUP_UNSPEC, NFTA_LOOKUP_SET, NFTA_LOOKUP_SREG, NFTA_LOOKUP_DREG, + NFTA_LOOKUP_SET_ID, __NFTA_LOOKUP_MAX }; #define NFTA_LOOKUP_MAX (__NFTA_LOOKUP_MAX - 1) @@ -536,6 +569,8 @@ enum nft_exthdr_attributes { * @NFT_META_SECMARK: packet secmark (skb->secmark) * @NFT_META_NFPROTO: netfilter protocol * @NFT_META_L4PROTO: layer 4 protocol number + * @NFT_META_BRI_IIFNAME: packet input bridge interface name + * @NFT_META_BRI_OIFNAME: packet output bridge interface name */ enum nft_meta_keys { NFT_META_LEN, @@ -555,6 +590,8 @@ enum nft_meta_keys { NFT_META_SECMARK, NFT_META_NFPROTO, NFT_META_L4PROTO, + NFT_META_BRI_IIFNAME, + NFT_META_BRI_OIFNAME, }; /** diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7e02d624cc5..1036b6f2fde 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -273,10 +273,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) atomic_set(&filter->usage, 1); filter->prog->len = new_len; - filter->prog->bpf_func = (void *)sk_run_filter_int_seccomp; - /* JIT internal BPF into native HW instructions */ - bpf_int_jit_compile(filter->prog); + sk_filter_select_runtime(filter->prog); /* * If there is an existing filter, make it the prev and don't drop its @@ -340,7 +338,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; - bpf_jit_free(freeme->prog); + sk_filter_free(freeme->prog); kfree(freeme); } } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 3603ebcd5d6..e160934430e 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1489,7 +1489,7 @@ static __init int test_bpf(void) memcpy(fp_ext->insns, tests[i].insns_int, fprog.len * 8); fp->len = fprog.len; - fp->bpf_func = sk_run_filter_int_skb; + sk_filter_select_runtime(fp); } else { err = sk_unattached_filter_create(&fp, &fprog); if (tests[i].data_type == EXPECTED_FAIL) { @@ -1516,7 +1516,7 @@ static __init int test_bpf(void) if (tests[i].data_type != SKB_INT) sk_unattached_filter_destroy(fp); else - kfree(fp); + sk_filter_free(fp); if (err) { pr_cont("FAIL %d\n", err); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 8f025afa29f..4181fb71ba7 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -678,9 +678,9 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev, struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; netdev_features_t old_features = features; - features &= real_dev->vlan_features; + features = netdev_intersect_features(features, real_dev->vlan_features); features |= NETIF_F_RXCSUM; - features &= real_dev->features; + features = netdev_intersect_features(features, real_dev->features); features |= old_features & NETIF_F_SOFT_FEATURES; features |= NETIF_F_LLTX; diff --git a/net/bridge/Makefile b/net/bridge/Makefile index e85498b2f16..29b6e2a8ca9 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ - br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \ + br_ioctl.o br_stp.o br_stp_bpdu.o \ br_stp_if.o br_stp_timer.o br_netlink.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o @@ -16,4 +16,4 @@ bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o -obj-$(CONFIG_BRIDGE_NF_EBTABLES) += netfilter/ +obj-$(CONFIG_BRIDGE_NETFILTER) += netfilter/ diff --git a/net/bridge/br.c b/net/bridge/br.c index 19311aafcf5..1a755a1e541 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -22,6 +22,104 @@ #include "br_private.h" +/* + * Handle changes in state of network devices enslaved to a bridge. + * + * Note: don't care about up/down if bridge itself is down, because + * port state is checked when bridge is brought up. + */ +static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net_bridge_port *p; + struct net_bridge *br; + bool changed_addr; + int err; + + /* register of bridge completed, add sysfs entries */ + if ((dev->priv_flags & IFF_EBRIDGE) && event == NETDEV_REGISTER) { + br_sysfs_addbr(dev); + return NOTIFY_DONE; + } + + /* not a port of a bridge */ + p = br_port_get_rtnl(dev); + if (!p) + return NOTIFY_DONE; + + br = p->br; + + switch (event) { + case NETDEV_CHANGEMTU: + dev_set_mtu(br->dev, br_min_mtu(br)); + break; + + case NETDEV_CHANGEADDR: + spin_lock_bh(&br->lock); + br_fdb_changeaddr(p, dev->dev_addr); + changed_addr = br_stp_recalculate_bridge_id(br); + spin_unlock_bh(&br->lock); + + if (changed_addr) + call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); + + break; + + case NETDEV_CHANGE: + br_port_carrier_check(p); + break; + + case NETDEV_FEAT_CHANGE: + netdev_update_features(br->dev); + break; + + case NETDEV_DOWN: + spin_lock_bh(&br->lock); + if (br->dev->flags & IFF_UP) + br_stp_disable_port(p); + spin_unlock_bh(&br->lock); + break; + + case NETDEV_UP: + if (netif_running(br->dev) && netif_oper_up(dev)) { + spin_lock_bh(&br->lock); + br_stp_enable_port(p); + spin_unlock_bh(&br->lock); + } + break; + + case NETDEV_UNREGISTER: + br_del_if(br, dev); + break; + + case NETDEV_CHANGENAME: + err = br_sysfs_renameif(p); + if (err) + return notifier_from_errno(err); + break; + + case NETDEV_PRE_TYPE_CHANGE: + /* Forbid underlaying device to change its type. */ + return NOTIFY_BAD; + + case NETDEV_RESEND_IGMP: + /* Propagate to master device */ + call_netdevice_notifiers(event, br->dev); + break; + } + + /* Events that may cause spanning tree to refresh */ + if (event == NETDEV_CHANGEADDR || event == NETDEV_UP || + event == NETDEV_CHANGE || event == NETDEV_DOWN) + br_ifinfo_notify(RTM_NEWLINK, p); + + return NOTIFY_DONE; +} + +static struct notifier_block br_device_notifier = { + .notifier_call = br_device_event +}; + static void __net_exit br_net_exit(struct net *net) { struct net_device *dev; diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c deleted file mode 100644 index 2998dd1769a..00000000000 --- a/net/bridge/br_notify.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Device event handling - * Linux ethernet bridge - * - * Authors: - * Lennert Buytenhek <buytenh@gnu.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/rtnetlink.h> -#include <net/net_namespace.h> - -#include "br_private.h" - -static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr); - -struct notifier_block br_device_notifier = { - .notifier_call = br_device_event -}; - -/* - * Handle changes in state of network devices enslaved to a bridge. - * - * Note: don't care about up/down if bridge itself is down, because - * port state is checked when bridge is brought up. - */ -static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net_bridge_port *p; - struct net_bridge *br; - bool changed_addr; - int err; - - /* register of bridge completed, add sysfs entries */ - if ((dev->priv_flags & IFF_EBRIDGE) && event == NETDEV_REGISTER) { - br_sysfs_addbr(dev); - return NOTIFY_DONE; - } - - /* not a port of a bridge */ - p = br_port_get_rtnl(dev); - if (!p) - return NOTIFY_DONE; - - br = p->br; - - switch (event) { - case NETDEV_CHANGEMTU: - dev_set_mtu(br->dev, br_min_mtu(br)); - break; - - case NETDEV_CHANGEADDR: - spin_lock_bh(&br->lock); - br_fdb_changeaddr(p, dev->dev_addr); - changed_addr = br_stp_recalculate_bridge_id(br); - spin_unlock_bh(&br->lock); - - if (changed_addr) - call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); - - break; - - case NETDEV_CHANGE: - br_port_carrier_check(p); - break; - - case NETDEV_FEAT_CHANGE: - netdev_update_features(br->dev); - break; - - case NETDEV_DOWN: - spin_lock_bh(&br->lock); - if (br->dev->flags & IFF_UP) - br_stp_disable_port(p); - spin_unlock_bh(&br->lock); - break; - - case NETDEV_UP: - if (netif_running(br->dev) && netif_oper_up(dev)) { - spin_lock_bh(&br->lock); - br_stp_enable_port(p); - spin_unlock_bh(&br->lock); - } - break; - - case NETDEV_UNREGISTER: - br_del_if(br, dev); - break; - - case NETDEV_CHANGENAME: - err = br_sysfs_renameif(p); - if (err) - return notifier_from_errno(err); - break; - - case NETDEV_PRE_TYPE_CHANGE: - /* Forbid underlaying device to change its type. */ - return NOTIFY_BAD; - - case NETDEV_RESEND_IGMP: - /* Propagate to master device */ - call_netdevice_notifiers(event, br->dev); - break; - } - - /* Events that may cause spanning tree to refresh */ - if (event == NETDEV_CHANGEADDR || event == NETDEV_UP || - event == NETDEV_CHANGE || event == NETDEV_DOWN) - br_ifinfo_notify(RTM_NEWLINK, p); - - return NOTIFY_DONE; -} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index af067711574..53d6e32965f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -333,8 +333,6 @@ struct br_input_skb_cb { #define br_debug(br, format, args...) \ pr_debug("%s: " format, (br)->dev->name, ##args) -extern struct notifier_block br_device_notifier; - /* called under bridge lock */ static inline int br_is_root_bridge(const struct net_bridge *br) { diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 5ca74a0e595..3baf29d34e6 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -2,13 +2,25 @@ # Bridge netfilter configuration # # -config NF_TABLES_BRIDGE +menuconfig NF_TABLES_BRIDGE depends on NF_TABLES + select BRIDGE_NETFILTER tristate "Ethernet Bridge nf_tables support" +if NF_TABLES_BRIDGE + +config NFT_BRIDGE_META + tristate "Netfilter nf_table bridge meta support" + depends on NFT_META + help + Add support for bridge dedicated meta key. + +endif # NF_TABLES_BRIDGE + menuconfig BRIDGE_NF_EBTABLES tristate "Ethernet Bridge tables (ebtables) support" depends on BRIDGE && NETFILTER + select BRIDGE_NETFILTER select NETFILTER_XTABLES help ebtables is a general, extensible frame/packet identification diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index ea7629f58b3..6f2f3943d66 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -3,6 +3,7 @@ # obj-$(CONFIG_NF_TABLES_BRIDGE) += nf_tables_bridge.o +obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c new file mode 100644 index 00000000000..4f02109d708 --- /dev/null +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2014 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> + +#include "../br_private.h" + +static void nft_meta_bridge_get_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + const struct net_device *in = pkt->in, *out = pkt->out; + struct nft_data *dest = &data[priv->dreg]; + const struct net_bridge_port *p; + + switch (priv->key) { + case NFT_META_BRI_IIFNAME: + if (in == NULL || (p = br_port_get_rcu(in)) == NULL) + goto err; + break; + case NFT_META_BRI_OIFNAME: + if (out == NULL || (p = br_port_get_rcu(out)) == NULL) + goto err; + break; + default: + goto out; + } + + strncpy((char *)dest->data, p->br->dev->name, sizeof(dest->data)); + return; +out: + return nft_meta_get_eval(expr, data, pkt); +err: + data[NFT_REG_VERDICT].verdict = NFT_BREAK; +} + +static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + int err; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_BRI_IIFNAME: + case NFT_META_BRI_OIFNAME: + break; + default: + return nft_meta_get_init(ctx, expr, tb); + } + + priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + return 0; +} + +static struct nft_expr_type nft_meta_bridge_type; +static const struct nft_expr_ops nft_meta_bridge_get_ops = { + .type = &nft_meta_bridge_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_bridge_get_eval, + .init = nft_meta_bridge_get_init, + .dump = nft_meta_get_dump, +}; + +static const struct nft_expr_ops nft_meta_bridge_set_ops = { + .type = &nft_meta_bridge_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_set_eval, + .init = nft_meta_set_init, + .dump = nft_meta_set_dump, +}; + +static const struct nft_expr_ops * +nft_meta_bridge_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_META_KEY] == NULL) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG]) + return &nft_meta_bridge_get_ops; + + if (tb[NFTA_META_SREG]) + return &nft_meta_bridge_set_ops; + + return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_meta_bridge_type __read_mostly = { + .family = NFPROTO_BRIDGE, + .name = "meta", + .select_ops = &nft_meta_bridge_select_ops, + .policy = nft_meta_policy, + .maxattr = NFTA_META_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_meta_bridge_module_init(void) +{ + return nft_register_expr(&nft_meta_bridge_type); +} + +static void __exit nft_meta_bridge_module_exit(void) +{ + nft_unregister_expr(&nft_meta_bridge_type); +} + +module_init(nft_meta_bridge_module_init); +module_exit(nft_meta_bridge_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); diff --git a/net/core/Makefile b/net/core/Makefile index 826b925aa45..71093d94ad2 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o dev_ioctl.o + sock_diag.o dev_ioctl.o tso.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o diff --git a/net/core/filter.c b/net/core/filter.c index 32c5b44c537..7067cb240d3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -153,7 +153,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) * keep, 0 for none. @ctx is the data we are operating on, @insn is the * array of filter instructions. */ -unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) +static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) { u64 stack[MAX_BPF_STACK / sizeof(u64)]; u64 regs[MAX_BPF_REG], tmp; @@ -571,15 +571,6 @@ load_byte: return 0; } -u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx, - const struct sock_filter_int *insni) - __attribute__ ((alias ("__sk_run_filter"))); - -u32 sk_run_filter_int_skb(const struct sk_buff *ctx, - const struct sock_filter_int *insni) - __attribute__ ((alias ("__sk_run_filter"))); -EXPORT_SYMBOL_GPL(sk_run_filter_int_skb); - /* Helper to find the offset of pkt_type in sk_buff structure. We want * to make sure its still a 3bit field starting at a byte boundary; * taken from arch/x86/net/bpf_jit_comp.c. @@ -1397,7 +1388,7 @@ static void sk_filter_release_rcu(struct rcu_head *rcu) struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); sk_release_orig_filter(fp); - bpf_jit_free(fp); + sk_filter_free(fp); } /** @@ -1497,7 +1488,6 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, goto out_err_free; } - fp->bpf_func = sk_run_filter_int_skb; fp->len = new_len; /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ @@ -1510,6 +1500,8 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, */ goto out_err_free; + sk_filter_select_runtime(fp); + kfree(old_prog); return fp; @@ -1528,6 +1520,29 @@ void __weak bpf_int_jit_compile(struct sk_filter *prog) { } +/** + * sk_filter_select_runtime - select execution runtime for BPF program + * @fp: sk_filter populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via SK_RUN_FILTER() macro + */ +void sk_filter_select_runtime(struct sk_filter *fp) +{ + fp->bpf_func = (void *) __sk_run_filter; + + /* Probe if internal BPF can be JITed */ + bpf_int_jit_compile(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_select_runtime); + +/* free internal BPF program */ +void sk_filter_free(struct sk_filter *fp) +{ + bpf_jit_free(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_free); + static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, struct sock *sk) { @@ -1548,12 +1563,9 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, /* JIT compiler couldn't process this filter, so do the * internal BPF translation for the optimized interpreter. */ - if (!fp->jited) { + if (!fp->jited) fp = __sk_migrate_filter(fp, sk); - /* Probe if internal BPF can be jit-ed */ - bpf_int_jit_compile(fp); - } return fp; } diff --git a/net/core/tso.c b/net/core/tso.c new file mode 100644 index 00000000000..097821dd3a8 --- /dev/null +++ b/net/core/tso.c @@ -0,0 +1,72 @@ +#include <net/ip.h> +#include <net/tso.h> + +/* Calculate expected number of TX descriptors */ +int tso_count_descs(struct sk_buff *skb) +{ + /* The Marvell Way */ + return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; +} + +void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso, + int size, bool is_last) +{ + struct iphdr *iph; + struct tcphdr *tcph; + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + int mac_hdr_len = skb_network_offset(skb); + + memcpy(hdr, skb->data, hdr_len); + iph = (struct iphdr *)(hdr + mac_hdr_len); + iph->id = htons(tso->ip_id); + iph->tot_len = htons(size + hdr_len - mac_hdr_len); + tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb)); + tcph->seq = htonl(tso->tcp_seq); + tso->ip_id++; + + if (!is_last) { + /* Clear all special flags for not last packet */ + tcph->psh = 0; + tcph->fin = 0; + tcph->rst = 0; + } +} + +void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size) +{ + tso->tcp_seq += size; + tso->size -= size; + tso->data += size; + + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = frag->size; + tso->data = page_address(frag->page.p) + frag->page_offset; + tso->next_frag_idx++; + } +} + +void tso_start(struct sk_buff *skb, struct tso_t *tso) +{ + int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + + tso->ip_id = ntohs(ip_hdr(skb)->id); + tso->tcp_seq = ntohl(tcp_hdr(skb)->seq); + tso->next_frag_idx = 0; + + /* Build first data */ + tso->size = skb_headlen(skb) - hdr_len; + tso->data = skb->data + hdr_len; + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = frag->size; + tso->data = page_address(frag->page.p) + frag->page_offset; + tso->next_frag_idx++; + } +} diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 16f0b223102..1cd46a345cb 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -280,7 +280,7 @@ static ktime_t dccp_timestamp_seed; */ u32 dccp_timestamp(void) { - s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); + u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); do_div(delta, 10); return delta; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 812b1835146..4bc508f0db9 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -486,4 +486,5 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3d61c52bdf7..d463c35db33 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1402,11 +1402,19 @@ static void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs) +static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); - tp->lsnd_pending = tp->packets_out + unsent_segs; + /* Track the maximum number of outstanding packets in each + * window, and remember whether we were cwnd-limited then. + */ + if (!before(tp->snd_una, tp->max_packets_seq) || + tp->packets_out > tp->max_packets_out) { + tp->max_packets_out = tp->packets_out; + tp->max_packets_seq = tp->snd_nxt; + tp->is_cwnd_limited = is_cwnd_limited; + } if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ @@ -1660,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * * This algorithm is from John Heffner. */ -static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, + bool *is_cwnd_limited) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1724,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) if (!tp->tso_deferred) tp->tso_deferred = 1 | (jiffies << 1); + if (cong_win < send_win && cong_win < skb->len) + *is_cwnd_limited = true; + return true; send_now: @@ -1881,9 +1893,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int tso_segs, sent_pkts, unsent_segs = 0; + unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; + bool is_cwnd_limited = false; sent_pkts = 0; @@ -1908,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { + is_cwnd_limited = true; if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; @@ -1924,8 +1938,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle : TCP_NAGLE_PUSH)))) break; } else { - if (!push_one && tcp_tso_should_defer(sk, skb)) - goto compute_unsent_segs; + if (!push_one && + tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) + break; } /* TCP Small Queues : @@ -1950,14 +1965,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, * there is no smp_mb__after_set_bit() yet */ smp_mb__after_clear_bit(); - if (atomic_read(&sk->sk_wmem_alloc) > limit) { - u32 unsent_bytes; - -compute_unsent_segs: - unsent_bytes = tp->write_seq - tp->snd_nxt; - unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now); + if (atomic_read(&sk->sk_wmem_alloc) > limit) break; - } } limit = mss_now; @@ -1997,7 +2006,7 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk); - tcp_cwnd_validate(sk, unsent_segs); + tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b05b609f69d..fe61545dde7 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -61,6 +61,7 @@ MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #ifdef IP6_TNL_DEBUG diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f0a8ff9ed89..aa883afa652 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1455,7 +1455,7 @@ static int ip6_dst_gc(struct dst_ops *ops) goto out; net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, entries > rt_max_size); + fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e5a453ca302..f4380041f5e 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1828,4 +1828,5 @@ xfrm_tunnel_failed: module_init(sit_init); module_exit(sit_cleanup); MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("sit"); MODULE_ALIAS_NETDEV("sit0"); diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index a83674edaaf..e4a25589ec1 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -207,6 +207,8 @@ static bool llsec_key_id_equal(const struct ieee802154_llsec_key_id *a, return false; switch (a->mode) { + case IEEE802154_SCF_KEY_INDEX: + return true; case IEEE802154_SCF_KEY_SHORT_INDEX: return a->short_source == b->short_source; case IEEE802154_SCF_KEY_HW_INDEX: @@ -773,10 +775,10 @@ int mac802154_llsec_encrypt(struct mac802154_llsec *sec, struct sk_buff *skb) rc = llsec_do_encrypt(skb, sec, &hdr, key); llsec_key_put(key); - return rc < 0 ? rc : 0; + return rc; fail_read: - read_unlock(&sec->lock); + read_unlock_bh(&sec->lock); fail: rcu_read_unlock(); return rc; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3fd159db9f0..04788477658 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -88,6 +88,45 @@ nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) return ERR_PTR(-EAFNOSUPPORT); } +static void nft_ctx_init(struct nft_ctx *ctx, + const struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nft_af_info *afi, + struct nft_table *table, + struct nft_chain *chain, + const struct nlattr * const *nla) +{ + ctx->net = sock_net(skb->sk); + ctx->afi = afi; + ctx->table = table; + ctx->chain = chain; + ctx->nla = nla; + ctx->portid = NETLINK_CB(skb).portid; + ctx->report = nlmsg_report(nlh); + ctx->seq = nlh->nlmsg_seq; +} + +static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type, + u32 size) +{ + struct nft_trans *trans; + + trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL); + if (trans == NULL) + return NULL; + + trans->msg_type = msg_type; + trans->ctx = *ctx; + + return trans; +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + list_del(&trans->list); + kfree(trans); +} + /* * Tables */ @@ -197,20 +236,13 @@ nla_put_failure: return -1; } -static int nf_tables_table_notify(const struct sk_buff *oskb, - const struct nlmsghdr *nlh, - const struct nft_table *table, - int event, int family) +static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; - u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; - u32 seq = nlh ? nlh->nlmsg_seq : 0; - struct net *net = oskb ? sock_net(oskb->sk) : &init_net; - bool report; int err; - report = nlh ? nlmsg_report(nlh) : false; - if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; @@ -218,18 +250,20 @@ static int nf_tables_table_notify(const struct sk_buff *oskb, if (skb == NULL) goto err; - err = nf_tables_fill_table_info(skb, portid, seq, event, 0, - family, table); + err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } return err; } @@ -269,6 +303,9 @@ done: return skb->len; } +/* Internal table flags */ +#define NFT_TABLE_INACTIVE (1 << 15) + static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -295,6 +332,8 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -343,7 +382,7 @@ err: return err; } -static int nf_tables_table_disable(const struct nft_af_info *afi, +static void nf_tables_table_disable(const struct nft_af_info *afi, struct nft_table *table) { struct nft_chain *chain; @@ -353,45 +392,63 @@ static int nf_tables_table_disable(const struct nft_af_info *afi, nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); } - - return 0; } -static int nf_tables_updtable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct nft_af_info *afi, struct nft_table *table) +static int nf_tables_updtable(struct nft_ctx *ctx) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - int family = nfmsg->nfgen_family, ret = 0; + struct nft_trans *trans; + u32 flags; + int ret = 0; - if (nla[NFTA_TABLE_FLAGS]) { - u32 flags; + if (!ctx->nla[NFTA_TABLE_FLAGS]) + return 0; - flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS])); - if (flags & ~NFT_TABLE_F_DORMANT) - return -EINVAL; + flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS])); + if (flags & ~NFT_TABLE_F_DORMANT) + return -EINVAL; + + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, + sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; - if ((flags & NFT_TABLE_F_DORMANT) && - !(table->flags & NFT_TABLE_F_DORMANT)) { - ret = nf_tables_table_disable(afi, table); - if (ret >= 0) - table->flags |= NFT_TABLE_F_DORMANT; - } else if (!(flags & NFT_TABLE_F_DORMANT) && - table->flags & NFT_TABLE_F_DORMANT) { - ret = nf_tables_table_enable(afi, table); - if (ret >= 0) - table->flags &= ~NFT_TABLE_F_DORMANT; + if ((flags & NFT_TABLE_F_DORMANT) && + !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { + nft_trans_table_enable(trans) = false; + } else if (!(flags & NFT_TABLE_F_DORMANT) && + ctx->table->flags & NFT_TABLE_F_DORMANT) { + ret = nf_tables_table_enable(ctx->afi, ctx->table); + if (ret >= 0) { + ctx->table->flags &= ~NFT_TABLE_F_DORMANT; + nft_trans_table_enable(trans) = true; } - if (ret < 0) - goto err; } + if (ret < 0) + goto err; - nf_tables_table_notify(skb, nlh, table, NFT_MSG_NEWTABLE, family); + nft_trans_table_update(trans) = true; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; err: + nft_trans_destroy(trans); return ret; } +static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWTABLE) + ctx->table->flags |= NFT_TABLE_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +} + static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -403,6 +460,8 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; u32 flags = 0; + struct nft_ctx ctx; + int err; afi = nf_tables_afinfo_lookup(net, family, true); if (IS_ERR(afi)) @@ -417,11 +476,15 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, } if (table != NULL) { + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - return nf_tables_updtable(nlsk, skb, nlh, nla, afi, table); + + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + return nf_tables_updtable(&ctx); } if (nla[NFTA_TABLE_FLAGS]) { @@ -444,8 +507,14 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, INIT_LIST_HEAD(&table->sets); table->flags = flags; + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); + if (err < 0) { + kfree(table); + module_put(afi->owner); + return err; + } list_add_tail(&table->list, &afi->tables); - nf_tables_table_notify(skb, nlh, table, NFT_MSG_NEWTABLE, family); return 0; } @@ -457,7 +526,8 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, struct nft_af_info *afi; struct nft_table *table; struct net *net = sock_net(skb->sk); - int family = nfmsg->nfgen_family; + int family = nfmsg->nfgen_family, err; + struct nft_ctx ctx; afi = nf_tables_afinfo_lookup(net, family, false); if (IS_ERR(afi)) @@ -466,17 +536,27 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; if (!list_empty(&table->chains) || !list_empty(&table->sets)) return -EBUSY; + nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + err = nft_trans_table_add(&ctx, NFT_MSG_DELTABLE); + if (err < 0) + return err; + list_del(&table->list); - nf_tables_table_notify(skb, nlh, table, NFT_MSG_DELTABLE, family); - kfree(table); - module_put(afi->owner); return 0; } +static void nf_tables_table_destroy(struct nft_ctx *ctx) +{ + kfree(ctx->table); + module_put(ctx->afi->owner); +} + int nft_register_chain_type(const struct nf_chain_type *ctype) { int err = 0; @@ -541,7 +621,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED }, [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, - [NFTA_CHAIN_TYPE] = { .type = NLA_NUL_STRING }, + [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, }; @@ -637,21 +717,13 @@ nla_put_failure: return -1; } -static int nf_tables_chain_notify(const struct sk_buff *oskb, - const struct nlmsghdr *nlh, - const struct nft_table *table, - const struct nft_chain *chain, - int event, int family) +static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; - u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; - struct net *net = oskb ? sock_net(oskb->sk) : &init_net; - u32 seq = nlh ? nlh->nlmsg_seq : 0; - bool report; int err; - report = nlh ? nlmsg_report(nlh) : false; - if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; @@ -659,18 +731,21 @@ static int nf_tables_chain_notify(const struct sk_buff *oskb, if (skb == NULL) goto err; - err = nf_tables_fill_chain_info(skb, portid, seq, event, 0, family, - table, chain); + err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table, + ctx->chain); if (err < 0) { kfree_skb(skb); goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } return err; } @@ -740,10 +815,14 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); if (IS_ERR(chain)) return PTR_ERR(chain); + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -767,8 +846,7 @@ static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, }; -static int -nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr) +static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) { struct nlattr *tb[NFTA_COUNTER_MAX+1]; struct nft_stats __percpu *newstats; @@ -777,14 +855,14 @@ nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr) err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy); if (err < 0) - return err; + return ERR_PTR(err); if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) - return -EINVAL; + return ERR_PTR(-EINVAL); newstats = alloc_percpu(struct nft_stats); if (newstats == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* Restore old counters on this cpu, no problem. Per-cpu statistics * are not exposed to userspace. @@ -793,6 +871,12 @@ nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr) stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + return newstats; +} + +static void nft_chain_stats_replace(struct nft_base_chain *chain, + struct nft_stats __percpu *newstats) +{ if (chain->stats) { struct nft_stats __percpu *oldstats = nft_dereference(chain->stats); @@ -802,17 +886,43 @@ nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr) free_percpu(oldstats); } else rcu_assign_pointer(chain->stats, newstats); +} + +static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWCHAIN) + ctx->chain->flags |= NFT_CHAIN_INACTIVE; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; } +static void nf_tables_chain_destroy(struct nft_chain *chain) +{ + BUG_ON(chain->use > 0); + + if (chain->flags & NFT_BASE_CHAIN) { + module_put(nft_base_chain(chain)->type->owner); + free_percpu(nft_base_chain(chain)->stats); + kfree(nft_base_chain(chain)); + } else { + kfree(chain); + } +} + static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nlattr * uninitialized_var(name); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; struct nft_base_chain *basechain = NULL; @@ -822,8 +932,10 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, u8 policy = NF_ACCEPT; u64 handle = 0; unsigned int i; + struct nft_stats __percpu *stats; int err; bool create; + struct nft_ctx ctx; create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; @@ -869,6 +981,11 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, } if (chain != NULL) { + struct nft_stats *stats = NULL; + struct nft_trans *trans; + + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) @@ -882,19 +999,31 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (!(chain->flags & NFT_BASE_CHAIN)) return -EOPNOTSUPP; - err = nf_tables_counters(nft_base_chain(chain), - nla[NFTA_CHAIN_COUNTERS]); - if (err < 0) - return err; + stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); + if (IS_ERR(stats)) + return PTR_ERR(stats); } - if (nla[NFTA_CHAIN_POLICY]) - nft_base_chain(chain)->policy = policy; + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, + sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; - if (nla[NFTA_CHAIN_HANDLE] && name) - nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); + nft_trans_chain_stats(trans) = stats; + nft_trans_chain_update(trans) = true; - goto notify; + if (nla[NFTA_CHAIN_POLICY]) + nft_trans_chain_policy(trans) = policy; + else + nft_trans_chain_policy(trans) = -1; + + if (nla[NFTA_CHAIN_HANDLE] && name) { + nla_strlcpy(nft_trans_chain_name(trans), name, + NFT_CHAIN_MAXNAMELEN); + } + list_add_tail(&trans->list, &net->nft.commit_list); + return 0; } if (table->use == UINT_MAX) @@ -939,23 +1068,21 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, return -ENOMEM; if (nla[NFTA_CHAIN_COUNTERS]) { - err = nf_tables_counters(basechain, - nla[NFTA_CHAIN_COUNTERS]); - if (err < 0) { + stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); + if (IS_ERR(stats)) { module_put(type->owner); kfree(basechain); - return err; + return PTR_ERR(stats); } + basechain->stats = stats; } else { - struct nft_stats __percpu *newstats; - - newstats = alloc_percpu(struct nft_stats); - if (newstats == NULL) { + stats = alloc_percpu(struct nft_stats); + if (IS_ERR(stats)) { module_put(type->owner); kfree(basechain); - return -ENOMEM; + return PTR_ERR(stats); } - rcu_assign_pointer(basechain->stats, newstats); + rcu_assign_pointer(basechain->stats, stats); } basechain->type = type; @@ -992,31 +1119,26 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (!(table->flags & NFT_TABLE_F_DORMANT) && chain->flags & NFT_BASE_CHAIN) { err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); - if (err < 0) { - module_put(basechain->type->owner); - free_percpu(basechain->stats); - kfree(basechain); - return err; - } + if (err < 0) + goto err1; } - list_add_tail(&chain->list, &table->chains); - table->use++; -notify: - nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_NEWCHAIN, - family); - return 0; -} -static void nf_tables_chain_destroy(struct nft_chain *chain) -{ - BUG_ON(chain->use > 0); + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); + if (err < 0) + goto err2; - if (chain->flags & NFT_BASE_CHAIN) { - module_put(nft_base_chain(chain)->type->owner); - free_percpu(nft_base_chain(chain)->stats); - kfree(nft_base_chain(chain)); - } else - kfree(chain); + list_add_tail(&chain->list, &table->chains); + return 0; +err2: + if (!(table->flags & NFT_TABLE_F_DORMANT) && + chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(chain)->ops, + afi->nops); + } +err1: + nf_tables_chain_destroy(chain); + return err; } static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, @@ -1024,11 +1146,13 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nft_ctx ctx; + int err; afi = nf_tables_afinfo_lookup(net, family, false); if (IS_ERR(afi)) @@ -1037,48 +1161,26 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); if (IS_ERR(chain)) return PTR_ERR(chain); - + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; if (!list_empty(&chain->rules) || chain->use > 0) return -EBUSY; - list_del(&chain->list); - table->use--; - - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) - nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); - - nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_DELCHAIN, - family); - - /* Make sure all rule references are gone before this is released */ - synchronize_rcu(); + nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + err = nft_trans_chain_add(&ctx, NFT_MSG_DELCHAIN); + if (err < 0) + return err; - nf_tables_chain_destroy(chain); + list_del(&chain->list); return 0; } -static void nft_ctx_init(struct nft_ctx *ctx, - const struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nft_af_info *afi, - const struct nft_table *table, - const struct nft_chain *chain, - const struct nlattr * const *nla) -{ - ctx->net = sock_net(skb->sk); - ctx->skb = skb; - ctx->nlh = nlh; - ctx->afi = afi; - ctx->table = table; - ctx->chain = chain; - ctx->nla = nla; -} - /* * Expressions */ @@ -1093,7 +1195,10 @@ static void nft_ctx_init(struct nft_ctx *ctx, int nft_register_expr(struct nft_expr_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail(&type->list, &nf_tables_expressions); + if (type->family == NFPROTO_UNSPEC) + list_add_tail(&type->list, &nf_tables_expressions); + else + list_add(&type->list, &nf_tables_expressions); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } @@ -1361,22 +1466,15 @@ nla_put_failure: return -1; } -static int nf_tables_rule_notify(const struct sk_buff *oskb, - const struct nlmsghdr *nlh, - const struct nft_table *table, - const struct nft_chain *chain, +static int nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, - int event, u32 flags, int family) + int event) { struct sk_buff *skb; - u32 portid = NETLINK_CB(oskb).portid; - struct net *net = oskb ? sock_net(oskb->sk) : &init_net; - u32 seq = nlh->nlmsg_seq; - bool report; int err; - report = nlmsg_report(nlh); - if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; @@ -1384,18 +1482,21 @@ static int nf_tables_rule_notify(const struct sk_buff *oskb, if (skb == NULL) goto err; - err = nf_tables_fill_rule_info(skb, portid, seq, event, flags, - family, table, chain, rule); + err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, + ctx->afi->family, ctx->table, + ctx->chain, rule); if (err < 0) { kfree_skb(skb); goto err; } - err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); err: - if (err < 0) - nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } return err; } @@ -1511,10 +1612,14 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); if (IS_ERR(chain)) return PTR_ERR(chain); + if (chain->flags & NFT_CHAIN_INACTIVE) + return -ENOENT; rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) @@ -1554,37 +1659,36 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -#define NFT_RULE_MAXEXPRS 128 - -static struct nft_expr_info *info; - -static struct nft_rule_trans * -nf_tables_trans_add(struct nft_ctx *ctx, struct nft_rule *rule) +static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, + struct nft_rule *rule) { - struct nft_rule_trans *rupd; + struct nft_trans *trans; - rupd = kmalloc(sizeof(struct nft_rule_trans), GFP_KERNEL); - if (rupd == NULL) - return NULL; + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); + if (trans == NULL) + return NULL; - rupd->ctx = *ctx; - rupd->rule = rule; - list_add_tail(&rupd->list, &ctx->net->nft.commit_list); + nft_trans_rule(trans) = rule; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); - return rupd; + return trans; } +#define NFT_RULE_MAXEXPRS 128 + +static struct nft_expr_info *info; + static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; - struct nft_rule_trans *repl = NULL; + struct nft_trans *trans = NULL; struct nft_expr *expr; struct nft_ctx ctx; struct nlattr *tmp; @@ -1682,8 +1786,9 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_REPLACE) { if (nft_rule_is_active_next(net, old_rule)) { - repl = nf_tables_trans_add(&ctx, old_rule); - if (repl == NULL) { + trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, + old_rule); + if (trans == NULL) { err = -ENOMEM; goto err2; } @@ -1705,7 +1810,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, list_add_rcu(&rule->list, &chain->rules); } - if (nf_tables_trans_add(&ctx, rule) == NULL) { + if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { err = -ENOMEM; goto err3; } @@ -1713,11 +1818,10 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, err3: list_del_rcu(&rule->list); - if (repl) { - list_del_rcu(&repl->rule->list); - list_del(&repl->list); - nft_rule_clear(net, repl->rule); - kfree(repl); + if (trans) { + list_del_rcu(&nft_trans_rule(trans)->list); + nft_rule_clear(net, nft_trans_rule(trans)); + nft_trans_destroy(trans); } err2: nf_tables_rule_destroy(&ctx, rule); @@ -1734,7 +1838,7 @@ nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule) { /* You cannot delete the same rule twice */ if (nft_rule_is_active_next(ctx->net, rule)) { - if (nf_tables_trans_add(ctx, rule) == NULL) + if (nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule) == NULL) return -ENOMEM; nft_rule_disactivate_next(ctx->net, rule); return 0; @@ -1760,9 +1864,9 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; + struct nft_af_info *afi; struct net *net = sock_net(skb->sk); - const struct nft_table *table; + struct nft_table *table; struct nft_chain *chain = NULL; struct nft_rule *rule; int family = nfmsg->nfgen_family, err = 0; @@ -1775,6 +1879,8 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; if (nla[NFTA_RULE_CHAIN]) { chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); @@ -1807,88 +1913,6 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, return err; } -static int nf_tables_commit(struct sk_buff *skb) -{ - struct net *net = sock_net(skb->sk); - struct nft_rule_trans *rupd, *tmp; - - /* Bump generation counter, invalidate any dump in progress */ - net->nft.genctr++; - - /* A new generation has just started */ - net->nft.gencursor = gencursor_next(net); - - /* Make sure all packets have left the previous generation before - * purging old rules. - */ - synchronize_rcu(); - - list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - /* This rule was inactive in the past and just became active. - * Clear the next bit of the genmask since its meaning has - * changed, now it is the future. - */ - if (nft_rule_is_active(net, rupd->rule)) { - nft_rule_clear(net, rupd->rule); - nf_tables_rule_notify(skb, rupd->ctx.nlh, - rupd->ctx.table, rupd->ctx.chain, - rupd->rule, NFT_MSG_NEWRULE, 0, - rupd->ctx.afi->family); - list_del(&rupd->list); - kfree(rupd); - continue; - } - - /* This rule is in the past, get rid of it */ - list_del_rcu(&rupd->rule->list); - nf_tables_rule_notify(skb, rupd->ctx.nlh, - rupd->ctx.table, rupd->ctx.chain, - rupd->rule, NFT_MSG_DELRULE, 0, - rupd->ctx.afi->family); - } - - /* Make sure we don't see any packet traversing old rules */ - synchronize_rcu(); - - /* Now we can safely release unused old rules */ - list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - nf_tables_rule_destroy(&rupd->ctx, rupd->rule); - list_del(&rupd->list); - kfree(rupd); - } - - return 0; -} - -static int nf_tables_abort(struct sk_buff *skb) -{ - struct net *net = sock_net(skb->sk); - struct nft_rule_trans *rupd, *tmp; - - list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - if (!nft_rule_is_active_next(net, rupd->rule)) { - nft_rule_clear(net, rupd->rule); - list_del(&rupd->list); - kfree(rupd); - continue; - } - - /* This rule is inactive, get rid of it */ - list_del_rcu(&rupd->rule->list); - } - - /* Make sure we don't see any packet accessing aborted rules */ - synchronize_rcu(); - - list_for_each_entry_safe(rupd, tmp, &net->nft.commit_list, list) { - nf_tables_rule_destroy(&rupd->ctx, rupd->rule); - list_del(&rupd->list); - kfree(rupd); - } - - return 0; -} - /* * Sets */ @@ -1912,9 +1936,18 @@ void nft_unregister_set(struct nft_set_ops *ops) } EXPORT_SYMBOL_GPL(nft_unregister_set); -static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const nla[]) +/* + * Select a set implementation based on the data characteristics and the + * given policy. The total memory use might not be known if no size is + * given, in that case the amount of memory per element is used. + */ +static const struct nft_set_ops * +nft_select_set_ops(const struct nlattr * const nla[], + const struct nft_set_desc *desc, + enum nft_set_policies policy) { - const struct nft_set_ops *ops; + const struct nft_set_ops *ops, *bops; + struct nft_set_estimate est, best; u32 features; #ifdef CONFIG_MODULES @@ -1932,15 +1965,45 @@ static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const features &= NFT_SET_INTERVAL | NFT_SET_MAP; } - // FIXME: implement selection properly + bops = NULL; + best.size = ~0; + best.class = ~0; + list_for_each_entry(ops, &nf_tables_set_ops, list) { if ((ops->features & features) != features) continue; + if (!ops->estimate(desc, features, &est)) + continue; + + switch (policy) { + case NFT_SET_POL_PERFORMANCE: + if (est.class < best.class) + break; + if (est.class == best.class && est.size < best.size) + break; + continue; + case NFT_SET_POL_MEMORY: + if (est.size < best.size) + break; + if (est.size == best.size && est.class < best.class) + break; + continue; + default: + break; + } + if (!try_module_get(ops->owner)) continue; - return ops; + if (bops != NULL) + module_put(bops->owner); + + bops = ops; + best = est; } + if (bops != NULL) + return bops; + return ERR_PTR(-EOPNOTSUPP); } @@ -1953,6 +2016,13 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, [NFTA_SET_DATA_TYPE] = { .type = NLA_U32 }, [NFTA_SET_DATA_LEN] = { .type = NLA_U32 }, + [NFTA_SET_POLICY] = { .type = NLA_U32 }, + [NFTA_SET_DESC] = { .type = NLA_NESTED }, + [NFTA_SET_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { + [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, }; static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, @@ -1962,8 +2032,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, { struct net *net = sock_net(skb->sk); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi = NULL; - const struct nft_table *table = NULL; + struct nft_af_info *afi = NULL; + struct nft_table *table = NULL; if (nfmsg->nfgen_family != NFPROTO_UNSPEC) { afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); @@ -1978,6 +2048,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; } nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); @@ -1999,13 +2071,27 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-ENOENT); } +struct nft_set *nf_tables_set_lookup_byid(const struct net *net, + const struct nlattr *nla) +{ + struct nft_trans *trans; + u32 id = ntohl(nla_get_be32(nla)); + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->msg_type == NFT_MSG_NEWSET && + id == nft_trans_set_id(trans)) + return nft_trans_set(trans); + } + return ERR_PTR(-ENOENT); +} + static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, const char *name) { const struct nft_set *i; const char *p; unsigned long *inuse; - unsigned int n = 0; + unsigned int n = 0, min = 0; p = strnchr(name, IFNAMSIZ, '%'); if (p != NULL) { @@ -2015,23 +2101,28 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (inuse == NULL) return -ENOMEM; - +cont: list_for_each_entry(i, &ctx->table->sets, list) { int tmp; if (!sscanf(i->name, name, &tmp)) continue; - if (tmp < 0 || tmp >= BITS_PER_BYTE * PAGE_SIZE) + if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) continue; - set_bit(tmp, inuse); + set_bit(tmp - min, inuse); } n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE); + if (n >= BITS_PER_BYTE * PAGE_SIZE) { + min += BITS_PER_BYTE * PAGE_SIZE; + memset(inuse, 0, PAGE_SIZE); + goto cont; + } free_page((unsigned long)inuse); } - snprintf(set->name, sizeof(set->name), name, n); + snprintf(set->name, sizeof(set->name), name, min + n); list_for_each_entry(i, &ctx->table->sets, list) { if (!strcmp(set->name, i->name)) return -ENFILE; @@ -2044,8 +2135,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, { struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; - u32 portid = NETLINK_CB(ctx->skb).portid; - u32 seq = ctx->nlh->nlmsg_seq; + struct nlattr *desc; + u32 portid = ctx->portid; + u32 seq = ctx->seq; event |= NFNL_SUBSYS_NFTABLES << 8; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), @@ -2077,6 +2169,14 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + desc = nla_nest_start(skb, NFTA_SET_DESC); + if (desc == NULL) + goto nla_put_failure; + if (set->size && + nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) + goto nla_put_failure; + nla_nest_end(skb, desc); + return nlmsg_end(skb, nlh); nla_put_failure: @@ -2089,12 +2189,11 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; - u32 portid = NETLINK_CB(ctx->skb).portid; - bool report; + u32 portid = ctx->portid; int err; - report = nlmsg_report(ctx->nlh); - if (!report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) return 0; err = -ENOBUFS; @@ -2108,8 +2207,8 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx, goto err; } - err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, report, - GFP_KERNEL); + err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); err: if (err < 0) nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err); @@ -2183,7 +2282,7 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, { const struct nft_set *set; unsigned int idx, s_idx = cb->args[0]; - const struct nft_af_info *afi; + struct nft_af_info *afi; struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; struct net *net = sock_net(skb->sk); int cur_family = cb->args[3]; @@ -2260,6 +2359,8 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) return ret; } +#define NFT_SET_INACTIVE (1 << 15) /* Internal set flag */ + static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2289,6 +2390,8 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb2 == NULL) @@ -2305,13 +2408,50 @@ err: return err; } +static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, + struct nft_set_desc *desc, + const struct nlattr *nla) +{ + struct nlattr *da[NFTA_SET_DESC_MAX + 1]; + int err; + + err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy); + if (err < 0) + return err; + + if (da[NFTA_SET_DESC_SIZE] != NULL) + desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE])); + + return 0; +} + +static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, + struct nft_set *set) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { + nft_trans_set_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); + set->flags |= NFT_SET_INACTIVE; + } + nft_trans_set(trans) = set; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; +} + static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nft_set_ops *ops; - const struct nft_af_info *afi; + struct nft_af_info *afi; struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_set *set; @@ -2319,14 +2459,18 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, char name[IFNAMSIZ]; unsigned int size; bool create; - u32 ktype, klen, dlen, dtype, flags; + u32 ktype, dtype, flags, policy; + struct nft_set_desc desc; int err; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || - nla[NFTA_SET_KEY_LEN] == NULL) + nla[NFTA_SET_KEY_LEN] == NULL || + nla[NFTA_SET_ID] == NULL) return -EINVAL; + memset(&desc, 0, sizeof(desc)); + ktype = NFT_DATA_VALUE; if (nla[NFTA_SET_KEY_TYPE] != NULL) { ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE])); @@ -2334,8 +2478,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, return -EINVAL; } - klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); - if (klen == 0 || klen > FIELD_SIZEOF(struct nft_data, data)) + desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); + if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data)) return -EINVAL; flags = 0; @@ -2347,7 +2491,6 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, } dtype = 0; - dlen = 0; if (nla[NFTA_SET_DATA_TYPE] != NULL) { if (!(flags & NFT_SET_MAP)) return -EINVAL; @@ -2360,15 +2503,25 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (dtype != NFT_DATA_VERDICT) { if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; - dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); - if (dlen == 0 || - dlen > FIELD_SIZEOF(struct nft_data, data)) + desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); + if (desc.dlen == 0 || + desc.dlen > FIELD_SIZEOF(struct nft_data, data)) return -EINVAL; } else - dlen = sizeof(struct nft_data); + desc.dlen = sizeof(struct nft_data); } else if (flags & NFT_SET_MAP) return -EINVAL; + policy = NFT_SET_POL_PERFORMANCE; + if (nla[NFTA_SET_POLICY] != NULL) + policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + + if (nla[NFTA_SET_DESC] != NULL) { + err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]); + if (err < 0) + return err; + } + create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create); @@ -2399,7 +2552,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (!(nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - ops = nft_select_set_ops(nla); + ops = nft_select_set_ops(nla, &desc, policy); if (IS_ERR(ops)) return PTR_ERR(ops); @@ -2420,17 +2573,21 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, INIT_LIST_HEAD(&set->bindings); set->ops = ops; set->ktype = ktype; - set->klen = klen; + set->klen = desc.klen; set->dtype = dtype; - set->dlen = dlen; + set->dlen = desc.dlen; set->flags = flags; + set->size = desc.size; - err = ops->init(set, nla); + err = ops->init(set, &desc, nla); + if (err < 0) + goto err2; + + err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) goto err2; list_add_tail(&set->list, &table->sets); - nf_tables_set_notify(&ctx, set, NFT_MSG_NEWSET); return 0; err2: @@ -2440,16 +2597,20 @@ err1: return err; } -static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) +static void nft_set_destroy(struct nft_set *set) { - list_del(&set->list); - nf_tables_set_notify(ctx, set, NFT_MSG_DELSET); - set->ops->destroy(set); module_put(set->ops->owner); kfree(set); } +static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) +{ + list_del(&set->list); + nf_tables_set_notify(ctx, set, NFT_MSG_DELSET); + nft_set_destroy(set); +} + static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2471,10 +2632,16 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; if (!list_empty(&set->bindings)) return -EBUSY; - nf_tables_set_destroy(&ctx, set); + err = nft_trans_set_add(&ctx, NFT_MSG_DELSET, set); + if (err < 0) + return err; + + list_del(&set->list); return 0; } @@ -2534,7 +2701,8 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, { list_del(&binding->list); - if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS) + if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS && + !(set->flags & NFT_SET_INACTIVE)) nf_tables_set_destroy(ctx, set); } @@ -2552,16 +2720,18 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING }, [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING }, [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, const struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + bool trans) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nft_af_info *afi; - const struct nft_table *table; + struct nft_af_info *afi; + struct nft_table *table; struct net *net = sock_net(skb->sk); afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); @@ -2571,6 +2741,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); + if (!trans && (table->flags & NFT_TABLE_INACTIVE)) + return -ENOENT; nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); return 0; @@ -2644,13 +2816,16 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) return err; - err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla); + err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, + false); if (err < 0) return err; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; event = NFT_MSG_NEWSETELEM; event |= NFNL_SUBSYS_NFTABLES << 8; @@ -2707,13 +2882,15 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, struct nft_ctx ctx; int err; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); if (err < 0) return err; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) return PTR_ERR(set); + if (set->flags & NFT_SET_INACTIVE) + return -ENOENT; if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -2724,7 +2901,98 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, return -EOPNOTSUPP; } -static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, +static int nf_tables_fill_setelem_info(struct sk_buff *skb, + const struct nft_ctx *ctx, u32 seq, + u32 portid, int event, u16 flags, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + struct nlattr *nest; + int err; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), + flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = ctx->afi->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_SET_NAME, set->name)) + goto nla_put_failure; + + nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); + if (nest == NULL) + goto nla_put_failure; + + err = nf_tables_fill_setelem(skb, set, elem); + if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_setelem_notify(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_set_elem *elem, + int event, u16 flags) +{ + struct net *net = ctx->net; + u32 portid = ctx->portid; + struct sk_buff *skb; + int err; + + if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, + set, elem); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, + GFP_KERNEL); +err: + if (err < 0) + nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err); + return err; +} + +static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, + int msg_type, + struct nft_set *set) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem)); + if (trans == NULL) + return NULL; + + nft_trans_elem_set(trans) = set; + return trans; +} + +static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; @@ -2732,8 +3000,12 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem elem; struct nft_set_binding *binding; enum nft_registers dreg; + struct nft_trans *trans; int err; + if (set->size && set->nelems == set->size) + return -ENFILE; + err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy); if (err < 0) @@ -2786,7 +3058,7 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, struct nft_ctx bind_ctx = { .afi = ctx->afi, .table = ctx->table, - .chain = binding->chain, + .chain = (struct nft_chain *)binding->chain, }; err = nft_validate_data_load(&bind_ctx, dreg, @@ -2796,12 +3068,20 @@ static int nft_add_set_elem(const struct nft_ctx *ctx, struct nft_set *set, } } + trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); + if (trans == NULL) + goto err3; + err = set->ops->insert(set, &elem); if (err < 0) - goto err3; + goto err4; + nft_trans_elem(trans) = elem; + list_add(&trans->list, &ctx->net->nft.commit_list); return 0; +err4: + kfree(trans); err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_uninit(&elem.data, d2.type); @@ -2815,35 +3095,44 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + struct net *net = sock_net(skb->sk); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; - int rem, err; + int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); if (err < 0) return err; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); - if (IS_ERR(set)) - return PTR_ERR(set); + if (IS_ERR(set)) { + if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { + set = nf_tables_set_lookup_byid(net, + nla[NFTA_SET_ELEM_LIST_SET_ID]); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } + if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_add_set_elem(&ctx, set, attr); if (err < 0) - return err; + break; } - return 0; + return err; } -static int nft_del_setelem(const struct nft_ctx *ctx, struct nft_set *set, +static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_data_desc desc; struct nft_set_elem elem; + struct nft_trans *trans; int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, @@ -2867,7 +3156,12 @@ static int nft_del_setelem(const struct nft_ctx *ctx, struct nft_set *set, if (err < 0) goto err2; - set->ops->remove(set, &elem); + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); + if (trans == NULL) + goto err2; + + nft_trans_elem(trans) = elem; + list_add(&trans->list, &ctx->net->nft.commit_list); nft_data_uninit(&elem.key, NFT_DATA_VALUE); if (set->flags & NFT_SET_MAP) @@ -2886,9 +3180,9 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; - int rem, err; + int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); if (err < 0) return err; @@ -2901,14 +3195,14 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_del_setelem(&ctx, set, attr); if (err < 0) - return err; + break; } - return 0; + return err; } static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { - .call = nf_tables_newtable, + .call_batch = nf_tables_newtable, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, @@ -2918,12 +3212,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_table_policy, }, [NFT_MSG_DELTABLE] = { - .call = nf_tables_deltable, + .call_batch = nf_tables_deltable, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_NEWCHAIN] = { - .call = nf_tables_newchain, + .call_batch = nf_tables_newchain, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, @@ -2933,7 +3227,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_chain_policy, }, [NFT_MSG_DELCHAIN] = { - .call = nf_tables_delchain, + .call_batch = nf_tables_delchain, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, @@ -2953,7 +3247,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_rule_policy, }, [NFT_MSG_NEWSET] = { - .call = nf_tables_newset, + .call_batch = nf_tables_newset, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, @@ -2963,12 +3257,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_policy, }, [NFT_MSG_DELSET] = { - .call = nf_tables_delset, + .call_batch = nf_tables_delset, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_NEWSETELEM] = { - .call = nf_tables_newsetelem, + .call_batch = nf_tables_newsetelem, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, @@ -2978,12 +3272,270 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_elem_list_policy, }, [NFT_MSG_DELSETELEM] = { - .call = nf_tables_delsetelem, + .call_batch = nf_tables_delsetelem, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, }; +static void nft_chain_commit_update(struct nft_trans *trans) +{ + struct nft_base_chain *basechain; + + if (nft_trans_chain_name(trans)[0]) + strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans)); + + if (!(trans->ctx.chain->flags & NFT_BASE_CHAIN)) + return; + + basechain = nft_base_chain(trans->ctx.chain); + nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans)); + + switch (nft_trans_chain_policy(trans)) { + case NF_DROP: + case NF_ACCEPT: + basechain->policy = nft_trans_chain_policy(trans); + break; + } +} + +/* Schedule objects for release via rcu to make sure no packets are accesing + * removed rules. + */ +static void nf_tables_commit_release_rcu(struct rcu_head *rt) +{ + struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); + + switch (trans->msg_type) { + case NFT_MSG_DELTABLE: + nf_tables_table_destroy(&trans->ctx); + break; + case NFT_MSG_DELCHAIN: + nf_tables_chain_destroy(trans->ctx.chain); + break; + case NFT_MSG_DELRULE: + nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + break; + case NFT_MSG_DELSET: + nft_set_destroy(nft_trans_set(trans)); + break; + } + kfree(trans); +} + +static int nf_tables_commit(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + struct nft_trans *trans, *next; + struct nft_set *set; + + /* Bump generation counter, invalidate any dump in progress */ + net->nft.genctr++; + + /* A new generation has just started */ + net->nft.gencursor = gencursor_next(net); + + /* Make sure all packets have left the previous generation before + * purging old rules. + */ + synchronize_rcu(); + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + if (nft_trans_table_update(trans)) { + if (!nft_trans_table_enable(trans)) { + nf_tables_table_disable(trans->ctx.afi, + trans->ctx.table); + trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + } + } else { + trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE; + } + nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELTABLE: + nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); + break; + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain_update(trans)) + nft_chain_commit_update(trans); + else { + trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE; + trans->ctx.table->use++; + } + nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELCHAIN: + trans->ctx.table->use--; + nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && + trans->ctx.chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, + trans->ctx.afi->nops); + } + break; + case NFT_MSG_NEWRULE: + nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nf_tables_rule_notify(&trans->ctx, + nft_trans_rule(trans), + NFT_MSG_NEWRULE); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELRULE: + list_del_rcu(&nft_trans_rule(trans)->list); + nf_tables_rule_notify(&trans->ctx, + nft_trans_rule(trans), + NFT_MSG_DELRULE); + break; + case NFT_MSG_NEWSET: + nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE; + nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + NFT_MSG_NEWSET); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSET: + nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + NFT_MSG_DELSET); + break; + case NFT_MSG_NEWSETELEM: + nft_trans_elem_set(trans)->nelems++; + nf_tables_setelem_notify(&trans->ctx, + nft_trans_elem_set(trans), + &nft_trans_elem(trans), + NFT_MSG_NEWSETELEM, 0); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSETELEM: + nft_trans_elem_set(trans)->nelems--; + nf_tables_setelem_notify(&trans->ctx, + nft_trans_elem_set(trans), + &nft_trans_elem(trans), + NFT_MSG_DELSETELEM, 0); + set = nft_trans_elem_set(trans); + set->ops->get(set, &nft_trans_elem(trans)); + set->ops->remove(set, &nft_trans_elem(trans)); + nft_trans_destroy(trans); + break; + } + } + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_del(&trans->list); + trans->ctx.nla = NULL; + call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); + } + + return 0; +} + +/* Schedule objects for release via rcu to make sure no packets are accesing + * aborted rules. + */ +static void nf_tables_abort_release_rcu(struct rcu_head *rt) +{ + struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); + + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + nf_tables_table_destroy(&trans->ctx); + break; + case NFT_MSG_NEWCHAIN: + nf_tables_chain_destroy(trans->ctx.chain); + break; + case NFT_MSG_NEWRULE: + nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); + break; + case NFT_MSG_NEWSET: + nft_set_destroy(nft_trans_set(trans)); + break; + } + kfree(trans); +} + +static int nf_tables_abort(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + struct nft_trans *trans, *next; + struct nft_set *set; + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWTABLE: + if (nft_trans_table_update(trans)) { + if (nft_trans_table_enable(trans)) { + nf_tables_table_disable(trans->ctx.afi, + trans->ctx.table); + trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; + } + nft_trans_destroy(trans); + } else { + list_del(&trans->ctx.table->list); + } + break; + case NFT_MSG_DELTABLE: + list_add_tail(&trans->ctx.table->list, + &trans->ctx.afi->tables); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain_update(trans)) { + if (nft_trans_chain_stats(trans)) + free_percpu(nft_trans_chain_stats(trans)); + + nft_trans_destroy(trans); + } else { + list_del(&trans->ctx.chain->list); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && + trans->ctx.chain->flags & NFT_BASE_CHAIN) { + nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, + trans->ctx.afi->nops); + } + } + break; + case NFT_MSG_DELCHAIN: + list_add_tail(&trans->ctx.chain->list, + &trans->ctx.table->chains); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWRULE: + list_del_rcu(&nft_trans_rule(trans)->list); + break; + case NFT_MSG_DELRULE: + nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWSET: + list_del(&nft_trans_set(trans)->list); + break; + case NFT_MSG_DELSET: + list_add_tail(&nft_trans_set(trans)->list, + &trans->ctx.table->sets); + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWSETELEM: + set = nft_trans_elem_set(trans); + set->ops->get(set, &nft_trans_elem(trans)); + set->ops->remove(set, &nft_trans_elem(trans)); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSETELEM: + nft_trans_destroy(trans); + break; + } + } + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_del(&trans->list); + trans->ctx.nla = NULL; + call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu); + } + + return 0; +} + static const struct nfnetlink_subsystem nf_tables_subsys = { .name = "nf_tables", .subsys_id = NFNL_SUBSYS_NFTABLES, diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index bd0d41e6934..cc560301624 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -215,22 +215,14 @@ static void nft_ct_l3proto_module_put(uint8_t family) nf_ct_l3proto_module_put(family); } -static int nft_ct_init_validate_get(const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_ct_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); + int err; - if (tb[NFTA_CT_DIRECTION] != NULL) { - priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); - switch (priv->dir) { - case IP_CT_DIR_ORIGINAL: - case IP_CT_DIR_REPLY: - break; - default: - return -EINVAL; - } - } - + priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (priv->key) { case NFT_CT_STATE: case NFT_CT_DIRECTION: @@ -262,55 +254,55 @@ static int nft_ct_init_validate_get(const struct nft_expr *expr, return -EOPNOTSUPP; } - return 0; -} - -static int nft_ct_init_validate_set(uint32_t key) -{ - switch (key) { - case NFT_CT_MARK: - break; - default: - return -EOPNOTSUPP; + if (tb[NFTA_CT_DIRECTION] != NULL) { + priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); + switch (priv->dir) { + case IP_CT_DIR_ORIGINAL: + case IP_CT_DIR_REPLY: + break; + default: + return -EINVAL; + } } + priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + err = nft_ct_l3proto_try_module_get(ctx->afi->family); + if (err < 0) + return err; + return 0; } -static int nft_ct_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_ct_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); int err; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); - - if (tb[NFTA_CT_DREG]) { - err = nft_ct_init_validate_get(expr, tb); - if (err < 0) - return err; - - priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG])); - err = nft_validate_output_register(priv->dreg); - if (err < 0) - return err; - - err = nft_validate_data_load(ctx, priv->dreg, NULL, - NFT_DATA_VALUE); - if (err < 0) - return err; - } else { - err = nft_ct_init_validate_set(priv->key); - if (err < 0) - return err; - - priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG])); - err = nft_validate_input_register(priv->sreg); - if (err < 0) - return err; + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_MARK + case NFT_CT_MARK: + break; +#endif + default: + return -EOPNOTSUPP; } + priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG])); + err = nft_validate_input_register(priv->sreg); + if (err < 0) + return err; + err = nft_ct_l3proto_try_module_get(ctx->afi->family); if (err < 0) return err; @@ -370,7 +362,7 @@ static const struct nft_expr_ops nft_ct_get_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_eval, - .init = nft_ct_init, + .init = nft_ct_get_init, .destroy = nft_ct_destroy, .dump = nft_ct_get_dump, }; @@ -379,7 +371,7 @@ static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_eval, - .init = nft_ct_init, + .init = nft_ct_set_init, .destroy = nft_ct_destroy, .dump = nft_ct_set_dump, }; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 3b1ad876d6b..1dfeb678683 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/list.h> +#include <linux/log2.h> #include <linux/jhash.h> #include <linux/netlink.h> #include <linux/vmalloc.h> @@ -19,7 +20,7 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> -#define NFT_HASH_MIN_SIZE 4 +#define NFT_HASH_MIN_SIZE 4UL struct nft_hash { struct nft_hash_table __rcu *tbl; @@ -27,7 +28,6 @@ struct nft_hash { struct nft_hash_table { unsigned int size; - unsigned int elements; struct nft_hash_elem __rcu *buckets[]; }; @@ -82,6 +82,11 @@ static void nft_hash_tbl_free(const struct nft_hash_table *tbl) kfree(tbl); } +static unsigned int nft_hash_tbl_size(unsigned int nelem) +{ + return max(roundup_pow_of_two(nelem * 4 / 3), NFT_HASH_MIN_SIZE); +} + static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets) { struct nft_hash_table *tbl; @@ -161,7 +166,6 @@ static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv) break; } } - ntbl->elements = tbl->elements; /* Publish new table */ rcu_assign_pointer(priv->tbl, ntbl); @@ -201,7 +205,6 @@ static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv) ; RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]); } - ntbl->elements = tbl->elements; /* Publish new table */ rcu_assign_pointer(priv->tbl, ntbl); @@ -237,10 +240,9 @@ static int nft_hash_insert(const struct nft_set *set, h = nft_hash_data(&he->key, tbl->size, set->klen); RCU_INIT_POINTER(he->next, tbl->buckets[h]); rcu_assign_pointer(tbl->buckets[h], he); - tbl->elements++; /* Expand table when exceeding 75% load */ - if (tbl->elements > tbl->size / 4 * 3) + if (set->nelems + 1 > tbl->size / 4 * 3) nft_hash_tbl_expand(set, priv); return 0; @@ -268,10 +270,9 @@ static void nft_hash_remove(const struct nft_set *set, RCU_INIT_POINTER(*pprev, he->next); synchronize_rcu(); kfree(he); - tbl->elements--; /* Shrink table beneath 30% load */ - if (tbl->elements < tbl->size * 3 / 10 && + if (set->nelems - 1 < tbl->size * 3 / 10 && tbl->size > NFT_HASH_MIN_SIZE) nft_hash_tbl_shrink(set, priv); } @@ -335,17 +336,23 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) } static int nft_hash_init(const struct nft_set *set, + const struct nft_set_desc *desc, const struct nlattr * const tb[]) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_table *tbl; + unsigned int size; if (unlikely(!nft_hash_rnd_initted)) { get_random_bytes(&nft_hash_rnd, 4); nft_hash_rnd_initted = true; } - tbl = nft_hash_tbl_alloc(NFT_HASH_MIN_SIZE); + size = NFT_HASH_MIN_SIZE; + if (desc->size) + size = nft_hash_tbl_size(desc->size); + + tbl = nft_hash_tbl_alloc(size); if (tbl == NULL) return -ENOMEM; RCU_INIT_POINTER(priv->tbl, tbl); @@ -369,8 +376,37 @@ static void nft_hash_destroy(const struct nft_set *set) kfree(tbl); } +static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned int esize; + + esize = sizeof(struct nft_hash_elem); + if (features & NFT_SET_MAP) + esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]); + + if (desc->size) { + est->size = sizeof(struct nft_hash) + + nft_hash_tbl_size(desc->size) * + sizeof(struct nft_hash_elem *) + + desc->size * esize; + } else { + /* Resizing happens when the load drops below 30% or goes + * above 75%. The average of 52.5% load (approximated by 50%) + * is used for the size estimation of the hash buckets, + * meaning we calculate two buckets per element. + */ + est->size = esize + 2 * sizeof(struct nft_hash_elem *); + } + + est->class = NFT_SET_CLASS_O_1; + + return true; +} + static struct nft_set_ops nft_hash_ops __read_mostly = { .privsize = nft_hash_privsize, + .estimate = nft_hash_estimate, .init = nft_hash_init, .destroy = nft_hash_destroy, .get = nft_hash_get, diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 7fd2bea8aa2..6404a726d17 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -56,8 +56,14 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return -EINVAL; set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]); - if (IS_ERR(set)) - return PTR_ERR(set); + if (IS_ERR(set)) { + if (tb[NFTA_LOOKUP_SET_ID]) { + set = nf_tables_set_lookup_byid(ctx->net, + tb[NFTA_LOOKUP_SET_ID]); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } priv->sreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_SREG])); err = nft_validate_input_register(priv->sreg); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 425cf39af89..852b178c6ae 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -18,18 +18,11 @@ #include <net/sock.h> #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> -struct nft_meta { - enum nft_meta_keys key:8; - union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; - }; -}; - -static void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; @@ -140,10 +133,11 @@ static void nft_meta_get_eval(const struct nft_expr *expr, err: data[NFT_REG_VERDICT].verdict = NFT_BREAK; } +EXPORT_SYMBOL_GPL(nft_meta_get_eval); -static void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; @@ -163,28 +157,24 @@ static void nft_meta_set_eval(const struct nft_expr *expr, WARN_ON(1); } } +EXPORT_SYMBOL_GPL(nft_meta_set_eval); -static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, [NFTA_META_KEY] = { .type = NLA_U32 }, [NFTA_META_SREG] = { .type = NLA_U32 }, }; +EXPORT_SYMBOL_GPL(nft_meta_policy); -static int nft_meta_init_validate_set(uint32_t key) +int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { - switch (key) { - case NFT_META_MARK: - case NFT_META_PRIORITY: - case NFT_META_NFTRACE: - return 0; - default: - return -EOPNOTSUPP; - } -} + struct nft_meta *priv = nft_expr_priv(expr); + int err; -static int nft_meta_init_validate_get(uint32_t key) -{ - switch (key) { + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { case NFT_META_LEN: case NFT_META_PROTOCOL: case NFT_META_NFPROTO: @@ -205,39 +195,41 @@ static int nft_meta_init_validate_get(uint32_t key) #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: #endif - return 0; + break; default: return -EOPNOTSUPP; } + priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); + err = nft_validate_output_register(priv->dreg); + if (err < 0) + return err; + + err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + if (err < 0) + return err; + + return 0; } +EXPORT_SYMBOL_GPL(nft_meta_get_init); -static int nft_meta_init(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); int err; priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); - - if (tb[NFTA_META_DREG]) { - err = nft_meta_init_validate_get(priv->key); - if (err < 0) - return err; - - priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); - err = nft_validate_output_register(priv->dreg); - if (err < 0) - return err; - - return nft_validate_data_load(ctx, priv->dreg, NULL, - NFT_DATA_VALUE); + switch (priv->key) { + case NFT_META_MARK: + case NFT_META_PRIORITY: + case NFT_META_NFTRACE: + break; + default: + return -EOPNOTSUPP; } - err = nft_meta_init_validate_set(priv->key); - if (err < 0) - return err; - priv->sreg = ntohl(nla_get_be32(tb[NFTA_META_SREG])); err = nft_validate_input_register(priv->sreg); if (err < 0) @@ -245,9 +237,10 @@ static int nft_meta_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return 0; } +EXPORT_SYMBOL_GPL(nft_meta_set_init); -static int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) +int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -260,9 +253,10 @@ static int nft_meta_get_dump(struct sk_buff *skb, nla_put_failure: return -1; } +EXPORT_SYMBOL_GPL(nft_meta_get_dump); -static int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr) +int nft_meta_set_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -276,13 +270,14 @@ static int nft_meta_set_dump(struct sk_buff *skb, nla_put_failure: return -1; } +EXPORT_SYMBOL_GPL(nft_meta_set_dump); static struct nft_expr_type nft_meta_type; static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_get_eval, - .init = nft_meta_init, + .init = nft_meta_get_init, .dump = nft_meta_get_dump, }; @@ -290,7 +285,7 @@ static const struct nft_expr_ops nft_meta_set_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_set_eval, - .init = nft_meta_init, + .init = nft_meta_set_init, .dump = nft_meta_set_dump, }; diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index e21d69d1350..072e611e9f7 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -201,6 +201,7 @@ static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[]) } static int nft_rbtree_init(const struct nft_set *set, + const struct nft_set_desc *desc, const struct nlattr * const nla[]) { struct nft_rbtree *priv = nft_set_priv(set); @@ -222,8 +223,28 @@ static void nft_rbtree_destroy(const struct nft_set *set) } } +static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned int nsize; + + nsize = sizeof(struct nft_rbtree_elem); + if (features & NFT_SET_MAP) + nsize += FIELD_SIZEOF(struct nft_rbtree_elem, data[0]); + + if (desc->size) + est->size = sizeof(struct nft_rbtree) + desc->size * nsize; + else + est->size = nsize; + + est->class = NFT_SET_CLASS_O_LOG_N; + + return true; +} + static struct nft_set_ops nft_rbtree_ops __read_mostly = { .privsize = nft_rbtree_privsize, + .estimate = nft_rbtree_estimate, .init = nft_rbtree_init, .destroy = nft_rbtree_destroy, .insert = nft_rbtree_insert, |