From 50ac6607845755e594c8a39b9c6a00d1c9b48ea4 Mon Sep 17 00:00:00 2001 From: Amitkumar Karwar Date: Tue, 25 Jun 2013 19:03:56 -0700 Subject: cfg80211/nl80211: rename packet pattern related structures and enums Currently packet patterns and it's enum/structures are used only for WoWLAN feature. As we intend to reuse them for new feature packet coalesce, they are renamed in this patch. Older names are kept for backward compatibility purpose. Signed-off-by: Amitkumar Karwar Signed-off-by: Bing Zhao Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 +++--- include/uapi/linux/nl80211.h | 45 ++++++++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7b0730aeb89..207b079a8fa 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1698,7 +1698,7 @@ struct cfg80211_pmksa { }; /** - * struct cfg80211_wowlan_trig_pkt_pattern - packet pattern + * struct cfg80211_pkt_pattern - packet pattern * @mask: bitmask where to match pattern and where to ignore bytes, * one bit per byte, in same format as nl80211 * @pattern: bytes to match where bitmask is 1 @@ -1708,7 +1708,7 @@ struct cfg80211_pmksa { * Internal note: @mask and @pattern are allocated in one chunk of * memory, free @mask only! */ -struct cfg80211_wowlan_trig_pkt_pattern { +struct cfg80211_pkt_pattern { u8 *mask, *pattern; int pattern_len; int pkt_offset; @@ -1770,7 +1770,7 @@ struct cfg80211_wowlan { bool any, disconnect, magic_pkt, gtk_rekey_failure, eap_identity_req, four_way_handshake, rfkill_release; - struct cfg80211_wowlan_trig_pkt_pattern *patterns; + struct cfg80211_pkt_pattern *patterns; struct cfg80211_wowlan_tcp *tcp; int n_patterns; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 861e5eba395..de0ce809068 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3060,11 +3060,11 @@ enum nl80211_tx_power_setting { }; /** - * enum nl80211_wowlan_packet_pattern_attr - WoWLAN packet pattern attribute - * @__NL80211_WOWLAN_PKTPAT_INVALID: invalid number for nested attribute - * @NL80211_WOWLAN_PKTPAT_PATTERN: the pattern, values where the mask has + * enum nl80211_packet_pattern_attr - packet pattern attribute + * @__NL80211_PKTPAT_INVALID: invalid number for nested attribute + * @NL80211_PKTPAT_PATTERN: the pattern, values where the mask has * a zero bit are ignored - * @NL80211_WOWLAN_PKTPAT_MASK: pattern mask, must be long enough to have + * @NL80211_PKTPAT_MASK: pattern mask, must be long enough to have * a bit for each byte in the pattern. The lowest-order bit corresponds * to the first byte of the pattern, but the bytes of the pattern are * in a little-endian-like format, i.e. the 9th byte of the pattern @@ -3075,23 +3075,23 @@ enum nl80211_tx_power_setting { * Note that the pattern matching is done as though frames were not * 802.11 frames but 802.3 frames, i.e. the frame is fully unpacked * first (including SNAP header unpacking) and then matched. - * @NL80211_WOWLAN_PKTPAT_OFFSET: packet offset, pattern is matched after + * @NL80211_PKTPAT_OFFSET: packet offset, pattern is matched after * these fixed number of bytes of received packet - * @NUM_NL80211_WOWLAN_PKTPAT: number of attributes - * @MAX_NL80211_WOWLAN_PKTPAT: max attribute number + * @NUM_NL80211_PKTPAT: number of attributes + * @MAX_NL80211_PKTPAT: max attribute number */ -enum nl80211_wowlan_packet_pattern_attr { - __NL80211_WOWLAN_PKTPAT_INVALID, - NL80211_WOWLAN_PKTPAT_MASK, - NL80211_WOWLAN_PKTPAT_PATTERN, - NL80211_WOWLAN_PKTPAT_OFFSET, +enum nl80211_packet_pattern_attr { + __NL80211_PKTPAT_INVALID, + NL80211_PKTPAT_MASK, + NL80211_PKTPAT_PATTERN, + NL80211_PKTPAT_OFFSET, - NUM_NL80211_WOWLAN_PKTPAT, - MAX_NL80211_WOWLAN_PKTPAT = NUM_NL80211_WOWLAN_PKTPAT - 1, + NUM_NL80211_PKTPAT, + MAX_NL80211_PKTPAT = NUM_NL80211_PKTPAT - 1, }; /** - * struct nl80211_wowlan_pattern_support - pattern support information + * struct nl80211_pattern_support - packet pattern support information * @max_patterns: maximum number of patterns supported * @min_pattern_len: minimum length of each pattern * @max_pattern_len: maximum length of each pattern @@ -3101,13 +3101,22 @@ enum nl80211_wowlan_packet_pattern_attr { * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED in the * capability information given by the kernel to userspace. */ -struct nl80211_wowlan_pattern_support { +struct nl80211_pattern_support { __u32 max_patterns; __u32 min_pattern_len; __u32 max_pattern_len; __u32 max_pkt_offset; } __attribute__((packed)); +/* only for backward compatibility */ +#define __NL80211_WOWLAN_PKTPAT_INVALID __NL80211_PKTPAT_INVALID +#define NL80211_WOWLAN_PKTPAT_MASK NL80211_PKTPAT_MASK +#define NL80211_WOWLAN_PKTPAT_PATTERN NL80211_PKTPAT_PATTERN +#define NL80211_WOWLAN_PKTPAT_OFFSET NL80211_PKTPAT_OFFSET +#define NUM_NL80211_WOWLAN_PKTPAT NUM_NL80211_PKTPAT +#define MAX_NL80211_WOWLAN_PKTPAT MAX_NL80211_PKTPAT +#define nl80211_wowlan_pattern_support nl80211_pattern_support + /** * enum nl80211_wowlan_triggers - WoWLAN trigger definitions * @__NL80211_WOWLAN_TRIG_INVALID: invalid number for nested attributes @@ -3127,7 +3136,7 @@ struct nl80211_wowlan_pattern_support { * pattern matching is done after the packet is converted to the MSDU. * * In %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED, it is a binary attribute - * carrying a &struct nl80211_wowlan_pattern_support. + * carrying a &struct nl80211_pattern_support. * * When reporting wakeup. it is a u32 attribute containing the 0-based * index of the pattern that caused the wakeup, in the patterns passed @@ -3284,7 +3293,7 @@ struct nl80211_wowlan_tcp_data_token_feature { * @NL80211_WOWLAN_TCP_WAKE_PAYLOAD: wake packet payload, for advertising a * u32 attribute holding the maximum length * @NL80211_WOWLAN_TCP_WAKE_MASK: Wake packet payload mask, not used for - * feature advertising. The mask works like @NL80211_WOWLAN_PKTPAT_MASK + * feature advertising. The mask works like @NL80211_PKTPAT_MASK * but on the TCP payload only. * @NUM_NL80211_WOWLAN_TCP: number of TCP attributes * @MAX_NL80211_WOWLAN_TCP: highest attribute number -- cgit v1.2.3-70-g09d2 From 803768f54ef84cb4aaac0b51274b11b31885588c Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Fri, 28 Jun 2013 10:39:58 +0200 Subject: nl80211: enable HT overrides for ibss Signed-off-by: Simon Wunderlich Signed-off-by: Mathias Kretschmer Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 +++++ net/wireless/nl80211.c | 13 +++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 207b079a8fa..79c2277e7c6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1592,6 +1592,9 @@ struct cfg80211_disassoc_request { * user space. Otherwise, port is marked authorized by default. * @basic_rates: bitmap of basic rates to use when creating the IBSS * @mcast_rate: per-band multicast rate index + 1 (0: disabled) + * @ht_capa: HT Capabilities over-rides. Values set in ht_capa_mask + * will be used in ht_capa. Un-supported values will be ignored. + * @ht_capa_mask: The bits of ht_capa which are to be used. */ struct cfg80211_ibss_params { u8 *ssid; @@ -1605,6 +1608,8 @@ struct cfg80211_ibss_params { bool privacy; bool control_port; int mcast_rate[IEEE80211_NUM_BANDS]; + struct ieee80211_ht_cap ht_capa; + struct ieee80211_ht_cap ht_capa_mask; }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a044762f5ea..0492478ab74 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6346,6 +6346,19 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) return err; } + if (info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) + memcpy(&ibss.ht_capa_mask, + nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]), + sizeof(ibss.ht_capa_mask)); + + if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) { + if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) + return -EINVAL; + memcpy(&ibss.ht_capa, + nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]), + sizeof(ibss.ht_capa)); + } + if (info->attrs[NL80211_ATTR_MCAST_RATE] && !nl80211_parse_mcast_rate(rdev, ibss.mcast_rate, nla_get_u32(info->attrs[NL80211_ATTR_MCAST_RATE]))) -- cgit v1.2.3-70-g09d2 From ad24b0da9ecec433091f74596843703e4cf5da77 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 5 Jul 2013 11:53:28 +0200 Subject: wireless: indent kernel-doc with tabs Almost everywhere tabs are used to indent continuation lines, replace the few places that use spaces. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 16 ++++++++-------- include/net/mac80211.h | 24 ++++++++++++------------ 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 79c2277e7c6..49409602fe3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -490,7 +490,7 @@ enum survey_info_flags { * @channel: the channel this survey record reports, mandatory * @filled: bitflag of flags from &enum survey_info_flags * @noise: channel noise in dBm. This and all following fields are - * optional + * optional * @channel_time: amount of time in ms the radio spent on the channel * @channel_time_busy: amount of time the primary channel was sensed busy * @channel_time_ext_busy: amount of time the extension channel was sensed busy @@ -546,9 +546,9 @@ struct cfg80211_crypto_settings { /** * struct cfg80211_beacon_data - beacon data * @head: head portion of beacon (before TIM IE) - * or %NULL if not changed + * or %NULL if not changed * @tail: tail portion of beacon (after TIM IE) - * or %NULL if not changed + * or %NULL if not changed * @head_len: length of @head * @tail_len: length of @tail * @beacon_ies: extra information element(s) to add into Beacon frames or %NULL @@ -764,7 +764,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @STATION_INFO_PLINK_STATE: @plink_state filled * @STATION_INFO_SIGNAL: @signal filled * @STATION_INFO_TX_BITRATE: @txrate fields are filled - * (tx_bitrate, tx_bitrate_flags and tx_bitrate_mcs) + * (tx_bitrate, tx_bitrate_flags and tx_bitrate_mcs) * @STATION_INFO_RX_PACKETS: @rx_packets filled with 32-bit value * @STATION_INFO_TX_PACKETS: @tx_packets filled with 32-bit value * @STATION_INFO_TX_RETRIES: @tx_retries filled @@ -1509,7 +1509,7 @@ enum cfg80211_assoc_req_flags { * @prev_bssid: previous BSSID, if not %NULL use reassociate frame * @flags: See &enum cfg80211_assoc_req_flags * @ht_capa: HT Capabilities over-rides. Values set in ht_capa_mask - * will be used in ht_capa. Un-supported values will be ignored. + * will be used in ht_capa. Un-supported values will be ignored. * @ht_capa_mask: The bits of ht_capa which are to be used. * @vht_capa: VHT capability override * @vht_capa_mask: VHT capability mask indicating which fields to use @@ -1593,7 +1593,7 @@ struct cfg80211_disassoc_request { * @basic_rates: bitmap of basic rates to use when creating the IBSS * @mcast_rate: per-band multicast rate index + 1 (0: disabled) * @ht_capa: HT Capabilities over-rides. Values set in ht_capa_mask - * will be used in ht_capa. Un-supported values will be ignored. + * will be used in ht_capa. Un-supported values will be ignored. * @ht_capa_mask: The bits of ht_capa which are to be used. */ struct cfg80211_ibss_params { @@ -1635,9 +1635,9 @@ struct cfg80211_ibss_params { * @key: WEP key for shared key authentication * @flags: See &enum cfg80211_assoc_req_flags * @bg_scan_period: Background scan period in seconds - * or -1 to indicate that default value is to be used. + * or -1 to indicate that default value is to be used. * @ht_capa: HT Capabilities over-rides. Values set in ht_capa_mask - * will be used in ht_capa. Un-supported values will be ignored. + * will be used in ht_capa. Un-supported values will be ignored. * @ht_capa_mask: The bits of ht_capa which are to be used. * @vht_capa: VHT Capability overrides * @vht_capa_mask: The bits of vht_capa which are to be used. diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5b7a3dadadd..7c5dc787a8e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1004,11 +1004,11 @@ enum ieee80211_smps_mode { * @radar_enabled: whether radar detection is enabled * * @long_frame_max_tx_count: Maximum number of transmissions for a "long" frame - * (a frame not RTS protected), called "dot11LongRetryLimit" in 802.11, - * but actually means the number of transmissions not the number of retries + * (a frame not RTS protected), called "dot11LongRetryLimit" in 802.11, + * but actually means the number of transmissions not the number of retries * @short_frame_max_tx_count: Maximum number of transmissions for a "short" - * frame, called "dot11ShortRetryLimit" in 802.11, but actually means the - * number of transmissions not the number of retries + * frame, called "dot11ShortRetryLimit" in 802.11, but actually means the + * number of transmissions not the number of retries * * @smps_mode: spatial multiplexing powersave mode; note that * %IEEE80211_SMPS_STATIC is used when the device is not @@ -1092,7 +1092,7 @@ enum ieee80211_vif_flags { * be off when it is %NULL there can still be races and packets could be * processed after it switches back to %NULL. * @debugfs_dir: debugfs dentry, can be used by drivers to create own per - * interface debug files. Note that it will be NULL for the virtual + * interface debug files. Note that it will be NULL for the virtual * monitor interface (if that is requested.) * @drv_priv: data area for driver use, will always be aligned to * sizeof(void *). @@ -1425,10 +1425,10 @@ struct ieee80211_tx_control { * the stack. * * @IEEE80211_HW_CONNECTION_MONITOR: - * The hardware performs its own connection monitoring, including - * periodic keep-alives to the AP and probing the AP on beacon loss. - * When this flag is set, signaling beacon-loss will cause an immediate - * change to disassociated state. + * The hardware performs its own connection monitoring, including + * periodic keep-alives to the AP and probing the AP on beacon loss. + * When this flag is set, signaling beacon-loss will cause an immediate + * change to disassociated state. * * @IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC: * This device needs to get data from beacon before association (i.e. @@ -1526,10 +1526,10 @@ enum ieee80211_hw_flags { * @channel_change_time: time (in microseconds) it takes to change channels. * * @max_signal: Maximum value for signal (rssi) in RX information, used - * only when @IEEE80211_HW_SIGNAL_UNSPEC or @IEEE80211_HW_SIGNAL_DB + * only when @IEEE80211_HW_SIGNAL_UNSPEC or @IEEE80211_HW_SIGNAL_DB * * @max_listen_interval: max listen interval in units of beacon interval - * that HW supports + * that HW supports * * @queues: number of available hardware transmit queues for * data packets. WMM/QoS requires at least four, these @@ -2443,7 +2443,7 @@ enum ieee80211_roc_type { * The callback can sleep. * * @set_tsf: Set the TSF timer to the specified value in the firmware/hardware. - * Currently, this is only used for IBSS mode debugging. Is not a + * Currently, this is only used for IBSS mode debugging. Is not a * required function. * The callback can sleep. * -- cgit v1.2.3-70-g09d2 From be29b99a9b51b0338eea3c66a58de53bbd01de24 Mon Sep 17 00:00:00 2001 From: Amitkumar Karwar Date: Fri, 28 Jun 2013 11:51:26 -0700 Subject: cfg80211/nl80211: Add packet coalesce support In most cases, host that receives IPv4 and IPv6 multicast/broadcast packets does not do anything with these packets. Therefore the reception of these unwanted packets causes unnecessary processing and power consumption. Packet coalesce feature helps to reduce number of received interrupts to host by buffering these packets in firmware/hardware for some predefined time. Received interrupt will be generated when one of the following events occur. a) Expiration of hardware timer whose expiration time is set to maximum coalescing delay of matching coalesce rule. b) Coalescing buffer in hardware reaches it's limit. c) Packet doesn't match any of the configured coalesce rules. This patch adds set/get configuration support for packet coalesce. User needs to configure following parameters for creating a coalesce rule. a) Maximum coalescing delay b) List of packet patterns which needs to be matched c) Condition for coalescence. pattern 'match' or 'no match' Multiple such rules can be created. This feature needs to be advertised during driver initialization. Drivers are supposed to do required firmware/hardware settings based on user configuration. Signed-off-by: Amitkumar Karwar Signed-off-by: Bing Zhao [fix kernel-doc, change free function, fix copy/paste error] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 54 ++++++++ include/uapi/linux/nl80211.h | 90 ++++++++++++- net/wireless/core.c | 9 ++ net/wireless/core.h | 2 + net/wireless/nl80211.c | 308 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.h | 2 + 6 files changed, 463 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 49409602fe3..071ed2395c9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1780,6 +1780,35 @@ struct cfg80211_wowlan { int n_patterns; }; +/** + * struct cfg80211_coalesce_rules - Coalesce rule parameters + * + * This structure defines coalesce rule for the device. + * @delay: maximum coalescing delay in msecs. + * @condition: condition for packet coalescence. + * see &enum nl80211_coalesce_condition. + * @patterns: array of packet patterns + * @n_patterns: number of patterns + */ +struct cfg80211_coalesce_rules { + int delay; + enum nl80211_coalesce_condition condition; + struct cfg80211_pkt_pattern *patterns; + int n_patterns; +}; + +/** + * struct cfg80211_coalesce - Packet coalescing settings + * + * This structure defines coalescing settings. + * @rules: array of coalesce rules + * @n_rules: number of rules + */ +struct cfg80211_coalesce { + struct cfg80211_coalesce_rules *rules; + int n_rules; +}; + /** * struct cfg80211_wowlan_wakeup - wakeup report * @disconnect: woke up by getting disconnected @@ -2076,6 +2105,7 @@ struct cfg80211_update_ft_ies_params { * driver can take the most appropriate actions. * @crit_proto_stop: Indicates critical protocol no longer needs increased link * reliability. This operation can not fail. + * @set_coalesce: Set coalesce parameters. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2311,6 +2341,8 @@ struct cfg80211_ops { u16 duration); void (*crit_proto_stop)(struct wiphy *wiphy, struct wireless_dev *wdev); + int (*set_coalesce)(struct wiphy *wiphy, + struct cfg80211_coalesce *coalesce); }; /* @@ -2536,6 +2568,25 @@ struct wiphy_wowlan_support { const struct wiphy_wowlan_tcp_support *tcp; }; +/** + * struct wiphy_coalesce_support - coalesce support data + * @n_rules: maximum number of coalesce rules + * @max_delay: maximum supported coalescing delay in msecs + * @n_patterns: number of supported patterns in a rule + * (see nl80211.h for the pattern definition) + * @pattern_max_len: maximum length of each pattern + * @pattern_min_len: minimum length of each pattern + * @max_pkt_offset: maximum Rx packet offset + */ +struct wiphy_coalesce_support { + int n_rules; + int max_delay; + int n_patterns; + int pattern_max_len; + int pattern_min_len; + int max_pkt_offset; +}; + /** * struct wiphy - wireless hardware description * @reg_notifier: the driver's regulatory notification callback, @@ -2646,6 +2697,7 @@ struct wiphy_wowlan_support { * 802.11-2012 8.4.2.29 for the defined fields. * @extended_capabilities_mask: mask of the valid values * @extended_capabilities_len: length of the extended capabilities + * @coalesce: packet coalescing support information */ struct wiphy { /* assign these fields before you register the wiphy */ @@ -2755,6 +2807,8 @@ struct wiphy { const struct iw_handler_def *wext; #endif + const struct wiphy_coalesce_support *coalesce; + char priv[0] __aligned(NETDEV_ALIGN); }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index de0ce809068..5abc54d14d4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -125,6 +125,31 @@ * interfaces that a given device supports. */ +/** + * DOC: packet coalesce support + * + * In most cases, host that receives IPv4 and IPv6 multicast/broadcast + * packets does not do anything with these packets. Therefore the + * reception of these unwanted packets causes unnecessary processing + * and power consumption. + * + * Packet coalesce feature helps to reduce number of received interrupts + * to host by buffering these packets in firmware/hardware for some + * predefined time. Received interrupt will be generated when one of the + * following events occur. + * a) Expiration of hardware timer whose expiration time is set to maximum + * coalescing delay of matching coalesce rule. + * b) Coalescing buffer in hardware reaches it's limit. + * c) Packet doesn't match any of the configured coalesce rules. + * + * User needs to configure following parameters for creating a coalesce + * rule. + * a) Maximum coalescing delay + * b) List of packet patterns which needs to be matched + * c) Condition for coalescence. pattern 'match' or 'no match' + * Multiple such rules can be created. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -648,6 +673,9 @@ * @NL80211_CMD_CRIT_PROTOCOL_STOP: Indicates the connection reliability can * return back to normal. * + * @NL80211_CMD_GET_COALESCE: Get currently supported coalesce rules. + * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -810,6 +838,9 @@ enum nl80211_commands { NL80211_CMD_CRIT_PROTOCOL_START, NL80211_CMD_CRIT_PROTOCOL_STOP, + NL80211_CMD_GET_COALESCE, + NL80211_CMD_SET_COALESCE, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -1436,6 +1467,8 @@ enum nl80211_commands { * allowed to be used with the first @NL80211_CMD_SET_STATION command to * update a TDLS peer STA entry. * + * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1736,6 +1769,8 @@ enum nl80211_attrs { NL80211_ATTR_PEER_AID, + NL80211_ATTR_COALESCE_RULE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3098,8 +3133,10 @@ enum nl80211_packet_pattern_attr { * @max_pkt_offset: maximum Rx packet offset * * This struct is carried in %NL80211_WOWLAN_TRIG_PKT_PATTERN when - * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED in the - * capability information given by the kernel to userspace. + * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED or in + * %NL80211_ATTR_COALESCE_RULE_PKT_PATTERN when that is part of + * %NL80211_ATTR_COALESCE_RULE in the capability information given + * by the kernel to userspace. */ struct nl80211_pattern_support { __u32 max_patterns; @@ -3317,6 +3354,55 @@ enum nl80211_wowlan_tcp_attrs { MAX_NL80211_WOWLAN_TCP = NUM_NL80211_WOWLAN_TCP - 1 }; +/** + * struct nl80211_coalesce_rule_support - coalesce rule support information + * @max_rules: maximum number of rules supported + * @pat: packet pattern support information + * @max_delay: maximum supported coalescing delay in msecs + * + * This struct is carried in %NL80211_ATTR_COALESCE_RULE in the + * capability information given by the kernel to userspace. + */ +struct nl80211_coalesce_rule_support { + __u32 max_rules; + struct nl80211_pattern_support pat; + __u32 max_delay; +} __attribute__((packed)); + +/** + * enum nl80211_attr_coalesce_rule - coalesce rule attribute + * @__NL80211_COALESCE_RULE_INVALID: invalid number for nested attribute + * @NL80211_ATTR_COALESCE_RULE_DELAY: delay in msecs used for packet coalescing + * @NL80211_ATTR_COALESCE_RULE_CONDITION: condition for packet coalescence, + * see &enum nl80211_coalesce_condition. + * @NL80211_ATTR_COALESCE_RULE_PKT_PATTERN: packet offset, pattern is matched + * after these fixed number of bytes of received packet + * @NUM_NL80211_ATTR_COALESCE_RULE: number of attributes + * @NL80211_ATTR_COALESCE_RULE_MAX: max attribute number + */ +enum nl80211_attr_coalesce_rule { + __NL80211_COALESCE_RULE_INVALID, + NL80211_ATTR_COALESCE_RULE_DELAY, + NL80211_ATTR_COALESCE_RULE_CONDITION, + NL80211_ATTR_COALESCE_RULE_PKT_PATTERN, + + /* keep last */ + NUM_NL80211_ATTR_COALESCE_RULE, + NL80211_ATTR_COALESCE_RULE_MAX = NUM_NL80211_ATTR_COALESCE_RULE - 1 +}; + +/** + * enum nl80211_coalesce_condition - coalesce rule conditions + * @NL80211_COALESCE_CONDITION_MATCH: coalaesce Rx packets when patterns + * in a rule are matched. + * @NL80211_COALESCE_CONDITION_NO_MATCH: coalesce Rx packets when patterns + * in a rule are not matched. + */ +enum nl80211_coalesce_condition { + NL80211_COALESCE_CONDITION_MATCH, + NL80211_COALESCE_CONDITION_NO_MATCH +}; + /** * enum nl80211_iface_limit_attrs - limit attributes * @NL80211_IFACE_LIMIT_UNSPEC: (reserved) diff --git a/net/wireless/core.c b/net/wireless/core.c index 4f9f216665e..389a3f2ee46 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -462,6 +462,14 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; #endif + if (WARN_ON(wiphy->coalesce && + (!wiphy->coalesce->n_rules || + !wiphy->coalesce->n_patterns) && + (!wiphy->coalesce->pattern_min_len || + wiphy->coalesce->pattern_min_len > + wiphy->coalesce->pattern_max_len))) + return -EINVAL; + if (WARN_ON(wiphy->ap_sme_capa && !(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME))) return -EINVAL; @@ -668,6 +676,7 @@ void wiphy_unregister(struct wiphy *wiphy) rdev_set_wakeup(rdev, false); #endif cfg80211_rdev_free_wowlan(rdev); + cfg80211_rdev_free_coalesce(rdev); } EXPORT_SYMBOL(wiphy_unregister); diff --git a/net/wireless/core.h b/net/wireless/core.h index a6b45bf00f3..9ad43c619c5 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -79,6 +79,8 @@ struct cfg80211_registered_device { /* netlink port which started critical protocol (0 means not started) */ u32 crit_proto_nlportid; + struct cfg80211_coalesce *coalesce; + /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wiphy wiphy __aligned(NETDEV_ALIGN); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0492478ab74..6dca5a70017 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -403,6 +403,14 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = { [NL80211_WOWLAN_TCP_WAKE_MASK] = { .len = 1 }, }; +/* policy for coalesce rule attributes */ +static const struct nla_policy +nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { + [NL80211_ATTR_COALESCE_RULE_DELAY] = { .type = NLA_U32 }, + [NL80211_ATTR_COALESCE_RULE_CONDITION] = { .type = NLA_U32 }, + [NL80211_ATTR_COALESCE_RULE_PKT_PATTERN] = { .type = NLA_NESTED }, +}; + /* policy for GTK rekey offload attributes */ static const struct nla_policy nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { @@ -995,6 +1003,27 @@ static int nl80211_send_wowlan(struct sk_buff *msg, } #endif +static int nl80211_send_coalesce(struct sk_buff *msg, + struct cfg80211_registered_device *dev) +{ + struct nl80211_coalesce_rule_support rule; + + if (!dev->wiphy.coalesce) + return 0; + + rule.max_rules = dev->wiphy.coalesce->n_rules; + rule.max_delay = dev->wiphy.coalesce->max_delay; + rule.pat.max_patterns = dev->wiphy.coalesce->n_patterns; + rule.pat.min_pattern_len = dev->wiphy.coalesce->pattern_min_len; + rule.pat.max_pattern_len = dev->wiphy.coalesce->pattern_max_len; + rule.pat.max_pkt_offset = dev->wiphy.coalesce->max_pkt_offset; + + if (nla_put(msg, NL80211_ATTR_COALESCE_RULE, sizeof(rule), &rule)) + return -ENOBUFS; + + return 0; +} + static int nl80211_send_band_rateinfo(struct sk_buff *msg, struct ieee80211_supported_band *sband) { @@ -1513,6 +1542,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, dev->wiphy.vht_capa_mod_mask)) goto nla_put_failure; + state->split_start++; + break; + case 10: + if (nl80211_send_coalesce(msg, dev)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; @@ -8043,6 +8078,264 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) } #endif +static int nl80211_send_coalesce_rules(struct sk_buff *msg, + struct cfg80211_registered_device *rdev) +{ + struct nlattr *nl_pats, *nl_pat, *nl_rule, *nl_rules; + int i, j, pat_len; + struct cfg80211_coalesce_rules *rule; + + if (!rdev->coalesce->n_rules) + return 0; + + nl_rules = nla_nest_start(msg, NL80211_ATTR_COALESCE_RULE); + if (!nl_rules) + return -ENOBUFS; + + for (i = 0; i < rdev->coalesce->n_rules; i++) { + nl_rule = nla_nest_start(msg, i + 1); + if (!nl_rule) + return -ENOBUFS; + + rule = &rdev->coalesce->rules[i]; + if (nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_DELAY, + rule->delay)) + return -ENOBUFS; + + if (nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_CONDITION, + rule->condition)) + return -ENOBUFS; + + nl_pats = nla_nest_start(msg, + NL80211_ATTR_COALESCE_RULE_PKT_PATTERN); + if (!nl_pats) + return -ENOBUFS; + + for (j = 0; j < rule->n_patterns; j++) { + nl_pat = nla_nest_start(msg, j + 1); + if (!nl_pat) + return -ENOBUFS; + pat_len = rule->patterns[j].pattern_len; + if (nla_put(msg, NL80211_PKTPAT_MASK, + DIV_ROUND_UP(pat_len, 8), + rule->patterns[j].mask) || + nla_put(msg, NL80211_PKTPAT_PATTERN, pat_len, + rule->patterns[j].pattern) || + nla_put_u32(msg, NL80211_PKTPAT_OFFSET, + rule->patterns[j].pkt_offset)) + return -ENOBUFS; + nla_nest_end(msg, nl_pat); + } + nla_nest_end(msg, nl_pats); + nla_nest_end(msg, nl_rule); + } + nla_nest_end(msg, nl_rules); + + return 0; +} + +static int nl80211_get_coalesce(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct sk_buff *msg; + void *hdr; + + if (!rdev->wiphy.coalesce) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, + NL80211_CMD_GET_COALESCE); + if (!hdr) + goto nla_put_failure; + + if (rdev->coalesce && nl80211_send_coalesce_rules(msg, rdev)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} + +void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev) +{ + struct cfg80211_coalesce *coalesce = rdev->coalesce; + int i, j; + struct cfg80211_coalesce_rules *rule; + + if (!coalesce) + return; + + for (i = 0; i < coalesce->n_rules; i++) { + rule = &coalesce->rules[i]; + for (j = 0; j < rule->n_patterns; j++) + kfree(rule->patterns[j].mask); + kfree(rule->patterns); + } + kfree(coalesce->rules); + kfree(coalesce); + rdev->coalesce = NULL; +} + +static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, + struct nlattr *rule, + struct cfg80211_coalesce_rules *new_rule) +{ + int err, i; + const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce; + struct nlattr *tb[NUM_NL80211_ATTR_COALESCE_RULE], *pat; + int rem, pat_len, mask_len, pkt_offset, n_patterns = 0; + struct nlattr *pat_tb[NUM_NL80211_PKTPAT]; + + err = nla_parse(tb, NL80211_ATTR_COALESCE_RULE_MAX, nla_data(rule), + nla_len(rule), nl80211_coalesce_policy); + if (err) + return err; + + if (tb[NL80211_ATTR_COALESCE_RULE_DELAY]) + new_rule->delay = + nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_DELAY]); + if (new_rule->delay > coalesce->max_delay) + return -EINVAL; + + if (tb[NL80211_ATTR_COALESCE_RULE_CONDITION]) + new_rule->condition = + nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_CONDITION]); + if (new_rule->condition != NL80211_COALESCE_CONDITION_MATCH && + new_rule->condition != NL80211_COALESCE_CONDITION_NO_MATCH) + return -EINVAL; + + if (!tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN]) + return -EINVAL; + + nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN], + rem) + n_patterns++; + if (n_patterns > coalesce->n_patterns) + return -EINVAL; + + new_rule->patterns = kcalloc(n_patterns, sizeof(new_rule->patterns[0]), + GFP_KERNEL); + if (!new_rule->patterns) + return -ENOMEM; + + new_rule->n_patterns = n_patterns; + i = 0; + + nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN], + rem) { + nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat), + nla_len(pat), NULL); + if (!pat_tb[NL80211_PKTPAT_MASK] || + !pat_tb[NL80211_PKTPAT_PATTERN]) + return -EINVAL; + pat_len = nla_len(pat_tb[NL80211_PKTPAT_PATTERN]); + mask_len = DIV_ROUND_UP(pat_len, 8); + if (nla_len(pat_tb[NL80211_PKTPAT_MASK]) != mask_len) + return -EINVAL; + if (pat_len > coalesce->pattern_max_len || + pat_len < coalesce->pattern_min_len) + return -EINVAL; + + if (!pat_tb[NL80211_PKTPAT_OFFSET]) + pkt_offset = 0; + else + pkt_offset = nla_get_u32(pat_tb[NL80211_PKTPAT_OFFSET]); + if (pkt_offset > coalesce->max_pkt_offset) + return -EINVAL; + new_rule->patterns[i].pkt_offset = pkt_offset; + + new_rule->patterns[i].mask = + kmalloc(mask_len + pat_len, GFP_KERNEL); + if (!new_rule->patterns[i].mask) + return -ENOMEM; + new_rule->patterns[i].pattern = + new_rule->patterns[i].mask + mask_len; + memcpy(new_rule->patterns[i].mask, + nla_data(pat_tb[NL80211_PKTPAT_MASK]), mask_len); + new_rule->patterns[i].pattern_len = pat_len; + memcpy(new_rule->patterns[i].pattern, + nla_data(pat_tb[NL80211_PKTPAT_PATTERN]), pat_len); + i++; + } + + return 0; +} + +static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce; + struct cfg80211_coalesce new_coalesce = {}; + struct cfg80211_coalesce *n_coalesce; + int err, rem_rule, n_rules = 0, i, j; + struct nlattr *rule; + struct cfg80211_coalesce_rules *tmp_rule; + + if (!rdev->wiphy.coalesce || !rdev->ops->set_coalesce) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_COALESCE_RULE]) { + cfg80211_rdev_free_coalesce(rdev); + rdev->ops->set_coalesce(&rdev->wiphy, NULL); + return 0; + } + + nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE], + rem_rule) + n_rules++; + if (n_rules > coalesce->n_rules) + return -EINVAL; + + new_coalesce.rules = kcalloc(n_rules, sizeof(new_coalesce.rules[0]), + GFP_KERNEL); + if (!new_coalesce.rules) + return -ENOMEM; + + new_coalesce.n_rules = n_rules; + i = 0; + + nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE], + rem_rule) { + err = nl80211_parse_coalesce_rule(rdev, rule, + &new_coalesce.rules[i]); + if (err) + goto error; + + i++; + } + + err = rdev->ops->set_coalesce(&rdev->wiphy, &new_coalesce); + if (err) + goto error; + + n_coalesce = kmemdup(&new_coalesce, sizeof(new_coalesce), GFP_KERNEL); + if (!n_coalesce) { + err = -ENOMEM; + goto error; + } + cfg80211_rdev_free_coalesce(rdev); + rdev->coalesce = n_coalesce; + + return 0; +error: + for (i = 0; i < new_coalesce.n_rules; i++) { + tmp_rule = &new_coalesce.rules[i]; + for (j = 0; j < tmp_rule->n_patterns; j++) + kfree(tmp_rule->patterns[j].mask); + kfree(tmp_rule->patterns); + } + kfree(new_coalesce.rules); + + return err; +} + static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -9050,6 +9343,21 @@ static struct genl_ops nl80211_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_COALESCE, + .doit = nl80211_get_coalesce, + .policy = nl80211_policy, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_COALESCE, + .doit = nl80211_set_coalesce, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, } }; diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index a4073e808c1..44341bf53cf 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -74,4 +74,6 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev, enum nl80211_radar_event event, struct net_device *netdev, gfp_t gfp); +void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev); + #endif /* __NET_WIRELESS_NL80211_H */ -- cgit v1.2.3-70-g09d2 From dcd6eac1f3b5fa1df11dfa99da0cf75b76cfef97 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 8 Jul 2013 16:55:49 +0200 Subject: nl80211: add scan width to bss and scan request structs To allow scanning and working with 5 MHz and 10 MHz BSS, extend the inform bss commands and add wrappers to take 5 and 10 MHz bss into account. Signed-off-by: Simon Wunderlich Signed-off-by: Mathias Kretschmer Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 54 +++++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/nl80211.h | 18 +++++++++++++++ net/wireless/nl80211.c | 1 + net/wireless/scan.c | 31 +++++++++++++++---------- net/wireless/trace.h | 12 ++++++---- 5 files changed, 97 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 071ed2395c9..bae8614a450 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1285,6 +1285,7 @@ struct cfg80211_ssid { * @n_ssids: number of SSIDs * @channels: channels to scan on. * @n_channels: total number of channels to scan + * @scan_width: channel width for scanning * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets * @flags: bit field of flags controlling operation @@ -1300,6 +1301,7 @@ struct cfg80211_scan_request { struct cfg80211_ssid *ssids; int n_ssids; u32 n_channels; + enum nl80211_bss_scan_width scan_width; const u8 *ie; size_t ie_len; u32 flags; @@ -1333,6 +1335,7 @@ struct cfg80211_match_set { * @ssids: SSIDs to scan for (passed in the probe_reqs in active scans) * @n_ssids: number of SSIDs * @n_channels: total number of channels to scan + * @scan_width: channel width for scanning * @interval: interval between each scheduled scan cycle * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets @@ -1352,6 +1355,7 @@ struct cfg80211_sched_scan_request { struct cfg80211_ssid *ssids; int n_ssids; u32 n_channels; + enum nl80211_bss_scan_width scan_width; u32 interval; const u8 *ie; size_t ie_len; @@ -1403,6 +1407,7 @@ struct cfg80211_bss_ies { * for use in scan results and similar. * * @channel: channel this BSS is on + * @scan_width: width of the control channel * @bssid: BSSID of the BSS * @beacon_interval: the beacon interval as from the frame * @capability: the capability field in host byte order @@ -1424,6 +1429,7 @@ struct cfg80211_bss_ies { */ struct cfg80211_bss { struct ieee80211_channel *channel; + enum nl80211_bss_scan_width scan_width; const struct cfg80211_bss_ies __rcu *ies; const struct cfg80211_bss_ies __rcu *beacon_ies; @@ -3438,10 +3444,11 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy); void cfg80211_sched_scan_stopped(struct wiphy *wiphy); /** - * cfg80211_inform_bss_frame - inform cfg80211 of a received BSS frame + * cfg80211_inform_bss_width_frame - inform cfg80211 of a received BSS frame * * @wiphy: the wiphy reporting the BSS * @channel: The channel the frame was received on + * @scan_width: width of the control channel * @mgmt: the management frame (probe response or beacon) * @len: length of the management frame * @signal: the signal strength, type depends on the wiphy's signal_type @@ -3454,16 +3461,29 @@ void cfg80211_sched_scan_stopped(struct wiphy *wiphy); * Or %NULL on error. */ struct cfg80211_bss * __must_check +cfg80211_inform_bss_width_frame(struct wiphy *wiphy, + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width, + struct ieee80211_mgmt *mgmt, size_t len, + s32 signal, gfp_t gfp); + +static inline struct cfg80211_bss * __must_check cfg80211_inform_bss_frame(struct wiphy *wiphy, struct ieee80211_channel *channel, struct ieee80211_mgmt *mgmt, size_t len, - s32 signal, gfp_t gfp); + s32 signal, gfp_t gfp) +{ + return cfg80211_inform_bss_width_frame(wiphy, channel, + NL80211_BSS_CHAN_WIDTH_20, + mgmt, len, signal, gfp); +} /** * cfg80211_inform_bss - inform cfg80211 of a new BSS * * @wiphy: the wiphy reporting the BSS * @channel: The channel the frame was received on + * @scan_width: width of the control channel * @bssid: the BSSID of the BSS * @tsf: the TSF sent by the peer in the beacon/probe response (or 0) * @capability: the capability field sent by the peer @@ -3480,11 +3500,26 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, * Or %NULL on error. */ struct cfg80211_bss * __must_check +cfg80211_inform_bss_width(struct wiphy *wiphy, + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width, + const u8 *bssid, u64 tsf, u16 capability, + u16 beacon_interval, const u8 *ie, size_t ielen, + s32 signal, gfp_t gfp); + +static inline struct cfg80211_bss * __must_check cfg80211_inform_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, - s32 signal, gfp_t gfp); + s32 signal, gfp_t gfp) +{ + return cfg80211_inform_bss_width(wiphy, channel, + NL80211_BSS_CHAN_WIDTH_20, + bssid, tsf, capability, + beacon_interval, ie, ielen, signal, + gfp); +} struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, @@ -3530,6 +3565,19 @@ void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *bss); */ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *bss); +static inline enum nl80211_bss_scan_width +cfg80211_chandef_to_scan_width(const struct cfg80211_chan_def *chandef) +{ + switch (chandef->width) { + case NL80211_CHAN_WIDTH_5: + return NL80211_BSS_CHAN_WIDTH_5; + case NL80211_CHAN_WIDTH_10: + return NL80211_BSS_CHAN_WIDTH_10; + default: + return NL80211_BSS_CHAN_WIDTH_20; + } +} + /** * cfg80211_rx_mlme_mgmt - notification of processed MLME management frame * @dev: network device diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5abc54d14d4..eb68735b331 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2807,6 +2807,21 @@ enum nl80211_chan_width { NL80211_CHAN_WIDTH_10, }; +/** + * enum nl80211_bss_scan_width - control channel width for a BSS + * + * These values are used with the %NL80211_BSS_CHAN_WIDTH attribute. + * + * @NL80211_BSS_CHAN_WIDTH_20: control channel is 20 MHz wide or compatible + * @NL80211_BSS_CHAN_WIDTH_10: control channel is 10 MHz wide + * @NL80211_BSS_CHAN_WIDTH_5: control channel is 5 MHz wide + */ +enum nl80211_bss_scan_width { + NL80211_BSS_CHAN_WIDTH_20, + NL80211_BSS_CHAN_WIDTH_10, + NL80211_BSS_CHAN_WIDTH_5, +}; + /** * enum nl80211_bss - netlink attributes for a BSS * @@ -2831,6 +2846,8 @@ enum nl80211_chan_width { * @NL80211_BSS_BEACON_IES: binary attribute containing the raw information * elements from a Beacon frame (bin); not present if no Beacon frame has * yet been received + * @NL80211_BSS_CHAN_WIDTH: channel width of the control channel + * (u32, enum nl80211_bss_scan_width) * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -2847,6 +2864,7 @@ enum nl80211_bss { NL80211_BSS_STATUS, NL80211_BSS_SEEN_MS_AGO, NL80211_BSS_BEACON_IES, + NL80211_BSS_CHAN_WIDTH, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6dca5a70017..ef4c312cc92 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5674,6 +5674,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; if (nla_put_u16(msg, NL80211_BSS_CAPABILITY, res->capability) || nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) || + nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) || nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO, jiffies_to_msecs(jiffies - intbss->ts))) goto nla_put_failure; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ae8c186b50d..ad1e4068ce0 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -651,6 +651,8 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *dev, continue; if (bss->pub.channel != new->pub.channel) continue; + if (bss->pub.scan_width != new->pub.scan_width) + continue; if (rcu_access_pointer(bss->pub.beacon_ies)) continue; ies = rcu_access_pointer(bss->pub.ies); @@ -870,11 +872,12 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, /* Returned bss is reference counted and must be cleaned up appropriately. */ struct cfg80211_bss* -cfg80211_inform_bss(struct wiphy *wiphy, - struct ieee80211_channel *channel, - const u8 *bssid, u64 tsf, u16 capability, - u16 beacon_interval, const u8 *ie, size_t ielen, - s32 signal, gfp_t gfp) +cfg80211_inform_bss_width(struct wiphy *wiphy, + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width, + const u8 *bssid, u64 tsf, u16 capability, + u16 beacon_interval, const u8 *ie, size_t ielen, + s32 signal, gfp_t gfp) { struct cfg80211_bss_ies *ies; struct cfg80211_internal_bss tmp = {}, *res; @@ -892,6 +895,7 @@ cfg80211_inform_bss(struct wiphy *wiphy, memcpy(tmp.pub.bssid, bssid, ETH_ALEN); tmp.pub.channel = channel; + tmp.pub.scan_width = scan_width; tmp.pub.signal = signal; tmp.pub.beacon_interval = beacon_interval; tmp.pub.capability = capability; @@ -924,14 +928,15 @@ cfg80211_inform_bss(struct wiphy *wiphy, /* cfg80211_bss_update gives us a referenced result */ return &res->pub; } -EXPORT_SYMBOL(cfg80211_inform_bss); +EXPORT_SYMBOL(cfg80211_inform_bss_width); /* Returned bss is reference counted and must be cleaned up appropriately. */ struct cfg80211_bss * -cfg80211_inform_bss_frame(struct wiphy *wiphy, - struct ieee80211_channel *channel, - struct ieee80211_mgmt *mgmt, size_t len, - s32 signal, gfp_t gfp) +cfg80211_inform_bss_width_frame(struct wiphy *wiphy, + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width, + struct ieee80211_mgmt *mgmt, size_t len, + s32 signal, gfp_t gfp) { struct cfg80211_internal_bss tmp = {}, *res; struct cfg80211_bss_ies *ies; @@ -941,7 +946,8 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) != offsetof(struct ieee80211_mgmt, u.beacon.variable)); - trace_cfg80211_inform_bss_frame(wiphy, channel, mgmt, len, signal); + trace_cfg80211_inform_bss_width_frame(wiphy, channel, scan_width, mgmt, + len, signal); if (WARN_ON(!mgmt)) return NULL; @@ -976,6 +982,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN); tmp.pub.channel = channel; + tmp.pub.scan_width = scan_width; tmp.pub.signal = signal; tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); @@ -991,7 +998,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, /* cfg80211_bss_update gives us a referenced result */ return &res->pub; } -EXPORT_SYMBOL(cfg80211_inform_bss_frame); +EXPORT_SYMBOL(cfg80211_inform_bss_width_frame); void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { diff --git a/net/wireless/trace.h b/net/wireless/trace.h index e1534baf2eb..09af6eb426a 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2391,26 +2391,30 @@ TRACE_EVENT(cfg80211_get_bss, __entry->capa_mask, __entry->capa_val) ); -TRACE_EVENT(cfg80211_inform_bss_frame, +TRACE_EVENT(cfg80211_inform_bss_width_frame, TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width, struct ieee80211_mgmt *mgmt, size_t len, s32 signal), - TP_ARGS(wiphy, channel, mgmt, len, signal), + TP_ARGS(wiphy, channel, scan_width, mgmt, len, signal), TP_STRUCT__entry( WIPHY_ENTRY CHAN_ENTRY + __field(enum nl80211_bss_scan_width, scan_width) __dynamic_array(u8, mgmt, len) __field(s32, signal) ), TP_fast_assign( WIPHY_ASSIGN; CHAN_ASSIGN(channel); + __entry->scan_width = scan_width; if (mgmt) memcpy(__get_dynamic_array(mgmt), mgmt, len); __entry->signal = signal; ), - TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "signal: %d", - WIPHY_PR_ARG, CHAN_PR_ARG, __entry->signal) + TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d", + WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, + __entry->signal) ); DECLARE_EVENT_CLASS(cfg80211_bss_evt, -- cgit v1.2.3-70-g09d2 From 3de805cf965d69c8d3d7d69368d5fd2c925a2d5a Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 8 Jul 2013 16:55:50 +0200 Subject: mac80211/rc80211: add chandef to rate initialization 5 and 10 MHz support needs to know the current operating channel width, add the chandef to the rate control API. Signed-off-by: Simon Wunderlich Signed-off-by: Mathias Kretschmer Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath9k/rc.c | 2 ++ drivers/net/wireless/iwlegacy/3945-rs.c | 1 + drivers/net/wireless/iwlegacy/4965-rs.c | 1 + drivers/net/wireless/iwlwifi/dvm/rs.c | 3 ++- drivers/net/wireless/iwlwifi/mvm/rs.c | 5 +++-- drivers/net/wireless/rtlwifi/rc.c | 1 + include/net/mac80211.h | 2 ++ net/mac80211/rate.h | 22 +++++++++++++++++----- net/mac80211/rc80211_minstrel.c | 3 ++- net/mac80211/rc80211_minstrel_ht.c | 10 +++++++--- net/mac80211/rc80211_pid_algo.c | 1 + 11 files changed, 39 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath9k/rc.c b/drivers/net/wireless/ath/ath9k/rc.c index 7eb1f4b458e..a3c4ca0c94b 100644 --- a/drivers/net/wireless/ath/ath9k/rc.c +++ b/drivers/net/wireless/ath/ath9k/rc.c @@ -1275,6 +1275,7 @@ static void ath_tx_status(void *priv, struct ieee80211_supported_band *sband, } static void ath_rate_init(void *priv, struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta) { struct ath_softc *sc = priv; @@ -1313,6 +1314,7 @@ static void ath_rate_init(void *priv, struct ieee80211_supported_band *sband, } static void ath_rate_update(void *priv, struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta, u32 changed) { diff --git a/drivers/net/wireless/iwlegacy/3945-rs.c b/drivers/net/wireless/iwlegacy/3945-rs.c index fe31590a51b..aea667b430c 100644 --- a/drivers/net/wireless/iwlegacy/3945-rs.c +++ b/drivers/net/wireless/iwlegacy/3945-rs.c @@ -887,6 +887,7 @@ il3945_remove_debugfs(void *il, void *il_sta) */ static void il3945_rs_rate_init_stub(void *il_r, struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *il_sta) { } diff --git a/drivers/net/wireless/iwlegacy/4965-rs.c b/drivers/net/wireless/iwlegacy/4965-rs.c index ed3c42a63a4..3ccbaf791b4 100644 --- a/drivers/net/wireless/iwlegacy/4965-rs.c +++ b/drivers/net/wireless/iwlegacy/4965-rs.c @@ -2803,6 +2803,7 @@ il4965_rs_remove_debugfs(void *il, void *il_sta) */ static void il4965_rs_rate_init_stub(void *il_r, struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *il_sta) { } diff --git a/drivers/net/wireless/iwlwifi/dvm/rs.c b/drivers/net/wireless/iwlwifi/dvm/rs.c index 1b693944123..91eb09b9b56 100644 --- a/drivers/net/wireless/iwlwifi/dvm/rs.c +++ b/drivers/net/wireless/iwlwifi/dvm/rs.c @@ -3319,7 +3319,8 @@ static void rs_remove_debugfs(void *priv, void *priv_sta) * station is added we ignore it. */ static void rs_rate_init_stub(void *priv_r, struct ieee80211_supported_band *sband, - struct ieee80211_sta *sta, void *priv_sta) + struct cfg80211_chan_def *chandef, + struct ieee80211_sta *sta, void *priv_sta) { } static struct rate_control_ops rs_ops = { diff --git a/drivers/net/wireless/iwlwifi/mvm/rs.c b/drivers/net/wireless/iwlwifi/mvm/rs.c index b328a988c13..376ea2112de 100644 --- a/drivers/net/wireless/iwlwifi/mvm/rs.c +++ b/drivers/net/wireless/iwlwifi/mvm/rs.c @@ -3159,8 +3159,9 @@ static void rs_remove_debugfs(void *mvm, void *mvm_sta) * station is added we ignore it. */ static void rs_rate_init_stub(void *mvm_r, - struct ieee80211_supported_band *sband, - struct ieee80211_sta *sta, void *mvm_sta) + struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, + struct ieee80211_sta *sta, void *mvm_sta) { } static struct rate_control_ops rs_mvm_ops = { diff --git a/drivers/net/wireless/rtlwifi/rc.c b/drivers/net/wireless/rtlwifi/rc.c index f9f059dadb7..a98acefb8c0 100644 --- a/drivers/net/wireless/rtlwifi/rc.c +++ b/drivers/net/wireless/rtlwifi/rc.c @@ -218,6 +218,7 @@ static void rtl_tx_status(void *ppriv, static void rtl_rate_init(void *ppriv, struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta) { } diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7c5dc787a8e..fe5fee830aa 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4204,8 +4204,10 @@ struct rate_control_ops { void *(*alloc_sta)(void *priv, struct ieee80211_sta *sta, gfp_t gfp); void (*rate_init)(void *priv, struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta); void (*rate_update)(void *priv, struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta, u32 changed); void (*free_sta)(void *priv, struct ieee80211_sta *sta, diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index d35a5dd3fb1..5dedc56c94d 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -66,11 +66,12 @@ static inline void rate_control_rate_init(struct sta_info *sta) } sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band]; - rcu_read_unlock(); ieee80211_sta_set_rx_nss(sta); - ref->ops->rate_init(ref->priv, sband, ista, priv_sta); + ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista, + priv_sta); + rcu_read_unlock(); set_sta_flag(sta, WLAN_STA_RATE_CONTROL); } @@ -81,10 +82,21 @@ static inline void rate_control_rate_update(struct ieee80211_local *local, struct rate_control_ref *ref = local->rate_ctrl; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; + struct ieee80211_chanctx_conf *chanctx_conf; + + if (ref && ref->ops->rate_update) { + rcu_read_lock(); - if (ref && ref->ops->rate_update) - ref->ops->rate_update(ref->priv, sband, ista, - priv_sta, changed); + chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf); + if (WARN_ON(!chanctx_conf)) { + rcu_read_unlock(); + return; + } + + ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def, + ista, priv_sta, changed); + rcu_read_unlock(); + } drv_sta_rc_update(local, sta->sdata, &sta->sta, changed); } diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index ac7ef5414bd..5b25966add1 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -417,7 +417,8 @@ init_sample_table(struct minstrel_sta_info *mi) static void minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, - struct ieee80211_sta *sta, void *priv_sta) + struct cfg80211_chan_def *chandef, + struct ieee80211_sta *sta, void *priv_sta) { struct minstrel_sta_info *mi = priv_sta; struct minstrel_priv *mp = priv; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 5b2d3012b98..52562973fbd 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -836,6 +836,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, static void minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta) { struct minstrel_priv *mp = priv; @@ -931,22 +932,25 @@ use_legacy: memset(&msp->legacy, 0, sizeof(msp->legacy)); msp->legacy.r = msp->ratelist; msp->legacy.sample_table = msp->sample_table; - return mac80211_minstrel.rate_init(priv, sband, sta, &msp->legacy); + return mac80211_minstrel.rate_init(priv, sband, chandef, sta, + &msp->legacy); } static void minstrel_ht_rate_init(void *priv, struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta) { - minstrel_ht_update_caps(priv, sband, sta, priv_sta); + minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta); } static void minstrel_ht_rate_update(void *priv, struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta, u32 changed) { - minstrel_ht_update_caps(priv, sband, sta, priv_sta); + minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta); } static void * diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c index 502d3ecc4a7..958fad07b54 100644 --- a/net/mac80211/rc80211_pid_algo.c +++ b/net/mac80211/rc80211_pid_algo.c @@ -293,6 +293,7 @@ rate_control_pid_get_rate(void *priv, struct ieee80211_sta *sta, static void rate_control_pid_rate_init(void *priv, struct ieee80211_supported_band *sband, + struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta) { struct rc_pid_sta_info *spinfo = priv_sta; -- cgit v1.2.3-70-g09d2 From a5e70697d0c4836e69c60de92db27eac9ae71e05 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 8 Jul 2013 16:55:52 +0200 Subject: mac80211: add radiotap flag and handling for 5/10 MHz Wireshark already defines radiotap channel flags for 5 and 10 MHz, so just use them in Linux radiotap too. Furthermore, add rx status flags to allow drivers to report when they received data on 5 or 10 MHz channels. Signed-off-by: Simon Wunderlich Signed-off-by: Mathias Kretschmer Signed-off-by: Johannes Berg --- include/net/ieee80211_radiotap.h | 4 ++++ include/net/mac80211.h | 4 ++++ net/mac80211/rx.c | 21 ++++++++++++--------- 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index c6d07cb074b..8b5b7143329 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -230,6 +230,10 @@ enum ieee80211_radiotap_type { #define IEEE80211_CHAN_PASSIVE 0x0200 /* Only passive scan allowed */ #define IEEE80211_CHAN_DYN 0x0400 /* Dynamic CCK-OFDM channel */ #define IEEE80211_CHAN_GFSK 0x0800 /* GFSK channel (FHSS PHY) */ +#define IEEE80211_CHAN_GSM 0x1000 /* GSM (900 MHz) */ +#define IEEE80211_CHAN_STURBO 0x2000 /* Static Turbo */ +#define IEEE80211_CHAN_HALF 0x4000 /* Half channel (10 MHz wide) */ +#define IEEE80211_CHAN_QUARTER 0x8000 /* Quarter channel (5 MHz wide) */ /* For IEEE80211_RADIOTAP_FLAGS */ #define IEEE80211_RADIOTAP_F_CFP 0x01 /* sent/received diff --git a/include/net/mac80211.h b/include/net/mac80211.h index fe5fee830aa..3124036285e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -811,6 +811,8 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC * is stored in the @ampdu_delimiter_crc field) * @RX_FLAG_STBC_MASK: STBC 2 bit bitmask. 1 - Nss=1, 2 - Nss=2, 3 - Nss=3 + * @RX_FLAG_10MHZ: 10 MHz (half channel) was used + * @RX_FLAG_5MHZ: 5 MHz (quarter channel) was used */ enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), @@ -839,6 +841,8 @@ enum mac80211_rx_flags { RX_FLAG_80P80MHZ = BIT(24), RX_FLAG_160MHZ = BIT(25), RX_FLAG_STBC_MASK = BIT(26) | BIT(27), + RX_FLAG_10MHZ = BIT(28), + RX_FLAG_5MHZ = BIT(29), }; #define RX_FLAG_STBC_SHIFT 26 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index c9f2ca43856..deeeca1299b 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -159,6 +159,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, __le32 *it_present; u32 it_present_val; u16 rx_flags = 0; + u16 channel_flags = 0; int mpdulen, chain; unsigned long chains = status->chains; @@ -243,20 +244,22 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, /* IEEE80211_RADIOTAP_CHANNEL */ put_unaligned_le16(status->freq, pos); pos += 2; + if (status->flag & RX_FLAG_10MHZ) + channel_flags |= IEEE80211_CHAN_HALF; + else if (status->flag & RX_FLAG_5MHZ) + channel_flags |= IEEE80211_CHAN_QUARTER; + if (status->band == IEEE80211_BAND_5GHZ) - put_unaligned_le16(IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ, - pos); + channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ; else if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) - put_unaligned_le16(IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ, - pos); + channel_flags |= IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ; else if (rate && rate->flags & IEEE80211_RATE_ERP_G) - put_unaligned_le16(IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ, - pos); + channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ; else if (rate) - put_unaligned_le16(IEEE80211_CHAN_CCK | IEEE80211_CHAN_2GHZ, - pos); + channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ; else - put_unaligned_le16(IEEE80211_CHAN_2GHZ, pos); + channel_flags |= IEEE80211_CHAN_2GHZ; + put_unaligned_le16(channel_flags, pos); pos += 2; /* IEEE80211_RADIOTAP_DBM_ANTSIGNAL */ -- cgit v1.2.3-70-g09d2 From 74608aca4d82e2855b1a6a2cd60331d1a98f2d6a Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 8 Jul 2013 16:55:54 +0200 Subject: cfg80211/mac80211: get mandatory rates based on scan width Mandatory rates for 5 and 10 MHz are different from the rates used for 20 MHz in 2.4 GHz mode, as they use OFDM only. Signed-off-by: Simon Wunderlich Signed-off-by: Mathias Kretschmer Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- net/mac80211/ibss.c | 19 +++++++++++++++---- net/wireless/mesh.c | 5 ++++- net/wireless/util.c | 14 ++++++++++---- 4 files changed, 32 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bae8614a450..88f15631701 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3128,11 +3128,13 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband, /** * ieee80211_mandatory_rates - get mandatory rates for a given band * @sband: the band to look for rates in + * @scan_width: width of the control channel * * This function returns a bitmap of the mandatory rates for the given * band, bits are set according to the rate position in the bitrates array. */ -u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband); +u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, + enum nl80211_bss_scan_width scan_width); /* * Radiotap parsing functions -- for controlled injection support diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 272a3b37615..299603e4939 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -371,6 +371,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; + enum nl80211_bss_scan_width scan_width; int band; /* @@ -399,6 +400,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, if (WARN_ON_ONCE(!chanctx_conf)) return NULL; band = chanctx_conf->def.chan->band; + scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def); rcu_read_unlock(); sta = sta_info_alloc(sdata, addr, GFP_KERNEL); @@ -412,7 +414,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; sta->sta.supp_rates[band] = supp_rates | - ieee80211_mandatory_rates(sband); + ieee80211_mandatory_rates(sband, scan_width); return ieee80211_ibss_finish_sta(sta); } @@ -476,6 +478,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, u64 beacon_timestamp, rx_timestamp; u32 supp_rates = 0; enum ieee80211_band band = rx_status->band; + enum nl80211_bss_scan_width scan_width; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; bool rates_updated = false; @@ -504,9 +507,15 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, prev_rates = sta->sta.supp_rates[band]; /* make sure mandatory rates are always added */ - sta->sta.supp_rates[band] = supp_rates | - ieee80211_mandatory_rates(sband); + scan_width = NL80211_BSS_CHAN_WIDTH_20; + if (rx_status->flag & RX_FLAG_5MHZ) + scan_width = NL80211_BSS_CHAN_WIDTH_5; + if (rx_status->flag & RX_FLAG_10MHZ) + scan_width = NL80211_BSS_CHAN_WIDTH_10; + sta->sta.supp_rates[band] = supp_rates | + ieee80211_mandatory_rates(sband, + scan_width); if (sta->sta.supp_rates[band] != prev_rates) { ibss_dbg(sdata, "updated supp_rates set for %pM based on beacon/probe_resp (0x%x -> 0x%x)\n", @@ -640,6 +649,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; + enum nl80211_bss_scan_width scan_width; int band; /* @@ -665,6 +675,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, return; } band = chanctx_conf->def.chan->band; + scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def); rcu_read_unlock(); sta = sta_info_alloc(sdata, addr, GFP_ATOMIC); @@ -676,7 +687,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; sta->sta.supp_rates[band] = supp_rates | - ieee80211_mandatory_rates(sband); + ieee80211_mandatory_rates(sband, scan_width); spin_lock(&ifibss->incomplete_lock); list_add(&sta->list, &ifibss->incomplete_stations); diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index 30c49202ee4..0553fd4d85a 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -167,9 +167,12 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, * basic rates */ if (!setup->basic_rates) { + enum nl80211_bss_scan_width scan_width; struct ieee80211_supported_band *sband = rdev->wiphy.bands[setup->chandef.chan->band]; - setup->basic_rates = ieee80211_mandatory_rates(sband); + scan_width = cfg80211_chandef_to_scan_width(&setup->chandef); + setup->basic_rates = ieee80211_mandatory_rates(sband, + scan_width); } if (!cfg80211_reg_can_beacon(&rdev->wiphy, &setup->chandef)) diff --git a/net/wireless/util.c b/net/wireless/util.c index 74458b7f61e..ce090c1c5e4 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -33,7 +33,8 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband, } EXPORT_SYMBOL(ieee80211_get_response_rate); -u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband) +u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, + enum nl80211_bss_scan_width scan_width) { struct ieee80211_rate *bitrates; u32 mandatory_rates = 0; @@ -43,10 +44,15 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband) if (WARN_ON(!sband)) return 1; - if (sband->band == IEEE80211_BAND_2GHZ) - mandatory_flag = IEEE80211_RATE_MANDATORY_B; - else + if (sband->band == IEEE80211_BAND_2GHZ) { + if (scan_width == NL80211_BSS_CHAN_WIDTH_5 || + scan_width == NL80211_BSS_CHAN_WIDTH_10) + mandatory_flag = IEEE80211_RATE_MANDATORY_G; + else + mandatory_flag = IEEE80211_RATE_MANDATORY_B; + } else { mandatory_flag = IEEE80211_RATE_MANDATORY_A; + } bitrates = sband->bitrates; for (i = 0; i < sband->n_bitrates; i++) -- cgit v1.2.3-70-g09d2 From 0430c883470d0c9a23661ea9f02c56b1d91cf93c Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 8 Jul 2013 16:55:55 +0200 Subject: cfg80211/mac80211: use reduced txpower for 5 and 10 MHz Some regulations (like germany, but also FCC) express their transmission power limit in dBm/MHz or mW/MHz. To cope with that and be on the safe side, reduce the maximum power to half (10 MHz) or quarter (5 MHz) when operating on these reduced bandwidth channels. Signed-off-by: Simon Wunderlich Signed-off-by: Mathias Kretschmer Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 27 +++++++++++++++++++++++++++ net/mac80211/iface.c | 2 +- net/mac80211/main.c | 2 +- net/mac80211/mlme.c | 3 ++- 4 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 88f15631701..aeaf6dff6e0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -460,6 +460,33 @@ ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef) return 0; } +/** + * ieee80211_chandef_max_power - maximum transmission power for the chandef + * + * In some regulations, the transmit power may depend on the configured channel + * bandwidth which may be defined as dBm/MHz. This function returns the actual + * max_power for non-standard (20 MHz) channels. + * + * @chandef: channel definition for the channel + * + * Returns: maximum allowed transmission power in dBm for the chandef + */ +static inline int +ieee80211_chandef_max_power(struct cfg80211_chan_def *chandef) +{ + switch (chandef->width) { + case NL80211_CHAN_WIDTH_5: + return min(chandef->chan->max_reg_power - 6, + chandef->chan->max_power); + case NL80211_CHAN_WIDTH_10: + return min(chandef->chan->max_reg_power - 3, + chandef->chan->max_power); + default: + break; + } + return chandef->chan->max_power; +} + /** * enum survey_info_flags - survey information flags * diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index cc117591f67..4c41c11958c 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -54,7 +54,7 @@ bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) return false; } - power = chanctx_conf->def.chan->max_power; + power = ieee80211_chandef_max_power(&chanctx_conf->def); rcu_read_unlock(); if (sdata->user_power_level != IEEE80211_UNSET_POWER_LEVEL) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 091088ac789..6da98c77496 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -151,7 +151,7 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) changed |= IEEE80211_CONF_CHANGE_SMPS; } - power = chandef.chan->max_power; + power = ieee80211_chandef_max_power(&chandef); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f7552c2b74e..211246b4681 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -747,7 +747,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) *pos++ = WLAN_EID_PWR_CAPABILITY; *pos++ = 2; *pos++ = 0; /* min tx power */ - *pos++ = chan->max_power; /* max tx power */ + /* max tx power */ + *pos++ = ieee80211_chandef_max_power(&chanctx_conf->def); /* 2. supported channels */ /* TODO: get this in reg domain format */ -- cgit v1.2.3-70-g09d2 From bb4997a1afbff61084b243d62aaaf23ea38a290e Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Mon, 15 Jul 2013 13:15:04 +0200 Subject: bcma: add some more core names These cores were found on a BCM4708 (chipid 53010), this is a ARM SoC with two Cortex A9 cores. bcma: bus0: Found chip with id 0xCF12, rev 0x00 and package 0x02 bcma: bus0: Core 0 found: ChipCommon (manuf 0x4BF, id 0x800, rev 0x2A, class 0x0) bcma: bus0: Core 1 found: DMA (manuf 0x4BF, id 0x502, rev 0x01, class 0x0) bcma: bus0: Core 2 found: GBit MAC (manuf 0x4BF, id 0x82D, rev 0x04, class 0x0) bcma: bus0: Core 3 found: GBit MAC (manuf 0x4BF, id 0x82D, rev 0x04, class 0x0) bcma: bus0: Core 4 found: GBit MAC (manuf 0x4BF, id 0x82D, rev 0x04, class 0x0) bcma: bus0: Core 5 found: GBit MAC (manuf 0x4BF, id 0x82D, rev 0x04, class 0x0) bcma: bus0: Core 6 found: PCIe Gen 2 (manuf 0x4BF, id 0x501, rev 0x01, class 0x0) bcma: bus0: Core 7 found: PCIe Gen 2 (manuf 0x4BF, id 0x501, rev 0x01, class 0x0) bcma: bus0: Core 8 found: ARM Cortex A9 core (ihost) (manuf 0x4BF, id 0x510, rev 0x01, class 0x0) bcma: bus0: Core 9 found: USB 2.0 (manuf 0x4BF, id 0x504, rev 0x01, class 0x0) bcma: bus0: Core 10 found: USB 3.0 (manuf 0x4BF, id 0x505, rev 0x01, class 0x0) bcma: bus0: Core 11 found: SDIO3 (manuf 0x4BF, id 0x503, rev 0x01, class 0x0) bcma: bus0: Core 12 found: ARM Cortex A9 JTAG (manuf 0x4BF, id 0x506, rev 0x01, class 0x0) bcma: bus0: Core 13 found: Denali DDR2/DDR3 memory controller (manuf 0x4BF, id 0x507, rev 0x01, class 0x0) bcma: bus0: Core 14 found: ROM (manuf 0x4BF, id 0x508, rev 0x01, class 0x0) bcma: bus0: Core 15 found: NAND flash controller (manuf 0x4BF, id 0x509, rev 0x01, class 0x0) bcma: bus0: Core 16 found: SPI flash controller (manuf 0x4BF, id 0x50A, rev 0x01, class 0x0) Signed-off-by: Hauke Mehrtens Signed-off-by: John W. Linville --- drivers/bcma/scan.c | 12 ++++++++++++ include/linux/bcma/bcma.h | 12 ++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/drivers/bcma/scan.c b/drivers/bcma/scan.c index 8bffa5c9818..f2f1415376d 100644 --- a/drivers/bcma/scan.c +++ b/drivers/bcma/scan.c @@ -32,6 +32,18 @@ static const struct bcma_device_id_name bcma_bcm_device_names[] = { { BCMA_CORE_4706_CHIPCOMMON, "BCM4706 ChipCommon" }, { BCMA_CORE_4706_SOC_RAM, "BCM4706 SOC RAM" }, { BCMA_CORE_4706_MAC_GBIT, "BCM4706 GBit MAC" }, + { BCMA_CORE_PCIEG2, "PCIe Gen 2" }, + { BCMA_CORE_DMA, "DMA" }, + { BCMA_CORE_SDIO3, "SDIO3" }, + { BCMA_CORE_USB20, "USB 2.0" }, + { BCMA_CORE_USB30, "USB 3.0" }, + { BCMA_CORE_A9JTAG, "ARM Cortex A9 JTAG" }, + { BCMA_CORE_DDR23, "Denali DDR2/DDR3 memory controller" }, + { BCMA_CORE_ROM, "ROM" }, + { BCMA_CORE_NAND, "NAND flash controller" }, + { BCMA_CORE_QSPI, "SPI flash controller" }, + { BCMA_CORE_CHIPCOMMON_B, "Chipcommon B" }, + { BCMA_CORE_ARMCA9, "ARM Cortex A9 core (ihost)" }, { BCMA_CORE_AMEMC, "AMEMC (DDR)" }, { BCMA_CORE_ALTA, "ALTA (I2S)" }, { BCMA_CORE_INVALID, "Invalid" }, diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 622fc505d3e..8fee0280720 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -72,7 +72,19 @@ struct bcma_host_ops { /* Core-ID values. */ #define BCMA_CORE_OOB_ROUTER 0x367 /* Out of band */ #define BCMA_CORE_4706_CHIPCOMMON 0x500 +#define BCMA_CORE_PCIEG2 0x501 +#define BCMA_CORE_DMA 0x502 +#define BCMA_CORE_SDIO3 0x503 +#define BCMA_CORE_USB20 0x504 +#define BCMA_CORE_USB30 0x505 +#define BCMA_CORE_A9JTAG 0x506 +#define BCMA_CORE_DDR23 0x507 +#define BCMA_CORE_ROM 0x508 +#define BCMA_CORE_NAND 0x509 +#define BCMA_CORE_QSPI 0x50A +#define BCMA_CORE_CHIPCOMMON_B 0x50B #define BCMA_CORE_4706_SOC_RAM 0x50E +#define BCMA_CORE_ARMCA9 0x510 #define BCMA_CORE_4706_MAC_GBIT 0x52D #define BCMA_CORE_AMEMC 0x52E /* DDR1/2 memory controller core */ #define BCMA_CORE_ALTA 0x534 /* I2S core */ -- cgit v1.2.3-70-g09d2 From 16041990d1c75efb4408d19413cf4fd27aa148dd Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Mon, 15 Jul 2013 13:15:06 +0200 Subject: bcma: add constants for new ARM based SoCs These are the chipIDs of some ARM based SoCs from the BCM47xx line. Signed-off-by: Hauke Mehrtens Signed-off-by: John W. Linville --- include/linux/bcma/bcma.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 8fee0280720..4d043c30216 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -189,6 +189,11 @@ struct bcma_host_ops { #define BCMA_PKG_ID_BCM5357 11 #define BCMA_CHIP_ID_BCM53572 53572 #define BCMA_PKG_ID_BCM47188 9 +#define BCMA_CHIP_ID_BCM4707 53010 +#define BCMA_PKG_ID_BCM4707 1 +#define BCMA_PKG_ID_BCM4708 2 +#define BCMA_PKG_ID_BCM4709 0 +#define BCMA_CHIP_ID_BCM53018 53018 /* Board types (on PCI usually equals to the subsystem dev id) */ /* BCM4313 */ -- cgit v1.2.3-70-g09d2 From cb820f8e4b7f73d1a32175e6591735b25bb5398d Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 19 Jul 2013 19:40:09 +0200 Subject: net: Provide a generic socket error queue delivery method for Tx time stamps. This patch moves the private error queue delivery function from the af_packet code to the core socket method. In this way, network layers only needing the error queue for transmit time stamping can share common code. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/core/sock.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ net/packet/af_packet.c | 48 ++---------------------------------------------- 3 files changed, 51 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 95a5a2c6925..e0473f61693 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2249,6 +2249,8 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb) extern void sock_enable_timestamp(struct sock *sk, int flag); extern int sock_get_timestamp(struct sock *, struct timeval __user *); extern int sock_get_timestampns(struct sock *, struct timespec __user *); +extern int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, + int level, int type); /* * Enable debug/info messages diff --git a/net/core/sock.c b/net/core/sock.c index 548d716c5f6..85e8de1bc7f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -93,6 +93,7 @@ #include #include +#include #include #include #include @@ -2425,6 +2426,52 @@ void sock_enable_timestamp(struct sock *sk, int flag) } } +int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, + int level, int type) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + int copied, err; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else + spin_unlock_bh(&sk->sk_error_queue.lock); + +out_free_skb: + kfree_skb(skb); +out: + return err; +} +EXPORT_SYMBOL(sock_recv_errqueue); + /* * Get a socket option on an socket. * diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 4b66c752eae..4cb28a7f639 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2638,51 +2638,6 @@ out: return err; } -static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len) -{ - struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; - int copied, err; - - err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); - if (skb == NULL) - goto out; - - copied = skb->len; - if (copied > len) { - msg->msg_flags |= MSG_TRUNC; - copied = len; - } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); - if (err) - goto out_free_skb; - - sock_recv_timestamp(msg, sk, skb); - - serr = SKB_EXT_ERR(skb); - put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP, - sizeof(serr->ee), &serr->ee); - - msg->msg_flags |= MSG_ERRQUEUE; - err = copied; - - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { - sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else - spin_unlock_bh(&sk->sk_error_queue.lock); - -out_free_skb: - kfree_skb(skb); -out: - return err; -} - /* * Pull a packet from our receive queue and hand it to the user. * If necessary we block. @@ -2708,7 +2663,8 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, #endif if (flags & MSG_ERRQUEUE) { - err = packet_recv_error(sk, msg, len); + err = sock_recv_errqueue(sk, msg, len, + SOL_PACKET, PACKET_TX_TIMESTAMP); goto out; } -- cgit v1.2.3-70-g09d2 From eda297729171fe16bf34fe5b0419dfb69060f623 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 19 Jul 2013 19:40:10 +0200 Subject: tun: Support software transmit time stamping. This patch adds transmit time stamping to the tun/tap driver. Similar support already exists for UDP, can, and raw packets. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/tun.c | 14 ++++++++++++-- include/uapi/linux/if_tun.h | 3 +++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index db690a37226..a72d141047c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -739,6 +739,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) >= dev->tx_queue_len / tun->numqueues) goto drop; + if (skb->sk) { + sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags); + sw_tx_timestamp(skb); + } + /* Orphan the skb - required as we might hang on to it * for indefinite time. */ if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) @@ -1476,7 +1481,6 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, return ret; } - static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len, int flags) @@ -1488,10 +1492,15 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, if (!tun) return -EBADFD; - if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) { + if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { ret = -EINVAL; goto out; } + if (flags & MSG_ERRQUEUE) { + ret = sock_recv_errqueue(sock->sk, m, total_len, + SOL_PACKET, TUN_TX_TIMESTAMP); + goto out; + } ret = tun_do_read(tun, tfile, iocb, m->msg_iov, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { @@ -2274,6 +2283,7 @@ static const struct ethtool_ops tun_ethtool_ops = { .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, .get_link = ethtool_op_get_link, + .get_ts_info = ethtool_op_get_ts_info, }; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 82334f88967..1870ee29bb3 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -71,6 +71,9 @@ /* read-only flag */ #define IFF_PERSIST 0x0800 +/* Socket options */ +#define TUN_TX_TIMESTAMP 1 + /* Features for GSO (TUNSETOFFLOAD). */ #define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ #define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */ -- cgit v1.2.3-70-g09d2 From 375fe02c91792917aa26d68a87ab110d1937f44e Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 22 Jul 2013 16:20:45 -0700 Subject: tcp: consolidate SYNACK RTT sampling The first patch consolidates SYNACK and other RTT measurement to use a central function tcp_ack_update_rtt(). A (small) bonus is now SYNACK RTT measurement happens after PAWS check, potentially reducing the impact of RTO seeding on bad TCP timestamps values. Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 9 --------- net/ipv4/tcp_input.c | 14 ++++++++++++-- net/ipv4/tcp_ipv4.c | 2 -- net/ipv4/tcp_minisocks.c | 8 ++------ net/ipv6/tcp_ipv6.c | 2 -- 5 files changed, 14 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index d1980054ec7..f9777dbede7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1094,15 +1094,6 @@ static inline void tcp_openreq_init(struct request_sock *req, ireq->loc_port = tcp_hdr(skb)->dest; } -/* Compute time elapsed between SYNACK and the ACK completing 3WHS */ -static inline void tcp_synack_rtt_meas(struct sock *sk, - struct request_sock *req) -{ - if (tcp_rsk(req)->snt_synack) - tcp_valid_rtt_meas(sk, - tcp_time_stamp - tcp_rsk(req)->snt_synack); -} - extern void tcp_enter_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 28af45abe06..b531710596e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2853,6 +2853,17 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, flag); } +/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ +static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) +{ + struct tcp_sock *tp = tcp_sk(sk); + s32 seq_rtt = -1; + + if (tp->lsndtime && !tp->total_retrans) + seq_rtt = tcp_time_stamp - tp->lsndtime; + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt); +} + static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -5624,9 +5635,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * so release it. */ if (req) { - tcp_synack_rtt_meas(sk, req); tp->total_retrans = req->num_retrans; - reqsk_fastopen_remove(sk, req, false); } else { /* Make sure socket is routed, for correct metrics. */ @@ -5651,6 +5660,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + tcp_synack_rtt_meas(sk, req); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b299da5ff49..2e3f129df0e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1671,8 +1671,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; tcp_initialize_rcv_mss(newsk); - tcp_synack_rtt_meas(newsk, req); - newtp->total_retrans = req->num_retrans; #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ab1c0865852..58a3e69aef6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -411,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; + newtp->lsndtime = treq->snt_synack; + newtp->total_retrans = req->num_retrans; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -666,12 +668,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; - /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */ - if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) - tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; - else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */ - tcp_rsk(req)->snt_synack = 0; - /* For Fast Open no more processing is needed (sk is the * child socket). */ diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6e1649d5853..80fe69ef218 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1237,8 +1237,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; tcp_initialize_rcv_mss(newsk); - tcp_synack_rtt_meas(newsk, req); - newtp->total_retrans = req->num_retrans; newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; newinet->inet_rcv_saddr = LOOPBACK4_IPV6; -- cgit v1.2.3-70-g09d2 From 5b08e47caf1f2034a3a5b566bbccc8b0be3961ca Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 22 Jul 2013 16:20:46 -0700 Subject: tcp: prefer packet timing to TS-ECR for RTT Prefer packet timings to TS-ecr for RTT measurements when both sources are available. That's because broken middle-boxes and remote peer can return packets with corrupted TS ECR fields. Similarly most congestion controls that require RTT signals favor timing-based sources as well. Also check for bad TS ECR values to avoid RTT blow-ups. It has happened on production Web servers. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_input.c | 67 ++++++++++++++-------------------------------------- 2 files changed, 18 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index f9777dbede7..c5868471aba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -591,7 +591,6 @@ extern void tcp_initialize_rcv_mss(struct sock *sk); extern int tcp_mtu_to_mss(struct sock *sk, int pmtu); extern int tcp_mss_to_mtu(struct sock *sk, int mss); extern void tcp_mtup_init(struct sock *sk); -extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt); extern void tcp_init_buffer_space(struct sock *sk); static inline void tcp_bound_rto(const struct sock *sk) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b531710596e..c7398f05d12 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2792,65 +2792,36 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_xmit_retransmit_queue(sk); } -void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) +static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, + s32 seq_rtt) { - tcp_rtt_estimator(sk, seq_rtt); - tcp_set_rto(sk); - inet_csk(sk)->icsk_backoff = 0; -} -EXPORT_SYMBOL(tcp_valid_rtt_meas); + const struct tcp_sock *tp = tcp_sk(sk); + + /* Prefer RTT measured from ACK's timing to TS-ECR. This is because + * broken middle-boxes or peers may corrupt TS-ECR fields. But + * Karn's algorithm forbids taking RTT if some retransmitted data + * is acked (RFC6298). + */ + if (flag & FLAG_RETRANS_DATA_ACKED) + seq_rtt = -1; -/* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Supersedes RFC1323) - */ -static void tcp_ack_saw_tstamp(struct sock *sk, int flag) -{ /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the * left edge of the send window. - * * See draft-ietf-tcplw-high-performance-00, section 3.3. - * 1998/04/10 Andrey V. Savochkin - * - * Changed: reset backoff as soon as we see the first valid sample. - * If we do not, we get strongly overestimated rto. With timestamps - * samples are accepted even from very old segments: f.e., when rtt=1 - * increases to 8, we retransmit 5 times and after 8 seconds delayed - * answer arrives rto becomes 120 seconds! If at least one of segments - * in window is lost... Voila. --ANK (010210) */ - struct tcp_sock *tp = tcp_sk(sk); + if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr); -} - -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) -{ - /* We don't have a timestamp. Can only use - * packets that are not retransmitted to determine - * rtt estimates. Also, we must not reset the - * backoff for rto until we get a non-retransmitted - * packet. This allows us to deal with a situation - * where the network delay has increased suddenly. - * I.e. Karn's algorithm. (SIGCOMM '87, p5.) - */ - - if (flag & FLAG_RETRANS_DATA_ACKED) + if (seq_rtt < 0) return; - tcp_valid_rtt_meas(sk, seq_rtt); -} + tcp_rtt_estimator(sk, seq_rtt); + tcp_set_rto(sk); -static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, - const s32 seq_rtt) -{ - const struct tcp_sock *tp = tcp_sk(sk); - /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ - if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(sk, flag); - else if (seq_rtt >= 0) - tcp_ack_no_tstamp(sk, seq_rtt, flag); + /* RFC6298: only reset backoff on valid RTT measurement. */ + inet_csk(sk)->icsk_backoff = 0; } /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ @@ -2989,8 +2960,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; - ca_seq_rtt = -1; - seq_rtt = -1; } else { ca_seq_rtt = now - scb->when; last_ackt = skb->tstamp; -- cgit v1.2.3-70-g09d2 From fc423ff00df3a19554414eed80aef9de9b50313e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 20 Jul 2013 12:13:52 +0200 Subject: team: add peer notification When port is enabled or disabled, allow to notify peers by unsolicitated NAs or gratuitous ARPs. Disabled by default. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/team/team.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/if_team.h | 8 ++++- 2 files changed, 94 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index bff7e0b0b4e..0433ee994f8 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -622,6 +622,46 @@ static int team_change_mode(struct team *team, const char *kind) } +/********************* + * Peers notification + *********************/ + +static void team_notify_peers_work(struct work_struct *work) +{ + struct team *team; + + team = container_of(work, struct team, notify_peers.dw.work); + + if (!rtnl_trylock()) { + schedule_delayed_work(&team->notify_peers.dw, 0); + return; + } + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, team->dev); + rtnl_unlock(); + if (!atomic_dec_and_test(&team->notify_peers.count_pending)) + schedule_delayed_work(&team->notify_peers.dw, + msecs_to_jiffies(team->notify_peers.interval)); +} + +static void team_notify_peers(struct team *team) +{ + if (!team->notify_peers.count || !netif_running(team->dev)) + return; + atomic_set(&team->notify_peers.count_pending, team->notify_peers.count); + schedule_delayed_work(&team->notify_peers.dw, 0); +} + +static void team_notify_peers_init(struct team *team) +{ + INIT_DELAYED_WORK(&team->notify_peers.dw, team_notify_peers_work); +} + +static void team_notify_peers_fini(struct team *team) +{ + cancel_delayed_work_sync(&team->notify_peers.dw); +} + + /************************ * Rx path frame handler ************************/ @@ -846,6 +886,7 @@ static void team_port_enable(struct team *team, team_queue_override_port_add(team, port); if (team->ops.port_enabled) team->ops.port_enabled(team, port); + team_notify_peers(team); } static void __reconstruct_port_hlist(struct team *team, int rm_index) @@ -875,6 +916,7 @@ static void team_port_disable(struct team *team, team->en_port_count--; team_queue_override_port_del(team, port); team_adjust_ops(team); + team_notify_peers(team); } #define TEAM_VLAN_FEATURES (NETIF_F_ALL_CSUM | NETIF_F_SG | \ @@ -1205,6 +1247,34 @@ static int team_mode_option_set(struct team *team, struct team_gsetter_ctx *ctx) return team_change_mode(team, ctx->data.str_val); } +static int team_notify_peers_count_get(struct team *team, + struct team_gsetter_ctx *ctx) +{ + ctx->data.u32_val = team->notify_peers.count; + return 0; +} + +static int team_notify_peers_count_set(struct team *team, + struct team_gsetter_ctx *ctx) +{ + team->notify_peers.count = ctx->data.u32_val; + return 0; +} + +static int team_notify_peers_interval_get(struct team *team, + struct team_gsetter_ctx *ctx) +{ + ctx->data.u32_val = team->notify_peers.interval; + return 0; +} + +static int team_notify_peers_interval_set(struct team *team, + struct team_gsetter_ctx *ctx) +{ + team->notify_peers.interval = ctx->data.u32_val; + return 0; +} + static int team_port_en_option_get(struct team *team, struct team_gsetter_ctx *ctx) { @@ -1316,6 +1386,18 @@ static const struct team_option team_options[] = { .getter = team_mode_option_get, .setter = team_mode_option_set, }, + { + .name = "notify_peers_count", + .type = TEAM_OPTION_TYPE_U32, + .getter = team_notify_peers_count_get, + .setter = team_notify_peers_count_set, + }, + { + .name = "notify_peers_interval", + .type = TEAM_OPTION_TYPE_U32, + .getter = team_notify_peers_interval_get, + .setter = team_notify_peers_interval_set, + }, { .name = "enabled", .type = TEAM_OPTION_TYPE_BOOL, @@ -1396,6 +1478,9 @@ static int team_init(struct net_device *dev) INIT_LIST_HEAD(&team->option_list); INIT_LIST_HEAD(&team->option_inst_list); + + team_notify_peers_init(team); + err = team_options_register(team, team_options, ARRAY_SIZE(team_options)); if (err) goto err_options_register; @@ -1406,6 +1491,7 @@ static int team_init(struct net_device *dev) return 0; err_options_register: + team_notify_peers_fini(team); team_queue_override_fini(team); err_team_queue_override_init: free_percpu(team->pcpu_stats); @@ -1425,6 +1511,7 @@ static void team_uninit(struct net_device *dev) __team_change_mode(team, NULL); /* cleanup */ __team_options_unregister(team, team_options, ARRAY_SIZE(team_options)); + team_notify_peers_fini(team); team_queue_override_fini(team); mutex_unlock(&team->lock); } diff --git a/include/linux/if_team.h b/include/linux/if_team.h index f6156f91eb1..b0b83683461 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -10,9 +10,9 @@ #ifndef _LINUX_IF_TEAM_H_ #define _LINUX_IF_TEAM_H_ - #include #include +#include #include struct team_pcpu_stats { @@ -194,6 +194,12 @@ struct team { bool user_carrier_enabled; bool queue_override_enabled; struct list_head *qom_lists; /* array of queue override mapping lists */ + struct { + unsigned int count; + unsigned int interval; /* in ms */ + atomic_t count_pending; + struct delayed_work dw; + } notify_peers; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; -- cgit v1.2.3-70-g09d2 From 4aa5dee4d9997879adff858514844efab5a15a01 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 20 Jul 2013 12:13:53 +0200 Subject: net: convert resend IGMP to notifier event Until now, bond_resend_igmp_join_requests() looks for vlans attached to bonding device, bridge where bonding act as port manually. It does not care of other scenarios, like stacked bonds or team device above. Make this more generic and use netdev notifier to propagate the event to upper devices and to actually call ip_mc_rejoin_groups(). Signed-off-by: Jiri Pirko Acked-by: Veaceslav Falico Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 44 ++++++++------------------------------- drivers/net/team/team.c | 4 ++++ include/linux/igmp.h | 1 - include/linux/netdevice.h | 1 + net/8021q/vlan.c | 1 + net/bridge/br_notify.c | 5 +++++ net/ipv4/igmp.c | 46 +++++++++++++++++++++++++++++++++++------ 7 files changed, 60 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 07f257d44a1..ae9864c9fa3 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -715,15 +715,6 @@ static int bond_set_allmulti(struct bonding *bond, int inc) return err; } -static void __bond_resend_igmp_join_requests(struct net_device *dev) -{ - struct in_device *in_dev; - - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - ip_mc_rejoin_groups(in_dev); -} - /* * Retrieve the list of registered multicast addresses for the bonding * device and retransmit an IGMP JOIN request to the current active @@ -731,33 +722,12 @@ static void __bond_resend_igmp_join_requests(struct net_device *dev) */ static void bond_resend_igmp_join_requests(struct bonding *bond) { - struct net_device *bond_dev, *vlan_dev, *upper_dev; - struct vlan_entry *vlan; - - read_lock(&bond->lock); - rcu_read_lock(); - - bond_dev = bond->dev; - - /* rejoin all groups on bond device */ - __bond_resend_igmp_join_requests(bond_dev); - - /* - * if bond is enslaved to a bridge, - * then rejoin all groups on its master - */ - upper_dev = netdev_master_upper_dev_get_rcu(bond_dev); - if (upper_dev && upper_dev->priv_flags & IFF_EBRIDGE) - __bond_resend_igmp_join_requests(upper_dev); - - /* rejoin all groups on vlan devices */ - list_for_each_entry(vlan, &bond->vlan_list, vlan_list) { - vlan_dev = __vlan_find_dev_deep(bond_dev, htons(ETH_P_8021Q), - vlan->vlan_id); - if (vlan_dev) - __bond_resend_igmp_join_requests(vlan_dev); + if (!rtnl_trylock()) { + queue_delayed_work(bond->wq, &bond->mcast_work, 0); + return; } - rcu_read_unlock(); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev); + rtnl_unlock(); /* We use curr_slave_lock to protect against concurrent access to * igmp_retrans from multiple running instances of this function and @@ -3234,6 +3204,10 @@ static int bond_slave_netdev_event(unsigned long event, case NETDEV_FEAT_CHANGE: bond_compute_features(bond); break; + case NETDEV_RESEND_IGMP: + /* Propagate to master device */ + call_netdevice_notifiers(event, slave->bond->dev); + break; default: break; } diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 0433ee994f8..2587dc86d53 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2785,6 +2785,10 @@ static int team_device_event(struct notifier_block *unused, case NETDEV_PRE_TYPE_CHANGE: /* Forbid to change type of underlaying device */ return NOTIFY_BAD; + case NETDEV_RESEND_IGMP: + /* Propagate to master device */ + call_netdevice_notifiers(event, port->team->dev); + break; } return NOTIFY_DONE; } diff --git a/include/linux/igmp.h b/include/linux/igmp.h index e3362b5f13e..f47550d75f8 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -129,6 +129,5 @@ extern void ip_mc_unmap(struct in_device *); extern void ip_mc_remap(struct in_device *); extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr); extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr); -extern void ip_mc_rejoin_groups(struct in_device *in_dev); #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0741a1e919a..2bb2357d83b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1633,6 +1633,7 @@ struct packet_offload { #define NETDEV_NOTIFY_PEERS 0x0013 #define NETDEV_JOIN 0x0014 #define NETDEV_CHANGEUPPER 0x0015 +#define NETDEV_RESEND_IGMP 0x0016 extern int register_netdevice_notifier(struct notifier_block *nb); extern int unregister_netdevice_notifier(struct notifier_block *nb); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 2fb2d88e8c2..03a92e11765 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -459,6 +459,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: + case NETDEV_RESEND_IGMP: /* Propagate to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index 3a3f371b284..2998dd1769a 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -102,6 +102,11 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ return NOTIFY_BAD; + + case NETDEV_RESEND_IGMP: + /* Propagate to master device */ + call_netdevice_notifiers(event, br->dev); + break; } /* Events that may cause spanning tree to refresh */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index cd71190d296..375aca37225 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1323,16 +1323,17 @@ out: EXPORT_SYMBOL(ip_mc_inc_group); /* - * Resend IGMP JOIN report; used for bonding. - * Called with rcu_read_lock() + * Resend IGMP JOIN report; used by netdev notifier. */ -void ip_mc_rejoin_groups(struct in_device *in_dev) +static void ip_mc_rejoin_groups(struct in_device *in_dev) { #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; - for_each_pmc_rcu(in_dev, im) { + ASSERT_RTNL(); + + for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; @@ -1349,7 +1350,6 @@ void ip_mc_rejoin_groups(struct in_device *in_dev) } #endif } -EXPORT_SYMBOL(ip_mc_rejoin_groups); /* * A socket has left a multicast group on device dev @@ -2735,8 +2735,42 @@ static struct pernet_operations igmp_net_ops = { .exit = igmp_net_exit, }; +static int igmp_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct in_device *in_dev; + + switch (event) { + case NETDEV_RESEND_IGMP: + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) + ip_mc_rejoin_groups(in_dev); + break; + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block igmp_notifier = { + .notifier_call = igmp_netdev_event, +}; + int __init igmp_mc_proc_init(void) { - return register_pernet_subsys(&igmp_net_ops); + int err; + + err = register_pernet_subsys(&igmp_net_ops); + if (err) + return err; + err = register_netdevice_notifier(&igmp_notifier); + if (err) + goto reg_notif_fail; + return 0; + +reg_notif_fail: + unregister_pernet_subsys(&igmp_net_ops); + return err; } #endif -- cgit v1.2.3-70-g09d2 From 492b200efdd20b8fcfdac873f3cd8d4902386581 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 20 Jul 2013 12:13:54 +0200 Subject: team: add support for sending multicast rejoins Similar to what is implemented in bonding. User is able to ask team driver to send IGMP rejoins in case port is enabled or disabled. Using previously introduced netdev notifier. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/team/team.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/if_team.h | 6 ++++ 2 files changed, 91 insertions(+) (limited to 'include') diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 2587dc86d53..75159e4184f 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -662,6 +662,46 @@ static void team_notify_peers_fini(struct team *team) } +/******************************* + * Send multicast group rejoins + *******************************/ + +static void team_mcast_rejoin_work(struct work_struct *work) +{ + struct team *team; + + team = container_of(work, struct team, mcast_rejoin.dw.work); + + if (!rtnl_trylock()) { + schedule_delayed_work(&team->mcast_rejoin.dw, 0); + return; + } + call_netdevice_notifiers(NETDEV_RESEND_IGMP, team->dev); + rtnl_unlock(); + if (!atomic_dec_and_test(&team->mcast_rejoin.count_pending)) + schedule_delayed_work(&team->mcast_rejoin.dw, + msecs_to_jiffies(team->mcast_rejoin.interval)); +} + +static void team_mcast_rejoin(struct team *team) +{ + if (!team->mcast_rejoin.count || !netif_running(team->dev)) + return; + atomic_set(&team->mcast_rejoin.count_pending, team->mcast_rejoin.count); + schedule_delayed_work(&team->mcast_rejoin.dw, 0); +} + +static void team_mcast_rejoin_init(struct team *team) +{ + INIT_DELAYED_WORK(&team->mcast_rejoin.dw, team_mcast_rejoin_work); +} + +static void team_mcast_rejoin_fini(struct team *team) +{ + cancel_delayed_work_sync(&team->mcast_rejoin.dw); +} + + /************************ * Rx path frame handler ************************/ @@ -887,6 +927,7 @@ static void team_port_enable(struct team *team, if (team->ops.port_enabled) team->ops.port_enabled(team, port); team_notify_peers(team); + team_mcast_rejoin(team); } static void __reconstruct_port_hlist(struct team *team, int rm_index) @@ -917,6 +958,7 @@ static void team_port_disable(struct team *team, team_queue_override_port_del(team, port); team_adjust_ops(team); team_notify_peers(team); + team_mcast_rejoin(team); } #define TEAM_VLAN_FEATURES (NETIF_F_ALL_CSUM | NETIF_F_SG | \ @@ -1275,6 +1317,34 @@ static int team_notify_peers_interval_set(struct team *team, return 0; } +static int team_mcast_rejoin_count_get(struct team *team, + struct team_gsetter_ctx *ctx) +{ + ctx->data.u32_val = team->mcast_rejoin.count; + return 0; +} + +static int team_mcast_rejoin_count_set(struct team *team, + struct team_gsetter_ctx *ctx) +{ + team->mcast_rejoin.count = ctx->data.u32_val; + return 0; +} + +static int team_mcast_rejoin_interval_get(struct team *team, + struct team_gsetter_ctx *ctx) +{ + ctx->data.u32_val = team->mcast_rejoin.interval; + return 0; +} + +static int team_mcast_rejoin_interval_set(struct team *team, + struct team_gsetter_ctx *ctx) +{ + team->mcast_rejoin.interval = ctx->data.u32_val; + return 0; +} + static int team_port_en_option_get(struct team *team, struct team_gsetter_ctx *ctx) { @@ -1398,6 +1468,18 @@ static const struct team_option team_options[] = { .getter = team_notify_peers_interval_get, .setter = team_notify_peers_interval_set, }, + { + .name = "mcast_rejoin_count", + .type = TEAM_OPTION_TYPE_U32, + .getter = team_mcast_rejoin_count_get, + .setter = team_mcast_rejoin_count_set, + }, + { + .name = "mcast_rejoin_interval", + .type = TEAM_OPTION_TYPE_U32, + .getter = team_mcast_rejoin_interval_get, + .setter = team_mcast_rejoin_interval_set, + }, { .name = "enabled", .type = TEAM_OPTION_TYPE_BOOL, @@ -1480,6 +1562,7 @@ static int team_init(struct net_device *dev) INIT_LIST_HEAD(&team->option_inst_list); team_notify_peers_init(team); + team_mcast_rejoin_init(team); err = team_options_register(team, team_options, ARRAY_SIZE(team_options)); if (err) @@ -1491,6 +1574,7 @@ static int team_init(struct net_device *dev) return 0; err_options_register: + team_mcast_rejoin_fini(team); team_notify_peers_fini(team); team_queue_override_fini(team); err_team_queue_override_init: @@ -1511,6 +1595,7 @@ static void team_uninit(struct net_device *dev) __team_change_mode(team, NULL); /* cleanup */ __team_options_unregister(team, team_options, ARRAY_SIZE(team_options)); + team_mcast_rejoin_fini(team); team_notify_peers_fini(team); team_queue_override_fini(team); mutex_unlock(&team->lock); diff --git a/include/linux/if_team.h b/include/linux/if_team.h index b0b83683461..a899dc24be1 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -200,6 +200,12 @@ struct team { atomic_t count_pending; struct delayed_work dw; } notify_peers; + struct { + unsigned int count; + unsigned int interval; /* in ms */ + atomic_t count_pending; + struct delayed_work dw; + } mcast_rejoin; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; -- cgit v1.2.3-70-g09d2 From 91705c61b52029ab5da67a15a23eef08667bf40e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Jul 2013 14:51:47 +0200 Subject: net: sctp: trivial: update mailing list address The SCTP mailing list address to send patches or questions to is linux-sctp@vger.kernel.org and not lksctp-developers@lists.sourceforge.net anymore. Therefore, update all occurences. Signed-off-by: Daniel Borkmann Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- Documentation/networking/sctp.txt | 5 +---- include/net/sctp/auth.h | 2 +- include/net/sctp/checksum.h | 2 +- include/net/sctp/constants.h | 2 +- include/net/sctp/sctp.h | 2 +- include/net/sctp/sm.h | 2 +- include/net/sctp/structs.h | 2 +- include/net/sctp/tsnmap.h | 2 +- include/net/sctp/ulpevent.h | 2 +- include/net/sctp/ulpqueue.h | 2 +- include/uapi/linux/sctp.h | 2 +- net/sctp/associola.c | 2 +- net/sctp/auth.c | 2 +- net/sctp/bind_addr.c | 2 +- net/sctp/chunk.c | 2 +- net/sctp/command.c | 2 +- net/sctp/debug.c | 2 +- net/sctp/endpointola.c | 2 +- net/sctp/input.c | 2 +- net/sctp/inqueue.c | 2 +- net/sctp/ipv6.c | 2 +- net/sctp/objcnt.c | 2 +- net/sctp/output.c | 2 +- net/sctp/outqueue.c | 2 +- net/sctp/primitive.c | 2 +- net/sctp/proc.c | 2 +- net/sctp/protocol.c | 4 ++-- net/sctp/sm_make_chunk.c | 2 +- net/sctp/sm_sideeffect.c | 2 +- net/sctp/sm_statefuns.c | 2 +- net/sctp/sm_statetable.c | 2 +- net/sctp/socket.c | 2 +- net/sctp/ssnmap.c | 2 +- net/sctp/sysctl.c | 2 +- net/sctp/transport.c | 2 +- net/sctp/tsnmap.c | 2 +- net/sctp/ulpevent.c | 2 +- net/sctp/ulpqueue.c | 2 +- 38 files changed, 39 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/Documentation/networking/sctp.txt b/Documentation/networking/sctp.txt index 0c790a76910..97b810ca908 100644 --- a/Documentation/networking/sctp.txt +++ b/Documentation/networking/sctp.txt @@ -19,7 +19,6 @@ of SCTP that is RFC 2960 compliant and provides an programming interface referred to as the UDP-style API of the Sockets Extensions for SCTP, as proposed in IETF Internet-Drafts. - Caveats: -lksctp can be built as statically or as a module. However, be aware that @@ -33,6 +32,4 @@ For more information, please visit the lksctp project website: http://www.sf.net/projects/lksctp Or contact the lksctp developers through the mailing list: - - - + diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index 49bc9577c61..754c511f4cf 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -22,7 +22,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 0cb08e6fb6d..78b88aa8c81 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -25,7 +25,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index ca50e0751e4..a1e7aa5c8bb 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -25,7 +25,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index d8e37ecea69..554cf88605f 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2a82d138470..2aa66dda4a5 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email addresses: - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index e745c92a153..75c4c16601b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -25,7 +25,7 @@ * * Please send any bug reports or fixes you make to the * email addresses: - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/tsnmap.h b/include/net/sctp/tsnmap.h index 2c5d2b4d5d1..c361d6f7e0c 100644 --- a/include/net/sctp/tsnmap.h +++ b/include/net/sctp/tsnmap.h @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index ca4693b4e09..34f0d34d22c 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -31,7 +31,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h index 00e50ba3f24..add3237a9fd 100644 --- a/include/net/sctp/ulpqueue.h +++ b/include/net/sctp/ulpqueue.h @@ -30,7 +30,7 @@ * * Please send any bug reports or fixes you make to the * email addresses: - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 66b466e4ca0..ca451e99b28 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/associola.c b/net/sctp/associola.c index bce5b79662a..e425ba0bd79 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/auth.c b/net/sctp/auth.c index ba1dfc3f8de..3aab967081b 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -22,7 +22,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 64977ea0f9c..f34ce8bc139 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 5780565f5b7..b50b90c16a8 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -24,7 +24,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/command.c b/net/sctp/command.c index c0044019db9..f4bebdadf00 100644 --- a/net/sctp/command.c +++ b/net/sctp/command.c @@ -25,7 +25,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/debug.c b/net/sctp/debug.c index f4998780d6d..44aa4e7543a 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 9e3d257de0e..825b7543a43 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -29,7 +29,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/input.c b/net/sctp/input.c index 3fa4d858c35..7993495a4c0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -29,7 +29,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index cb25f040fed..e70f60ae92e 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -30,7 +30,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 09ffcc912d2..85d688f59e6 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index fe012c44f8d..aec346cbed6 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -26,7 +26,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/output.c b/net/sctp/output.c index a46d1eb4176..5a55c55d71a 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -26,7 +26,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index ef9e2bbc0f2..51313233e63 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c index 794bb14decd..31fa437da11 100644 --- a/net/sctp/primitive.c +++ b/net/sctp/primitive.c @@ -29,7 +29,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 62526c47705..aff0cac45b6 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -22,7 +22,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 4a17494d736..b52ec251010 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -29,7 +29,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp @@ -1547,7 +1547,7 @@ module_exit(sctp_exit); */ MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132"); -MODULE_AUTHOR("Linux Kernel SCTP developers "); +MODULE_AUTHOR("Linux Kernel SCTP developers "); MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); module_param_named(no_checksums, sctp_checksum_disable, bool, 0644); MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification"); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 362ae6e2fd9..780a2d4b98a 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -29,7 +29,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 9da68852ee9..f1f3aac3f3b 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index f6b7109195a..93271f0966c 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 84d98d8a5a7..64ea5fd87eb 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c6670d2e3f8..02457123bde 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -34,7 +34,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c index da860352380..72b5939101b 100644 --- a/net/sctp/ssnmap.c +++ b/net/sctp/ssnmap.c @@ -24,7 +24,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 9a5c4c9edda..190674702b2 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -25,7 +25,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/transport.c b/net/sctp/transport.c index bdbbc3fd7c1..9602c52aa61 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -30,7 +30,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c index b46019568a8..0eff8667d4c 100644 --- a/net/sctp/tsnmap.c +++ b/net/sctp/tsnmap.c @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 44a45dbee4d..c07624fca7e 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 04e3d470f87..2cdb3010f50 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp -- cgit v1.2.3-70-g09d2 From 64dc61306ce7da370833289739e2f52dfc6b37ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Jul 2013 20:26:31 -0700 Subject: net: add sk_stream_is_writeable() helper Several call sites use the hardcoded following condition : sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) Lets use a helper because TCP_NOTSENT_LOWAT support will change this condition for TCP sockets. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++++ net/ceph/messenger.c | 2 +- net/core/stream.c | 2 +- net/dccp/proto.c | 4 ++-- net/ipv4/tcp.c | 4 ++-- net/sunrpc/svcsock.c | 2 +- net/sunrpc/xprtsock.c | 2 +- 7 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index e0473f61693..d0b5fdee50a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1088,6 +1088,10 @@ static inline struct cg_proto *parent_cg_proto(struct proto *proto, } #endif +static inline bool sk_stream_is_writeable(const struct sock *sk) +{ + return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk); +} static inline bool sk_has_memory_pressure(const struct sock *sk) { diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index eb0a46a49bd..3be308e1430 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -409,7 +409,7 @@ static void ceph_sock_write_space(struct sock *sk) * and net/core/stream.c:sk_stream_write_space(). */ if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); queue_con(con); diff --git a/net/core/stream.c b/net/core/stream.c index f5df85dcd20..512f0a24269 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -30,7 +30,7 @@ void sk_stream_write_space(struct sock *sk) struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { + if (sk_stream_is_writeable(sk) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 6c7c78b8394..ba64750f038 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -336,7 +336,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock, mask |= POLLIN | POLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ set_bit(SOCK_ASYNC_NOSPACE, @@ -347,7 +347,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock, * wspace test but before the flags are set, * IO signal will be lost. */ - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + if (sk_stream_is_writeable(sk)) mask |= POLLOUT | POLLWRNORM; } } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5423223e93c..5eca9060bb8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -499,7 +499,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= POLLIN | POLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ set_bit(SOCK_ASYNC_NOSPACE, @@ -510,7 +510,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * wspace test but before the flags are set, * IO signal will be lost. */ - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + if (sk_stream_is_writeable(sk)) mask |= POLLOUT | POLLWRNORM; } } else diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 305374d4fb9..0da6785ec15 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -442,7 +442,7 @@ static void svc_tcp_write_space(struct sock *sk) { struct socket *sock = sk->sk_socket; - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) + if (sk_stream_is_writeable(sk) && sock) clear_bit(SOCK_NOSPACE, &sock->flags); svc_write_space(sk); } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ddf0602603b..d6656d7768f 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1602,7 +1602,7 @@ static void xs_tcp_write_space(struct sock *sk) read_lock_bh(&sk->sk_callback_lock); /* from net/core/stream.c:sk_stream_write_space */ - if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + if (sk_stream_is_writeable(sk)) xs_write_space(sk); read_unlock_bh(&sk->sk_callback_lock); -- cgit v1.2.3-70-g09d2 From c9bee3b7fdecb0c1d070c7b54113b3bdfb9a3d36 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Jul 2013 20:27:07 -0700 Subject: tcp: TCP_NOTSENT_LOWAT socket option Idea of this patch is to add optional limitation of number of unsent bytes in TCP sockets, to reduce usage of kernel memory. TCP receiver might announce a big window, and TCP sender autotuning might allow a large amount of bytes in write queue, but this has little performance impact if a large part of this buffering is wasted : Write queue needs to be large only to deal with large BDP, not necessarily to cope with scheduling delays (incoming ACKS make room for the application to queue more bytes) For most workloads, using a value of 128 KB or less is OK to give applications enough time to react to POLLOUT events in time (or being awaken in a blocking sendmsg()) This patch adds two ways to set the limit : 1) Per socket option TCP_NOTSENT_LOWAT 2) A sysctl (/proc/sys/net/ipv4/tcp_notsent_lowat) for sockets not using TCP_NOTSENT_LOWAT socket option (or setting a zero value) Default value being UINT_MAX (0xFFFFFFFF), meaning this has no effect. This changes poll()/select()/epoll() to report POLLOUT only if number of unsent bytes is below tp->nosent_lowat Note this might increase number of sendmsg()/sendfile() calls when using non blocking sockets, and increase number of context switches for blocking sockets. Note this is not related to SO_SNDLOWAT (as SO_SNDLOWAT is defined as : Specify the minimum number of bytes in the buffer until the socket layer will pass the data to the protocol) Tested: netperf sessions, and watching /proc/net/protocols "memory" column for TCP With 200 concurrent netperf -t TCP_STREAM sessions, amount of kernel memory used by TCP buffers shrinks by ~55 % (20567 pages instead of 45458) lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols TCPv6 1880 2 45458 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y TCP 1696 508 45458 no 208 yes kernel y y y y y y y y y y y y y n y y y y y lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols TCPv6 1880 2 20567 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y TCP 1696 508 20567 no 208 yes kernel y y y y y y y y y y y y y n y y y y y Using 128KB has no bad effect on the throughput or cpu usage of a single flow, although there is an increase of context switches. A bonus is that we hold socket lock for a shorter amount of time and should improve latencies of ACK processing. lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3 OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf. Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand Size Size Size (sec) Util Util Util Util Demand Demand Units Final Final % Method % Method 1651584 6291456 16384 20.00 17447.90 10^6bits/s 3.13 S -1.00 U 0.353 -1.000 usec/KB Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3': 412,514 context-switches 200.034645535 seconds time elapsed lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3 OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf. Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand Size Size Size (sec) Util Util Util Util Demand Demand Units Final Final % Method % Method 1593240 6291456 16384 20.00 17321.16 10^6bits/s 3.35 S -1.00 U 0.381 -1.000 usec/KB Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3': 2,675,818 context-switches 200.029651391 seconds time elapsed Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-By: Yuchung Cheng Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 13 +++++++++++++ include/linux/tcp.h | 1 + include/net/sock.h | 19 +++++++++++++------ include/net/tcp.h | 14 ++++++++++++++ include/uapi/linux/tcp.h | 1 + net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp.c | 7 +++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 3 +++ net/ipv6/tcp_ipv6.c | 1 + 10 files changed, 61 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 10742902146..53cea9bcb14 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -516,6 +516,19 @@ tcp_wmem - vector of 3 INTEGERs: min, default, max this value is ignored. Default: between 64K and 4MB, depending on RAM size. +tcp_notsent_lowat - UNSIGNED INTEGER + A TCP socket can control the amount of unsent bytes in its write queue, + thanks to TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll() + reports POLLOUT events if the amount of unsent bytes is below a per + socket value, and if the write queue is not full. sendmsg() will + also not add new buffers if the limit is hit. + + This global variable controls the amount of unsent data for + sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change + to the global variable has immediate effect. + + Default: UINT_MAX (0xFFFFFFFF) + tcp_workaround_signed_windows - BOOLEAN If set, assume no receipt of a window scaling option means the remote TCP is broken and treats the window as a signed quantity. diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 472120b4fac..9640803a17a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -238,6 +238,7 @@ struct tcp_sock { u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ + u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ diff --git a/include/net/sock.h b/include/net/sock.h index d0b5fdee50a..b9f2b095b1a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -746,11 +746,6 @@ static inline int sk_stream_wspace(const struct sock *sk) extern void sk_stream_write_space(struct sock *sk); -static inline bool sk_stream_memory_free(const struct sock *sk) -{ - return sk->sk_wmem_queued < sk->sk_sndbuf; -} - /* OOB backlog add */ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { @@ -950,6 +945,7 @@ struct proto { unsigned int inuse_idx; #endif + bool (*stream_memory_free)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ @@ -1088,11 +1084,22 @@ static inline struct cg_proto *parent_cg_proto(struct proto *proto, } #endif +static inline bool sk_stream_memory_free(const struct sock *sk) +{ + if (sk->sk_wmem_queued >= sk->sk_sndbuf) + return false; + + return sk->sk_prot->stream_memory_free ? + sk->sk_prot->stream_memory_free(sk) : true; +} + static inline bool sk_stream_is_writeable(const struct sock *sk) { - return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk); + return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && + sk_stream_memory_free(sk); } + static inline bool sk_has_memory_pressure(const struct sock *sk) { return sk->sk_prot->memory_pressure != NULL; diff --git a/include/net/tcp.h b/include/net/tcp.h index c5868471aba..18fc999dae3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -284,6 +284,7 @@ extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; +extern unsigned int sysctl_tcp_notsent_lowat; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -1539,6 +1540,19 @@ extern int tcp_gro_complete(struct sk_buff *skb); extern void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); +static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) +{ + return tp->notsent_lowat ?: sysctl_tcp_notsent_lowat; +} + +static inline bool tcp_stream_memory_free(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 notsent_bytes = tp->write_seq - tp->snd_nxt; + + return notsent_bytes < tcp_notsent_lowat(tp); +} + #ifdef CONFIG_PROC_FS extern int tcp4_proc_init(void); extern void tcp4_proc_exit(void); diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8d776ebc482..377f1e59411 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -111,6 +111,7 @@ enum { #define TCP_REPAIR_OPTIONS 22 #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ #define TCP_TIMESTAMP 24 +#define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b2c123c44d6..69ed203802d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -554,6 +554,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, + { + .procname = "tcp_notsent_lowat", + .data = &sysctl_tcp_notsent_lowat, + .maxlen = sizeof(sysctl_tcp_notsent_lowat), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "tcp_rmem", .data = &sysctl_tcp_rmem, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5eca9060bb8..c27e8139239 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2631,6 +2631,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->tsoffset = val - tcp_time_stamp; break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; default: err = -ENOPROTOOPT; break; @@ -2847,6 +2851,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_TIMESTAMP: val = tcp_time_stamp + tp->tsoffset; break; + case TCP_NOTSENT_LOWAT: + val = tp->notsent_lowat; + break; default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2e3f129df0e..2a5d5c469d1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2800,6 +2800,7 @@ struct proto tcp_prot = { .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 92fde8d1aa8..884efff5b53 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; +EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); + static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 80fe69ef218..b792e870686 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1924,6 +1924,7 @@ struct proto tcpv6_prot = { .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, -- cgit v1.2.3-70-g09d2 From 18afa4b028b46f8b45ca64f94aefe717c297b07d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 Jul 2013 16:13:17 +0200 Subject: net: Make devnet_rename_seq static No users outside net/core/dev.c. Signed-off-by: Thomas Gleixner Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 --- net/core/dev.c | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2bb2357d83b..3ca60b070ef 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1666,9 +1666,6 @@ extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev); extern rwlock_t dev_base_lock; /* Device list lock */ -extern seqcount_t devnet_rename_seq; /* Device rename seq */ - - #define for_each_netdev(net, d) \ list_for_each_entry(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_reverse(net, d) \ diff --git a/net/core/dev.c b/net/core/dev.c index 26755dd40da..dfd9f5d56ae 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -174,7 +174,7 @@ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id; static DEFINE_HASHTABLE(napi_hash, 8); -seqcount_t devnet_rename_seq; +static seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) { -- cgit v1.2.3-70-g09d2 From a77b15a60cb1d54d87e8794bfbc94c9ccee679ed Mon Sep 17 00:00:00 2001 From: Mikel Astiz Date: Fri, 28 Jun 2013 10:56:27 +0200 Subject: Bluetooth: Add HCI authentication capabilities macros Add macros for the HCI capabilities as described in the Bluetooth Core Specification v4.0, Volume 2, part E, section 7.1.29. Signed-off-by: Mikel Astiz Signed-off-by: Gustavo Padovan --- include/net/bluetooth/hci.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 3c592cf473d..a01fbb4a049 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -296,6 +296,12 @@ enum { #define HCI_AT_GENERAL_BONDING 0x04 #define HCI_AT_GENERAL_BONDING_MITM 0x05 +/* I/O capabilities */ +#define HCI_IO_DISPLAY_ONLY 0x00 +#define HCI_IO_DISPLAY_YESNO 0x01 +#define HCI_IO_KEYBOARD_ONLY 0x02 +#define HCI_IO_NO_INPUT_OUTPUT 0x03 + /* Link Key types */ #define HCI_LK_COMBINATION 0x00 #define HCI_LK_LOCAL_UNIT 0x01 -- cgit v1.2.3-70-g09d2 From e7428e95a06fb516fac1308bd0e176e27c0b9287 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 25 Jul 2013 10:20:23 +0930 Subject: virtio-net: put virtio net header inline with data For small packets we can simplify xmit processing by linearizing buffers with the header: most packets seem to have enough head room we can use for this purpose. Since existing hypervisors require that header is the first s/g element, we need a feature bit for this. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 42 +++++++++++++++++++++++++++++++++-------- include/uapi/linux/virtio_net.h | 4 +++- 2 files changed, 37 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3d2a90a6264..f2160027758 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -106,6 +106,9 @@ struct virtnet_info { /* Has control virtqueue */ bool has_cvq; + /* Host can handle any s/g split between our header and packet data */ + bool any_header_sg; + /* enable config space updates */ bool config_enable; @@ -669,12 +672,28 @@ static void free_old_xmit_skbs(struct send_queue *sq) static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) { - struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb); + struct skb_vnet_hdr *hdr; const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; struct virtnet_info *vi = sq->vq->vdev->priv; unsigned num_sg; + unsigned hdr_len; + bool can_push; pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest); + if (vi->mergeable_rx_bufs) + hdr_len = sizeof hdr->mhdr; + else + hdr_len = sizeof hdr->hdr; + + can_push = vi->any_header_sg && + !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) && + !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len; + /* Even if we can, don't push here yet as this would skew + * csum_start offset below. */ + if (can_push) + hdr = (struct skb_vnet_hdr *)(skb->data - hdr_len); + else + hdr = skb_vnet_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; @@ -703,15 +722,18 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) hdr->hdr.gso_size = hdr->hdr.hdr_len = 0; } - hdr->mhdr.num_buffers = 0; - - /* Encode metadata header at front. */ if (vi->mergeable_rx_bufs) - sg_set_buf(sq->sg, &hdr->mhdr, sizeof hdr->mhdr); - else - sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr); + hdr->mhdr.num_buffers = 0; - num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; + if (can_push) { + __skb_push(skb, hdr_len); + num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len); + /* Pull header back to avoid skew in tx bytes calculations. */ + __skb_pull(skb, hdr_len); + } else { + sg_set_buf(sq->sg, hdr, hdr_len); + num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; + } return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC); } @@ -1552,6 +1574,9 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) vi->mergeable_rx_bufs = true; + if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT)) + vi->any_header_sg = true; + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) vi->has_cvq = true; @@ -1727,6 +1752,7 @@ static unsigned int features[] = { VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, VIRTIO_NET_F_CTRL_MAC_ADDR, + VIRTIO_F_ANY_LAYOUT, }; static struct virtio_driver virtio_net_driver = { diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index c520203fac2..227d4ce84e7 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -70,7 +70,9 @@ struct virtio_net_config { __u16 max_virtqueue_pairs; } __attribute__((packed)); -/* This is the first element of the scatter-gather list. If you don't +/* This header comes first in the scatter-gather list. + * If VIRTIO_F_ANY_LAYOUT is not negotiated, it must + * be the first element of the scatter-gather list. If you don't * specify GSO or CSUM features, you can simply ignore the header. */ struct virtio_net_hdr { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset -- cgit v1.2.3-70-g09d2 From 024ec3deac33ddbd81f3c887506f132b24ea21a7 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Thu, 25 Jul 2013 10:52:05 +0900 Subject: net/sctp: Refactor SCTP skb checksum computation This patch consolidates the SCTP checksum calculation code from various places to a single new function, sctp_compute_cksum(skb, offset). Signed-off-by: Joe Stringer Reviewed-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: David S. Miller --- include/net/sctp/checksum.h | 15 +++++++++++++++ net/netfilter/ipvs/ip_vs_proto_sctp.c | 23 +++-------------------- net/netfilter/nf_nat_proto_sctp.c | 8 +------- net/sctp/input.c | 10 +--------- 4 files changed, 20 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 78b88aa8c81..483e6303458 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -85,4 +85,19 @@ static inline __le32 sctp_end_cksum(__u32 crc32) return cpu_to_le32(~crc32); } +/* Calculate the CRC32C checksum of an SCTP packet. */ +static inline __le32 sctp_compute_cksum(const struct sk_buff *skb, + unsigned int offset) +{ + const struct sk_buff *iter; + + __u32 crc32 = sctp_start_cksum(skb->data + offset, + skb_headlen(skb) - offset); + skb_walk_frags(skb, iter) + crc32 = sctp_update_cksum((__u8 *) iter->data, + skb_headlen(iter), crc32); + + return sctp_end_cksum(crc32); +} + #endif /* __sctp_checksum_h__ */ diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 3c0da872803..23e596e438b 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -66,15 +66,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph, unsigned int sctphoff) { - __u32 crc32; - struct sk_buff *iter; - - crc32 = sctp_start_cksum((__u8 *)sctph, skb_headlen(skb) - sctphoff); - skb_walk_frags(skb, iter) - crc32 = sctp_update_cksum((u8 *) iter->data, - skb_headlen(iter), crc32); - sctph->checksum = sctp_end_cksum(crc32); - + sctph->checksum = sctp_compute_cksum(skb, sctphoff); skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -151,10 +143,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { unsigned int sctphoff; struct sctphdr *sh, _sctph; - struct sk_buff *iter; - __le32 cmp; - __le32 val; - __u32 tmp; + __le32 cmp, val; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) @@ -168,13 +157,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) return 0; cmp = sh->checksum; - - tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb)); - skb_walk_frags(skb, iter) - tmp = sctp_update_cksum((__u8 *) iter->data, - skb_headlen(iter), tmp); - - val = sctp_end_cksum(tmp); + val = sctp_compute_cksum(skb, sctphoff); if (val != cmp) { /* CRC failure, dump it. */ diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index 396e55d46f9..754536f2c67 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -34,9 +34,7 @@ sctp_manip_pkt(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { - struct sk_buff *frag; sctp_sctphdr_t *hdr; - __u32 crc32; if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) return false; @@ -51,11 +49,7 @@ sctp_manip_pkt(struct sk_buff *skb, hdr->dest = tuple->dst.u.sctp.port; } - crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff); - skb_walk_frags(skb, frag) - crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag), - crc32); - hdr->checksum = sctp_end_cksum(crc32); + hdr->checksum = sctp_compute_cksum(skb, hdroff); return true; } diff --git a/net/sctp/input.c b/net/sctp/input.c index 7993495a4c0..fa91aff0238 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -87,15 +87,7 @@ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) { struct sctphdr *sh = sctp_hdr(skb); __le32 cmp = sh->checksum; - struct sk_buff *list; - __le32 val; - __u32 tmp = sctp_start_cksum((__u8 *)sh, skb_headlen(skb)); - - skb_walk_frags(skb, list) - tmp = sctp_update_cksum((__u8 *)list->data, skb_headlen(list), - tmp); - - val = sctp_end_cksum(tmp); + __le32 val = sctp_compute_cksum(skb, 0); if (val != cmp) { /* CRC failure, dump it. */ -- cgit v1.2.3-70-g09d2 From a88c32ae15f25fcf0a3c9fadd92f840a1abf0e43 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Jul 2013 13:47:53 +0800 Subject: USBNET: centralize computing of max rx/tx qlen This patch centralizes computing of max rx/tx qlen, because: - RX_QLEN()/TX_QLEN() is called in hot path - computing depends on device's usb speed, now we have ls/fs, hs, ss, so more checks need to be involved - in fact, max rx/tx qlen should not only depend on device USB speed, but also depend on ethernet link speed, so we need to consider that in future. - if SG support is done, max tx qlen may need change too Generally, hard_mtu and rx_urb_size are changed in bind(), reset() and link_reset() callback, and change mtu network operation, this patches introduces the API of usbnet_update_max_qlen(), and calls it in above path. Signed-off-by: Ming Lei Signed-off-by: David S. Miller --- drivers/net/usb/asix_devices.c | 3 +++ drivers/net/usb/ax88179_178a.c | 3 +++ drivers/net/usb/usbnet.c | 45 ++++++++++++++++++++++++++++++++++-------- include/linux/usb/usbnet.h | 3 +++ 4 files changed, 46 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index ad5d1e4384d..b96ad4f15b8 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -778,6 +778,9 @@ static int ax88178_change_mtu(struct net_device *net, int new_mtu) dev->hard_mtu = net->mtu + net->hard_header_len; ax88178_set_mfb(dev); + /* max qlen depend on hard_mtu and rx_urb_size */ + usbnet_update_max_qlen(dev); + return 0; } diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 1e3c302d94f..658540433fa 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -688,6 +688,9 @@ static int ax88179_change_mtu(struct net_device *net, int new_mtu) 2, 2, &tmp16); } + /* max qlen depend on hard_mtu and rx_urb_size */ + usbnet_update_max_qlen(dev); + return 0; } diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 06ee82f557d..d74de074a43 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -59,15 +59,13 @@ * For high speed, each frame comfortably fits almost 36 max size * Ethernet packets (so queues should be bigger). * - * REVISIT qlens should be members of 'struct usbnet'; the goal is to - * let the USB host controller be busy for 5msec or more before an irq - * is required, under load. Jumbograms change the equation. + * The goal is to let the USB host controller be busy for 5msec or + * more before an irq is required, under load. Jumbograms change + * the equation. */ -#define RX_MAX_QUEUE_MEMORY (60 * 1518) -#define RX_QLEN(dev) (((dev)->udev->speed == USB_SPEED_HIGH) ? \ - (RX_MAX_QUEUE_MEMORY/(dev)->rx_urb_size) : 4) -#define TX_QLEN(dev) (((dev)->udev->speed == USB_SPEED_HIGH) ? \ - (RX_MAX_QUEUE_MEMORY/(dev)->hard_mtu) : 4) +#define MAX_QUEUE_MEMORY (60 * 1518) +#define RX_QLEN(dev) ((dev)->rx_qlen) +#define TX_QLEN(dev) ((dev)->tx_qlen) // reawaken network queue this soon after stopping; else watchdog barks #define TX_TIMEOUT_JIFFIES (5*HZ) @@ -347,6 +345,22 @@ void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(usbnet_skb_return); +/* must be called if hard_mtu or rx_urb_size changed */ +void usbnet_update_max_qlen(struct usbnet *dev) +{ + enum usb_device_speed speed = dev->udev->speed; + + switch (speed) { + case USB_SPEED_HIGH: + dev->rx_qlen = MAX_QUEUE_MEMORY / dev->rx_urb_size; + dev->tx_qlen = MAX_QUEUE_MEMORY / dev->hard_mtu; + break; + default: + dev->rx_qlen = dev->tx_qlen = 4; + } +} +EXPORT_SYMBOL_GPL(usbnet_update_max_qlen); + /*------------------------------------------------------------------------- * @@ -375,6 +389,9 @@ int usbnet_change_mtu (struct net_device *net, int new_mtu) usbnet_unlink_rx_urbs(dev); } + /* max qlen depend on hard_mtu and rx_urb_size */ + usbnet_update_max_qlen(dev); + return 0; } EXPORT_SYMBOL_GPL(usbnet_change_mtu); @@ -843,6 +860,9 @@ int usbnet_open (struct net_device *net) goto done; } + /* hard_mtu or rx_urb_size may change in reset() */ + usbnet_update_max_qlen(dev); + // insist peer be connected if (info->check_connect && (retval = info->check_connect (dev)) < 0) { netif_dbg(dev, ifup, dev->net, "can't open; %d\n", retval); @@ -927,6 +947,9 @@ int usbnet_set_settings (struct net_device *net, struct ethtool_cmd *cmd) if (dev->driver_info->link_reset) dev->driver_info->link_reset(dev); + /* hard_mtu or rx_urb_size may change in link_reset() */ + usbnet_update_max_qlen(dev); + return retval; } @@ -1020,6 +1043,9 @@ static void __handle_link_change(struct usbnet *dev) tasklet_schedule(&dev->bh); } + /* hard_mtu or rx_urb_size may change during link change */ + usbnet_update_max_qlen(dev); + clear_bit(EVENT_LINK_CHANGE, &dev->flags); } @@ -1599,6 +1625,9 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) if ((dev->driver_info->flags & FLAG_WWAN) != 0) SET_NETDEV_DEVTYPE(net, &wwan_type); + /* initialize max rx_qlen and tx_qlen */ + usbnet_update_max_qlen(dev); + status = register_netdev (net); if (status) goto out4; diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index f18d64129f9..8fbc008e183 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -34,6 +34,7 @@ struct usbnet { struct mutex phy_mutex; unsigned char suspend_count; unsigned char pkt_cnt, pkt_err; + unsigned short rx_qlen, tx_qlen; /* i/o info: pipes etc */ unsigned in, out; @@ -253,4 +254,6 @@ extern void usbnet_link_change(struct usbnet *, bool, bool); extern int usbnet_status_start(struct usbnet *dev, gfp_t mem_flags); extern void usbnet_status_stop(struct usbnet *dev); +extern void usbnet_update_max_qlen(struct usbnet *dev); + #endif /* __LINUX_USB_USBNET_H */ -- cgit v1.2.3-70-g09d2 From 82a54d0ebbee03a8dcf4e1e4016a53fed4d6c933 Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 25 Jul 2013 17:39:34 +0800 Subject: VSOCK: Move af_vsock.h and vsock_addr.h to include/net This is useful for other VSOCK transport implemented outside the net/vmw_vsock/ directory to use these headers. Signed-off-by: Asias He Acked-by: Andy King Signed-off-by: David S. Miller --- include/net/af_vsock.h | 175 +++++++++++++++++++++++++++++++++++++++++ include/net/vsock_addr.h | 30 +++++++ net/vmw_vsock/af_vsock.c | 3 +- net/vmw_vsock/af_vsock.h | 175 ----------------------------------------- net/vmw_vsock/vmci_transport.c | 2 +- net/vmw_vsock/vmci_transport.h | 4 +- net/vmw_vsock/vsock_addr.c | 3 +- net/vmw_vsock/vsock_addr.h | 30 ------- 8 files changed, 210 insertions(+), 212 deletions(-) create mode 100644 include/net/af_vsock.h create mode 100644 include/net/vsock_addr.h delete mode 100644 net/vmw_vsock/af_vsock.h delete mode 100644 net/vmw_vsock/vsock_addr.h (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h new file mode 100644 index 00000000000..7d64d3609ec --- /dev/null +++ b/include/net/af_vsock.h @@ -0,0 +1,175 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef __AF_VSOCK_H__ +#define __AF_VSOCK_H__ + +#include +#include +#include + +#include "vsock_addr.h" + +#define LAST_RESERVED_PORT 1023 + +#define vsock_sk(__sk) ((struct vsock_sock *)__sk) +#define sk_vsock(__vsk) (&(__vsk)->sk) + +struct vsock_sock { + /* sk must be the first member. */ + struct sock sk; + struct sockaddr_vm local_addr; + struct sockaddr_vm remote_addr; + /* Links for the global tables of bound and connected sockets. */ + struct list_head bound_table; + struct list_head connected_table; + /* Accessed without the socket lock held. This means it can never be + * modified outsided of socket create or destruct. + */ + bool trusted; + bool cached_peer_allow_dgram; /* Dgram communication allowed to + * cached peer? + */ + u32 cached_peer; /* Context ID of last dgram destination check. */ + const struct cred *owner; + /* Rest are SOCK_STREAM only. */ + long connect_timeout; + /* Listening socket that this came from. */ + struct sock *listener; + /* Used for pending list and accept queue during connection handshake. + * The listening socket is the head for both lists. Sockets created + * for connection requests are placed in the pending list until they + * are connected, at which point they are put in the accept queue list + * so they can be accepted in accept(). If accept() cannot accept the + * connection, it is marked as rejected so the cleanup function knows + * to clean up the socket. + */ + struct list_head pending_links; + struct list_head accept_queue; + bool rejected; + struct delayed_work dwork; + u32 peer_shutdown; + bool sent_request; + bool ignore_connecting_rst; + + /* Private to transport. */ + void *trans; +}; + +s64 vsock_stream_has_data(struct vsock_sock *vsk); +s64 vsock_stream_has_space(struct vsock_sock *vsk); +void vsock_pending_work(struct work_struct *work); +struct sock *__vsock_create(struct net *net, + struct socket *sock, + struct sock *parent, + gfp_t priority, unsigned short type); + +/**** TRANSPORT ****/ + +struct vsock_transport_recv_notify_data { + u64 data1; /* Transport-defined. */ + u64 data2; /* Transport-defined. */ + bool notify_on_block; +}; + +struct vsock_transport_send_notify_data { + u64 data1; /* Transport-defined. */ + u64 data2; /* Transport-defined. */ +}; + +struct vsock_transport { + /* Initialize/tear-down socket. */ + int (*init)(struct vsock_sock *, struct vsock_sock *); + void (*destruct)(struct vsock_sock *); + void (*release)(struct vsock_sock *); + + /* Connections. */ + int (*connect)(struct vsock_sock *); + + /* DGRAM. */ + int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *); + int (*dgram_dequeue)(struct kiocb *kiocb, struct vsock_sock *vsk, + struct msghdr *msg, size_t len, int flags); + int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *, + struct iovec *, size_t len); + bool (*dgram_allow)(u32 cid, u32 port); + + /* STREAM. */ + /* TODO: stream_bind() */ + ssize_t (*stream_dequeue)(struct vsock_sock *, struct iovec *, + size_t len, int flags); + ssize_t (*stream_enqueue)(struct vsock_sock *, struct iovec *, + size_t len); + s64 (*stream_has_data)(struct vsock_sock *); + s64 (*stream_has_space)(struct vsock_sock *); + u64 (*stream_rcvhiwat)(struct vsock_sock *); + bool (*stream_is_active)(struct vsock_sock *); + bool (*stream_allow)(u32 cid, u32 port); + + /* Notification. */ + int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); + int (*notify_poll_out)(struct vsock_sock *, size_t, bool *); + int (*notify_recv_init)(struct vsock_sock *, size_t, + struct vsock_transport_recv_notify_data *); + int (*notify_recv_pre_block)(struct vsock_sock *, size_t, + struct vsock_transport_recv_notify_data *); + int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t, + struct vsock_transport_recv_notify_data *); + int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t, + ssize_t, bool, struct vsock_transport_recv_notify_data *); + int (*notify_send_init)(struct vsock_sock *, + struct vsock_transport_send_notify_data *); + int (*notify_send_pre_block)(struct vsock_sock *, + struct vsock_transport_send_notify_data *); + int (*notify_send_pre_enqueue)(struct vsock_sock *, + struct vsock_transport_send_notify_data *); + int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t, + struct vsock_transport_send_notify_data *); + + /* Shutdown. */ + int (*shutdown)(struct vsock_sock *, int); + + /* Buffer sizes. */ + void (*set_buffer_size)(struct vsock_sock *, u64); + void (*set_min_buffer_size)(struct vsock_sock *, u64); + void (*set_max_buffer_size)(struct vsock_sock *, u64); + u64 (*get_buffer_size)(struct vsock_sock *); + u64 (*get_min_buffer_size)(struct vsock_sock *); + u64 (*get_max_buffer_size)(struct vsock_sock *); + + /* Addressing. */ + u32 (*get_local_cid)(void); +}; + +/**** CORE ****/ + +int vsock_core_init(const struct vsock_transport *t); +void vsock_core_exit(void); + +/**** UTILS ****/ + +void vsock_release_pending(struct sock *pending); +void vsock_add_pending(struct sock *listener, struct sock *pending); +void vsock_remove_pending(struct sock *listener, struct sock *pending); +void vsock_enqueue_accept(struct sock *listener, struct sock *connected); +void vsock_insert_connected(struct vsock_sock *vsk); +void vsock_remove_bound(struct vsock_sock *vsk); +void vsock_remove_connected(struct vsock_sock *vsk); +struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); +struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, + struct sockaddr_vm *dst); +void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); + +#endif /* __AF_VSOCK_H__ */ diff --git a/include/net/vsock_addr.h b/include/net/vsock_addr.h new file mode 100644 index 00000000000..9ccd5316eac --- /dev/null +++ b/include/net/vsock_addr.h @@ -0,0 +1,30 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _VSOCK_ADDR_H_ +#define _VSOCK_ADDR_H_ + +#include + +void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port); +int vsock_addr_validate(const struct sockaddr_vm *addr); +bool vsock_addr_bound(const struct sockaddr_vm *addr); +void vsock_addr_unbind(struct sockaddr_vm *addr); +bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, + const struct sockaddr_vm *other); +int vsock_addr_cast(const struct sockaddr *addr, size_t len, + struct sockaddr_vm **out_addr); + +#endif diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 593071dabd1..57896eee1c2 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -96,8 +96,7 @@ #include #include #include - -#include "af_vsock.h" +#include static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); static void vsock_sk_destruct(struct sock *sk); diff --git a/net/vmw_vsock/af_vsock.h b/net/vmw_vsock/af_vsock.h deleted file mode 100644 index 7d64d3609ec..00000000000 --- a/net/vmw_vsock/af_vsock.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * VMware vSockets Driver - * - * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation version 2 and no later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -#ifndef __AF_VSOCK_H__ -#define __AF_VSOCK_H__ - -#include -#include -#include - -#include "vsock_addr.h" - -#define LAST_RESERVED_PORT 1023 - -#define vsock_sk(__sk) ((struct vsock_sock *)__sk) -#define sk_vsock(__vsk) (&(__vsk)->sk) - -struct vsock_sock { - /* sk must be the first member. */ - struct sock sk; - struct sockaddr_vm local_addr; - struct sockaddr_vm remote_addr; - /* Links for the global tables of bound and connected sockets. */ - struct list_head bound_table; - struct list_head connected_table; - /* Accessed without the socket lock held. This means it can never be - * modified outsided of socket create or destruct. - */ - bool trusted; - bool cached_peer_allow_dgram; /* Dgram communication allowed to - * cached peer? - */ - u32 cached_peer; /* Context ID of last dgram destination check. */ - const struct cred *owner; - /* Rest are SOCK_STREAM only. */ - long connect_timeout; - /* Listening socket that this came from. */ - struct sock *listener; - /* Used for pending list and accept queue during connection handshake. - * The listening socket is the head for both lists. Sockets created - * for connection requests are placed in the pending list until they - * are connected, at which point they are put in the accept queue list - * so they can be accepted in accept(). If accept() cannot accept the - * connection, it is marked as rejected so the cleanup function knows - * to clean up the socket. - */ - struct list_head pending_links; - struct list_head accept_queue; - bool rejected; - struct delayed_work dwork; - u32 peer_shutdown; - bool sent_request; - bool ignore_connecting_rst; - - /* Private to transport. */ - void *trans; -}; - -s64 vsock_stream_has_data(struct vsock_sock *vsk); -s64 vsock_stream_has_space(struct vsock_sock *vsk); -void vsock_pending_work(struct work_struct *work); -struct sock *__vsock_create(struct net *net, - struct socket *sock, - struct sock *parent, - gfp_t priority, unsigned short type); - -/**** TRANSPORT ****/ - -struct vsock_transport_recv_notify_data { - u64 data1; /* Transport-defined. */ - u64 data2; /* Transport-defined. */ - bool notify_on_block; -}; - -struct vsock_transport_send_notify_data { - u64 data1; /* Transport-defined. */ - u64 data2; /* Transport-defined. */ -}; - -struct vsock_transport { - /* Initialize/tear-down socket. */ - int (*init)(struct vsock_sock *, struct vsock_sock *); - void (*destruct)(struct vsock_sock *); - void (*release)(struct vsock_sock *); - - /* Connections. */ - int (*connect)(struct vsock_sock *); - - /* DGRAM. */ - int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *); - int (*dgram_dequeue)(struct kiocb *kiocb, struct vsock_sock *vsk, - struct msghdr *msg, size_t len, int flags); - int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *, - struct iovec *, size_t len); - bool (*dgram_allow)(u32 cid, u32 port); - - /* STREAM. */ - /* TODO: stream_bind() */ - ssize_t (*stream_dequeue)(struct vsock_sock *, struct iovec *, - size_t len, int flags); - ssize_t (*stream_enqueue)(struct vsock_sock *, struct iovec *, - size_t len); - s64 (*stream_has_data)(struct vsock_sock *); - s64 (*stream_has_space)(struct vsock_sock *); - u64 (*stream_rcvhiwat)(struct vsock_sock *); - bool (*stream_is_active)(struct vsock_sock *); - bool (*stream_allow)(u32 cid, u32 port); - - /* Notification. */ - int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); - int (*notify_poll_out)(struct vsock_sock *, size_t, bool *); - int (*notify_recv_init)(struct vsock_sock *, size_t, - struct vsock_transport_recv_notify_data *); - int (*notify_recv_pre_block)(struct vsock_sock *, size_t, - struct vsock_transport_recv_notify_data *); - int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t, - struct vsock_transport_recv_notify_data *); - int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t, - ssize_t, bool, struct vsock_transport_recv_notify_data *); - int (*notify_send_init)(struct vsock_sock *, - struct vsock_transport_send_notify_data *); - int (*notify_send_pre_block)(struct vsock_sock *, - struct vsock_transport_send_notify_data *); - int (*notify_send_pre_enqueue)(struct vsock_sock *, - struct vsock_transport_send_notify_data *); - int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t, - struct vsock_transport_send_notify_data *); - - /* Shutdown. */ - int (*shutdown)(struct vsock_sock *, int); - - /* Buffer sizes. */ - void (*set_buffer_size)(struct vsock_sock *, u64); - void (*set_min_buffer_size)(struct vsock_sock *, u64); - void (*set_max_buffer_size)(struct vsock_sock *, u64); - u64 (*get_buffer_size)(struct vsock_sock *); - u64 (*get_min_buffer_size)(struct vsock_sock *); - u64 (*get_max_buffer_size)(struct vsock_sock *); - - /* Addressing. */ - u32 (*get_local_cid)(void); -}; - -/**** CORE ****/ - -int vsock_core_init(const struct vsock_transport *t); -void vsock_core_exit(void); - -/**** UTILS ****/ - -void vsock_release_pending(struct sock *pending); -void vsock_add_pending(struct sock *listener, struct sock *pending); -void vsock_remove_pending(struct sock *listener, struct sock *pending); -void vsock_enqueue_accept(struct sock *listener, struct sock *connected); -void vsock_insert_connected(struct vsock_sock *vsk); -void vsock_remove_bound(struct vsock_sock *vsk); -void vsock_remove_connected(struct vsock_sock *vsk); -struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); -struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, - struct sockaddr_vm *dst); -void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); - -#endif /* __AF_VSOCK_H__ */ diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index ffc11df02af..9d6986634e0 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -34,8 +34,8 @@ #include #include #include +#include -#include "af_vsock.h" #include "vmci_transport_notify.h" static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg); diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h index fd88ea8924e..ce6c9623d5f 100644 --- a/net/vmw_vsock/vmci_transport.h +++ b/net/vmw_vsock/vmci_transport.h @@ -19,8 +19,8 @@ #include #include -#include "vsock_addr.h" -#include "af_vsock.h" +#include +#include /* If the packet format changes in a release then this should change too. */ #define VMCI_TRANSPORT_PACKET_VERSION 1 diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c index ec2611b4ea0..82486ee55ea 100644 --- a/net/vmw_vsock/vsock_addr.c +++ b/net/vmw_vsock/vsock_addr.c @@ -17,8 +17,7 @@ #include #include #include - -#include "vsock_addr.h" +#include void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port) { diff --git a/net/vmw_vsock/vsock_addr.h b/net/vmw_vsock/vsock_addr.h deleted file mode 100644 index 9ccd5316eac..00000000000 --- a/net/vmw_vsock/vsock_addr.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * VMware vSockets Driver - * - * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation version 2 and no later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -#ifndef _VSOCK_ADDR_H_ -#define _VSOCK_ADDR_H_ - -#include - -void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port); -int vsock_addr_validate(const struct sockaddr_vm *addr); -bool vsock_addr_bound(const struct sockaddr_vm *addr); -void vsock_addr_unbind(struct sockaddr_vm *addr); -bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, - const struct sockaddr_vm *other); -int vsock_addr_cast(const struct sockaddr *addr, size_t len, - struct sockaddr_vm **out_addr); - -#endif -- cgit v1.2.3-70-g09d2 From c26bf4a51308c85a6f97628253b99767a84ff90a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 25 Jul 2013 18:12:18 +0200 Subject: pktgen: Add UDPCSUM flag to support UDP checksums UDP checksums are optional, hence pktgen has been omitting them in favour of performance. The optional flag UDPCSUM enables UDP checksumming. If the output device supports hardware checksumming the skb is prepared and marked CHECKSUM_PARTIAL, otherwise the checksum is generated in software. Signed-off-by: Thomas Graf Cc: Eric Dumazet Cc: Ben Greear Signed-off-by: David S. Miller --- include/net/udp.h | 1 + net/core/pktgen.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++----- net/ipv4/udp.c | 3 ++- 3 files changed, 55 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 74c10ec5e74..ef2e0b7843a 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -183,6 +183,7 @@ extern int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len); extern int udp_push_pending_frames(struct sock *sk); extern void udp_flush_pending_frames(struct sock *sk); +extern void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); extern int udp_rcv(struct sk_buff *skb); extern int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); extern int udp_disconnect(struct sock *sk, int flags); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 9640972ec50..48cebf2c3e7 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -160,6 +160,7 @@ #include #include #include +#include #include #ifdef CONFIG_XFRM #include @@ -198,6 +199,7 @@ #define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ #define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ #define F_NODE (1<<15) /* Node memory alloc*/ +#define F_UDPCSUM (1<<16) /* Include UDP checksum */ /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -631,6 +633,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->flags & F_UDPDST_RND) seq_printf(seq, "UDPDST_RND "); + if (pkt_dev->flags & F_UDPCSUM) + seq_printf(seq, "UDPCSUM "); + if (pkt_dev->flags & F_MPLS_RND) seq_printf(seq, "MPLS_RND "); @@ -1228,6 +1233,12 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "!NODE_ALLOC") == 0) pkt_dev->flags &= ~F_NODE; + else if (strcmp(f, "UDPCSUM") == 0) + pkt_dev->flags |= F_UDPCSUM; + + else if (strcmp(f, "!UDPCSUM") == 0) + pkt_dev->flags &= ~F_UDPCSUM; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", @@ -2733,7 +2744,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, udph->source = htons(pkt_dev->cur_udp_src); udph->dest = htons(pkt_dev->cur_udp_dst); udph->len = htons(datalen + 8); /* DATA + udphdr */ - udph->check = 0; /* No checksum */ + udph->check = 0; iph->ihl = 5; iph->version = 4; @@ -2752,6 +2763,24 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->protocol = protocol; skb->dev = odev; skb->pkt_type = PACKET_HOST; + + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & NETIF_F_V4_CSUM) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + udp4_hwcsum(skb, udph->source, udph->dest); + } else { + __wsum csum = udp_csum(skb); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_tcpudp_magic(udph->source, udph->dest, + datalen + 8, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + pktgen_finalize_skb(pkt_dev, skb, datalen); #ifdef CONFIG_XFRM @@ -2768,7 +2797,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, struct sk_buff *skb = NULL; __u8 *eth; struct udphdr *udph; - int datalen; + int datalen, udplen; struct ipv6hdr *iph; __be16 protocol = htons(ETH_P_IPV6); __be32 *mpls; @@ -2844,10 +2873,11 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, net_info_ratelimited("increased datalen to %d\n", datalen); } + udplen = datalen + sizeof(struct udphdr); udph->source = htons(pkt_dev->cur_udp_src); udph->dest = htons(pkt_dev->cur_udp_dst); - udph->len = htons(datalen + sizeof(struct udphdr)); - udph->check = 0; /* No checksum */ + udph->len = htons(udplen); + udph->check = 0; *(__be32 *) iph = htonl(0x60000000); /* Version + flow */ @@ -2858,7 +2888,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, iph->hop_limit = 32; - iph->payload_len = htons(sizeof(struct udphdr) + datalen); + iph->payload_len = htons(udplen); iph->nexthdr = IPPROTO_UDP; iph->daddr = pkt_dev->cur_in6_daddr; @@ -2868,6 +2898,23 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb->dev = odev; skb->pkt_type = PACKET_HOST; + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & NETIF_F_V6_CSUM) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0); + } else { + __wsum csum = udp_csum(skb); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + pktgen_finalize_skb(pkt_dev, skb, datalen); return skb; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 766e6bab911..9e88af0e8ab 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -704,7 +704,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames); * @src: source IP address * @dst: destination IP address */ -static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) +void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { struct udphdr *uh = udp_hdr(skb); struct sk_buff *frags = skb_shinfo(skb)->frag_list; @@ -740,6 +740,7 @@ static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) uh->check = CSUM_MANGLED_0; } } +EXPORT_SYMBOL_GPL(udp4_hwcsum); static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) { -- cgit v1.2.3-70-g09d2 From fe6f700d6cbb7e8a61711e325f53d9c9e0a42a4c Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Sun, 28 Jul 2013 18:54:21 +0300 Subject: net/mlx4_core: Respond to operation request by firmware This commit adds new firmware command and new firmware event. The firmware raises the MLX4_EVENT_TYPE_OP_REQUIRED event in order to signal the driver it needs to perform an administrative operation throughout the MLX4_CMD_GET_OP_REQ command. At the moment the supported operation is adding/removing multicast entries which are used by the firmware for handling NCSI traffic in B0 steering mode. Also, had to swap the order of mlx4_init_mcg_table() and mlx4_init_eq_table() to make sure that driver will get events only after resources are initialized to handle it. Signed-off-by: Yevgeny Petrilin Signed-off-by: Jack Morgenstein Signed-off-by: Eugenia Emantayev Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 18 ++++++ drivers/net/ethernet/mellanox/mlx4/eq.c | 9 +++ drivers/net/ethernet/mellanox/mlx4/fw.c | 104 ++++++++++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx4/fw.h | 1 + drivers/net/ethernet/mellanox/mlx4/main.c | 35 +++++----- drivers/net/ethernet/mellanox/mlx4/mcg.c | 11 ---- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 13 ++++ include/linux/mlx4/cmd.h | 1 + include/linux/mlx4/device.h | 1 + 9 files changed, 166 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 299d0184f98..141322c31ae 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -809,6 +809,15 @@ int MLX4_CMD_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave, return -EPERM; } +int MLX4_CMD_GET_OP_REQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return -EPERM; +} + int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -1251,6 +1260,15 @@ static struct mlx4_cmd_info cmd_info[] = { .verify = NULL, .wrapper = MLX4_CMD_UPDATE_QP_wrapper }, + { + .opcode = MLX4_CMD_GET_OP_REQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = MLX4_CMD_GET_OP_REQ_wrapper, + }, { .opcode = MLX4_CMD_CONF_SPECIAL_QP, .has_inbox = false, diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 7e042869ef0..0416c5b3b35 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -79,6 +79,7 @@ enum { (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE) | \ (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT) | \ (1ull << MLX4_EVENT_TYPE_CMD) | \ + (1ull << MLX4_EVENT_TYPE_OP_REQUIRED) | \ (1ull << MLX4_EVENT_TYPE_COMM_CHANNEL) | \ (1ull << MLX4_EVENT_TYPE_FLR_EVENT) | \ (1ull << MLX4_EVENT_TYPE_FATAL_WARNING)) @@ -629,6 +630,14 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn); break; + case MLX4_EVENT_TYPE_OP_REQUIRED: + atomic_inc(&priv->opreq_count); + /* FW commands can't be executed from interrupt context + * working in deferred task + */ + queue_work(mlx4_wq, &priv->opreq_task); + break; + case MLX4_EVENT_TYPE_COMM_CHANNEL: if (!mlx4_is_master(dev)) { mlx4_warn(dev, "Received comm channel event " diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 8873d6802c8..aec6f5802da 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -1705,3 +1705,107 @@ int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port) MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } EXPORT_SYMBOL_GPL(mlx4_wol_write); + +enum { + ADD_TO_MCG = 0x26, +}; + + +void mlx4_opreq_action(struct work_struct *work) +{ + struct mlx4_priv *priv = container_of(work, struct mlx4_priv, + opreq_task); + struct mlx4_dev *dev = &priv->dev; + int num_tasks = atomic_read(&priv->opreq_count); + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_mgm *mgm; + u32 *outbox; + u32 modifier; + u16 token; + u16 type_m; + u16 type; + int err; + u32 num_qps; + struct mlx4_qp qp; + int i; + u8 rem_mcg; + u8 prot; + +#define GET_OP_REQ_MODIFIER_OFFSET 0x08 +#define GET_OP_REQ_TOKEN_OFFSET 0x14 +#define GET_OP_REQ_TYPE_OFFSET 0x1a +#define GET_OP_REQ_DATA_OFFSET 0x20 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_err(dev, "Failed to allocate mailbox for GET_OP_REQ\n"); + return; + } + outbox = mailbox->buf; + + while (num_tasks) { + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, + MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Failed to retreive required operation: %d\n", + err); + return; + } + MLX4_GET(modifier, outbox, GET_OP_REQ_MODIFIER_OFFSET); + MLX4_GET(token, outbox, GET_OP_REQ_TOKEN_OFFSET); + MLX4_GET(type, outbox, GET_OP_REQ_TYPE_OFFSET); + type_m = type >> 12; + type &= 0xfff; + + switch (type) { + case ADD_TO_MCG: + if (dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED) { + mlx4_warn(dev, "ADD MCG operation is not supported in DEVICE_MANAGED steering mode\n"); + err = EPERM; + break; + } + mgm = (struct mlx4_mgm *)((u8 *)(outbox) + + GET_OP_REQ_DATA_OFFSET); + num_qps = be32_to_cpu(mgm->members_count) & + MGM_QPN_MASK; + rem_mcg = ((u8 *)(&mgm->members_count))[0] & 1; + prot = ((u8 *)(&mgm->members_count))[0] >> 6; + + for (i = 0; i < num_qps; i++) { + qp.qpn = be32_to_cpu(mgm->qp[i]); + if (rem_mcg) + err = mlx4_multicast_detach(dev, &qp, + mgm->gid, + prot, 0); + else + err = mlx4_multicast_attach(dev, &qp, + mgm->gid, + mgm->gid[5] + , 0, prot, + NULL); + if (err) + break; + } + break; + default: + mlx4_warn(dev, "Bad type for required operation\n"); + err = EINVAL; + break; + } + err = mlx4_cmd(dev, 0, ((u32) err | cpu_to_be32(token) << 16), + 1, MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Failed to acknowledge required request: %d\n", + err); + goto out; + } + memset(outbox, 0, 0xffc); + num_tasks = atomic_dec_return(&priv->opreq_count); + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index fdf41665a05..a0a368b7c93 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -220,5 +220,6 @@ int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); int mlx4_NOP(struct mlx4_dev *dev); int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg); +void mlx4_opreq_action(struct work_struct *work); #endif /* MLX4_FW_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index e85af922dcd..f1d818f9bb0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1692,11 +1692,19 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) goto err_xrcd_table_free; } + if (!mlx4_is_slave(dev)) { + err = mlx4_init_mcg_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize multicast group table, aborting.\n"); + goto err_mr_table_free; + } + } + err = mlx4_init_eq_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "event queue table, aborting.\n"); - goto err_mr_table_free; + goto err_mcg_table_free; } err = mlx4_cmd_use_events(dev); @@ -1746,19 +1754,10 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) goto err_srq_table_free; } - if (!mlx4_is_slave(dev)) { - err = mlx4_init_mcg_table(dev); - if (err) { - mlx4_err(dev, "Failed to initialize " - "multicast group table, aborting.\n"); - goto err_qp_table_free; - } - } - err = mlx4_init_counters_table(dev); if (err && err != -ENOENT) { mlx4_err(dev, "Failed to initialize counters table, aborting.\n"); - goto err_mcg_table_free; + goto err_qp_table_free; } if (!mlx4_is_slave(dev)) { @@ -1803,9 +1802,6 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) err_counters_table_free: mlx4_cleanup_counters_table(dev); -err_mcg_table_free: - mlx4_cleanup_mcg_table(dev); - err_qp_table_free: mlx4_cleanup_qp_table(dev); @@ -1821,6 +1817,10 @@ err_cmd_poll: err_eq_table_free: mlx4_cleanup_eq_table(dev); +err_mcg_table_free: + if (!mlx4_is_slave(dev)) + mlx4_cleanup_mcg_table(dev); + err_mr_table_free: mlx4_cleanup_mr_table(dev); @@ -2197,6 +2197,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) } } + atomic_set(&priv->opreq_count, 0); + INIT_WORK(&priv->opreq_task, mlx4_opreq_action); + /* * Now reset the HCA before we touch the PCI capabilities or * attempt a firmware command, since a boot ROM may have left @@ -2315,12 +2318,12 @@ err_port: mlx4_cleanup_port_info(&priv->port[port]); mlx4_cleanup_counters_table(dev); - mlx4_cleanup_mcg_table(dev); mlx4_cleanup_qp_table(dev); mlx4_cleanup_srq_table(dev); mlx4_cleanup_cq_table(dev); mlx4_cmd_use_polling(dev); mlx4_cleanup_eq_table(dev); + mlx4_cleanup_mcg_table(dev); mlx4_cleanup_mr_table(dev); mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); @@ -2403,12 +2406,12 @@ static void mlx4_remove_one(struct pci_dev *pdev) RES_TR_FREE_SLAVES_ONLY); mlx4_cleanup_counters_table(dev); - mlx4_cleanup_mcg_table(dev); mlx4_cleanup_qp_table(dev); mlx4_cleanup_srq_table(dev); mlx4_cleanup_cq_table(dev); mlx4_cmd_use_polling(dev); mlx4_cleanup_eq_table(dev); + mlx4_cleanup_mcg_table(dev); mlx4_cleanup_mr_table(dev); mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c index f3e804f2a35..55f6245efb6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mcg.c +++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c @@ -39,19 +39,8 @@ #include "mlx4.h" -#define MGM_QPN_MASK 0x00FFFFFF -#define MGM_BLCK_LB_BIT 30 - static const u8 zero_gid[16]; /* automatically initialized to 0 */ -struct mlx4_mgm { - __be32 next_gid_index; - __be32 members_count; - u32 reserved[2]; - u8 gid[16]; - __be32 qp[MLX4_MAX_QP_PER_MGM]; -}; - int mlx4_get_mgm_entry_size(struct mlx4_dev *dev) { return 1 << dev->oper_log_mgm_entry_size; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 17d9277e33e..348bb8c7d9a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -554,6 +554,17 @@ struct mlx4_mfunc { struct mlx4_mfunc_master_ctx master; }; +#define MGM_QPN_MASK 0x00FFFFFF +#define MGM_BLCK_LB_BIT 30 + +struct mlx4_mgm { + __be32 next_gid_index; + __be32 members_count; + u32 reserved[2]; + u8 gid[16]; + __be32 qp[MLX4_MAX_QP_PER_MGM]; +}; + struct mlx4_cmd { struct pci_pool *pool; void __iomem *hcr; @@ -802,6 +813,8 @@ struct mlx4_priv { u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; __be64 slave_node_guids[MLX4_MFUNC_MAX]; + atomic_t opreq_count; + struct work_struct opreq_task; }; static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev) diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index bb1c8096a7e..cd1fdf75103 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -69,6 +69,7 @@ enum { MLX4_CMD_SET_ICM_SIZE = 0xffd, /*master notify fw on finish for slave's flr*/ MLX4_CMD_INFORM_FLR_DONE = 0x5b, + MLX4_CMD_GET_OP_REQ = 0x59, /* TPT commands */ MLX4_CMD_SW2HW_MPT = 0xd, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 52c23a892ba..6aebdfe0ed8 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -207,6 +207,7 @@ enum mlx4_event { MLX4_EVENT_TYPE_CMD = 0x0a, MLX4_EVENT_TYPE_VEP_UPDATE = 0x19, MLX4_EVENT_TYPE_COMM_CHANNEL = 0x18, + MLX4_EVENT_TYPE_OP_REQUIRED = 0x1a, MLX4_EVENT_TYPE_FATAL_WARNING = 0x1b, MLX4_EVENT_TYPE_FLR_EVENT = 0x1c, MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT = 0x1d, -- cgit v1.2.3-70-g09d2 From 60ff779c4abba37a31bd8624ef45026f7fb1b70c Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Thu, 25 Jul 2013 10:54:24 +0200 Subject: 9p: client: remove unused code and any reference to "cancelled" function This patch reverts commit 80b45261a0b263536b043c5ccfc4ba4fc27c2acc which was implementing a 'cancelled' functionality to notify that a cancelled request will not be replied. This implementation was not used anywhere and therefore removed. Signed-off-by: Andi Shyti Signed-off-by: David S. Miller --- include/net/9p/transport.h | 3 --- net/9p/client.c | 9 ++------- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index d9fa68f26c4..9a36d929711 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -40,8 +40,6 @@ * @close: member function to discard a connection on this transport * @request: member function to issue a request to the transport * @cancel: member function to cancel a request (if it hasn't been sent) - * @cancelled: member function to notify that a cancelled request will not - * not receive a reply * * This is the basic API for a transport module which is registered by the * transport module with the 9P core network module and used by the client @@ -60,7 +58,6 @@ struct p9_trans_module { void (*close) (struct p9_client *); int (*request) (struct p9_client *, struct p9_req_t *req); int (*cancel) (struct p9_client *, struct p9_req_t *req); - int (*cancelled)(struct p9_client *, struct p9_req_t *req); int (*zc_request)(struct p9_client *, struct p9_req_t *, char *, char *, int , int, int, int); }; diff --git a/net/9p/client.c b/net/9p/client.c index 8b93cae2d11..ba93bdab270 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -658,17 +658,12 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) /* * if we haven't received a response for oldreq, - * remove it from the list, and notify the transport - * layer that the reply will never arrive. + * remove it from the list */ - spin_lock(&c->lock); if (oldreq->status == REQ_STATUS_FLSH) { + spin_lock(&c->lock); list_del(&oldreq->req_list); spin_unlock(&c->lock); - if (c->trans_mod->cancelled) - c->trans_mod->cancelled(c, req); - } else { - spin_unlock(&c->lock); } p9_free_req(c, req); -- cgit v1.2.3-70-g09d2 From 66b52b0dc82c5c88d769dc1c7d44cf45d0deb07c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 29 Jul 2013 18:16:49 +0200 Subject: net: add ndo to get id of physical port of the device This patch adds a ndo for getting physical port of the device. Driver which is aware of being virtual function of some physical port should implement this ndo. This is applicable not only for IOV, but for other solutions (NPAR, multichannel) as well. Basically if there is possible to have multiple netdevs on the single hw port. Signed-off-by: Jiri Pirko Acked-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 ++++++++++++++++++++ net/core/dev.c | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ca60b070ef..875f869dc38 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -728,6 +728,16 @@ struct netdev_fcoe_hbainfo { }; #endif +#define MAX_PHYS_PORT_ID_LEN 32 + +/* This structure holds a unique identifier to identify the + * physical port used by a netdevice. + */ +struct netdev_phys_port_id { + unsigned char id[MAX_PHYS_PORT_ID_LEN]; + unsigned char id_len; +}; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -932,6 +942,12 @@ struct netdev_fcoe_hbainfo { * that determine carrier state from physical hardware properties (eg * network cables) or protocol-dependent mechanisms (eg * USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function. + * + * int (*ndo_get_phys_port_id)(struct net_device *dev, + * struct netdev_phys_port_id *ppid); + * Called to get ID of physical port of this device. If driver does + * not implement this, it is assumed that the hw is not able to have + * multiple net devices on single physical port. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1060,6 +1076,8 @@ struct net_device_ops { struct nlmsghdr *nlh); int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); + int (*ndo_get_phys_port_id)(struct net_device *dev, + struct netdev_phys_port_id *ppid); }; /* @@ -2315,6 +2333,8 @@ extern int dev_set_mac_address(struct net_device *, struct sockaddr *); extern int dev_change_carrier(struct net_device *, bool new_carrier); +extern int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_port_id *ppid); extern int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq); diff --git a/net/core/dev.c b/net/core/dev.c index dfd9f5d56ae..58eb802584b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4988,6 +4988,24 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier) } EXPORT_SYMBOL(dev_change_carrier); +/** + * dev_get_phys_port_id - Get device physical port ID + * @dev: device + * @ppid: port ID + * + * Get device physical port ID + */ +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_port_id *ppid) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_id(dev, ppid); +} +EXPORT_SYMBOL(dev_get_phys_port_id); + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace -- cgit v1.2.3-70-g09d2 From 66cae9ed6bc46b8cc57a9693f99f69926f3cc7ef Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 29 Jul 2013 18:16:50 +0200 Subject: rtnl: export physical port id via RT netlink Signed-off-by: Jiri Pirko Acked-by: Ben Hutchings Signed-off-by: Narendra K Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/core/rtnetlink.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 03f6170ab33..04c0e7a5d48 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -143,6 +143,7 @@ enum { IFLA_NUM_TX_QUEUES, IFLA_NUM_RX_QUEUES, IFLA_CARRIER, + IFLA_PHYS_PORT_ID, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3de740834d1..0b2972c59b9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -767,7 +767,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ - + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ + + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -846,6 +847,24 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + struct netdev_phys_port_id ppid; + + err = dev_get_phys_port_id(dev, &ppid); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -913,6 +932,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } + if (rtnl_phys_port_id_fill(skb, dev)) + goto nla_put_failure; + attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (attr == NULL) @@ -1113,6 +1135,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, }; EXPORT_SYMBOL(ifla_policy); -- cgit v1.2.3-70-g09d2 From 59da381ee2afc806f85becf3aa64ffc952355552 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 31 Jul 2013 06:53:21 +0000 Subject: PCI: move enum pcie_link_width into pci.h pcie_link_width is the enum used to define the link width values for a pcie device. This enum should not be contained solely in pci_hotplug.h, and this patch moves it next to pci_bus_speed in pci.h Signed-off-by: Jacob Keller Tested-by: Phil Schmitt Acked-by: Bjorn Helgaas Signed-off-by: Jeff Kirsher --- include/linux/pci.h | 13 +++++++++++++ include/linux/pci_hotplug.h | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0fd1f1582fa..a0bf22d816c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -183,6 +183,19 @@ enum pci_bus_flags { PCI_BUS_FLAGS_NO_MMRBC = (__force pci_bus_flags_t) 2, }; +/* These values come from the PCI Express Spec */ +enum pcie_link_width { + PCIE_LNK_WIDTH_RESRV = 0x00, + PCIE_LNK_X1 = 0x01, + PCIE_LNK_X2 = 0x02, + PCIE_LNK_X4 = 0x04, + PCIE_LNK_X8 = 0x08, + PCIE_LNK_X12 = 0x0C, + PCIE_LNK_X16 = 0x10, + PCIE_LNK_X32 = 0x20, + PCIE_LNK_WIDTH_UNKNOWN = 0xFF, +}; + /* Based on the PCI Hotplug Spec, but some values are made up by us */ enum pci_bus_speed { PCI_SPEED_33MHz = 0x00, diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index 8db71dcd633..64e61e05d9a 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -28,19 +28,6 @@ #ifndef _PCI_HOTPLUG_H #define _PCI_HOTPLUG_H -/* These values come from the PCI Express Spec */ -enum pcie_link_width { - PCIE_LNK_WIDTH_RESRV = 0x00, - PCIE_LNK_X1 = 0x01, - PCIE_LNK_X2 = 0x02, - PCIE_LNK_X4 = 0x04, - PCIE_LNK_X8 = 0x08, - PCIE_LNK_X12 = 0x0C, - PCIE_LNK_X16 = 0x10, - PCIE_LNK_X32 = 0x20, - PCIE_LNK_WIDTH_UNKNOWN = 0xFF, -}; - /** * struct hotplug_slot_ops -the callbacks that the hotplug pci core can use * @owner: The module owner of this structure -- cgit v1.2.3-70-g09d2 From 81377c8d3563e7aec5c8baaaacacb48034f430a0 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 31 Jul 2013 06:53:26 +0000 Subject: PCI: Add function to obtain minimum link width and speed A PCI Express device can potentially report a link width and speed which it will not properly fulfill due to being plugged into a slower link higher in the chain. This function walks up the PCI bus chain and calculates the minimum link width and speed of this entire chain. This can be useful to enable a device to determine if it has enough bandwidth for optimum functionality. Signed-off-by: Jacob Keller Tested-by: Phil Schmitt Acked-by: Bjorn Helgaas Signed-off-by: Jeff Kirsher --- drivers/pci/pci.c | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 2 ++ 2 files changed, 45 insertions(+) (limited to 'include') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e37fea6e178..c71e78c4670 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3578,6 +3578,49 @@ int pcie_set_mps(struct pci_dev *dev, int mps) PCI_EXP_DEVCTL_PAYLOAD, v); } +/** + * pcie_get_minimum_link - determine minimum link settings of a PCI device + * @dev: PCI device to query + * @speed: storage for minimum speed + * @width: storage for minimum width + * + * This function will walk up the PCI device chain and determine the minimum + * link width and speed of the device. + */ +int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + int ret; + + *speed = PCI_SPEED_UNKNOWN; + *width = PCIE_LNK_WIDTH_UNKNOWN; + + while (dev) { + u16 lnksta; + enum pci_bus_speed next_speed; + enum pcie_link_width next_width; + + ret = pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); + if (ret) + return ret; + + next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS]; + next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + + if (next_speed < *speed) + *speed = next_speed; + + if (next_width < *width) + *width = next_width; + + dev = dev->bus->self; + } + + return 0; +} +EXPORT_SYMBOL(pcie_get_minimum_link); + /** * pci_select_bars - Make BAR mask from the type of resource * @dev: the PCI device for which BAR mask is made diff --git a/include/linux/pci.h b/include/linux/pci.h index a0bf22d816c..2edbee64aeb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -934,6 +934,8 @@ int pcie_get_readrq(struct pci_dev *dev); int pcie_set_readrq(struct pci_dev *dev, int rq); int pcie_get_mps(struct pci_dev *dev); int pcie_set_mps(struct pci_dev *dev, int mps); +int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed, + enum pcie_link_width *width); int __pci_reset_function(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); -- cgit v1.2.3-70-g09d2 From 6704af53fc3c5d4113d67d5ff40943d420966cd8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 28 Jul 2013 22:54:07 +0200 Subject: netfilter: nf_conntrack: remove net_ratelimit() for LOG_INVALID() Logging of invalid packets has to be explicitly enabled. Rate-limiting these messages is inconsistent with other netfilter logging features and makes debugging harder. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 914d8d90079..b411d7b17de 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -148,17 +148,10 @@ extern int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL -#ifdef DEBUG_INVALID_PACKETS #define LOG_INVALID(net, proto) \ ((net)->ct.sysctl_log_invalid == (proto) || \ (net)->ct.sysctl_log_invalid == IPPROTO_RAW) #else -#define LOG_INVALID(net, proto) \ - (((net)->ct.sysctl_log_invalid == (proto) || \ - (net)->ct.sysctl_log_invalid == IPPROTO_RAW) \ - && net_ratelimit()) -#endif -#else static inline int LOG_INVALID(struct net *net, int proto) { return 0; } #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3-70-g09d2 From 312a0c16c1fa9dd7cb5af413cf73b2fe2806c962 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 28 Jul 2013 22:54:08 +0200 Subject: netfilter: nf_conntrack: constify sk_buff argument to nf_ct_attach() Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 4 ++-- net/netfilter/core.c | 7 ++++--- net/netfilter/nf_conntrack_core.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index de70f7b45b6..f4bbf2cd22d 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -314,8 +314,8 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) #endif /*CONFIG_NETFILTER*/ #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) -extern void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu; -extern void nf_ct_attach(struct sk_buff *, struct sk_buff *); +extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu; +extern void nf_ct_attach(struct sk_buff *, const struct sk_buff *); extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu; struct nf_conn; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 2217363ab42..593b16ea45e 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -234,12 +234,13 @@ EXPORT_SYMBOL(skb_make_writable); /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence manufactured ICMP or RST packets will not be associated with it. */ -void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly; +void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) + __rcu __read_mostly; EXPORT_SYMBOL(ip_ct_attach); -void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) { - void (*attach)(struct sk_buff *, struct sk_buff *); + void (*attach)(struct sk_buff *, const struct sk_buff *); if (skb->nfct) { rcu_read_lock(); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0283baedcdf..d32afaff72f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1192,7 +1192,7 @@ EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); #endif /* Used by ipt_REJECT and ip6t_REJECT. */ -static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; -- cgit v1.2.3-70-g09d2 From fd158d79d33d3c8b693e3e2d8c0e3068d529c2dc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 29 Jul 2013 15:41:52 +0200 Subject: netfilter: tproxy: remove nf_tproxy_core, keep tw sk assigned to skb The module was "permanent", due to the special tproxy skb->destructor. Nowadays we have tcp early demux and its sock_edemux destructor in networking core which can be used instead. Thanks to early demux changes the input path now also handles "skb->sk is tw socket" correctly, so this no longer needs the special handling introduced with commit d503b30bd648b3cb4e5f50b65d27e389960cc6d9 (netfilter: tproxy: do not assign timewait sockets to skb->sk). Thus: - move assign_sock function to where its needed - don't prevent timewait sockets from being assigned to the skb - remove nf_tproxy_core. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/tproxy.txt | 5 ++- include/net/netfilter/nf_tproxy_core.h | 4 --- net/netfilter/Kconfig | 22 +++--------- net/netfilter/Makefile | 3 -- net/netfilter/nf_tproxy_core.c | 62 ---------------------------------- net/netfilter/xt_TPROXY.c | 9 +++++ 6 files changed, 16 insertions(+), 89 deletions(-) delete mode 100644 net/netfilter/nf_tproxy_core.c (limited to 'include') diff --git a/Documentation/networking/tproxy.txt b/Documentation/networking/tproxy.txt index 7b5996d9357..ec11429e1d4 100644 --- a/Documentation/networking/tproxy.txt +++ b/Documentation/networking/tproxy.txt @@ -2,9 +2,8 @@ Transparent proxy support ========================= This feature adds Linux 2.2-like transparent proxy support to current kernels. -To use it, enable NETFILTER_TPROXY, the socket match and the TPROXY target in -your kernel config. You will need policy routing too, so be sure to enable that -as well. +To use it, enable the socket match and the TPROXY target in your kernel config. +You will need policy routing too, so be sure to enable that as well. 1. Making non-local sockets work diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h index 36d9379d4c4..975ffa4545a 100644 --- a/include/net/netfilter/nf_tproxy_core.h +++ b/include/net/netfilter/nf_tproxy_core.h @@ -203,8 +203,4 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, } #endif -/* assign a socket to the skb -- consumes sk */ -void -nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk); - #endif diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 56d22cae590..c45fc1a60e0 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -410,20 +410,6 @@ config NF_NAT_TFTP endif # NF_CONNTRACK -# transparent proxy support -config NETFILTER_TPROXY - tristate "Transparent proxying support" - depends on IP_NF_MANGLE - depends on NETFILTER_ADVANCED - help - This option enables transparent proxying support, that is, - support for handling non-locally bound IPv4 TCP and UDP sockets. - For it to work you will have to configure certain iptables rules - and use policy routing. For more information on how to set it up - see Documentation/networking/tproxy.txt. - - To compile it as a module, choose M here. If unsure, say N. - config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n @@ -720,10 +706,10 @@ config NETFILTER_XT_TARGET_TEE this clone be rerouted to another nexthop. config NETFILTER_XT_TARGET_TPROXY - tristate '"TPROXY" target support' - depends on NETFILTER_TPROXY + tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED + depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help @@ -731,6 +717,9 @@ config NETFILTER_XT_TARGET_TPROXY REDIRECT. It can only be used in the mangle table and is useful to redirect traffic to a transparent proxy. It does _not_ depend on Netfilter connection tracking and NAT, unlike REDIRECT. + For it to work you will have to configure certain iptables rules + and use policy routing. For more information on how to set it up + see Documentation/networking/tproxy.txt. To compile it as a module, choose M here. If unsure, say N. @@ -1180,7 +1169,6 @@ config NETFILTER_XT_MATCH_SCTP config NETFILTER_XT_MATCH_SOCKET tristate '"socket" match support' - depends on NETFILTER_TPROXY depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index a1abf87d43b..ebfa7dc747c 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -61,9 +61,6 @@ obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o -# transparent proxy support -obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o - # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c deleted file mode 100644 index 474d621cbc2..00000000000 --- a/net/netfilter/nf_tproxy_core.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Transparent proxy support for Linux/iptables - * - * Copyright (c) 2006-2007 BalaBit IT Ltd. - * Author: Balazs Scheidler, Krisztian Kovacs - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ - -#include - -#include -#include -#include -#include -#include - - -static void -nf_tproxy_destructor(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - - skb->sk = NULL; - skb->destructor = NULL; - - if (sk) - sock_put(sk); -} - -/* consumes sk */ -void -nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) -{ - /* assigning tw sockets complicates things; most - * skb->sk->X checks would have to test sk->sk_state first */ - if (sk->sk_state == TCP_TIME_WAIT) { - inet_twsk_put(inet_twsk(sk)); - return; - } - - skb_orphan(skb); - skb->sk = sk; - skb->destructor = nf_tproxy_destructor; -} -EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock); - -static int __init nf_tproxy_init(void) -{ - pr_info("NF_TPROXY: Transparent proxy support initialized, version 4.1.0\n"); - pr_info("NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.\n"); - return 0; -} - -module_init(nf_tproxy_init); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Krisztian Kovacs"); -MODULE_DESCRIPTION("Transparent proxy support core routines"); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index d7f195388f6..17c40deafa4 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -117,6 +117,15 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, return sk; } +/* assign a socket to the skb -- consumes sk */ +static void +nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_edemux; +} + static unsigned int tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, u_int32_t mark_mask, u_int32_t mark_value) -- cgit v1.2.3-70-g09d2 From 93742cf8af9dd3b053242b273040aa35fcbf93b3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 29 Jul 2013 15:41:53 +0200 Subject: netfilter: tproxy: remove nf_tproxy_core.h We've removed nf_tproxy_core.ko, so also remove its header. The lookup helpers are split and then moved to tproxy target/socket match. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tproxy_core.h | 206 --------------------------------- net/netfilter/xt_TPROXY.c | 160 ++++++++++++++++++++++++- net/netfilter/xt_socket.c | 66 ++++++++++- 3 files changed, 220 insertions(+), 212 deletions(-) delete mode 100644 include/net/netfilter/nf_tproxy_core.h (limited to 'include') diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h deleted file mode 100644 index 975ffa4545a..00000000000 --- a/include/net/netfilter/nf_tproxy_core.h +++ /dev/null @@ -1,206 +0,0 @@ -#ifndef _NF_TPROXY_CORE_H -#define _NF_TPROXY_CORE_H - -#include -#include -#include -#include -#include -#include -#include - -#define NFT_LOOKUP_ANY 0 -#define NFT_LOOKUP_LISTENER 1 -#define NFT_LOOKUP_ESTABLISHED 2 - -/* look up and get a reference to a matching socket */ - - -/* This function is used by the 'TPROXY' target and the 'socket' - * match. The following lookups are supported: - * - * Explicit TProxy target rule - * =========================== - * - * This is used when the user wants to intercept a connection matching - * an explicit iptables rule. In this case the sockets are assumed - * matching in preference order: - * - * - match: if there's a fully established connection matching the - * _packet_ tuple, it is returned, assuming the redirection - * already took place and we process a packet belonging to an - * established connection - * - * - match: if there's a listening socket matching the redirection - * (e.g. on-port & on-ip of the connection), it is returned, - * regardless if it was bound to 0.0.0.0 or an explicit - * address. The reasoning is that if there's an explicit rule, it - * does not really matter if the listener is bound to an interface - * or to 0. The user already stated that he wants redirection - * (since he added the rule). - * - * "socket" match based redirection (no specific rule) - * =================================================== - * - * There are connections with dynamic endpoints (e.g. FTP data - * connection) that the user is unable to add explicit rules - * for. These are taken care of by a generic "socket" rule. It is - * assumed that the proxy application is trusted to open such - * connections without explicit iptables rule (except of course the - * generic 'socket' rule). In this case the following sockets are - * matched in preference order: - * - * - match: if there's a fully established connection matching the - * _packet_ tuple - * - * - match: if there's a non-zero bound listener (possibly with a - * non-local address) We don't accept zero-bound listeners, since - * then local services could intercept traffic going through the - * box. - * - * Please note that there's an overlap between what a TPROXY target - * and a socket match will match. Normally if you have both rules the - * "socket" match will be the first one, effectively all packets - * belonging to established connections going through that one. - */ -static inline struct sock * -nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, - const __be32 saddr, const __be32 daddr, - const __be16 sport, const __be16 dport, - const struct net_device *in, int lookup_type) -{ - struct sock *sk; - - /* look up socket */ - switch (protocol) { - case IPPROTO_TCP: - switch (lookup_type) { - case NFT_LOOKUP_ANY: - sk = __inet_lookup(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); - break; - case NFT_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, &tcp_hashinfo, - saddr, sport, - daddr, dport, - in->ifindex); - - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too */ - - break; - case NFT_LOOKUP_ESTABLISHED: - sk = inet_lookup_established(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); - break; - default: - WARN_ON(1); - sk = NULL; - break; - } - break; - case IPPROTO_UDP: - sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, - in->ifindex); - if (sk && lookup_type != NFT_LOOKUP_ANY) { - int connected = (sk->sk_state == TCP_ESTABLISHED); - int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0); - - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too */ - if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || - (lookup_type == NFT_LOOKUP_LISTENER && connected)) { - sock_put(sk); - sk = NULL; - } - } - break; - default: - WARN_ON(1); - sk = NULL; - } - - pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", - protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); - - return sk; -} - -#if IS_ENABLED(CONFIG_IPV6) -static inline struct sock * -nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, - const struct in6_addr *saddr, const struct in6_addr *daddr, - const __be16 sport, const __be16 dport, - const struct net_device *in, int lookup_type) -{ - struct sock *sk; - - /* look up socket */ - switch (protocol) { - case IPPROTO_TCP: - switch (lookup_type) { - case NFT_LOOKUP_ANY: - sk = inet6_lookup(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); - break; - case NFT_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, &tcp_hashinfo, - saddr, sport, - daddr, ntohs(dport), - in->ifindex); - - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too */ - - break; - case NFT_LOOKUP_ESTABLISHED: - sk = __inet6_lookup_established(net, &tcp_hashinfo, - saddr, sport, daddr, ntohs(dport), - in->ifindex); - break; - default: - WARN_ON(1); - sk = NULL; - break; - } - break; - case IPPROTO_UDP: - sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, - in->ifindex); - if (sk && lookup_type != NFT_LOOKUP_ANY) { - int connected = (sk->sk_state == TCP_ESTABLISHED); - int wildcard = ipv6_addr_any(&inet6_sk(sk)->rcv_saddr); - - /* NOTE: we return listeners even if bound to - * 0.0.0.0, those are filtered out in - * xt_socket, since xt_TPROXY needs 0 bound - * listeners too */ - if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || - (lookup_type == NFT_LOOKUP_LISTENER && connected)) { - sock_put(sk); - sk = NULL; - } - } - break; - default: - WARN_ON(1); - sk = NULL; - } - - pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n", - protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk); - - return sk; -} -#endif - -#endif diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 17c40deafa4..851383a7f46 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -15,7 +15,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -26,13 +28,18 @@ #define XT_TPROXY_HAVE_IPV6 1 #include #include +#include #include #include #endif -#include #include +enum nf_tproxy_lookup_t { + NFT_LOOKUP_LISTENER, + NFT_LOOKUP_ESTABLISHED, +}; + static bool tproxy_sk_is_transparent(struct sock *sk) { if (sk->sk_state != TCP_TIME_WAIT) { @@ -68,6 +75,157 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) return laddr ? laddr : daddr; } +/* + * This is used when the user wants to intercept a connection matching + * an explicit iptables rule. In this case the sockets are assumed + * matching in preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple, it is returned, assuming the redirection + * already took place and we process a packet belonging to an + * established connection + * + * - match: if there's a listening socket matching the redirection + * (e.g. on-port & on-ip of the connection), it is returned, + * regardless if it was bound to 0.0.0.0 or an explicit + * address. The reasoning is that if there's an explicit rule, it + * does not really matter if the listener is bound to an interface + * or to 0. The user already stated that he wants redirection + * (since he added the rule). + * + * Please note that there's an overlap between what a TPROXY target + * and a socket match will match. Normally if you have both rules the + * "socket" match will be the first one, effectively all packets + * belonging to established connections going through that one. + */ +static inline struct sock * +nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type) +{ + struct sock *sk; + + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NFT_LOOKUP_LISTENER: + sk = inet_lookup_listener(net, &tcp_hashinfo, + saddr, sport, + daddr, dport, + in->ifindex); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + break; + case NFT_LOOKUP_ESTABLISHED: + sk = inet_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + break; + default: + BUG(); + } + break; + case IPPROTO_UDP: + sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || + (lookup_type == NFT_LOOKUP_LISTENER && connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", + protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); + + return sk; +} + +#if IS_ENABLED(CONFIG_IPV6) +static inline struct sock * +nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, + const struct in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type) +{ + struct sock *sk; + + switch (protocol) { + case IPPROTO_TCP: + switch (lookup_type) { + case NFT_LOOKUP_LISTENER: + sk = inet6_lookup_listener(net, &tcp_hashinfo, + saddr, sport, + daddr, ntohs(dport), + in->ifindex); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + break; + case NFT_LOOKUP_ESTABLISHED: + sk = __inet6_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, ntohs(dport), + in->ifindex); + break; + default: + BUG(); + } + break; + case IPPROTO_UDP: + sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + if (sk) { + int connected = (sk->sk_state == TCP_ESTABLISHED); + int wildcard = ipv6_addr_any(&inet6_sk(sk)->rcv_saddr); + + /* NOTE: we return listeners even if bound to + * 0.0.0.0, those are filtered out in + * xt_socket, since xt_TPROXY needs 0 bound + * listeners too + */ + if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) || + (lookup_type == NFT_LOOKUP_LISTENER && connected)) { + sock_put(sk); + sk = NULL; + } + } + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n", + protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk); + + return sk; +} +#endif + /** * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections * @skb: The skb being processed. diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index f8b71911037..a7dd108d406 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -19,12 +19,12 @@ #include #include #include -#include #include #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) #define XT_SOCKET_HAVE_IPV6 1 #include +#include #include #endif @@ -101,6 +101,43 @@ extract_icmp4_fields(const struct sk_buff *skb, return 0; } +/* "socket" match based redirection (no specific rule) + * =================================================== + * + * There are connections with dynamic endpoints (e.g. FTP data + * connection) that the user is unable to add explicit rules + * for. These are taken care of by a generic "socket" rule. It is + * assumed that the proxy application is trusted to open such + * connections without explicit iptables rule (except of course the + * generic 'socket' rule). In this case the following sockets are + * matched in preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple + * + * - match: if there's a non-zero bound listener (possibly with a + * non-local address) We don't accept zero-bound listeners, since + * then local services could intercept traffic going through the + * box. + */ +static struct sock * +xt_socket_get_sock_v4(struct net *net, const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in) +{ + switch (protocol) { + case IPPROTO_TCP: + return __inet_lookup(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + case IPPROTO_UDP: + return udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + } + return NULL; +} + static bool socket_match(const struct sk_buff *skb, struct xt_action_param *par, const struct xt_socket_mtinfo1 *info) @@ -156,9 +193,9 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, #endif if (!sk) - sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, + sk = xt_socket_get_sock_v4(dev_net(skb->dev), protocol, saddr, daddr, sport, dport, - par->in, NFT_LOOKUP_ANY); + par->in); if (sk) { bool wildcard; bool transparent = true; @@ -261,6 +298,25 @@ extract_icmp6_fields(const struct sk_buff *skb, return 0; } +static struct sock * +xt_socket_get_sock_v6(struct net *net, const u8 protocol, + const struct in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in) +{ + switch (protocol) { + case IPPROTO_TCP: + return inet6_lookup(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + case IPPROTO_UDP: + return udp6_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + } + + return NULL; +} + static bool socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) { @@ -298,9 +354,9 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) } if (!sk) - sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + sk = xt_socket_get_sock_v6(dev_net(skb->dev), tproto, saddr, daddr, sport, dport, - par->in, NFT_LOOKUP_ANY); + par->in); if (sk) { bool wildcard; bool transparent = true; -- cgit v1.2.3-70-g09d2 From 02982c27ba1e1bd9f9d4747214e19ca83aa88d0e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 29 Jul 2013 15:41:54 +0200 Subject: netfilter: nf_conntrack: remove duplicate code in ctnetlink ctnetlink contains copy-paste code from death_by_timeout. In order to avoid changing both places in upcoming event delivery patch, export death_by_timeout functionality and use it in the ctnetlink code. Based on earlier patch from Pablo Neira. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 3 +-- net/netfilter/nf_conntrack_core.c | 29 ++++++++++++++++------------- net/netfilter/nf_conntrack_netlink.c | 18 +++--------------- 3 files changed, 20 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 644d9c223d2..939aced35a0 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -181,8 +181,7 @@ __nf_conntrack_find(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple); extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); -extern void nf_ct_delete_from_lists(struct nf_conn *ct); -extern void nf_ct_dying_timeout(struct nf_conn *ct); +bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report); extern void nf_conntrack_flush_report(struct net *net, u32 portid, int report); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index d32afaff72f..089e408676f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -238,7 +238,7 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_conntrack_free(ct); } -void nf_ct_delete_from_lists(struct nf_conn *ct) +static void nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); @@ -253,7 +253,6 @@ void nf_ct_delete_from_lists(struct nf_conn *ct) &net->ct.dying); spin_unlock_bh(&nf_conntrack_lock); } -EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists); static void death_by_event(unsigned long ul_conntrack) { @@ -275,7 +274,7 @@ static void death_by_event(unsigned long ul_conntrack) nf_ct_put(ct); } -void nf_ct_dying_timeout(struct nf_conn *ct) +static void nf_ct_dying_timeout(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); @@ -288,27 +287,33 @@ void nf_ct_dying_timeout(struct nf_conn *ct) (prandom_u32() % net->ct.sysctl_events_retry_timeout); add_timer(&ecache->timeout); } -EXPORT_SYMBOL_GPL(nf_ct_dying_timeout); -static void death_by_timeout(unsigned long ul_conntrack) +bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { - struct nf_conn *ct = (void *)ul_conntrack; struct nf_conn_tstamp *tstamp; tstamp = nf_conn_tstamp_find(ct); if (tstamp && tstamp->stop == 0) tstamp->stop = ktime_to_ns(ktime_get_real()); - if (!test_bit(IPS_DYING_BIT, &ct->status) && - unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { + if (!nf_ct_is_dying(ct) && + unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct, + portid, report) < 0)) { /* destroy event was not delivered */ nf_ct_delete_from_lists(ct); nf_ct_dying_timeout(ct); - return; + return false; } set_bit(IPS_DYING_BIT, &ct->status); nf_ct_delete_from_lists(ct); nf_ct_put(ct); + return true; +} +EXPORT_SYMBOL_GPL(nf_ct_delete); + +static void death_by_timeout(unsigned long ul_conntrack) +{ + nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0); } /* @@ -643,10 +648,7 @@ static noinline int early_drop(struct net *net, unsigned int hash) return dropped; if (del_timer(&ct->timeout)) { - death_by_timeout((unsigned long)ct); - /* Check if we indeed killed this entry. Reliable event - delivery may have inserted it into the dying list. */ - if (test_bit(IPS_DYING_BIT, &ct->status)) { + if (nf_ct_delete(ct, 0, 0)) { dropped = 1; NF_CT_STAT_INC_ATOMIC(net, early_drop); } @@ -1253,6 +1255,7 @@ void nf_ct_iterate_cleanup(struct net *net, /* Time to push up daises... */ if (del_timer(&ct->timeout)) death_by_timeout((unsigned long)ct); + /* ... else the timer will get him soon. */ nf_ct_put(ct); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index edc410e778f..e842c0ded79 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1038,21 +1038,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, } } - if (del_timer(&ct->timeout)) { - if (nf_conntrack_event_report(IPCT_DESTROY, ct, - NETLINK_CB(skb).portid, - nlmsg_report(nlh)) < 0) { - nf_ct_delete_from_lists(ct); - /* we failed to report the event, try later */ - nf_ct_dying_timeout(ct); - nf_ct_put(ct); - return 0; - } - /* death_by_timeout would report the event again */ - set_bit(IPS_DYING_BIT, &ct->status); - nf_ct_delete_from_lists(ct); - nf_ct_put(ct); - } + if (del_timer(&ct->timeout)) + nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); + nf_ct_put(ct); return 0; -- cgit v1.2.3-70-g09d2 From 2d89c68ac78ae432038ef23371d2fa949d725d43 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 28 Jul 2013 22:54:10 +0200 Subject: netfilter: nf_nat: change sequence number adjustments to 32 bits Using 16 bits is too small, when many adjustments happen the offsets might overflow and break the connection. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 2 +- include/net/netfilter/nf_conntrack.h | 2 +- include/net/netfilter/nf_nat.h | 2 +- include/net/netfilter/nf_nat_helper.h | 6 +++--- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 4 ++-- net/netfilter/nf_nat_helper.c | 8 ++++---- 7 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index f4bbf2cd22d..655d5d198d4 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -330,7 +330,7 @@ extern struct nfq_ct_hook __rcu *nfq_ct_hook; struct nfq_ct_nat_hook { void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, - u32 ctinfo, int off); + u32 ctinfo, s32 off); }; extern struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook; #else diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 939aced35a0..e5eb8b62538 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -234,7 +234,7 @@ static inline bool nf_ct_kill(struct nf_conn *ct) } /* These are for NAT. Icky. */ -extern s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, +extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq); diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index ad14a799fd2..e2441413675 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -19,7 +19,7 @@ struct nf_nat_seq { u_int32_t correction_pos; /* sequence number offset before and after last modification */ - int16_t offset_before, offset_after; + int32_t offset_before, offset_after; }; #include diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h index b4d6bfc2af0..194c3479492 100644 --- a/include/net/netfilter/nf_nat_helper.h +++ b/include/net/netfilter/nf_nat_helper.h @@ -41,7 +41,7 @@ extern int nf_nat_mangle_udp_packet(struct sk_buff *skb, extern void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - __be32 seq, s16 off); + __be32 seq, s32 off); extern int nf_nat_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -56,11 +56,11 @@ extern int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, extern void nf_nat_follow_master(struct nf_conn *ct, struct nf_conntrack_expect *this); -extern s16 nf_nat_get_offset(const struct nf_conn *ct, +extern s32 nf_nat_get_offset(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq); extern void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, - u32 dir, int off); + u32 dir, s32 off); #endif diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 089e408676f..0934611ff9f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1695,7 +1695,7 @@ err_stat: return ret; } -s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, +s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq); EXPORT_SYMBOL_GPL(nf_ct_nat_offset); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 7dcc376eea5..8f308d89632 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -496,7 +496,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } #ifdef CONFIG_NF_NAT_NEEDED -static inline s16 nat_offset(const struct nf_conn *ct, +static inline s32 nat_offset(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq) { @@ -525,7 +525,7 @@ static bool tcp_in_window(const struct nf_conn *ct, struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; - s16 receiver_offset; + s32 receiver_offset; bool res; /* diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index a7262ed055c..ff4a589e3e3 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -68,13 +68,13 @@ adjust_tcp_sequence(u32 seq, } /* Get the offset value, for conntrack */ -s16 nf_nat_get_offset(const struct nf_conn *ct, +s32 nf_nat_get_offset(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq) { struct nf_conn_nat *nat = nfct_nat(ct); struct nf_nat_seq *this_way; - s16 offset; + s32 offset; if (!nat) return 0; @@ -143,7 +143,7 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra) } void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - __be32 seq, s16 off) + __be32 seq, s32 off) { if (!off) return; @@ -370,7 +370,7 @@ nf_nat_seq_adjust(struct sk_buff *skb, struct tcphdr *tcph; int dir; __be32 newseq, newack; - s16 seqoff, ackoff; + s32 seqoff, ackoff; struct nf_conn_nat *nat = nfct_nat(ct); struct nf_nat_seq *this_way, *other_way; int res; -- cgit v1.2.3-70-g09d2 From c0155b2da4cd6583b1b729451249ca346e1c05a2 Mon Sep 17 00:00:00 2001 From: Dmitry Popov Date: Wed, 31 Jul 2013 13:39:45 +0400 Subject: tcp: Remove unused tcpct declarations and comments Remove declaration, 4 defines and confusing comment that are no longer used since 1a2c6181c4 ("tcp: Remove TCPCT"). Signed-off-by: Dmitry Popov Acked-by: Christoph Paasch Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 - include/net/tcp.h | 4 ---- net/ipv4/tcp.c | 4 ---- 3 files changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9640803a17a..d68633452d9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -107,7 +107,6 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) * only four options will fit in a standard TCP header */ #define TCP_NUM_SACKS 4 -struct tcp_cookie_values; struct tcp_request_sock_ops; struct tcp_request_sock { diff --git a/include/net/tcp.h b/include/net/tcp.h index 18fc999dae3..27b652f379e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -192,10 +192,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_EXP_FASTOPEN_BASE 4 -#define TCPOLEN_COOKIE_BASE 2 /* Cookie-less header extension */ -#define TCPOLEN_COOKIE_PAIR 3 /* Cookie pair header extension */ -#define TCPOLEN_COOKIE_MIN (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MIN) -#define TCPOLEN_COOKIE_MAX (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MAX) /* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED 12 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c27e8139239..ab64eea042f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -410,10 +410,6 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - /* Presumed zeroed, in order of appearance: - * cookie_in_always, cookie_out_never, - * s_data_constant, s_data_in, s_data_out - */ sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; -- cgit v1.2.3-70-g09d2 From ca4c3fc24e293719fe7410c4e63da9b6bc633b83 Mon Sep 17 00:00:00 2001 From: "fan.du" Date: Tue, 30 Jul 2013 08:33:53 +0800 Subject: net: split rt_genid for ipv4 and ipv6 Current net name space has only one genid for both IPv4 and IPv6, it has below drawbacks: - Add/delete an IPv4 address will invalidate all IPv6 routing table entries. - Insert/remove XFRM policy will also invalidate both IPv4/IPv6 routing table entries even when the policy is only applied for one address family. Thus, this patch attempt to split one genid for two to cater for IPv4 and IPv6 separately in a fine granularity. Signed-off-by: Fan Du Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/net_namespace.h | 37 ++++++++++++++++++++++++++++++++----- include/net/netns/ipv4.h | 1 + include/net/netns/ipv6.h | 1 + net/ipv4/route.c | 16 ++++++++-------- net/ipv6/af_inet6.c | 1 + net/ipv6/route.c | 4 ++-- net/xfrm/xfrm_policy.c | 8 +++++++- security/selinux/include/xfrm.h | 7 ++++++- 8 files changed, 58 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 84e37b1ca9e..1313456a099 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -119,7 +119,6 @@ struct net { struct netns_ipvs *ipvs; #endif struct sock *diag_nlsk; - atomic_t rt_genid; atomic_t fnhe_genid; }; @@ -333,14 +332,42 @@ static inline void unregister_net_sysctl_table(struct ctl_table_header *header) } #endif -static inline int rt_genid(struct net *net) +static inline int rt_genid_ipv4(struct net *net) { - return atomic_read(&net->rt_genid); + return atomic_read(&net->ipv4.rt_genid); } -static inline void rt_genid_bump(struct net *net) +static inline void rt_genid_bump_ipv4(struct net *net) { - atomic_inc(&net->rt_genid); + atomic_inc(&net->ipv4.rt_genid); +} + +#if IS_ENABLED(CONFIG_IPV6) +static inline int rt_genid_ipv6(struct net *net) +{ + return atomic_read(&net->ipv6.rt_genid); +} + +static inline void rt_genid_bump_ipv6(struct net *net) +{ + atomic_inc(&net->ipv6.rt_genid); +} +#else +static inline int rt_genid_ipv6(struct net *net) +{ + return 0; +} + +static inline void rt_genid_bump_ipv6(struct net *net) +{ +} +#endif + +/* For callers who don't really care about whether it's IPv4 or IPv6 */ +static inline void rt_genid_bump_all(struct net *net) +{ + rt_genid_bump_ipv4(net); + rt_genid_bump_ipv6(net); } static inline int fnhe_genid(struct net *net) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2ba9de89e8e..bf2ec2202c5 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -77,5 +77,6 @@ struct netns_ipv4 { struct fib_rules_ops *mr_rules_ops; #endif #endif + atomic_t rt_genid; }; #endif diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 005e2c2e39a..0fb2401197c 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -72,6 +72,7 @@ struct netns_ipv6 { #endif #endif atomic_t dev_addr_genid; + atomic_t rt_genid; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a9a54a23683..e805481eff7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -435,12 +435,12 @@ static inline int ip_rt_proc_init(void) static inline bool rt_is_expired(const struct rtable *rth) { - return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); + return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); } void rt_cache_flush(struct net *net) { - rt_genid_bump(net); + rt_genid_bump_ipv4(net); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, @@ -1458,7 +1458,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif rth->dst.output = ip_rt_bug; - rth->rt_genid = rt_genid(dev_net(dev)); + rth->rt_genid = rt_genid_ipv4(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; rth->rt_is_input= 1; @@ -1589,7 +1589,7 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); + rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); rth->rt_flags = flags; rth->rt_type = res->type; rth->rt_is_input = 1; @@ -1760,7 +1760,7 @@ local_input: rth->dst.tclassid = itag; #endif - rth->rt_genid = rt_genid(net); + rth->rt_genid = rt_genid_ipv4(net); rth->rt_flags = flags|RTCF_LOCAL; rth->rt_type = res.type; rth->rt_is_input = 1; @@ -1945,7 +1945,7 @@ add: rth->dst.output = ip_output; - rth->rt_genid = rt_genid(dev_net(dev_out)); + rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); rth->rt_flags = flags; rth->rt_type = type; rth->rt_is_input = 0; @@ -2227,7 +2227,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_iif = ort->rt_iif; rt->rt_pmtu = ort->rt_pmtu; - rt->rt_genid = rt_genid(net); + rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_gateway = ort->rt_gateway; @@ -2665,7 +2665,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { static __net_init int rt_genid_init(struct net *net) { - atomic_set(&net->rt_genid, 0); + atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); get_random_bytes(&net->ipv4.dev_addr_genid, sizeof(net->ipv4.dev_addr_genid)); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a5ac969aeef..0d1a9b153fb 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -766,6 +766,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; + atomic_set(&net->ipv6.rt_genid, 0); err = ipv6_init_mibs(net); if (err) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 74ab1f74abc..ce961630452 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -283,7 +283,7 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); - rt->rt6i_genid = rt_genid(net); + rt->rt6i_genid = rt_genid_ipv6(net); INIT_LIST_HEAD(&rt->rt6i_siblings); } return rt; @@ -1061,7 +1061,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ - if (rt->rt6i_genid != rt_genid(dev_net(rt->dst.dev))) + if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev))) return NULL; if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e52cab3591d..d8da6b8c6ba 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -660,7 +660,13 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_pol_hold(policy); net->xfrm.policy_count[dir]++; atomic_inc(&flow_cache_genid); - rt_genid_bump(net); + + /* After previous checking, family can either be AF_INET or AF_INET6 */ + if (policy->family == AF_INET) + rt_genid_bump_ipv4(net); + else + rt_genid_bump_ipv6(net); + if (delpol) { xfrm_policy_requeue(delpol, policy); __xfrm_policy_unlink(delpol, dir); diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 65f67cb0aef..6713f04e30b 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -50,8 +50,13 @@ int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall); static inline void selinux_xfrm_notify_policyload(void) { + struct net *net; + atomic_inc(&flow_cache_genid); - rt_genid_bump(&init_net); + rtnl_lock(); + for_each_net(net) + rt_genid_bump_all(net); + rtnl_unlock(); } #else static inline int selinux_xfrm_enabled(void) -- cgit v1.2.3-70-g09d2 From f2f872f9272a79a1048877ea14c15576f46c225e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Jul 2013 17:55:08 -0700 Subject: netem: Introduce skb_orphan_partial() helper Commit 547669d483e578 ("tcp: xps: fix reordering issues") added unexpected reorders in case netem is used in a MQ setup for high performance test bed. ETH=eth0 tc qd del dev $ETH root 2>/dev/null tc qd add dev $ETH root handle 1: mq for i in `seq 1 32` do tc qd add dev $ETH parent 1:$i netem delay 100ms done As all tcp packets are orphaned by netem, TCP stack believes it can set skb->ooo_okay on all packets. In order to allow producers to send more packets, we want to keep sk_wmem_alloc from reaching sk_sndbuf limit. We can do that by accounting one byte per skb in netem queues, so that TCP stack is not fooled too much. Tested: With above MQ/netem setup, scaling number of concurrent flows gives linear results and no reorders/retransmits lpq83:~# for n in 1 10 20 30 40 50 60 70 80 90 100 do echo -n "n:$n " ; ./super_netperf $n -H 10.7.7.84; done n:1 198.46 n:10 2002.69 n:20 4000.98 n:30 6006.35 n:40 8020.93 n:50 10032.3 n:60 12081.9 n:70 13971.3 n:80 16009.7 n:90 17117.3 n:100 17425.5 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 19 +++++++++++++++++++ net/sched/sch_netem.c | 5 +---- 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index b9f2b095b1a..53d4714709a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1520,6 +1520,7 @@ extern struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); extern void sock_wfree(struct sk_buff *skb); +extern void skb_orphan_partial(struct sk_buff *skb); extern void sock_rfree(struct sk_buff *skb); extern void sock_edemux(struct sk_buff *skb); diff --git a/net/core/sock.c b/net/core/sock.c index 85e8de1bc7f..a753d97434d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1576,6 +1576,25 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +void skb_orphan_partial(struct sk_buff *skb) +{ + /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, + * so we do not completely orphan skb, but transfert all + * accounted bytes but one, to avoid unexpected reorders. + */ + if (skb->destructor == sock_wfree +#ifdef CONFIG_INET + || skb->destructor == tcp_wfree +#endif + ) { + atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); + skb->truesize = 1; + } else { + skb_orphan(skb); + } +} +EXPORT_SYMBOL(skb_orphan_partial); + /* * Read buffer destructor automatically called from kfree_skb. */ diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 82f6016d89a..a6d788d4521 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -412,12 +412,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* If a delay is expected, orphan the skb. (orphaning usually takes * place at TX completion time, so _before_ the link transit delay) - * Ideally, this orphaning should be done after the rate limiting - * module, because this breaks TCP Small Queue, and other mechanisms - * based on socket sk_wmem_alloc. */ if (q->latency || q->jitter) - skb_orphan(skb); + skb_orphan_partial(skb); /* * If we need to duplicate packet, then re-insert at top of the -- cgit v1.2.3-70-g09d2 From c34a761231b56dea4bd205cb9c29ffe37475d232 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Jul 2013 16:11:15 -0700 Subject: net: skb_orphan() changes It is illegal to set skb->sk without corresponding destructor. Its therefore safe for skb_orphan() to not clear skb->sk if skb->destructor is not set. Also avoid clearing skb->destructor if already NULL. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5afefa01a13..a95547adff2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1805,10 +1805,11 @@ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) */ static inline void skb_orphan(struct sk_buff *skb) { - if (skb->destructor) + if (skb->destructor) { skb->destructor(skb); - skb->destructor = NULL; - skb->sk = NULL; + skb->destructor = NULL; + skb->sk = NULL; + } } /** -- cgit v1.2.3-70-g09d2 From 5c15257f93234aaa9775291a041b49eeb38fd95a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 30 Jul 2013 22:47:13 -0700 Subject: net: Remove extern from include/net/ scheduling prototypes There are a mix of function prototypes with and without extern in the kernel sources. Standardize on not using extern for function prototypes. Function prototypes don't need to be written with extern. extern is assumed by the compiler. Its use is as unnecessary as using auto to declare automatic/local variables in a block. Reflow modified prototypes to 80 columns. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/act_api.h | 60 +++++++++++++++++++++++------------------------ include/net/pkt_cls.h | 42 ++++++++++++++++----------------- include/net/pkt_sched.h | 50 +++++++++++++++++++-------------------- include/net/sch_generic.h | 53 +++++++++++++++++++++-------------------- 4 files changed, 104 insertions(+), 101 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index b8ffac7b6ba..9e90fdff470 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -82,36 +82,36 @@ struct tc_action_ops { int (*walk)(struct sk_buff *, struct netlink_callback *, int, struct tc_action *); }; -extern struct tcf_common *tcf_hash_lookup(u32 index, - struct tcf_hashinfo *hinfo); -extern void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo); -extern int tcf_hash_release(struct tcf_common *p, int bind, - struct tcf_hashinfo *hinfo); -extern int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, - int type, struct tc_action *a); -extern u32 tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo); -extern int tcf_hash_search(struct tc_action *a, u32 index); -extern struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, - int bind, struct tcf_hashinfo *hinfo); -extern struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est, - struct tc_action *a, int size, - int bind, u32 *idx_gen, - struct tcf_hashinfo *hinfo); -extern void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo); +struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo); +void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo); +int tcf_hash_release(struct tcf_common *p, int bind, + struct tcf_hashinfo *hinfo); +int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, + int type, struct tc_action *a); +u32 tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo); +int tcf_hash_search(struct tc_action *a, u32 index); +struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, + int bind, struct tcf_hashinfo *hinfo); +struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est, + struct tc_action *a, int size, + int bind, u32 *idx_gen, + struct tcf_hashinfo *hinfo); +void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo); -extern int tcf_register_action(struct tc_action_ops *a); -extern int tcf_unregister_action(struct tc_action_ops *a); -extern void tcf_action_destroy(struct tc_action *a, int bind); -extern int tcf_action_exec(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res); -extern struct tc_action *tcf_action_init(struct net *net, struct nlattr *nla, - struct nlattr *est, char *n, int ovr, - int bind); -extern struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, - struct nlattr *est, char *n, int ovr, - int bind); -extern int tcf_action_dump(struct sk_buff *skb, struct tc_action *a, int, int); -extern int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); -extern int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); -extern int tcf_action_copy_stats (struct sk_buff *,struct tc_action *, int); +int tcf_register_action(struct tc_action_ops *a); +int tcf_unregister_action(struct tc_action_ops *a); +void tcf_action_destroy(struct tc_action *a, int bind); +int tcf_action_exec(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res); +struct tc_action *tcf_action_init(struct net *net, struct nlattr *nla, + struct nlattr *est, char *n, int ovr, + int bind); +struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, + struct nlattr *est, char *n, int ovr, + int bind); +int tcf_action_dump(struct sk_buff *skb, struct tc_action *a, int, int); +int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); +int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); +int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); #endif /* CONFIG_NET_CLS_ACT */ #endif diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 13174509cdf..2ebef77a2f9 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -14,8 +14,8 @@ struct tcf_walker { int (*fn)(struct tcf_proto *, unsigned long node, struct tcf_walker *); }; -extern int register_tcf_proto_ops(struct tcf_proto_ops *ops); -extern int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); +int register_tcf_proto_ops(struct tcf_proto_ops *ops); +int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); static inline unsigned long __cls_set_class(unsigned long *clp, unsigned long cl) @@ -126,17 +126,17 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, return 0; } -extern int tcf_exts_validate(struct net *net, struct tcf_proto *tp, - struct nlattr **tb, struct nlattr *rate_tlv, - struct tcf_exts *exts, - const struct tcf_ext_map *map); -extern void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts); -extern void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, - struct tcf_exts *src); -extern int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, - const struct tcf_ext_map *map); -extern int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, - const struct tcf_ext_map *map); +int tcf_exts_validate(struct net *net, struct tcf_proto *tp, + struct nlattr **tb, struct nlattr *rate_tlv, + struct tcf_exts *exts, + const struct tcf_ext_map *map); +void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts); +void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, + struct tcf_exts *src); +int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, + const struct tcf_ext_map *map); +int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, + const struct tcf_ext_map *map); /** * struct tcf_pkt_info - packet information @@ -239,14 +239,14 @@ struct tcf_ematch_ops { struct list_head link; }; -extern int tcf_em_register(struct tcf_ematch_ops *); -extern void tcf_em_unregister(struct tcf_ematch_ops *); -extern int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *, - struct tcf_ematch_tree *); -extern void tcf_em_tree_destroy(struct tcf_proto *, struct tcf_ematch_tree *); -extern int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); -extern int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, - struct tcf_pkt_info *); +int tcf_em_register(struct tcf_ematch_ops *); +void tcf_em_unregister(struct tcf_ematch_ops *); +int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *, + struct tcf_ematch_tree *); +void tcf_em_tree_destroy(struct tcf_proto *, struct tcf_ematch_tree *); +int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); +int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, + struct tcf_pkt_info *); /** * tcf_em_tree_change - replace ematch tree of a running classifier diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 388bf8b6d06..f7c24f8fbdc 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -64,8 +64,8 @@ struct qdisc_watchdog { struct Qdisc *qdisc; }; -extern void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); -extern void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) @@ -73,31 +73,31 @@ static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires)); } -extern void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); +void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); extern struct Qdisc_ops pfifo_qdisc_ops; extern struct Qdisc_ops bfifo_qdisc_ops; extern struct Qdisc_ops pfifo_head_drop_qdisc_ops; -extern int fifo_set_limit(struct Qdisc *q, unsigned int limit); -extern struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, - unsigned int limit); - -extern int register_qdisc(struct Qdisc_ops *qops); -extern int unregister_qdisc(struct Qdisc_ops *qops); -extern void qdisc_list_del(struct Qdisc *q); -extern struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); -extern struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); -extern struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, - struct nlattr *tab); -extern void qdisc_put_rtab(struct qdisc_rate_table *tab); -extern void qdisc_put_stab(struct qdisc_size_table *tab); -extern void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc); -extern int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, - struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock); - -extern void __qdisc_run(struct Qdisc *q); +int fifo_set_limit(struct Qdisc *q, unsigned int limit); +struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, + unsigned int limit); + +int register_qdisc(struct Qdisc_ops *qops); +int unregister_qdisc(struct Qdisc_ops *qops); +void qdisc_list_del(struct Qdisc *q); +struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); +struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, + struct nlattr *tab); +void qdisc_put_rtab(struct qdisc_rate_table *tab); +void qdisc_put_stab(struct qdisc_size_table *tab); +void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc); +int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, struct netdev_queue *txq, + spinlock_t *root_lock); + +void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { @@ -105,10 +105,10 @@ static inline void qdisc_run(struct Qdisc *q) __qdisc_run(q); } -extern int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res); -extern int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, +int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res); +int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res); /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 6eab63363e5..ffc9d883b16 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -350,30 +350,32 @@ qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) return NULL; } -extern int qdisc_class_hash_init(struct Qdisc_class_hash *); -extern void qdisc_class_hash_insert(struct Qdisc_class_hash *, struct Qdisc_class_common *); -extern void qdisc_class_hash_remove(struct Qdisc_class_hash *, struct Qdisc_class_common *); -extern void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *); -extern void qdisc_class_hash_destroy(struct Qdisc_class_hash *); - -extern void dev_init_scheduler(struct net_device *dev); -extern void dev_shutdown(struct net_device *dev); -extern void dev_activate(struct net_device *dev); -extern void dev_deactivate(struct net_device *dev); -extern void dev_deactivate_many(struct list_head *head); -extern struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, - struct Qdisc *qdisc); -extern void qdisc_reset(struct Qdisc *qdisc); -extern void qdisc_destroy(struct Qdisc *qdisc); -extern void qdisc_tree_decrease_qlen(struct Qdisc *qdisc, unsigned int n); -extern struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops); -extern struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops, u32 parentid); -extern void __qdisc_calculate_pkt_len(struct sk_buff *skb, - const struct qdisc_size_table *stab); -extern void tcf_destroy(struct tcf_proto *tp); -extern void tcf_destroy_chain(struct tcf_proto **fl); +int qdisc_class_hash_init(struct Qdisc_class_hash *); +void qdisc_class_hash_insert(struct Qdisc_class_hash *, + struct Qdisc_class_common *); +void qdisc_class_hash_remove(struct Qdisc_class_hash *, + struct Qdisc_class_common *); +void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *); +void qdisc_class_hash_destroy(struct Qdisc_class_hash *); + +void dev_init_scheduler(struct net_device *dev); +void dev_shutdown(struct net_device *dev); +void dev_activate(struct net_device *dev); +void dev_deactivate(struct net_device *dev); +void dev_deactivate_many(struct list_head *head); +struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, + struct Qdisc *qdisc); +void qdisc_reset(struct Qdisc *qdisc); +void qdisc_destroy(struct Qdisc *qdisc); +void qdisc_tree_decrease_qlen(struct Qdisc *qdisc, unsigned int n); +struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, + struct Qdisc_ops *ops); +struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, + struct Qdisc_ops *ops, u32 parentid); +void __qdisc_calculate_pkt_len(struct sk_buff *skb, + const struct qdisc_size_table *stab); +void tcf_destroy(struct tcf_proto *tp); +void tcf_destroy_chain(struct tcf_proto **fl); /* Reset all TX qdiscs greater then index of a device. */ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) @@ -692,7 +694,8 @@ static inline u64 psched_l2t_ns(const struct psched_ratecfg *r, return ((u64)(len + r->overhead) * r->mult) >> r->shift; } -extern void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf); +void psched_ratecfg_precompute(struct psched_ratecfg *r, + const struct tc_ratespec *conf); static inline void psched_ratecfg_getrate(struct tc_ratespec *res, const struct psched_ratecfg *r) -- cgit v1.2.3-70-g09d2 From 7764a45a8f1fe74d4f7d301eaca2e558e7e2831a Mon Sep 17 00:00:00 2001 From: Stefan Tomanek Date: Thu, 1 Aug 2013 02:17:15 +0200 Subject: fib_rules: add .suppress operation This change adds a new operation to the fib_rules_ops struct; it allows the suppression of routing decisions if certain criteria are not met by its results. The first implemented constraint is a minimum prefix length added to the structures of routing rules. If a rule is added with a minimum prefix length >0, only routes meeting this threshold will be considered. Any other (more general) routing table entries will be ignored. When configuring a system with multiple network uplinks and default routes, it is often convinient to reference the main routing table multiple times - but omitting the default route. Using this patch and a modified "ip" utility, this can be achieved by using the following command sequence: $ ip route add table secuplink default via 10.42.23.1 $ ip rule add pref 100 table main prefixlength 1 $ ip rule add pref 150 fwmark 0xA table secuplink With this setup, packets marked 0xA will be processed by the additional routing table "secuplink", but only if no suitable route in the main routing table can be found. By using a minimal prefixlength of 1, the default route (/0) of the table "main" is hidden to packets processed by rule 100; packets traveling to destinations with more specific routing entries are processed as usual. Signed-off-by: Stefan Tomanek Signed-off-by: David S. Miller --- include/net/fib_rules.h | 4 ++++ include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 8 ++++++++ net/ipv4/fib_rules.c | 14 ++++++++++++++ net/ipv6/fib6_rules.c | 13 +++++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index e361f488242..2f286dce925 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -18,6 +18,7 @@ struct fib_rule { u32 pref; u32 flags; u32 table; + u8 table_prefixlen_min; u8 action; u32 target; struct fib_rule __rcu *ctarget; @@ -46,6 +47,8 @@ struct fib_rules_ops { int (*action)(struct fib_rule *, struct flowi *, int, struct fib_lookup_arg *); + bool (*suppress)(struct fib_rule *, + struct fib_lookup_arg *); int (*match)(struct fib_rule *, struct flowi *, int); int (*configure)(struct fib_rule *, @@ -80,6 +83,7 @@ struct fib_rules_ops { [FRA_FWMARK] = { .type = NLA_U32 }, \ [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ + [FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \ [FRA_GOTO] = { .type = NLA_U32 } static inline void fib_rule_get(struct fib_rule *rule) diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 51da65b68b8..59cd31b3455 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -45,7 +45,7 @@ enum { FRA_FLOW, /* flow/class id */ FRA_UNUSED6, FRA_UNUSED7, - FRA_UNUSED8, + FRA_TABLE_PREFIXLEN_MIN, FRA_TABLE, /* Extended table id */ FRA_FWMASK, /* mask for netfilter mark */ FRA_OIFNAME, diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 21735440c44..2ef5040c99c 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -226,6 +226,9 @@ jumped: else err = ops->action(rule, fl, flags, arg); + if (!err && ops->suppress && ops->suppress(rule, arg)) + continue; + if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(atomic_inc_not_zero(&rule->refcnt))) { @@ -337,6 +340,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); + if (tb[FRA_TABLE_PREFIXLEN_MIN]) + rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]); if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); @@ -523,6 +528,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4); /* FRA_FWMASK */ @@ -548,6 +554,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->table = rule->table; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; + if (nla_put_u8(skb, FRA_TABLE_PREFIXLEN_MIN, rule->table_prefixlen_min)) + goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 26aa65d1fce..9f2906679d1 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -101,6 +101,19 @@ errout: return err; } +static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +{ + /* do not accept result if the route does + * not meet the required prefix length + */ + struct fib_result *result = (struct fib_result *) arg->result; + if (result->prefixlen < rule->table_prefixlen_min) { + if (!(arg->flags & FIB_LOOKUP_NOREF)) + fib_info_put(result->fi); + return true; + } + return false; +} static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { @@ -267,6 +280,7 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), .action = fib4_rule_action, + .suppress = fib4_rule_suppress, .match = fib4_rule_match, .configure = fib4_rule_configure, .delete = fib4_rule_delete, diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 2e1a432867c..e64e6a55fc4 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -111,6 +111,18 @@ out: return rt == NULL ? -EAGAIN : 0; } +static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +{ + struct rt6_info *rt = (struct rt6_info *) arg->result; + /* do not accept result if the route does + * not meet the required prefix length + */ + if (rt->rt6i_dst.plen < rule->table_prefixlen_min) { + ip6_rt_put(rt); + return true; + } + return false; +} static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { @@ -244,6 +256,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .addr_size = sizeof(struct in6_addr), .action = fib6_rule_action, .match = fib6_rule_match, + .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, .compare = fib6_rule_compare, .fill = fib6_rule_fill, -- cgit v1.2.3-70-g09d2 From e8e54d3c13b35479294f3dceb6bd6ed7a7dc5bf8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 17:31:32 -0700 Subject: addrconf.h: Remove extern function prototypes There are a mix of function prototypes with and without extern in the kernel sources. Standardize on not using extern for function prototypes. Function prototypes don't need to be written with extern. extern is assumed by the compiler. Its use is as unnecessary as using auto to declare automatic/local variables in a block. Reflow modified prototypes to 80 columns. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/addrconf.h | 160 ++++++++++++++++++++++--------------------------- 1 file changed, 72 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index c7b181cb47a..43fa31a610b 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -53,51 +53,36 @@ struct prefix_info { #define IN6_ADDR_HSIZE_SHIFT 4 #define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) -extern int addrconf_init(void); -extern void addrconf_cleanup(void); +int addrconf_init(void); +void addrconf_cleanup(void); -extern int addrconf_add_ifaddr(struct net *net, - void __user *arg); -extern int addrconf_del_ifaddr(struct net *net, - void __user *arg); -extern int addrconf_set_dstaddr(struct net *net, - void __user *arg); +int addrconf_add_ifaddr(struct net *net, void __user *arg); +int addrconf_del_ifaddr(struct net *net, void __user *arg); +int addrconf_set_dstaddr(struct net *net, void __user *arg); -extern int ipv6_chk_addr(struct net *net, - const struct in6_addr *addr, - const struct net_device *dev, - int strict); +int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, + const struct net_device *dev, int strict); #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) -extern int ipv6_chk_home_addr(struct net *net, - const struct in6_addr *addr); +int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr); #endif -extern int ipv6_chk_prefix(const struct in6_addr *addr, - struct net_device *dev); - -extern struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, - const struct in6_addr *addr, - struct net_device *dev, - int strict); - -extern int ipv6_dev_get_saddr(struct net *net, - const struct net_device *dev, - const struct in6_addr *daddr, - unsigned int srcprefs, - struct in6_addr *saddr); -extern int __ipv6_get_lladdr(struct inet6_dev *idev, - struct in6_addr *addr, - unsigned char banned_flags); -extern int ipv6_get_lladdr(struct net_device *dev, - struct in6_addr *addr, - unsigned char banned_flags); -extern int ipv6_rcv_saddr_equal(const struct sock *sk, - const struct sock *sk2); -extern void addrconf_join_solict(struct net_device *dev, - const struct in6_addr *addr); -extern void addrconf_leave_solict(struct inet6_dev *idev, - const struct in6_addr *addr); +int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev); + +struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, + const struct in6_addr *addr, + struct net_device *dev, int strict); + +int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, + const struct in6_addr *daddr, unsigned int srcprefs, + struct in6_addr *saddr); +int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, + unsigned char banned_flags); +int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, + unsigned char banned_flags); +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2); +void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); +void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); static inline unsigned long addrconf_timeout_fixup(u32 timeout, unsigned int unit) @@ -124,41 +109,38 @@ static inline int addrconf_finite_timeout(unsigned long timeout) /* * IPv6 Address Label subsystem (addrlabel.c) */ -extern int ipv6_addr_label_init(void); -extern void ipv6_addr_label_cleanup(void); -extern void ipv6_addr_label_rtnl_register(void); -extern u32 ipv6_addr_label(struct net *net, - const struct in6_addr *addr, - int type, int ifindex); +int ipv6_addr_label_init(void); +void ipv6_addr_label_cleanup(void); +void ipv6_addr_label_rtnl_register(void); +u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, + int type, int ifindex); /* * multicast prototypes (mcast.c) */ -extern int ipv6_sock_mc_join(struct sock *sk, int ifindex, - const struct in6_addr *addr); -extern int ipv6_sock_mc_drop(struct sock *sk, int ifindex, - const struct in6_addr *addr); -extern void ipv6_sock_mc_close(struct sock *sk); -extern bool inet6_mc_check(struct sock *sk, - const struct in6_addr *mc_addr, - const struct in6_addr *src_addr); - -extern int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr); -extern int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr); -extern int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr); -extern void ipv6_mc_up(struct inet6_dev *idev); -extern void ipv6_mc_down(struct inet6_dev *idev); -extern void ipv6_mc_unmap(struct inet6_dev *idev); -extern void ipv6_mc_remap(struct inet6_dev *idev); -extern void ipv6_mc_init_dev(struct inet6_dev *idev); -extern void ipv6_mc_destroy_dev(struct inet6_dev *idev); -extern void addrconf_dad_failure(struct inet6_ifaddr *ifp); - -extern bool ipv6_chk_mcast_addr(struct net_device *dev, - const struct in6_addr *group, - const struct in6_addr *src_addr); - -extern void ipv6_mc_dad_complete(struct inet6_dev *idev); +int ipv6_sock_mc_join(struct sock *sk, int ifindex, + const struct in6_addr *addr); +int ipv6_sock_mc_drop(struct sock *sk, int ifindex, + const struct in6_addr *addr); +void ipv6_sock_mc_close(struct sock *sk); +bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, + const struct in6_addr *src_addr); + +int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr); +int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr); +int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr); +void ipv6_mc_up(struct inet6_dev *idev); +void ipv6_mc_down(struct inet6_dev *idev); +void ipv6_mc_unmap(struct inet6_dev *idev); +void ipv6_mc_remap(struct inet6_dev *idev); +void ipv6_mc_init_dev(struct inet6_dev *idev); +void ipv6_mc_destroy_dev(struct inet6_dev *idev); +void addrconf_dad_failure(struct inet6_ifaddr *ifp); + +bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, + const struct in6_addr *src_addr); + +void ipv6_mc_dad_complete(struct inet6_dev *idev); /* * identify MLD packets for MLD filter exceptions */ @@ -184,29 +166,31 @@ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset) return false; } -extern void addrconf_prefix_rcv(struct net_device *dev, - u8 *opt, int len, bool sllao); +void addrconf_prefix_rcv(struct net_device *dev, + u8 *opt, int len, bool sllao); /* * anycast prototypes (anycast.c) */ -extern int ipv6_sock_ac_join(struct sock *sk,int ifindex, const struct in6_addr *addr); -extern int ipv6_sock_ac_drop(struct sock *sk,int ifindex, const struct in6_addr *addr); -extern void ipv6_sock_ac_close(struct sock *sk); - -extern int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr); -extern int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr); -extern bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, +int ipv6_sock_ac_join(struct sock *sk, int ifindex, + const struct in6_addr *addr); +int ipv6_sock_ac_drop(struct sock *sk, int ifindex, + const struct in6_addr *addr); +void ipv6_sock_ac_close(struct sock *sk); + +int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr); +int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr); +bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr); /* Device notifier */ -extern int register_inet6addr_notifier(struct notifier_block *nb); -extern int unregister_inet6addr_notifier(struct notifier_block *nb); -extern int inet6addr_notifier_call_chain(unsigned long val, void *v); +int register_inet6addr_notifier(struct notifier_block *nb); +int unregister_inet6addr_notifier(struct notifier_block *nb); +int inet6addr_notifier_call_chain(unsigned long val, void *v); -extern void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, - struct ipv6_devconf *devconf); +void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, + struct ipv6_devconf *devconf); /** * __in6_dev_get - get inet6_dev pointer from netdevice @@ -240,7 +224,7 @@ static inline struct inet6_dev *in6_dev_get(const struct net_device *dev) return idev; } -extern void in6_dev_finish_destroy(struct inet6_dev *idev); +void in6_dev_finish_destroy(struct inet6_dev *idev); static inline void in6_dev_put(struct inet6_dev *idev) { @@ -258,7 +242,7 @@ static inline void in6_dev_hold(struct inet6_dev *idev) atomic_inc(&idev->refcnt); } -extern void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); +void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); static inline void in6_ifa_put(struct inet6_ifaddr *ifp) { @@ -340,8 +324,8 @@ static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) } #ifdef CONFIG_PROC_FS -extern int if6_proc_init(void); -extern void if6_proc_exit(void); +int if6_proc_init(void); +void if6_proc_exit(void); #endif #endif -- cgit v1.2.3-70-g09d2 From b60a8280ba2a34351dd8b98664df3785c27528dd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 17:31:33 -0700 Subject: af_unix.h: Remove extern from function prototypes There are a mix of function prototypes with and without extern in the kernel sources. Standardize on not using extern for function prototypes. Function prototypes don't need to be written with extern. extern is assumed by the compiler. Its use is as unnecessary as using auto to declare automatic/local variables in a block. Reflow modified prototypes to 80 columns. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/af_unix.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index dbdfd2b0f3b..05442df4ca8 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -6,12 +6,12 @@ #include #include -extern void unix_inflight(struct file *fp); -extern void unix_notinflight(struct file *fp); -extern void unix_gc(void); -extern void wait_for_unix_gc(void); -extern struct sock *unix_get_socket(struct file *filp); -extern struct sock *unix_peer_get(struct sock *); +void unix_inflight(struct file *fp); +void unix_notinflight(struct file *fp); +void unix_gc(void); +void wait_for_unix_gc(void); +struct sock *unix_get_socket(struct file *filp); +struct sock *unix_peer_get(struct sock *); #define UNIX_HASH_SIZE 256 #define UNIX_HASH_BITS 8 @@ -71,8 +71,8 @@ long unix_inq_len(struct sock *sk); long unix_outq_len(struct sock *sk); #ifdef CONFIG_SYSCTL -extern int unix_sysctl_register(struct net *net); -extern void unix_sysctl_unregister(struct net *net); +int unix_sysctl_register(struct net *net); +void unix_sysctl_unregister(struct net *net); #else static inline int unix_sysctl_register(struct net *net) { return 0; } static inline void unix_sysctl_unregister(struct net *net) {} -- cgit v1.2.3-70-g09d2 From cd2cf63a567fb45be445ccc0aed8e551d86f0314 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 17:31:34 -0700 Subject: af_rxrpc.h: Remove extern from function prototypes There are a mix of function prototypes with and without extern in the kernel sources. Standardize on not using extern for function prototypes. Function prototypes don't need to be written with extern. extern is assumed by the compiler. Its use is as unnecessary as using auto to declare automatic/local variables in a block. Reflow modified prototypes to 80 columns. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/af_rxrpc.h | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 03e6e945362..e797d45a5ae 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -31,24 +31,21 @@ enum { typedef void (*rxrpc_interceptor_t)(struct sock *, unsigned long, struct sk_buff *); -extern void rxrpc_kernel_intercept_rx_messages(struct socket *, - rxrpc_interceptor_t); -extern struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, - struct sockaddr_rxrpc *, - struct key *, - unsigned long, - gfp_t); -extern int rxrpc_kernel_send_data(struct rxrpc_call *, struct msghdr *, - size_t); -extern void rxrpc_kernel_abort_call(struct rxrpc_call *, u32); -extern void rxrpc_kernel_end_call(struct rxrpc_call *); -extern bool rxrpc_kernel_is_data_last(struct sk_buff *); -extern u32 rxrpc_kernel_get_abort_code(struct sk_buff *); -extern int rxrpc_kernel_get_error_number(struct sk_buff *); -extern void rxrpc_kernel_data_delivered(struct sk_buff *); -extern void rxrpc_kernel_free_skb(struct sk_buff *); -extern struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, - unsigned long); -extern int rxrpc_kernel_reject_call(struct socket *); +void rxrpc_kernel_intercept_rx_messages(struct socket *, rxrpc_interceptor_t); +struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, + struct sockaddr_rxrpc *, + struct key *, + unsigned long, + gfp_t); +int rxrpc_kernel_send_data(struct rxrpc_call *, struct msghdr *, size_t); +void rxrpc_kernel_abort_call(struct rxrpc_call *, u32); +void rxrpc_kernel_end_call(struct rxrpc_call *); +bool rxrpc_kernel_is_data_last(struct sk_buff *); +u32 rxrpc_kernel_get_abort_code(struct sk_buff *); +int rxrpc_kernel_get_error_number(struct sk_buff *); +void rxrpc_kernel_data_delivered(struct sk_buff *); +void rxrpc_kernel_free_skb(struct sk_buff *); +struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long); +int rxrpc_kernel_reject_call(struct socket *); #endif /* _NET_RXRPC_H */ -- cgit v1.2.3-70-g09d2 From 90972b22116bc22181717517ad5b0c7043edb178 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 17:31:35 -0700 Subject: arp/neighbour.h: Remove extern from function prototypes There are a mix of function prototypes with and without extern in the kernel sources. Standardize on not using extern for function prototypes. Function prototypes don't need to be written with extern. extern is assumed by the compiler. Its use is as unnecessary as using auto to declare automatic/local variables in a block. Reflow modified prototypes to 80 columns. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/arp.h | 30 +++++++-------- include/net/neighbour.h | 98 ++++++++++++++++++++++++------------------------- 2 files changed, 63 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/include/net/arp.h b/include/net/arp.h index b630dae0341..7509d9da4e3 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -46,22 +46,22 @@ static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 return n; } -extern void arp_init(void); -extern int arp_find(unsigned char *haddr, struct sk_buff *skb); -extern int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); -extern void arp_send(int type, int ptype, __be32 dest_ip, - struct net_device *dev, __be32 src_ip, - const unsigned char *dest_hw, - const unsigned char *src_hw, const unsigned char *th); -extern int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); -extern void arp_ifdown(struct net_device *dev); +void arp_init(void); +int arp_find(unsigned char *haddr, struct sk_buff *skb); +int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); +void arp_send(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, + const unsigned char *src_hw, const unsigned char *th); +int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); +void arp_ifdown(struct net_device *dev); -extern struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, - struct net_device *dev, __be32 src_ip, - const unsigned char *dest_hw, - const unsigned char *src_hw, - const unsigned char *target_hw); -extern void arp_xmit(struct sk_buff *skb); +struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, + const unsigned char *src_hw, + const unsigned char *target_hw); +void arp_xmit(struct sk_buff *skb); int arp_invalidate(struct net_device *dev, __be32 ip); #endif /* _ARP_H */ diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7e748ad8b50..536501a3e58 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -195,68 +195,67 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 -extern void neigh_table_init(struct neigh_table *tbl); -extern int neigh_table_clear(struct neigh_table *tbl); -extern struct neighbour * neigh_lookup(struct neigh_table *tbl, - const void *pkey, - struct net_device *dev); -extern struct neighbour * neigh_lookup_nodev(struct neigh_table *tbl, - struct net *net, - const void *pkey); -extern struct neighbour * __neigh_create(struct neigh_table *tbl, - const void *pkey, - struct net_device *dev, - bool want_ref); +void neigh_table_init(struct neigh_table *tbl); +int neigh_table_clear(struct neigh_table *tbl); +struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, + struct net_device *dev); +struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, + const void *pkey); +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref); static inline struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return __neigh_create(tbl, pkey, dev, true); } -extern void neigh_destroy(struct neighbour *neigh); -extern int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); -extern int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - u32 flags); -extern void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); -extern int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); -extern int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb); -extern int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb); -extern int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb); -extern int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb); -extern struct neighbour *neigh_event_ns(struct neigh_table *tbl, +void neigh_destroy(struct neighbour *neigh); +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags); +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); +int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb); +int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb); +int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb); +int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb); +struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev); -extern struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl); -extern void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms); +struct neigh_parms *neigh_parms_alloc(struct net_device *dev, + struct neigh_table *tbl); +void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms); static inline -struct net *neigh_parms_net(const struct neigh_parms *parms) +struct net *neigh_parms_net(const struct neigh_parms *parms) { return read_pnet(&parms->net); } -extern unsigned long neigh_rand_reach_time(unsigned long base); +unsigned long neigh_rand_reach_time(unsigned long base); -extern void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, - struct sk_buff *skb); -extern struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev, int creat); -extern struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, - struct net *net, - const void *key, - struct net_device *dev); -extern int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); +void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, + struct sk_buff *skb); +struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, + const void *key, struct net_device *dev, + int creat); +struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, + const void *key, struct net_device *dev); +int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, + struct net_device *dev); -static inline -struct net *pneigh_net(const struct pneigh_entry *pneigh) +static inline struct net *pneigh_net(const struct pneigh_entry *pneigh) { return read_pnet(&pneigh->net); } -extern void neigh_app_ns(struct neighbour *n); -extern void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie); -extern void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)); -extern void pneigh_for_each(struct neigh_table *tbl, void (*cb)(struct pneigh_entry *)); +void neigh_app_ns(struct neighbour *n); +void neigh_for_each(struct neigh_table *tbl, + void (*cb)(struct neighbour *, void *), void *cookie); +void __neigh_for_each_release(struct neigh_table *tbl, + int (*cb)(struct neighbour *)); +void pneigh_for_each(struct neigh_table *tbl, + void (*cb)(struct pneigh_entry *)); struct neigh_seq_state { struct seq_net_private p; @@ -270,15 +269,14 @@ struct neigh_seq_state { #define NEIGH_SEQ_IS_PNEIGH 0x00000002 #define NEIGH_SEQ_SKIP_NOARP 0x00000004 }; -extern void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, unsigned int); -extern void *neigh_seq_next(struct seq_file *, void *, loff_t *); -extern void neigh_seq_stop(struct seq_file *, void *); - -extern int neigh_sysctl_register(struct net_device *dev, - struct neigh_parms *p, - char *p_name, - proc_handler *proc_handler); -extern void neigh_sysctl_unregister(struct neigh_parms *p); +void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, + unsigned int); +void *neigh_seq_next(struct seq_file *, void *, loff_t *); +void neigh_seq_stop(struct seq_file *, void *); + +int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, + char *p_name, proc_handler *proc_handler); +void neigh_sysctl_unregister(struct neigh_parms *p); static inline void __neigh_parms_put(struct neigh_parms *parms) { -- cgit v1.2.3-70-g09d2 From c1d8f8041cd0357643d5d051d1425a17b75e9d2c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 17:31:36 -0700 Subject: ax25.h: Remove extern from function prototypes There are a mix of function prototypes with and without extern in the kernel sources. Standardize on not using extern for function prototypes. Function prototypes don't need to be written with extern. extern is assumed by the compiler. Its use is as unnecessary as using auto to declare automatic/local variables in a block. Reflow modified prototypes to 80 columns. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/ax25.h | 215 +++++++++++++++++++++++++++-------------------------- 1 file changed, 109 insertions(+), 106 deletions(-) (limited to 'include') diff --git a/include/net/ax25.h b/include/net/ax25.h index 89ed9ac5701..bf0396e9a5d 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -195,7 +195,7 @@ static inline void ax25_hold_route(ax25_route *ax25_rt) atomic_inc(&ax25_rt->refcount); } -extern void __ax25_put_route(ax25_route *ax25_rt); +void __ax25_put_route(ax25_route *ax25_rt); static inline void ax25_put_route(ax25_route *ax25_rt) { @@ -272,30 +272,31 @@ static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev /* af_ax25.c */ extern struct hlist_head ax25_list; extern spinlock_t ax25_list_lock; -extern void ax25_cb_add(ax25_cb *); +void ax25_cb_add(ax25_cb *); struct sock *ax25_find_listener(ax25_address *, int, struct net_device *, int); struct sock *ax25_get_socket(ax25_address *, ax25_address *, int); -extern ax25_cb *ax25_find_cb(ax25_address *, ax25_address *, ax25_digi *, struct net_device *); -extern void ax25_send_to_raw(ax25_address *, struct sk_buff *, int); -extern void ax25_destroy_socket(ax25_cb *); -extern ax25_cb * __must_check ax25_create_cb(void); -extern void ax25_fillin_cb(ax25_cb *, ax25_dev *); -extern struct sock *ax25_make_new(struct sock *, struct ax25_dev *); +ax25_cb *ax25_find_cb(ax25_address *, ax25_address *, ax25_digi *, + struct net_device *); +void ax25_send_to_raw(ax25_address *, struct sk_buff *, int); +void ax25_destroy_socket(ax25_cb *); +ax25_cb * __must_check ax25_create_cb(void); +void ax25_fillin_cb(ax25_cb *, ax25_dev *); +struct sock *ax25_make_new(struct sock *, struct ax25_dev *); /* ax25_addr.c */ extern const ax25_address ax25_bcast; extern const ax25_address ax25_defaddr; extern const ax25_address null_ax25_address; -extern char *ax2asc(char *buf, const ax25_address *); -extern void asc2ax(ax25_address *addr, const char *callsign); -extern int ax25cmp(const ax25_address *, const ax25_address *); -extern int ax25digicmp(const ax25_digi *, const ax25_digi *); -extern const unsigned char *ax25_addr_parse(const unsigned char *, int, +char *ax2asc(char *buf, const ax25_address *); +void asc2ax(ax25_address *addr, const char *callsign); +int ax25cmp(const ax25_address *, const ax25_address *); +int ax25digicmp(const ax25_digi *, const ax25_digi *); +const unsigned char *ax25_addr_parse(const unsigned char *, int, ax25_address *, ax25_address *, ax25_digi *, int *, int *); -extern int ax25_addr_build(unsigned char *, const ax25_address *, - const ax25_address *, const ax25_digi *, int, int); -extern int ax25_addr_size(const ax25_digi *); -extern void ax25_digi_invert(const ax25_digi *, ax25_digi *); +int ax25_addr_build(unsigned char *, const ax25_address *, + const ax25_address *, const ax25_digi *, int, int); +int ax25_addr_size(const ax25_digi *); +void ax25_digi_invert(const ax25_digi *, ax25_digi *); /* ax25_dev.c */ extern ax25_dev *ax25_dev_list; @@ -306,33 +307,33 @@ static inline ax25_dev *ax25_dev_ax25dev(struct net_device *dev) return dev->ax25_ptr; } -extern ax25_dev *ax25_addr_ax25dev(ax25_address *); -extern void ax25_dev_device_up(struct net_device *); -extern void ax25_dev_device_down(struct net_device *); -extern int ax25_fwd_ioctl(unsigned int, struct ax25_fwd_struct *); -extern struct net_device *ax25_fwd_dev(struct net_device *); -extern void ax25_dev_free(void); +ax25_dev *ax25_addr_ax25dev(ax25_address *); +void ax25_dev_device_up(struct net_device *); +void ax25_dev_device_down(struct net_device *); +int ax25_fwd_ioctl(unsigned int, struct ax25_fwd_struct *); +struct net_device *ax25_fwd_dev(struct net_device *); +void ax25_dev_free(void); /* ax25_ds_in.c */ -extern int ax25_ds_frame_in(ax25_cb *, struct sk_buff *, int); +int ax25_ds_frame_in(ax25_cb *, struct sk_buff *, int); /* ax25_ds_subr.c */ -extern void ax25_ds_nr_error_recovery(ax25_cb *); -extern void ax25_ds_enquiry_response(ax25_cb *); -extern void ax25_ds_establish_data_link(ax25_cb *); -extern void ax25_dev_dama_off(ax25_dev *); -extern void ax25_dama_on(ax25_cb *); -extern void ax25_dama_off(ax25_cb *); +void ax25_ds_nr_error_recovery(ax25_cb *); +void ax25_ds_enquiry_response(ax25_cb *); +void ax25_ds_establish_data_link(ax25_cb *); +void ax25_dev_dama_off(ax25_dev *); +void ax25_dama_on(ax25_cb *); +void ax25_dama_off(ax25_cb *); /* ax25_ds_timer.c */ -extern void ax25_ds_setup_timer(ax25_dev *); -extern void ax25_ds_set_timer(ax25_dev *); -extern void ax25_ds_del_timer(ax25_dev *); -extern void ax25_ds_timer(ax25_cb *); -extern void ax25_ds_t1_timeout(ax25_cb *); -extern void ax25_ds_heartbeat_expiry(ax25_cb *); -extern void ax25_ds_t3timer_expiry(ax25_cb *); -extern void ax25_ds_idletimer_expiry(ax25_cb *); +void ax25_ds_setup_timer(ax25_dev *); +void ax25_ds_set_timer(ax25_dev *); +void ax25_ds_del_timer(ax25_dev *); +void ax25_ds_timer(ax25_cb *); +void ax25_ds_t1_timeout(ax25_cb *); +void ax25_ds_heartbeat_expiry(ax25_cb *); +void ax25_ds_t3timer_expiry(ax25_cb *); +void ax25_ds_idletimer_expiry(ax25_cb *); /* ax25_iface.c */ @@ -342,107 +343,109 @@ struct ax25_protocol { int (*func)(struct sk_buff *, ax25_cb *); }; -extern void ax25_register_pid(struct ax25_protocol *ap); -extern void ax25_protocol_release(unsigned int); +void ax25_register_pid(struct ax25_protocol *ap); +void ax25_protocol_release(unsigned int); struct ax25_linkfail { struct hlist_node lf_node; void (*func)(ax25_cb *, int); }; -extern void ax25_linkfail_register(struct ax25_linkfail *lf); -extern void ax25_linkfail_release(struct ax25_linkfail *lf); -extern int __must_check ax25_listen_register(ax25_address *, - struct net_device *); -extern void ax25_listen_release(ax25_address *, struct net_device *); -extern int (*ax25_protocol_function(unsigned int))(struct sk_buff *, ax25_cb *); -extern int ax25_listen_mine(ax25_address *, struct net_device *); -extern void ax25_link_failed(ax25_cb *, int); -extern int ax25_protocol_is_registered(unsigned int); +void ax25_linkfail_register(struct ax25_linkfail *lf); +void ax25_linkfail_release(struct ax25_linkfail *lf); +int __must_check ax25_listen_register(ax25_address *, struct net_device *); +void ax25_listen_release(ax25_address *, struct net_device *); +int(*ax25_protocol_function(unsigned int))(struct sk_buff *, ax25_cb *); +int ax25_listen_mine(ax25_address *, struct net_device *); +void ax25_link_failed(ax25_cb *, int); +int ax25_protocol_is_registered(unsigned int); /* ax25_in.c */ -extern int ax25_rx_iframe(ax25_cb *, struct sk_buff *); -extern int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); +int ax25_rx_iframe(ax25_cb *, struct sk_buff *); +int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *, + struct net_device *); /* ax25_ip.c */ -extern int ax25_hard_header(struct sk_buff *, struct net_device *, - unsigned short, const void *, - const void *, unsigned int); -extern int ax25_rebuild_header(struct sk_buff *); +int ax25_hard_header(struct sk_buff *, struct net_device *, unsigned short, + const void *, const void *, unsigned int); +int ax25_rebuild_header(struct sk_buff *); extern const struct header_ops ax25_header_ops; /* ax25_out.c */ -extern ax25_cb *ax25_send_frame(struct sk_buff *, int, ax25_address *, ax25_address *, ax25_digi *, struct net_device *); -extern void ax25_output(ax25_cb *, int, struct sk_buff *); -extern void ax25_kick(ax25_cb *); -extern void ax25_transmit_buffer(ax25_cb *, struct sk_buff *, int); -extern void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev); -extern int ax25_check_iframes_acked(ax25_cb *, unsigned short); +ax25_cb *ax25_send_frame(struct sk_buff *, int, ax25_address *, ax25_address *, + ax25_digi *, struct net_device *); +void ax25_output(ax25_cb *, int, struct sk_buff *); +void ax25_kick(ax25_cb *); +void ax25_transmit_buffer(ax25_cb *, struct sk_buff *, int); +void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev); +int ax25_check_iframes_acked(ax25_cb *, unsigned short); /* ax25_route.c */ -extern void ax25_rt_device_down(struct net_device *); -extern int ax25_rt_ioctl(unsigned int, void __user *); +void ax25_rt_device_down(struct net_device *); +int ax25_rt_ioctl(unsigned int, void __user *); extern const struct file_operations ax25_route_fops; -extern ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev); -extern int ax25_rt_autobind(ax25_cb *, ax25_address *); -extern struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *, ax25_address *, ax25_digi *); -extern void ax25_rt_free(void); +ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev); +int ax25_rt_autobind(ax25_cb *, ax25_address *); +struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *, + ax25_address *, ax25_digi *); +void ax25_rt_free(void); /* ax25_std_in.c */ -extern int ax25_std_frame_in(ax25_cb *, struct sk_buff *, int); +int ax25_std_frame_in(ax25_cb *, struct sk_buff *, int); /* ax25_std_subr.c */ -extern void ax25_std_nr_error_recovery(ax25_cb *); -extern void ax25_std_establish_data_link(ax25_cb *); -extern void ax25_std_transmit_enquiry(ax25_cb *); -extern void ax25_std_enquiry_response(ax25_cb *); -extern void ax25_std_timeout_response(ax25_cb *); +void ax25_std_nr_error_recovery(ax25_cb *); +void ax25_std_establish_data_link(ax25_cb *); +void ax25_std_transmit_enquiry(ax25_cb *); +void ax25_std_enquiry_response(ax25_cb *); +void ax25_std_timeout_response(ax25_cb *); /* ax25_std_timer.c */ -extern void ax25_std_heartbeat_expiry(ax25_cb *); -extern void ax25_std_t1timer_expiry(ax25_cb *); -extern void ax25_std_t2timer_expiry(ax25_cb *); -extern void ax25_std_t3timer_expiry(ax25_cb *); -extern void ax25_std_idletimer_expiry(ax25_cb *); +void ax25_std_heartbeat_expiry(ax25_cb *); +void ax25_std_t1timer_expiry(ax25_cb *); +void ax25_std_t2timer_expiry(ax25_cb *); +void ax25_std_t3timer_expiry(ax25_cb *); +void ax25_std_idletimer_expiry(ax25_cb *); /* ax25_subr.c */ -extern void ax25_clear_queues(ax25_cb *); -extern void ax25_frames_acked(ax25_cb *, unsigned short); -extern void ax25_requeue_frames(ax25_cb *); -extern int ax25_validate_nr(ax25_cb *, unsigned short); -extern int ax25_decode(ax25_cb *, struct sk_buff *, int *, int *, int *); -extern void ax25_send_control(ax25_cb *, int, int, int); -extern void ax25_return_dm(struct net_device *, ax25_address *, ax25_address *, ax25_digi *); -extern void ax25_calculate_t1(ax25_cb *); -extern void ax25_calculate_rtt(ax25_cb *); -extern void ax25_disconnect(ax25_cb *, int); +void ax25_clear_queues(ax25_cb *); +void ax25_frames_acked(ax25_cb *, unsigned short); +void ax25_requeue_frames(ax25_cb *); +int ax25_validate_nr(ax25_cb *, unsigned short); +int ax25_decode(ax25_cb *, struct sk_buff *, int *, int *, int *); +void ax25_send_control(ax25_cb *, int, int, int); +void ax25_return_dm(struct net_device *, ax25_address *, ax25_address *, + ax25_digi *); +void ax25_calculate_t1(ax25_cb *); +void ax25_calculate_rtt(ax25_cb *); +void ax25_disconnect(ax25_cb *, int); /* ax25_timer.c */ -extern void ax25_setup_timers(ax25_cb *); -extern void ax25_start_heartbeat(ax25_cb *); -extern void ax25_start_t1timer(ax25_cb *); -extern void ax25_start_t2timer(ax25_cb *); -extern void ax25_start_t3timer(ax25_cb *); -extern void ax25_start_idletimer(ax25_cb *); -extern void ax25_stop_heartbeat(ax25_cb *); -extern void ax25_stop_t1timer(ax25_cb *); -extern void ax25_stop_t2timer(ax25_cb *); -extern void ax25_stop_t3timer(ax25_cb *); -extern void ax25_stop_idletimer(ax25_cb *); -extern int ax25_t1timer_running(ax25_cb *); -extern unsigned long ax25_display_timer(struct timer_list *); +void ax25_setup_timers(ax25_cb *); +void ax25_start_heartbeat(ax25_cb *); +void ax25_start_t1timer(ax25_cb *); +void ax25_start_t2timer(ax25_cb *); +void ax25_start_t3timer(ax25_cb *); +void ax25_start_idletimer(ax25_cb *); +void ax25_stop_heartbeat(ax25_cb *); +void ax25_stop_t1timer(ax25_cb *); +void ax25_stop_t2timer(ax25_cb *); +void ax25_stop_t3timer(ax25_cb *); +void ax25_stop_idletimer(ax25_cb *); +int ax25_t1timer_running(ax25_cb *); +unsigned long ax25_display_timer(struct timer_list *); /* ax25_uid.c */ extern int ax25_uid_policy; -extern ax25_uid_assoc *ax25_findbyuid(kuid_t); -extern int __must_check ax25_uid_ioctl(int, struct sockaddr_ax25 *); +ax25_uid_assoc *ax25_findbyuid(kuid_t); +int __must_check ax25_uid_ioctl(int, struct sockaddr_ax25 *); extern const struct file_operations ax25_uid_fops; -extern void ax25_uid_free(void); +void ax25_uid_free(void); /* sysctl_net_ax25.c */ #ifdef CONFIG_SYSCTL -extern int ax25_register_dev_sysctl(ax25_dev *ax25_dev); -extern void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev); +int ax25_register_dev_sysctl(ax25_dev *ax25_dev); +void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev); #else static inline int ax25_register_dev_sysctl(ax25_dev *ax25_dev) { return 0; } static inline void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev) {} -- cgit v1.2.3-70-g09d2 From 10dd9b7ce37e359066b9ab59592f81684360561a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 17:31:37 -0700 Subject: cfg80211.h/mac80211.h: Remove extern from function prototypes There are a mix of function prototypes with and without extern in the kernel sources. Standardize on not using extern for function prototypes. Function prototypes don't need to be written with extern. extern is assumed by the compiler. Its use is as unnecessary as using auto to declare automatic/local variables in a block. Reflow modified prototypes to 80 columns. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/cfg80211.h | 34 +++++++++++++++++----------------- include/net/mac80211.h | 16 ++++++++-------- 2 files changed, 25 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7b0730aeb89..f49de283f50 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2841,7 +2841,7 @@ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv); * * Return: A non-negative wiphy index or a negative error code. */ -extern int wiphy_register(struct wiphy *wiphy); +int wiphy_register(struct wiphy *wiphy); /** * wiphy_unregister - deregister a wiphy from cfg80211 @@ -2852,14 +2852,14 @@ extern int wiphy_register(struct wiphy *wiphy); * pointer, but the call may sleep to wait for an outstanding * request that is being handled. */ -extern void wiphy_unregister(struct wiphy *wiphy); +void wiphy_unregister(struct wiphy *wiphy); /** * wiphy_free - free wiphy * * @wiphy: The wiphy to free */ -extern void wiphy_free(struct wiphy *wiphy); +void wiphy_free(struct wiphy *wiphy); /* internal structs */ struct cfg80211_conn; @@ -3014,14 +3014,14 @@ static inline void *wdev_priv(struct wireless_dev *wdev) * @band: band, necessary due to channel number overlap * Return: The corresponding frequency (in MHz), or 0 if the conversion failed. */ -extern int ieee80211_channel_to_frequency(int chan, enum ieee80211_band band); +int ieee80211_channel_to_frequency(int chan, enum ieee80211_band band); /** * ieee80211_frequency_to_channel - convert frequency to channel number * @freq: center frequency * Return: The corresponding channel, or 0 if the conversion failed. */ -extern int ieee80211_frequency_to_channel(int freq); +int ieee80211_frequency_to_channel(int freq); /* * Name indirection necessary because the ieee80211 code also has @@ -3030,8 +3030,8 @@ extern int ieee80211_frequency_to_channel(int freq); * to include both header files you'll (rightfully!) get a symbol * clash. */ -extern struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, - int freq); +struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, + int freq); /** * ieee80211_get_channel - get channel struct from wiphy for specified frequency * @wiphy: the struct wiphy to get the channel for @@ -3141,13 +3141,14 @@ struct ieee80211_radiotap_iterator { int _reset_on_ext; }; -extern int ieee80211_radiotap_iterator_init( - struct ieee80211_radiotap_iterator *iterator, - struct ieee80211_radiotap_header *radiotap_header, - int max_length, const struct ieee80211_radiotap_vendor_namespaces *vns); +int +ieee80211_radiotap_iterator_init(struct ieee80211_radiotap_iterator *iterator, + struct ieee80211_radiotap_header *radiotap_header, + int max_length, + const struct ieee80211_radiotap_vendor_namespaces *vns); -extern int ieee80211_radiotap_iterator_next( - struct ieee80211_radiotap_iterator *iterator); +int +ieee80211_radiotap_iterator_next(struct ieee80211_radiotap_iterator *iterator); extern const unsigned char rfc1042_header[6]; @@ -3307,7 +3308,7 @@ const u8 *cfg80211_find_vendor_ie(unsigned int oui, u8 oui_type, * * Return: 0 on success. -ENOMEM. */ -extern int regulatory_hint(struct wiphy *wiphy, const char *alpha2); +int regulatory_hint(struct wiphy *wiphy, const char *alpha2); /** * wiphy_apply_custom_regulatory - apply a custom driver regulatory domain @@ -3321,9 +3322,8 @@ extern int regulatory_hint(struct wiphy *wiphy, const char *alpha2); * default channel settings will be disregarded. If no rule is found for a * channel on the regulatory domain the channel will be disabled. */ -extern void wiphy_apply_custom_regulatory( - struct wiphy *wiphy, - const struct ieee80211_regdomain *regd); +void wiphy_apply_custom_regulatory(struct wiphy *wiphy, + const struct ieee80211_regdomain *regd); /** * freq_reg_info - get regulatory information for the given frequency diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5b7a3dadadd..e5f02909f1a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2877,14 +2877,14 @@ enum ieee80211_tpt_led_trigger_flags { }; #ifdef CONFIG_MAC80211_LEDS -extern char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw); -extern char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw); -extern char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw); -extern char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw); -extern char *__ieee80211_create_tpt_led_trigger( - struct ieee80211_hw *hw, unsigned int flags, - const struct ieee80211_tpt_blink *blink_table, - unsigned int blink_table_len); +char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw); +char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw); +char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw); +char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw); +char *__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, + unsigned int flags, + const struct ieee80211_tpt_blink *blink_table, + unsigned int blink_table_len); #endif /** * ieee80211_get_tx_led_name - get name of TX LED -- cgit v1.2.3-70-g09d2 From 4fc7074703321e897cc05c0de858fba63e94e59f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 17:31:38 -0700 Subject: checksum: Remove extern from function prototypes There are a mix of function prototypes with and without extern in the kernel sources. Standardize on not using extern for function prototypes. Function prototypes don't need to be written with extern. extern is assumed by the compiler. Its use is as unnecessary as using auto to declare automatic/local variables in a block. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/checksum.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 600d1d705bb..8f59ca50477 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -107,11 +107,11 @@ static inline void csum_replace2(__sum16 *sum, __be16 from, __be16 to) } struct sk_buff; -extern void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, - __be32 from, __be32 to, int pseudohdr); -extern void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, - const __be32 *from, const __be32 *to, - int pseudohdr); +void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, + __be32 from, __be32 to, int pseudohdr); +void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, + const __be32 *from, const __be32 *to, + int pseudohdr); static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, __be16 from, __be16 to, -- cgit v1.2.3-70-g09d2 From 378307217ed9c318212ec3050d38d0e34b77604c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 17:31:39 -0700 Subject: cls_cgroup.h netprio_cgroup.h: Remove extern from function prototypes There are a mix of function prototypes with and without extern in the kernel sources. Standardize on not using extern for function prototypes. Function prototypes don't need to be written with extern. extern is assumed by the compiler. Its use is as unnecessary as using auto to declare automatic/local variables in a block. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/cls_cgroup.h | 2 +- include/net/netprio_cgroup.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 0fee0617fb7..f55c14510bd 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -24,7 +24,7 @@ struct cgroup_cls_state u32 classid; }; -extern void sock_update_classid(struct sock *sk); +void sock_update_classid(struct sock *sk); #if IS_BUILTIN(CONFIG_NET_CLS_CGROUP) static inline u32 task_cls_classid(struct task_struct *p) diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index 50ab8c26ab5..379dd5db0f2 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -29,7 +29,7 @@ struct cgroup_netprio_state { struct cgroup_subsys_state css; }; -extern void sock_update_netprioidx(struct sock *sk); +void sock_update_netprioidx(struct sock *sk); #if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) -- cgit v1.2.3-70-g09d2 From dcb7a6ce0a08012a17cca34711a02017abed554b Mon Sep 17 00:00:00 2001 From: Avinash Patil Date: Fri, 26 Jul 2013 17:02:29 -0700 Subject: ieee80211: add definition for interworking support IEEE802.11u interworking support is advertised via extended capabilities IE bit 31. This is 7th bit of 4th byte of extended capabilities. Signed-off-by: Avinash Patil Signed-off-by: Bing Zhao Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b0dc87a2a37..22c909413d3 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1860,6 +1860,11 @@ enum ieee80211_tdls_actioncode { WLAN_TDLS_DISCOVERY_REQUEST = 10, }; +/* Interworking capabilities are set in 7th bit of 4th byte of the + * @WLAN_EID_EXT_CAPABILITY information element + */ +#define WLAN_EXT_CAPA4_INTERWORKING_ENABLED BIT(7) + /* * TDLS capabililites to be enabled in the 5th byte of the * @WLAN_EID_EXT_CAPABILITY information element -- cgit v1.2.3-70-g09d2 From 9e2bc79bce58a1ce0005015c9351b3bcaaa02e5c Mon Sep 17 00:00:00 2001 From: Fred Zhou Date: Thu, 1 Aug 2013 14:16:28 +0800 Subject: ieee80211: add definition for 802.11ac information elements Add element IDs for Extended BSS Load, VHT TX Power Envelope, AID, and Quiet Channel. Signed-off-by: Fred Zhou Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 22c909413d3..b3ce299782a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1709,6 +1709,10 @@ enum ieee80211_eid { WLAN_EID_OPMODE_NOTIF = 199, WLAN_EID_WIDE_BW_CHANNEL_SWITCH = 194, WLAN_EID_CHANNEL_SWITCH_WRAPPER = 196, + WLAN_EID_EXTENDED_BSS_LOAD = 193, + WLAN_EID_VHT_TX_POWER_ENVELOPE = 195, + WLAN_EID_AID = 197, + WLAN_EID_QUIET_CHANNEL = 198, /* 802.11ad */ WLAN_EID_NON_TX_BSSID_CAP = 83, -- cgit v1.2.3-70-g09d2 From 16ef1fe272332b2f7fd99236017b891db48d9cd6 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Thu, 11 Jul 2013 16:09:05 +0200 Subject: nl80211/cfg80211: add channel switch command To allow channel switch announcements within beacons, add the channel switch command to nl80211/cfg80211. This is implementation is intended for AP and (later) IBSS mode. Signed-off-by: Simon Wunderlich Signed-off-by: Mathias Kretschmer Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 33 ++++++++++++ include/uapi/linux/nl80211.h | 30 +++++++++++ net/wireless/nl80211.c | 122 ++++++++++++++++++++++++++++++++++++++++++- net/wireless/rdev-ops.h | 12 +++++ net/wireless/trace.h | 33 ++++++++++++ 5 files changed, 229 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index aeaf6dff6e0..b7495c72061 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -665,6 +665,30 @@ struct cfg80211_ap_settings { bool radar_required; }; +/** + * struct cfg80211_csa_settings - channel switch settings + * + * Used for channel switch + * + * @chandef: defines the channel to use after the switch + * @beacon_csa: beacon data while performing the switch + * @counter_offset_beacon: offset for the counter within the beacon (tail) + * @counter_offset_presp: offset for the counter within the probe response + * @beacon_after: beacon data to be used on the new channel + * @radar_required: whether radar detection is required on the new channel + * @block_tx: whether transmissions should be blocked while changing + * @count: number of beacons until switch + */ +struct cfg80211_csa_settings { + struct cfg80211_chan_def chandef; + struct cfg80211_beacon_data beacon_csa; + u16 counter_offset_beacon, counter_offset_presp; + struct cfg80211_beacon_data beacon_after; + bool radar_required; + bool block_tx; + u8 count; +}; + /** * enum station_parameters_apply_mask - station parameter values to apply * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp) @@ -2139,6 +2163,8 @@ struct cfg80211_update_ft_ies_params { * @crit_proto_stop: Indicates critical protocol no longer needs increased link * reliability. This operation can not fail. * @set_coalesce: Set coalesce parameters. + * + * @channel_switch: initiate channel-switch procedure (with CSA) */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2376,6 +2402,10 @@ struct cfg80211_ops { struct wireless_dev *wdev); int (*set_coalesce)(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce); + + int (*channel_switch)(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_csa_settings *params); }; /* @@ -2441,6 +2471,8 @@ struct cfg80211_ops { * @WIPHY_FLAG_OFFCHAN_TX: Device supports direct off-channel TX. * @WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL: Device supports remain-on-channel call. * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels. + * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in + * beaconing mode (AP, IBSS, Mesh, ...). */ enum wiphy_flags { WIPHY_FLAG_CUSTOM_REGULATORY = BIT(0), @@ -2465,6 +2497,7 @@ enum wiphy_flags { WIPHY_FLAG_OFFCHAN_TX = BIT(20), WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL = BIT(21), WIPHY_FLAG_SUPPORTS_5_10_MHZ = BIT(22), + WIPHY_FLAG_HAS_CHANNEL_SWITCH = BIT(23), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index eb68735b331..1f42bc3dcb9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -676,6 +676,16 @@ * @NL80211_CMD_GET_COALESCE: Get currently supported coalesce rules. * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules. * + * @NL80211_CMD_CHANNEL_SWITCH: Perform a channel switch by announcing the + * the new channel information (Channel Switch Announcement - CSA) + * in the beacon for some time (as defined in the + * %NL80211_ATTR_CH_SWITCH_COUNT parameter) and then change to the + * new channel. Userspace provides the new channel information (using + * %NL80211_ATTR_WIPHY_FREQ and the attributes determining channel + * width). %NL80211_ATTR_CH_SWITCH_BLOCK_TX may be supplied to inform + * other station that transmission must be blocked until the channel + * switch is complete. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -841,6 +851,8 @@ enum nl80211_commands { NL80211_CMD_GET_COALESCE, NL80211_CMD_SET_COALESCE, + NL80211_CMD_CHANNEL_SWITCH, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -1469,6 +1481,18 @@ enum nl80211_commands { * * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information. * + * @NL80211_ATTR_CH_SWITCH_COUNT: u32 attribute specifying the number of TBTT's + * until the channel switch event. + * @NL80211_ATTR_CH_SWITCH_BLOCK_TX: flag attribute specifying that transmission + * must be blocked on the current channel (before the channel switch + * operation). + * @NL80211_ATTR_CSA_IES: Nested set of attributes containing the IE information + * for the time while performing a channel switch. + * @NL80211_ATTR_CSA_C_OFF_BEACON: Offset of the channel switch counter + * field in the beacons tail (%NL80211_ATTR_BEACON_TAIL). + * @NL80211_ATTR_CSA_C_OFF_PRESP: Offset of the channel switch counter + * field in the probe response (%NL80211_ATTR_PROBE_RESP). + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1771,6 +1795,12 @@ enum nl80211_attrs { NL80211_ATTR_COALESCE_RULE, + NL80211_ATTR_CH_SWITCH_COUNT, + NL80211_ATTR_CH_SWITCH_BLOCK_TX, + NL80211_ATTR_CSA_IES, + NL80211_ATTR_CSA_C_OFF_BEACON, + NL80211_ATTR_CSA_C_OFF_PRESP, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 03d4ef95292..f7cb12178bd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -349,6 +349,11 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, [NL80211_ATTR_PEER_AID] = { .type = NLA_U16 }, + [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 }, + [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG }, + [NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED }, + [NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_U16 }, + [NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_U16 }, }; /* policy for the key attributes */ @@ -1422,6 +1427,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, if (state->split) { CMD(crit_proto_start, CRIT_PROTOCOL_START); CMD(crit_proto_stop, CRIT_PROTOCOL_STOP); + if (dev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH) + CMD(channel_switch, CHANNEL_SWITCH); } #ifdef CONFIG_NL80211_TESTMODE @@ -5613,6 +5620,111 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, return err; } +static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_csa_settings params; + /* csa_attrs is defined static to avoid waste of stack size - this + * function is called under RTNL lock, so this should not be a problem. + */ + static struct nlattr *csa_attrs[NL80211_ATTR_MAX+1]; + u8 radar_detect_width = 0; + int err; + + if (!rdev->ops->channel_switch || + !(rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH)) + return -EOPNOTSUPP; + + /* may add IBSS support later */ + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; + + memset(¶ms, 0, sizeof(params)); + + if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] || + !info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]) + return -EINVAL; + + /* only important for AP, IBSS and mesh create IEs internally */ + if (!info->attrs[NL80211_ATTR_CSA_IES]) + return -EINVAL; + + /* useless if AP is not running */ + if (!wdev->beacon_interval) + return -EINVAL; + + params.count = nla_get_u32(info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]); + + err = nl80211_parse_beacon(info->attrs, ¶ms.beacon_after); + if (err) + return err; + + err = nla_parse_nested(csa_attrs, NL80211_ATTR_MAX, + info->attrs[NL80211_ATTR_CSA_IES], + nl80211_policy); + if (err) + return err; + + err = nl80211_parse_beacon(csa_attrs, ¶ms.beacon_csa); + if (err) + return err; + + if (!csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]) + return -EINVAL; + + params.counter_offset_beacon = + nla_get_u16(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]); + if (params.counter_offset_beacon >= params.beacon_csa.tail_len) + return -EINVAL; + + /* sanity check - counters should be the same */ + if (params.beacon_csa.tail[params.counter_offset_beacon] != + params.count) + return -EINVAL; + + if (csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]) { + params.counter_offset_presp = + nla_get_u16(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]); + if (params.counter_offset_presp >= + params.beacon_csa.probe_resp_len) + return -EINVAL; + + if (params.beacon_csa.probe_resp[params.counter_offset_presp] != + params.count) + return -EINVAL; + } + + err = nl80211_parse_chandef(rdev, info, ¶ms.chandef); + if (err) + return err; + + if (!cfg80211_reg_can_beacon(&rdev->wiphy, ¶ms.chandef)) + return -EINVAL; + + err = cfg80211_chandef_dfs_required(wdev->wiphy, ¶ms.chandef); + if (err < 0) { + return err; + } else if (err) { + radar_detect_width = BIT(params.chandef.width); + params.radar_required = true; + } + + err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype, + params.chandef.chan, + CHAN_MODE_SHARED, + radar_detect_width); + if (err) + return err; + + if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX]) + params.block_tx = true; + + return rdev_channel_switch(rdev, dev, ¶ms); +} + static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, u32 seq, int flags, struct cfg80211_registered_device *rdev, @@ -9361,7 +9473,15 @@ static struct genl_ops nl80211_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, - } + }, + { + .cmd = NL80211_CMD_CHANNEL_SWITCH, + .doit = nl80211_channel_switch, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_multicast_group nl80211_mlme_mcgrp = { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 9f15f0ac824..de870d4d0bc 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -923,4 +923,16 @@ static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev, trace_rdev_return_void(&rdev->wiphy); } +static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_csa_settings *params) +{ + int ret; + + trace_rdev_channel_switch(&rdev->wiphy, dev, params); + ret = rdev->ops->channel_switch(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 09af6eb426a..f0ebdcd394e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1841,6 +1841,39 @@ TRACE_EVENT(rdev_crit_proto_stop, WIPHY_PR_ARG, WDEV_PR_ARG) ); +TRACE_EVENT(rdev_channel_switch, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_csa_settings *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + CHAN_DEF_ENTRY + __field(u16, counter_offset_beacon) + __field(u16, counter_offset_presp) + __field(bool, radar_required) + __field(bool, block_tx) + __field(u8, count) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + CHAN_DEF_ASSIGN(¶ms->chandef); + __entry->counter_offset_beacon = params->counter_offset_beacon; + __entry->counter_offset_presp = params->counter_offset_presp; + __entry->radar_required = params->radar_required; + __entry->block_tx = params->block_tx; + __entry->count = params->count; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT + ", block_tx: %d, count: %u, radar_required: %d" + ", counter offsets (beacon/presp): %u/%u", + WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG, + __entry->block_tx, __entry->count, __entry->radar_required, + __entry->counter_offset_beacon, + __entry->counter_offset_presp) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ -- cgit v1.2.3-70-g09d2 From 73da7d5bab79ad7e16ff44d67c3fe8b9c0b33e5b Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Thu, 11 Jul 2013 16:09:06 +0200 Subject: mac80211: add channel switch command and beacon callbacks The count field in CSA must be decremented with each beacon transmitted. This patch implements the functionality for drivers using ieee80211_beacon_get(). Other drivers must call back manually after reaching count == 0. This patch also contains the handling and finish worker for the channel switch command, and mac80211/chanctx code to allow to change a channel definition of an active channel context. Signed-off-by: Simon Wunderlich Signed-off-by: Mathias Kretschmer [small cleanups, catch identical chandef] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 37 +++++++++ net/mac80211/cfg.c | 187 ++++++++++++++++++++++++++++++++++++++++++++- net/mac80211/chan.c | 58 ++++++++++++++ net/mac80211/driver-ops.h | 13 ++++ net/mac80211/ieee80211_i.h | 17 +++++ net/mac80211/iface.c | 9 +++ net/mac80211/trace.h | 26 +++++++ net/mac80211/tx.c | 78 +++++++++++++++++++ 8 files changed, 423 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 3124036285e..9cda3728c2c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -152,11 +152,14 @@ struct ieee80211_low_level_stats { * @IEEE80211_CHANCTX_CHANGE_WIDTH: The channel width changed * @IEEE80211_CHANCTX_CHANGE_RX_CHAINS: The number of RX chains changed * @IEEE80211_CHANCTX_CHANGE_RADAR: radar detection flag changed + * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel, + * this is used only with channel switching with CSA */ enum ieee80211_chanctx_change { IEEE80211_CHANCTX_CHANGE_WIDTH = BIT(0), IEEE80211_CHANCTX_CHANGE_RX_CHAINS = BIT(1), IEEE80211_CHANCTX_CHANGE_RADAR = BIT(2), + IEEE80211_CHANCTX_CHANGE_CHANNEL = BIT(3), }; /** @@ -1084,6 +1087,7 @@ enum ieee80211_vif_flags { * @addr: address of this interface * @p2p: indicates whether this AP or STA interface is a p2p * interface, i.e. a GO or p2p-sta respectively + * @csa_active: marks whether a channel switch is going on * @driver_flags: flags/capabilities the driver has for this interface, * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed @@ -1106,6 +1110,7 @@ struct ieee80211_vif { struct ieee80211_bss_conf bss_conf; u8 addr[ETH_ALEN]; bool p2p; + bool csa_active; u8 cab_queue; u8 hw_queue[IEEE80211_NUM_ACS]; @@ -2637,6 +2642,16 @@ enum ieee80211_roc_type { * @ipv6_addr_change: IPv6 address assignment on the given interface changed. * Currently, this is only called for managed or P2P client interfaces. * This callback is optional; it must not sleep. + * + * @channel_switch_beacon: Starts a channel switch to a new channel. + * Beacons are modified to include CSA or ECSA IEs before calling this + * function. The corresponding count fields in these IEs must be + * decremented, and when they reach zero the driver must call + * ieee80211_csa_finish(). Drivers which use ieee80211_beacon_get() + * get the csa counter decremented by mac80211, but must check if it is + * zero using ieee80211_csa_is_complete() after the beacon has been + * transmitted and then call ieee80211_csa_finish(). + * */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -2824,6 +2839,9 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct inet6_dev *idev); #endif + void (*channel_switch_beacon)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct cfg80211_chan_def *chandef); }; /** @@ -3318,6 +3336,25 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, return ieee80211_beacon_get_tim(hw, vif, NULL, NULL); } +/** + * ieee80211_csa_finish - notify mac80211 about channel switch + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * + * After a channel switch announcement was scheduled and the counter in this + * announcement hit zero, this function must be called by the driver to + * notify mac80211 that the channel can be changed. + */ +void ieee80211_csa_finish(struct ieee80211_vif *vif); + +/** + * ieee80211_csa_is_complete - find out if counters reached zero + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * + * This function returns whether the channel switch counters reached zero. + */ +bool ieee80211_csa_is_complete(struct ieee80211_vif *vif); + + /** * ieee80211_proberesp_get - retrieve a Probe Response template * @hw: pointer obtained from ieee80211_alloc_hw(). diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b82fff6c0b3..44449ceb796 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -860,8 +860,8 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, return 0; } -static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, - struct cfg80211_beacon_data *params) +int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, + struct cfg80211_beacon_data *params) { struct beacon_data *new, *old; int new_head_len, new_tail_len; @@ -1024,6 +1024,12 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, sdata = IEEE80211_DEV_TO_SUB_IF(dev); + /* don't allow changing the beacon while CSA is in place - offset + * of channel switch counter may change + */ + if (sdata->vif.csa_active) + return -EBUSY; + old = rtnl_dereference(sdata->u.ap.beacon); if (!old) return -ENOENT; @@ -1048,6 +1054,10 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) return -ENOENT; old_probe_resp = rtnl_dereference(sdata->u.ap.probe_resp); + /* abort any running channel switch */ + sdata->vif.csa_active = false; + cancel_work_sync(&sdata->csa_finalize_work); + /* turn off carrier for this interface and dependent VLANs */ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_off(vlan->dev); @@ -2775,6 +2785,178 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy, return 0; } +static struct cfg80211_beacon_data * +cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) +{ + struct cfg80211_beacon_data *new_beacon; + u8 *pos; + int len; + + len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len + + beacon->proberesp_ies_len + beacon->assocresp_ies_len + + beacon->probe_resp_len; + + new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL); + if (!new_beacon) + return NULL; + + pos = (u8 *)(new_beacon + 1); + if (beacon->head_len) { + new_beacon->head_len = beacon->head_len; + new_beacon->head = pos; + memcpy(pos, beacon->head, beacon->head_len); + pos += beacon->head_len; + } + if (beacon->tail_len) { + new_beacon->tail_len = beacon->tail_len; + new_beacon->tail = pos; + memcpy(pos, beacon->tail, beacon->tail_len); + pos += beacon->tail_len; + } + if (beacon->beacon_ies_len) { + new_beacon->beacon_ies_len = beacon->beacon_ies_len; + new_beacon->beacon_ies = pos; + memcpy(pos, beacon->beacon_ies, beacon->beacon_ies_len); + pos += beacon->beacon_ies_len; + } + if (beacon->proberesp_ies_len) { + new_beacon->proberesp_ies_len = beacon->proberesp_ies_len; + new_beacon->proberesp_ies = pos; + memcpy(pos, beacon->proberesp_ies, beacon->proberesp_ies_len); + pos += beacon->proberesp_ies_len; + } + if (beacon->assocresp_ies_len) { + new_beacon->assocresp_ies_len = beacon->assocresp_ies_len; + new_beacon->assocresp_ies = pos; + memcpy(pos, beacon->assocresp_ies, beacon->assocresp_ies_len); + pos += beacon->assocresp_ies_len; + } + if (beacon->probe_resp_len) { + new_beacon->probe_resp_len = beacon->probe_resp_len; + beacon->probe_resp = pos; + memcpy(pos, beacon->probe_resp, beacon->probe_resp_len); + pos += beacon->probe_resp_len; + } + + return new_beacon; +} + +void ieee80211_csa_finalize_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + csa_finalize_work); + struct ieee80211_local *local = sdata->local; + int err, changed; + + if (!ieee80211_sdata_running(sdata)) + return; + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP)) + return; + + sdata->radar_required = sdata->csa_radar_required; + err = ieee80211_vif_change_channel(sdata, &local->csa_chandef, + &changed); + if (WARN_ON(err < 0)) + return; + + err = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon); + if (err < 0) + return; + + changed |= err; + kfree(sdata->u.ap.next_beacon); + sdata->u.ap.next_beacon = NULL; + sdata->vif.csa_active = false; + + ieee80211_wake_queues_by_reason(&sdata->local->hw, + IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_CSA); + + ieee80211_bss_info_change_notify(sdata, changed); + + cfg80211_ch_switch_notify(sdata->dev, &local->csa_chandef); +} + +static int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_csa_settings *params) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *chanctx_conf; + struct ieee80211_chanctx *chanctx; + int err, num_chanctx; + + if (!list_empty(&local->roc_list) || local->scanning) + return -EBUSY; + + if (sdata->wdev.cac_started) + return -EBUSY; + + if (cfg80211_chandef_identical(¶ms->chandef, + &sdata->vif.bss_conf.chandef)) + return -EINVAL; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (!chanctx_conf) { + rcu_read_unlock(); + return -EBUSY; + } + + /* don't handle for multi-VIF cases */ + chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); + if (chanctx->refcount > 1) { + rcu_read_unlock(); + return -EBUSY; + } + num_chanctx = 0; + list_for_each_entry_rcu(chanctx, &local->chanctx_list, list) + num_chanctx++; + rcu_read_unlock(); + + if (num_chanctx > 1) + return -EBUSY; + + /* don't allow another channel switch if one is already active. */ + if (sdata->vif.csa_active) + return -EBUSY; + + /* only handle AP for now. */ + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: + break; + default: + return -EOPNOTSUPP; + } + + sdata->u.ap.next_beacon = cfg80211_beacon_dup(¶ms->beacon_after); + if (!sdata->u.ap.next_beacon) + return -ENOMEM; + + sdata->csa_counter_offset_beacon = params->counter_offset_beacon; + sdata->csa_counter_offset_presp = params->counter_offset_presp; + sdata->csa_radar_required = params->radar_required; + + if (params->block_tx) + ieee80211_stop_queues_by_reason(&local->hw, + IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_CSA); + + err = ieee80211_assign_beacon(sdata, ¶ms->beacon_csa); + if (err < 0) + return err; + + local->csa_chandef = params->chandef; + sdata->vif.csa_active = true; + + ieee80211_bss_info_change_notify(sdata, err); + drv_channel_switch_beacon(sdata, ¶ms->chandef); + + return 0; +} + static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct ieee80211_channel *chan, bool offchan, unsigned int wait, const u8 *buf, size_t len, @@ -3492,4 +3674,5 @@ struct cfg80211_ops mac80211_config_ops = { .get_et_strings = ieee80211_get_et_strings, .get_channel = ieee80211_cfg_get_channel, .start_radar_detection = ieee80211_start_radar_detection, + .channel_switch = ieee80211_channel_switch, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 03e8d2e3270..3a4764b2869 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -410,6 +410,64 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, return ret; } +int ieee80211_vif_change_channel(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_chan_def *chandef, + u32 *changed) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *ctx; + int ret; + u32 chanctx_changed = 0; + + /* should never be called if not performing a channel switch. */ + if (WARN_ON(!sdata->vif.csa_active)) + return -EINVAL; + + if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, + IEEE80211_CHAN_DISABLED)) + return -EINVAL; + + mutex_lock(&local->chanctx_mtx); + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (!conf) { + ret = -EINVAL; + goto out; + } + + ctx = container_of(conf, struct ieee80211_chanctx, conf); + if (ctx->refcount != 1) { + ret = -EINVAL; + goto out; + } + + if (sdata->vif.bss_conf.chandef.width != chandef->width) { + chanctx_changed = IEEE80211_CHANCTX_CHANGE_WIDTH; + *changed |= BSS_CHANGED_BANDWIDTH; + } + + sdata->vif.bss_conf.chandef = *chandef; + ctx->conf.def = *chandef; + + chanctx_changed |= IEEE80211_CHANCTX_CHANGE_CHANNEL; + drv_change_chanctx(local, ctx, chanctx_changed); + + if (!local->use_chanctx) { + local->_oper_chandef = *chandef; + ieee80211_hw_config(local, 0); + } + + ieee80211_recalc_chanctx_chantype(local, ctx); + ieee80211_recalc_smps_chanctx(local, ctx); + ieee80211_recalc_radar_chanctx(local, ctx); + + ret = 0; + out: + mutex_unlock(&local->chanctx_mtx); + return ret; +} + int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, u32 *changed) diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index b931c96a596..b3ea11f3d52 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1072,4 +1072,17 @@ static inline void drv_ipv6_addr_change(struct ieee80211_local *local, } #endif +static inline void +drv_channel_switch_beacon(struct ieee80211_sub_if_data *sdata, + struct cfg80211_chan_def *chandef) +{ + struct ieee80211_local *local = sdata->local; + + if (local->ops->channel_switch_beacon) { + trace_drv_channel_switch_beacon(local, sdata, chandef); + local->ops->channel_switch_beacon(&local->hw, &sdata->vif, + chandef); + } +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index d779383c52d..e94c84050e9 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -259,6 +259,8 @@ struct ieee80211_if_ap { struct beacon_data __rcu *beacon; struct probe_resp __rcu *probe_resp; + /* to be used after channel switch. */ + struct cfg80211_beacon_data *next_beacon; struct list_head vlans; struct ps_data ps; @@ -716,6 +718,11 @@ struct ieee80211_sub_if_data { struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS]; + struct work_struct csa_finalize_work; + int csa_counter_offset_beacon; + int csa_counter_offset_presp; + bool csa_radar_required; + /* used to reconfigure hardware SM PS */ struct work_struct recalc_smps; @@ -1372,6 +1379,9 @@ void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc, bool free); void ieee80211_sw_roc_work(struct work_struct *work); void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc); +/* channel switch handling */ +void ieee80211_csa_finalize_work(struct work_struct *work); + /* interface handling */ int ieee80211_iface_init(void); void ieee80211_iface_exit(void); @@ -1393,6 +1403,8 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local); bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata); void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata); +int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, + struct cfg80211_beacon_data *params); static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) { @@ -1654,6 +1666,11 @@ int __must_check ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, u32 *changed); +/* NOTE: only use ieee80211_vif_change_channel() for channel switch */ +int __must_check +ieee80211_vif_change_channel(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_chan_def *chandef, + u32 *changed); void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 4c41c11958c..7ca534bf4ce 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -274,6 +274,12 @@ static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, if (iftype == NL80211_IFTYPE_ADHOC && nsdata->vif.type == NL80211_IFTYPE_ADHOC) return -EBUSY; + /* + * will not add another interface while any channel + * switch is active. + */ + if (nsdata->vif.csa_active) + return -EBUSY; /* * The remaining checks are only performed for interfaces @@ -804,6 +810,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, cancel_work_sync(&local->dynamic_ps_enable_work); cancel_work_sync(&sdata->recalc_smps); + sdata->vif.csa_active = false; + cancel_work_sync(&sdata->csa_finalize_work); cancel_delayed_work_sync(&sdata->dfs_cac_timer_work); @@ -1267,6 +1275,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, skb_queue_head_init(&sdata->skb_queue); INIT_WORK(&sdata->work, ieee80211_iface_work); INIT_WORK(&sdata->recalc_smps, ieee80211_recalc_smps_work); + INIT_WORK(&sdata->csa_finalize_work, ieee80211_csa_finalize_work); switch (type) { case NL80211_IFTYPE_P2P_GO: diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index c215fafd7a2..1aba645882b 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -1906,6 +1906,32 @@ TRACE_EVENT(api_radar_detected, ) ); +TRACE_EVENT(drv_channel_switch_beacon, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_chan_def *chandef), + + TP_ARGS(local, sdata, chandef), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + CHANDEF_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + CHANDEF_ASSIGN(chandef); + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT " channel switch to " CHANDEF_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG + ) +); + + #ifdef CONFIG_MAC80211_MESSAGE_TRACING #undef TRACE_SYSTEM #define TRACE_SYSTEM mac80211_msg diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index f65873f0c89..0e42322aa6b 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2338,6 +2338,81 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, return 0; } +void ieee80211_csa_finish(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + ieee80211_queue_work(&sdata->local->hw, + &sdata->csa_finalize_work); +} +EXPORT_SYMBOL(ieee80211_csa_finish); + +static void ieee80211_update_csa(struct ieee80211_sub_if_data *sdata, + struct beacon_data *beacon) +{ + struct probe_resp *resp; + int counter_offset_beacon = sdata->csa_counter_offset_beacon; + int counter_offset_presp = sdata->csa_counter_offset_presp; + + /* warn if the driver did not check for/react to csa completeness */ + if (WARN_ON(((u8 *)beacon->tail)[counter_offset_beacon] == 0)) + return; + + ((u8 *)beacon->tail)[counter_offset_beacon]--; + + if (sdata->vif.type == NL80211_IFTYPE_AP && + counter_offset_presp) { + rcu_read_lock(); + resp = rcu_dereference(sdata->u.ap.probe_resp); + + /* if nl80211 accepted the offset, this should not happen. */ + if (WARN_ON(!resp)) { + rcu_read_unlock(); + return; + } + resp->data[counter_offset_presp]--; + rcu_read_unlock(); + } +} + +bool ieee80211_csa_is_complete(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct beacon_data *beacon = NULL; + u8 *beacon_data; + size_t beacon_data_len; + int counter_beacon = sdata->csa_counter_offset_beacon; + int ret = false; + + if (!ieee80211_sdata_running(sdata)) + return false; + + rcu_read_lock(); + if (vif->type == NL80211_IFTYPE_AP) { + struct ieee80211_if_ap *ap = &sdata->u.ap; + + beacon = rcu_dereference(ap->beacon); + if (WARN_ON(!beacon || !beacon->tail)) + goto out; + beacon_data = beacon->tail; + beacon_data_len = beacon->tail_len; + } else { + WARN_ON(1); + goto out; + } + + if (WARN_ON(counter_beacon > beacon_data_len)) + goto out; + + if (beacon_data[counter_beacon] == 0) + ret = true; + out: + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(ieee80211_csa_is_complete); + struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 *tim_offset, u16 *tim_length) @@ -2368,6 +2443,9 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, struct beacon_data *beacon = rcu_dereference(ap->beacon); if (beacon) { + if (sdata->vif.csa_active) + ieee80211_update_csa(sdata, beacon); + /* * headroom, head length, * tail length and maximum TIM length -- cgit v1.2.3-70-g09d2 From 376c7311bdb6efea3322310333576a04d73fbe4c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 1 Aug 2013 11:43:08 -0700 Subject: net: add a temporary sanity check in skb_orphan() David suggested to add a BUG_ON() to catch if some layer sets skb->sk pointer without a corresponding destructor. As skb can sit in a queue, it's mandatory to make sure the socket cannot disappear, and it's usually done by taking a reference on the socket, then releasing it from the skb destructor. This patch is a follow-up to commit c34a761231b5 ("net: skb_orphan() changes") and will be reverted after catching all possible offenders if any. Suggested-by: David Miller Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a95547adff2..d48de687c21 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1809,6 +1809,8 @@ static inline void skb_orphan(struct sk_buff *skb) skb->destructor(skb); skb->destructor = NULL; skb->sk = NULL; + } else { + BUG_ON(skb->sk); } } -- cgit v1.2.3-70-g09d2 From e216975ad97cfcfc436789aa66d59a0e93f337f7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 1 Aug 2013 16:17:47 -0700 Subject: uapi: Convert some uses of 6 to ETH_ALEN Use the #define where appropriate. Add #include where appropriate too. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/uapi/linux/dn.h | 3 ++- include/uapi/linux/if_bridge.h | 3 ++- include/uapi/linux/netfilter_bridge/ebt_802_3.h | 5 +++-- include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h | 3 ++- include/uapi/linux/virtio_net.h | 2 +- include/uapi/linux/wimax/i2400m.h | 4 ++-- 6 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/dn.h b/include/uapi/linux/dn.h index 9c50445462d..5fbdd3d49eb 100644 --- a/include/uapi/linux/dn.h +++ b/include/uapi/linux/dn.h @@ -2,6 +2,7 @@ #define _LINUX_DN_H #include +#include /* @@ -120,7 +121,7 @@ struct linkinfo_dn { * Ethernet address format (for DECnet) */ union etheraddress { - __u8 dne_addr[6]; /* Full ethernet address */ + __u8 dne_addr[ETH_ALEN]; /* Full ethernet address */ struct { __u8 dne_hiord[4]; /* DECnet HIORD prefix */ __u8 dne_nodeaddr[2]; /* DECnet node address */ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 2d70d79ce2f..39f621a9fe8 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -14,6 +14,7 @@ #define _UAPI_LINUX_IF_BRIDGE_H #include +#include #define SYSFS_BRIDGE_ATTR "bridge" #define SYSFS_BRIDGE_FDB "brforward" @@ -88,7 +89,7 @@ struct __port_info { }; struct __fdb_entry { - __u8 mac_addr[6]; + __u8 mac_addr[ETH_ALEN]; __u8 port_no; __u8 is_local; __u32 ageing_timer_value; diff --git a/include/uapi/linux/netfilter_bridge/ebt_802_3.h b/include/uapi/linux/netfilter_bridge/ebt_802_3.h index 5bf84912a08..f37522aade2 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_802_3.h +++ b/include/uapi/linux/netfilter_bridge/ebt_802_3.h @@ -2,6 +2,7 @@ #define _UAPI__LINUX_BRIDGE_EBT_802_3_H #include +#include #define EBT_802_3_SAP 0x01 #define EBT_802_3_TYPE 0x02 @@ -42,8 +43,8 @@ struct hdr_ni { }; struct ebt_802_3_hdr { - __u8 daddr[6]; - __u8 saddr[6]; + __u8 daddr[ETH_ALEN]; + __u8 saddr[ETH_ALEN]; __be16 len; union { struct hdr_ui ui; diff --git a/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h index c6a204c9704..eac0f6548f4 100644 --- a/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h +++ b/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h @@ -2,6 +2,7 @@ #define _IPT_CLUSTERIP_H_target #include +#include enum clusterip_hashmode { CLUSTERIP_HASHMODE_SIP = 0, @@ -22,7 +23,7 @@ struct ipt_clusterip_tgt_info { __u32 flags; /* only relevant for new ones */ - __u8 clustermac[6]; + __u8 clustermac[ETH_ALEN]; __u16 num_total_nodes; __u16 num_local_nodes; __u16 local_nodes[CLUSTERIP_MAX_NODES]; diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 227d4ce84e7..172a7f00780 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -60,7 +60,7 @@ struct virtio_net_config { /* The config defining mac address (if VIRTIO_NET_F_MAC) */ - __u8 mac[6]; + __u8 mac[ETH_ALEN]; /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ __u16 status; /* Maximum number of each of transmit and receive queues; diff --git a/include/uapi/linux/wimax/i2400m.h b/include/uapi/linux/wimax/i2400m.h index 62d35615356..fd198bc24a3 100644 --- a/include/uapi/linux/wimax/i2400m.h +++ b/include/uapi/linux/wimax/i2400m.h @@ -122,7 +122,7 @@ #define __LINUX__WIMAX__I2400M_H__ #include - +#include /* * Host Device Interface (HDI) common to all busses @@ -487,7 +487,7 @@ struct i2400m_tlv_l4_message_versions { struct i2400m_tlv_detailed_device_info { struct i2400m_tlv_hdr hdr; __u8 reserved1[400]; - __u8 mac_address[6]; + __u8 mac_address[ETH_ALEN]; __u8 reserved2[2]; } __attribute__((packed)); -- cgit v1.2.3-70-g09d2 From 574e2af7c0af3273836def5e66f236521bb433c9 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 1 Aug 2013 16:17:48 -0700 Subject: include: Convert ethernet mac address declarations to use ETH_ALEN It's convenient to have ethernet mac addresses use ETH_ALEN to be able to grep for them a bit easier and also to ensure that the addresses are __aligned(2). Add #include as necessary. Signed-off-by: Joe Perches Acked-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- include/linux/dm9000.h | 4 ++- include/linux/fs_enet_pd.h | 3 ++- include/linux/ieee80211.h | 59 +++++++++++++++++++++-------------------- include/linux/mlx4/device.h | 11 ++++---- include/linux/mlx4/qp.h | 5 ++-- include/linux/mv643xx_eth.h | 3 ++- include/linux/sh_eth.h | 3 ++- include/linux/smsc911x.h | 3 ++- include/linux/uwb/spec.h | 5 ++-- include/media/tveeprom.h | 4 ++- include/net/irda/irlan_common.h | 3 ++- 11 files changed, 58 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/dm9000.h b/include/linux/dm9000.h index 96e87693d93..841925fbfe8 100644 --- a/include/linux/dm9000.h +++ b/include/linux/dm9000.h @@ -14,6 +14,8 @@ #ifndef __DM9000_PLATFORM_DATA #define __DM9000_PLATFORM_DATA __FILE__ +#include + /* IO control flags */ #define DM9000_PLATF_8BITONLY (0x0001) @@ -27,7 +29,7 @@ struct dm9000_plat_data { unsigned int flags; - unsigned char dev_addr[6]; + unsigned char dev_addr[ETH_ALEN]; /* allow replacement IO routines */ diff --git a/include/linux/fs_enet_pd.h b/include/linux/fs_enet_pd.h index 51b793466ff..343d82a5446 100644 --- a/include/linux/fs_enet_pd.h +++ b/include/linux/fs_enet_pd.h @@ -18,6 +18,7 @@ #include #include +#include #include #define FS_ENET_NAME "fs_enet" @@ -135,7 +136,7 @@ struct fs_platform_info { const struct fs_mii_bus_info *bus_info; int rx_ring, tx_ring; /* number of buffers on rx */ - __u8 macaddr[6]; /* mac address */ + __u8 macaddr[ETH_ALEN]; /* mac address */ int rx_copybreak; /* limit we copy small frames */ int use_napi; /* use NAPI */ int napi_weight; /* NAPI weight */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b0dc87a2a37..4e101af9c1f 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -16,6 +16,7 @@ #define LINUX_IEEE80211_H #include +#include #include /* @@ -209,28 +210,28 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) struct ieee80211_hdr { __le16 frame_control; __le16 duration_id; - u8 addr1[6]; - u8 addr2[6]; - u8 addr3[6]; + u8 addr1[ETH_ALEN]; + u8 addr2[ETH_ALEN]; + u8 addr3[ETH_ALEN]; __le16 seq_ctrl; - u8 addr4[6]; + u8 addr4[ETH_ALEN]; } __packed __aligned(2); struct ieee80211_hdr_3addr { __le16 frame_control; __le16 duration_id; - u8 addr1[6]; - u8 addr2[6]; - u8 addr3[6]; + u8 addr1[ETH_ALEN]; + u8 addr2[ETH_ALEN]; + u8 addr3[ETH_ALEN]; __le16 seq_ctrl; } __packed __aligned(2); struct ieee80211_qos_hdr { __le16 frame_control; __le16 duration_id; - u8 addr1[6]; - u8 addr2[6]; - u8 addr3[6]; + u8 addr1[ETH_ALEN]; + u8 addr2[ETH_ALEN]; + u8 addr3[ETH_ALEN]; __le16 seq_ctrl; __le16 qos_ctrl; } __packed __aligned(2); @@ -608,8 +609,8 @@ struct ieee80211s_hdr { u8 flags; u8 ttl; __le32 seqnum; - u8 eaddr1[6]; - u8 eaddr2[6]; + u8 eaddr1[ETH_ALEN]; + u8 eaddr2[ETH_ALEN]; } __packed __aligned(2); /* Mesh flags */ @@ -758,7 +759,7 @@ struct ieee80211_rann_ie { u8 rann_flags; u8 rann_hopcount; u8 rann_ttl; - u8 rann_addr[6]; + u8 rann_addr[ETH_ALEN]; __le32 rann_seq; __le32 rann_interval; __le32 rann_metric; @@ -802,9 +803,9 @@ enum ieee80211_vht_opmode_bits { struct ieee80211_mgmt { __le16 frame_control; __le16 duration; - u8 da[6]; - u8 sa[6]; - u8 bssid[6]; + u8 da[ETH_ALEN]; + u8 sa[ETH_ALEN]; + u8 bssid[ETH_ALEN]; __le16 seq_ctrl; union { struct { @@ -833,7 +834,7 @@ struct ieee80211_mgmt { struct { __le16 capab_info; __le16 listen_interval; - u8 current_ap[6]; + u8 current_ap[ETH_ALEN]; /* followed by SSID and Supported rates */ u8 variable[0]; } __packed reassoc_req; @@ -966,21 +967,21 @@ struct ieee80211_vendor_ie { struct ieee80211_rts { __le16 frame_control; __le16 duration; - u8 ra[6]; - u8 ta[6]; + u8 ra[ETH_ALEN]; + u8 ta[ETH_ALEN]; } __packed __aligned(2); struct ieee80211_cts { __le16 frame_control; __le16 duration; - u8 ra[6]; + u8 ra[ETH_ALEN]; } __packed __aligned(2); struct ieee80211_pspoll { __le16 frame_control; __le16 aid; - u8 bssid[6]; - u8 ta[6]; + u8 bssid[ETH_ALEN]; + u8 ta[ETH_ALEN]; } __packed __aligned(2); /* TDLS */ @@ -989,14 +990,14 @@ struct ieee80211_pspoll { struct ieee80211_tdls_lnkie { u8 ie_type; /* Link Identifier IE */ u8 ie_len; - u8 bssid[6]; - u8 init_sta[6]; - u8 resp_sta[6]; + u8 bssid[ETH_ALEN]; + u8 init_sta[ETH_ALEN]; + u8 resp_sta[ETH_ALEN]; } __packed; struct ieee80211_tdls_data { - u8 da[6]; - u8 sa[6]; + u8 da[ETH_ALEN]; + u8 sa[ETH_ALEN]; __be16 ether_type; u8 payload_type; u8 category; @@ -1090,8 +1091,8 @@ struct ieee80211_p2p_noa_attr { struct ieee80211_bar { __le16 frame_control; __le16 duration; - __u8 ra[6]; - __u8 ta[6]; + __u8 ra[ETH_ALEN]; + __u8 ta[ETH_ALEN]; __le16 control; __le16 start_seq_num; } __packed; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 6aebdfe0ed8..09ef2f448a0 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -33,6 +33,7 @@ #ifndef MLX4_DEVICE_H #define MLX4_DEVICE_H +#include #include #include #include @@ -620,7 +621,7 @@ struct mlx4_eth_av { u8 dgid[16]; u32 reserved4[2]; __be16 vlan; - u8 mac[6]; + u8 mac[ETH_ALEN]; }; union mlx4_ext_av { @@ -914,10 +915,10 @@ enum mlx4_net_trans_promisc_mode { }; struct mlx4_spec_eth { - u8 dst_mac[6]; - u8 dst_mac_msk[6]; - u8 src_mac[6]; - u8 src_mac_msk[6]; + u8 dst_mac[ETH_ALEN]; + u8 dst_mac_msk[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; + u8 src_mac_msk[ETH_ALEN]; u8 ether_type_enable; __be16 ether_type; __be16 vlan_id_msk; diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 262deac02c9..6d351473c29 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -34,6 +34,7 @@ #define MLX4_QP_H #include +#include #include @@ -143,7 +144,7 @@ struct mlx4_qp_path { u8 feup; u8 fvl_rx; u8 reserved4[2]; - u8 dmac[6]; + u8 dmac[ETH_ALEN]; }; enum { /* fl */ @@ -318,7 +319,7 @@ struct mlx4_wqe_datagram_seg { __be32 dqpn; __be32 qkey; __be16 vlan; - u8 mac[6]; + u8 mac[ETH_ALEN]; }; struct mlx4_wqe_lso_seg { diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 6e8215b1599..61a0da38d0c 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -6,6 +6,7 @@ #define __LINUX_MV643XX_ETH_H #include +#include #define MV643XX_ETH_SHARED_NAME "mv643xx_eth" #define MV643XX_ETH_NAME "mv643xx_eth_port" @@ -48,7 +49,7 @@ struct mv643xx_eth_platform_data { * Use this MAC address if it is valid, overriding the * address that is already in the hardware. */ - u8 mac_addr[6]; + u8 mac_addr[ETH_ALEN]; /* * If speed is 0, autonegotiation is enabled. diff --git a/include/linux/sh_eth.h b/include/linux/sh_eth.h index fc305713fc6..6205eeba392 100644 --- a/include/linux/sh_eth.h +++ b/include/linux/sh_eth.h @@ -2,6 +2,7 @@ #define __ASM_SH_ETH_H__ #include +#include enum {EDMAC_LITTLE_ENDIAN, EDMAC_BIG_ENDIAN}; enum { @@ -18,7 +19,7 @@ struct sh_eth_plat_data { phy_interface_t phy_interface; void (*set_mdio_gate)(void *addr); - unsigned char mac_addr[6]; + unsigned char mac_addr[ETH_ALEN]; unsigned no_ether_link:1; unsigned ether_link_active_low:1; unsigned needs_init:1; diff --git a/include/linux/smsc911x.h b/include/linux/smsc911x.h index 4dde70e7482..eec3efd19be 100644 --- a/include/linux/smsc911x.h +++ b/include/linux/smsc911x.h @@ -22,6 +22,7 @@ #define __LINUX_SMSC911X_H__ #include +#include /* platform_device configuration data, should be assigned to * the platform_device's dev.platform_data */ @@ -31,7 +32,7 @@ struct smsc911x_platform_config { unsigned int flags; unsigned int shift; phy_interface_t phy_interface; - unsigned char mac[6]; + unsigned char mac[ETH_ALEN]; }; /* Constants for platform_device irq polarity configuration */ diff --git a/include/linux/uwb/spec.h b/include/linux/uwb/spec.h index b52e44f1bd3..0df24bfcdb3 100644 --- a/include/linux/uwb/spec.h +++ b/include/linux/uwb/spec.h @@ -32,6 +32,7 @@ #include #include +#include #define i1480_FW 0x00000303 /* #define i1480_FW 0x00000302 */ @@ -130,7 +131,7 @@ enum { UWB_DRP_BACKOFF_WIN_MAX = 16 }; * it is also used to define headers sent down and up the wire/radio). */ struct uwb_mac_addr { - u8 data[6]; + u8 data[ETH_ALEN]; } __attribute__((packed)); @@ -568,7 +569,7 @@ struct uwb_rc_evt_confirm { /* Device Address Management event. [WHCI] section 3.1.3.2. */ struct uwb_rc_evt_dev_addr_mgmt { struct uwb_rceb rceb; - u8 baAddr[6]; + u8 baAddr[ETH_ALEN]; u8 bResultCode; } __attribute__((packed)); diff --git a/include/media/tveeprom.h b/include/media/tveeprom.h index 4a1191abd93..f7119ee3977 100644 --- a/include/media/tveeprom.h +++ b/include/media/tveeprom.h @@ -12,6 +12,8 @@ enum tveeprom_audio_processor { TVEEPROM_AUDPROC_OTHER, }; +#include + struct tveeprom { u32 has_radio; /* If has_ir == 0, then it is unknown what the IR capabilities are, @@ -40,7 +42,7 @@ struct tveeprom { u32 revision; u32 serial_number; char rev_str[5]; - u8 MAC_address[6]; + u8 MAC_address[ETH_ALEN]; }; void tveeprom_hauppauge_analog(struct i2c_client *c, struct tveeprom *tvee, diff --git a/include/net/irda/irlan_common.h b/include/net/irda/irlan_common.h index 0af8b8dfbc2..550c2d6ec7f 100644 --- a/include/net/irda/irlan_common.h +++ b/include/net/irda/irlan_common.h @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -161,7 +162,7 @@ struct irlan_provider_cb { int access_type; /* Access type */ __u16 send_arb_val; - __u8 mac_address[6]; /* Generated MAC address for peer device */ + __u8 mac_address[ETH_ALEN]; /* Generated MAC address for peer device */ }; /* -- cgit v1.2.3-70-g09d2 From d27fc78208b53ccdfd6a57d4ac44a459ca66806f Mon Sep 17 00:00:00 2001 From: "fan.du" Date: Fri, 2 Aug 2013 10:45:13 +0800 Subject: sctp: Don't lookup dst if transport dst is still valid When sctp sits on IPv6, sctp_transport_dst_check pass cookie as ZERO, as a result ip6_dst_check always fail out. This behaviour makes transport->dst useless, because every sctp_packet_transmit must look for valid dst. Add a dst_cookie into sctp_transport, and set the cookie whenever we get new dst for sctp_transport. So dst validness could be checked against it. Since I have split genid for IPv4 and IPv6, also delete/add IPv6 address will also bump IPv6 genid. So issues we discussed in: http://marc.info/?l=linux-netdev&m=137404469219410&w=4 have all been sloved for this patch. Signed-off-by: Fan Du Acked-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 2 +- include/net/sctp/structs.h | 1 + net/sctp/ipv6.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 554cf88605f..cb28df9c278 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -613,7 +613,7 @@ static inline void sctp_v4_map_v6(union sctp_addr *addr) */ static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t) { - if (t->dst && !dst_check(t->dst, 0)) { + if (t->dst && !dst_check(t->dst, t->dst_cookie)) { dst_release(t->dst); t->dst = NULL; } diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 75c4c16601b..c0f4e29eedd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -946,6 +946,7 @@ struct sctp_transport { __u64 hb_nonce; struct rcu_head rcu; + u32 dst_cookie; }; struct sctp_transport *sctp_transport_new(struct net *, const union sctp_addr *, diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 85d688f59e6..5a9402e3c0c 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -351,7 +351,7 @@ out: rt = (struct rt6_info *)dst; t->dst = dst; - + t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; pr_debug("rt6_dst:%pI6 rt6_src:%pI6\n", &rt->rt6i_dst.addr, &fl6->saddr); } else { -- cgit v1.2.3-70-g09d2 From 6ef94cfafba159d6b1a902ccb3349ac6a34ff6ad Mon Sep 17 00:00:00 2001 From: Stefan Tomanek Date: Fri, 2 Aug 2013 17:19:56 +0200 Subject: fib_rules: add route suppression based on ifgroup This change adds the ability to suppress a routing decision based upon the interface group the selected interface belongs to. This allows it to exclude specific devices from a routing decision. Signed-off-by: Stefan Tomanek Signed-off-by: David S. Miller --- include/net/fib_rules.h | 2 ++ include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 10 ++++++++++ net/ipv4/fib_rules.c | 23 +++++++++++++++++------ net/ipv6/fib6_rules.c | 16 +++++++++++++--- 5 files changed, 43 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 2f286dce925..d13c461b4b5 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -18,6 +18,7 @@ struct fib_rule { u32 pref; u32 flags; u32 table; + int suppress_ifgroup; u8 table_prefixlen_min; u8 action; u32 target; @@ -84,6 +85,7 @@ struct fib_rules_ops { [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ [FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \ + [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 } static inline void fib_rule_get(struct fib_rule *rule) diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 59cd31b3455..63e31166e85 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -44,7 +44,7 @@ enum { FRA_FWMARK, /* mark */ FRA_FLOW, /* flow/class id */ FRA_UNUSED6, - FRA_UNUSED7, + FRA_SUPPRESS_IFGROUP, FRA_TABLE_PREFIXLEN_MIN, FRA_TABLE, /* Extended table id */ FRA_FWMASK, /* mask for netfilter mark */ diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 2ef5040c99c..5040a61bf28 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -343,6 +343,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_TABLE_PREFIXLEN_MIN]) rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]); + if (tb[FRA_SUPPRESS_IFGROUP]) + rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); @@ -529,6 +532,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */ + + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4); /* FRA_FWMASK */ @@ -588,6 +592,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target))) goto nla_put_failure; + + if (rule->suppress_ifgroup != -1) { + if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) + goto nla_put_failure; + } + if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 9f2906679d1..b78fd28970c 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -103,16 +103,27 @@ errout: static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { + struct fib_result *result = (struct fib_result *) arg->result; + struct net_device *dev = result->fi->fib_dev; + /* do not accept result if the route does * not meet the required prefix length */ - struct fib_result *result = (struct fib_result *) arg->result; - if (result->prefixlen < rule->table_prefixlen_min) { - if (!(arg->flags & FIB_LOOKUP_NOREF)) - fib_info_put(result->fi); - return true; - } + if (result->prefixlen < rule->table_prefixlen_min) + goto suppress_route; + + /* do not accept result if the route uses a device + * belonging to a forbidden interface group + */ + if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) + goto suppress_route; + return false; + +suppress_route: + if (!(arg->flags & FIB_LOOKUP_NOREF)) + fib_info_put(result->fi); + return true; } static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 554a4fbabfb..36283267e2f 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -122,14 +122,24 @@ out: static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { struct rt6_info *rt = (struct rt6_info *) arg->result; + struct net_device *dev = rt->rt6i_idev->dev; /* do not accept result if the route does * not meet the required prefix length */ - if (rt->rt6i_dst.plen < rule->table_prefixlen_min) { + if (rt->rt6i_dst.plen < rule->table_prefixlen_min) + goto suppress_route; + + /* do not accept result if the route uses a device + * belonging to a forbidden interface group + */ + if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) + goto suppress_route; + + return false; + +suppress_route: ip6_rt_put(rt); return true; - } - return false; } static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) -- cgit v1.2.3-70-g09d2 From 73f5698e77219bfc3ea1903759fe8e20ab5b285e Mon Sep 17 00:00:00 2001 From: Stefan Tomanek Date: Sat, 3 Aug 2013 14:14:43 +0200 Subject: fib_rules: fix suppressor names and default values This change brings the suppressor attribute names into line; it also changes the data types to provide a more consistent interface. While -1 indicates that the suppressor is not enabled, values >= 0 for suppress_prefixlen or suppress_ifgroup reject routing decisions violating the constraint. This changes the previously presented behaviour of suppress_prefixlen, where a prefix length _less_ than the attribute value was rejected. After this change, a prefix length less than *or* equal to the value is considered a violation of the rule constraint. It also changes the default values for default and newly added rules (disabling any suppression for those). Signed-off-by: Stefan Tomanek Signed-off-by: David S. Miller --- include/net/fib_rules.h | 4 ++-- include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 15 +++++++++++---- net/ipv4/fib_rules.c | 2 +- net/ipv6/fib6_rules.c | 2 +- 5 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index d13c461b4b5..9d0fcbaa9cb 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -19,7 +19,7 @@ struct fib_rule { u32 flags; u32 table; int suppress_ifgroup; - u8 table_prefixlen_min; + int suppress_prefixlen; u8 action; u32 target; struct fib_rule __rcu *ctarget; @@ -84,7 +84,7 @@ struct fib_rules_ops { [FRA_FWMARK] = { .type = NLA_U32 }, \ [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ - [FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \ + [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 } diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 63e31166e85..2b82d7e3097 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -45,7 +45,7 @@ enum { FRA_FLOW, /* flow/class id */ FRA_UNUSED6, FRA_SUPPRESS_IFGROUP, - FRA_TABLE_PREFIXLEN_MIN, + FRA_SUPPRESS_PREFIXLEN, FRA_TABLE, /* Extended table id */ FRA_FWMASK, /* mask for netfilter mark */ FRA_OIFNAME, diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 5040a61bf28..2e654138433 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -33,6 +33,9 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->flags = flags; r->fr_net = hold_net(ops->fro_net); + r->suppress_prefixlen = -1; + r->suppress_ifgroup = -1; + /* The lock is not required here, the list in unreacheable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); @@ -340,11 +343,15 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); - if (tb[FRA_TABLE_PREFIXLEN_MIN]) - rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]); + if (tb[FRA_SUPPRESS_PREFIXLEN]) + rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); + else + rule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + else + rule->suppress_ifgroup = -1; if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); @@ -531,7 +538,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ - + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */ + + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4); /* FRA_FWMASK */ @@ -558,7 +565,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->table = rule->table; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; - if (nla_put_u8(skb, FRA_TABLE_PREFIXLEN_MIN, rule->table_prefixlen_min)) + if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b78fd28970c..523be38e37d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -109,7 +109,7 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg /* do not accept result if the route does * not meet the required prefix length */ - if (result->prefixlen < rule->table_prefixlen_min) + if (result->prefixlen <= rule->suppress_prefixlen) goto suppress_route; /* do not accept result if the route uses a device diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 36283267e2f..a6c58ce43d3 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -126,7 +126,7 @@ static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg /* do not accept result if the route does * not meet the required prefix length */ - if (rt->rt6i_dst.plen < rule->table_prefixlen_min) + if (rt->rt6i_dst.plen <= rule->suppress_prefixlen) goto suppress_route; /* do not accept result if the route uses a device -- cgit v1.2.3-70-g09d2 From fba3679d34511c42bf452e89dda457a1219eb43a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 3 Aug 2013 11:50:35 -0700 Subject: fib_rules: reorder struct fib_rules fields Move refcnt, pref, suppress_ifgroup, suppress_prefixlen out of first cache line, as they are not used in fast path. Make sure ctarget & fr_net are in first cache line. (Assuming 64 bit arches and 64 bytes cache lines) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/fib_rules.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 9d0fcbaa9cb..4b2b557fb0e 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -10,23 +10,25 @@ struct fib_rule { struct list_head list; - atomic_t refcnt; int iifindex; int oifindex; u32 mark; u32 mark_mask; - u32 pref; u32 flags; u32 table; - int suppress_ifgroup; - int suppress_prefixlen; u8 action; + /* 3 bytes hole, try to use */ u32 target; struct fib_rule __rcu *ctarget; + struct net *fr_net; + + atomic_t refcnt; + u32 pref; + int suppress_ifgroup; + int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; struct rcu_head rcu; - struct net * fr_net; }; struct fib_lookup_arg { -- cgit v1.2.3-70-g09d2 From e473fcb472574de978e47f980aeca510020a1286 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 26 Jun 2013 23:56:58 +0200 Subject: xfrm: constify mark argument of xfrm_find_acq() The mark argument is read only, so constify it. Also make dummy_mark in af_key const -- only used as dummy argument for this very function. Signed-off-by: Mathias Krause Cc: "David S. Miller" Cc: Herbert Xu Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/key/af_key.c | 2 +- net/xfrm/xfrm_state.c | 12 +++++++----- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 94ce082b29d..89d3d8ae204 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1548,7 +1548,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir, u32 int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info); u32 xfrm_get_acqseq(void); extern int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); -struct xfrm_state *xfrm_find_acq(struct net *net, struct xfrm_mark *mark, +struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, diff --git a/net/key/af_key.c b/net/key/af_key.c index ab8bd2cabfa..4089a210e85 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -45,7 +45,7 @@ struct netns_pfkey { static DEFINE_MUTEX(pfkey_mutex); #define DUMMY_MARK 0 -static struct xfrm_mark dummy_mark = {0, 0}; +static const struct xfrm_mark dummy_mark = {0, 0}; struct pfkey_sock { /* struct sock must be the first member of struct pfkey_sock */ struct sock sk; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 78f66fa9244..b2cd806c0be 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -990,11 +990,13 @@ void xfrm_state_insert(struct xfrm_state *x) EXPORT_SYMBOL(xfrm_state_insert); /* xfrm_state_lock is held */ -static struct xfrm_state *__find_acq_core(struct net *net, struct xfrm_mark *m, +static struct xfrm_state *__find_acq_core(struct net *net, + const struct xfrm_mark *m, unsigned short family, u8 mode, u32 reqid, u8 proto, const xfrm_address_t *daddr, - const xfrm_address_t *saddr, int create) + const xfrm_address_t *saddr, + int create) { unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family); struct xfrm_state *x; @@ -1399,9 +1401,9 @@ xfrm_state_lookup_byaddr(struct net *net, u32 mark, EXPORT_SYMBOL(xfrm_state_lookup_byaddr); struct xfrm_state * -xfrm_find_acq(struct net *net, struct xfrm_mark *mark, u8 mode, u32 reqid, u8 proto, - const xfrm_address_t *daddr, const xfrm_address_t *saddr, - int create, unsigned short family) +xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, + u8 proto, const xfrm_address_t *daddr, + const xfrm_address_t *saddr, int create, unsigned short family) { struct xfrm_state *x; -- cgit v1.2.3-70-g09d2 From 5a139296f872d438f17f7219411321b603c1622b Mon Sep 17 00:00:00 2001 From: "fan.du" Date: Mon, 5 Aug 2013 17:13:03 +0800 Subject: sctp: Pack dst_cookie into 1st cacheline hole for 64bit host As dst_cookie is used in fast path sctp_transport_dst_check. Before: struct sctp_transport { struct list_head transports; /* 0 16 */ atomic_t refcnt; /* 16 4 */ __u32 dead:1; /* 20:31 4 */ __u32 rto_pending:1; /* 20:30 4 */ __u32 hb_sent:1; /* 20:29 4 */ __u32 pmtu_pending:1; /* 20:28 4 */ /* XXX 28 bits hole, try to pack */ __u32 sack_generation; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ struct flowi fl; /* 32 64 */ /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ union sctp_addr ipaddr; /* 96 28 */ After: struct sctp_transport { struct list_head transports; /* 0 16 */ atomic_t refcnt; /* 16 4 */ __u32 dead:1; /* 20:31 4 */ __u32 rto_pending:1; /* 20:30 4 */ __u32 hb_sent:1; /* 20:29 4 */ __u32 pmtu_pending:1; /* 20:28 4 */ /* XXX 28 bits hole, try to pack */ __u32 sack_generation; /* 24 4 */ u32 dst_cookie; /* 28 4 */ struct flowi fl; /* 32 64 */ /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ union sctp_addr ipaddr; /* 96 28 */ Signed-off-by: Fan Du Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index c0f4e29eedd..d9c93a77b1a 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -782,6 +782,7 @@ struct sctp_transport { /* Has this transport moved the ctsn since we last sacked */ __u32 sack_generation; + u32 dst_cookie; struct flowi fl; @@ -946,7 +947,6 @@ struct sctp_transport { __u64 hb_nonce; struct rcu_head rcu; - u32 dst_cookie; }; struct sctp_transport *sctp_transport_new(struct net *, const union sctp_addr *, -- cgit v1.2.3-70-g09d2 From e7f1935c11269bc53cd52425b1025657adddb839 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 25 Jul 2013 21:45:17 +0200 Subject: wireless: make TU conversion macros available A few places in the code (mac80211 and iwlmvm) use the same TU_TO_JIFFIES() macro and could use TU_TO_EXP_TIME() that mac80211 has. Make these available to everyone and use them. Signed-off-by: Johannes Berg --- drivers/net/wireless/iwlwifi/mvm/time-event.c | 7 ++----- include/linux/ieee80211.h | 4 ++++ net/mac80211/ieee80211_i.h | 3 --- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/iwlwifi/mvm/time-event.c b/drivers/net/wireless/iwlwifi/mvm/time-event.c index ad9bbca9921..9f100363b17 100644 --- a/drivers/net/wireless/iwlwifi/mvm/time-event.c +++ b/drivers/net/wireless/iwlwifi/mvm/time-event.c @@ -73,7 +73,6 @@ #include "iwl-prph.h" /* A TimeUnit is 1024 microsecond */ -#define TU_TO_JIFFIES(_tu) (usecs_to_jiffies((_tu) * 1024)) #define MSEC_TO_TU(_msec) (_msec*1000/1024) /* @@ -191,8 +190,7 @@ static void iwl_mvm_te_handle_notif(struct iwl_mvm *mvm, iwl_mvm_te_clear_data(mvm, te_data); } else if (le32_to_cpu(notif->action) & TE_NOTIF_HOST_EVENT_START) { te_data->running = true; - te_data->end_jiffies = jiffies + - TU_TO_JIFFIES(te_data->duration); + te_data->end_jiffies = TU_TO_EXP_TIME(te_data->duration); if (te_data->vif->type == NL80211_IFTYPE_P2P_DEVICE) { set_bit(IWL_MVM_STATUS_ROC_RUNNING, &mvm->status); @@ -329,8 +327,7 @@ void iwl_mvm_protect_session(struct iwl_mvm *mvm, lockdep_assert_held(&mvm->mutex); if (te_data->running && - time_after(te_data->end_jiffies, - jiffies + TU_TO_JIFFIES(min_duration))) { + time_after(te_data->end_jiffies, TU_TO_EXP_TIME(min_duration))) { IWL_DEBUG_TE(mvm, "We have enough time in the current TE: %u\n", jiffies_to_msecs(te_data->end_jiffies - jiffies)); return; diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b3ce299782a..23a8877f4de 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2288,4 +2288,8 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, return !!(tim->virtual_map[index] & mask); } +/* convert time units */ +#define TU_TO_JIFFIES(x) (usecs_to_jiffies((x) * 1024)) +#define TU_TO_EXP_TIME(x) (jiffies + TU_TO_JIFFIES(x)) + #endif /* LINUX_IEEE80211_H */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index e94c84050e9..b6186517ec5 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -53,9 +53,6 @@ struct ieee80211_local; * increased memory use (about 2 kB of RAM per entry). */ #define IEEE80211_FRAGMENT_MAX 4 -#define TU_TO_JIFFIES(x) (usecs_to_jiffies((x) * 1024)) -#define TU_TO_EXP_TIME(x) (jiffies + TU_TO_JIFFIES(x)) - /* power level hasn't been configured (or set to automatic) */ #define IEEE80211_UNSET_POWER_LEVEL INT_MIN -- cgit v1.2.3-70-g09d2 From 6261d983f226f0a6a8d4d32b57a032bc23a5ebb6 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 5 Aug 2013 22:51:37 -0700 Subject: ip_tunnel: embed hash list head The IP tunnel hash heads can be embedded in the per-net structure since it is a fixed size. Reduce the size so that the total structure fits in a page size. The original size was overly large, even NETDEV_HASHBITS is only 8 bits! Also, add some white space for readability. Signed-off-by: Stephen Hemminger Acked-by: Pravin B Shelar . Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 4 ++-- net/ipv4/ip_tunnel.c | 13 ++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 781b3cf86a2..c6acd9f8f87 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -86,12 +86,12 @@ struct tnl_ptk_info { #define PACKET_RCVD 0 #define PACKET_REJECT 1 -#define IP_TNL_HASH_BITS 10 +#define IP_TNL_HASH_BITS 7 #define IP_TNL_HASH_SIZE (1 << IP_TNL_HASH_BITS) struct ip_tunnel_net { - struct hlist_head *tunnels; struct net_device *fb_tunnel_dev; + struct hlist_head tunnels[IP_TNL_HASH_SIZE]; }; #ifdef CONFIG_INET diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index ca1cb2d5f6e..9fdf8a6d95f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -838,15 +838,16 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); struct ip_tunnel_parm parms; + unsigned int i; - itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL); - if (!itn->tunnels) - return -ENOMEM; + for (i = 0; i < IP_TNL_HASH_SIZE; i++) + INIT_HLIST_HEAD(&itn->tunnels[i]); if (!ops) { itn->fb_tunnel_dev = NULL; return 0; } + memset(&parms, 0, sizeof(parms)); if (devname) strlcpy(parms.name, devname, IFNAMSIZ); @@ -854,10 +855,9 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, rtnl_lock(); itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); rtnl_unlock(); - if (IS_ERR(itn->fb_tunnel_dev)) { - kfree(itn->tunnels); + + if (IS_ERR(itn->fb_tunnel_dev)) return PTR_ERR(itn->fb_tunnel_dev); - } return 0; } @@ -887,7 +887,6 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn) ip_tunnel_destroy(itn, &list); unregister_netdevice_many(&list); rtnl_unlock(); - kfree(itn->tunnels); } EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); -- cgit v1.2.3-70-g09d2 From b4bf07771faaf959b0a916d35b1b930c030e30a8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 6 Aug 2013 17:45:03 +0800 Subject: net: move iov_pages() to net/core/iovec.c To let it be reused and reduce code duplication. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 23 ----------------------- drivers/net/tun.c | 23 ----------------------- include/linux/socket.h | 2 ++ net/core/iovec.c | 24 ++++++++++++++++++++++++ 4 files changed, 26 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index a98fb0ed6ae..dfec20df17b 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -698,29 +698,6 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb, return 0; } -static unsigned long iov_pages(const struct iovec *iv, int offset, - unsigned long nr_segs) -{ - unsigned long seg, base; - int pages = 0, len, size; - - while (nr_segs && (offset >= iv->iov_len)) { - offset -= iv->iov_len; - ++iv; - --nr_segs; - } - - for (seg = 0; seg < nr_segs; seg++) { - base = (unsigned long)iv[seg].iov_base + offset; - len = iv[seg].iov_len - offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - pages += size; - offset = 0; - } - - return pages; -} - /* Get packet from user space buffer */ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, const struct iovec *iv, unsigned long total_len, diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 5dce262f538..18527ef0cc7 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1041,29 +1041,6 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, return 0; } -static unsigned long iov_pages(const struct iovec *iv, int offset, - unsigned long nr_segs) -{ - unsigned long seg, base; - int pages = 0, len, size; - - while (nr_segs && (offset >= iv->iov_len)) { - offset -= iv->iov_len; - ++iv; - --nr_segs; - } - - for (seg = 0; seg < nr_segs; seg++) { - base = (unsigned long)iv[seg].iov_base + offset; - len = iv[seg].iov_len - offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - pages += size; - offset = 0; - } - - return pages; -} - /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, const struct iovec *iv, diff --git a/include/linux/socket.h b/include/linux/socket.h index 230c04bda3e..445ef7519dc 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -313,6 +313,8 @@ extern int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, unsigned int len, __wsum *csump); +extern unsigned long iov_pages(const struct iovec *iov, int offset, + unsigned long nr_segs); extern int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode); extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, diff --git a/net/core/iovec.c b/net/core/iovec.c index de178e46268..b77eeecc001 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -212,3 +212,27 @@ out_fault: goto out; } EXPORT_SYMBOL(csum_partial_copy_fromiovecend); + +unsigned long iov_pages(const struct iovec *iov, int offset, + unsigned long nr_segs) +{ + unsigned long seg, base; + int pages = 0, len, size; + + while (nr_segs && (offset >= iov->iov_len)) { + offset -= iov->iov_len; + ++iov; + --nr_segs; + } + + for (seg = 0; seg < nr_segs; seg++) { + base = (unsigned long)iov[seg].iov_base + offset; + len = iov[seg].iov_len - offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + pages += size; + offset = 0; + } + + return pages; +} +EXPORT_SYMBOL(iov_pages); -- cgit v1.2.3-70-g09d2 From c3bdeb5c7cc073ccf5ff9624642022a8613a956e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 6 Aug 2013 17:45:04 +0800 Subject: net: move zerocopy_sg_from_iovec() to net/core/datagram.c To let it be reused and reduce code duplication. Also document this function. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 80 ------------------------------------------- drivers/net/tun.c | 80 ------------------------------------------- include/linux/skbuff.h | 4 +++ net/core/datagram.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+), 160 deletions(-) (limited to 'include') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index dfec20df17b..182364abfa3 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -536,86 +536,6 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad, return skb; } -/* set skb frags from iovec, this can move to core network code for reuse */ -static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, - int offset, size_t count) -{ - int len = iov_length(from, count) - offset; - int copy = skb_headlen(skb); - int size, offset1 = 0; - int i = 0; - - /* Skip over from offset */ - while (count && (offset >= from->iov_len)) { - offset -= from->iov_len; - ++from; - --count; - } - - /* copy up to skb headlen */ - while (count && (copy > 0)) { - size = min_t(unsigned int, copy, from->iov_len - offset); - if (copy_from_user(skb->data + offset1, from->iov_base + offset, - size)) - return -EFAULT; - if (copy > size) { - ++from; - --count; - offset = 0; - } else - offset += size; - copy -= size; - offset1 += size; - } - - if (len == offset1) - return 0; - - while (count--) { - struct page *page[MAX_SKB_FRAGS]; - int num_pages; - unsigned long base; - unsigned long truesize; - - len = from->iov_len - offset; - if (!len) { - offset = 0; - ++from; - continue; - } - base = (unsigned long)from->iov_base + offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - if (i + size > MAX_SKB_FRAGS) - return -EMSGSIZE; - num_pages = get_user_pages_fast(base, size, 0, &page[i]); - if (num_pages != size) { - int j; - - for (j = 0; j < num_pages; j++) - put_page(page[i + j]); - return -EFAULT; - } - truesize = size * PAGE_SIZE; - skb->data_len += len; - skb->len += len; - skb->truesize += truesize; - atomic_add(truesize, &skb->sk->sk_wmem_alloc); - while (len) { - int off = base & ~PAGE_MASK; - int size = min_t(int, len, PAGE_SIZE - off); - __skb_fill_page_desc(skb, i, page[i], off, size); - skb_shinfo(skb)->nr_frags++; - /* increase sk_wmem_alloc */ - base += size; - len -= size; - i++; - } - offset = 0; - ++from; - } - return 0; -} - /* * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should * be shared with the tun/tap driver. diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 18527ef0cc7..b1630479d4a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -961,86 +961,6 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, return skb; } -/* set skb frags from iovec, this can move to core network code for reuse */ -static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, - int offset, size_t count) -{ - int len = iov_length(from, count) - offset; - int copy = skb_headlen(skb); - int size, offset1 = 0; - int i = 0; - - /* Skip over from offset */ - while (count && (offset >= from->iov_len)) { - offset -= from->iov_len; - ++from; - --count; - } - - /* copy up to skb headlen */ - while (count && (copy > 0)) { - size = min_t(unsigned int, copy, from->iov_len - offset); - if (copy_from_user(skb->data + offset1, from->iov_base + offset, - size)) - return -EFAULT; - if (copy > size) { - ++from; - --count; - offset = 0; - } else - offset += size; - copy -= size; - offset1 += size; - } - - if (len == offset1) - return 0; - - while (count--) { - struct page *page[MAX_SKB_FRAGS]; - int num_pages; - unsigned long base; - unsigned long truesize; - - len = from->iov_len - offset; - if (!len) { - offset = 0; - ++from; - continue; - } - base = (unsigned long)from->iov_base + offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - if (i + size > MAX_SKB_FRAGS) - return -EMSGSIZE; - num_pages = get_user_pages_fast(base, size, 0, &page[i]); - if (num_pages != size) { - int j; - - for (j = 0; j < num_pages; j++) - put_page(page[i + j]); - return -EFAULT; - } - truesize = size * PAGE_SIZE; - skb->data_len += len; - skb->len += len; - skb->truesize += truesize; - atomic_add(truesize, &skb->sk->sk_wmem_alloc); - while (len) { - int off = base & ~PAGE_MASK; - int size = min_t(int, len, PAGE_SIZE - off); - __skb_fill_page_desc(skb, i, page[i], off, size); - skb_shinfo(skb)->nr_frags++; - /* increase sk_wmem_alloc */ - base += size; - len -= size; - i++; - } - offset = 0; - ++from; - } - return 0; -} - /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, const struct iovec *iv, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ae46475d456..5ac96f31d54 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2359,6 +2359,10 @@ extern int skb_copy_datagram_from_iovec(struct sk_buff *skb, const struct iovec *from, int from_offset, int len); +extern int zerocopy_sg_from_iovec(struct sk_buff *skb, + const struct iovec *frm, + int offset, + size_t count); extern int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, const struct iovec *to, diff --git a/net/core/datagram.c b/net/core/datagram.c index 8ab48cd8955..f987faef21c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -573,6 +573,99 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iovec); +/** + * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec + * @skb: buffer to copy + * @from: io vector to copy to + * @offset: offset in the io vector to start copying from + * @count: amount of vectors to copy to buffer from + * + * The function will first copy up to headlen, and then pin the userspace + * pages and build frags through them. + * + * Returns 0, -EFAULT or -EMSGSIZE. + * Note: the iovec is not modified during the copy + */ +int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, + int offset, size_t count) +{ + int len = iov_length(from, count) - offset; + int copy = skb_headlen(skb); + int size, offset1 = 0; + int i = 0; + + /* Skip over from offset */ + while (count && (offset >= from->iov_len)) { + offset -= from->iov_len; + ++from; + --count; + } + + /* copy up to skb headlen */ + while (count && (copy > 0)) { + size = min_t(unsigned int, copy, from->iov_len - offset); + if (copy_from_user(skb->data + offset1, from->iov_base + offset, + size)) + return -EFAULT; + if (copy > size) { + ++from; + --count; + offset = 0; + } else + offset += size; + copy -= size; + offset1 += size; + } + + if (len == offset1) + return 0; + + while (count--) { + struct page *page[MAX_SKB_FRAGS]; + int num_pages; + unsigned long base; + unsigned long truesize; + + len = from->iov_len - offset; + if (!len) { + offset = 0; + ++from; + continue; + } + base = (unsigned long)from->iov_base + offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + if (i + size > MAX_SKB_FRAGS) + return -EMSGSIZE; + num_pages = get_user_pages_fast(base, size, 0, &page[i]); + if (num_pages != size) { + int j; + + for (j = 0; j < num_pages; j++) + put_page(page[i + j]); + return -EFAULT; + } + truesize = size * PAGE_SIZE; + skb->data_len += len; + skb->len += len; + skb->truesize += truesize; + atomic_add(truesize, &skb->sk->sk_wmem_alloc); + while (len) { + int off = base & ~PAGE_MASK; + int size = min_t(int, len, PAGE_SIZE - off); + __skb_fill_page_desc(skb, i, page[i], off, size); + skb_shinfo(skb)->nr_frags++; + /* increase sk_wmem_alloc */ + base += size; + len -= size; + i++; + } + offset = 0; + ++from; + } + return 0; +} +EXPORT_SYMBOL(zerocopy_sg_from_iovec); + static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 __user *to, int len, __wsum *csump) -- cgit v1.2.3-70-g09d2 From 1f07d03e2069df2ecd82301936b598a3b257c6d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 6 Aug 2013 03:32:11 -0700 Subject: net: add SNMP counters tracking incoming ECN bits With GRO/LRO processing, there is a problem because Ip[6]InReceives SNMP counters do not count the number of frames, but number of aggregated segments. Its probably too late to change this now. This patch adds four new counters, tracking number of frames, regardless of LRO/GRO, and on a per ECN status basis, for IPv4 and IPv6. Ip[6]NoECTPkts : Number of packets received with NOECT Ip[6]ECT1Pkts : Number of packets received with ECT(1) Ip[6]ECT0Pkts : Number of packets received with ECT(0) Ip[6]CEPkts : Number of packets received with Congestion Experienced lph37:~# nstat | egrep "Pkts|InReceive" IpInReceives 1634137 0.0 Ip6InReceives 3714107 0.0 Ip6InNoECTPkts 19205 0.0 Ip6InECT0Pkts 52651828 0.0 IpExtInNoECTPkts 33630 0.0 IpExtInECT0Pkts 15581379 0.0 IpExtInCEPkts 6 0.0 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 4 ++++ net/ipv4/ip_input.c | 8 ++++++++ net/ipv4/proc.c | 7 ++++++- net/ipv6/ip6_input.c | 6 +++++- net/ipv6/proc.c | 4 ++++ 5 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index af0a674cc67..60601d28da7 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -51,6 +51,10 @@ enum IPSTATS_MIB_INBCASTOCTETS, /* InBcastOctets */ IPSTATS_MIB_OUTBCASTOCTETS, /* OutBcastOctets */ IPSTATS_MIB_CSUMERRORS, /* InCsumErrors */ + IPSTATS_MIB_NOECTPKTS, /* InNoECTPkts */ + IPSTATS_MIB_ECT1PKTS, /* InECT1Pkts */ + IPSTATS_MIB_ECT0PKTS, /* InECT0Pkts */ + IPSTATS_MIB_CEPKTS, /* InCEPkts */ __IPSTATS_MIB_MAX }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 15e3e683ade..054a3e97d82 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -141,6 +141,7 @@ #include #include #include +#include #include #include #include @@ -410,6 +411,13 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; + BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); + BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); + BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); + IP_ADD_STATS_BH(dev_net(dev), + IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); + if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6577a1149a4..5f5fa612647 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -111,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_SENTINEL }; -/* Following RFC4293 items are displayed in /proc/net/netstat */ +/* Following items are displayed in /proc/net/netstat */ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), @@ -125,7 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), + /* Non RFC4293 fields */ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), + SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS), + SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), + SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), + SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 2bab2aa5974..302d6fb1ff2 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -44,7 +44,7 @@ #include #include #include - +#include int ip6_rcv_finish(struct sk_buff *skb) @@ -109,6 +109,10 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; + IP6_ADD_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_NOECTPKTS + + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); /* * RFC4291 2.5.3 * A packet received on an interface with a destination address diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 51c3285b5d9..091d066a57b 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -91,6 +91,10 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), /* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */ + SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS), + SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS), + SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), + SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_SENTINEL }; -- cgit v1.2.3-70-g09d2 From c655bc6896b94ee0223393f26155c6daf1e2d148 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 29 Jul 2013 15:41:55 +0200 Subject: netfilter: nf_conntrack: don't send destroy events from iterator Let nf_ct_delete handle delivery of the DESTROY event. Based on earlier patch from Pablo Neira. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 4 +++- net/ipv4/netfilter/ipt_MASQUERADE.c | 2 +- net/ipv6/netfilter/ip6t_MASQUERADE.c | 2 +- net/netfilter/nf_conntrack_core.c | 36 ++++-------------------------------- net/netfilter/nf_conntrack_proto.c | 4 ++-- net/netfilter/nf_nat_core.c | 6 +++--- 6 files changed, 14 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index e5eb8b62538..0c1288a50e8 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -248,7 +248,9 @@ extern void nf_ct_untracked_status_or(unsigned long bits); /* Iterate over all conntracks: if iter returns true, it's deleted. */ extern void -nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data); +nf_ct_iterate_cleanup(struct net *net, + int (*iter)(struct nf_conn *i, void *data), + void *data, u32 portid, int report); extern void nf_conntrack_free(struct nf_conn *ct); extern struct nf_conn * nf_conntrack_alloc(struct net *net, u16 zone, diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 30e4de94056..00352ce0f0d 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -118,7 +118,7 @@ static int masq_device_event(struct notifier_block *this, NF_CT_ASSERT(dev->ifindex != 0); nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex); + (void *)(long)dev->ifindex, 0, 0); } return NOTIFY_DONE; diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 47bff610751..3e4e92d5e15 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -76,7 +76,7 @@ static int masq_device_event(struct notifier_block *this, if (event == NETDEV_DOWN) nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex); + (void *)(long)dev->ifindex, 0, 0); return NOTIFY_DONE; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0934611ff9f..da6f1787a10 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1246,7 +1246,7 @@ found: void nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), - void *data) + void *data, u32 portid, int report) { struct nf_conn *ct; unsigned int bucket = 0; @@ -1254,7 +1254,7 @@ void nf_ct_iterate_cleanup(struct net *net, while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) - death_by_timeout((unsigned long)ct); + nf_ct_delete(ct, portid, report); /* ... else the timer will get him soon. */ @@ -1263,30 +1263,6 @@ void nf_ct_iterate_cleanup(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); -struct __nf_ct_flush_report { - u32 portid; - int report; -}; - -static int kill_report(struct nf_conn *i, void *data) -{ - struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data; - struct nf_conn_tstamp *tstamp; - - tstamp = nf_conn_tstamp_find(i); - if (tstamp && tstamp->stop == 0) - tstamp->stop = ktime_to_ns(ktime_get_real()); - - /* If we fail to deliver the event, death_by_timeout() will retry */ - if (nf_conntrack_event_report(IPCT_DESTROY, i, - fr->portid, fr->report) < 0) - return 1; - - /* Avoid the delivery of the destroy event in death_by_timeout(). */ - set_bit(IPS_DYING_BIT, &i->status); - return 1; -} - static int kill_all(struct nf_conn *i, void *data) { return 1; @@ -1304,11 +1280,7 @@ EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); void nf_conntrack_flush_report(struct net *net, u32 portid, int report) { - struct __nf_ct_flush_report fr = { - .portid = portid, - .report = report, - }; - nf_ct_iterate_cleanup(net, kill_report, &fr); + nf_ct_iterate_cleanup(net, kill_all, NULL, portid, report); } EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); @@ -1389,7 +1361,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { - nf_ct_iterate_cleanup(net, kill_all, NULL); + nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0); nf_ct_release_dying_list(net); if (atomic_read(&net->ct.count) != 0) busy = 1; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 0ab9636ac57..ce3004156ee 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -281,7 +281,7 @@ void nf_ct_l3proto_pernet_unregister(struct net *net, nf_ct_l3proto_unregister_sysctl(net, proto); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(net, kill_l3proto, proto); + nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0); } EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister); @@ -476,7 +476,7 @@ void nf_ct_l4proto_pernet_unregister(struct net *net, nf_ct_l4proto_unregister_sysctl(net, pn, l4proto); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(net, kill_l4proto, l4proto); + nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0); } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 038eee5c8f8..6ff808375b5 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -497,7 +497,7 @@ static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) rtnl_lock(); for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean); + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0); rtnl_unlock(); } @@ -511,7 +511,7 @@ static void nf_nat_l3proto_clean(u8 l3proto) rtnl_lock(); for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean); + nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0); rtnl_unlock(); } @@ -749,7 +749,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; - nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean); + nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean, 0, 0); synchronize_rcu(); nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } -- cgit v1.2.3-70-g09d2 From 5c6fe01c1fe3140a8fb088d2a9c32548b731c35e Mon Sep 17 00:00:00 2001 From: William Manley Date: Tue, 6 Aug 2013 19:03:14 +0100 Subject: net: igmp: Don't flush routing cache when force_igmp_version is modified The procfs knob /proc/sys/net/ipv4/conf/*/force_igmp_version allows the IGMP protocol version to use to be explicitly set. As a side effect this caused the routing cache to be flushed as it was declared as a DEVINET_SYSCTL_FLUSHING_ENTRY. Flushing is unnecessary and this patch makes it so flushing does not occur. Requested by Hannes Frederic Sowa as he was reviewing other patches adding procfs entries. Suggested-by: Hannes Frederic Sowa Signed-off-by: William Manley Acked-by: Hannes Frederic Sowa Acked-by: Benjamin LaHaise Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 2 +- net/ipv4/devinet.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index b99cd23f347..0a4a6cb3533 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -27,9 +27,9 @@ enum IPV4_DEVCONF_TAG, IPV4_DEVCONF_ARPFILTER, IPV4_DEVCONF_MEDIUM_ID, + IPV4_DEVCONF_FORCE_IGMP_VERSION, IPV4_DEVCONF_NOXFRM, IPV4_DEVCONF_NOPOLICY, - IPV4_DEVCONF_FORCE_IGMP_VERSION, IPV4_DEVCONF_ARP_ANNOUNCE, IPV4_DEVCONF_ARP_IGNORE, IPV4_DEVCONF_PROMOTE_SECONDARIES, diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 43923dc7744..87d47ce9fa3 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2094,11 +2094,11 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), + DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, + "force_igmp_version"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), - DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION, - "force_igmp_version"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, -- cgit v1.2.3-70-g09d2 From 2690048c01f32bf45d1c1e1ab3079bc10ad2aea7 Mon Sep 17 00:00:00 2001 From: William Manley Date: Tue, 6 Aug 2013 19:03:15 +0100 Subject: net: igmp: Allow user-space configuration of igmp unsolicited report interval Adds the new procfs knobs: /proc/sys/net/ipv4/conf/*/igmpv2_unsolicited_report_interval /proc/sys/net/ipv4/conf/*/igmpv3_unsolicited_report_interval Which will allow userspace configuration of the IGMP unsolicited report interval (see below) in milliseconds. The defaults are 10000ms for IGMPv2 and 1000ms for IGMPv3 in accordance with RFC2236 and RFC3376. Background: If an IGMP join packet is lost you will not receive data sent to the multicast group so if no data arrives from that multicast group in a period of time after the IGMP join a second IGMP join will be sent. The delay between joins is the "IGMP Unsolicited Report Interval". Prior to this patch this value was hard coded in the kernel to 10s for IGMPv2 and 1s for IGMPv3. 10s is unsuitable for some use-cases, such as IPTV as it can cause channel change to be slow in the presence of packet loss. This patch allows the value to be overridden from userspace for both IGMPv2 and IGMPv3 such that it can be tuned accoding to the network. Tested with Wireshark and a simple program to join a (non-existent) multicast group. The distribution of timings for the second join differ based upon setting the procfs knobs. igmpvX_unsolicited_report_interval is intended to follow the pattern established by force_igmp_version, and while a procfs entry has been added a corresponding sysctl knob has not as it is my understanding that sysctl is deprecated[1]. [1]: http://lwn.net/Articles/247243/ Signed-off-by: William Manley Acked-by: Hannes Frederic Sowa Acked-by: Benjamin LaHaise Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 2 ++ net/ipv4/devinet.c | 8 ++++++++ net/ipv4/igmp.c | 19 +++++++++++++++++-- 3 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 0a4a6cb3533..c796ce26c7c 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -28,6 +28,8 @@ enum IPV4_DEVCONF_ARPFILTER, IPV4_DEVCONF_MEDIUM_ID, IPV4_DEVCONF_FORCE_IGMP_VERSION, + IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, + IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, IPV4_DEVCONF_NOXFRM, IPV4_DEVCONF_NOPOLICY, IPV4_DEVCONF_ARP_ANNOUNCE, diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 87d47ce9fa3..a1b5bcbd04a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -73,6 +73,8 @@ static struct ipv4_devconf ipv4_devconf = { [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, + [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, + [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, }, }; @@ -83,6 +85,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = { [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, + [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, + [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, }, }; @@ -2096,6 +2100,10 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, "force_igmp_version"), + DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, + "igmpv2_unsolicited_report_interval"), + DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, + "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index ceb19ab9cf9..d6c0e64ec97 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -142,10 +142,25 @@ static int unsolicited_report_interval(struct in_device *in_dev) { + int interval_ms, interval_jiffies; + if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) - return IGMP_V2_Unsolicited_Report_Interval; + interval_ms = IN_DEV_CONF_GET( + in_dev, + IGMPV2_UNSOLICITED_REPORT_INTERVAL); else /* v3 */ - return IGMP_V3_Unsolicited_Report_Interval; + interval_ms = IN_DEV_CONF_GET( + in_dev, + IGMPV3_UNSOLICITED_REPORT_INTERVAL); + + interval_jiffies = msecs_to_jiffies(interval_ms); + + /* _timer functions can't handle a delay of 0 jiffies so ensure + * we always return a positive value. + */ + if (interval_jiffies <= 0) + interval_jiffies = 1; + return interval_jiffies; } static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); -- cgit v1.2.3-70-g09d2 From cda5f98e36576596b9230483ec52bff3cc97eb21 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 6 Aug 2013 21:18:12 +0200 Subject: net: sctp: convert sctp_checksum_disable module param into sctp sysctl Get rid of the last module parameter for SCTP and make this configurable via sysctl for SCTP like all the rest of SCTP's configuration knobs. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 8 ++++++++ include/net/netns/sctp.h | 3 +++ include/net/sctp/structs.h | 5 ----- net/sctp/input.c | 4 ++-- net/sctp/output.c | 5 ++++- net/sctp/protocol.c | 5 +++-- net/sctp/sysctl.c | 10 +++++++++- 7 files changed, 29 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 36be26b2ef7..1b27b0b8687 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1507,6 +1507,14 @@ sack_timeout - INTEGER Default: 200 +checksum_disable - BOOLEAN + Disable SCTP checksum computing and verification for debugging purpose. + + 1: Disable checksumming + 0: Enable checksumming + + Default: 0 + valid_cookie_life - INTEGER The default lifetime of the SCTP cookie (in milliseconds). The cookie is used during association establishment. diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index 3573a81815a..ebfdf1e7d40 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -129,6 +129,9 @@ struct netns_sctp { /* Threshold for autoclose timeout, in seconds. */ unsigned long max_autoclose; + + /* Flag to disable SCTP checksumming. */ + int checksum_disable; }; #endif /* __NETNS_SCTP_H__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index d9c93a77b1a..06ebeaaaa9a 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -141,10 +141,6 @@ extern struct sctp_globals { /* This is the sctp port control hash. */ int port_hashsize; struct sctp_bind_hashbucket *port_hashtable; - - /* Flag to indicate whether computing and verifying checksum - * is disabled. */ - bool checksum_disable; } sctp_globals; #define sctp_max_instreams (sctp_globals.max_instreams) @@ -156,7 +152,6 @@ extern struct sctp_globals { #define sctp_assoc_hashtable (sctp_globals.assoc_hashtable) #define sctp_port_hashsize (sctp_globals.port_hashsize) #define sctp_port_hashtable (sctp_globals.port_hashtable) -#define sctp_checksum_disable (sctp_globals.checksum_disable) /* SCTP Socket type: UDP or TCP style. */ typedef enum { diff --git a/net/sctp/input.c b/net/sctp/input.c index fa91aff0238..b9a25e18bb2 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -140,8 +140,8 @@ int sctp_rcv(struct sk_buff *skb) __skb_pull(skb, skb_transport_offset(skb)); if (skb->len < sizeof(struct sctphdr)) goto discard_it; - if (!sctp_checksum_disable && !skb_csum_unnecessary(skb) && - sctp_rcv_checksum(net, skb) < 0) + if (!net->sctp.checksum_disable && !skb_csum_unnecessary(skb) && + sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb_pull(skb, sizeof(struct sctphdr)); diff --git a/net/sctp/output.c b/net/sctp/output.c index 5a55c55d71a..cdb5f4914e1 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -395,6 +395,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) int padding; /* How much padding do we need? */ __u8 has_data = 0; struct dst_entry *dst = tp->dst; + struct net *net; unsigned char *auth = NULL; /* pointer to auth in skb data */ __u32 cksum_buf_len = sizeof(struct sctphdr); @@ -541,7 +542,9 @@ int sctp_packet_transmit(struct sctp_packet *packet) * Note: Adler-32 is no longer applicable, as has been replaced * by CRC32-C as described in . */ - if (!sctp_checksum_disable) { + net = dev_net(dst->dev); + + if (!net->sctp.checksum_disable) { if (!(dst->dev->features & NETIF_F_SCTP_CSUM)) { __u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index b52ec251010..a570a6365f8 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1193,6 +1193,9 @@ static int __net_init sctp_net_init(struct net *net) /* Whether Cookie Preservative is enabled(1) or not(0) */ net->sctp.cookie_preserve_enable = 1; + /* Whether SCTP checksumming is disabled(1) or not(0) */ + net->sctp.checksum_disable = 0; + /* Default sctp sockets to use md5 as their hmac alg */ #if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) net->sctp.sctp_hmac_alg = "md5"; @@ -1549,6 +1552,4 @@ MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132"); MODULE_AUTHOR("Linux Kernel SCTP developers "); MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); -module_param_named(no_checksums, sctp_checksum_disable, bool, 0644); -MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification"); MODULE_LICENSE("GPL"); diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 190674702b2..754809a7183 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -296,7 +296,15 @@ static struct ctl_table sctp_net_table[] = { .extra1 = &max_autoclose_min, .extra2 = &max_autoclose_max, }, - + { + .procname = "checksum_disable", + .data = &init_net.sctp.checksum_disable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { /* sentinel */ } }; -- cgit v1.2.3-70-g09d2 From 477143e3fece3dc12629bb1ebd7b47e8e6e72b2b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 6 Aug 2013 21:18:13 +0200 Subject: net: sctp: trivial: update bug report in header comment With the restructuring of the lksctp.org site, we only allow bug reports through the SCTP mailing list linux-sctp@vger.kernel.org, not via SF, as SF is only used for web hosting and nothing more. While at it, also remove the obvious statement that bugs will be fixed and incooperated into the kernel. Signed-off-by: Daniel Borkmann Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sctp/auth.h | 6 ------ include/net/sctp/checksum.h | 6 ------ include/net/sctp/command.h | 18 ++++++++---------- include/net/sctp/constants.h | 6 ------ include/net/sctp/sctp.h | 6 ------ include/net/sctp/sm.h | 6 ------ include/net/sctp/structs.h | 6 ------ include/net/sctp/tsnmap.h | 6 ------ include/net/sctp/ulpevent.h | 6 ------ include/net/sctp/ulpqueue.h | 6 ------ net/sctp/associola.c | 6 ------ net/sctp/auth.c | 6 ------ net/sctp/bind_addr.c | 6 ------ net/sctp/chunk.c | 6 ------ net/sctp/command.c | 6 ------ net/sctp/debug.c | 6 ------ net/sctp/endpointola.c | 6 ------ net/sctp/input.c | 6 ------ net/sctp/inqueue.c | 6 ------ net/sctp/ipv6.c | 6 ------ net/sctp/objcnt.c | 6 ------ net/sctp/output.c | 6 ------ net/sctp/outqueue.c | 6 ------ net/sctp/primitive.c | 6 ------ net/sctp/proc.c | 6 ------ net/sctp/protocol.c | 6 ------ net/sctp/sm_make_chunk.c | 6 ------ net/sctp/sm_sideeffect.c | 6 ------ net/sctp/sm_statefuns.c | 6 ------ net/sctp/sm_statetable.c | 6 ------ net/sctp/socket.c | 6 ------ net/sctp/ssnmap.c | 6 ------ net/sctp/sysctl.c | 6 ------ net/sctp/transport.c | 6 ------ net/sctp/tsnmap.c | 6 ------ net/sctp/ulpevent.c | 6 ------ net/sctp/ulpqueue.c | 6 ------ 37 files changed, 8 insertions(+), 226 deletions(-) (limited to 'include') diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index 754c511f4cf..aa80bef3c9d 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -24,14 +24,8 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Vlad Yasevich - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #ifndef __sctp_auth_h__ diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 483e6303458..259924d63ba 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -27,9 +27,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Dinakaran Joseph * Jon Grimm @@ -37,9 +34,6 @@ * * Rewritten to use libcrc32c by: * Vlad Yasevich - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #ifndef __sctp_checksum_h__ diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index 35247271e55..832f2191489 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -23,19 +23,17 @@ * the Free Software Foundation, 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. * - * Please send any bug reports or fixes you make to one of the - * following email addresses: + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers * - * La Monte H.P. Yarroll - * Karl Knutson - * Ardelle Fan - * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Ardelle Fan + * Sridhar Samudrala */ - #ifndef __net_sctp_command_h__ #define __net_sctp_command_h__ diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index a1e7aa5c8bb..2f0a565a0fd 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -27,9 +27,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -39,9 +36,6 @@ * Xingang Guo * Sridhar Samudrala * Daisy Chang - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #ifndef __sctp_constants_h__ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index cb28df9c278..3794c5ad20f 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -29,9 +29,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Xingang Guo @@ -41,9 +38,6 @@ * Ardelle Fan * Ryan Layer * Kevin Gao - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #ifndef __net_sctp_h__ diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2aa66dda4a5..4ef75af340b 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -29,9 +29,6 @@ * email addresses: * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -42,9 +39,6 @@ * Daisy Chang * Ardelle Fan * Kevin Gao - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 06ebeaaaa9a..78eedf0cd4b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -27,9 +27,6 @@ * email addresses: * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Randall Stewart * Ken Morneau @@ -46,9 +43,6 @@ * Ryan Layer * Anup Pemmaiah * Kevin Gao - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #ifndef __sctp_structs_h__ diff --git a/include/net/sctp/tsnmap.h b/include/net/sctp/tsnmap.h index c361d6f7e0c..54bbbe54730 100644 --- a/include/net/sctp/tsnmap.h +++ b/include/net/sctp/tsnmap.h @@ -30,17 +30,11 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Jon Grimm * La Monte H.P. Yarroll * Karl Knutson * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 34f0d34d22c..27b9f5c9015 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -33,17 +33,11 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Jon Grimm * La Monte H.P. Yarroll * Karl Knutson * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #ifndef __sctp_ulpevent_h__ diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h index add3237a9fd..b0cf5d54d71 100644 --- a/include/net/sctp/ulpqueue.h +++ b/include/net/sctp/ulpqueue.h @@ -32,16 +32,10 @@ * email addresses: * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Jon Grimm * La Monte H.P. Yarroll * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #ifndef __sctp_ulpqueue_h__ diff --git a/net/sctp/associola.c b/net/sctp/associola.c index e425ba0bd79..e051920816e 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -30,9 +30,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -43,9 +40,6 @@ * Daisy Chang * Ryan Layer * Kevin Gao - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 3aab967081b..8c4fa5dec82 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -24,14 +24,8 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Vlad Yasevich - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index f34ce8bc139..077bb070052 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -29,17 +29,11 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson * Jon Grimm * Daisy Chang - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index b50b90c16a8..bd0bdd0ba8f 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -26,15 +26,9 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Jon Grimm * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/command.c b/net/sctp/command.c index f4bebdadf00..3d9a9ff69c0 100644 --- a/net/sctp/command.c +++ b/net/sctp/command.c @@ -27,15 +27,9 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/debug.c b/net/sctp/debug.c index 44aa4e7543a..e89015d8935 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -30,9 +30,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -40,9 +37,6 @@ * Jon Grimm * Daisy Chang * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 825b7543a43..09b8daac87c 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -31,18 +31,12 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson * Jon Grimm * Daisy Chang * Dajiang Zhang - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/input.c b/net/sctp/input.c index b9a25e18bb2..2873e221e18 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -31,9 +31,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -43,9 +40,6 @@ * Daisy Chang * Sridhar Samudrala * Ardelle Fan - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index e70f60ae92e..5856932fdc3 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -32,15 +32,9 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 5a9402e3c0c..da613ceae28 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -29,9 +29,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Le Yanqun * Hui Huang @@ -42,9 +39,6 @@ * * Based on: * linux/net/ipv6/tcp_ipv6.c - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index aec346cbed6..5ea573b3764 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -28,14 +28,8 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Jon Grimm - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/output.c b/net/sctp/output.c index cdb5f4914e1..e35b84cc1d9 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -28,17 +28,11 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson * Jon Grimm * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 51313233e63..94df7587786 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -30,9 +30,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -41,9 +38,6 @@ * Hui Huang * Sridhar Samudrala * Jon Grimm - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c index 31fa437da11..ce1ffd81177 100644 --- a/net/sctp/primitive.c +++ b/net/sctp/primitive.c @@ -31,18 +31,12 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Narasimha Budihal * Karl Knutson * Ardelle Fan * Kevin Gao - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/proc.c b/net/sctp/proc.c index aff0cac45b6..82432bfb742 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -24,14 +24,8 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index a570a6365f8..54482977a48 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -31,9 +31,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -41,9 +38,6 @@ * Sridhar Samudrala * Daisy Chang * Ardelle Fan - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 780a2d4b98a..9c131338ccf 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -31,9 +31,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -45,9 +42,6 @@ * Daisy Chang * Ardelle Fan * Kevin Gao - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index f1f3aac3f3b..666c6684279 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -30,9 +30,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -42,9 +39,6 @@ * Daisy Chang * Sridhar Samudrala * Ardelle Fan - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 93271f0966c..dfe3f36ff2a 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -30,9 +30,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -45,9 +42,6 @@ * Ardelle Fan * Ryan Layer * Kevin Gao - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 64ea5fd87eb..c5999b2dde7 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -30,9 +30,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -41,9 +38,6 @@ * Daisy Chang * Ardelle Fan * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 02457123bde..d5d5882a289 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -36,9 +36,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Narasimha Budihal @@ -52,9 +49,6 @@ * Ryan Layer * Anup Pemmaiah * Kevin Gao - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c index 72b5939101b..6007124aefa 100644 --- a/net/sctp/ssnmap.c +++ b/net/sctp/ssnmap.c @@ -26,14 +26,8 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Jon Grimm - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 754809a7183..1b1ee769b5b 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -27,18 +27,12 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Mingqin Liu * Jon Grimm * Ardelle Fan * Ryan Layer * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 9602c52aa61..6836ead5c95 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -32,9 +32,6 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Karl Knutson @@ -43,9 +40,6 @@ * Hui Huang * Sridhar Samudrala * Ardelle Fan - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c index 0eff8667d4c..fbda2002828 100644 --- a/net/sctp/tsnmap.c +++ b/net/sctp/tsnmap.c @@ -29,17 +29,11 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * La Monte H.P. Yarroll * Jon Grimm * Karl Knutson * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index c07624fca7e..81089ed6545 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -30,17 +30,11 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Jon Grimm * La Monte H.P. Yarroll * Ardelle Fan * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 2cdb3010f50..1c1484ed605 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -29,16 +29,10 @@ * email address(es): * lksctp developers * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * * Written or modified by: * Jon Grimm * La Monte H.P. Yarroll * Sridhar Samudrala - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. */ #include -- cgit v1.2.3-70-g09d2 From 71acc0ddd499cc323199fb1ae350ce9ea0744352 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 9 Aug 2013 13:09:41 -0700 Subject: Revert "net: sctp: convert sctp_checksum_disable module param into sctp sysctl" This reverts commit cda5f98e36576596b9230483ec52bff3cc97eb21. As per Vlad's request. Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 8 -------- include/net/netns/sctp.h | 3 --- include/net/sctp/structs.h | 5 +++++ net/sctp/input.c | 4 ++-- net/sctp/output.c | 5 +---- net/sctp/protocol.c | 5 ++--- net/sctp/sysctl.c | 10 +--------- 7 files changed, 11 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 1b27b0b8687..36be26b2ef7 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1507,14 +1507,6 @@ sack_timeout - INTEGER Default: 200 -checksum_disable - BOOLEAN - Disable SCTP checksum computing and verification for debugging purpose. - - 1: Disable checksumming - 0: Enable checksumming - - Default: 0 - valid_cookie_life - INTEGER The default lifetime of the SCTP cookie (in milliseconds). The cookie is used during association establishment. diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index ebfdf1e7d40..3573a81815a 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -129,9 +129,6 @@ struct netns_sctp { /* Threshold for autoclose timeout, in seconds. */ unsigned long max_autoclose; - - /* Flag to disable SCTP checksumming. */ - int checksum_disable; }; #endif /* __NETNS_SCTP_H__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 78eedf0cd4b..422db6cc321 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -135,6 +135,10 @@ extern struct sctp_globals { /* This is the sctp port control hash. */ int port_hashsize; struct sctp_bind_hashbucket *port_hashtable; + + /* Flag to indicate whether computing and verifying checksum + * is disabled. */ + bool checksum_disable; } sctp_globals; #define sctp_max_instreams (sctp_globals.max_instreams) @@ -146,6 +150,7 @@ extern struct sctp_globals { #define sctp_assoc_hashtable (sctp_globals.assoc_hashtable) #define sctp_port_hashsize (sctp_globals.port_hashsize) #define sctp_port_hashtable (sctp_globals.port_hashtable) +#define sctp_checksum_disable (sctp_globals.checksum_disable) /* SCTP Socket type: UDP or TCP style. */ typedef enum { diff --git a/net/sctp/input.c b/net/sctp/input.c index 2873e221e18..5f2068679f8 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -134,8 +134,8 @@ int sctp_rcv(struct sk_buff *skb) __skb_pull(skb, skb_transport_offset(skb)); if (skb->len < sizeof(struct sctphdr)) goto discard_it; - if (!net->sctp.checksum_disable && !skb_csum_unnecessary(skb) && - sctp_rcv_checksum(net, skb) < 0) + if (!sctp_checksum_disable && !skb_csum_unnecessary(skb) && + sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb_pull(skb, sizeof(struct sctphdr)); diff --git a/net/sctp/output.c b/net/sctp/output.c index e35b84cc1d9..0ac3a65dacc 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -389,7 +389,6 @@ int sctp_packet_transmit(struct sctp_packet *packet) int padding; /* How much padding do we need? */ __u8 has_data = 0; struct dst_entry *dst = tp->dst; - struct net *net; unsigned char *auth = NULL; /* pointer to auth in skb data */ __u32 cksum_buf_len = sizeof(struct sctphdr); @@ -536,9 +535,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) * Note: Adler-32 is no longer applicable, as has been replaced * by CRC32-C as described in . */ - net = dev_net(dst->dev); - - if (!net->sctp.checksum_disable) { + if (!sctp_checksum_disable) { if (!(dst->dev->features & NETIF_F_SCTP_CSUM)) { __u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 54482977a48..5e17092f4ad 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1187,9 +1187,6 @@ static int __net_init sctp_net_init(struct net *net) /* Whether Cookie Preservative is enabled(1) or not(0) */ net->sctp.cookie_preserve_enable = 1; - /* Whether SCTP checksumming is disabled(1) or not(0) */ - net->sctp.checksum_disable = 0; - /* Default sctp sockets to use md5 as their hmac alg */ #if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) net->sctp.sctp_hmac_alg = "md5"; @@ -1546,4 +1543,6 @@ MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132"); MODULE_AUTHOR("Linux Kernel SCTP developers "); MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); +module_param_named(no_checksums, sctp_checksum_disable, bool, 0644); +MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification"); MODULE_LICENSE("GPL"); diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 1b1ee769b5b..6b36561a1b3 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -290,15 +290,7 @@ static struct ctl_table sctp_net_table[] = { .extra1 = &max_autoclose_min, .extra2 = &max_autoclose_max, }, - { - .procname = "checksum_disable", - .data = &init_net.sctp.checksum_disable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, + { /* sentinel */ } }; -- cgit v1.2.3-70-g09d2 From 149479d019e06df5a7f4096f95c00cfb1380309c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 8 Aug 2013 14:06:22 -0700 Subject: tcp: add server ip to encrypt cookie in fast open Encrypt the cookie with both server and client IPv4 addresses, such that multi-homed server will grant different cookies based on both the source and destination IPs. No client change is needed since cookie is opaque to the client. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- net/ipv4/tcp_fastopen.c | 13 ++++++------- net/ipv4/tcp_ipv4.c | 10 +++++++--- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 27b652f379e..09cb5c11cee 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1300,7 +1300,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp); extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; int tcp_fastopen_reset_cipher(void *key, unsigned int len); -void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc); +extern void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, + struct tcp_fastopen_cookie *foc); #define TCP_FASTOPEN_KEY_LENGTH 16 diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8f7ef0ad80e..ab7bd35bb31 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -58,23 +58,22 @@ error: kfree(ctx); return err; } -/* Computes the fastopen cookie for the peer. - * The peer address is a 128 bits long (pad with zeros for IPv4). +/* Computes the fastopen cookie for the IP path. + * The path is a 128 bits long (pad with zeros for IPv4). * * The caller must check foc->len to determine if a valid cookie * has been generated successfully. */ -void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc) +void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, + struct tcp_fastopen_cookie *foc) { - __be32 peer_addr[4] = { addr, 0, 0, 0 }; + __be32 path[4] = { src, dst, 0, 0 }; struct tcp_fastopen_context *ctx; rcu_read_lock(); ctx = rcu_dereference(tcp_fastopen_ctx); if (ctx) { - crypto_cipher_encrypt_one(ctx->tfm, - foc->val, - (__u8 *)peer_addr); + crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; } rcu_read_unlock(); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 280efe5f19c..ec2702882d8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1316,9 +1316,11 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; return true; } + if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, valid_foc); if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || memcmp(&foc->val[0], &valid_foc->val[0], TCP_FASTOPEN_COOKIE_SIZE) != 0) @@ -1329,14 +1331,16 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; return true; } else if (foc->len == 0) { /* Client requesting a cookie */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, valid_foc); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); } else { /* Client sent a cookie with wrong size. Treat it * the same as invalid and return a valid one. */ - tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); + tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, valid_foc); } return false; } -- cgit v1.2.3-70-g09d2 From e370a7236321773245c5522d8bb299380830d3b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Aug 2013 14:37:32 -0700 Subject: af_unix: improve STREAM behavior with fragmented memory unix_stream_sendmsg() currently uses order-2 allocations, and we had numerous reports this can fail. The __GFP_REPEAT flag present in sock_alloc_send_pskb() is not helping. This patch extends the work done in commit eb6a24816b247c ("af_unix: reduce high order page allocations) for datagram sockets. This opens the possibility of zero copy IO (splice() and friends) The trick is to not use skb_pull() anymore in recvmsg() path, and instead add a @consumed field in UNIXCB() to track amount of already read payload in the skb. There is a performance regression for large sends because of extra page allocations that will be addressed in a follow-up patch, allowing sock_alloc_send_pskb() to attempt high order page allocations. Signed-off-by: Eric Dumazet Cc: David Rientjes Signed-off-by: David S. Miller --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 65 ++++++++++++++++++++++++--------------------------- 2 files changed, 31 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 05442df4ca8..a175ba4a7ad 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -35,6 +35,7 @@ struct unix_skb_parms { #ifdef CONFIG_SECURITY_NETWORK u32 secid; /* Security ID */ #endif + u32 consumed; }; #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c4ce243824b..99dc760cdd9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1596,6 +1596,10 @@ out: return err; } +/* We use paged skbs for stream sockets, and limit occupancy to 32768 + * bytes, and a minimun of a full page. + */ +#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, struct msghdr *msg, size_t len) @@ -1609,6 +1613,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, struct scm_cookie tmp_scm; bool fds_sent = false; int max_level; + int data_len; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1635,40 +1640,21 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, goto pipe_err; while (sent < len) { - /* - * Optimisation for the fact that under 0.01% of X - * messages typically need breaking up. - */ - - size = len-sent; + size = len - sent; /* Keep two messages in the pipe so it schedules better */ - if (size > ((sk->sk_sndbuf >> 1) - 64)) - size = (sk->sk_sndbuf >> 1) - 64; + size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); - if (size > SKB_MAX_ALLOC) - size = SKB_MAX_ALLOC; - - /* - * Grab a buffer - */ + /* allow fallback to order-0 allocations */ + size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); - skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, - &err); + data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); - if (skb == NULL) + skb = sock_alloc_send_pskb(sk, size - data_len, data_len, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) goto out_err; - /* - * If you pass two values to the sock_alloc_send_skb - * it tries to grab the large buffer with GFP_NOFS - * (which can fail easily), and if it fails grab the - * fallback size buffer which is under a page and will - * succeed. [Alan] - */ - size = min_t(int, size, skb_tailroom(skb)); - - /* Only send the fds in the first buffer */ err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); if (err < 0) { @@ -1678,7 +1664,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, max_level = err + 1; fds_sent = true; - err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + skb_put(skb, size - data_len); + skb->data_len = data_len; + skb->len = size; + err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, size); if (err) { kfree_skb(skb); goto out_err; @@ -1890,6 +1879,11 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, return timeo; } +static unsigned int unix_skb_len(const struct sk_buff *skb) +{ + return skb->len - UNIXCB(skb).consumed; +} + static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) @@ -1977,8 +1971,8 @@ again: } skip = sk_peek_offset(sk, flags); - while (skip >= skb->len) { - skip -= skb->len; + while (skip >= unix_skb_len(skb)) { + skip -= unix_skb_len(skb); last = skb; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) @@ -2005,8 +1999,9 @@ again: sunaddr = NULL; } - chunk = min_t(unsigned int, skb->len - skip, size); - if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) { + chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); + if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip, + msg->msg_iov, chunk)) { if (copied == 0) copied = -EFAULT; break; @@ -2016,14 +2011,14 @@ again: /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { - skb_pull(skb, chunk); + UNIXCB(skb).consumed += chunk; sk_peek_offset_bwd(sk, chunk); if (UNIXCB(skb).fp) unix_detach_fds(siocb->scm, skb); - if (skb->len) + if (unix_skb_len(skb)) break; skb_unlink(skb, &sk->sk_receive_queue); @@ -2107,7 +2102,7 @@ long unix_inq_len(struct sock *sk) if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) - amount += skb->len; + amount += unix_skb_len(skb); } else { skb = skb_peek(&sk->sk_receive_queue); if (skb) -- cgit v1.2.3-70-g09d2 From 28d6427109d13b0f447cba5761f88d3548e83605 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Aug 2013 14:38:47 -0700 Subject: net: attempt high order allocations in sock_alloc_send_pskb() Adding paged frags skbs to af_unix sockets introduced a performance regression on large sends because of additional page allocations, even if each skb could carry at least 100% more payload than before. We can instruct sock_alloc_send_pskb() to attempt high order allocations. Most of the time, it does a single page allocation instead of 8. I added an additional parameter to sock_alloc_send_pskb() to let other users to opt-in for this new feature on followup patches. Tested: Before patch : $ netperf -t STREAM_STREAM STREAM STREAM TEST Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 2304 212992 212992 10.00 46861.15 After patch : $ netperf -t STREAM_STREAM STREAM STREAM TEST Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 2304 212992 212992 10.00 57981.11 Signed-off-by: Eric Dumazet Cc: David Rientjes Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 2 +- drivers/net/tun.c | 2 +- include/net/sock.h | 3 +- net/core/sock.c | 100 +++++++++++++++++++++++++------------------------ net/packet/af_packet.c | 2 +- net/unix/af_unix.c | 6 ++- 6 files changed, 60 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 182364abfa3..8f6056d9ba9 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -524,7 +524,7 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad, linear = len; skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, - err); + err, 0); if (!skb) return NULL; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b1630479d4a..978d8654b14 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -949,7 +949,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, linear = len; skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, - &err); + &err, 0); if (!skb) return ERR_PTR(err); diff --git a/include/net/sock.h b/include/net/sock.h index ab6a8b75207..e4bbcbfd07e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1539,7 +1539,8 @@ extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, - int *errcode); + int *errcode, + int max_page_order); extern void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); extern void sock_kfree_s(struct sock *sk, void *mem, int size); diff --git a/net/core/sock.c b/net/core/sock.c index 83667de45ec..5b6beba494a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1741,24 +1741,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, - int *errcode) + int *errcode, int max_page_order) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; + unsigned long chunk; gfp_t gfp_mask; long timeo; int err; int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + struct page *page; + int i; err = -EMSGSIZE; if (npages > MAX_SKB_FRAGS) goto failure; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; - timeo = sock_sndtimeo(sk, noblock); - while (1) { + while (!skb) { err = sock_error(sk); if (err != 0) goto failure; @@ -1767,50 +1766,52 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - skb = alloc_skb(header_len, gfp_mask); - if (skb) { - int i; - - /* No pages, we're done... */ - if (!data_len) - break; - - skb->truesize += data_len; - skb_shinfo(skb)->nr_frags = npages; - for (i = 0; i < npages; i++) { - struct page *page; - - page = alloc_pages(sk->sk_allocation, 0); - if (!page) { - err = -ENOBUFS; - skb_shinfo(skb)->nr_frags = i; - kfree_skb(skb); - goto failure; - } - - __skb_fill_page_desc(skb, i, - page, 0, - (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len)); - data_len -= PAGE_SIZE; - } + if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + continue; + } - /* Full success... */ - break; - } - err = -ENOBUFS; + err = -ENOBUFS; + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + skb = alloc_skb(header_len, gfp_mask); + if (!skb) goto failure; + + skb->truesize += data_len; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(sk->sk_allocation | + __GFP_COMP | __GFP_NOWARN, + order); + if (page) + goto fill_page; + } + order--; + } + page = alloc_page(sk->sk_allocation); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; } - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); } skb_set_owner_w(skb, sk); @@ -1819,6 +1820,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, interrupted: err = sock_intr_errno(timeo); failure: + kfree_skb(skb); *errcode = err; return NULL; } @@ -1827,7 +1829,7 @@ EXPORT_SYMBOL(sock_alloc_send_pskb); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } EXPORT_SYMBOL(sock_alloc_send_skb); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 4cb28a7f639..6c53dd9f5cc 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2181,7 +2181,7 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, linear = len; skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, - err); + err, 0); if (!skb) return NULL; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 99dc760cdd9..fee9e3397cd 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1479,7 +1479,8 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, MAX_SKB_FRAGS * PAGE_SIZE); skb = sock_alloc_send_pskb(sk, len - data_len, data_len, - msg->msg_flags & MSG_DONTWAIT, &err); + msg->msg_flags & MSG_DONTWAIT, &err, + PAGE_ALLOC_COSTLY_ORDER); if (skb == NULL) goto out; @@ -1651,7 +1652,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); skb = sock_alloc_send_pskb(sk, size - data_len, data_len, - msg->msg_flags & MSG_DONTWAIT, &err); + msg->msg_flags & MSG_DONTWAIT, &err, + get_order(UNIX_SKB_FRAGS_SZ)); if (!skb) goto out_err; -- cgit v1.2.3-70-g09d2 From af61a165187bb94b1dc7628ef815c23d0eacf40b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jul 2013 18:09:12 +0200 Subject: mac80211: add control port protocol TX control flag A lot of drivers check the frame protocol for ETH_P_PAE, for various reasons (like making those more reliable). Add a new flags bitmap to the TX control info and a new flag indicating the control port protocol is in use to let all drivers also apply such logic to other control port protocols, should they be configured. Also use the new flag in the iwlwifi drivers. Signed-off-by: Johannes Berg --- Documentation/DocBook/80211.tmpl | 1 + drivers/net/wireless/iwlwifi/dvm/tx.c | 2 +- drivers/net/wireless/iwlwifi/iwl-devtrace.h | 7 ++++--- drivers/net/wireless/iwlwifi/mvm/tx.c | 9 ++++----- include/net/mac80211.h | 19 ++++++++++++++++--- net/mac80211/rc80211_minstrel_ht.c | 5 +++-- net/mac80211/tx.c | 8 +++++--- 7 files changed, 34 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/Documentation/DocBook/80211.tmpl b/Documentation/DocBook/80211.tmpl index 49267ea9756..f403ec3c5c9 100644 --- a/Documentation/DocBook/80211.tmpl +++ b/Documentation/DocBook/80211.tmpl @@ -325,6 +325,7 @@ functions/definitions !Finclude/net/mac80211.h ieee80211_rx_status !Finclude/net/mac80211.h mac80211_rx_flags +!Finclude/net/mac80211.h mac80211_tx_info_flags !Finclude/net/mac80211.h mac80211_tx_control_flags !Finclude/net/mac80211.h mac80211_rate_control_flags !Finclude/net/mac80211.h ieee80211_tx_rate diff --git a/drivers/net/wireless/iwlwifi/dvm/tx.c b/drivers/net/wireless/iwlwifi/dvm/tx.c index 5ee983faa67..f364583b153 100644 --- a/drivers/net/wireless/iwlwifi/dvm/tx.c +++ b/drivers/net/wireless/iwlwifi/dvm/tx.c @@ -87,7 +87,7 @@ static void iwlagn_tx_cmd_build_basic(struct iwl_priv *priv, priv->lib->bt_params->advanced_bt_coexist && (ieee80211_is_auth(fc) || ieee80211_is_assoc_req(fc) || ieee80211_is_reassoc_req(fc) || - skb->protocol == cpu_to_be16(ETH_P_PAE))) + info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO)) tx_flags |= TX_CMD_FLG_IGNORE_BT; diff --git a/drivers/net/wireless/iwlwifi/iwl-devtrace.h b/drivers/net/wireless/iwlwifi/iwl-devtrace.h index 4491c1c72cc..684c416d349 100644 --- a/drivers/net/wireless/iwlwifi/iwl-devtrace.h +++ b/drivers/net/wireless/iwlwifi/iwl-devtrace.h @@ -33,10 +33,11 @@ static inline bool iwl_trace_data(struct sk_buff *skb) { struct ieee80211_hdr *hdr = (void *)skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - if (ieee80211_is_data(hdr->frame_control)) - return skb->protocol != cpu_to_be16(ETH_P_PAE); - return false; + if (!ieee80211_is_data(hdr->frame_control)) + return false; + return !(info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO); } static inline size_t iwl_rx_trace_len(const struct iwl_trans *trans, diff --git a/drivers/net/wireless/iwlwifi/mvm/tx.c b/drivers/net/wireless/iwlwifi/mvm/tx.c index f0e96a92740..d62a6d3053f 100644 --- a/drivers/net/wireless/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/iwlwifi/mvm/tx.c @@ -91,11 +91,10 @@ static void iwl_mvm_set_tx_cmd(struct iwl_mvm *mvm, struct sk_buff *skb, tx_flags |= TX_CMD_FLG_ACK | TX_CMD_FLG_BAR; /* High prio packet (wrt. BT coex) if it is EAPOL, MCAST or MGMT */ - if (info->band == IEEE80211_BAND_2GHZ && - (skb->protocol == cpu_to_be16(ETH_P_PAE) || - is_multicast_ether_addr(hdr->addr1) || - ieee80211_is_back_req(fc) || - ieee80211_is_mgmt(fc))) + if (info->band == IEEE80211_BAND_2GHZ && + (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO || + is_multicast_ether_addr(hdr->addr1) || + ieee80211_is_back_req(fc) || ieee80211_is_mgmt(fc))) tx_flags |= TX_CMD_FLG_BT_DIS; if (ieee80211_has_morefrags(fc)) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 9cda3728c2c..b70c0011132 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -375,7 +375,7 @@ struct ieee80211_bss_conf { }; /** - * enum mac80211_tx_control_flags - flags to describe transmission information/status + * enum mac80211_tx_info_flags - flags to describe transmission information/status * * These flags are used with the @flags member of &ieee80211_tx_info. * @@ -471,7 +471,7 @@ struct ieee80211_bss_conf { * Note: If you have to add new flags to the enumeration, then don't * forget to update %IEEE80211_TX_TEMPORARY_FLAGS when necessary. */ -enum mac80211_tx_control_flags { +enum mac80211_tx_info_flags { IEEE80211_TX_CTL_REQ_TX_STATUS = BIT(0), IEEE80211_TX_CTL_ASSIGN_SEQ = BIT(1), IEEE80211_TX_CTL_NO_ACK = BIT(2), @@ -507,6 +507,18 @@ enum mac80211_tx_control_flags { #define IEEE80211_TX_CTL_STBC_SHIFT 23 +/** + * enum mac80211_tx_control_flags - flags to describe transmit control + * + * @IEEE80211_TX_CTRL_PORT_CTRL_PROTO: this frame is a port control + * protocol frame (e.g. EAP) + * + * These flags are used in tx_info->control.flags. + */ +enum mac80211_tx_control_flags { + IEEE80211_TX_CTRL_PORT_CTRL_PROTO = BIT(0), +}; + /* * This definition is used as a mask to clear all temporary flags, which are * set by the tx handlers for each transmission attempt by the mac80211 stack. @@ -680,7 +692,8 @@ struct ieee80211_tx_info { /* NB: vif can be NULL for injected frames */ struct ieee80211_vif *vif; struct ieee80211_key_conf *hw_key; - /* 8 bytes free */ + u32 flags; + /* 4 bytes free */ } control; struct { struct ieee80211_tx_rate rates[IEEE80211_TX_MAX_RATES]; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 7475a7a3379..9eff3824f2d 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -439,12 +439,13 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 tid; if (unlikely(!ieee80211_is_data_qos(hdr->frame_control))) return; - if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE))) + if (unlikely(info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO)) return; tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; @@ -776,7 +777,7 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, /* Don't use EAPOL frames for sampling on non-mrr hw */ if (mp->hw->max_rates == 1 && - txrc->skb->protocol == cpu_to_be16(ETH_P_PAE)) + (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO)) sample_idx = -1; else sample_idx = minstrel_get_sample_rate(mp, mi); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 0e42322aa6b..098ae854ad3 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -539,9 +539,11 @@ ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); - if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol && - tx->sdata->control_port_no_encrypt)) - info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol)) { + if (tx->sdata->control_port_no_encrypt) + info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; + } return TX_CONTINUE; } -- cgit v1.2.3-70-g09d2 From fc73f11f5fa230f8c687d51b0fddb00433092ce0 Mon Sep 17 00:00:00 2001 From: David Spinadel Date: Wed, 31 Jul 2013 18:04:15 +0300 Subject: cfg80211: add wdev to testmode cmd To allow drivers to implement per-interface testmode operations more easily, pass a wdev pointer if any identification for one was given from userspace. Clean up the code a bit while at it. Signed-off-by: David Spinadel Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/testmode.c | 3 ++- drivers/net/wireless/ath/ath6kl/testmode.h | 7 +++++-- .../net/wireless/brcm80211/brcmfmac/wl_cfg80211.c | 4 +++- include/net/cfg80211.h | 5 +++-- net/mac80211/cfg.c | 4 +++- net/wireless/nl80211.c | 23 ++++++++++++++++------ net/wireless/rdev-ops.h | 5 +++-- net/wireless/trace.h | 8 +++++--- 8 files changed, 41 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath6kl/testmode.c b/drivers/net/wireless/ath/ath6kl/testmode.c index acc9aa832f7..d67170ea103 100644 --- a/drivers/net/wireless/ath/ath6kl/testmode.c +++ b/drivers/net/wireless/ath/ath6kl/testmode.c @@ -66,7 +66,8 @@ nla_put_failure: ath6kl_warn("nla_put failed on testmode rx skb!\n"); } -int ath6kl_tm_cmd(struct wiphy *wiphy, void *data, int len) +int ath6kl_tm_cmd(struct wiphy *wiphy, struct wireless_dev *wdev, + void *data, int len) { struct ath6kl *ar = wiphy_priv(wiphy); struct nlattr *tb[ATH6KL_TM_ATTR_MAX + 1]; diff --git a/drivers/net/wireless/ath/ath6kl/testmode.h b/drivers/net/wireless/ath/ath6kl/testmode.h index fe651d6707d..9fbcdec3e20 100644 --- a/drivers/net/wireless/ath/ath6kl/testmode.h +++ b/drivers/net/wireless/ath/ath6kl/testmode.h @@ -20,7 +20,8 @@ #ifdef CONFIG_NL80211_TESTMODE void ath6kl_tm_rx_event(struct ath6kl *ar, void *buf, size_t buf_len); -int ath6kl_tm_cmd(struct wiphy *wiphy, void *data, int len); +int ath6kl_tm_cmd(struct wiphy *wiphy, struct wireless_dev *wdev, + void *data, int len); #else @@ -29,7 +30,9 @@ static inline void ath6kl_tm_rx_event(struct ath6kl *ar, void *buf, { } -static inline int ath6kl_tm_cmd(struct wiphy *wiphy, void *data, int len) +static inline int ath6kl_tm_cmd(struct wiphy *wiphy, + struct wireless_dev *wdev, + void *data, int len) { return 0; } diff --git a/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c b/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c index 277b37ae712..b7d885020dd 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c @@ -3152,7 +3152,9 @@ static int brcmf_cfg80211_sched_scan_stop(struct wiphy *wiphy, } #ifdef CONFIG_NL80211_TESTMODE -static int brcmf_cfg80211_testmode(struct wiphy *wiphy, void *data, int len) +static int brcmf_cfg80211_testmode(struct wiphy *wiphy, + struct wireless_dev *wdev, + void *data, int len) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct net_device *ndev = cfg_to_ndev(cfg); diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b7495c72061..9ab7a0690d9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2081,7 +2081,7 @@ struct cfg80211_update_ft_ies_params { * @mgmt_tx_cancel_wait: Cancel the wait time from transmitting a management * frame on another channel * - * @testmode_cmd: run a test mode command + * @testmode_cmd: run a test mode command; @wdev may be %NULL * @testmode_dump: Implement a test mode dump. The cb->args[2] and up may be * used by the function, but 0 and 1 must not be touched. Additionally, * return error codes other than -ENOBUFS and -ENOENT will terminate the @@ -2290,7 +2290,8 @@ struct cfg80211_ops { void (*rfkill_poll)(struct wiphy *wiphy); #ifdef CONFIG_NL80211_TESTMODE - int (*testmode_cmd)(struct wiphy *wiphy, void *data, int len); + int (*testmode_cmd)(struct wiphy *wiphy, struct wireless_dev *wdev, + void *data, int len); int (*testmode_dump)(struct wiphy *wiphy, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 44449ceb796..c77916ffe74 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2300,7 +2300,9 @@ static void ieee80211_rfkill_poll(struct wiphy *wiphy) } #ifdef CONFIG_NL80211_TESTMODE -static int ieee80211_testmode_cmd(struct wiphy *wiphy, void *data, int len) +static int ieee80211_testmode_cmd(struct wiphy *wiphy, + struct wireless_dev *wdev, + void *data, int len) { struct ieee80211_local *local = wiphy_priv(wiphy); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c2a40a2e56b..334697de5cc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6591,19 +6591,30 @@ static struct genl_multicast_group nl80211_testmode_mcgrp = { static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = + __cfg80211_wdev_from_attrs(genl_info_net(info), info->attrs); int err; + if (!rdev->ops->testmode_cmd) + return -EOPNOTSUPP; + + if (IS_ERR(wdev)) { + err = PTR_ERR(wdev); + if (err != -EINVAL) + return err; + wdev = NULL; + } else if (wdev->wiphy != &rdev->wiphy) { + return -EINVAL; + } + if (!info->attrs[NL80211_ATTR_TESTDATA]) return -EINVAL; - err = -EOPNOTSUPP; - if (rdev->ops->testmode_cmd) { - rdev->testmode_info = info; - err = rdev_testmode_cmd(rdev, + rdev->testmode_info = info; + err = rdev_testmode_cmd(rdev, wdev, nla_data(info->attrs[NL80211_ATTR_TESTDATA]), nla_len(info->attrs[NL80211_ATTR_TESTDATA])); - rdev->testmode_info = NULL; - } + rdev->testmode_info = NULL; return err; } diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index de870d4d0bc..37ce9fdfe93 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -516,11 +516,12 @@ static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev) #ifdef CONFIG_NL80211_TESTMODE static inline int rdev_testmode_cmd(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, void *data, int len) { int ret; - trace_rdev_testmode_cmd(&rdev->wiphy); - ret = rdev->ops->testmode_cmd(&rdev->wiphy, data, len); + trace_rdev_testmode_cmd(&rdev->wiphy, wdev); + ret = rdev->ops->testmode_cmd(&rdev->wiphy, wdev, data, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index f0ebdcd394e..ba5f0d6614d 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1293,15 +1293,17 @@ TRACE_EVENT(rdev_return_int_int, #ifdef CONFIG_NL80211_TESTMODE TRACE_EVENT(rdev_testmode_cmd, - TP_PROTO(struct wiphy *wiphy), - TP_ARGS(wiphy), + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), + TP_ARGS(wiphy, wdev), TP_STRUCT__entry( WIPHY_ENTRY + WDEV_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; + WDEV_ASSIGN; ), - TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG) + TP_printk(WIPHY_PR_FMT WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG) ); TRACE_EVENT(rdev_testmode_dump, -- cgit v1.2.3-70-g09d2 From 52981cd79461e47fe683febfcbd3d380c72b1c6c Mon Sep 17 00:00:00 2001 From: David Spinadel Date: Wed, 31 Jul 2013 18:06:22 +0300 Subject: mac80211: add vif to testmode cmd Pass the wdev from cfg80211 on to the driver as the vif if given and it's valid for the driver. Signed-off-by: David Spinadel Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 1 + drivers/net/wireless/ti/wlcore/testmode.c | 3 ++- drivers/net/wireless/ti/wlcore/testmode.h | 3 ++- include/net/mac80211.h | 7 ++++--- net/mac80211/cfg.c | 11 ++++++++++- 5 files changed, 19 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 7b2a6229eed..a0d2aacd5e0 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1364,6 +1364,7 @@ static const struct nla_policy hwsim_testmode_policy[HWSIM_TM_ATTR_MAX + 1] = { static int hwsim_fops_ps_write(void *dat, u64 val); static int mac80211_hwsim_testmode_cmd(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, void *data, int len) { struct mac80211_hwsim_data *hwsim = hw->priv; diff --git a/drivers/net/wireless/ti/wlcore/testmode.c b/drivers/net/wireless/ti/wlcore/testmode.c index f3442762d88..527590f2adf 100644 --- a/drivers/net/wireless/ti/wlcore/testmode.c +++ b/drivers/net/wireless/ti/wlcore/testmode.c @@ -356,7 +356,8 @@ out: return ret; } -int wl1271_tm_cmd(struct ieee80211_hw *hw, void *data, int len) +int wl1271_tm_cmd(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + void *data, int len) { struct wl1271 *wl = hw->priv; struct nlattr *tb[WL1271_TM_ATTR_MAX + 1]; diff --git a/drivers/net/wireless/ti/wlcore/testmode.h b/drivers/net/wireless/ti/wlcore/testmode.h index 8071654259e..61d8434d859 100644 --- a/drivers/net/wireless/ti/wlcore/testmode.h +++ b/drivers/net/wireless/ti/wlcore/testmode.h @@ -26,6 +26,7 @@ #include -int wl1271_tm_cmd(struct ieee80211_hw *hw, void *data, int len); +int wl1271_tm_cmd(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + void *data, int len); #endif /* __WL1271_TESTMODE_H__ */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b70c0011132..df93c77c97a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2516,8 +2516,8 @@ enum ieee80211_roc_type { * in IEEE 802.11-2007 section 17.3.8.6 and modify ACK timeout * accordingly. This callback is not required and may sleep. * - * @testmode_cmd: Implement a cfg80211 test mode command. - * The callback can sleep. + * @testmode_cmd: Implement a cfg80211 test mode command. The passed @vif may + * be %NULL. The callback can sleep. * @testmode_dump: Implement a cfg80211 test mode dump. The callback can sleep. * * @flush: Flush all pending frames from the hardware queue, making sure @@ -2778,7 +2778,8 @@ struct ieee80211_ops { void (*rfkill_poll)(struct ieee80211_hw *hw); void (*set_coverage_class)(struct ieee80211_hw *hw, u8 coverage_class); #ifdef CONFIG_NL80211_TESTMODE - int (*testmode_cmd)(struct ieee80211_hw *hw, void *data, int len); + int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + void *data, int len); int (*testmode_dump)(struct ieee80211_hw *hw, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c77916ffe74..7aa38ce0b52 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2305,11 +2305,20 @@ static int ieee80211_testmode_cmd(struct wiphy *wiphy, void *data, int len) { struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_vif *vif = NULL; if (!local->ops->testmode_cmd) return -EOPNOTSUPP; - return local->ops->testmode_cmd(&local->hw, data, len); + if (wdev) { + struct ieee80211_sub_if_data *sdata; + + sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + if (sdata->flags & IEEE80211_SDATA_IN_DRIVER) + vif = &sdata->vif; + } + + return local->ops->testmode_cmd(&local->hw, vif, data, len); } static int ieee80211_testmode_dump(struct wiphy *wiphy, -- cgit v1.2.3-70-g09d2 From bd0779370588386e4a67ba5d0b176cfded8e6a53 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 7 Aug 2013 18:13:20 +0200 Subject: netfilter: nfnetlink_queue: allow to attach expectations to conntracks This patch adds the capability to attach expectations via nfnetlink_queue. This is required by conntrack helpers that trigger expectations based on the first packet seen like the TFTP and the DHCPv6 user-space helpers. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 2 + include/net/netfilter/nfnetlink_queue.h | 8 +++ include/uapi/linux/netfilter/nfnetlink_queue.h | 1 + net/netfilter/nf_conntrack_netlink.c | 95 ++++++++++++++++++++++---- net/netfilter/nfnetlink_queue_core.c | 9 ++- net/netfilter/nfnetlink_queue_ct.c | 15 ++++ 6 files changed, 114 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 655d5d198d4..e2cf786be22 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -325,6 +325,8 @@ struct nfq_ct_hook { size_t (*build_size)(const struct nf_conn *ct); int (*build)(struct sk_buff *skb, struct nf_conn *ct); int (*parse)(const struct nlattr *attr, struct nf_conn *ct); + int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct, + u32 portid, u32 report); }; extern struct nfq_ct_hook __rcu *nfq_ct_hook; diff --git a/include/net/netfilter/nfnetlink_queue.h b/include/net/netfilter/nfnetlink_queue.h index 86267a52951..aff88ba9139 100644 --- a/include/net/netfilter/nfnetlink_queue.h +++ b/include/net/netfilter/nfnetlink_queue.h @@ -15,6 +15,8 @@ int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo); void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int diff); +int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, + u32 portid, u32 report); #else inline struct nf_conn * nfqnl_ct_get(struct sk_buff *entskb, size_t *size, enum ip_conntrack_info *ctinfo) @@ -39,5 +41,11 @@ inline void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int diff) { } + +inline int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, + u32 portid, u32 report) +{ + return 0; +} #endif /* NF_CONNTRACK */ #endif diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h index 3a9b9214733..0132bad79de 100644 --- a/include/uapi/linux/netfilter/nfnetlink_queue.h +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h @@ -46,6 +46,7 @@ enum nfqnl_attr_type { NFQA_CT_INFO, /* enum ip_conntrack_info */ NFQA_CAP_LEN, /* __u32 length of captured packet */ NFQA_SKB_INFO, /* __u32 skb meta information */ + NFQA_EXP, /* nf_conntrack_netlink.h */ __NFQA_MAX }; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9aaa68bbbcd..fa61fea6323 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1987,6 +1987,27 @@ out: return err == -EAGAIN ? -ENOBUFS : err; } +static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { + [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, + [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, + [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, + [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [CTA_EXPECT_ID] = { .type = NLA_U32 }, + [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN - 1 }, + [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, + [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, + [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, + [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, + [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, +}; + +static struct nf_conntrack_expect * +ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, + struct nf_conntrack_helper *helper, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask); + #ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT static size_t ctnetlink_nfqueue_build_size(const struct nf_conn *ct) @@ -2127,10 +2148,69 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) return ret; } +static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda, + const struct nf_conn *ct, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask) +{ + int err; + + err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, + nf_ct_l3num(ct)); + if (err < 0) + return err; + + return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, + nf_ct_l3num(ct)); +} + +static int +ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, + u32 portid, u32 report) +{ + struct nlattr *cda[CTA_EXPECT_MAX+1]; + struct nf_conntrack_tuple tuple, mask; + struct nf_conntrack_helper *helper; + struct nf_conntrack_expect *exp; + int err; + + err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy); + if (err < 0) + return err; + + err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda, + ct, &tuple, &mask); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) + return -EOPNOTSUPP; + } + + exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, + helper, &tuple, &mask); + if (IS_ERR(exp)) + return PTR_ERR(exp); + + err = nf_ct_expect_related_report(exp, portid, report); + if (err < 0) { + nf_ct_expect_put(exp); + return err; + } + + return 0; +} + static struct nfq_ct_hook ctnetlink_nfqueue_hook = { .build_size = ctnetlink_nfqueue_build_size, .build = ctnetlink_nfqueue_build, .parse = ctnetlink_nfqueue_parse, + .attach_expect = ctnetlink_nfqueue_attach_expect, }; #endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */ @@ -2498,21 +2578,6 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, return err; } -static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { - [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, - [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, - [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, - [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, - [CTA_EXPECT_ID] = { .type = NLA_U32 }, - [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, - .len = NF_CT_HELPER_NAME_LEN - 1 }, - [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, - [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, - [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, - [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, - [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, -}; - static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index ec9de12aa48..e8c9f3bb779 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -859,6 +859,7 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, [NFQA_CT] = { .type = NLA_UNSPEC }, + [NFQA_EXP] = { .type = NLA_UNSPEC }, }; static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { @@ -987,8 +988,14 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, if (entry == NULL) return -ENOENT; - if (nfqa[NFQA_CT]) + if (nfqa[NFQA_CT]) { ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo); + if (ct && nfqa[NFQA_EXP]) { + nfqnl_attach_expect(ct, nfqa[NFQA_EXP], + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); + } + } if (nfqa[NFQA_PAYLOAD]) { u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c index ab61d66bc0b..be893039966 100644 --- a/net/netfilter/nfnetlink_queue_ct.c +++ b/net/netfilter/nfnetlink_queue_ct.c @@ -96,3 +96,18 @@ void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, if ((ct->status & IPS_NAT_MASK) && diff) nfq_nat_ct->seq_adjust(skb, ct, ctinfo, diff); } + +int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, + u32 portid, u32 report) +{ + struct nfq_ct_hook *nfq_ct; + + if (nf_ct_is_untracked(ct)) + return 0; + + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return -EOPNOTSUPP; + + return nfq_ct->attach_expect(attr, ct, portid, report); +} -- cgit v1.2.3-70-g09d2 From ebd8b934e23f45ad3fc8a5a28bc5a96741a6a106 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sat, 10 Aug 2013 15:22:58 -0700 Subject: pptp: fix byte order warnings Pptp driver has lots of byte order warnings from sparse. This was because the on-the-wire header is in network byte order (obviously) but the definition did not reflect that. Also, the address structure to user space actually put the call id in host order. Rather than break ABI compatibility, just acknowledge the existing design. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/ppp/pptp.c | 10 +++++----- include/uapi/linux/if_pppox.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 9785a3ac02b..6fa5ae00039 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -83,11 +83,11 @@ static const struct proto_ops pptp_ops; struct pptp_gre_header { u8 flags; u8 ver; - u16 protocol; - u16 payload_len; - u16 call_id; - u32 seq; - u32 ack; + __be16 protocol; + __be16 payload_len; + __be16 call_id; + __be32 seq; + __be32 ack; } __packed; static struct pppox_sock *lookup_chan(u16 call_id, __be32 s_addr) diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h index e36a4aecd31..e128769331b 100644 --- a/include/uapi/linux/if_pppox.h +++ b/include/uapi/linux/if_pppox.h @@ -46,7 +46,7 @@ struct pppoe_addr { * PPTP addressing definition */ struct pptp_addr { - __be16 call_id; + __u16 call_id; struct in_addr sin_addr; }; -- cgit v1.2.3-70-g09d2 From 1972b5b3a66f1b6a9df04cc91faf401354919262 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Tue, 25 Jun 2013 12:42:54 +0200 Subject: NFC: Document secure element addition/removal netlink events Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 8137dd8d2ad..40ada984cb7 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -71,6 +71,11 @@ * @NFC_CMD_DISABLE_SE: Disable the physical link to a specific secure element. * @NFC_CMD_FW_DOWNLOAD: Request to Load/flash firmware, or event to inform * that some firmware was loaded + * @NFC_EVENT_SE_ADDED: Event emitted when a new secure element is discovered. + * This typically will be sent whenever a new NFC controller with either + * an embedded SE or an UICC one connected to it through SWP. + * @NFC_EVENT_SE_REMOVED: Event emitted when a secure element is removed from + * the system, as a consequence of e.g. an NFC controller being unplugged. */ enum nfc_commands { NFC_CMD_UNSPEC, -- cgit v1.2.3-70-g09d2 From 91a32269e31282405db405b9ec9ce1a30568e4b0 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Tue, 25 Jun 2013 16:22:08 +0200 Subject: NFC: Define secure element connectivity and transaction events The SE_CONNECTIVITY event is for an SE to request connection to e.g. a modem. The SE_TRANSACTION one is sent when an application running on a specific SE wants to notify the host CPU about the end of a transaction. Those events respectively map to the EVT_CONNECTIVITY and the EVT_TRANSACTION HCI events. Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 40ada984cb7..539d60494c0 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -76,6 +76,14 @@ * an embedded SE or an UICC one connected to it through SWP. * @NFC_EVENT_SE_REMOVED: Event emitted when a secure element is removed from * the system, as a consequence of e.g. an NFC controller being unplugged. + * @NFC_EVENT_SE_CONNECTIVITY: This event is emitted whenever a secure element + * is requesting connectivity access. For example a UICC SE may need to + * talk with a sleeping modem and will notify this need by sending this + * event. It is then up to userspace to decide if it will wake the modem + * up or not. + * @NFC_EVENT_SE_TRANSACTION: This event is sent when an application running on + * a specific SE notifies us about the end of a transaction. The parameter + * for this event is the application ID (AID). */ enum nfc_commands { NFC_CMD_UNSPEC, @@ -102,6 +110,8 @@ enum nfc_commands { NFC_CMD_FW_DOWNLOAD, NFC_EVENT_SE_ADDED, NFC_EVENT_SE_REMOVED, + NFC_EVENT_SE_CONNECTIVITY, + NFC_EVENT_SE_TRANSACTION, /* private: internal use only */ __NFC_CMD_AFTER_LAST }; @@ -159,6 +169,7 @@ enum nfc_attrs { NFC_ATTR_FIRMWARE_NAME, NFC_ATTR_SE_INDEX, NFC_ATTR_SE_TYPE, + NFC_ATTR_SE_AID, /* private: internal use only */ __NFC_ATTR_AFTER_LAST }; -- cgit v1.2.3-70-g09d2 From ac22ac466a659f1b2e02a2e2ee23fc5c42da2c95 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Wed, 24 Jul 2013 18:10:50 +0200 Subject: NFC: Add a GET_SE netlink API In order to fetch the discovered secure elements from an NFC controller, we need to send a netlink command that will dump the list of available SEs from NFC. Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 2 ++ net/nfc/netlink.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 539d60494c0..029921b067f 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -84,6 +84,7 @@ * @NFC_EVENT_SE_TRANSACTION: This event is sent when an application running on * a specific SE notifies us about the end of a transaction. The parameter * for this event is the application ID (AID). + * @NFC_CMD_GET_SE: Dump all discovered secure elements from an NFC controller. */ enum nfc_commands { NFC_CMD_UNSPEC, @@ -112,6 +113,7 @@ enum nfc_commands { NFC_EVENT_SE_REMOVED, NFC_EVENT_SE_CONNECTIVITY, NFC_EVENT_SE_TRANSACTION, + NFC_CMD_GET_SE, /* private: internal use only */ __NFC_CMD_AFTER_LAST }; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index f16fd59d416..3b08ef90e04 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1191,6 +1191,91 @@ static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info) return rc; } +static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, + u32 portid, u32 seq, + struct netlink_callback *cb, + int flags) +{ + void *hdr; + struct nfc_se *se, *n; + + list_for_each_entry_safe(se, n, &dev->secure_elements, list) { + hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, + NFC_CMD_GET_SE); + if (!hdr) + goto nla_put_failure; + + if (cb) + genl_dump_check_consistent(cb, hdr, &nfc_genl_family); + + if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || + nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) || + nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) + goto nla_put_failure; + + if (genlmsg_end(msg, hdr) < 0) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int nfc_genl_dump_ses(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; + struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; + bool first_call = false; + + if (!iter) { + first_call = true; + iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + cb->args[0] = (long) iter; + } + + mutex_lock(&nfc_devlist_mutex); + + cb->seq = nfc_devlist_generation; + + if (first_call) { + nfc_device_iter_init(iter); + dev = nfc_device_iter_next(iter); + } + + while (dev) { + int rc; + + rc = nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); + if (rc < 0) + break; + + dev = nfc_device_iter_next(iter); + } + + mutex_unlock(&nfc_devlist_mutex); + + cb->args[1] = (long) dev; + + return skb->len; +} + +static int nfc_genl_dump_ses_done(struct netlink_callback *cb) +{ + struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; + + nfc_device_iter_exit(iter); + kfree(iter); + + return 0; +} + static struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, @@ -1265,6 +1350,12 @@ static struct genl_ops nfc_genl_ops[] = { .doit = nfc_genl_disable_se, .policy = nfc_genl_policy, }, + { + .cmd = NFC_CMD_GET_SE, + .dumpit = nfc_genl_dump_ses, + .done = nfc_genl_dump_ses_done, + .policy = nfc_genl_policy, + }, }; -- cgit v1.2.3-70-g09d2 From ef04158e13e827315680cf8449d9af3bd8dc6280 Mon Sep 17 00:00:00 2001 From: Eric Lapuyade Date: Fri, 19 Jul 2013 14:56:08 +0200 Subject: NFC: Move nfc_fw_download_done() definition from private to public This API must be called by NFC drivers, and its prototype was incorrectly placed. Signed-off-by: Eric Lapuyade Signed-off-by: Samuel Ortiz --- include/net/nfc/nfc.h | 2 ++ net/nfc/nfc.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 5f286b726bb..10059556058 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -224,6 +224,8 @@ int nfc_set_remote_general_bytes(struct nfc_dev *dev, u8 *gt, u8 gt_len); u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len); +int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name); + int nfc_targets_found(struct nfc_dev *dev, struct nfc_target *targets, int ntargets); int nfc_target_lost(struct nfc_dev *dev, u32 target_idx); diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index 820a7850c36..4e2e5a787c4 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -126,8 +126,6 @@ static inline void nfc_device_iter_exit(struct class_dev_iter *iter) int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name); int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name); -int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name); - int nfc_dev_up(struct nfc_dev *dev); int nfc_dev_down(struct nfc_dev *dev); -- cgit v1.2.3-70-g09d2 From 352a5f5fb3ad8f829cfd4248fe6119895bda881f Mon Sep 17 00:00:00 2001 From: Eric Lapuyade Date: Fri, 19 Jul 2013 14:57:55 +0200 Subject: NFC: netlink: Add result of firmware operation to completion event Result is added as an NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS attribute containing the standard errno positive value of the completion result. This event will be sent when the firmare download operation is done and will contain the operation result. Signed-off-by: Eric Lapuyade Signed-off-by: Samuel Ortiz --- include/net/nfc/nfc.h | 3 ++- include/uapi/linux/nfc.h | 2 ++ net/nfc/core.c | 12 ++++++++++-- net/nfc/netlink.c | 4 +++- net/nfc/nfc.h | 3 ++- 5 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 10059556058..f68ee68e4e3 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -224,7 +224,8 @@ int nfc_set_remote_general_bytes(struct nfc_dev *dev, u8 *gt, u8 gt_len); u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len); -int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name); +int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name, + u32 result); int nfc_targets_found(struct nfc_dev *dev, struct nfc_target *targets, int ntargets); diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 029921b067f..29bed72a4ac 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -146,6 +146,7 @@ enum nfc_commands { * @NFC_ATTR_FIRMWARE_NAME: Free format firmware version * @NFC_ATTR_SE_INDEX: Secure element index * @NFC_ATTR_SE_TYPE: Secure element type (UICC or EMBEDDED) + * @NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS: Firmware download operation status */ enum nfc_attrs { NFC_ATTR_UNSPEC, @@ -172,6 +173,7 @@ enum nfc_attrs { NFC_ATTR_SE_INDEX, NFC_ATTR_SE_TYPE, NFC_ATTR_SE_AID, + NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, /* private: internal use only */ __NFC_ATTR_AFTER_LAST }; diff --git a/net/nfc/core.c b/net/nfc/core.c index aad7f8f5978..d252912b8de 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -77,11 +77,19 @@ error: return rc; } -int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name) +/** + * nfc_fw_download_done - inform that a firmware download was completed + * + * @dev: The nfc device to which firmware was downloaded + * @firmware_name: The firmware filename + * @result: The positive value of a standard errno value + */ +int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name, + u32 result) { dev->fw_download_in_progress = false; - return nfc_genl_fw_download_done(dev, firmware_name); + return nfc_genl_fw_download_done(dev, firmware_name, result); } EXPORT_SYMBOL(nfc_fw_download_done); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 3b08ef90e04..68063b2025d 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1114,7 +1114,8 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) return rc; } -int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name) +int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, + u32 result) { struct sk_buff *msg; void *hdr; @@ -1129,6 +1130,7 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name) goto free_msg; if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) || + nla_put_u32(msg, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index 4e2e5a787c4..aaf606fc1fa 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -124,7 +124,8 @@ static inline void nfc_device_iter_exit(struct class_dev_iter *iter) } int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name); -int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name); +int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, + u32 result); int nfc_dev_up(struct nfc_dev *dev); -- cgit v1.2.3-70-g09d2 From fc4eba58b4c1462ff3d6247b66fb47d6928db6d2 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 14 Aug 2013 01:03:46 +0200 Subject: ipv6: make unsolicited report intervals configurable for mld Commit cab70040dfd95ee32144f02fade64f0cb94f31a0 ("net: igmp: Reduce Unsolicited report interval to 1s when using IGMPv3") and 2690048c01f32bf45d1c1e1ab3079bc10ad2aea7 ("net: igmp: Allow user-space configuration of igmp unsolicited report interval") by William Manley made igmp unsolicited report intervals configurable per interface and corrected the interval of unsolicited igmpv3 report messages resendings to 1s. Same needs to be done for IPv6: MLDv1 (RFC2710 7.10.): 10 seconds MLDv2 (RFC3810 9.11.): 1 second Both intervals are configurable via new procfs knobs mldv1_unsolicited_report_interval and mldv2_unsolicited_report_interval. (also added .force_mld_version to ipv6_devconf_dflt to bring structs in line without semantic changes) v2: a) Joined documentation update for IPv4 and IPv6 MLD/IGMP unsolicited_report_interval procfs knobs. b) incorporate stylistic feedback from William Manley v3: a) add new DEVCONF_* values to the end of the enum (thanks to David Miller) Cc: Cong Wang Cc: William Manley Cc: Benjamin LaHaise Cc: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 18 ++++++++++++++++++ include/linux/ipv6.h | 2 ++ include/uapi/linux/ipv6.h | 2 ++ net/ipv6/addrconf.c | 25 +++++++++++++++++++++++++ net/ipv6/mcast.c | 17 ++++++++++++++--- 5 files changed, 61 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 36be26b2ef7..debfe857d8f 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1039,7 +1039,15 @@ disable_policy - BOOLEAN disable_xfrm - BOOLEAN Disable IPSEC encryption on this interface, whatever the policy +igmpv2_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + IGMPv1 or IGMPv2 report retransmit will take place. + Default: 10000 (10 seconds) +igmpv3_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + IGMPv3 report retransmit will take place. + Default: 1000 (1 seconds) tag - INTEGER Allows you to write a number, which can be used as required. @@ -1331,6 +1339,16 @@ ndisc_notify - BOOLEAN 1 - Generate unsolicited neighbour advertisements when device is brought up or hardware address changes. +mldv1_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + MLDv1 report retransmit will take place. + Default: 10000 (10 seconds) + +mldv2_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + MLDv2 report retransmit will take place. + Default: 1000 (1 second) + icmp/*: ratelimit - INTEGER Limit the maximal rates for sending ICMPv6 packets. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 850e95bc766..77a478462d8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -19,6 +19,8 @@ struct ipv6_devconf { __s32 rtr_solicit_interval; __s32 rtr_solicit_delay; __s32 force_mld_version; + __s32 mldv1_unsolicited_report_interval; + __s32 mldv2_unsolicited_report_interval; #ifdef CONFIG_IPV6_PRIVACY __s32 use_tempaddr; __s32 temp_valid_lft; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 4bda4cf5b0f..d07ac6903e5 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -160,6 +160,8 @@ enum { DEVCONF_ACCEPT_DAD, DEVCONF_FORCE_TLLAO, DEVCONF_NDISC_NOTIFY, + DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7fd8572bac8..ad12f7c43f2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -177,6 +177,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_redirects = 1, .autoconf = 1, .force_mld_version = 0, + .mldv1_unsolicited_report_interval = 10 * HZ, + .mldv2_unsolicited_report_interval = HZ, .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, @@ -211,6 +213,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_ra = 1, .accept_redirects = 1, .autoconf = 1, + .force_mld_version = 0, + .mldv1_unsolicited_report_interval = 10 * HZ, + .mldv2_unsolicited_report_interval = HZ, .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, @@ -4179,6 +4184,10 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_SOLICIT_DELAY] = jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; + array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] = + jiffies_to_msecs(cnf->mldv1_unsolicited_report_interval); + array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] = + jiffies_to_msecs(cnf->mldv2_unsolicited_report_interval); #ifdef CONFIG_IPV6_PRIVACY array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; @@ -4862,6 +4871,22 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "mldv1_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv1_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "mldv2_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv2_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, #ifdef CONFIG_IPV6_PRIVACY { .procname = "use_tempaddr", diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index db25b8eb62b..6c76df9909b 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -108,7 +108,6 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); -#define IGMP6_UNSOLICITED_IVAL (10*HZ) #define MLD_QRV_DEFAULT 2 #define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \ @@ -129,6 +128,18 @@ int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; pmc != NULL; \ pmc = rcu_dereference(pmc->next)) +static int unsolicited_report_interval(struct inet6_dev *idev) +{ + int iv; + + if (MLD_V1_SEEN(idev)) + iv = idev->cnf.mldv1_unsolicited_report_interval; + else + iv = idev->cnf.mldv2_unsolicited_report_interval; + + return iv > 0 ? iv : 1; +} + int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct net_device *dev = NULL; @@ -2158,7 +2169,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); - delay = net_random() % IGMP6_UNSOLICITED_IVAL; + delay = net_random() % unsolicited_report_interval(ma->idev); spin_lock_bh(&ma->mca_lock); if (del_timer(&ma->mca_timer)) { @@ -2325,7 +2336,7 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire, (unsigned long)idev); idev->mc_qrv = MLD_QRV_DEFAULT; - idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL; + idev->mc_maxdelay = unsolicited_report_interval(idev); idev->mc_v1_seen = 0; write_unlock_bh(&idev->lock); } -- cgit v1.2.3-70-g09d2 From f0c03956ac40fdc4fbce7bed262ae74ac372e765 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 13 Aug 2013 15:05:38 +0200 Subject: netfilter: export xt_rpfilter.h to userland This file contains the API for the match "rpfilter", hence it should be exported to userland. Signed-off-by: Nicolas Dichtel Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/xt_rpfilter.h | 23 ----------------------- include/uapi/linux/netfilter/Kbuild | 1 + include/uapi/linux/netfilter/xt_rpfilter.h | 23 +++++++++++++++++++++++ 3 files changed, 24 insertions(+), 23 deletions(-) delete mode 100644 include/linux/netfilter/xt_rpfilter.h create mode 100644 include/uapi/linux/netfilter/xt_rpfilter.h (limited to 'include') diff --git a/include/linux/netfilter/xt_rpfilter.h b/include/linux/netfilter/xt_rpfilter.h deleted file mode 100644 index 8358d4f7195..00000000000 --- a/include/linux/netfilter/xt_rpfilter.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _XT_RPATH_H -#define _XT_RPATH_H - -#include - -enum { - XT_RPFILTER_LOOSE = 1 << 0, - XT_RPFILTER_VALID_MARK = 1 << 1, - XT_RPFILTER_ACCEPT_LOCAL = 1 << 2, - XT_RPFILTER_INVERT = 1 << 3, -#ifdef __KERNEL__ - XT_RPFILTER_OPTION_MASK = XT_RPFILTER_LOOSE | - XT_RPFILTER_VALID_MARK | - XT_RPFILTER_ACCEPT_LOCAL | - XT_RPFILTER_INVERT, -#endif -}; - -struct xt_rpfilter_info { - __u8 flags; -}; - -#endif diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild index 41115776d76..dc00927ffd6 100644 --- a/include/uapi/linux/netfilter/Kbuild +++ b/include/uapi/linux/netfilter/Kbuild @@ -68,6 +68,7 @@ header-y += xt_quota.h header-y += xt_rateest.h header-y += xt_realm.h header-y += xt_recent.h +header-y += xt_rpfilter.h header-y += xt_sctp.h header-y += xt_set.h header-y += xt_socket.h diff --git a/include/uapi/linux/netfilter/xt_rpfilter.h b/include/uapi/linux/netfilter/xt_rpfilter.h new file mode 100644 index 00000000000..8358d4f7195 --- /dev/null +++ b/include/uapi/linux/netfilter/xt_rpfilter.h @@ -0,0 +1,23 @@ +#ifndef _XT_RPATH_H +#define _XT_RPATH_H + +#include + +enum { + XT_RPFILTER_LOOSE = 1 << 0, + XT_RPFILTER_VALID_MARK = 1 << 1, + XT_RPFILTER_ACCEPT_LOCAL = 1 << 2, + XT_RPFILTER_INVERT = 1 << 3, +#ifdef __KERNEL__ + XT_RPFILTER_OPTION_MASK = XT_RPFILTER_LOOSE | + XT_RPFILTER_VALID_MARK | + XT_RPFILTER_ACCEPT_LOCAL | + XT_RPFILTER_INVERT, +#endif +}; + +struct xt_rpfilter_info { + __u8 flags; +}; + +#endif -- cgit v1.2.3-70-g09d2 From 38c67328ac79cb9eaf61b5d4750fe3b9cff0dd15 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 13 Aug 2013 15:05:39 +0200 Subject: netfilter: export xt_HMARK.h to userland This file contains the API for the target "HMARK", hence it should be exported to userland. Signed-off-by: Nicolas Dichtel Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/xt_HMARK.h | 50 --------------------------------- include/uapi/linux/netfilter/Kbuild | 1 + include/uapi/linux/netfilter/xt_HMARK.h | 50 +++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 50 deletions(-) delete mode 100644 include/linux/netfilter/xt_HMARK.h create mode 100644 include/uapi/linux/netfilter/xt_HMARK.h (limited to 'include') diff --git a/include/linux/netfilter/xt_HMARK.h b/include/linux/netfilter/xt_HMARK.h deleted file mode 100644 index 826fc580757..00000000000 --- a/include/linux/netfilter/xt_HMARK.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef XT_HMARK_H_ -#define XT_HMARK_H_ - -#include - -enum { - XT_HMARK_SADDR_MASK, - XT_HMARK_DADDR_MASK, - XT_HMARK_SPI, - XT_HMARK_SPI_MASK, - XT_HMARK_SPORT, - XT_HMARK_DPORT, - XT_HMARK_SPORT_MASK, - XT_HMARK_DPORT_MASK, - XT_HMARK_PROTO_MASK, - XT_HMARK_RND, - XT_HMARK_MODULUS, - XT_HMARK_OFFSET, - XT_HMARK_CT, - XT_HMARK_METHOD_L3, - XT_HMARK_METHOD_L3_4, -}; -#define XT_HMARK_FLAG(flag) (1 << flag) - -union hmark_ports { - struct { - __u16 src; - __u16 dst; - } p16; - struct { - __be16 src; - __be16 dst; - } b16; - __u32 v32; - __be32 b32; -}; - -struct xt_hmark_info { - union nf_inet_addr src_mask; - union nf_inet_addr dst_mask; - union hmark_ports port_mask; - union hmark_ports port_set; - __u32 flags; - __u16 proto_mask; - __u32 hashrnd; - __u32 hmodulus; - __u32 hoffset; /* Mark offset to start from */ -}; - -#endif /* XT_HMARK_H_ */ diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild index dc00927ffd6..174915420d3 100644 --- a/include/uapi/linux/netfilter/Kbuild +++ b/include/uapi/linux/netfilter/Kbuild @@ -22,6 +22,7 @@ header-y += xt_CONNMARK.h header-y += xt_CONNSECMARK.h header-y += xt_CT.h header-y += xt_DSCP.h +header-y += xt_HMARK.h header-y += xt_IDLETIMER.h header-y += xt_LED.h header-y += xt_LOG.h diff --git a/include/uapi/linux/netfilter/xt_HMARK.h b/include/uapi/linux/netfilter/xt_HMARK.h new file mode 100644 index 00000000000..826fc580757 --- /dev/null +++ b/include/uapi/linux/netfilter/xt_HMARK.h @@ -0,0 +1,50 @@ +#ifndef XT_HMARK_H_ +#define XT_HMARK_H_ + +#include + +enum { + XT_HMARK_SADDR_MASK, + XT_HMARK_DADDR_MASK, + XT_HMARK_SPI, + XT_HMARK_SPI_MASK, + XT_HMARK_SPORT, + XT_HMARK_DPORT, + XT_HMARK_SPORT_MASK, + XT_HMARK_DPORT_MASK, + XT_HMARK_PROTO_MASK, + XT_HMARK_RND, + XT_HMARK_MODULUS, + XT_HMARK_OFFSET, + XT_HMARK_CT, + XT_HMARK_METHOD_L3, + XT_HMARK_METHOD_L3_4, +}; +#define XT_HMARK_FLAG(flag) (1 << flag) + +union hmark_ports { + struct { + __u16 src; + __u16 dst; + } p16; + struct { + __be16 src; + __be16 dst; + } b16; + __u32 v32; + __be32 b32; +}; + +struct xt_hmark_info { + union nf_inet_addr src_mask; + union nf_inet_addr dst_mask; + union hmark_ports port_mask; + union hmark_ports port_set; + __u32 flags; + __u16 proto_mask; + __u32 hashrnd; + __u32 hmodulus; + __u32 hoffset; /* Mark offset to start from */ +}; + +#endif /* XT_HMARK_H_ */ -- cgit v1.2.3-70-g09d2 From 6c742e714d8c282fd8f8b22d3e20b5141738c1ee Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 13 Aug 2013 17:51:11 +0200 Subject: ipip: add x-netns support This patch allows to switch the netns when packet is encapsulated or decapsulated. In other word, the encapsulated packet is received in a netns, where the lookup is done to find the tunnel. Once the tunnel is found, the packet is decapsulated and injecting into the corresponding interface which stands to another netns. When one of the two netns is removed, the tunnel is destroyed. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 2 +- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_tunnel.c | 43 ++++++++++++++++++++++++++++--------------- net/ipv4/ip_vti.c | 2 +- net/ipv4/ipip.c | 3 +-- 5 files changed, 33 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index c6acd9f8f87..5a76f2bef82 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -102,7 +102,7 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); -void ip_tunnel_delete_net(struct ip_tunnel_net *itn); +void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1f6eab66f7c..bc3a76521de 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -534,7 +534,7 @@ static int __net_init ipgre_init_net(struct net *net) static void __net_exit ipgre_exit_net(struct net *net) { struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); - ip_tunnel_delete_net(itn); + ip_tunnel_delete_net(itn, &ipgre_link_ops); } static struct pernet_operations ipgre_net_ops = { @@ -767,7 +767,7 @@ static int __net_init ipgre_tap_init_net(struct net *net) static void __net_exit ipgre_tap_exit_net(struct net *net) { struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); - ip_tunnel_delete_net(itn); + ip_tunnel_delete_net(itn, &ipgre_tap_ops); } static struct pernet_operations ipgre_tap_net_ops = { diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index a351a003ee6..a4d9126c7b5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -350,7 +350,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - rt = ip_route_output_tunnel(dev_net(dev), &fl4, + rt = ip_route_output_tunnel(tunnel->net, &fl4, tunnel->parms.iph.protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, @@ -365,7 +365,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); + tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev) { hlen = tdev->hard_header_len + tdev->needed_headroom; @@ -654,7 +654,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } } - err = iptunnel_xmit(dev_net(dev), rt, skb, + err = iptunnel_xmit(tunnel->net, rt, skb, fl4.saddr, fl4.daddr, protocol, ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); @@ -821,11 +821,10 @@ static void ip_tunnel_dev_free(struct net_device *dev) void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) { - struct net *net = dev_net(dev); struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_net *itn; - itn = net_generic(net, tunnel->ip_tnl_net_id); + itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); if (itn->fb_tunnel_dev != dev) { ip_tunnel_del(netdev_priv(dev)); @@ -855,6 +854,10 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, rtnl_lock(); itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; rtnl_unlock(); if (IS_ERR(itn->fb_tunnel_dev)) @@ -864,28 +867,39 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head) +static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, + struct rtnl_link_ops *ops) { + struct net *net = dev_net(itn->fb_tunnel_dev); + struct net_device *dev, *aux; int h; + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == ops) + unregister_netdevice_queue(dev, head); + for (h = 0; h < IP_TNL_HASH_SIZE; h++) { struct ip_tunnel *t; struct hlist_node *n; struct hlist_head *thead = &itn->tunnels[h]; hlist_for_each_entry_safe(t, n, thead, hash_node) - unregister_netdevice_queue(t->dev, head); + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, head); } if (itn->fb_tunnel_dev) unregister_netdevice_queue(itn->fb_tunnel_dev, head); } -void ip_tunnel_delete_net(struct ip_tunnel_net *itn) +void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) { LIST_HEAD(list); rtnl_lock(); - ip_tunnel_destroy(itn, &list); + ip_tunnel_destroy(itn, &list, ops); unregister_netdevice_many(&list); rtnl_unlock(); } @@ -929,23 +943,21 @@ EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p) { - struct ip_tunnel *t, *nt; - struct net *net = dev_net(dev); + struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); + struct net *net = tunnel->net; struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); if (dev == itn->fb_tunnel_dev) return -EINVAL; - nt = netdev_priv(dev); - t = ip_tunnel_find(itn, p, dev->type); if (t) { if (t->dev != dev) return -EEXIST; } else { - t = nt; + t = tunnel; if (dev->type != ARPHRD_ETHER) { unsigned int nflags = 0; @@ -984,6 +996,7 @@ int ip_tunnel_init(struct net_device *dev) } tunnel->dev = dev; + tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); iph->version = 4; iph->ihl = 5; @@ -994,8 +1007,8 @@ EXPORT_SYMBOL_GPL(ip_tunnel_init); void ip_tunnel_uninit(struct net_device *dev) { - struct net *net = dev_net(dev); struct ip_tunnel *tunnel = netdev_priv(dev); + struct net *net = tunnel->net; struct ip_tunnel_net *itn; itn = net_generic(net, tunnel->ip_tnl_net_id); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 79b263da416..e805e7b3030 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -318,7 +318,7 @@ static int __net_init vti_init_net(struct net *net) static void __net_exit vti_exit_net(struct net *net) { struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - ip_tunnel_delete_net(itn); + ip_tunnel_delete_net(itn, &vti_link_ops); } static struct pernet_operations vti_net_ops = { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 51fc2a1dcdd..87bd2952c73 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -286,7 +286,6 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; - dev->features |= NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; @@ -437,7 +436,7 @@ static int __net_init ipip_init_net(struct net *net) static void __net_exit ipip_exit_net(struct net *net) { struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); - ip_tunnel_delete_net(itn); + ip_tunnel_delete_net(itn, &ipip_link_ops); } static struct pernet_operations ipip_net_ops = { -- cgit v1.2.3-70-g09d2 From 0bd8762824e73a3cce7b7560a97463301764b616 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 13 Aug 2013 17:51:12 +0200 Subject: ip6tnl: add x-netns support This patch allows to switch the netns when packet is encapsulated or decapsulated. In other word, the encapsulated packet is received in a netns, where the lookup is done to find the tunnel. Once the tunnel is found, the packet is decapsulated and injecting into the corresponding interface which stands to another netns. When one of the two netns is removed, the tunnel is destroyed. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 1 + net/ipv6/ip6_gre.c | 5 +++++ net/ipv6/ip6_tunnel.c | 41 +++++++++++++++++++++++++++++++---------- 3 files changed, 37 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 4da5de10d1d..2265b0bf97e 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -36,6 +36,7 @@ struct __ip6_tnl_parm { struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ struct net_device *dev; /* virtual device associated with tunnel */ + struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ struct dst_entry *dst_cache; /* cached dst */ diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index ecd60733e5e..f2d0a42f805 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -335,6 +335,7 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, dev->rtnl_link_ops = &ip6gre_link_ops; nt->dev = dev; + nt->net = dev_net(dev); ip6gre_tnl_link_config(nt, 1); if (register_netdevice(dev) < 0) @@ -1255,6 +1256,7 @@ static int ip6gre_tunnel_init(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; + tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); @@ -1275,6 +1277,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) struct ip6_tnl *tunnel = netdev_priv(dev); tunnel->dev = dev; + tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); tunnel->hlen = sizeof(struct ipv6hdr) + 4; @@ -1450,6 +1453,7 @@ static int ip6gre_tap_init(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; + tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ip6gre_tnl_link_config(tunnel, 1); @@ -1501,6 +1505,7 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, eth_hw_addr_random(dev); nt->dev = dev; + nt->net = dev_net(dev); ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); /* Can use a lockless transmit, unless we generate output sequences */ diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 1e55866cead..cc3bb201b8b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -315,6 +315,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) t = netdev_priv(dev); t->parms = *p; + t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; @@ -374,7 +375,7 @@ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); + struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) @@ -741,7 +742,7 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; - struct net *net = dev_net(t->dev); + struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && @@ -827,6 +828,9 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, tstats->rx_packets++; tstats->rx_bytes += skb->len; + if (!net_eq(t->net, dev_net(t->dev))) + skb_scrub_packet(skb); + netif_rx(skb); rcu_read_unlock(); @@ -895,7 +899,7 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; - struct net *net = dev_net(t->dev); + struct net *net = t->net; if (p->flags & IP6_TNL_F_CAP_XMIT) { struct net_device *ldev = NULL; @@ -945,8 +949,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, int encap_limit, __u32 *pmtu) { - struct net *net = dev_net(dev); struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; struct net_device_stats *stats = &t->dev->stats; struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct ipv6_tel_txoption opt; @@ -996,6 +1000,9 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, goto tx_err_dst_release; } + if (!net_eq(t->net, dev_net(dev))) + skb_scrub_packet(skb); + /* * Okay, now see if we can stuff it in the buffer as-is. */ @@ -1202,7 +1209,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); - struct rt6_info *rt = rt6_lookup(dev_net(dev), + struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, strict); @@ -1251,7 +1258,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { - struct net *net = dev_net(t->dev); + struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); int err; @@ -1463,7 +1470,6 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->mtu-=8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } @@ -1479,6 +1485,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); t->dev = dev; + t->net = dev_net(dev); dev->tstats = alloc_percpu(struct pcpu_tstats); if (!dev->tstats) return -ENOMEM; @@ -1596,9 +1603,9 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct ip6_tnl *t; + struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; - struct net *net = dev_net(dev); + struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) @@ -1699,14 +1706,24 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n) { + struct net *net = dev_net(ip6n->fb_tnl_dev); + struct net_device *dev, *aux; int h; struct ip6_tnl *t; LIST_HEAD(list); + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &ip6_link_ops) + unregister_netdevice_queue(dev, &list); + for (h = 0; h < HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t != NULL) { - unregister_netdevice_queue(t->dev, &list); + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, &list); t = rtnl_dereference(t->next); } } @@ -1732,6 +1749,10 @@ static int __net_init ip6_tnl_init_net(struct net *net) if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) -- cgit v1.2.3-70-g09d2 From b05e92545d9582be15699e4a33d0f93ac00b37dd Mon Sep 17 00:00:00 2001 From: Franky Lin Date: Sat, 10 Aug 2013 12:27:26 +0200 Subject: brcmfmac: abstract tx packet processing functions Abstract brcmf_sdio_txpkt_prep and brcmf_sdio_txpkt_postp as a preparation of chained tx packets for host side tx glomming. Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Arend van Spriel Signed-off-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: John W. Linville --- drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c | 16 +- drivers/net/wireless/brcm80211/brcmfmac/dhd_sdio.c | 237 +++++++++++++++------ .../net/wireless/brcm80211/brcmfmac/sdio_host.h | 2 +- include/linux/platform_data/brcmfmac-sdio.h | 6 + 4 files changed, 183 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c index e3f3c48f86d..e13b1a65c65 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c @@ -592,6 +592,7 @@ brcmf_sdcard_send_buf(struct brcmf_sdio_dev *sdiodev, u32 addr, uint fn, uint flags, u8 *buf, uint nbytes) { struct sk_buff *mypkt; + struct sk_buff_head pktq; int err; mypkt = brcmu_pkt_buf_get_skb(nbytes); @@ -602,7 +603,10 @@ brcmf_sdcard_send_buf(struct brcmf_sdio_dev *sdiodev, u32 addr, uint fn, } memcpy(mypkt->data, buf, nbytes); - err = brcmf_sdcard_send_pkt(sdiodev, addr, fn, flags, mypkt); + __skb_queue_head_init(&pktq); + __skb_queue_tail(&pktq, mypkt); + err = brcmf_sdcard_send_pkt(sdiodev, addr, fn, flags, &pktq); + __skb_dequeue_tail(&pktq); brcmu_pkt_buf_free_skb(mypkt); return err; @@ -611,22 +615,18 @@ brcmf_sdcard_send_buf(struct brcmf_sdio_dev *sdiodev, u32 addr, uint fn, int brcmf_sdcard_send_pkt(struct brcmf_sdio_dev *sdiodev, u32 addr, uint fn, - uint flags, struct sk_buff *pkt) + uint flags, struct sk_buff_head *pktq) { uint width; int err = 0; - struct sk_buff_head pkt_list; brcmf_dbg(SDIO, "fun = %d, addr = 0x%x, size = %d\n", - fn, addr, pkt->len); + fn, addr, pktq->qlen); width = (flags & SDIO_REQ_4BYTE) ? 4 : 2; brcmf_sdio_addrprep(sdiodev, width, &addr); - skb_queue_head_init(&pkt_list); - skb_queue_tail(&pkt_list, pkt); - err = brcmf_sdio_buffrw(sdiodev, fn, true, addr, &pkt_list); - skb_dequeue_tail(&pkt_list); + err = brcmf_sdio_buffrw(sdiodev, fn, true, addr, pktq); return err; } diff --git a/drivers/net/wireless/brcm80211/brcmfmac/dhd_sdio.c b/drivers/net/wireless/brcm80211/brcmfmac/dhd_sdio.c index db31312eba6..aa4cacaf8b0 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/dhd_sdio.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/dhd_sdio.c @@ -510,7 +510,6 @@ struct brcmf_sdio { #ifdef DEBUG static int qcount[NUMPRIO]; -static int tx_packets[NUMPRIO]; #endif /* DEBUG */ #define DEFAULT_SDIO_DRIVE_STRENGTH 6 /* in milliamps */ @@ -1759,85 +1758,185 @@ brcmf_sdbrcm_wait_event_wakeup(struct brcmf_sdio *bus) return; } +/* flag marking a dummy skb added for DMA alignment requirement */ +#define DUMMY_SKB_FLAG 0x10000 +/* bit mask of data length chopped from the previous packet */ +#define DUMMY_SKB_CHOP_LEN_MASK 0xffff +/** + * brcmf_sdio_txpkt_prep - packet preparation for transmit + * @bus: brcmf_sdio structure pointer + * @pktq: packet list pointer + * @chan: virtual channel to transmit the packet + * + * Processes to be applied to the packet + * - Align data buffer pointer + * - Align data buffer length + * - Prepare header + * Return: negative value if there is error + */ +static int +brcmf_sdio_txpkt_prep(struct brcmf_sdio *bus, struct sk_buff_head *pktq, + uint chan) +{ + u16 head_pad, tail_pad, tail_chop, pkt_len; + u16 head_align, sg_align; + u32 sw_header; + int ntail; + struct sk_buff *pkt_next, *pkt_new; + u8 *dat_buf; + unsigned blksize = bus->sdiodev->func[SDIO_FUNC_2]->cur_blksize; + + /* SDIO ADMA requires at least 32 bit alignment */ + head_align = 4; + sg_align = 4; + if (bus->sdiodev->pdata) { + head_align = bus->sdiodev->pdata->sd_head_align > 4 ? + bus->sdiodev->pdata->sd_head_align : 4; + sg_align = bus->sdiodev->pdata->sd_sgentry_align > 4 ? + bus->sdiodev->pdata->sd_sgentry_align : 4; + } + /* sg entry alignment should be a divisor of block size */ + WARN_ON(blksize % sg_align); + + pkt_next = pktq->next; + dat_buf = (u8 *)(pkt_next->data); + + /* Check head padding */ + head_pad = ((unsigned long)dat_buf % head_align); + if (head_pad) { + if (skb_headroom(pkt_next) < head_pad) { + bus->sdiodev->bus_if->tx_realloc++; + head_pad = 0; + if (skb_cow(pkt_next, head_pad)) + return -ENOMEM; + } + skb_push(pkt_next, head_pad); + dat_buf = (u8 *)(pkt_next->data); + memset(dat_buf, 0, head_pad + SDPCM_HDRLEN); + } + + /* Check tail padding */ + pkt_new = NULL; + tail_chop = pkt_next->len % sg_align; + tail_pad = sg_align - tail_chop; + tail_pad += blksize - (pkt_next->len + tail_pad) % blksize; + if (skb_tailroom(pkt_next) < tail_pad && pkt_next->len > blksize) { + pkt_new = brcmu_pkt_buf_get_skb(tail_pad + tail_chop); + if (pkt_new == NULL) + return -ENOMEM; + memcpy(pkt_new->data, + pkt_next->data + pkt_next->len - tail_chop, + tail_chop); + *(u32 *)(pkt_new->cb) = DUMMY_SKB_FLAG + tail_chop; + skb_trim(pkt_next, pkt_next->len - tail_chop); + __skb_queue_after(pktq, pkt_next, pkt_new); + } else { + ntail = pkt_next->data_len + tail_pad - + (pkt_next->end - pkt_next->tail); + if (skb_cloned(pkt_next) || ntail > 0) + if (pskb_expand_head(pkt_next, 0, ntail, GFP_ATOMIC)) + return -ENOMEM; + if (skb_linearize(pkt_next)) + return -ENOMEM; + dat_buf = (u8 *)(pkt_next->data); + __skb_put(pkt_next, tail_pad); + } + + /* Now prep the header */ + /* 4 bytes hardware header (frame tag) + * Byte 0~1: Frame length + * Byte 2~3: Checksum, bit-wise inverse of frame length + */ + if (pkt_new) + pkt_len = pkt_next->len + tail_chop; + else + pkt_len = pkt_next->len - tail_pad; + *(__le16 *)dat_buf = cpu_to_le16(pkt_len); + *(((__le16 *)dat_buf) + 1) = cpu_to_le16(~pkt_len); + /* 8 bytes software header + * Byte 0: Tx sequence number + * Byte 1: 4 MSB Channel number + * Byte 2: Reserved + * Byte 3: Data offset + * Byte 4~7: Reserved + */ + sw_header = bus->tx_seq; + sw_header |= ((chan << SDPCM_CHANNEL_SHIFT) & SDPCM_CHANNEL_MASK); + sw_header |= ((head_pad + SDPCM_HDRLEN) << SDPCM_DOFFSET_SHIFT) & + SDPCM_DOFFSET_MASK; + *(((__le32 *)dat_buf) + 1) = cpu_to_le32(sw_header); + *(((__le32 *)dat_buf) + 2) = 0; + + if (BRCMF_BYTES_ON() && + ((BRCMF_CTL_ON() && chan == SDPCM_CONTROL_CHANNEL) || + (BRCMF_DATA_ON() && chan != SDPCM_CONTROL_CHANNEL))) + brcmf_dbg_hex_dump(true, pkt_next, pkt_len, "Tx Frame:\n"); + else if (BRCMF_HDRS_ON()) + brcmf_dbg_hex_dump(true, pkt_next, head_pad + SDPCM_HDRLEN, + "Tx Header:\n"); + + return 0; +} + +/** + * brcmf_sdio_txpkt_postp - packet post processing for transmit + * @bus: brcmf_sdio structure pointer + * @pktq: packet list pointer + * + * Processes to be applied to the packet + * - Remove head padding + * - Remove tail padding + */ +static void +brcmf_sdio_txpkt_postp(struct brcmf_sdio *bus, struct sk_buff_head *pktq) +{ + u8 *hdr; + u32 dat_offset; + u32 dummy_flags, chop_len; + struct sk_buff *pkt_next, *tmp, *pkt_prev; + + skb_queue_walk_safe(pktq, pkt_next, tmp) { + dummy_flags = *(u32 *)(pkt_next->cb); + if (dummy_flags & DUMMY_SKB_FLAG) { + chop_len = dummy_flags & DUMMY_SKB_CHOP_LEN_MASK; + if (chop_len) { + pkt_prev = pkt_next->prev; + memcpy(pkt_prev->data + pkt_prev->len, + pkt_next->data, chop_len); + skb_put(pkt_prev, chop_len); + } + __skb_unlink(pkt_next, pktq); + brcmu_pkt_buf_free_skb(pkt_next); + } else { + hdr = pkt_next->data + SDPCM_FRAMETAG_LEN; + dat_offset = le32_to_cpu(*(__le32 *)hdr); + dat_offset = (dat_offset & SDPCM_DOFFSET_MASK) >> + SDPCM_DOFFSET_SHIFT; + skb_pull(pkt_next, dat_offset); + } + } +} + /* Writes a HW/SW header into the packet and sends it. */ /* Assumes: (a) header space already there, (b) caller holds lock */ static int brcmf_sdbrcm_txpkt(struct brcmf_sdio *bus, struct sk_buff *pkt, uint chan) { int ret; - u8 *frame; - u16 len, pad = 0; - u32 swheader; int i; + struct sk_buff_head localq; brcmf_dbg(TRACE, "Enter\n"); - frame = (u8 *) (pkt->data); - - /* Add alignment padding, allocate new packet if needed */ - pad = ((unsigned long)frame % BRCMF_SDALIGN); - if (pad) { - if (skb_headroom(pkt) < pad) { - brcmf_dbg(INFO, "insufficient headroom %d for %d pad\n", - skb_headroom(pkt), pad); - bus->sdiodev->bus_if->tx_realloc++; - ret = skb_cow(pkt, BRCMF_SDALIGN); - if (ret) - goto done; - pad = ((unsigned long)frame % BRCMF_SDALIGN); - } - skb_push(pkt, pad); - frame = (u8 *) (pkt->data); - memset(frame, 0, pad + SDPCM_HDRLEN); - } - /* precondition: pad < BRCMF_SDALIGN */ - - /* Hardware tag: 2 byte len followed by 2 byte ~len check (all LE) */ - len = (u16) (pkt->len); - *(__le16 *) frame = cpu_to_le16(len); - *(((__le16 *) frame) + 1) = cpu_to_le16(~len); - - /* Software tag: channel, sequence number, data offset */ - swheader = - ((chan << SDPCM_CHANNEL_SHIFT) & SDPCM_CHANNEL_MASK) | bus->tx_seq | - (((pad + - SDPCM_HDRLEN) << SDPCM_DOFFSET_SHIFT) & SDPCM_DOFFSET_MASK); - - *(((__le32 *) frame) + 1) = cpu_to_le32(swheader); - *(((__le32 *) frame) + 2) = 0; - -#ifdef DEBUG - tx_packets[pkt->priority]++; -#endif - - brcmf_dbg_hex_dump(BRCMF_BYTES_ON() && - ((BRCMF_CTL_ON() && chan == SDPCM_CONTROL_CHANNEL) || - (BRCMF_DATA_ON() && chan != SDPCM_CONTROL_CHANNEL)), - frame, len, "Tx Frame:\n"); - brcmf_dbg_hex_dump(!(BRCMF_BYTES_ON() && - ((BRCMF_CTL_ON() && - chan == SDPCM_CONTROL_CHANNEL) || - (BRCMF_DATA_ON() && - chan != SDPCM_CONTROL_CHANNEL))) && - BRCMF_HDRS_ON(), - frame, min_t(u16, len, 16), "TxHdr:\n"); - - /* Raise len to next SDIO block to eliminate tail command */ - if (bus->roundup && bus->blocksize && (len > bus->blocksize)) { - u16 pad = bus->blocksize - (len % bus->blocksize); - if ((pad <= bus->roundup) && (pad < bus->blocksize)) - len += pad; - } else if (len % BRCMF_SDALIGN) { - len += BRCMF_SDALIGN - (len % BRCMF_SDALIGN); - } - - /* Some controllers have trouble with odd bytes -- round to even */ - if (len & (ALIGNMENT - 1)) - len = roundup(len, ALIGNMENT); + __skb_queue_head_init(&localq); + __skb_queue_tail(&localq, pkt); + ret = brcmf_sdio_txpkt_prep(bus, &localq, chan); + if (ret) + goto done; sdio_claim_host(bus->sdiodev->func[1]); ret = brcmf_sdcard_send_pkt(bus->sdiodev, bus->sdiodev->sbwad, - SDIO_FUNC_2, F2SYNC, pkt); + SDIO_FUNC_2, F2SYNC, &localq); bus->sdcnt.f2txdata++; if (ret < 0) { @@ -1868,8 +1967,8 @@ static int brcmf_sdbrcm_txpkt(struct brcmf_sdio *bus, struct sk_buff *pkt, bus->tx_seq = (bus->tx_seq + 1) % SDPCM_SEQUENCE_WRAP; done: - /* restore pkt buffer pointer before calling tx complete routine */ - skb_pull(pkt, SDPCM_HDRLEN + pad); + brcmf_sdio_txpkt_postp(bus, &localq); + __skb_dequeue_tail(&localq); brcmf_txcomplete(bus->sdiodev->dev, pkt, ret == 0); return ret; } diff --git a/drivers/net/wireless/brcm80211/brcmfmac/sdio_host.h b/drivers/net/wireless/brcm80211/brcmfmac/sdio_host.h index 09786a53995..2b5407f002e 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/sdio_host.h +++ b/drivers/net/wireless/brcm80211/brcmfmac/sdio_host.h @@ -208,7 +208,7 @@ extern int brcmf_sdio_regrw_helper(struct brcmf_sdio_dev *sdiodev, u32 addr, */ extern int brcmf_sdcard_send_pkt(struct brcmf_sdio_dev *sdiodev, u32 addr, uint fn, - uint flags, struct sk_buff *pkt); + uint flags, struct sk_buff_head *pktq); extern int brcmf_sdcard_send_buf(struct brcmf_sdio_dev *sdiodev, u32 addr, uint fn, uint flags, u8 *buf, uint nbytes); diff --git a/include/linux/platform_data/brcmfmac-sdio.h b/include/linux/platform_data/brcmfmac-sdio.h index b7174998c24..e75dcbf2b23 100644 --- a/include/linux/platform_data/brcmfmac-sdio.h +++ b/include/linux/platform_data/brcmfmac-sdio.h @@ -94,6 +94,10 @@ void __init brcmfmac_init_pdata(void) * Set this to true if the SDIO host controller has higher align requirement * than 32 bytes for each scatterlist item. * + * sd_head_align: alignment requirement for start of data buffer + * + * sd_sgentry_align: length alignment requirement for each sg entry + * * power_on: This function is called by the brcmfmac when the module gets * loaded. This can be particularly useful for low power devices. The platform * spcific routine may for example decide to power up the complete device. @@ -121,6 +125,8 @@ struct brcmfmac_sdio_platform_data { unsigned int oob_irq_nr; unsigned long oob_irq_flags; bool broken_sg_support; + unsigned short sd_head_align; + unsigned short sd_sgentry_align; void (*power_on)(void); void (*power_off)(void); void (*reset)(void); -- cgit v1.2.3-70-g09d2 From 27b3eb9c06a7193bdc9800cd00764a130343bc8a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 7 Aug 2013 20:11:55 +0200 Subject: mac80211: add APIs to allow keeping connections after WoWLAN In order to be able to (securely) keep connections alive after the system was suspended for WoWLAN, we need some additional APIs. We already have API (ieee80211_gtk_rekey_notify) to tell wpa_supplicant about the new replay counter if GTK rekeying was done by the device while the host was asleep, but that's not sufficient. If GTK rekeying wasn't done, we need to tell the host about sequence counters for the GTK (and PTK regardless of rekeying) that was used while asleep, add ieee80211_set_key_rx_seq() for that. If GTK rekeying was done, then we need to be able to disable the old keys (with ieee80211_remove_key()) and allocate the new GTK key(s) in mac80211 (with ieee80211_gtk_rekey_add()). If protocol offload (e.g. ARP) is implemented, then also the TX sequence counter for the PTK must be updated, using the new ieee80211_set_key_tx_seq() function. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 83 ++++++++++++++++++++++++++ net/mac80211/key.c | 154 ++++++++++++++++++++++++++++++++++++++++++++++--- net/mac80211/util.c | 2 +- 3 files changed, 231 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index df93c77c97a..e3e30377893 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3687,6 +3687,89 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq); +/** + * ieee80211_set_key_tx_seq - set key TX sequence counter + * + * @keyconf: the parameter passed with the set key + * @seq: new sequence data + * + * This function allows a driver to set the current TX IV/PNs for the + * given key. This is useful when resuming from WoWLAN sleep and the + * device may have transmitted frames using the PTK, e.g. replies to + * ARP requests. + * + * Note that this function may only be called when no TX processing + * can be done concurrently. + */ +void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf, + struct ieee80211_key_seq *seq); + +/** + * ieee80211_set_key_rx_seq - set key RX sequence counter + * + * @keyconf: the parameter passed with the set key + * @tid: The TID, or -1 for the management frame value (CCMP only); + * the value on TID 0 is also used for non-QoS frames. For + * CMAC, only TID 0 is valid. + * @seq: new sequence data + * + * This function allows a driver to set the current RX IV/PNs for the + * given key. This is useful when resuming from WoWLAN sleep and GTK + * rekey may have been done while suspended. It should not be called + * if IV checking is done by the device and not by mac80211. + * + * Note that this function may only be called when no RX processing + * can be done concurrently. + */ +void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, + int tid, struct ieee80211_key_seq *seq); + +/** + * ieee80211_remove_key - remove the given key + * @keyconf: the parameter passed with the set key + * + * Remove the given key. If the key was uploaded to the hardware at the + * time this function is called, it is not deleted in the hardware but + * instead assumed to have been removed already. + * + * Note that due to locking considerations this function can (currently) + * only be called during key iteration (ieee80211_iter_keys().) + */ +void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); + +/** + * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN + * @vif: the virtual interface to add the key on + * @keyconf: new key data + * + * When GTK rekeying was done while the system was suspended, (a) new + * key(s) will be available. These will be needed by mac80211 for proper + * RX processing, so this function allows setting them. + * + * The function returns the newly allocated key structure, which will + * have similar contents to the passed key configuration but point to + * mac80211-owned memory. In case of errors, the function returns an + * ERR_PTR(), use IS_ERR() etc. + * + * Note that this function assumes the key isn't added to hardware + * acceleration, so no TX will be done with the key. Since it's a GTK + * on managed (station) networks, this is true anyway. If the driver + * calls this function from the resume callback and subsequently uses + * the return code 1 to reconfigure the device, this key will be part + * of the reconfiguration. + * + * Note that the driver should also call ieee80211_set_key_rx_seq() + * for the new key for each TID to set up sequence counters properly. + * + * IMPORTANT: If this replaces a key that is present in the hardware, + * then it will attempt to remove it during this call. In many cases + * this isn't what you want, so call ieee80211_remove_key() first for + * the key that's being replaced. + */ +struct ieee80211_key_conf * +ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, + struct ieee80211_key_conf *keyconf); + /** * ieee80211_gtk_rekey_notify - notify userspace supplicant of rekeying * @vif: virtual interface the rekeying was done on diff --git a/net/mac80211/key.c b/net/mac80211/key.c index e39cc91d0cf..620677e897b 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -93,6 +93,9 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) might_sleep(); + if (key->flags & KEY_FLAG_TAINTED) + return -EINVAL; + if (!key->local->ops->set_key) goto out_unsupported; @@ -455,6 +458,7 @@ int ieee80211_key_link(struct ieee80211_key *key, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { + struct ieee80211_local *local = sdata->local; struct ieee80211_key *old_key; int idx, ret; bool pairwise; @@ -484,10 +488,13 @@ int ieee80211_key_link(struct ieee80211_key *key, ieee80211_debugfs_key_add(key); - ret = ieee80211_key_enable_hw_accel(key); - - if (ret) - ieee80211_key_free(key, true); + if (!local->wowlan) { + ret = ieee80211_key_enable_hw_accel(key); + if (ret) + ieee80211_key_free(key, true); + } else { + ret = 0; + } mutex_unlock(&sdata->local->key_mtx); @@ -540,7 +547,7 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw, void *iter_data) { struct ieee80211_local *local = hw_to_local(hw); - struct ieee80211_key *key; + struct ieee80211_key *key, *tmp; struct ieee80211_sub_if_data *sdata; ASSERT_RTNL(); @@ -548,13 +555,14 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw, mutex_lock(&local->key_mtx); if (vif) { sdata = vif_to_sdata(vif); - list_for_each_entry(key, &sdata->key_list, list) + list_for_each_entry_safe(key, tmp, &sdata->key_list, list) iter(hw, &sdata->vif, key->sta ? &key->sta->sta : NULL, &key->conf, iter_data); } else { list_for_each_entry(sdata, &local->interfaces, list) - list_for_each_entry(key, &sdata->key_list, list) + list_for_each_entry_safe(key, tmp, + &sdata->key_list, list) iter(hw, &sdata->vif, key->sta ? &key->sta->sta : NULL, &key->conf, iter_data); @@ -751,3 +759,135 @@ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, } } EXPORT_SYMBOL(ieee80211_get_key_rx_seq); + +void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf, + struct ieee80211_key_seq *seq) +{ + struct ieee80211_key *key; + u64 pn64; + + key = container_of(keyconf, struct ieee80211_key, conf); + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_TKIP: + key->u.tkip.tx.iv32 = seq->tkip.iv32; + key->u.tkip.tx.iv16 = seq->tkip.iv16; + break; + case WLAN_CIPHER_SUITE_CCMP: + pn64 = (u64)seq->ccmp.pn[5] | + ((u64)seq->ccmp.pn[4] << 8) | + ((u64)seq->ccmp.pn[3] << 16) | + ((u64)seq->ccmp.pn[2] << 24) | + ((u64)seq->ccmp.pn[1] << 32) | + ((u64)seq->ccmp.pn[0] << 40); + atomic64_set(&key->u.ccmp.tx_pn, pn64); + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + pn64 = (u64)seq->aes_cmac.pn[5] | + ((u64)seq->aes_cmac.pn[4] << 8) | + ((u64)seq->aes_cmac.pn[3] << 16) | + ((u64)seq->aes_cmac.pn[2] << 24) | + ((u64)seq->aes_cmac.pn[1] << 32) | + ((u64)seq->aes_cmac.pn[0] << 40); + atomic64_set(&key->u.aes_cmac.tx_pn, pn64); + break; + default: + WARN_ON(1); + break; + } +} +EXPORT_SYMBOL_GPL(ieee80211_set_key_tx_seq); + +void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, + int tid, struct ieee80211_key_seq *seq) +{ + struct ieee80211_key *key; + u8 *pn; + + key = container_of(keyconf, struct ieee80211_key, conf); + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_TKIP: + if (WARN_ON(tid < 0 || tid >= IEEE80211_NUM_TIDS)) + return; + key->u.tkip.rx[tid].iv32 = seq->tkip.iv32; + key->u.tkip.rx[tid].iv16 = seq->tkip.iv16; + break; + case WLAN_CIPHER_SUITE_CCMP: + if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS)) + return; + if (tid < 0) + pn = key->u.ccmp.rx_pn[IEEE80211_NUM_TIDS]; + else + pn = key->u.ccmp.rx_pn[tid]; + memcpy(pn, seq->ccmp.pn, IEEE80211_CCMP_PN_LEN); + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + if (WARN_ON(tid != 0)) + return; + pn = key->u.aes_cmac.rx_pn; + memcpy(pn, seq->aes_cmac.pn, IEEE80211_CMAC_PN_LEN); + break; + default: + WARN_ON(1); + break; + } +} +EXPORT_SYMBOL_GPL(ieee80211_set_key_rx_seq); + +void ieee80211_remove_key(struct ieee80211_key_conf *keyconf) +{ + struct ieee80211_key *key; + + key = container_of(keyconf, struct ieee80211_key, conf); + + assert_key_lock(key->local); + + /* + * if key was uploaded, we assume the driver will/has remove(d) + * it, so adjust bookkeeping accordingly + */ + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { + key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; + + if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || + (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) || + (key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))) + increment_tailroom_need_count(key->sdata); + } + + ieee80211_key_free(key, false); +} +EXPORT_SYMBOL_GPL(ieee80211_remove_key); + +struct ieee80211_key_conf * +ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, + struct ieee80211_key_conf *keyconf) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct ieee80211_key *key; + int err; + + if (WARN_ON(!local->wowlan)) + return ERR_PTR(-EINVAL); + + if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) + return ERR_PTR(-EINVAL); + + key = ieee80211_key_alloc(keyconf->cipher, keyconf->keyidx, + keyconf->keylen, keyconf->key, + 0, NULL); + if (IS_ERR(key)) + return ERR_PTR(PTR_ERR(key)); + + if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED) + key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; + + err = ieee80211_key_link(key, sdata, NULL); + if (err) + return ERR_PTR(err); + + return &key->conf; +} +EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_add); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index d23c5a705a6..e1b34a18b24 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1453,8 +1453,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) local->resuming = true; if (local->wowlan) { - local->wowlan = false; res = drv_resume(local); + local->wowlan = false; if (res < 0) { local->resuming = false; return res; -- cgit v1.2.3-70-g09d2 From 012a5729ff933ecd07e7470a65d84577aef9ae03 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 19 Aug 2013 11:23:07 -0700 Subject: vxlan: Extend vxlan handlers for openvswitch. Following patch adds data field to vxlan socket and export vxlan handler api. vh->data is required to store private data per vxlan handler. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 45 +++++++++++++++++++-------------------------- include/net/vxlan.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 26 deletions(-) create mode 100644 include/net/vxlan.h (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 236c445b193..aab927b92fb 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -41,6 +41,7 @@ #include #include #include +#include #define VXLAN_VERSION "0.1" @@ -83,20 +84,6 @@ static int vxlan_net_id; static const u8 all_zeros_mac[ETH_ALEN]; -struct vxlan_sock; -typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb, __be32 key); - -/* per UDP socket information */ -struct vxlan_sock { - vxlan_rcv_t *rcv; - struct hlist_node hlist; - struct rcu_head rcu; - struct work_struct del_work; - atomic_t refcnt; - struct socket *sock; - struct hlist_head vni_list[VNI_HASH_SIZE]; -}; - /* per-network namespace private data for this module */ struct vxlan_net { struct list_head vxlan_list; @@ -813,8 +800,10 @@ static void vxlan_sock_hold(struct vxlan_sock *vs) atomic_inc(&vs->refcnt); } -static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs) +void vxlan_sock_release(struct vxlan_sock *vs) { + struct vxlan_net *vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); + if (!atomic_dec_and_test(&vs->refcnt)) return; @@ -824,6 +813,7 @@ static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs) queue_work(vxlan_wq, &vs->del_work); } +EXPORT_SYMBOL_GPL(vxlan_sock_release); /* Callback to update multicast group membership when first VNI on * multicast asddress is brought up @@ -832,7 +822,6 @@ static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs) static void vxlan_igmp_join(struct work_struct *work) { struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join); - struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; struct ip_mreqn mreq = { @@ -844,7 +833,7 @@ static void vxlan_igmp_join(struct work_struct *work) ip_mc_join_group(sk, &mreq); release_sock(sk); - vxlan_sock_release(vn, vs); + vxlan_sock_release(vs); dev_put(vxlan->dev); } @@ -852,7 +841,6 @@ static void vxlan_igmp_join(struct work_struct *work) static void vxlan_igmp_leave(struct work_struct *work) { struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave); - struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; struct ip_mreqn mreq = { @@ -864,7 +852,7 @@ static void vxlan_igmp_leave(struct work_struct *work) ip_mc_leave_group(sk, &mreq); release_sock(sk); - vxlan_sock_release(vn, vs); + vxlan_sock_release(vs); dev_put(vxlan->dev); } @@ -1429,13 +1417,12 @@ static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan) static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; vxlan_fdb_delete_default(vxlan); if (vs) - vxlan_sock_release(vn, vs); + vxlan_sock_release(vs); free_percpu(dev->tstats); } @@ -1653,7 +1640,7 @@ static void vxlan_del_work(struct work_struct *work) } static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, - vxlan_rcv_t *rcv) + vxlan_rcv_t *rcv, void *data) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; @@ -1700,6 +1687,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, } atomic_set(&vs->refcnt, 1); vs->rcv = rcv; + vs->data = data; /* Disable multicast loopback */ inet_sk(sk)->mc_loop = 0; @@ -1714,16 +1702,20 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, return vs; } -static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, - vxlan_rcv_t *rcv) +struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, + vxlan_rcv_t *rcv, void *data, + bool no_share) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; - vs = vxlan_socket_create(net, port, rcv); + vs = vxlan_socket_create(net, port, rcv, data); if (!IS_ERR(vs)) return vs; + if (no_share) /* Return error if sharing is not allowed. */ + return vs; + spin_lock(&vn->sock_lock); vs = vxlan_find_sock(net, port); if (vs) { @@ -1739,6 +1731,7 @@ static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, return vs; } +EXPORT_SYMBOL_GPL(vxlan_sock_add); /* Scheduled at device creation to bind to a socket */ static void vxlan_sock_work(struct work_struct *work) @@ -1749,7 +1742,7 @@ static void vxlan_sock_work(struct work_struct *work) __be16 port = vxlan->dst_port; struct vxlan_sock *nvs; - nvs = vxlan_sock_add(net, port, vxlan_rcv); + nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false); spin_lock(&vn->sock_lock); if (!IS_ERR(nvs)) vxlan_vs_add_dev(nvs, vxlan); diff --git a/include/net/vxlan.h b/include/net/vxlan.h new file mode 100644 index 00000000000..43de27585cb --- /dev/null +++ b/include/net/vxlan.h @@ -0,0 +1,31 @@ +#ifndef __NET_VXLAN_H +#define __NET_VXLAN_H 1 + +#include +#include +#include + +#define VNI_HASH_BITS 10 +#define VNI_HASH_SIZE (1< Date: Mon, 19 Aug 2013 11:23:17 -0700 Subject: vxlan: Factor out vxlan send api. Following patch allows more code sharing between vxlan and ovs-vxlan. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 91 +++++++++++++++++++++++++++++++---------------------- include/net/vxlan.h | 8 +++++ 2 files changed, 62 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index aab927b92fb..f3496e99530 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1082,11 +1082,8 @@ static void vxlan_sock_put(struct sk_buff *skb) } /* On transmit, associate with the tunnel socket */ -static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) +static void vxlan_set_owner(struct sock *sk, struct sk_buff *skb) { - struct vxlan_dev *vxlan = netdev_priv(dev); - struct sock *sk = vxlan->vn_sock->sock->sk; - skb_orphan(skb); sock_hold(sk); skb->sk = sk; @@ -1098,9 +1095,9 @@ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) * better and maybe available from hardware * secondary choice is to use jhash on the Ethernet header */ -static __be16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb) +__be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb) { - unsigned int range = (vxlan->port_max - vxlan->port_min) + 1; + unsigned int range = (port_max - port_min) + 1; u32 hash; hash = skb_get_rxhash(skb); @@ -1108,8 +1105,9 @@ static __be16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb) hash = jhash(skb->data, 2 * ETH_ALEN, (__force u32) skb->protocol); - return htons((((u64) hash * range) >> 32) + vxlan->port_min); + return htons((((u64) hash * range) >> 32) + port_min); } +EXPORT_SYMBOL_GPL(vxlan_src_port); static int handle_offloads(struct sk_buff *skb) { @@ -1125,6 +1123,45 @@ static int handle_offloads(struct sk_buff *skb) return 0; } +int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs, + struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, + __be16 src_port, __be16 dst_port, __be32 vni) +{ + struct vxlanhdr *vxh; + struct udphdr *uh; + int err; + + if (!skb->encapsulation) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); + vxh->vx_flags = htonl(VXLAN_FLAGS); + vxh->vx_vni = vni; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + + uh->len = htons(skb->len); + uh->check = 0; + + vxlan_set_owner(vs->sock->sk, skb); + + err = handle_offloads(skb); + if (err) + return err; + + return iptunnel_xmit(net, rt, skb, src, dst, + IPPROTO_UDP, tos, ttl, df); +} +EXPORT_SYMBOL_GPL(vxlan_xmit_skb); + /* Bypass encapsulation if the destination is local */ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, struct vxlan_dev *dst_vxlan) @@ -1162,8 +1199,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_dev *vxlan = netdev_priv(dev); struct rtable *rt; const struct iphdr *old_iph; - struct vxlanhdr *vxh; - struct udphdr *uh; struct flowi4 fl4; __be32 dst; __be16 src_port, dst_port; @@ -1185,11 +1220,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto drop; } - if (!skb->encapsulation) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } - /* Need space for new headers (invalidates iph ptr) */ if (skb_cow_head(skb, VXLAN_HEADROOM)) goto drop; @@ -1204,7 +1234,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); - src_port = vxlan_src_port(vxlan, skb); + src_port = vxlan_src_port(vxlan->port_min, vxlan->port_max, skb); memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = rdst->remote_ifindex; @@ -1221,9 +1251,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (rt->dst.dev == dev) { netdev_dbg(dev, "circular route to %pI4\n", &dst); - ip_rt_put(rt); dev->stats.collisions++; - goto tx_error; + goto rt_tx_error; } /* Bypass encapsulation if the destination is local */ @@ -1238,30 +1267,16 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vxlan_encap_bypass(skb, vxlan, dst_vxlan); return; } - vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); - vxh->vx_flags = htonl(VXLAN_FLAGS); - vxh->vx_vni = htonl(vni << 8); - - __skb_push(skb, sizeof(*uh)); - skb_reset_transport_header(skb); - uh = udp_hdr(skb); - - uh->dest = dst_port; - uh->source = src_port; - - uh->len = htons(skb->len); - uh->check = 0; - - vxlan_set_owner(dev, skb); - - if (handle_offloads(skb)) - goto drop; tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, dst, - IPPROTO_UDP, tos, ttl, df); + err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb, + fl4.saddr, dst, tos, ttl, df, + src_port, dst_port, htonl(vni << 8)); + + if (err < 0) + goto rt_tx_error; iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return; @@ -1270,6 +1285,8 @@ drop: dev->stats.tx_dropped++; goto tx_free; +rt_tx_error: + ip_rt_put(rt); tx_error: dev->stats.tx_errors++; tx_free: diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 43de27585cb..ad342e3688a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -28,4 +28,12 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, bool no_share); void vxlan_sock_release(struct vxlan_sock *vs); + +int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs, + struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, + __be16 src_port, __be16 dst_port, __be32 vni); + +__be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb); + #endif -- cgit v1.2.3-70-g09d2 From 58264848a5a7b91195f43c4729072e8cc980288d Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 19 Aug 2013 11:23:34 -0700 Subject: openvswitch: Add vxlan tunneling support. Following patch adds vxlan vport type for openvswitch using vxlan api. So now there is vxlan dependency for openvswitch. CC: Jesse Gross Signed-off-by: Pravin B Shelar Acked-by: Jesse Gross Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 11 +++ net/openvswitch/Kconfig | 13 +++ net/openvswitch/Makefile | 4 + net/openvswitch/vport-vxlan.c | 204 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/vport.c | 3 + net/openvswitch/vport.h | 1 + 6 files changed, 236 insertions(+) create mode 100644 net/openvswitch/vport-vxlan.c (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index c55efaaa9bb..52490b0e62b 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -165,6 +165,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_NETDEV, /* network device */ OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ OVS_VPORT_TYPE_GRE, /* GRE tunnel. */ + OVS_VPORT_TYPE_VXLAN, /* VXLAN tunnel. */ __OVS_VPORT_TYPE_MAX }; @@ -211,6 +212,16 @@ enum ovs_vport_attr { #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) +/* OVS_VPORT_ATTR_OPTIONS attributes for tunnels. + */ +enum { + OVS_TUNNEL_ATTR_UNSPEC, + OVS_TUNNEL_ATTR_DST_PORT, /* 16-bit UDP port, used by L4 tunnels. */ + __OVS_TUNNEL_ATTR_MAX +}; + +#define OVS_TUNNEL_ATTR_MAX (__OVS_TUNNEL_ATTR_MAX - 1) + /* Flows. */ #define OVS_FLOW_FAMILY "ovs_flow" diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 27ee56b688a..bed30e69baa 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -40,3 +40,16 @@ config OPENVSWITCH_GRE Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config OPENVSWITCH_VXLAN + bool "Open vSwitch VXLAN tunneling support" + depends on INET + depends on OPENVSWITCH + depends on VXLAN && !(OPENVSWITCH=y && VXLAN=m) + default y + ---help--- + If you say Y here, then the Open vSwitch will be able create vxlan vport. + + Say N to exclude this support and reduce the binary size. + + If unsure, say Y. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 01bddb2991e..82e4ee54a44 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -13,3 +13,7 @@ openvswitch-y := \ vport-gre.o \ vport-internal_dev.o \ vport-netdev.o + +ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) +openvswitch-y += vport-vxlan.o +endif diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c new file mode 100644 index 00000000000..36848bd54a7 --- /dev/null +++ b/net/openvswitch/vport-vxlan.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * Copyright (c) 2013 Cisco Systems, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +/** + * struct vxlan_port - Keeps track of open UDP ports + * @vs: vxlan_sock created for the port. + * @name: vport name. + */ +struct vxlan_port { + struct vxlan_sock *vs; + char name[IFNAMSIZ]; +}; + +static inline struct vxlan_port *vxlan_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +/* Called with rcu_read_lock and BH disabled. */ +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) +{ + struct ovs_key_ipv4_tunnel tun_key; + struct vport *vport = vs->data; + struct iphdr *iph; + __be64 key; + + /* Save outer tunnel values */ + iph = ip_hdr(skb); + key = cpu_to_be64(ntohl(vx_vni) >> 8); + ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); + + ovs_vport_receive(vport, skb, &tun_key); +} + +static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) + return -EMSGSIZE; + return 0; +} + +static void vxlan_tnl_destroy(struct vport *vport) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + + vxlan_sock_release(vxlan_port->vs); + + ovs_vport_deferred_free(vport); +} + +static struct vport *vxlan_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct vxlan_port *vxlan_port; + struct vxlan_sock *vs; + struct vport *vport; + struct nlattr *a; + u16 dst_port; + int err; + + if (!options) { + err = -EINVAL; + goto error; + } + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct vxlan_port), + &ovs_vxlan_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + vxlan_port = vxlan_vport(vport); + strncpy(vxlan_port->name, parms->name, IFNAMSIZ); + + vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true); + if (IS_ERR(vs)) { + ovs_vport_free(vport); + return (void *)vs; + } + vxlan_port->vs = vs; + + return vport; + +error: + return ERR_PTR(err); +} + +static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct vxlan_port *vxlan_port = vxlan_vport(vport); + __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + struct rtable *rt; + struct flowi4 fl; + __be16 src_port; + int port_min; + int port_max; + __be16 df; + int err; + + if (unlikely(!OVS_CB(skb)->tun_key)) { + err = -EINVAL; + goto error; + } + + /* Route lookup */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; + fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + htons(IP_DF) : 0; + + skb->local_df = 1; + + inet_get_local_port_range(&port_min, &port_max); + src_port = vxlan_src_port(port_min, port_max, skb); + + err = vxlan_xmit_skb(net, vxlan_port->vs, rt, skb, + fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, + OVS_CB(skb)->tun_key->ipv4_tos, + OVS_CB(skb)->tun_key->ipv4_ttl, df, + src_port, dst_port, + htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8)); + if (err < 0) + ip_rt_put(rt); +error: + return err; +} + +static const char *vxlan_get_name(const struct vport *vport) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + return vxlan_port->name; +} + +const struct vport_ops ovs_vxlan_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_tnl_create, + .destroy = vxlan_tnl_destroy, + .get_name = vxlan_get_name, + .get_options = vxlan_get_options, + .send = vxlan_tnl_send, +}; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index d4c7fa04ce0..d69e0c06dfd 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -42,6 +42,9 @@ static const struct vport_ops *vport_ops_list[] = { #ifdef CONFIG_OPENVSWITCH_GRE &ovs_gre_vport_ops, #endif +#ifdef CONFIG_OPENVSWITCH_VXLAN + &ovs_vxlan_vport_ops, +#endif }; /* Protected by RCU read lock for reading, ovs_mutex for writing. */ diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 376045c42f8..1a9fbcec6e1 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -199,6 +199,7 @@ void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); extern const struct vport_ops ovs_netdev_vport_ops; extern const struct vport_ops ovs_internal_vport_ops; extern const struct vport_ops ovs_gre_vport_ops; +extern const struct vport_ops ovs_vxlan_vport_ops; static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) -- cgit v1.2.3-70-g09d2 From 8d3214c4e8c8be6efd8ec7a172239ebbd4deb04b Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sun, 18 Aug 2013 03:13:26 +0400 Subject: sh_eth: remove 'register_type' field from 'struct sh_eth_plat_data' Now that the 'register_type' field of the 'sh_eth' driver's platform data is not used by the driver anymore, it's time to remove it and its initializers from the SH platform code. Also move *enum* declaring values for this field from to the local driver's header file as they're only needed by the driver itself now... Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- arch/arm/mach-shmobile/board-armadillo800eva.c | 1 - arch/arm/mach-shmobile/board-bockw.c | 1 - arch/sh/boards/board-espt.c | 1 - arch/sh/boards/board-sh7757lcr.c | 4 ---- arch/sh/boards/mach-ecovec24/setup.c | 1 - arch/sh/boards/mach-se/7724/setup.c | 1 - arch/sh/boards/mach-sh7763rdp/setup.c | 1 - arch/sh/kernel/cpu/sh2/setup-sh7619.c | 1 - drivers/net/ethernet/renesas/sh_eth.h | 7 +++++++ include/linux/sh_eth.h | 7 ------- 10 files changed, 7 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-shmobile/board-armadillo800eva.c b/arch/arm/mach-shmobile/board-armadillo800eva.c index c5be60d85e4..3a6ffa250fb 100644 --- a/arch/arm/mach-shmobile/board-armadillo800eva.c +++ b/arch/arm/mach-shmobile/board-armadillo800eva.c @@ -358,7 +358,6 @@ static struct platform_device usbhsf_device = { static struct sh_eth_plat_data sh_eth_platdata = { .phy = 0x00, /* LAN8710A */ .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_GIGABIT, .phy_interface = PHY_INTERFACE_MODE_MII, }; diff --git a/arch/arm/mach-shmobile/board-bockw.c b/arch/arm/mach-shmobile/board-bockw.c index 3354a85c90f..fa8885b2d5a 100644 --- a/arch/arm/mach-shmobile/board-bockw.c +++ b/arch/arm/mach-shmobile/board-bockw.c @@ -89,7 +89,6 @@ static struct sh_mobile_sdhi_info sdhi0_info = { static struct sh_eth_plat_data ether_platform_data __initdata = { .phy = 0x01, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_FAST_RCAR, .phy_interface = PHY_INTERFACE_MODE_RMII, /* * Although the LINK signal is available on the board, it's connected to diff --git a/arch/sh/boards/board-espt.c b/arch/sh/boards/board-espt.c index 4d94dff9015..7291e2f11a4 100644 --- a/arch/sh/boards/board-espt.c +++ b/arch/sh/boards/board-espt.c @@ -80,7 +80,6 @@ static struct resource sh_eth_resources[] = { static struct sh_eth_plat_data sh7763_eth_pdata = { .phy = 0, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_GIGABIT, .phy_interface = PHY_INTERFACE_MODE_MII, }; diff --git a/arch/sh/boards/board-sh7757lcr.c b/arch/sh/boards/board-sh7757lcr.c index 4f114d1cd01..25c5a932f9f 100644 --- a/arch/sh/boards/board-sh7757lcr.c +++ b/arch/sh/boards/board-sh7757lcr.c @@ -77,7 +77,6 @@ static struct resource sh_eth0_resources[] = { static struct sh_eth_plat_data sh7757_eth0_pdata = { .phy = 1, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_FAST_SH4, .set_mdio_gate = sh7757_eth_set_mdio_gate, }; @@ -106,7 +105,6 @@ static struct resource sh_eth1_resources[] = { static struct sh_eth_plat_data sh7757_eth1_pdata = { .phy = 1, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_FAST_SH4, .set_mdio_gate = sh7757_eth_set_mdio_gate, }; @@ -151,7 +149,6 @@ static struct resource sh_eth_giga0_resources[] = { static struct sh_eth_plat_data sh7757_eth_giga0_pdata = { .phy = 18, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_GIGABIT, .set_mdio_gate = sh7757_eth_giga_set_mdio_gate, .phy_interface = PHY_INTERFACE_MODE_RGMII_ID, }; @@ -186,7 +183,6 @@ static struct resource sh_eth_giga1_resources[] = { static struct sh_eth_plat_data sh7757_eth_giga1_pdata = { .phy = 19, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_GIGABIT, .set_mdio_gate = sh7757_eth_giga_set_mdio_gate, .phy_interface = PHY_INTERFACE_MODE_RGMII_ID, }; diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index 61fade0ffa9..a4f630f04ea 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -159,7 +159,6 @@ static struct resource sh_eth_resources[] = { static struct sh_eth_plat_data sh_eth_plat = { .phy = 0x1f, /* SMSC LAN8700 */ .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_FAST_SH4, .phy_interface = PHY_INTERFACE_MODE_MII, .ether_link_active_low = 1 }; diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c index aed4ca9ee1f..e96e053f260 100644 --- a/arch/sh/boards/mach-se/7724/setup.c +++ b/arch/sh/boards/mach-se/7724/setup.c @@ -377,7 +377,6 @@ static struct resource sh_eth_resources[] = { static struct sh_eth_plat_data sh_eth_plat = { .phy = 0x1f, /* SMSC LAN8187 */ .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_FAST_SH4, .phy_interace = PHY_INTERFACE_MODE_MII, }; diff --git a/arch/sh/boards/mach-sh7763rdp/setup.c b/arch/sh/boards/mach-sh7763rdp/setup.c index 50ba481fa24..2c8fb04685d 100644 --- a/arch/sh/boards/mach-sh7763rdp/setup.c +++ b/arch/sh/boards/mach-sh7763rdp/setup.c @@ -88,7 +88,6 @@ static struct resource sh_eth_resources[] = { static struct sh_eth_plat_data sh7763_eth_pdata = { .phy = 1, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_GIGABIT, .phy_interface = PHY_INTERFACE_MODE_MII, }; diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c index bbadd482033..19472817e27 100644 --- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c +++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c @@ -114,7 +114,6 @@ static struct platform_device scif2_device = { static struct sh_eth_plat_data eth_platform_data = { .phy = 1, .edmac_endian = EDMAC_LITTLE_ENDIAN, - .register_type = SH_ETH_REG_FAST_SH3_SH2, .phy_interace = PHY_INTERFACE_MODE_MII, }; diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h index 1e7852810c8..a0db02c63b1 100644 --- a/drivers/net/ethernet/renesas/sh_eth.h +++ b/drivers/net/ethernet/renesas/sh_eth.h @@ -157,6 +157,13 @@ enum { SH_ETH_MAX_REGISTER_OFFSET, }; +enum { + SH_ETH_REG_GIGABIT, + SH_ETH_REG_FAST_RCAR, + SH_ETH_REG_FAST_SH4, + SH_ETH_REG_FAST_SH3_SH2 +}; + /* Driver's parameters */ #if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARCH_SHMOBILE) #define SH4_SKB_RX_ALIGN 32 diff --git a/include/linux/sh_eth.h b/include/linux/sh_eth.h index 6205eeba392..90b5e30c2f2 100644 --- a/include/linux/sh_eth.h +++ b/include/linux/sh_eth.h @@ -5,17 +5,10 @@ #include enum {EDMAC_LITTLE_ENDIAN, EDMAC_BIG_ENDIAN}; -enum { - SH_ETH_REG_GIGABIT, - SH_ETH_REG_FAST_RCAR, - SH_ETH_REG_FAST_SH4, - SH_ETH_REG_FAST_SH3_SH2 -}; struct sh_eth_plat_data { int phy; int edmac_endian; - int register_type; phy_interface_t phy_interface; void (*set_mdio_gate)(void *addr); -- cgit v1.2.3-70-g09d2 From dbe34724c08ea25d39d31735120077013fbbb6fb Mon Sep 17 00:00:00 2001 From: Mugunthan V N Date: Mon, 19 Aug 2013 17:47:40 +0530 Subject: drivers: net: cpsw: remove platform data header file of cpsw CPSW driver no longer supports platform register as all the SoCs which has CPSW are supporting DT only booting, so moving cpsw.h header file from platform include to drivers/net/ethernet/ti Signed-off-by: Mugunthan V N Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 2 +- drivers/net/ethernet/ti/cpsw.h | 42 ++++++++++++++++++++++++++++++++++++ include/linux/platform_data/cpsw.h | 44 -------------------------------------- 3 files changed, 43 insertions(+), 45 deletions(-) create mode 100644 drivers/net/ethernet/ti/cpsw.h delete mode 100644 include/linux/platform_data/cpsw.h (limited to 'include') diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 0fcf21254ad..b4a85f5531e 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -34,9 +34,9 @@ #include #include -#include #include +#include "cpsw.h" #include "cpsw_ale.h" #include "cpts.h" #include "davinci_cpdma.h" diff --git a/drivers/net/ethernet/ti/cpsw.h b/drivers/net/ethernet/ti/cpsw.h new file mode 100644 index 00000000000..eb3e101ec04 --- /dev/null +++ b/drivers/net/ethernet/ti/cpsw.h @@ -0,0 +1,42 @@ +/* Texas Instruments Ethernet Switch Driver + * + * Copyright (C) 2013 Texas Instruments + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#ifndef __CPSW_H__ +#define __CPSW_H__ + +#include + +struct cpsw_slave_data { + char phy_id[MII_BUS_ID_SIZE]; + int phy_if; + u8 mac_addr[ETH_ALEN]; + u16 dual_emac_res_vlan; /* Reserved VLAN for DualEMAC */ +}; + +struct cpsw_platform_data { + struct cpsw_slave_data *slave_data; + u32 ss_reg_ofs; /* Subsystem control register offset */ + u32 channels; /* number of cpdma channels (symmetric) */ + u32 slaves; /* number of slave cpgmac ports */ + u32 active_slave; /* time stamping, ethtool and SIOCGMIIPHY slave */ + u32 cpts_clock_mult; /* convert input clock ticks to nanoseconds */ + u32 cpts_clock_shift; /* convert input clock ticks to nanoseconds */ + u32 ale_entries; /* ale table size */ + u32 bd_ram_size; /*buffer descriptor ram size */ + u32 rx_descs; /* Number of Rx Descriptios */ + u32 mac_control; /* Mac control register */ + u16 default_vlan; /* Def VLAN for ALE lookup in VLAN aware mode*/ + bool dual_emac; /* Enable Dual EMAC mode */ +}; + +#endif /* __CPSW_H__ */ diff --git a/include/linux/platform_data/cpsw.h b/include/linux/platform_data/cpsw.h deleted file mode 100644 index bb3cd58d71e..00000000000 --- a/include/linux/platform_data/cpsw.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Texas Instruments Ethernet Switch Driver - * - * Copyright (C) 2012 Texas Instruments - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation version 2. - * - * This program is distributed "as is" WITHOUT ANY WARRANTY of any - * kind, whether express or implied; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ -#ifndef __CPSW_H__ -#define __CPSW_H__ - -#include - -struct cpsw_slave_data { - char phy_id[MII_BUS_ID_SIZE]; - int phy_if; - u8 mac_addr[ETH_ALEN]; - u16 dual_emac_res_vlan; /* Reserved VLAN for DualEMAC */ - -}; - -struct cpsw_platform_data { - u32 ss_reg_ofs; /* Subsystem control register offset */ - u32 channels; /* number of cpdma channels (symmetric) */ - u32 slaves; /* number of slave cpgmac ports */ - struct cpsw_slave_data *slave_data; - u32 active_slave; /* time stamping, ethtool and SIOCGMIIPHY slave */ - u32 cpts_clock_mult; /* convert input clock ticks to nanoseconds */ - u32 cpts_clock_shift; /* convert input clock ticks to nanoseconds */ - u32 ale_entries; /* ale table size */ - u32 bd_ram_size; /*buffer descriptor ram size */ - u32 rx_descs; /* Number of Rx Descriptios */ - u32 mac_control; /* Mac control register */ - u16 default_vlan; /* Def VLAN for ALE lookup in VLAN aware mode*/ - bool dual_emac; /* Enable Dual EMAC mode */ -}; - -#endif /* __CPSW_H__ */ -- cgit v1.2.3-70-g09d2 From 1ddff7da0faecffdcdeab3d981fb8241453cea44 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Mon, 19 Aug 2013 15:39:19 +0400 Subject: can: mcp251x: Replace power callbacks with regulator API This patch replaces power callbacks to the regulator API. To improve the readability of the code, helper for the regulator enable/disable was added. Acked-by: Haojian Zhuang Signed-off-by: Alexander Shiyan Signed-off-by: Marc Kleine-Budde --- arch/arm/mach-pxa/icontrol.c | 3 -- arch/arm/mach-pxa/zeus.c | 46 +++++++++++--------- drivers/net/can/mcp251x.c | 83 +++++++++++++++++++----------------- include/linux/can/platform/mcp251x.h | 13 +----- 4 files changed, 69 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-pxa/icontrol.c b/arch/arm/mach-pxa/icontrol.c index fe31bfcbb8d..c98511c5abd 100644 --- a/arch/arm/mach-pxa/icontrol.c +++ b/arch/arm/mach-pxa/icontrol.c @@ -73,9 +73,6 @@ static struct pxa2xx_spi_chip mcp251x_chip_info4 = { static struct mcp251x_platform_data mcp251x_info = { .oscillator_frequency = 16E6, - .board_specific_setup = NULL, - .power_enable = NULL, - .transceiver_enable = NULL }; static struct spi_board_info mcp251x_board_info[] = { diff --git a/arch/arm/mach-pxa/zeus.c b/arch/arm/mach-pxa/zeus.c index f5d43643456..04a0aea2387 100644 --- a/arch/arm/mach-pxa/zeus.c +++ b/arch/arm/mach-pxa/zeus.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include @@ -391,33 +393,34 @@ static struct pxa2xx_spi_master pxa2xx_spi_ssp3_master_info = { }; /* CAN bus on SPI */ -static int zeus_mcp2515_setup(struct spi_device *sdev) -{ - int err; - - err = gpio_request(ZEUS_CAN_SHDN_GPIO, "CAN shutdown"); - if (err) - return err; +static struct regulator_consumer_supply can_regulator_consumer = + REGULATOR_SUPPLY("vdd", "spi3.0"); - err = gpio_direction_output(ZEUS_CAN_SHDN_GPIO, 1); - if (err) { - gpio_free(ZEUS_CAN_SHDN_GPIO); - return err; - } +static struct regulator_init_data can_regulator_init_data = { + .constraints = { + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, + .consumer_supplies = &can_regulator_consumer, + .num_consumer_supplies = 1, +}; - return 0; -} +static struct fixed_voltage_config can_regulator_pdata = { + .supply_name = "CAN_SHDN", + .microvolts = 3300000, + .gpio = ZEUS_CAN_SHDN_GPIO, + .init_data = &can_regulator_init_data, +}; -static int zeus_mcp2515_transceiver_enable(int enable) -{ - gpio_set_value(ZEUS_CAN_SHDN_GPIO, !enable); - return 0; -} +static struct platform_device can_regulator_device = { + .name = "reg-fixed-volage", + .id = -1, + .dev = { + .platform_data = &can_regulator_pdata, + }, +}; static struct mcp251x_platform_data zeus_mcp2515_pdata = { .oscillator_frequency = 16*1000*1000, - .board_specific_setup = zeus_mcp2515_setup, - .power_enable = zeus_mcp2515_transceiver_enable, }; static struct spi_board_info zeus_spi_board_info[] = { @@ -516,6 +519,7 @@ static struct platform_device *zeus_devices[] __initdata = { &zeus_leds_device, &zeus_pcmcia_device, &zeus_max6369_device, + &can_regulator_device, }; /* AC'97 */ diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c index 8cda23bf061..d2c54888b21 100644 --- a/drivers/net/can/mcp251x.c +++ b/drivers/net/can/mcp251x.c @@ -37,9 +37,6 @@ * * static struct mcp251x_platform_data mcp251x_info = { * .oscillator_frequency = 8000000, - * .board_specific_setup = &mcp251x_setup, - * .power_enable = mcp251x_power_enable, - * .transceiver_enable = NULL, * }; * * static struct spi_board_info spi_board_info[] = { @@ -76,6 +73,7 @@ #include #include #include +#include /* SPI interface instruction set */ #define INSTRUCTION_WRITE 0x02 @@ -264,6 +262,8 @@ struct mcp251x_priv { #define AFTER_SUSPEND_POWER 4 #define AFTER_SUSPEND_RESTART 8 int restart_tx; + struct regulator *power; + struct regulator *transceiver; }; #define MCP251X_IS(_model) \ @@ -667,16 +667,25 @@ static int mcp251x_hw_probe(struct spi_device *spi) return (st1 == 0x80 && st2 == 0x07) ? 1 : 0; } +static int mcp251x_power_enable(struct regulator *reg, int enable) +{ + if (IS_ERR(reg)) + return 0; + + if (enable) + return regulator_enable(reg); + else + return regulator_disable(reg); +} + static void mcp251x_open_clean(struct net_device *net) { struct mcp251x_priv *priv = netdev_priv(net); struct spi_device *spi = priv->spi; - struct mcp251x_platform_data *pdata = spi->dev.platform_data; free_irq(spi->irq, priv); mcp251x_hw_sleep(spi); - if (pdata->transceiver_enable) - pdata->transceiver_enable(0); + mcp251x_power_enable(priv->transceiver, 0); close_candev(net); } @@ -684,7 +693,6 @@ static int mcp251x_stop(struct net_device *net) { struct mcp251x_priv *priv = netdev_priv(net); struct spi_device *spi = priv->spi; - struct mcp251x_platform_data *pdata = spi->dev.platform_data; close_candev(net); @@ -704,8 +712,7 @@ static int mcp251x_stop(struct net_device *net) mcp251x_hw_sleep(spi); - if (pdata->transceiver_enable) - pdata->transceiver_enable(0); + mcp251x_power_enable(priv->transceiver, 0); priv->can.state = CAN_STATE_STOPPED; @@ -939,8 +946,7 @@ static int mcp251x_open(struct net_device *net) } mutex_lock(&priv->mcp_lock); - if (pdata->transceiver_enable) - pdata->transceiver_enable(1); + mcp251x_power_enable(priv->transceiver, 1); priv->force_quit = 0; priv->tx_skb = NULL; @@ -956,8 +962,7 @@ static int mcp251x_open(struct net_device *net) flags, DEVICE_NAME, priv); if (ret) { dev_err(&spi->dev, "failed to acquire irq %d\n", spi->irq); - if (pdata->transceiver_enable) - pdata->transceiver_enable(0); + mcp251x_power_enable(priv->transceiver, 0); close_candev(net); goto open_unlock; } @@ -1026,6 +1031,19 @@ static int mcp251x_can_probe(struct spi_device *spi) CAN_CTRLMODE_LOOPBACK | CAN_CTRLMODE_LISTENONLY; priv->model = spi_get_device_id(spi)->driver_data; priv->net = net; + + priv->power = devm_regulator_get(&spi->dev, "vdd"); + priv->transceiver = devm_regulator_get(&spi->dev, "xceiver"); + if ((PTR_ERR(priv->power) == -EPROBE_DEFER) || + (PTR_ERR(priv->transceiver) == -EPROBE_DEFER)) { + ret = -EPROBE_DEFER; + goto error_power; + } + + ret = mcp251x_power_enable(priv->power, 1); + if (ret) + goto error_power; + spi_set_drvdata(spi, priv); priv->spi = spi; @@ -1068,13 +1086,6 @@ static int mcp251x_can_probe(struct spi_device *spi) } } - if (pdata->power_enable) - pdata->power_enable(1); - - /* Call out to platform specific setup */ - if (pdata->board_specific_setup) - pdata->board_specific_setup(spi); - SET_NETDEV_DEV(net, &spi->dev); /* Configure the SPI bus */ @@ -1084,14 +1095,11 @@ static int mcp251x_can_probe(struct spi_device *spi) /* Here is OK to not lock the MCP, no one knows about it yet */ if (!mcp251x_hw_probe(spi)) { - dev_info(&spi->dev, "Probe failed\n"); + ret = -ENODEV; goto error_probe; } mcp251x_hw_sleep(spi); - if (pdata->transceiver_enable) - pdata->transceiver_enable(0); - ret = register_candev(net); if (ret) goto error_probe; @@ -1109,13 +1117,13 @@ error_rx_buf: if (!mcp251x_enable_dma) kfree(priv->spi_tx_buf); error_tx_buf: - free_candev(net); if (mcp251x_enable_dma) dma_free_coherent(&spi->dev, PAGE_SIZE, priv->spi_tx_buf, priv->spi_tx_dma); + mcp251x_power_enable(priv->power, 0); +error_power: + free_candev(net); error_alloc: - if (pdata->power_enable) - pdata->power_enable(0); dev_err(&spi->dev, "probe failed\n"); error_out: return ret; @@ -1123,12 +1131,10 @@ error_out: static int mcp251x_can_remove(struct spi_device *spi) { - struct mcp251x_platform_data *pdata = spi->dev.platform_data; struct mcp251x_priv *priv = spi_get_drvdata(spi); struct net_device *net = priv->net; unregister_candev(net); - free_candev(net); if (mcp251x_enable_dma) { dma_free_coherent(&spi->dev, PAGE_SIZE, @@ -1138,8 +1144,9 @@ static int mcp251x_can_remove(struct spi_device *spi) kfree(priv->spi_rx_buf); } - if (pdata->power_enable) - pdata->power_enable(0); + mcp251x_power_enable(priv->power, 0); + + free_candev(net); return 0; } @@ -1149,7 +1156,6 @@ static int mcp251x_can_remove(struct spi_device *spi) static int mcp251x_can_suspend(struct device *dev) { struct spi_device *spi = to_spi_device(dev); - struct mcp251x_platform_data *pdata = spi->dev.platform_data; struct mcp251x_priv *priv = spi_get_drvdata(spi); struct net_device *net = priv->net; @@ -1163,15 +1169,14 @@ static int mcp251x_can_suspend(struct device *dev) netif_device_detach(net); mcp251x_hw_sleep(spi); - if (pdata->transceiver_enable) - pdata->transceiver_enable(0); + mcp251x_power_enable(priv->transceiver, 0); priv->after_suspend = AFTER_SUSPEND_UP; } else { priv->after_suspend = AFTER_SUSPEND_DOWN; } - if (pdata->power_enable) { - pdata->power_enable(0); + if (!IS_ERR(priv->power)) { + regulator_disable(priv->power); priv->after_suspend |= AFTER_SUSPEND_POWER; } @@ -1181,16 +1186,14 @@ static int mcp251x_can_suspend(struct device *dev) static int mcp251x_can_resume(struct device *dev) { struct spi_device *spi = to_spi_device(dev); - struct mcp251x_platform_data *pdata = spi->dev.platform_data; struct mcp251x_priv *priv = spi_get_drvdata(spi); if (priv->after_suspend & AFTER_SUSPEND_POWER) { - pdata->power_enable(1); + mcp251x_power_enable(priv->power, 1); queue_work(priv->wq, &priv->restart_work); } else { if (priv->after_suspend & AFTER_SUSPEND_UP) { - if (pdata->transceiver_enable) - pdata->transceiver_enable(1); + mcp251x_power_enable(priv->transceiver, 1); queue_work(priv->wq, &priv->restart_work); } else { priv->after_suspend = 0; diff --git a/include/linux/can/platform/mcp251x.h b/include/linux/can/platform/mcp251x.h index 089fe43211a..8a2725676d8 100644 --- a/include/linux/can/platform/mcp251x.h +++ b/include/linux/can/platform/mcp251x.h @@ -9,26 +9,15 @@ #include -/** +/* * struct mcp251x_platform_data - MCP251X SPI CAN controller platform data * @oscillator_frequency: - oscillator frequency in Hz * @irq_flags: - IRQF configuration flags - * @board_specific_setup: - called before probing the chip (power,reset) - * @transceiver_enable: - called to power on/off the transceiver - * @power_enable: - called to power on/off the mcp *and* the - * transceiver - * - * Please note that you should define power_enable or transceiver_enable or - * none of them. Defining both of them is no use. - * */ struct mcp251x_platform_data { unsigned long oscillator_frequency; unsigned long irq_flags; - int (*board_specific_setup)(struct spi_device *spi); - int (*transceiver_enable)(int enable); - int (*power_enable) (int enable); }; #endif /* __CAN_PLATFORM_MCP251X_H__ */ -- cgit v1.2.3-70-g09d2 From ae5d589e5f9f3217656ada632869968178886ac6 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Mon, 19 Aug 2013 15:39:20 +0400 Subject: can: mcp251x: Eliminate irq_flags from driver platform_data Flags is not used by boards, so remove this field from the driver platform_data. Signed-off-by: Alexander Shiyan Signed-off-by: Marc Kleine-Budde --- drivers/net/can/mcp251x.c | 9 +-------- include/linux/can/platform/mcp251x.h | 2 -- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c index d2c54888b21..ec84ce16360 100644 --- a/drivers/net/can/mcp251x.c +++ b/drivers/net/can/mcp251x.c @@ -935,8 +935,7 @@ static int mcp251x_open(struct net_device *net) { struct mcp251x_priv *priv = netdev_priv(net); struct spi_device *spi = priv->spi; - struct mcp251x_platform_data *pdata = spi->dev.platform_data; - unsigned long flags; + unsigned long flags = IRQF_ONESHOT | IRQF_TRIGGER_FALLING; int ret; ret = open_candev(net); @@ -952,12 +951,6 @@ static int mcp251x_open(struct net_device *net) priv->tx_skb = NULL; priv->tx_len = 0; - flags = IRQF_ONESHOT; - if (pdata->irq_flags) - flags |= pdata->irq_flags; - else - flags |= IRQF_TRIGGER_FALLING; - ret = request_threaded_irq(spi->irq, NULL, mcp251x_can_ist, flags, DEVICE_NAME, priv); if (ret) { diff --git a/include/linux/can/platform/mcp251x.h b/include/linux/can/platform/mcp251x.h index 8a2725676d8..dc029dba7a0 100644 --- a/include/linux/can/platform/mcp251x.h +++ b/include/linux/can/platform/mcp251x.h @@ -12,12 +12,10 @@ /* * struct mcp251x_platform_data - MCP251X SPI CAN controller platform data * @oscillator_frequency: - oscillator frequency in Hz - * @irq_flags: - IRQF configuration flags */ struct mcp251x_platform_data { unsigned long oscillator_frequency; - unsigned long irq_flags; }; #endif /* __CAN_PLATFORM_MCP251X_H__ */ -- cgit v1.2.3-70-g09d2 From e660ed6c70370dae0887e2b5224d8b6c1e138120 Mon Sep 17 00:00:00 2001 From: Frédéric Dalleau Date: Mon, 19 Aug 2013 14:23:54 +0200 Subject: Bluetooth: Use hci_connect_sco directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hci_connect is a super function for connecting hci protocols. But the voice_setting parameter (introduced in subsequent patches) is only needed by SCO and security requirements are not needed for SCO channels. Thus, it makes sense to have a separate function for SCO. Signed-off-by: Frédéric Dalleau Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg Signed-off-by: Gustavo Padovan --- include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_conn.c | 8 ++------ net/bluetooth/sco.c | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index f77885ea78c..307a1926902 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -584,6 +584,8 @@ struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle); struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 dst_type, __u8 sec_level, __u8 auth_type); +struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, + bdaddr_t *dst); int hci_conn_check_link_mode(struct hci_conn *conn); int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level); int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 6c7f3637972..5f1f448dd0f 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -560,13 +560,12 @@ static struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, return acl; } -static struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, - bdaddr_t *dst, u8 sec_level, u8 auth_type) +struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst) { struct hci_conn *acl; struct hci_conn *sco; - acl = hci_connect_acl(hdev, dst, sec_level, auth_type); + acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING); if (IS_ERR(acl)) return acl; @@ -612,9 +611,6 @@ struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, return hci_connect_le(hdev, dst, dst_type, sec_level, auth_type); case ACL_LINK: return hci_connect_acl(hdev, dst, sec_level, auth_type); - case SCO_LINK: - case ESCO_LINK: - return hci_connect_sco(hdev, type, dst, sec_level, auth_type); } return ERR_PTR(-EINVAL); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 2de7150a630..ab2502cbecf 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -176,8 +176,7 @@ static int sco_connect(struct sock *sk) else type = SCO_LINK; - hcon = hci_connect(hdev, type, dst, BDADDR_BREDR, BT_SECURITY_LOW, - HCI_AT_NO_BONDING); + hcon = hci_connect_sco(hdev, type, dst); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto done; -- cgit v1.2.3-70-g09d2 From ad10b1a48754b1381582d96f070a39832e41382d Mon Sep 17 00:00:00 2001 From: Frédéric Dalleau Date: Mon, 19 Aug 2013 14:23:56 +0200 Subject: Bluetooth: Add Bluetooth socket voice option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch extends the current Bluetooth socket options with BT_VOICE. This is intended to choose voice data type at runtime. It only applies to SCO sockets. Incoming connections shall be setup during deferred setup. Outgoing connections shall be setup before connect(). The desired setting is stored in the SCO socket info. This patch declares needed members, modifies getsockopt() and setsockopt(). Signed-off-by: Frédéric Dalleau Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg Signed-off-by: Gustavo Padovan --- include/net/bluetooth/bluetooth.h | 8 ++++++++ include/net/bluetooth/sco.h | 1 + net/bluetooth/sco.c | 40 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 48 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 10eb9b38901..10d43d8c703 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -107,6 +107,14 @@ struct bt_power { */ #define BT_CHANNEL_POLICY_AMP_PREFERRED 2 +#define BT_VOICE 11 +struct bt_voice { + __u16 setting; +}; + +#define BT_VOICE_TRANSPARENT 0x0003 +#define BT_VOICE_CVSD_16BIT 0x0060 + __printf(1, 2) int bt_info(const char *fmt, ...); __printf(1, 2) diff --git a/include/net/bluetooth/sco.h b/include/net/bluetooth/sco.h index 1e35c43657c..e252a31ee6b 100644 --- a/include/net/bluetooth/sco.h +++ b/include/net/bluetooth/sco.h @@ -73,6 +73,7 @@ struct sco_conn { struct sco_pinfo { struct bt_sock bt; __u32 flags; + __u16 setting; struct sco_conn *conn; }; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index acdca68806d..678747e2e38 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -416,6 +416,8 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int pro sk->sk_protocol = proto; sk->sk_state = BT_OPEN; + sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; + setup_timer(&sk->sk_timer, sco_sock_timeout, (unsigned long)sk); bt_sock_link(&sco_sk_list, sk); @@ -709,7 +711,8 @@ static int sco_sock_recvmsg(struct kiocb *iocb, struct socket *sock, static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; - int err = 0; + int len, err = 0; + struct bt_voice voice; u32 opt; BT_DBG("sk %p", sk); @@ -735,6 +738,31 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); break; + case BT_VOICE: + if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && + sk->sk_state != BT_CONNECT2) { + err = -EINVAL; + break; + } + + voice.setting = sco_pi(sk)->setting; + + len = min_t(unsigned int, sizeof(voice), optlen); + if (copy_from_user((char *) &voice, optval, len)) { + err = -EFAULT; + break; + } + + /* Explicitly check for these values */ + if (voice.setting != BT_VOICE_TRANSPARENT && + voice.setting != BT_VOICE_CVSD_16BIT) { + err = -EINVAL; + break; + } + + sco_pi(sk)->setting = voice.setting; + break; + default: err = -ENOPROTOOPT; break; @@ -808,6 +836,7 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char { struct sock *sk = sock->sk; int len, err = 0; + struct bt_voice voice; BT_DBG("sk %p", sk); @@ -833,6 +862,15 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char break; + case BT_VOICE: + voice.setting = sco_pi(sk)->setting; + + len = min_t(unsigned int, len, sizeof(voice)); + if (copy_to_user(optval, (char *)&voice, len)) + err = -EFAULT; + + break; + default: err = -ENOPROTOOPT; break; -- cgit v1.2.3-70-g09d2 From 5d4d62f6ca04c54ed6c84df6adf5427c52feda3e Mon Sep 17 00:00:00 2001 From: Frédéric Dalleau Date: Mon, 19 Aug 2013 14:23:57 +0200 Subject: Bluetooth: Add constants for SCO airmode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch defines constants for SCO airmode from SCO voice setting. It refers to Bluetooth Core V4.0 specification, Part E, Chap 6.12 which describe SCO voice setting format. Signed-off-by: Frédéric Dalleau Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg Signed-off-by: Gustavo Padovan --- include/net/bluetooth/hci_core.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 307a1926902..f403509b4d1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1215,4 +1215,8 @@ void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __u8 rand[8], u8 bdaddr_to_le(u8 bdaddr_type); +#define SCO_AIRMODE_MASK 0x0003 +#define SCO_AIRMODE_CVSD 0x0000 +#define SCO_AIRMODE_TRANSP 0x0003 + #endif /* __HCI_CORE_H */ -- cgit v1.2.3-70-g09d2 From 10c62ddc6f032c3a096401ca3ce7e5b2d5780859 Mon Sep 17 00:00:00 2001 From: Frédéric Dalleau Date: Mon, 19 Aug 2013 14:23:59 +0200 Subject: Bluetooth: Parameters for outgoing SCO connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to establish a transparent SCO connection, the correct settings must be specified in the Setup Synchronous Connection request. For that, a setting field is added to ACL connection data to set up the desired parameters. The patch also removes usage of hdev->voice_setting in CVSD connection and makes use of T2 parameters for transparent data. Signed-off-by: Frédéric Dalleau Signed-off-by: Johan Hedberg Acked-by: Marcel Holtmann Signed-off-by: Gustavo Padovan --- include/net/bluetooth/hci_core.h | 5 +++-- net/bluetooth/hci_conn.c | 24 +++++++++++++++++++----- net/bluetooth/sco.c | 2 +- 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index f403509b4d1..61ca2ce5b06 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -320,6 +320,7 @@ struct hci_conn { __u32 passkey_notify; __u8 passkey_entered; __u16 disc_timeout; + __u16 setting; unsigned long flags; __u8 remote_cap; @@ -584,8 +585,8 @@ struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle); struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 dst_type, __u8 sec_level, __u8 auth_type); -struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, - bdaddr_t *dst); +struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, + __u16 setting); int hci_conn_check_link_mode(struct hci_conn *conn); int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level); int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 5f1f448dd0f..c0e56a512ff 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -185,13 +185,24 @@ void hci_setup_sync(struct hci_conn *conn, __u16 handle) conn->attempt++; cp.handle = cpu_to_le16(handle); - cp.pkt_type = cpu_to_le16(conn->pkt_type); cp.tx_bandwidth = __constant_cpu_to_le32(0x00001f40); cp.rx_bandwidth = __constant_cpu_to_le32(0x00001f40); - cp.max_latency = __constant_cpu_to_le16(0xffff); - cp.voice_setting = cpu_to_le16(hdev->voice_setting); - cp.retrans_effort = 0xff; + cp.voice_setting = cpu_to_le16(conn->setting); + + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_TRANSP: + cp.pkt_type = __constant_cpu_to_le16(EDR_ESCO_MASK & + ~ESCO_2EV3); + cp.max_latency = __constant_cpu_to_le16(0x000d); + cp.retrans_effort = 0x02; + break; + case SCO_AIRMODE_CVSD: + cp.pkt_type = cpu_to_le16(conn->pkt_type); + cp.max_latency = __constant_cpu_to_le16(0xffff); + cp.retrans_effort = 0xff; + break; + } hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp); } @@ -560,7 +571,8 @@ static struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, return acl; } -struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst) +struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, + __u16 setting) { struct hci_conn *acl; struct hci_conn *sco; @@ -583,6 +595,8 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst) hci_conn_hold(sco); + sco->setting = setting; + if (acl->state == BT_CONNECTED && (sco->state == BT_OPEN || sco->state == BT_CLOSED)) { set_bit(HCI_CONN_POWER_SAVE, &acl->flags); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index b1016c82c90..ed581b41e03 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -176,7 +176,7 @@ static int sco_connect(struct sock *sk) else type = SCO_LINK; - hcon = hci_connect_sco(hdev, type, dst); + hcon = hci_connect_sco(hdev, type, dst, sco_pi(sk)->setting); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto done; -- cgit v1.2.3-70-g09d2 From 07a5c61eda7f23883273f738edbf6caaebb71923 Mon Sep 17 00:00:00 2001 From: Frédéric Dalleau Date: Mon, 19 Aug 2013 14:24:00 +0200 Subject: Bluetooth: Add constants and macro declaration for transparent data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch defines constants and macro for transparent data LMP features. It refers to Bluetooth Core V4.0 specification, Part C, Chap 3.3 which defines LMP feature mask. Signed-off-by: Frédéric Dalleau Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg Signed-off-by: Gustavo Padovan --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index a01fbb4a049..aaeaf0938ec 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -238,6 +238,7 @@ enum { #define LMP_CVSD 0x01 #define LMP_PSCHEME 0x02 #define LMP_PCONTROL 0x04 +#define LMP_TRANSPARENT 0x08 #define LMP_RSSI_INQ 0x40 #define LMP_ESCO 0x80 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 61ca2ce5b06..b2bfab80184 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -800,6 +800,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_lsto_capable(dev) ((dev)->features[0][7] & LMP_LSTO) #define lmp_inq_tx_pwr_capable(dev) ((dev)->features[0][7] & LMP_INQ_TX_PWR) #define lmp_ext_feat_capable(dev) ((dev)->features[0][7] & LMP_EXTFEATURES) +#define lmp_transp_capable(dev) ((dev)->features[0][2] & LMP_TRANSPARENT) /* ----- Extended LMP capabilities ----- */ #define lmp_host_ssp_capable(dev) ((dev)->features[1][0] & LMP_HOST_SSP) -- cgit v1.2.3-70-g09d2 From 2dea632f9acad076370fe871d4ccc93868621403 Mon Sep 17 00:00:00 2001 From: Frédéric Dalleau Date: Mon, 19 Aug 2013 14:24:03 +0200 Subject: Bluetooth: Add SCO connection fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When initiating a transparent eSCO connection, make use of T2 settings at first try. T2 is the recommended settings from HFP 1.6 WideBand Speech. Upon connection failure, try T1 settings. When CVSD is requested and eSCO is supported, try to establish eSCO connection using S3 settings. If it fails, fallback in sequence to S2, S1, D1, D0 settings. To know which setting should be used, conn->attempt is used. It indicates the currently ongoing SCO connection attempt and can be used as the index for the fallback settings table. These setting and the fallback order are described in Bluetooth HFP 1.6 specification p. 101. Signed-off-by: Frédéric Dalleau Signed-off-by: Johan Hedberg Signed-off-by: Gustavo Padovan --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_conn.c | 44 ++++++++++++++++++++++++++++++++-------- net/bluetooth/hci_event.c | 6 +++--- 3 files changed, 40 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b2bfab80184..3ede820d328 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -570,7 +570,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, } void hci_disconnect(struct hci_conn *conn, __u8 reason); -void hci_setup_sync(struct hci_conn *conn, __u16 handle); +bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index c0e56a512ff..f0817121ec5 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -31,6 +31,24 @@ #include #include +struct sco_param { + u16 pkt_type; + u16 max_latency; +}; + +static const struct sco_param sco_param_cvsd[] = { + { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a }, /* S3 */ + { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007 }, /* S2 */ + { EDR_ESCO_MASK | ESCO_EV3, 0x0007 }, /* S1 */ + { EDR_ESCO_MASK | ESCO_HV3, 0xffff }, /* D1 */ + { EDR_ESCO_MASK | ESCO_HV1, 0xffff }, /* D0 */ +}; + +static const struct sco_param sco_param_wideband[] = { + { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d }, /* T2 */ + { EDR_ESCO_MASK | ESCO_EV3, 0x0008 }, /* T1 */ +}; + static void hci_le_create_connection(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; @@ -172,10 +190,11 @@ static void hci_add_sco(struct hci_conn *conn, __u16 handle) hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp); } -void hci_setup_sync(struct hci_conn *conn, __u16 handle) +bool hci_setup_sync(struct hci_conn *conn, __u16 handle) { struct hci_dev *hdev = conn->hdev; struct hci_cp_setup_sync_conn cp; + const struct sco_param *param; BT_DBG("hcon %p", conn); @@ -192,19 +211,28 @@ void hci_setup_sync(struct hci_conn *conn, __u16 handle) switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: - cp.pkt_type = __constant_cpu_to_le16(EDR_ESCO_MASK & - ~ESCO_2EV3); - cp.max_latency = __constant_cpu_to_le16(0x000d); + if (conn->attempt > ARRAY_SIZE(sco_param_wideband)) + return false; cp.retrans_effort = 0x02; + param = &sco_param_wideband[conn->attempt - 1]; break; case SCO_AIRMODE_CVSD: - cp.pkt_type = cpu_to_le16(conn->pkt_type); - cp.max_latency = __constant_cpu_to_le16(0xffff); - cp.retrans_effort = 0xff; + if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) + return false; + cp.retrans_effort = 0x01; + param = &sco_param_cvsd[conn->attempt - 1]; break; + default: + return false; } - hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp); + cp.pkt_type = __cpu_to_le16(param->pkt_type); + cp.max_latency = __cpu_to_le16(param->max_latency); + + if (hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0) + return false; + + return true; } void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index a8bb7bb3f48..94aab73f89d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2909,11 +2909,11 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, case 0x1c: /* SCO interval rejected */ case 0x1a: /* Unsupported Remote Feature */ case 0x1f: /* Unspecified error */ - if (conn->out && conn->attempt < 2) { + if (conn->out) { conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) | (hdev->esco_type & EDR_ESCO_MASK); - hci_setup_sync(conn, conn->link->handle); - goto unlock; + if (hci_setup_sync(conn, conn->link->handle)) + goto unlock; } /* fall through */ -- cgit v1.2.3-70-g09d2 From fb7589a162162223e6bb6422dde3fb1ce07d9a78 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 Aug 2013 14:31:38 +0400 Subject: tun: Add ability to create tun device with given index Tun devices cannot be created with ifidex user wants, but it's required by checkpoint-restore project. Long time ago such ability was implemented for rtnl_ops-based interface for creating links (9c7dafbf net: Allow to create links with given ifindex), but the only API for creating and managing tuntap devices is ioctl-based and is evolving with adding new ones (cde8b15f tuntap: add ioctl to attach or detach a file form tuntap device). Following that trend, here's how a new ioctl that sets the ifindex for device, that _will_ be created by TUNSETIFF ioctl looks like. So those who want a tuntap device with the ifindex N, should open the tun device, call ioctl(fd, TUNSETIFINDEX, &N), then call TUNSETIFF. If the index N is busy, then the register_netdev will find this out and the ioctl would be failed with -EBUSY. If setifindex is not called, then it will be generated as before. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/net/tun.c | 21 ++++++++++++++++++++- include/uapi/linux/if_tun.h | 1 + 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7ed13cc0dcb..4b65fbcc490 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -138,7 +138,10 @@ struct tun_file { struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; - u16 queue_index; + union { + u16 queue_index; + unsigned int ifindex; + }; struct list_head next; struct tun_struct *detached; }; @@ -1601,6 +1604,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; + dev->ifindex = tfile->ifindex; tun = netdev_priv(dev); tun->dev = dev; @@ -1817,6 +1821,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, kgid_t group; int sndbuf; int vnet_hdr_sz; + unsigned int ifindex; int ret; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) { @@ -1851,6 +1856,19 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = -EFAULT; goto unlock; } + if (cmd == TUNSETIFINDEX) { + ret = -EPERM; + if (tun) + goto unlock; + + ret = -EFAULT; + if (copy_from_user(&ifindex, argp, sizeof(ifindex))) + goto unlock; + + ret = 0; + tfile->ifindex = ifindex; + goto unlock; + } ret = -EBADFD; if (!tun) @@ -2099,6 +2117,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) rcu_assign_pointer(tfile->tun, NULL); tfile->net = get_net(current->nsproxy->net_ns); tfile->flags = 0; + tfile->ifindex = 0; rcu_assign_pointer(tfile->socket.wq, &tfile->wq); init_waitqueue_head(&tfile->wq.wait); diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 1870ee29bb3..c58d023c482 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -56,6 +56,7 @@ #define TUNGETVNETHDRSZ _IOR('T', 215, int) #define TUNSETVNETHDRSZ _IOW('T', 216, int) #define TUNSETQUEUE _IOW('T', 217, int) +#define TUNSETIFINDEX _IOW('T', 218, unsigned int) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3-70-g09d2 From 849c9b6f93cc4cb5eb59301b6380a7a81b43f414 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 Aug 2013 14:32:21 +0400 Subject: tun: Allow to skip filter on attach There's a small problem with sk-filters on tun devices. Consider an application doing this sequence of steps: fd = open("/dev/net/tun"); ioctl(fd, TUNSETIFF, { .ifr_name = "tun0" }); ioctl(fd, TUNATTACHFILTER, &my_filter); ioctl(fd, TUNSETPERSIST, 1); close(fd); At that point the tun0 will remain in the system and will keep in mind that there should be a socket filter at address '&my_filter'. If after that we do fd = open("/dev/net/tun"); ioctl(fd, TUNSETIFF, { .ifr_name = "tun0" }); we most likely receive the -EFAULT error, since tun_attach() would try to connect the filter back. But (!) if we provide a filter at address &my_filter, then tun0 will be created and the "new" filter would be attached, but application may not know about that. This may create certain problems to anyone using tun-s, but it's critical problem for c/r -- if we meet a persistent tun device with a filter in mind, we will not be able to attach to it to dump its state (flags, owner, address, vnethdr size, etc.). The proposal is to allow to attach to tun device (with TUNSETIFF) w/o attaching the filter to the tun-file's socket. After this attach app may e.g clean the device by dropping the filter, it doesn't want to have one, or (in case of c/r) get information about the device with tun ioctls. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/net/tun.c | 12 +++++++----- include/uapi/linux/if_tun.h | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index db43a240973..6acbdbccebc 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -501,7 +501,7 @@ static void tun_detach_all(struct net_device *dev) module_put(THIS_MODULE); } -static int tun_attach(struct tun_struct *tun, struct file *file) +static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter) { struct tun_file *tfile = file->private_data; int err; @@ -526,7 +526,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file) err = 0; /* Re-attach the filter to presist device */ - if (tun->filter_attached == true) { + if (!skip_filter && (tun->filter_attached == true)) { err = sk_attach_filter(&tun->fprog, tfile->socket.sk); if (!err) goto out; @@ -1557,7 +1557,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (err < 0) return err; - err = tun_attach(tun, file); + err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER); if (err < 0) return err; @@ -1631,7 +1631,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev->vlan_features = dev->features; INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file); + err = tun_attach(tun, file, false); if (err < 0) goto err_free_dev; @@ -1795,7 +1795,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; - ret = tun_attach(tun, file); + ret = tun_attach(tun, file, false); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & TUN_TAP_MQ) || tfile->detached) @@ -1883,6 +1883,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; + if (!tfile->socket.sk->sk_filter) + ifr.ifr_flags |= IFF_NOFILTER; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index c58d023c482..cc127b2b4c3 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -71,6 +71,7 @@ #define IFF_DETACH_QUEUE 0x0400 /* read-only flag */ #define IFF_PERSIST 0x0800 +#define IFF_NOFILTER 0x1000 /* Socket options */ #define TUN_TX_TIMESTAMP 1 -- cgit v1.2.3-70-g09d2 From 76975e9cb4a7c6fe39478a3dc4dd292a5c6c8c74 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 Aug 2013 14:32:39 +0400 Subject: tun: Get skfilter layout The only thing we may have from tun device is the fprog, whic contains the number of filter elements and a pointer to (user-space) memory where the elements are. The program itself may not be available if the device is persistent and detached. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/net/tun.c | 10 ++++++++++ include/uapi/linux/if_tun.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 6acbdbccebc..60a1e93e9d3 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2042,6 +2042,16 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun_detach_filter(tun, tun->numqueues); break; + case TUNGETFILTER: + ret = -EINVAL; + if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + break; + ret = -EFAULT; + if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) + break; + ret = 0; + break; + default: ret = -EINVAL; break; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index cc127b2b4c3..e9502dd1ee2 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -57,6 +57,7 @@ #define TUNSETVNETHDRSZ _IOW('T', 216, int) #define TUNSETQUEUE _IOW('T', 217, int) #define TUNSETIFINDEX _IOW('T', 218, unsigned int) +#define TUNGETFILTER _IOR('T', 219, struct sock_fprog) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3-70-g09d2 From 2771399ac9986c75437a83b1c723493cfcdfa439 Mon Sep 17 00:00:00 2001 From: Gerhard Sittig Date: Thu, 22 Aug 2013 21:55:13 +0200 Subject: fs_enet: cleanup clock API use make the Freescale ethernet driver get, prepare and enable the FEC clock during probe(); disable and unprepare the clock upon remove(), put is done by the devm approach; hold a reference to the clock over the period of use. clock lookup is non-fatal as not all platforms provide clock specs in their device tree; failure to enable specified clocks is fatal. Signed-off-by: Gerhard Sittig Signed-off-by: Anatolij Gustschin Signed-off-by: David S. Miller --- .../net/ethernet/freescale/fs_enet/fs_enet-main.c | 20 ++++++++++++++++++++ include/linux/fs_enet_pd.h | 3 +++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index c04eb3aeb83..6b60582ce8c 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -999,6 +999,8 @@ static int fs_enet_probe(struct platform_device *ofdev) struct fs_enet_private *fep; struct fs_platform_info *fpi; const u32 *data; + struct clk *clk; + int err; const u8 *mac_addr; const char *phy_connection_type; int privsize, len, ret = -ENODEV; @@ -1036,6 +1038,20 @@ static int fs_enet_probe(struct platform_device *ofdev) fpi->use_rmii = 1; } + /* make clock lookup non-fatal (the driver is shared among platforms), + * but require enable to succeed when a clock was specified/found, + * keep a reference to the clock upon successful acquisition + */ + clk = devm_clk_get(&ofdev->dev, "per"); + if (!IS_ERR(clk)) { + err = clk_prepare_enable(clk); + if (err) { + ret = err; + goto out_free_fpi; + } + fpi->clk_per = clk; + } + privsize = sizeof(*fep) + sizeof(struct sk_buff **) * (fpi->rx_ring + fpi->tx_ring); @@ -1107,6 +1123,8 @@ out_free_dev: free_netdev(ndev); out_put: of_node_put(fpi->phy_node); + if (fpi->clk_per) + clk_disable_unprepare(fpi->clk_per); out_free_fpi: kfree(fpi); return ret; @@ -1123,6 +1141,8 @@ static int fs_enet_remove(struct platform_device *ofdev) fep->ops->cleanup_data(ndev); dev_set_drvdata(fep->dev, NULL); of_node_put(fep->fpi->phy_node); + if (fep->fpi->clk_per) + clk_disable_unprepare(fep->fpi->clk_per); free_netdev(ndev); return 0; } diff --git a/include/linux/fs_enet_pd.h b/include/linux/fs_enet_pd.h index 343d82a5446..efb05961bdd 100644 --- a/include/linux/fs_enet_pd.h +++ b/include/linux/fs_enet_pd.h @@ -16,6 +16,7 @@ #ifndef FS_ENET_PD_H #define FS_ENET_PD_H +#include #include #include #include @@ -143,6 +144,8 @@ struct fs_platform_info { int use_rmii; /* use RMII mode */ int has_phy; /* if the network is phy container as well...*/ + + struct clk *clk_per; /* 'per' clock for register access */ }; struct fs_mii_fec_platform_info { u32 irq[32]; -- cgit v1.2.3-70-g09d2 From 19504cf5f35fbe85db811fce9f4392a0cbdada2f Mon Sep 17 00:00:00 2001 From: Vladimir Kondratiev Date: Thu, 15 Aug 2013 14:51:28 +0300 Subject: cfg80211: add flags to cfg80211_rx_mgmt() Add flags intended to report various auxiliary information and introduce the NL80211_RXMGMT_FLAG_ANSWERED flag to report that the frame was already answered by the device. Signed-off-by: Vladimir Kondratiev [REPLIED->ANSWERED, reword commit message] Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/wmi.c | 7 +++---- drivers/net/wireless/ath/wil6210/wmi.c | 2 +- drivers/net/wireless/brcm80211/brcmfmac/p2p.c | 4 ++-- drivers/net/wireless/mwifiex/util.c | 4 ++-- include/net/cfg80211.h | 3 ++- include/uapi/linux/nl80211.h | 16 ++++++++++++++++ net/mac80211/rx.c | 3 +-- net/wireless/mlme.c | 4 ++-- net/wireless/nl80211.c | 6 ++++-- net/wireless/nl80211.h | 2 +- 10 files changed, 34 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c index 87aefb4c4c2..546d5da0b89 100644 --- a/drivers/net/wireless/ath/ath6kl/wmi.c +++ b/drivers/net/wireless/ath/ath6kl/wmi.c @@ -568,8 +568,8 @@ static int ath6kl_wmi_rx_probe_req_event_rx(struct wmi *wmi, u8 *datap, int len, dlen, freq, vif->probe_req_report); if (vif->probe_req_report || vif->nw_type == AP_NETWORK) - cfg80211_rx_mgmt(&vif->wdev, freq, 0, - ev->data, dlen, GFP_ATOMIC); + cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0, + GFP_ATOMIC); return 0; } @@ -608,8 +608,7 @@ static int ath6kl_wmi_rx_action_event_rx(struct wmi *wmi, u8 *datap, int len, return -EINVAL; } ath6kl_dbg(ATH6KL_DBG_WMI, "rx_action: len=%u freq=%u\n", dlen, freq); - cfg80211_rx_mgmt(&vif->wdev, freq, 0, - ev->data, dlen, GFP_ATOMIC); + cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0, GFP_ATOMIC); return 0; } diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c index dc8059ad4ba..21c791ee817 100644 --- a/drivers/net/wireless/ath/wil6210/wmi.c +++ b/drivers/net/wireless/ath/wil6210/wmi.c @@ -339,7 +339,7 @@ static void wmi_evt_rx_mgmt(struct wil6210_priv *wil, int id, void *d, int len) } } else { cfg80211_rx_mgmt(wil->wdev, freq, signal, - (void *)rx_mgmt_frame, d_len, GFP_KERNEL); + (void *)rx_mgmt_frame, d_len, 0, GFP_KERNEL); } } diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c index 79555f006d5..d7a97453290 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c @@ -1430,7 +1430,7 @@ int brcmf_p2p_notify_action_frame_rx(struct brcmf_if *ifp, IEEE80211_BAND_5GHZ); wdev = &ifp->vif->wdev; - cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len, + cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len, 0, GFP_ATOMIC); kfree(mgmt_frame); @@ -1895,7 +1895,7 @@ s32 brcmf_p2p_notify_rx_mgmt_p2p_probereq(struct brcmf_if *ifp, IEEE80211_BAND_2GHZ : IEEE80211_BAND_5GHZ); - cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len, + cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len, 0, GFP_ATOMIC); brcmf_dbg(INFO, "mgmt_frame_len (%d) , e->datalen (%d), chanspec (%04x), freq (%d)\n", diff --git a/drivers/net/wireless/mwifiex/util.c b/drivers/net/wireless/mwifiex/util.c index e57ac0dd3ab..5d9e150f411 100644 --- a/drivers/net/wireless/mwifiex/util.c +++ b/drivers/net/wireless/mwifiex/util.c @@ -171,8 +171,8 @@ mwifiex_process_mgmt_packet(struct mwifiex_private *priv, rx_pd->rx_pkt_length = cpu_to_le16(pkt_len); cfg80211_rx_mgmt(priv->wdev, priv->roc_cfg.chan.center_freq, - CAL_RSSI(rx_pd->snr, rx_pd->nf), - skb->data, pkt_len, GFP_ATOMIC); + CAL_RSSI(rx_pd->snr, rx_pd->nf), skb->data, pkt_len, + 0, GFP_ATOMIC); return 0; } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9ab7a0690d9..d530c54a366 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4056,6 +4056,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * @sig_dbm: signal strength in mBm, or 0 if unknown * @buf: Management frame (header + body) * @len: length of the frame data + * @flags: flags, as defined in enum nl80211_rxmgmt_flags * @gfp: context flags * * This function is called whenever an Action frame is received for a station @@ -4067,7 +4068,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * driver is responsible for rejecting the frame. */ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, gfp_t gfp); + const u8 *buf, size_t len, u32 flags, gfp_t gfp); /** * cfg80211_mgmt_tx_status - notification of TX status for management frame diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1f42bc3dcb9..fde2c021b26 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1493,6 +1493,9 @@ enum nl80211_commands { * @NL80211_ATTR_CSA_C_OFF_PRESP: Offset of the channel switch counter * field in the probe response (%NL80211_ATTR_PROBE_RESP). * + * @NL80211_ATTR_RXMGMT_FLAGS: flags for nl80211_send_mgmt(), u32. + * As specified in the &enum nl80211_rxmgmt_flags. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1801,6 +1804,8 @@ enum nl80211_attrs { NL80211_ATTR_CSA_C_OFF_BEACON, NL80211_ATTR_CSA_C_OFF_PRESP, + NL80211_ATTR_RXMGMT_FLAGS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3901,4 +3906,15 @@ enum nl80211_crit_proto_id { /* maximum duration for critical protocol measures */ #define NL80211_CRIT_PROTO_MAX_DURATION 5000 /* msec */ +/** + * enum nl80211_rxmgmt_flags - flags for received management frame. + * + * Used by cfg80211_rx_mgmt() + * + * @NL80211_RXMGMT_FLAG_ANSWERED: frame was answered by device/driver. + */ +enum nl80211_rxmgmt_flags { + NL80211_RXMGMT_FLAG_ANSWERED = 1 << 0, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index ffad155316a..07901050812 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2678,8 +2678,7 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) sig = status->signal; if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, - rx->skb->data, rx->skb->len, - GFP_ATOMIC)) { + rx->skb->data, rx->skb->len, 0, GFP_ATOMIC)) { if (rx->sta) rx->sta->rx_packets++; dev_kfree_skb(rx->skb); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index bfac5e186f5..8d49c1ce3de 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -621,7 +621,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, } bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, - const u8 *buf, size_t len, gfp_t gfp) + const u8 *buf, size_t len, u32 flags, gfp_t gfp) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); @@ -664,7 +664,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, /* Indicate the received Action frame to user space */ if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, freq, sig_mbm, - buf, len, gfp)) + buf, len, flags, gfp)) continue; result = true; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 334697de5cc..a51269d2d48 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10446,7 +10446,7 @@ EXPORT_SYMBOL(cfg80211_rx_unexpected_4addr_frame); int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u32 nlportid, int freq, int sig_dbm, - const u8 *buf, size_t len, gfp_t gfp) + const u8 *buf, size_t len, u32 flags, gfp_t gfp) { struct net_device *netdev = wdev->netdev; struct sk_buff *msg; @@ -10469,7 +10469,9 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || - nla_put(msg, NL80211_ATTR_FRAME, len, buf)) + nla_put(msg, NL80211_ATTR_FRAME, len, buf) || + (flags && + nla_put_u32(msg, NL80211_ATTR_RXMGMT_FLAGS, flags))) goto nla_put_failure; genlmsg_end(msg, hdr); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 44341bf53cf..2c0f2b3c07c 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -66,7 +66,7 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u32 nlpid, int freq, int sig_dbm, - const u8 *buf, size_t len, gfp_t gfp); + const u8 *buf, size_t len, u32 flags, gfp_t gfp); void nl80211_radar_notify(struct cfg80211_registered_device *rdev, -- cgit v1.2.3-70-g09d2 From 03f0d916aa0317592dda11bd17c7357858719b6c Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Wed, 7 Aug 2013 20:01:00 -0700 Subject: openvswitch: Mega flow implementation Add wildcarded flow support in kernel datapath. Wildcarded flow can improve OVS flow set up performance by avoid sending matching new flows to the user space program. The exact performance boost will largely dependent on wildcarded flow hit rate. In case all new flows hits wildcard flows, the flow set up rate is within 5% of that of linux bridge module. Pravin has made significant contributions to this patch. Including API clean ups and bug fixes. Signed-off-by: Pravin B Shelar Signed-off-by: Andy Zhou Signed-off-by: Jesse Gross --- Documentation/networking/openvswitch.txt | 40 + include/uapi/linux/openvswitch.h | 9 +- net/openvswitch/actions.c | 6 +- net/openvswitch/datapath.c | 140 ++- net/openvswitch/datapath.h | 6 + net/openvswitch/flow.c | 1387 ++++++++++++++++++++---------- net/openvswitch/flow.h | 96 ++- 7 files changed, 1171 insertions(+), 513 deletions(-) (limited to 'include') diff --git a/Documentation/networking/openvswitch.txt b/Documentation/networking/openvswitch.txt index 8fa2dd1e792..37c20ee2455 100644 --- a/Documentation/networking/openvswitch.txt +++ b/Documentation/networking/openvswitch.txt @@ -91,6 +91,46 @@ Often we ellipsize arguments not important to the discussion, e.g.: in_port(1), eth(...), eth_type(0x0800), ipv4(...), tcp(...) +Wildcarded flow key format +-------------------------- + +A wildcarded flow is described with two sequences of Netlink attributes +passed over the Netlink socket. A flow key, exactly as described above, and an +optional corresponding flow mask. + +A wildcarded flow can represent a group of exact match flows. Each '1' bit +in the mask specifies a exact match with the corresponding bit in the flow key. +A '0' bit specifies a don't care bit, which will match either a '1' or '0' bit +of a incoming packet. Using wildcarded flow can improve the flow set up rate +by reduce the number of new flows need to be processed by the user space program. + +Support for the mask Netlink attribute is optional for both the kernel and user +space program. The kernel can ignore the mask attribute, installing an exact +match flow, or reduce the number of don't care bits in the kernel to less than +what was specified by the user space program. In this case, variations in bits +that the kernel does not implement will simply result in additional flow setups. +The kernel module will also work with user space programs that neither support +nor supply flow mask attributes. + +Since the kernel may ignore or modify wildcard bits, it can be difficult for +the userspace program to know exactly what matches are installed. There are +two possible approaches: reactively install flows as they miss the kernel +flow table (and therefore not attempt to determine wildcard changes at all) +or use the kernel's response messages to determine the installed wildcards. + +When interacting with userspace, the kernel should maintain the match portion +of the key exactly as originally installed. This will provides a handle to +identify the flow for all future operations. However, when reporting the +mask of an installed flow, the mask should include any restrictions imposed +by the kernel. + +The behavior when using overlapping wildcarded flows is undefined. It is the +responsibility of the user space program to ensure that any incoming packet +can match at most one flow, wildcarded or not. The current implementation +performs best-effort detection of overlapping wildcarded flows and may reject +some but not all of them. However, this behavior may change in future versions. + + Basic rule for evolving flow keys --------------------------------- diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 52490b0e62b..de1fa5d3780 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -1,6 +1,6 @@ /* - * Copyright (c) 2007-2011 Nicira Networks. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -379,6 +379,12 @@ struct ovs_key_nd { * @OVS_FLOW_ATTR_CLEAR: If present in a %OVS_FLOW_CMD_SET request, clears the * last-used time, accumulated TCP flags, and statistics for this flow. * Otherwise ignored in requests. Never present in notifications. + * @OVS_FLOW_ATTR_MASK: Nested %OVS_KEY_ATTR_* attributes specifying the + * mask bits for wildcarded flow match. Mask bit value '1' specifies exact + * match with corresponding flow key bit, while mask bit value '0' specifies + * a wildcarded match. Omitting attribute is treated as wildcarding all + * corresponding fields. Optional for all requests. If not present, + * all flow key bits are exact match bits. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_FLOW_* commands. @@ -391,6 +397,7 @@ enum ovs_flow_attr { OVS_FLOW_ATTR_TCP_FLAGS, /* 8-bit OR'd TCP flags. */ OVS_FLOW_ATTR_USED, /* u64 msecs last used in monotonic time. */ OVS_FLOW_ATTR_CLEAR, /* Flag to clear stats, tcp_flags, used. */ + OVS_FLOW_ATTR_MASK, /* Sequence of OVS_KEY_ATTR_* attributes. */ __OVS_FLOW_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ab101f71544..1f680222f4f 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -376,8 +376,10 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, const struct nlattr *a; int rem; + BUG_ON(!OVS_CB(skb)->pkt_key); + upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.key = &OVS_CB(skb)->flow->key; + upcall.key = OVS_CB(skb)->pkt_key; upcall.userdata = NULL; upcall.portid = 0; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9d97ef3c983..d29cd9aa4a6 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -165,7 +165,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); - ovs_flow_tbl_destroy((__force struct flow_table *)dp->table); + ovs_flow_tbl_destroy((__force struct flow_table *)dp->table, false); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -226,19 +226,18 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) struct sw_flow_key key; u64 *stats_counter; int error; - int key_len; stats = this_cpu_ptr(dp->stats_percpu); /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_extract(skb, p->port_no, &key, &key_len); + error = ovs_flow_extract(skb, p->port_no, &key); if (unlikely(error)) { kfree_skb(skb); return; } /* Look up flow. */ - flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len); + flow = ovs_flow_lookup(rcu_dereference(dp->table), &key); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -253,6 +252,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) } OVS_CB(skb)->flow = flow; + OVS_CB(skb)->pkt_key = &key; stats_counter = &stats->n_hit; ovs_flow_used(OVS_CB(skb)->flow, skb); @@ -435,7 +435,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, upcall->dp_ifindex = dp_ifindex; nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); - ovs_flow_to_nlattrs(upcall_info->key, user_skb); + ovs_flow_to_nlattrs(upcall_info->key, upcall_info->key, user_skb); nla_nest_end(user_skb, nla); if (upcall_info->userdata) @@ -468,7 +468,7 @@ static int flush_flows(struct datapath *dp) rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_deferred_destroy(old_table); + ovs_flow_tbl_destroy(old_table, true); return 0; } @@ -611,10 +611,12 @@ static int validate_tp_port(const struct sw_flow_key *flow_key) static int validate_and_copy_set_tun(const struct nlattr *attr, struct sw_flow_actions **sfa) { - struct ovs_key_ipv4_tunnel tun_key; + struct sw_flow_match match; + struct sw_flow_key key; int err, start; - err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &tun_key); + ovs_match_init(&match, &key, NULL); + err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &match, false); if (err) return err; @@ -622,7 +624,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; - err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key)); + err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, + sizeof(match.key->tun_key)); add_nested_action_end(*sfa, start); return err; @@ -857,7 +860,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct ethhdr *eth; int len; int err; - int key_len; err = -EINVAL; if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || @@ -890,11 +892,11 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_extract(packet, -1, &flow->key, &key_len); + err = ovs_flow_extract(packet, -1, &flow->key); if (err) goto err_flow_free; - err = ovs_flow_metadata_from_nlattrs(flow, key_len, a[OVS_PACKET_ATTR_KEY]); + err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]); if (err) goto err_flow_free; acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); @@ -908,6 +910,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_flow_free; OVS_CB(packet)->flow = flow; + OVS_CB(packet)->pkt_key = &flow->key; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; @@ -922,13 +925,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) local_bh_enable(); rcu_read_unlock(); - ovs_flow_free(flow); + ovs_flow_free(flow, false); return err; err_unlock: rcu_read_unlock(); err_flow_free: - ovs_flow_free(flow); + ovs_flow_free(flow, false); err_kfree_skb: kfree_skb(packet); err: @@ -1045,7 +1048,8 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) if (!start) return -EMSGSIZE; - err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key)); + err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key), + nla_data(ovs_key)); if (err) return err; nla_nest_end(skb, start); @@ -1093,6 +1097,7 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) { return NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */ + + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */ + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + nla_total_size(8) /* OVS_FLOW_ATTR_USED */ @@ -1119,12 +1124,25 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, ovs_header->dp_ifindex = get_dpifindex(dp); + /* Fill flow key. */ nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); if (!nla) goto nla_put_failure; - err = ovs_flow_to_nlattrs(&flow->key, skb); + + err = ovs_flow_to_nlattrs(&flow->unmasked_key, + &flow->unmasked_key, skb); + if (err) + goto error; + nla_nest_end(skb, nla); + + nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK); + if (!nla) + goto nla_put_failure; + + err = ovs_flow_to_nlattrs(&flow->key, &flow->mask->key, skb); if (err) goto error; + nla_nest_end(skb, nla); spin_lock_bh(&flow->lock); @@ -1214,20 +1232,24 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; - struct sw_flow_key key; - struct sw_flow *flow; + struct sw_flow_key key, masked_key; + struct sw_flow *flow = NULL; + struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; struct flow_table *table; struct sw_flow_actions *acts = NULL; + struct sw_flow_match match; int error; - int key_len; /* Extract key. */ error = -EINVAL; if (!a[OVS_FLOW_ATTR_KEY]) goto error; - error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + + ovs_match_init(&match, &key, &mask); + error = ovs_match_from_nlattrs(&match, + a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); if (error) goto error; @@ -1238,9 +1260,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(acts)) goto error; - error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0, &acts); - if (error) + ovs_flow_key_mask(&masked_key, &key, &mask); + error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], + &masked_key, 0, &acts); + if (error) { + OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); goto err_kfree; + } } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) { error = -EINVAL; goto error; @@ -1253,8 +1279,11 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) goto err_unlock_ovs; table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); + + /* Check if this is a duplicate flow */ + flow = ovs_flow_lookup(table, &key); if (!flow) { + struct sw_flow_mask *mask_p; /* Bail out if we're not allowed to create a new flow. */ error = -ENOENT; if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) @@ -1267,7 +1296,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) new_table = ovs_flow_tbl_expand(table); if (!IS_ERR(new_table)) { rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_deferred_destroy(table); + ovs_flow_tbl_destroy(table, true); table = ovsl_dereference(dp->table); } } @@ -1280,14 +1309,30 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) } clear_stats(flow); + flow->key = masked_key; + flow->unmasked_key = key; + + /* Make sure mask is unique in the system */ + mask_p = ovs_sw_flow_mask_find(table, &mask); + if (!mask_p) { + /* Allocate a new mask if none exsits. */ + mask_p = ovs_sw_flow_mask_alloc(); + if (!mask_p) + goto err_flow_free; + mask_p->key = mask.key; + mask_p->range = mask.range; + ovs_sw_flow_mask_insert(table, mask_p); + } + + ovs_sw_flow_mask_add_ref(mask_p); + flow->mask = mask_p; rcu_assign_pointer(flow->sf_acts, acts); /* Put flow in bucket. */ - ovs_flow_tbl_insert(table, flow, &key, key_len); + ovs_flow_insert(table, flow); reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, - OVS_FLOW_CMD_NEW); + info->snd_seq, OVS_FLOW_CMD_NEW); } else { /* We found a matching flow. */ struct sw_flow_actions *old_acts; @@ -1303,6 +1348,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) goto err_unlock_ovs; + /* The unmasked key has to be the same for flow updates. */ + error = -EINVAL; + if (!ovs_flow_cmp_unmasked_key(flow, &key, match.range.end)) { + OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n"); + goto err_unlock_ovs; + } + /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); @@ -1327,6 +1379,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) ovs_dp_flow_multicast_group.id, PTR_ERR(reply)); return 0; +err_flow_free: + ovs_flow_free(flow, false); err_unlock_ovs: ovs_unlock(); err_kfree: @@ -1344,12 +1398,16 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) struct sw_flow *flow; struct datapath *dp; struct flow_table *table; + struct sw_flow_match match; int err; - int key_len; - if (!a[OVS_FLOW_ATTR_KEY]) + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR("Flow get message rejected, Key attribute missing.\n"); return -EINVAL; - err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + } + + ovs_match_init(&match, &key, NULL); + err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL); if (err) return err; @@ -1361,7 +1419,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) } table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); + flow = ovs_flow_lookup_unmasked_key(table, &match); if (!flow) { err = -ENOENT; goto unlock; @@ -1390,8 +1448,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) struct sw_flow *flow; struct datapath *dp; struct flow_table *table; + struct sw_flow_match match; int err; - int key_len; ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); @@ -1404,12 +1462,14 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) err = flush_flows(dp); goto unlock; } - err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + + ovs_match_init(&match, &key, NULL); + err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL); if (err) goto unlock; table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); + flow = ovs_flow_lookup_unmasked_key(table, &match); if (!flow) { err = -ENOENT; goto unlock; @@ -1421,13 +1481,13 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) goto unlock; } - ovs_flow_tbl_remove(table, flow); + ovs_flow_remove(table, flow); err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_DEL); BUG_ON(err < 0); - ovs_flow_deferred_free(flow); + ovs_flow_free(flow, true); ovs_unlock(); ovs_notify(reply, info, &ovs_dp_flow_multicast_group); @@ -1457,7 +1517,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) bucket = cb->args[0]; obj = cb->args[1]; - flow = ovs_flow_tbl_next(table, &bucket, &obj); + flow = ovs_flow_dump_next(table, &bucket, &obj); if (!flow) break; @@ -1680,7 +1740,7 @@ err_destroy_ports_array: err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(ovsl_dereference(dp->table)); + ovs_flow_tbl_destroy(ovsl_dereference(dp->table), false); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); @@ -2287,7 +2347,7 @@ static void rehash_flow_table(struct work_struct *work) new_table = ovs_flow_tbl_rehash(old_table); if (!IS_ERR(new_table)) { rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_deferred_destroy(old_table); + ovs_flow_tbl_destroy(old_table, true); } } } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index a9148648491..4d109c176ef 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -88,11 +88,13 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB * @flow: The flow associated with this packet. May be %NULL if no flow. + * @pkt_key: The flow information extracted from the packet. Must be nonnull. * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the * packet is not being tunneled. */ struct ovs_skb_cb { struct sw_flow *flow; + struct sw_flow_key *pkt_key; struct ovs_key_ipv4_tunnel *tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -183,4 +185,8 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); void ovs_dp_notify_wq(struct work_struct *work); + +#define OVS_NLERR(fmt, ...) \ + pr_info_once("netlink: " fmt, ##__VA_ARGS__) + #endif /* datapath.h */ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index fca282520ce..1fceb965359 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2011 Nicira, Inc. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -46,6 +46,184 @@ static struct kmem_cache *flow_cache; +static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask, + struct sw_flow_key_range *range, u8 val); + +static void update_range__(struct sw_flow_match *match, + size_t offset, size_t size, bool is_mask) +{ + struct sw_flow_key_range *range = NULL; + size_t start = offset; + size_t end = offset + size; + + if (!is_mask) + range = &match->range; + else if (match->mask) + range = &match->mask->range; + + if (!range) + return; + + if (range->start == range->end) { + range->start = start; + range->end = end; + return; + } + + if (range->start > start) + range->start = start; + + if (range->end < end) + range->end = end; +} + +#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ + do { \ + update_range__(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) { \ + if ((match)->mask) \ + (match)->mask->key.field = value; \ + } else { \ + (match)->key->field = value; \ + } \ + } while (0) + +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + do { \ + update_range__(match, offsetof(struct sw_flow_key, field), \ + len, is_mask); \ + if (is_mask) { \ + if ((match)->mask) \ + memcpy(&(match)->mask->key.field, value_p, len);\ + } else { \ + memcpy(&(match)->key->field, value_p, len); \ + } \ + } while (0) + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, + struct sw_flow_mask *mask) +{ + memset(match, 0, sizeof(*match)); + match->key = key; + match->mask = mask; + + memset(key, 0, sizeof(*key)); + + if (mask) { + memset(&mask->key, 0, sizeof(mask->key)); + mask->range.start = mask->range.end = 0; + } +} + +static bool ovs_match_validate(const struct sw_flow_match *match, + u64 key_attrs, u64 mask_attrs) +{ + u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; + u64 mask_allowed = key_attrs; /* At most allow all key attributes */ + + /* The following mask attributes allowed only if they + * pass the validation tests. */ + mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) + | (1 << OVS_KEY_ATTR_IPV6) + | (1 << OVS_KEY_ATTR_TCP) + | (1 << OVS_KEY_ATTR_UDP) + | (1 << OVS_KEY_ATTR_ICMP) + | (1 << OVS_KEY_ATTR_ICMPV6) + | (1 << OVS_KEY_ATTR_ARP) + | (1 << OVS_KEY_ATTR_ND)); + + /* Always allowed mask fields. */ + mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) + | (1 << OVS_KEY_ATTR_IN_PORT) + | (1 << OVS_KEY_ATTR_ETHERTYPE)); + + /* Check key attributes. */ + if (match->key->eth.type == htons(ETH_P_ARP) + || match->key->eth.type == htons(ETH_P_RARP)) { + key_expected |= 1 << OVS_KEY_ATTR_ARP; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ARP; + } + + if (match->key->eth.type == htons(ETH_P_IP)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV4; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + } + + if (match->key->ip.proto == IPPROTO_ICMP) { + key_expected |= 1 << OVS_KEY_ATTR_ICMP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMP; + } + } + } + + if (match->key->eth.type == htons(ETH_P_IPV6)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV6; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + } + + if (match->key->ip.proto == IPPROTO_ICMPV6) { + key_expected |= 1 << OVS_KEY_ATTR_ICMPV6; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; + + if (match->key->ipv6.tp.src == + htons(NDISC_NEIGHBOUR_SOLICITATION) || + match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + key_expected |= 1 << OVS_KEY_ATTR_ND; + if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ND; + } + } + } + } + + if ((key_attrs & key_expected) != key_expected) { + /* Key attributes check failed. */ + OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n", + key_attrs, key_expected); + return false; + } + + if ((mask_attrs & mask_allowed) != mask_attrs) { + /* Mask attributes check failed. */ + OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n", + mask_attrs, mask_allowed); + return false; + } + + return true; +} + static int check_header(struct sk_buff *skb, int len) { if (unlikely(skb->len < len)) @@ -121,12 +299,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies) return cur_ms - idle_ms; } -#define SW_FLOW_KEY_OFFSET(field) \ - (offsetof(struct sw_flow_key, field) + \ - FIELD_SIZEOF(struct sw_flow_key, field)) - -static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, - int *key_lenp) +static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) { unsigned int nh_ofs = skb_network_offset(skb); unsigned int nh_len; @@ -136,8 +309,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, __be16 frag_off; int err; - *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label); - err = check_header(skb, nh_ofs + sizeof(*nh)); if (unlikely(err)) return err; @@ -176,6 +347,21 @@ static bool icmp6hdr_ok(struct sk_buff *skb) sizeof(struct icmp6hdr)); } +void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src, + const struct sw_flow_mask *mask) +{ + u8 *m = (u8 *)&mask->key + mask->range.start; + u8 *s = (u8 *)src + mask->range.start; + u8 *d = (u8 *)dst + mask->range.start; + int i; + + memset(dst, 0, sizeof(*dst)); + for (i = 0; i < ovs_sw_flow_mask_size_roundup(mask); i++) { + *d = *s & *m; + d++, s++, m++; + } +} + #define TCP_FLAGS_OFFSET 13 #define TCP_FLAG_MASK 0x3f @@ -224,6 +410,7 @@ struct sw_flow *ovs_flow_alloc(void) spin_lock_init(&flow->lock); flow->sf_acts = NULL; + flow->mask = NULL; return flow; } @@ -263,7 +450,7 @@ static void free_buckets(struct flex_array *buckets) flex_array_free(buckets); } -struct flow_table *ovs_flow_tbl_alloc(int new_size) +static struct flow_table *__flow_tbl_alloc(int new_size) { struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); @@ -281,17 +468,15 @@ struct flow_table *ovs_flow_tbl_alloc(int new_size) table->node_ver = 0; table->keep_flows = false; get_random_bytes(&table->hash_seed, sizeof(u32)); + table->mask_list = NULL; return table; } -void ovs_flow_tbl_destroy(struct flow_table *table) +static void __flow_tbl_destroy(struct flow_table *table) { int i; - if (!table) - return; - if (table->keep_flows) goto skip_flows; @@ -303,31 +488,55 @@ void ovs_flow_tbl_destroy(struct flow_table *table) hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { hlist_del(&flow->hash_node[ver]); - ovs_flow_free(flow); + ovs_flow_free(flow, false); } } + BUG_ON(!list_empty(table->mask_list)); + kfree(table->mask_list); + skip_flows: free_buckets(table->buckets); kfree(table); } +struct flow_table *ovs_flow_tbl_alloc(int new_size) +{ + struct flow_table *table = __flow_tbl_alloc(new_size); + + if (!table) + return NULL; + + table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL); + if (!table->mask_list) { + table->keep_flows = true; + __flow_tbl_destroy(table); + return NULL; + } + INIT_LIST_HEAD(table->mask_list); + + return table; +} + static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) { struct flow_table *table = container_of(rcu, struct flow_table, rcu); - ovs_flow_tbl_destroy(table); + __flow_tbl_destroy(table); } -void ovs_flow_tbl_deferred_destroy(struct flow_table *table) +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) { if (!table) return; - call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); + if (deferred) + call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); + else + __flow_tbl_destroy(table); } -struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last) +struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *last) { struct sw_flow *flow; struct hlist_head *head; @@ -353,11 +562,13 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la return NULL; } -static void __flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) +static void __tbl_insert(struct flow_table *table, struct sw_flow *flow) { struct hlist_head *head; + head = find_bucket(table, flow->hash); hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); + table->count++; } @@ -377,8 +588,10 @@ static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new head = flex_array_get(old->buckets, i); hlist_for_each_entry(flow, head, hash_node[old_ver]) - __flow_tbl_insert(new, flow); + __tbl_insert(new, flow); } + + new->mask_list = old->mask_list; old->keep_flows = true; } @@ -386,7 +599,7 @@ static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buck { struct flow_table *new_table; - new_table = ovs_flow_tbl_alloc(n_buckets); + new_table = __flow_tbl_alloc(n_buckets); if (!new_table) return ERR_PTR(-ENOMEM); @@ -405,28 +618,30 @@ struct flow_table *ovs_flow_tbl_expand(struct flow_table *table) return __flow_tbl_rehash(table, table->n_buckets * 2); } -void ovs_flow_free(struct sw_flow *flow) +static void __flow_free(struct sw_flow *flow) { - if (unlikely(!flow)) - return; - kfree((struct sf_flow_acts __force *)flow->sf_acts); kmem_cache_free(flow_cache, flow); } -/* RCU callback used by ovs_flow_deferred_free. */ static void rcu_free_flow_callback(struct rcu_head *rcu) { struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); - ovs_flow_free(flow); + __flow_free(flow); } -/* Schedules 'flow' to be freed after the next RCU grace period. - * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_flow_deferred_free(struct sw_flow *flow) +void ovs_flow_free(struct sw_flow *flow, bool deferred) { - call_rcu(&flow->rcu, rcu_free_flow_callback); + if (!flow) + return; + + ovs_sw_flow_mask_del_ref(flow->mask, deferred); + + if (deferred) + call_rcu(&flow->rcu, rcu_free_flow_callback); + else + __flow_free(flow); } /* Schedules 'sf_acts' to be freed after the next RCU grace period. @@ -497,18 +712,15 @@ static __be16 parse_ethertype(struct sk_buff *skb) } static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, - int *key_lenp, int nh_len) + int nh_len) { struct icmp6hdr *icmp = icmp6_hdr(skb); - int error = 0; - int key_len; /* The ICMPv6 type and code fields use the 16-bit transport port * fields, so we need to store them in 16-bit network byte order. */ key->ipv6.tp.src = htons(icmp->icmp6_type); key->ipv6.tp.dst = htons(icmp->icmp6_code); - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (icmp->icmp6_code == 0 && (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || @@ -517,21 +729,17 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, struct nd_msg *nd; int offset; - key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); - /* In order to process neighbor discovery options, we need the * entire packet. */ if (unlikely(icmp_len < sizeof(*nd))) - goto out; - if (unlikely(skb_linearize(skb))) { - error = -ENOMEM; - goto out; - } + return 0; + + if (unlikely(skb_linearize(skb))) + return -ENOMEM; nd = (struct nd_msg *)skb_transport_header(skb); key->ipv6.nd.target = nd->target; - key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); icmp_len -= sizeof(*nd); offset = 0; @@ -541,7 +749,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, int opt_len = nd_opt->nd_opt_len * 8; if (unlikely(!opt_len || opt_len > icmp_len)) - goto invalid; + return 0; /* Store the link layer address if the appropriate * option is provided. It is considered an error if @@ -566,16 +774,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, } } - goto out; + return 0; invalid: memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target)); memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll)); memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll)); -out: - *key_lenp = key_len; - return error; + return 0; } /** @@ -584,7 +790,6 @@ out: * Ethernet header * @in_port: port number on which @skb was received. * @key: output flow key - * @key_lenp: length of output flow key * * The caller must ensure that skb->len >= ETH_HLEN. * @@ -602,11 +807,9 @@ out: * of a correct length, otherwise the same as skb->network_header. * For other key->eth.type values it is left untouched. */ -int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, - int *key_lenp) +int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) { - int error = 0; - int key_len = SW_FLOW_KEY_OFFSET(eth); + int error; struct ethhdr *eth; memset(key, 0, sizeof(*key)); @@ -649,15 +852,13 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, struct iphdr *nh; __be16 offset; - key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); - error = check_iphdr(skb); if (unlikely(error)) { if (error == -EINVAL) { skb->transport_header = skb->network_header; error = 0; } - goto out; + return error; } nh = ip_hdr(skb); @@ -671,7 +872,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, offset = nh->frag_off & htons(IP_OFFSET); if (offset) { key->ip.frag = OVS_FRAG_TYPE_LATER; - goto out; + return 0; } if (nh->frag_off & htons(IP_MF) || skb_shinfo(skb)->gso_type & SKB_GSO_UDP) @@ -679,21 +880,18 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, /* Transport layer. */ if (key->ip.proto == IPPROTO_TCP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); key->ipv4.tp.src = tcp->source; key->ipv4.tp.dst = tcp->dest; } } else if (key->ip.proto == IPPROTO_UDP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->ipv4.tp.src = udp->source; key->ipv4.tp.dst = udp->dest; } } else if (key->ip.proto == IPPROTO_ICMP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (icmphdr_ok(skb)) { struct icmphdr *icmp = icmp_hdr(skb); /* The ICMP type and code fields use the 16-bit @@ -722,53 +920,49 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN); memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN); - key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ - nh_len = parse_ipv6hdr(skb, key, &key_len); + nh_len = parse_ipv6hdr(skb, key); if (unlikely(nh_len < 0)) { - if (nh_len == -EINVAL) + if (nh_len == -EINVAL) { skb->transport_header = skb->network_header; - else + error = 0; + } else { error = nh_len; - goto out; + } + return error; } if (key->ip.frag == OVS_FRAG_TYPE_LATER) - goto out; + return 0; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; /* Transport layer. */ if (key->ip.proto == NEXTHDR_TCP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); key->ipv6.tp.src = tcp->source; key->ipv6.tp.dst = tcp->dest; } } else if (key->ip.proto == NEXTHDR_UDP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->ipv6.tp.src = udp->source; key->ipv6.tp.dst = udp->dest; } } else if (key->ip.proto == NEXTHDR_ICMP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (icmp6hdr_ok(skb)) { - error = parse_icmpv6(skb, key, &key_len, nh_len); - if (error < 0) - goto out; + error = parse_icmpv6(skb, key, nh_len); + if (error) + return error; } } } -out: - *key_lenp = key_len; - return error; + return 0; } static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_len) @@ -777,7 +971,7 @@ static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_l DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0); } -static int flow_key_start(struct sw_flow_key *key) +static int flow_key_start(const struct sw_flow_key *key) { if (key->tun_key.ipv4_dst) return 0; @@ -785,39 +979,95 @@ static int flow_key_start(struct sw_flow_key *key) return offsetof(struct sw_flow_key, phy); } -struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, - struct sw_flow_key *key, int key_len) +static bool __cmp_key(const struct sw_flow_key *key1, + const struct sw_flow_key *key2, int key_start, int key_len) +{ + return !memcmp((u8 *)key1 + key_start, + (u8 *)key2 + key_start, (key_len - key_start)); +} + +static bool __flow_cmp_key(const struct sw_flow *flow, + const struct sw_flow_key *key, int key_start, int key_len) +{ + return __cmp_key(&flow->key, key, key_start, key_len); +} + +static bool __flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, int key_start, int key_len) +{ + return __cmp_key(&flow->unmasked_key, key, key_start, key_len); +} + +bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, int key_len) +{ + int key_start; + key_start = flow_key_start(key); + + return __flow_cmp_unmasked_key(flow, key, key_start, key_len); + +} + +struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table, + struct sw_flow_match *match) +{ + struct sw_flow_key *unmasked = match->key; + int key_len = match->range.end; + struct sw_flow *flow; + + flow = ovs_flow_lookup(table, unmasked); + if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_len))) + flow = NULL; + + return flow; +} + +static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table, + const struct sw_flow_key *flow_key, + struct sw_flow_mask *mask) { struct sw_flow *flow; struct hlist_head *head; - u8 *_key; - int key_start; + int key_start = mask->range.start; + int key_len = mask->range.end; u32 hash; + struct sw_flow_key masked_key; - key_start = flow_key_start(key); - hash = ovs_flow_hash(key, key_start, key_len); - - _key = (u8 *) key + key_start; + ovs_flow_key_mask(&masked_key, flow_key, mask); + hash = ovs_flow_hash(&masked_key, key_start, key_len); head = find_bucket(table, hash); hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { - - if (flow->hash == hash && - !memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) { + if (flow->mask == mask && + __flow_cmp_key(flow, &masked_key, key_start, key_len)) return flow; - } } return NULL; } -void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - struct sw_flow_key *key, int key_len) +struct sw_flow *ovs_flow_lookup(struct flow_table *tbl, + const struct sw_flow_key *key) { - flow->hash = ovs_flow_hash(key, flow_key_start(key), key_len); - memcpy(&flow->key, key, sizeof(flow->key)); - __flow_tbl_insert(table, flow); + struct sw_flow *flow = NULL; + struct sw_flow_mask *mask; + + list_for_each_entry_rcu(mask, tbl->mask_list, list) { + flow = ovs_masked_flow_lookup(tbl, key, mask); + if (flow) /* Found */ + break; + } + + return flow; } -void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) + +void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow) +{ + flow->hash = ovs_flow_hash(&flow->key, flow->mask->range.start, + flow->mask->range.end); + __tbl_insert(table, flow); +} + +void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow) { BUG_ON(table->count == 0); hlist_del_rcu(&flow->hash_node[table->node_ver]); @@ -844,149 +1094,84 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_TUNNEL] = -1, }; -static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, - const struct nlattr *a[], u32 *attrs) -{ - const struct ovs_key_icmp *icmp_key; - const struct ovs_key_tcp *tcp_key; - const struct ovs_key_udp *udp_key; - - switch (swkey->ip.proto) { - case IPPROTO_TCP: - if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_TCP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - swkey->ipv4.tp.src = tcp_key->tcp_src; - swkey->ipv4.tp.dst = tcp_key->tcp_dst; - break; - - case IPPROTO_UDP: - if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_UDP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - swkey->ipv4.tp.src = udp_key->udp_src; - swkey->ipv4.tp.dst = udp_key->udp_dst; - break; - - case IPPROTO_ICMP: - if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ICMP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); - swkey->ipv4.tp.src = htons(icmp_key->icmp_type); - swkey->ipv4.tp.dst = htons(icmp_key->icmp_code); - break; - } - - return 0; -} - -static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, - const struct nlattr *a[], u32 *attrs) +static bool is_all_zero(const u8 *fp, size_t size) { - const struct ovs_key_icmpv6 *icmpv6_key; - const struct ovs_key_tcp *tcp_key; - const struct ovs_key_udp *udp_key; - - switch (swkey->ip.proto) { - case IPPROTO_TCP: - if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_TCP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - swkey->ipv6.tp.src = tcp_key->tcp_src; - swkey->ipv6.tp.dst = tcp_key->tcp_dst; - break; - - case IPPROTO_UDP: - if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_UDP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - swkey->ipv6.tp.src = udp_key->udp_src; - swkey->ipv6.tp.dst = udp_key->udp_dst; - break; - - case IPPROTO_ICMPV6: - if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); - swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type); - swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code); + int i; - if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || - swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { - const struct ovs_key_nd *nd_key; + if (!fp) + return false; - if (!(*attrs & (1 << OVS_KEY_ATTR_ND))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ND); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); - nd_key = nla_data(a[OVS_KEY_ATTR_ND]); - memcpy(&swkey->ipv6.nd.target, nd_key->nd_target, - sizeof(swkey->ipv6.nd.target)); - memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN); - memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN); - } - break; - } + for (i = 0; i < size; i++) + if (fp[i]) + return false; - return 0; + return true; } -static int parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u32 *attrsp) +static int __parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], + u64 *attrsp, bool nz) { const struct nlattr *nla; u32 attrs; int rem; - attrs = 0; + attrs = *attrsp; nla_for_each_nested(nla, attr, rem) { u16 type = nla_type(nla); int expected_len; - if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type)) + if (type > OVS_KEY_ATTR_MAX) { + OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n", + type, OVS_KEY_ATTR_MAX); + } + + if (attrs & (1 << type)) { + OVS_NLERR("Duplicate key attribute (type %d).\n", type); return -EINVAL; + } expected_len = ovs_key_lens[type]; - if (nla_len(nla) != expected_len && expected_len != -1) + if (nla_len(nla) != expected_len && expected_len != -1) { + OVS_NLERR("Key attribute has unexpected length (type=%d" + ", length=%d, expected=%d).\n", type, + nla_len(nla), expected_len); return -EINVAL; + } - attrs |= 1 << type; - a[type] = nla; + if (!nz || !is_all_zero(nla_data(nla), expected_len)) { + attrs |= 1 << type; + a[type] = nla; + } } - if (rem) + if (rem) { + OVS_NLERR("Message has %d unknown bytes.\n", rem); return -EINVAL; + } *attrsp = attrs; return 0; } +static int parse_flow_mask_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp) +{ + return __parse_flow_nlattrs(attr, a, attrsp, true); +} + +static int parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp) +{ + return __parse_flow_nlattrs(attr, a, attrsp, false); +} + int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, - struct ovs_key_ipv4_tunnel *tun_key) + struct sw_flow_match *match, bool is_mask) { struct nlattr *a; int rem; bool ttl = false; - - memset(tun_key, 0, sizeof(*tun_key)); + __be16 tun_flags = 0; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); @@ -1000,53 +1185,78 @@ int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, }; - if (type > OVS_TUNNEL_KEY_ATTR_MAX || - ovs_tunnel_key_lens[type] != nla_len(a)) + if (type > OVS_TUNNEL_KEY_ATTR_MAX) { + OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n", + type, OVS_TUNNEL_KEY_ATTR_MAX); return -EINVAL; + } + + if (ovs_tunnel_key_lens[type] != nla_len(a)) { + OVS_NLERR("IPv4 tunnel attribute type has unexpected " + " length (type=%d, length=%d, expected=%d).\n", + type, nla_len(a), ovs_tunnel_key_lens[type]); + return -EINVAL; + } switch (type) { case OVS_TUNNEL_KEY_ATTR_ID: - tun_key->tun_id = nla_get_be64(a); - tun_key->tun_flags |= TUNNEL_KEY; + SW_FLOW_KEY_PUT(match, tun_key.tun_id, + nla_get_be64(a), is_mask); + tun_flags |= TUNNEL_KEY; break; case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - tun_key->ipv4_src = nla_get_be32(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, + nla_get_be32(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - tun_key->ipv4_dst = nla_get_be32(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, + nla_get_be32(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TOS: - tun_key->ipv4_tos = nla_get_u8(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, + nla_get_u8(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TTL: - tun_key->ipv4_ttl = nla_get_u8(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, + nla_get_u8(a), is_mask); ttl = true; break; case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: - tun_key->tun_flags |= TUNNEL_DONT_FRAGMENT; + tun_flags |= TUNNEL_DONT_FRAGMENT; break; case OVS_TUNNEL_KEY_ATTR_CSUM: - tun_key->tun_flags |= TUNNEL_CSUM; + tun_flags |= TUNNEL_CSUM; break; default: return -EINVAL; - } } - if (rem > 0) - return -EINVAL; - if (!tun_key->ipv4_dst) - return -EINVAL; + SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); - if (!ttl) + if (rem > 0) { + OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem); return -EINVAL; + } + + if (!is_mask) { + if (!match->key->tun_key.ipv4_dst) { + OVS_NLERR("IPv4 tunnel destination address is zero.\n"); + return -EINVAL; + } + + if (!ttl) { + OVS_NLERR("IPv4 tunnel TTL not specified.\n"); + return -EINVAL; + } + } return 0; } int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key) + const struct ovs_key_ipv4_tunnel *tun_key, + const struct ovs_key_ipv4_tunnel *output) { struct nlattr *nla; @@ -1054,23 +1264,24 @@ int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, if (!nla) return -EMSGSIZE; - if (tun_key->tun_flags & TUNNEL_KEY && - nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, tun_key->tun_id)) + if (output->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; - if (tun_key->ipv4_src && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->ipv4_src)) + if (output->ipv4_src && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) return -EMSGSIZE; - if (nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->ipv4_dst)) + if (output->ipv4_dst && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) return -EMSGSIZE; - if (tun_key->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, tun_key->ipv4_tos)) + if (output->ipv4_tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, tun_key->ipv4_ttl)) + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && + if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_CSUM) && + if ((output->tun_flags & TUNNEL_CSUM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; @@ -1078,176 +1289,372 @@ int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } -/** - * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key. - * @swkey: receives the extracted flow key. - * @key_lenp: number of bytes used in @swkey. - * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. - */ -int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, - const struct nlattr *attr) +static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, + const struct nlattr **a, bool is_mask) { - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - const struct ovs_key_ethernet *eth_key; - int key_len; - u32 attrs; - int err; + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { + SW_FLOW_KEY_PUT(match, phy.priority, + nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + } - memset(swkey, 0, sizeof(struct sw_flow_key)); - key_len = SW_FLOW_KEY_OFFSET(eth); + if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { + u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - err = parse_flow_nlattrs(attr, a, &attrs); - if (err) - return err; + if (is_mask) + in_port = 0xffffffff; /* Always exact match in_port. */ + else if (in_port >= DP_MAX_PORTS) + return -EINVAL; - /* Metadata attributes. */ - if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { - swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]); - attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); } - if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { - u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - if (in_port >= DP_MAX_PORTS) - return -EINVAL; - swkey->phy.in_port = in_port; - attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); - } else { - swkey->phy.in_port = DP_MAX_PORTS; + + if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { + uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); + + SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); } - if (attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { - swkey->phy.skb_mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); - attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); + if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { + if (ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask)) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } + return 0; +} - if (attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { - err = ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], &swkey->tun_key); - if (err) - return err; +static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, + const struct nlattr **a, bool is_mask) +{ + int err; + u64 orig_attrs = attrs; - attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); - } + err = metadata_from_nlattrs(match, &attrs, a, is_mask); + if (err) + return err; - /* Data attributes. */ - if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) { + const struct ovs_key_ethernet *eth_key; - eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); - memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN); - memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN); + eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); + SW_FLOW_KEY_MEMCPY(match, eth.src, + eth_key->eth_src, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, eth.dst, + eth_key->eth_dst, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + } - if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) && - nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) { - const struct nlattr *encap; + if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { __be16 tci; - if (attrs != ((1 << OVS_KEY_ATTR_VLAN) | - (1 << OVS_KEY_ATTR_ETHERTYPE) | - (1 << OVS_KEY_ATTR_ENCAP))) - return -EINVAL; - - encap = a[OVS_KEY_ATTR_ENCAP]; tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (tci & htons(VLAN_TAG_PRESENT)) { - swkey->eth.tci = tci; - - err = parse_flow_nlattrs(encap, a, &attrs); - if (err) - return err; - } else if (!tci) { - /* Corner case for truncated 802.1Q header. */ - if (nla_len(encap)) - return -EINVAL; + if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (is_mask) + OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n"); + else + OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n"); - swkey->eth.type = htons(ETH_P_8021Q); - *key_lenp = key_len; - return 0; - } else { return -EINVAL; } - } + + SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + } else if (!is_mask) + SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { - swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - if (ntohs(swkey->eth.type) < ETH_P_802_3_MIN) + __be16 eth_type; + + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + if (is_mask) { + /* Always exact match EtherType. */ + eth_type = htons(0xffff); + } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { + OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n", + ntohs(eth_type), ETH_P_802_3_MIN); return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - } else { - swkey->eth.type = htons(ETH_P_802_2); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); } - if (swkey->eth.type == htons(ETH_P_IP)) { + if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { const struct ovs_key_ipv4 *ipv4_key; - if (!(attrs & (1 << OVS_KEY_ATTR_IPV4))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_IPV4); - - key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); - if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) + if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n", + ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; - swkey->ip.proto = ipv4_key->ipv4_proto; - swkey->ip.tos = ipv4_key->ipv4_tos; - swkey->ip.ttl = ipv4_key->ipv4_ttl; - swkey->ip.frag = ipv4_key->ipv4_frag; - swkey->ipv4.addr.src = ipv4_key->ipv4_src; - swkey->ipv4.addr.dst = ipv4_key->ipv4_dst; - - if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs); - if (err) - return err; } - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - const struct ovs_key_ipv6 *ipv6_key; + SW_FLOW_KEY_PUT(match, ip.proto, + ipv4_key->ipv4_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv4_key->ipv4_tos, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv4_key->ipv4_ttl, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv4_key->ipv4_frag, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + ipv4_key->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + ipv4_key->ipv4_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_IPV4); + } - if (!(attrs & (1 << OVS_KEY_ATTR_IPV6))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { + const struct ovs_key_ipv6 *ipv6_key; - key_len = SW_FLOW_KEY_OFFSET(ipv6.label); ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); - if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) + if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n", + ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; - swkey->ipv6.label = ipv6_key->ipv6_label; - swkey->ip.proto = ipv6_key->ipv6_proto; - swkey->ip.tos = ipv6_key->ipv6_tclass; - swkey->ip.ttl = ipv6_key->ipv6_hlimit; - swkey->ip.frag = ipv6_key->ipv6_frag; - memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src, - sizeof(swkey->ipv6.addr.src)); - memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst, - sizeof(swkey->ipv6.addr.dst)); - - if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs); - if (err) - return err; } - } else if (swkey->eth.type == htons(ETH_P_ARP) || - swkey->eth.type == htons(ETH_P_RARP)) { + SW_FLOW_KEY_PUT(match, ipv6.label, + ipv6_key->ipv6_label, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ipv6_key->ipv6_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv6_key->ipv6_tclass, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv6_key->ipv6_hlimit, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv6_key->ipv6_frag, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src, + ipv6_key->ipv6_src, + sizeof(match->key->ipv6.addr.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, + ipv6_key->ipv6_dst, + sizeof(match->key->ipv6.addr.dst), + is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ARP)) { const struct ovs_key_arp *arp_key; - if (!(attrs & (1 << OVS_KEY_ATTR_ARP))) + arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); + if (!is_mask && (arp_key->arp_op & htons(0xff00))) { + OVS_NLERR("Unknown ARP opcode (opcode=%d).\n", + arp_key->arp_op); return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + arp_key->arp_sip, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + arp_key->arp_tip, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ntohs(arp_key->arp_op), is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, + arp_key->arp_sha, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, + arp_key->arp_tha, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ARP); + } - key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); - arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); - swkey->ipv4.addr.src = arp_key->arp_sip; - swkey->ipv4.addr.dst = arp_key->arp_tip; - if (arp_key->arp_op & htons(0xff00)) + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { + const struct ovs_key_tcp *tcp_key; + + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + tcp_key->tcp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + tcp_key->tcp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_TCP); + } + + if (attrs & (1 << OVS_KEY_ATTR_UDP)) { + const struct ovs_key_udp *udp_key; + + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + udp_key->udp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + udp_key->udp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_UDP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { + const struct ovs_key_icmp *icmp_key; + + icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + htons(icmp_key->icmp_type), is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + htons(icmp_key->icmp_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) { + const struct ovs_key_icmpv6 *icmpv6_key; + + icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + htons(icmpv6_key->icmpv6_type), is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + htons(icmpv6_key->icmpv6_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ND)) { + const struct ovs_key_nd *nd_key; + + nd_key = nla_data(a[OVS_KEY_ATTR_ND]); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, + nd_key->nd_target, + sizeof(match->key->ipv6.nd.target), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, + nd_key->nd_sll, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, + nd_key->nd_tll, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ND); + } + + if (attrs != 0) + return -EINVAL; + + return 0; +} + +/** + * ovs_match_from_nlattrs - parses Netlink attributes into a flow key and + * mask. In case the 'mask' is NULL, the flow is treated as exact match + * flow. Otherwise, it is treated as a wildcarded flow, except the mask + * does not include any don't care bit. + * @match: receives the extracted flow match information. + * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. The fields should of the packet that triggered the creation + * of this flow. + * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink + * attribute specifies the mask field of the wildcarded flow. + */ +int ovs_match_from_nlattrs(struct sw_flow_match *match, + const struct nlattr *key, + const struct nlattr *mask) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + const struct nlattr *encap; + u64 key_attrs = 0; + u64 mask_attrs = 0; + bool encap_valid = false; + int err; + + err = parse_flow_nlattrs(key, a, &key_attrs); + if (err) + return err; + + if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { + __be16 tci; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR("Invalid Vlan frame.\n"); return -EINVAL; - swkey->ip.proto = ntohs(arp_key->arp_op); - memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN); - memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN); + } + + key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + encap = a[OVS_KEY_ATTR_ENCAP]; + key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + encap_valid = true; + + if (tci & htons(VLAN_TAG_PRESENT)) { + err = parse_flow_nlattrs(encap, a, &key_attrs); + if (err) + return err; + } else if (!tci) { + /* Corner case for truncated 802.1Q header. */ + if (nla_len(encap)) { + OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n"); + return -EINVAL; + } + } else { + OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n"); + return -EINVAL; + } } - if (attrs) + err = ovs_key_from_nlattrs(match, key_attrs, a, false); + if (err) + return err; + + if (mask) { + err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); + if (err) + return err; + + if (mask_attrs & 1ULL << OVS_KEY_ATTR_ENCAP) { + __be16 eth_type = 0; + __be16 tci = 0; + + if (!encap_valid) { + OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n"); + return -EINVAL; + } + + mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + if (a[OVS_KEY_ATTR_ETHERTYPE]) + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (eth_type == htons(0xffff)) { + mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + encap = a[OVS_KEY_ATTR_ENCAP]; + err = parse_flow_mask_nlattrs(encap, a, &mask_attrs); + } else { + OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n", + ntohs(eth_type)); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_TAG_PRESENT))) { + OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci)); + return -EINVAL; + } + } + + err = ovs_key_from_nlattrs(match, mask_attrs, a, true); + if (err) + return err; + } else { + /* Populate exact match flow's key mask. */ + if (match->mask) + ovs_sw_flow_mask_set(match->mask, &match->range, 0xff); + } + + if (!ovs_match_validate(match, key_attrs, mask_attrs)) return -EINVAL; - *key_lenp = key_len; return 0; } @@ -1255,7 +1662,6 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, /** * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key. * @flow: Receives extracted in_port, priority, tun_key and skb_mark. - * @key_len: Length of key in @flow. Used for calculating flow hash. * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. * @@ -1264,102 +1670,100 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, * get the metadata, that is, the parts of the flow key that cannot be * extracted from the packet itself. */ -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len, - const struct nlattr *attr) + +int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, + const struct nlattr *attr) { struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; - const struct nlattr *nla; - int rem; + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + u64 attrs = 0; + int err; + struct sw_flow_match match; flow->key.phy.in_port = DP_MAX_PORTS; flow->key.phy.priority = 0; flow->key.phy.skb_mark = 0; memset(tun_key, 0, sizeof(flow->key.tun_key)); - nla_for_each_nested(nla, attr, rem) { - int type = nla_type(nla); - - if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) { - int err; - - if (nla_len(nla) != ovs_key_lens[type]) - return -EINVAL; - - switch (type) { - case OVS_KEY_ATTR_PRIORITY: - flow->key.phy.priority = nla_get_u32(nla); - break; - - case OVS_KEY_ATTR_TUNNEL: - err = ovs_ipv4_tun_from_nlattr(nla, tun_key); - if (err) - return err; - break; - - case OVS_KEY_ATTR_IN_PORT: - if (nla_get_u32(nla) >= DP_MAX_PORTS) - return -EINVAL; - flow->key.phy.in_port = nla_get_u32(nla); - break; - - case OVS_KEY_ATTR_SKB_MARK: - flow->key.phy.skb_mark = nla_get_u32(nla); - break; - } - } - } - if (rem) + err = parse_flow_nlattrs(attr, a, &attrs); + if (err) return -EINVAL; - flow->hash = ovs_flow_hash(&flow->key, - flow_key_start(&flow->key), key_len); + memset(&match, 0, sizeof(match)); + match.key = &flow->key; + + err = metadata_from_nlattrs(&match, &attrs, a, false); + if (err) + return err; return 0; } -int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) +int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) { struct ovs_key_ethernet *eth_key; struct nlattr *nla, *encap; + bool is_mask = (swkey != output); - if (swkey->phy.priority && - nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority)) + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if (swkey->tun_key.ipv4_dst && - ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key)) + if ((swkey->tun_key.ipv4_dst || is_mask) && + ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) goto nla_put_failure; - if (swkey->phy.in_port != DP_MAX_PORTS && - nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port)) - goto nla_put_failure; + if (swkey->phy.in_port == DP_MAX_PORTS) { + if (is_mask && (output->phy.in_port == 0xffff)) + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) + goto nla_put_failure; + } else { + u16 upper_u16; + upper_u16 = !is_mask ? 0 : 0xffff; - if (swkey->phy.skb_mark && - nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, swkey->phy.skb_mark)) + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, + (upper_u16 << 16) | output->phy.in_port)) + goto nla_put_failure; + } + + if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) goto nla_put_failure; nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); if (!nla) goto nla_put_failure; + eth_key = nla_data(nla); - memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN); - memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN); + memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN); + memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN); if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q)) || - nla_put_be16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci)) + __be16 eth_type; + eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) goto nla_put_failure; encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); if (!swkey->eth.tci) goto unencap; - } else { + } else encap = NULL; - } - if (swkey->eth.type == htons(ETH_P_802_2)) + if (swkey->eth.type == htons(ETH_P_802_2)) { + /* + * Ethertype 802.2 is represented in the netlink with omitted + * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and + * 0xffff in the mask attribute. Ethertype can also + * be wildcarded. + */ + if (is_mask && output->eth.type) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, + output->eth.type)) + goto nla_put_failure; goto unencap; + } - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type)) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) goto nla_put_failure; if (swkey->eth.type == htons(ETH_P_IP)) { @@ -1369,12 +1773,12 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; ipv4_key = nla_data(nla); - ipv4_key->ipv4_src = swkey->ipv4.addr.src; - ipv4_key->ipv4_dst = swkey->ipv4.addr.dst; - ipv4_key->ipv4_proto = swkey->ip.proto; - ipv4_key->ipv4_tos = swkey->ip.tos; - ipv4_key->ipv4_ttl = swkey->ip.ttl; - ipv4_key->ipv4_frag = swkey->ip.frag; + ipv4_key->ipv4_src = output->ipv4.addr.src; + ipv4_key->ipv4_dst = output->ipv4.addr.dst; + ipv4_key->ipv4_proto = output->ip.proto; + ipv4_key->ipv4_tos = output->ip.tos; + ipv4_key->ipv4_ttl = output->ip.ttl; + ipv4_key->ipv4_frag = output->ip.frag; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { struct ovs_key_ipv6 *ipv6_key; @@ -1382,15 +1786,15 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; ipv6_key = nla_data(nla); - memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src, + memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, sizeof(ipv6_key->ipv6_src)); - memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst, + memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, sizeof(ipv6_key->ipv6_dst)); - ipv6_key->ipv6_label = swkey->ipv6.label; - ipv6_key->ipv6_proto = swkey->ip.proto; - ipv6_key->ipv6_tclass = swkey->ip.tos; - ipv6_key->ipv6_hlimit = swkey->ip.ttl; - ipv6_key->ipv6_frag = swkey->ip.frag; + ipv6_key->ipv6_label = output->ipv6.label; + ipv6_key->ipv6_proto = output->ip.proto; + ipv6_key->ipv6_tclass = output->ip.tos; + ipv6_key->ipv6_hlimit = output->ip.ttl; + ipv6_key->ipv6_frag = output->ip.frag; } else if (swkey->eth.type == htons(ETH_P_ARP) || swkey->eth.type == htons(ETH_P_RARP)) { struct ovs_key_arp *arp_key; @@ -1400,11 +1804,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) goto nla_put_failure; arp_key = nla_data(nla); memset(arp_key, 0, sizeof(struct ovs_key_arp)); - arp_key->arp_sip = swkey->ipv4.addr.src; - arp_key->arp_tip = swkey->ipv4.addr.dst; - arp_key->arp_op = htons(swkey->ip.proto); - memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN); - memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN); + arp_key->arp_sip = output->ipv4.addr.src; + arp_key->arp_tip = output->ipv4.addr.dst; + arp_key->arp_op = htons(output->ip.proto); + memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN); + memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN); } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -1419,11 +1823,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) goto nla_put_failure; tcp_key = nla_data(nla); if (swkey->eth.type == htons(ETH_P_IP)) { - tcp_key->tcp_src = swkey->ipv4.tp.src; - tcp_key->tcp_dst = swkey->ipv4.tp.dst; + tcp_key->tcp_src = output->ipv4.tp.src; + tcp_key->tcp_dst = output->ipv4.tp.dst; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - tcp_key->tcp_src = swkey->ipv6.tp.src; - tcp_key->tcp_dst = swkey->ipv6.tp.dst; + tcp_key->tcp_src = output->ipv6.tp.src; + tcp_key->tcp_dst = output->ipv6.tp.dst; } } else if (swkey->ip.proto == IPPROTO_UDP) { struct ovs_key_udp *udp_key; @@ -1433,11 +1837,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) goto nla_put_failure; udp_key = nla_data(nla); if (swkey->eth.type == htons(ETH_P_IP)) { - udp_key->udp_src = swkey->ipv4.tp.src; - udp_key->udp_dst = swkey->ipv4.tp.dst; + udp_key->udp_src = output->ipv4.tp.src; + udp_key->udp_dst = output->ipv4.tp.dst; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - udp_key->udp_src = swkey->ipv6.tp.src; - udp_key->udp_dst = swkey->ipv6.tp.dst; + udp_key->udp_src = output->ipv6.tp.src; + udp_key->udp_dst = output->ipv6.tp.dst; } } else if (swkey->eth.type == htons(ETH_P_IP) && swkey->ip.proto == IPPROTO_ICMP) { @@ -1447,8 +1851,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; icmp_key = nla_data(nla); - icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src); - icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst); + icmp_key->icmp_type = ntohs(output->ipv4.tp.src); + icmp_key->icmp_code = ntohs(output->ipv4.tp.dst); } else if (swkey->eth.type == htons(ETH_P_IPV6) && swkey->ip.proto == IPPROTO_ICMPV6) { struct ovs_key_icmpv6 *icmpv6_key; @@ -1458,8 +1862,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; icmpv6_key = nla_data(nla); - icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src); - icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst); + icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src); + icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst); if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { @@ -1469,10 +1873,10 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; nd_key = nla_data(nla); - memcpy(nd_key->nd_target, &swkey->ipv6.nd.target, + memcpy(nd_key->nd_target, &output->ipv6.nd.target, sizeof(nd_key->nd_target)); - memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN); - memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN); + memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN); + memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN); } } } @@ -1504,3 +1908,84 @@ void ovs_flow_exit(void) { kmem_cache_destroy(flow_cache); } + +struct sw_flow_mask *ovs_sw_flow_mask_alloc(void) +{ + struct sw_flow_mask *mask; + + mask = kmalloc(sizeof(*mask), GFP_KERNEL); + if (mask) + mask->ref_count = 0; + + return mask; +} + +void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask) +{ + mask->ref_count++; +} + +void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred) +{ + if (!mask) + return; + + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) { + list_del_rcu(&mask->list); + if (deferred) + kfree_rcu(mask, rcu); + else + kfree(mask); + } +} + +static bool ovs_sw_flow_mask_equal(const struct sw_flow_mask *a, + const struct sw_flow_mask *b) +{ + u8 *a_ = (u8 *)&a->key + a->range.start; + u8 *b_ = (u8 *)&b->key + b->range.start; + + return (a->range.end == b->range.end) + && (a->range.start == b->range.start) + && (memcmp(a_, b_, ovs_sw_flow_mask_actual_size(a)) == 0); +} + +struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl, + const struct sw_flow_mask *mask) +{ + struct list_head *ml; + + list_for_each(ml, tbl->mask_list) { + struct sw_flow_mask *m; + m = container_of(ml, struct sw_flow_mask, list); + if (ovs_sw_flow_mask_equal(mask, m)) + return m; + } + + return NULL; +} + +/** + * add a new mask into the mask list. + * The caller needs to make sure that 'mask' is not the same + * as any masks that are already on the list. + */ +void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + list_add_rcu(&mask->list, tbl->mask_list); +} + +/** + * Set 'range' fields in the mask to the value of 'val'. + */ +static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask, + struct sw_flow_key_range *range, u8 val) +{ + u8 *m = (u8 *)&mask->key + range->start; + + mask->range = *range; + memset(m, val, ovs_sw_flow_mask_size_roundup(mask)); +} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 66ef7220293..9674e45f696 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2011 Nicira, Inc. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -33,6 +33,8 @@ #include struct sk_buff; +struct sw_flow_mask; +struct flow_table; struct sw_flow_actions { struct rcu_head rcu; @@ -131,6 +133,8 @@ struct sw_flow { u32 hash; struct sw_flow_key key; + struct sw_flow_key unmasked_key; + struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; spinlock_t lock; /* Lock for values below. */ @@ -140,6 +144,25 @@ struct sw_flow { u8 tcp_flags; /* Union of seen TCP flags. */ }; +struct sw_flow_key_range { + size_t start; + size_t end; +}; + +static inline u16 ovs_sw_flow_key_range_actual_size(const struct sw_flow_key_range *range) +{ + return range->end - range->start; +} + +struct sw_flow_match { + struct sw_flow_key *key; + struct sw_flow_key_range range; + struct sw_flow_mask *mask; +}; + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, struct sw_flow_mask *mask); + struct arp_eth_header { __be16 ar_hrd; /* format of hardware address */ __be16 ar_pro; /* format of protocol address */ @@ -159,21 +182,21 @@ void ovs_flow_exit(void); struct sw_flow *ovs_flow_alloc(void); void ovs_flow_deferred_free(struct sw_flow *); -void ovs_flow_free(struct sw_flow *flow); +void ovs_flow_free(struct sw_flow *, bool deferred); struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len); void ovs_flow_deferred_free_acts(struct sw_flow_actions *); -int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *, - int *key_lenp); +int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *); void ovs_flow_used(struct sw_flow *, struct sk_buff *); u64 ovs_flow_used_time(unsigned long flow_jiffies); - -int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *); -int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, +int ovs_flow_to_nlattrs(const struct sw_flow_key *, + const struct sw_flow_key *, struct sk_buff *); +int ovs_match_from_nlattrs(struct sw_flow_match *match, + const struct nlattr *, const struct nlattr *); -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len, - const struct nlattr *attr); +int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, + const struct nlattr *attr); #define MAX_ACTIONS_BUFSIZE (32 * 1024) #define TBL_MIN_BUCKETS 1024 @@ -182,6 +205,7 @@ struct flow_table { struct flex_array *buckets; unsigned int count, n_buckets; struct rcu_head rcu; + struct list_head *mask_list; int node_ver; u32 hash_seed; bool keep_flows; @@ -197,22 +221,56 @@ static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table) return (table->count > table->n_buckets); } -struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, - struct sw_flow_key *key, int len); -void ovs_flow_tbl_destroy(struct flow_table *table); -void ovs_flow_tbl_deferred_destroy(struct flow_table *table); +struct sw_flow *ovs_flow_lookup(struct flow_table *, + const struct sw_flow_key *); +struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table, + struct sw_flow_match *match); + +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); struct flow_table *ovs_flow_tbl_alloc(int new_size); struct flow_table *ovs_flow_tbl_expand(struct flow_table *table); struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table); -void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - struct sw_flow_key *key, int key_len); -void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); -struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx); +void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow); +void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow); + +struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *idx); extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1]; int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, - struct ovs_key_ipv4_tunnel *tun_key); + struct sw_flow_match *match, bool is_mask); int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key); + const struct ovs_key_ipv4_tunnel *tun_key, + const struct ovs_key_ipv4_tunnel *output); + +bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, int key_len); + +struct sw_flow_mask { + int ref_count; + struct rcu_head rcu; + struct list_head list; + struct sw_flow_key_range range; + struct sw_flow_key key; +}; + +static inline u16 +ovs_sw_flow_mask_actual_size(const struct sw_flow_mask *mask) +{ + return ovs_sw_flow_key_range_actual_size(&mask->range); +} + +static inline u16 +ovs_sw_flow_mask_size_roundup(const struct sw_flow_mask *mask) +{ + return roundup(ovs_sw_flow_mask_actual_size(mask), sizeof(u32)); +} +struct sw_flow_mask *ovs_sw_flow_mask_alloc(void); +void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *); +void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred); +void ovs_sw_flow_mask_insert(struct flow_table *, struct sw_flow_mask *); +struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *, + const struct sw_flow_mask *); +void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src, + const struct sw_flow_mask *mask); #endif /* flow.h */ -- cgit v1.2.3-70-g09d2 From 280c571e1a0f553d4f2384f1a8b643b5de26e0ec Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 23 Jul 2013 13:37:45 +0900 Subject: net: Add NEXTHDR_SCTP to ipv6.h Signed-off-by: Joe Stringer Signed-off-by: Jesse Gross --- include/net/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5fe56498517..7bdff045506 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -41,6 +41,7 @@ #define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ #define NEXTHDR_NONE 59 /* No next header */ #define NEXTHDR_DEST 60 /* Destination options header. */ +#define NEXTHDR_SCTP 132 /* SCTP message. */ #define NEXTHDR_MOBILITY 135 /* Mobility header. */ #define NEXTHDR_MAX 255 -- cgit v1.2.3-70-g09d2 From cfe51ec1ae427ec0be5a7670eae815ce5eb30e1c Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Sat, 24 Aug 2013 00:32:30 +0200 Subject: bcma: add method to power up and down the PCIe core by wifi driver The wifi driver should tell the PCIe core that it is now in operation so that some workarounds can be applied and the power state is changed. This should replace the call to bcma_core_pci_extend_L1timer by the brcmsmac driver. Signed-off-by: Hauke Mehrtens Signed-off-by: John W. Linville --- drivers/bcma/driver_pci.c | 26 ++++++++++++++++++++++++++ include/linux/bcma/bcma_driver_pci.h | 3 +++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/drivers/bcma/driver_pci.c b/drivers/bcma/driver_pci.c index cf7a476a519..6ea817fae59 100644 --- a/drivers/bcma/driver_pci.c +++ b/drivers/bcma/driver_pci.c @@ -275,3 +275,29 @@ void bcma_core_pci_extend_L1timer(struct bcma_drv_pci *pc, bool extend) bcma_pcie_read(pc, BCMA_CORE_PCI_DLLP_PMTHRESHREG); } EXPORT_SYMBOL_GPL(bcma_core_pci_extend_L1timer); + +void bcma_core_pci_up(struct bcma_bus *bus) +{ + struct bcma_drv_pci *pc; + + if (bus->hosttype != BCMA_HOSTTYPE_PCI) + return; + + pc = &bus->drv_pci[0]; + + bcma_core_pci_extend_L1timer(pc, true); +} +EXPORT_SYMBOL_GPL(bcma_core_pci_up); + +void bcma_core_pci_down(struct bcma_bus *bus) +{ + struct bcma_drv_pci *pc; + + if (bus->hosttype != BCMA_HOSTTYPE_PCI) + return; + + pc = &bus->drv_pci[0]; + + bcma_core_pci_extend_L1timer(pc, false); +} +EXPORT_SYMBOL_GPL(bcma_core_pci_down); diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index 424760f01b9..27f9ceb68b4 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -185,6 +185,7 @@ struct pci_dev; #define BCMA_CORE_PCI_RC_CRS_VISIBILITY 0x0001 struct bcma_drv_pci; +struct bcma_bus; #ifdef CONFIG_BCMA_DRIVER_PCI_HOSTMODE struct bcma_drv_pci_host { @@ -220,6 +221,8 @@ extern void bcma_core_pci_init(struct bcma_drv_pci *pc); extern int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core, bool enable); extern void bcma_core_pci_extend_L1timer(struct bcma_drv_pci *pc, bool extend); +extern void bcma_core_pci_up(struct bcma_bus *bus); +extern void bcma_core_pci_down(struct bcma_bus *bus); extern int bcma_core_pci_pcibios_map_irq(const struct pci_dev *dev); extern int bcma_core_pci_plat_dev_init(struct pci_dev *dev); -- cgit v1.2.3-70-g09d2 From 780335acc815802dcee63d75f5589d43c3ccb402 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Sat, 24 Aug 2013 00:32:32 +0200 Subject: bcma: do not export bcma_core_pci_extend_L1timer() This is not called any more, do not export it. Signed-off-by: Hauke Mehrtens Signed-off-by: John W. Linville --- drivers/bcma/driver_pci.c | 3 +-- include/linux/bcma/bcma_driver_pci.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/bcma/driver_pci.c b/drivers/bcma/driver_pci.c index 6ea817fae59..d51d1943a20 100644 --- a/drivers/bcma/driver_pci.c +++ b/drivers/bcma/driver_pci.c @@ -262,7 +262,7 @@ out: } EXPORT_SYMBOL_GPL(bcma_core_pci_irq_ctl); -void bcma_core_pci_extend_L1timer(struct bcma_drv_pci *pc, bool extend) +static void bcma_core_pci_extend_L1timer(struct bcma_drv_pci *pc, bool extend) { u32 w; @@ -274,7 +274,6 @@ void bcma_core_pci_extend_L1timer(struct bcma_drv_pci *pc, bool extend) bcma_pcie_write(pc, BCMA_CORE_PCI_DLLP_PMTHRESHREG, w); bcma_pcie_read(pc, BCMA_CORE_PCI_DLLP_PMTHRESHREG); } -EXPORT_SYMBOL_GPL(bcma_core_pci_extend_L1timer); void bcma_core_pci_up(struct bcma_bus *bus) { diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index 27f9ceb68b4..0234955aa9c 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -220,7 +220,6 @@ struct bcma_drv_pci { extern void bcma_core_pci_init(struct bcma_drv_pci *pc); extern int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core, bool enable); -extern void bcma_core_pci_extend_L1timer(struct bcma_drv_pci *pc, bool extend); extern void bcma_core_pci_up(struct bcma_bus *bus); extern void bcma_core_pci_down(struct bcma_bus *bus); -- cgit v1.2.3-70-g09d2 From 521deea64088bc885a76bd174241eaa3d3a6876f Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Sat, 24 Aug 2013 00:32:33 +0200 Subject: bcma: add bcma_core_pci_power_save() This enables or disables power saving on the PCIe bus when the wifi is in operation or not. Signed-off-by: Hauke Mehrtens Signed-off-by: John W. Linville --- drivers/bcma/driver_pci.c | 36 +++++++++++++++++++++++++++++++++--- include/linux/bcma/bcma_driver_pci.h | 20 ++++++++++++++++++++ 2 files changed, 53 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/bcma/driver_pci.c b/drivers/bcma/driver_pci.c index d51d1943a20..c9fd6943ce4 100644 --- a/drivers/bcma/driver_pci.c +++ b/drivers/bcma/driver_pci.c @@ -31,7 +31,7 @@ static void bcma_pcie_write(struct bcma_drv_pci *pc, u32 address, u32 data) pcicore_write32(pc, BCMA_CORE_PCI_PCIEIND_DATA, data); } -static void bcma_pcie_mdio_set_phy(struct bcma_drv_pci *pc, u8 phy) +static void bcma_pcie_mdio_set_phy(struct bcma_drv_pci *pc, u16 phy) { u32 v; int i; @@ -55,7 +55,7 @@ static void bcma_pcie_mdio_set_phy(struct bcma_drv_pci *pc, u8 phy) } } -static u16 bcma_pcie_mdio_read(struct bcma_drv_pci *pc, u8 device, u8 address) +static u16 bcma_pcie_mdio_read(struct bcma_drv_pci *pc, u16 device, u8 address) { int max_retries = 10; u16 ret = 0; @@ -98,7 +98,7 @@ static u16 bcma_pcie_mdio_read(struct bcma_drv_pci *pc, u8 device, u8 address) return ret; } -static void bcma_pcie_mdio_write(struct bcma_drv_pci *pc, u8 device, +static void bcma_pcie_mdio_write(struct bcma_drv_pci *pc, u16 device, u8 address, u16 data) { int max_retries = 10; @@ -137,6 +137,13 @@ static void bcma_pcie_mdio_write(struct bcma_drv_pci *pc, u8 device, pcicore_write32(pc, BCMA_CORE_PCI_MDIO_CONTROL, 0); } +static u16 bcma_pcie_mdio_writeread(struct bcma_drv_pci *pc, u16 device, + u8 address, u16 data) +{ + bcma_pcie_mdio_write(pc, device, address, data); + return bcma_pcie_mdio_read(pc, device, address); +} + /************************************************** * Workarounds. **************************************************/ @@ -203,6 +210,25 @@ static void bcma_core_pci_config_fixup(struct bcma_drv_pci *pc) } } +static void bcma_core_pci_power_save(struct bcma_drv_pci *pc, bool up) +{ + u16 data; + + if (pc->core->id.rev >= 15 && pc->core->id.rev <= 20) { + data = up ? 0x74 : 0x7C; + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7F64); + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data); + } else if (pc->core->id.rev >= 21 && pc->core->id.rev <= 22) { + data = up ? 0x75 : 0x7D; + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7E65); + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data); + } +} + /************************************************** * Init. **************************************************/ @@ -284,6 +310,8 @@ void bcma_core_pci_up(struct bcma_bus *bus) pc = &bus->drv_pci[0]; + bcma_core_pci_power_save(pc, true); + bcma_core_pci_extend_L1timer(pc, true); } EXPORT_SYMBOL_GPL(bcma_core_pci_up); @@ -298,5 +326,7 @@ void bcma_core_pci_down(struct bcma_bus *bus) pc = &bus->drv_pci[0]; bcma_core_pci_extend_L1timer(pc, false); + + bcma_core_pci_power_save(pc, false); } EXPORT_SYMBOL_GPL(bcma_core_pci_down); diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index 0234955aa9c..d66033f418c 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -181,6 +181,26 @@ struct pci_dev; #define BCMA_CORE_PCI_CFG_DEVCTRL 0xd8 +#define BCMA_CORE_PCI_ + +/* MDIO devices (SERDES modules) */ +#define BCMA_CORE_PCI_MDIO_IEEE0 0x000 +#define BCMA_CORE_PCI_MDIO_IEEE1 0x001 +#define BCMA_CORE_PCI_MDIO_BLK0 0x800 +#define BCMA_CORE_PCI_MDIO_BLK1 0x801 +#define BCMA_CORE_PCI_MDIO_BLK1_MGMT0 0x16 +#define BCMA_CORE_PCI_MDIO_BLK1_MGMT1 0x17 +#define BCMA_CORE_PCI_MDIO_BLK1_MGMT2 0x18 +#define BCMA_CORE_PCI_MDIO_BLK1_MGMT3 0x19 +#define BCMA_CORE_PCI_MDIO_BLK1_MGMT4 0x1A +#define BCMA_CORE_PCI_MDIO_BLK2 0x802 +#define BCMA_CORE_PCI_MDIO_BLK3 0x803 +#define BCMA_CORE_PCI_MDIO_BLK4 0x804 +#define BCMA_CORE_PCI_MDIO_TXPLL 0x808 /* TXPLL register block idx */ +#define BCMA_CORE_PCI_MDIO_TXCTRL0 0x820 +#define BCMA_CORE_PCI_MDIO_SERDESID 0x831 +#define BCMA_CORE_PCI_MDIO_RXCTRL0 0x840 + /* PCIE Root Capability Register bits (Host mode only) */ #define BCMA_CORE_PCI_RC_CRS_VISIBILITY 0x0001 -- cgit v1.2.3-70-g09d2 From a175a723301a8a4a9fedf9ce5b8ca586e7a97b40 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Thu, 22 Aug 2013 12:30:48 -0700 Subject: openvswitch: Add SCTP support This patch adds support for rewriting SCTP src,dst ports similar to the functionality already available for TCP/UDP. Rewriting SCTP ports is expensive due to double-recalculation of the SCTP checksums; this is performed to ensure that packets traversing OVS with invalid checksums will continue to the destination with any checksum corruption intact. Reviewed-by: Simon Horman Signed-off-by: Joe Stringer Signed-off-by: Ben Pfaff Signed-off-by: Jesse Gross --- include/uapi/linux/openvswitch.h | 6 ++++ net/openvswitch/Kconfig | 1 + net/openvswitch/actions.c | 39 ++++++++++++++++++++++++ net/openvswitch/datapath.c | 6 ++++ net/openvswitch/flow.c | 65 ++++++++++++++++++++++++++++++++++++++++ net/openvswitch/flow.h | 8 ++--- 6 files changed, 121 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index de1fa5d3780..a74d375b439 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -259,6 +259,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_ND, /* struct ovs_key_nd */ OVS_KEY_ATTR_SKB_MARK, /* u32 skb mark */ OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ + OVS_KEY_ATTR_SCTP, /* struct ovs_key_sctp */ #ifdef __KERNEL__ OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ @@ -333,6 +334,11 @@ struct ovs_key_udp { __be16 udp_dst; }; +struct ovs_key_sctp { + __be16 sctp_src; + __be16 sctp_dst; +}; + struct ovs_key_icmp { __u8 icmp_type; __u8 icmp_code; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index bed30e69baa..6ecf491ad50 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -4,6 +4,7 @@ config OPENVSWITCH tristate "Open vSwitch" + select LIBCRC32C ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 1f680222f4f..65cfaa81607 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include #include +#include #include "datapath.h" #include "vport.h" @@ -352,6 +354,39 @@ static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) return 0; } +static int set_sctp(struct sk_buff *skb, + const struct ovs_key_sctp *sctp_port_key) +{ + struct sctphdr *sh; + int err; + unsigned int sctphoff = skb_transport_offset(skb); + + err = make_writable(skb, sctphoff + sizeof(struct sctphdr)); + if (unlikely(err)) + return err; + + sh = sctp_hdr(skb); + if (sctp_port_key->sctp_src != sh->source || + sctp_port_key->sctp_dst != sh->dest) { + __le32 old_correct_csum, new_csum, old_csum; + + old_csum = sh->checksum; + old_correct_csum = sctp_compute_cksum(skb, sctphoff); + + sh->source = sctp_port_key->sctp_src; + sh->dest = sctp_port_key->sctp_dst; + + new_csum = sctp_compute_cksum(skb, sctphoff); + + /* Carry any checksum errors through. */ + sh->checksum = old_csum ^ old_correct_csum ^ new_csum; + + skb->rxhash = 0; + } + + return 0; +} + static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) { struct vport *vport; @@ -461,6 +496,10 @@ static int execute_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_UDP: err = set_udp(skb, nla_data(nested_attr)); break; + + case OVS_KEY_ATTR_SCTP: + err = set_sctp(skb, nla_data(nested_attr)); + break; } return err; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d29cd9aa4a6..2aa13bd7f2b 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -712,6 +712,12 @@ static int validate_set(const struct nlattr *a, return validate_tp_port(flow_key); + case OVS_KEY_ATTR_SCTP: + if (flow_key->ip.proto != IPPROTO_SCTP) + return -EINVAL; + + return validate_tp_port(flow_key); + default: return -EINVAL; } diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 1fceb965359..2b4785590b5 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -129,6 +130,7 @@ static bool ovs_match_validate(const struct sw_flow_match *match, | (1 << OVS_KEY_ATTR_IPV6) | (1 << OVS_KEY_ATTR_TCP) | (1 << OVS_KEY_ATTR_UDP) + | (1 << OVS_KEY_ATTR_SCTP) | (1 << OVS_KEY_ATTR_ICMP) | (1 << OVS_KEY_ATTR_ICMPV6) | (1 << OVS_KEY_ATTR_ARP) @@ -159,6 +161,12 @@ static bool ovs_match_validate(const struct sw_flow_match *match, mask_allowed |= 1 << OVS_KEY_ATTR_UDP; } + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + if (match->key->ip.proto == IPPROTO_TCP) { key_expected |= 1 << OVS_KEY_ATTR_TCP; if (match->mask && (match->mask->key.ip.proto == 0xff)) @@ -185,6 +193,12 @@ static bool ovs_match_validate(const struct sw_flow_match *match, mask_allowed |= 1 << OVS_KEY_ATTR_UDP; } + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + if (match->key->ip.proto == IPPROTO_TCP) { key_expected |= 1 << OVS_KEY_ATTR_TCP; if (match->mask && (match->mask->key.ip.proto == 0xff)) @@ -280,6 +294,12 @@ static bool udphdr_ok(struct sk_buff *skb) sizeof(struct udphdr)); } +static bool sctphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct sctphdr)); +} + static bool icmphdr_ok(struct sk_buff *skb) { return pskb_may_pull(skb, skb_transport_offset(skb) + @@ -891,6 +911,12 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) key->ipv4.tp.src = udp->source; key->ipv4.tp.dst = udp->dest; } + } else if (key->ip.proto == IPPROTO_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->ipv4.tp.src = sctp->source; + key->ipv4.tp.dst = sctp->dest; + } } else if (key->ip.proto == IPPROTO_ICMP) { if (icmphdr_ok(skb)) { struct icmphdr *icmp = icmp_hdr(skb); @@ -953,6 +979,12 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) key->ipv6.tp.src = udp->source; key->ipv6.tp.dst = udp->dest; } + } else if (key->ip.proto == NEXTHDR_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->ipv6.tp.src = sctp->source; + key->ipv6.tp.dst = sctp->dest; + } } else if (key->ip.proto == NEXTHDR_ICMP) { if (icmp6hdr_ok(skb)) { error = parse_icmpv6(skb, key, nh_len); @@ -1087,6 +1119,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), + [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp), [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), @@ -1500,6 +1533,24 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, attrs &= ~(1 << OVS_KEY_ATTR_UDP); } + if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { + const struct ovs_key_sctp *sctp_key; + + sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + sctp_key->sctp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + sctp_key->sctp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_SCTP); + } + if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { const struct ovs_key_icmp *icmp_key; @@ -1843,6 +1894,20 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, udp_key->udp_src = output->ipv6.tp.src; udp_key->udp_dst = output->ipv6.tp.dst; } + } else if (swkey->ip.proto == IPPROTO_SCTP) { + struct ovs_key_sctp *sctp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); + if (!nla) + goto nla_put_failure; + sctp_key = nla_data(nla); + if (swkey->eth.type == htons(ETH_P_IP)) { + sctp_key->sctp_src = swkey->ipv4.tp.src; + sctp_key->sctp_dst = swkey->ipv4.tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + sctp_key->sctp_src = swkey->ipv6.tp.src; + sctp_key->sctp_dst = swkey->ipv6.tp.dst; + } } else if (swkey->eth.type == htons(ETH_P_IP) && swkey->ip.proto == IPPROTO_ICMP) { struct ovs_key_icmp *icmp_key; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 9674e45f696..d08dcf78dbf 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -99,8 +99,8 @@ struct sw_flow_key { } addr; union { struct { - __be16 src; /* TCP/UDP source port. */ - __be16 dst; /* TCP/UDP destination port. */ + __be16 src; /* TCP/UDP/SCTP source port. */ + __be16 dst; /* TCP/UDP/SCTP destination port. */ } tp; struct { u8 sha[ETH_ALEN]; /* ARP source hardware address. */ @@ -115,8 +115,8 @@ struct sw_flow_key { } addr; __be32 label; /* IPv6 flow label. */ struct { - __be16 src; /* TCP/UDP source port. */ - __be16 dst; /* TCP/UDP destination port. */ + __be16 src; /* TCP/UDP/SCTP source port. */ + __be16 dst; /* TCP/UDP/SCTP destination port. */ } tp; struct { struct in6_addr target; /* ND target address. */ -- cgit v1.2.3-70-g09d2 From 41d73ec053d2424599c4ed8452b889374d523ade Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 27 Aug 2013 08:50:12 +0200 Subject: netfilter: nf_conntrack: make sequence number adjustments usuable without NAT Split out sequence number adjustments from NAT and move them to the conntrack core to make them usable for SYN proxying. The sequence number adjustment information is moved to a seperate extend. The extend is added to new conntracks when a NAT mapping is set up for a connection using a helper. As a side effect, this saves 24 bytes per connection with NAT in the common case that a connection does not have a helper assigned. Signed-off-by: Patrick McHardy Tested-by: Martin Topholm Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 9 +- include/net/netfilter/nf_conntrack_extend.h | 2 + include/net/netfilter/nf_conntrack_seqadj.h | 49 +++++ include/net/netfilter/nf_nat.h | 10 - include/net/netfilter/nf_nat_helper.h | 19 -- include/uapi/linux/netfilter/nf_conntrack_common.h | 3 +- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 15 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 7 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 7 +- net/netfilter/Makefile | 2 +- net/netfilter/nf_conntrack_core.c | 16 +- net/netfilter/nf_conntrack_netlink.c | 115 +++++------ net/netfilter/nf_conntrack_proto_tcp.c | 18 +- net/netfilter/nf_conntrack_seqadj.c | 218 ++++++++++++++++++++ net/netfilter/nf_nat_core.c | 16 +- net/netfilter/nf_nat_helper.c | 228 +-------------------- net/netfilter/nf_nat_sip.c | 3 +- net/netfilter/nfnetlink_queue_ct.c | 8 +- 18 files changed, 369 insertions(+), 376 deletions(-) create mode 100644 include/net/netfilter/nf_conntrack_seqadj.h create mode 100644 net/netfilter/nf_conntrack_seqadj.c (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index e2cf786be22..708fe72ab91 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -319,6 +319,7 @@ extern void nf_ct_attach(struct sk_buff *, const struct sk_buff *); extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu; struct nf_conn; +enum ip_conntrack_info; struct nlattr; struct nfq_ct_hook { @@ -327,14 +328,10 @@ struct nfq_ct_hook { int (*parse)(const struct nlattr *attr, struct nf_conn *ct); int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report); -}; -extern struct nfq_ct_hook __rcu *nfq_ct_hook; - -struct nfq_ct_nat_hook { void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, - u32 ctinfo, s32 off); + enum ip_conntrack_info ctinfo, s32 off); }; -extern struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook; +extern struct nfq_ct_hook __rcu *nfq_ct_hook; #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 977bc8a4644..2a22bcbfe6e 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -9,6 +9,7 @@ enum nf_ct_ext_id { NF_CT_EXT_HELPER, #if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE) NF_CT_EXT_NAT, + NF_CT_EXT_SEQADJ, #endif NF_CT_EXT_ACCT, #ifdef CONFIG_NF_CONNTRACK_EVENTS @@ -31,6 +32,7 @@ enum nf_ct_ext_id { #define NF_CT_EXT_HELPER_TYPE struct nf_conn_help #define NF_CT_EXT_NAT_TYPE struct nf_conn_nat +#define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache #define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h new file mode 100644 index 00000000000..30bfbbed9f4 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_seqadj.h @@ -0,0 +1,49 @@ +#ifndef _NF_CONNTRACK_SEQADJ_H +#define _NF_CONNTRACK_SEQADJ_H + +#include + +/** + * struct nf_ct_seqadj - sequence number adjustment information + * + * @correction_pos: position of the last TCP sequence number modification + * @offset_before: sequence number offset before last modification + * @offset_after: sequence number offset after last modification + */ +struct nf_ct_seqadj { + u32 correction_pos; + s32 offset_before; + s32 offset_after; +}; + +struct nf_conn_seqadj { + struct nf_ct_seqadj seq[IP_CT_DIR_MAX]; +}; + +static inline struct nf_conn_seqadj *nfct_seqadj(const struct nf_conn *ct) +{ + return nf_ct_ext_find(ct, NF_CT_EXT_SEQADJ); +} + +static inline struct nf_conn_seqadj *nfct_seqadj_ext_add(struct nf_conn *ct) +{ + return nf_ct_ext_add(ct, NF_CT_EXT_SEQADJ, GFP_ATOMIC); +} + +extern int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + __be32 seq, s32 off); +extern void nf_ct_tcp_seqadj_set(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + s32 off); + +extern int nf_ct_seq_adjust(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff); +extern s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir, + u32 seq); + +extern int nf_conntrack_seqadj_init(void); +extern void nf_conntrack_seqadj_fini(void); + +#endif /* _NF_CONNTRACK_SEQADJ_H */ diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index e2441413675..59a19242005 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -13,15 +13,6 @@ enum nf_nat_manip_type { #define HOOK2MANIP(hooknum) ((hooknum) != NF_INET_POST_ROUTING && \ (hooknum) != NF_INET_LOCAL_IN) -/* NAT sequence number modifications */ -struct nf_nat_seq { - /* position of the last TCP sequence number modification (if any) */ - u_int32_t correction_pos; - - /* sequence number offset before and after last modification */ - int32_t offset_before, offset_after; -}; - #include #include #include @@ -39,7 +30,6 @@ struct nf_conn; /* The structure embedded in the conntrack structure. */ struct nf_conn_nat { struct hlist_node bysource; - struct nf_nat_seq seq[IP_CT_DIR_MAX]; struct nf_conn *ct; union nf_conntrack_nat_help help; #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h index 194c3479492..404324d1d0c 100644 --- a/include/net/netfilter/nf_nat_helper.h +++ b/include/net/netfilter/nf_nat_helper.h @@ -39,28 +39,9 @@ extern int nf_nat_mangle_udp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len); -extern void nf_nat_set_seq_adjust(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - __be32 seq, s32 off); -extern int nf_nat_seq_adjust(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff); -extern int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff); - /* Setup NAT on this expected conntrack so it follows master, but goes * to port ct->master->saved_proto. */ extern void nf_nat_follow_master(struct nf_conn *ct, struct nf_conntrack_expect *this); -extern s32 nf_nat_get_offset(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq); - -extern void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, - u32 dir, s32 off); - #endif diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index d69483fb382..8dd803818eb 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -99,7 +99,8 @@ enum ip_conntrack_events { IPCT_PROTOINFO, /* protocol information has changed */ IPCT_HELPER, /* new helper has been set */ IPCT_MARK, /* new mark has been set */ - IPCT_NATSEQADJ, /* NAT is doing sequence adjustment */ + IPCT_SEQADJ, /* sequence adjustment has changed */ + IPCT_NATSEQADJ = IPCT_SEQADJ, IPCT_SECMARK, /* new security mark has been set */ IPCT_LABEL, /* new connlabel has been set */ }; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 08fabc6c93f..acad6c52a65 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -42,8 +42,10 @@ enum ctattr_type { CTA_ID, CTA_NAT_DST, CTA_TUPLE_MASTER, - CTA_NAT_SEQ_ADJ_ORIG, - CTA_NAT_SEQ_ADJ_REPLY, + CTA_SEQ_ADJ_ORIG, + CTA_NAT_SEQ_ADJ_ORIG = CTA_SEQ_ADJ_ORIG, + CTA_SEQ_ADJ_REPLY, + CTA_NAT_SEQ_ADJ_REPLY = CTA_SEQ_ADJ_REPLY, CTA_SECMARK, /* obsolete */ CTA_ZONE, CTA_SECCTX, @@ -165,6 +167,15 @@ enum ctattr_protonat { }; #define CTA_PROTONAT_MAX (__CTA_PROTONAT_MAX - 1) +enum ctattr_seqadj { + CTA_SEQADJ_UNSPEC, + CTA_SEQADJ_CORRECTION_POS, + CTA_SEQADJ_OFFSET_BEFORE, + CTA_SEQADJ_OFFSET_AFTER, + __CTA_SEQADJ_MAX +}; +#define CTA_SEQADJ_MAX (__CTA_SEQADJ_MAX - 1) + enum ctattr_natseq { CTA_NAT_SEQ_UNSPEC, CTA_NAT_SEQ_CORRECTION_POS, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 0a2e0e3e95b..86f5b34a4ed 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -136,11 +137,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum, /* adjust seqs for loopback traffic only in outgoing direction */ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { - typeof(nf_nat_seq_adjust_hook) seq_adjust; - - seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); - if (!seq_adjust || - !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return NF_DROP; } diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index c9b6a6e6a1e..d6e4dd8b58d 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -158,11 +159,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum, /* adjust seqs for loopback traffic only in outgoing direction */ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { - typeof(nf_nat_seq_adjust_hook) seq_adjust; - - seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); - if (!seq_adjust || - !seq_adjust(skb, ct, ctinfo, protoff)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return NF_DROP; } diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index ebfa7dc747c..89a9c1658f5 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,6 +1,6 @@ netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o -nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index da6f1787a10..00a7a94d413 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -1326,6 +1327,7 @@ void nf_conntrack_cleanup_end(void) nf_ct_extend_unregister(&nf_ct_zone_extend); #endif nf_conntrack_proto_fini(); + nf_conntrack_seqadj_fini(); nf_conntrack_labels_fini(); nf_conntrack_helper_fini(); nf_conntrack_timeout_fini(); @@ -1531,6 +1533,10 @@ int nf_conntrack_init_start(void) if (ret < 0) goto err_labels; + ret = nf_conntrack_seqadj_init(); + if (ret < 0) + goto err_seqadj; + #ifdef CONFIG_NF_CONNTRACK_ZONES ret = nf_ct_extend_register(&nf_ct_zone_extend); if (ret < 0) @@ -1555,6 +1561,8 @@ err_proto: nf_ct_extend_unregister(&nf_ct_zone_extend); err_extend: #endif + nf_conntrack_seqadj_fini(); +err_seqadj: nf_conntrack_labels_fini(); err_labels: nf_conntrack_helper_fini(); @@ -1577,9 +1585,6 @@ void nf_conntrack_init_end(void) /* For use by REJECT target */ RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack); - - /* Howto get NAT offsets */ - RCU_INIT_POINTER(nf_ct_nat_offset, NULL); } /* @@ -1666,8 +1671,3 @@ err_slabname: err_stat: return ret; } - -s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq); -EXPORT_SYMBOL_GPL(nf_ct_nat_offset); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index fa61fea6323..7c55745ecec 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -381,9 +382,8 @@ nla_put_failure: return -1; } -#ifdef CONFIG_NF_NAT_NEEDED static int -dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) +dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type) { struct nlattr *nest_parms; @@ -391,12 +391,12 @@ dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) if (!nest_parms) goto nla_put_failure; - if (nla_put_be32(skb, CTA_NAT_SEQ_CORRECTION_POS, - htonl(natseq->correction_pos)) || - nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_BEFORE, - htonl(natseq->offset_before)) || - nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_AFTER, - htonl(natseq->offset_after))) + if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS, + htonl(seq->correction_pos)) || + nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE, + htonl(seq->offset_before)) || + nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER, + htonl(seq->offset_after))) goto nla_put_failure; nla_nest_end(skb, nest_parms); @@ -408,27 +408,24 @@ nla_put_failure: } static inline int -ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) { - struct nf_nat_seq *natseq; - struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *seq; - if (!(ct->status & IPS_SEQ_ADJUST) || !nat) + if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) return 0; - natseq = &nat->seq[IP_CT_DIR_ORIGINAL]; - if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1) + seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; + if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) return -1; - natseq = &nat->seq[IP_CT_DIR_REPLY]; - if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1) + seq = &seqadj->seq[IP_CT_DIR_REPLY]; + if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) return -1; return 0; } -#else -#define ctnetlink_dump_nat_seq_adj(a, b) (0) -#endif static inline int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) @@ -502,7 +499,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -707,8 +704,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) ctnetlink_dump_master(skb, ct) < 0) goto nla_put_failure; - if (events & (1 << IPCT_NATSEQADJ) && - ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + if (events & (1 << IPCT_SEQADJ) && + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; } @@ -1439,66 +1436,65 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[] return err; } -#ifdef CONFIG_NF_NAT_NEEDED -static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = { - [CTA_NAT_SEQ_CORRECTION_POS] = { .type = NLA_U32 }, - [CTA_NAT_SEQ_OFFSET_BEFORE] = { .type = NLA_U32 }, - [CTA_NAT_SEQ_OFFSET_AFTER] = { .type = NLA_U32 }, +static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = { + [CTA_SEQADJ_CORRECTION_POS] = { .type = NLA_U32 }, + [CTA_SEQADJ_OFFSET_BEFORE] = { .type = NLA_U32 }, + [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 }, }; static inline int -change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr) +change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr) { int err; - struct nlattr *cda[CTA_NAT_SEQ_MAX+1]; + struct nlattr *cda[CTA_SEQADJ_MAX+1]; - err = nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy); + err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy); if (err < 0) return err; - if (!cda[CTA_NAT_SEQ_CORRECTION_POS]) + if (!cda[CTA_SEQADJ_CORRECTION_POS]) return -EINVAL; - natseq->correction_pos = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS])); + seq->correction_pos = + ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS])); - if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE]) + if (!cda[CTA_SEQADJ_OFFSET_BEFORE]) return -EINVAL; - natseq->offset_before = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE])); + seq->offset_before = + ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE])); - if (!cda[CTA_NAT_SEQ_OFFSET_AFTER]) + if (!cda[CTA_SEQADJ_OFFSET_AFTER]) return -EINVAL; - natseq->offset_after = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER])); + seq->offset_after = + ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER])); return 0; } static int -ctnetlink_change_nat_seq_adj(struct nf_conn *ct, - const struct nlattr * const cda[]) +ctnetlink_change_seq_adj(struct nf_conn *ct, + const struct nlattr * const cda[]) { + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); int ret = 0; - struct nf_conn_nat *nat = nfct_nat(ct); - if (!nat) + if (!seqadj) return 0; - if (cda[CTA_NAT_SEQ_ADJ_ORIG]) { - ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL], - cda[CTA_NAT_SEQ_ADJ_ORIG]); + if (cda[CTA_SEQ_ADJ_ORIG]) { + ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], + cda[CTA_SEQ_ADJ_ORIG]); if (ret < 0) return ret; ct->status |= IPS_SEQ_ADJUST; } - if (cda[CTA_NAT_SEQ_ADJ_REPLY]) { - ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY], - cda[CTA_NAT_SEQ_ADJ_REPLY]); + if (cda[CTA_SEQ_ADJ_REPLY]) { + ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], + cda[CTA_SEQ_ADJ_REPLY]); if (ret < 0) return ret; @@ -1507,7 +1503,6 @@ ctnetlink_change_nat_seq_adj(struct nf_conn *ct, return 0; } -#endif static int ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) @@ -1573,13 +1568,12 @@ ctnetlink_change_conntrack(struct nf_conn *ct, ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); #endif -#ifdef CONFIG_NF_NAT_NEEDED - if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { - err = ctnetlink_change_nat_seq_adj(ct, cda); + if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_seq_adj(ct, cda); if (err < 0) return err; } -#endif + if (cda[CTA_LABELS]) { err = ctnetlink_attach_labels(ct, cda); if (err < 0) @@ -1684,13 +1678,11 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, goto err2; } -#ifdef CONFIG_NF_NAT_NEEDED - if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { - err = ctnetlink_change_nat_seq_adj(ct, cda); + if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_seq_adj(ct, cda); if (err < 0) goto err2; } -#endif memset(&ct->proto, 0, sizeof(ct->proto)); if (cda[CTA_PROTOINFO]) { @@ -1804,7 +1796,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | (1 << IPCT_PROTOINFO) | - (1 << IPCT_NATSEQADJ) | + (1 << IPCT_SEQADJ) | (1 << IPCT_MARK) | events, ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); @@ -1827,7 +1819,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_HELPER) | (1 << IPCT_LABEL) | (1 << IPCT_PROTOINFO) | - (1 << IPCT_NATSEQADJ) | + (1 << IPCT_SEQADJ) | (1 << IPCT_MARK), ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); @@ -2082,7 +2074,7 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; if ((ct->status & IPS_SEQ_ADJUST) && - ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_MARK @@ -2211,6 +2203,7 @@ static struct nfq_ct_hook ctnetlink_nfqueue_hook = { .build = ctnetlink_nfqueue_build, .parse = ctnetlink_nfqueue_parse, .attach_expect = ctnetlink_nfqueue_attach_expect, + .seq_adjust = nf_ct_tcp_seqadj_set, }; #endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index d224e001f14..984a8d1a335 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -495,21 +496,6 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } -#ifdef CONFIG_NF_NAT_NEEDED -static inline s32 nat_offset(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq) -{ - typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset); - - return get_offset != NULL ? get_offset(ct, dir, seq) : 0; -} -#define NAT_OFFSET(ct, dir, seq) \ - (nat_offset(ct, dir, seq)) -#else -#define NAT_OFFSET(ct, dir, seq) 0 -#endif - static bool tcp_in_window(const struct nf_conn *ct, struct ip_ct_tcp *state, enum ip_conntrack_dir dir, @@ -540,7 +526,7 @@ static bool tcp_in_window(const struct nf_conn *ct, tcp_sack(skb, dataoff, tcph, &sack); /* Take into account NAT sequence number mangling */ - receiver_offset = NAT_OFFSET(ct, !dir, ack - 1); + receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1); ack -= receiver_offset; sack -= receiver_offset; diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c new file mode 100644 index 00000000000..483eb9ce321 --- /dev/null +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -0,0 +1,218 @@ +#include +#include +#include + +#include +#include +#include + +int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + __be32 seq, s32 off) +{ + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_seqadj *this_way; + + if (off == 0) + return 0; + + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + + spin_lock_bh(&ct->lock); + this_way = &seqadj->seq[dir]; + if (this_way->offset_before == this_way->offset_after || + before(this_way->correction_pos, seq)) { + this_way->correction_pos = seq; + this_way->offset_before = this_way->offset_after; + this_way->offset_after += off; + } + spin_unlock_bh(&ct->lock); + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); + +void nf_ct_tcp_seqadj_set(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off) +{ + const struct tcphdr *th; + + if (nf_ct_protonum(ct) != IPPROTO_TCP) + return; + + th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); + nf_ct_seqadj_set(ct, ctinfo, th->seq, off); +} +EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); + +/* Adjust one found SACK option including checksum correction */ +static void nf_ct_sack_block_adjust(struct sk_buff *skb, + struct tcphdr *tcph, + unsigned int sackoff, + unsigned int sackend, + struct nf_ct_seqadj *seq) +{ + while (sackoff < sackend) { + struct tcp_sack_block_wire *sack; + __be32 new_start_seq, new_end_seq; + + sack = (void *)skb->data + sackoff; + if (after(ntohl(sack->start_seq) - seq->offset_before, + seq->correction_pos)) + new_start_seq = htonl(ntohl(sack->start_seq) - + seq->offset_after); + else + new_start_seq = htonl(ntohl(sack->start_seq) - + seq->offset_before); + + if (after(ntohl(sack->end_seq) - seq->offset_before, + seq->correction_pos)) + new_end_seq = htonl(ntohl(sack->end_seq) - + seq->offset_after); + else + new_end_seq = htonl(ntohl(sack->end_seq) - + seq->offset_before); + + pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", + ntohl(sack->start_seq), new_start_seq, + ntohl(sack->end_seq), new_end_seq); + + inet_proto_csum_replace4(&tcph->check, skb, + sack->start_seq, new_start_seq, 0); + inet_proto_csum_replace4(&tcph->check, skb, + sack->end_seq, new_end_seq, 0); + sack->start_seq = new_start_seq; + sack->end_seq = new_end_seq; + sackoff += sizeof(*sack); + } +} + +/* TCP SACK sequence number adjustment */ +static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *tcph, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dir, optoff, optend; + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + + optoff = protoff + sizeof(struct tcphdr); + optend = protoff + tcph->doff * 4; + + if (!skb_make_writable(skb, optend)) + return 0; + + dir = CTINFO2DIR(ctinfo); + + while (optoff < optend) { + /* Usually: option, length. */ + unsigned char *op = skb->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + /* no partial options */ + if (optoff + 1 == optend || + optoff + op[1] > optend || + op[1] < 2) + return 0; + if (op[0] == TCPOPT_SACK && + op[1] >= 2+TCPOLEN_SACK_PERBLOCK && + ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) + nf_ct_sack_block_adjust(skb, tcph, optoff + 2, + optoff+op[1], + &seqadj->seq[!dir]); + optoff += op[1]; + } + } + return 1; +} + +/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ +int nf_ct_seq_adjust(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct tcphdr *tcph; + __be32 newseq, newack; + s32 seqoff, ackoff; + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *this_way, *other_way; + int res; + + this_way = &seqadj->seq[dir]; + other_way = &seqadj->seq[!dir]; + + if (!skb_make_writable(skb, protoff + sizeof(*tcph))) + return 0; + + tcph = (void *)skb->data + protoff; + spin_lock_bh(&ct->lock); + if (after(ntohl(tcph->seq), this_way->correction_pos)) + seqoff = this_way->offset_after; + else + seqoff = this_way->offset_before; + + if (after(ntohl(tcph->ack_seq) - other_way->offset_before, + other_way->correction_pos)) + ackoff = other_way->offset_after; + else + ackoff = other_way->offset_before; + + newseq = htonl(ntohl(tcph->seq) + seqoff); + newack = htonl(ntohl(tcph->ack_seq) - ackoff); + + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + + pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", + ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), + ntohl(newack)); + + tcph->seq = newseq; + tcph->ack_seq = newack; + + res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo); + spin_unlock_bh(&ct->lock); + + return res; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); + +s32 nf_ct_seq_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *this_way; + + if (!seqadj) + return 0; + + this_way = &seqadj->seq[dir]; + return after(seq, this_way->correction_pos) ? + this_way->offset_after : this_way->offset_before; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_offset); + +static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = { + .len = sizeof(struct nf_conn_seqadj), + .align = __alignof__(struct nf_conn_seqadj), + .id = NF_CT_EXT_SEQADJ, +}; + +int nf_conntrack_seqadj_init(void) +{ + return nf_ct_extend_register(&nf_ct_seqadj_extend); +} + +void nf_conntrack_seqadj_fini(void) +{ + nf_ct_extend_unregister(&nf_ct_seqadj_extend); +} diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 6ff808375b5..6f0f4f7f68a 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -402,6 +403,9 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; + + if (nfct_help(ct)) + nfct_seqadj_ext_add(ct); } if (maniptype == NF_NAT_MANIP_SRC) { @@ -764,10 +768,6 @@ static struct nf_ct_helper_expectfn follow_master_nat = { .expectfn = nf_nat_follow_master, }; -static struct nfq_ct_nat_hook nfq_ct_nat = { - .seq_adjust = nf_nat_tcp_seq_adjust, -}; - static int __init nf_nat_init(void) { int ret; @@ -787,14 +787,9 @@ static int __init nf_nat_init(void) /* Initialize fake conntrack so that NAT will skip it */ nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); - BUG_ON(nf_nat_seq_adjust_hook != NULL); - RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, nfnetlink_parse_nat_setup); - BUG_ON(nf_ct_nat_offset != NULL); - RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); - RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat); #ifdef CONFIG_XFRM BUG_ON(nf_nat_decode_session_hook != NULL); RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session); @@ -813,10 +808,7 @@ static void __exit nf_nat_cleanup(void) unregister_pernet_subsys(&nf_nat_net_ops); nf_ct_extend_unregister(&nat_extend); nf_ct_helper_expectfn_unregister(&follow_master_nat); - RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); - RCU_INIT_POINTER(nf_ct_nat_offset, NULL); - RCU_INIT_POINTER(nfq_ct_nat_hook, NULL); #ifdef CONFIG_XFRM RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL); #endif diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 46b9baa845a..2840abb5bb9 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -20,67 +20,13 @@ #include #include #include +#include #include #include #include #include #include -#define DUMP_OFFSET(x) \ - pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \ - x->offset_before, x->offset_after, x->correction_pos); - -/* Setup TCP sequence correction given this change at this sequence */ -static inline void -adjust_tcp_sequence(u32 seq, - int sizediff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way = &nat->seq[dir]; - - pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", - seq, sizediff); - - pr_debug("adjust_tcp_sequence: Seq_offset before: "); - DUMP_OFFSET(this_way); - - spin_lock_bh(&ct->lock); - - /* SYN adjust. If it's uninitialized, or this is after last - * correction, record it: we don't handle more than one - * adjustment in the window, but do deal with common case of a - * retransmit */ - if (this_way->offset_before == this_way->offset_after || - before(this_way->correction_pos, seq)) { - this_way->correction_pos = seq; - this_way->offset_before = this_way->offset_after; - this_way->offset_after += sizediff; - } - spin_unlock_bh(&ct->lock); - - pr_debug("adjust_tcp_sequence: Seq_offset after: "); - DUMP_OFFSET(this_way); -} - -/* Get the offset value, for conntrack. Caller must have the conntrack locked */ -s32 nf_nat_get_offset(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq) -{ - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way; - - if (!nat) - return 0; - - this_way = &nat->seq[dir]; - return after(seq, this_way->correction_pos) - ? this_way->offset_after : this_way->offset_before; -} - /* Frobs data inside this packet, which is linear. */ static void mangle_contents(struct sk_buff *skb, unsigned int dataoff, @@ -135,30 +81,6 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra) return 1; } -void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - __be32 seq, s32 off) -{ - if (!off) - return; - set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); - adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo); - nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); -} -EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); - -void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, - u32 ctinfo, int off) -{ - const struct tcphdr *th; - - if (nf_ct_protonum(ct) != IPPROTO_TCP) - return; - - th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb)); - nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); -} -EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust); - /* Generic function for mangling variable-length address changes inside * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX * command in FTP). @@ -203,8 +125,8 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, datalen, oldlen); if (adjust && rep_len != match_len) - nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, - (int)rep_len - (int)match_len); + nf_ct_seqadj_set(ct, ctinfo, tcph->seq, + (int)rep_len - (int)match_len); return 1; } @@ -264,150 +186,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, } EXPORT_SYMBOL(nf_nat_mangle_udp_packet); -/* Adjust one found SACK option including checksum correction */ -static void -sack_adjust(struct sk_buff *skb, - struct tcphdr *tcph, - unsigned int sackoff, - unsigned int sackend, - struct nf_nat_seq *natseq) -{ - while (sackoff < sackend) { - struct tcp_sack_block_wire *sack; - __be32 new_start_seq, new_end_seq; - - sack = (void *)skb->data + sackoff; - if (after(ntohl(sack->start_seq) - natseq->offset_before, - natseq->correction_pos)) - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_after); - else - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_before); - - if (after(ntohl(sack->end_seq) - natseq->offset_before, - natseq->correction_pos)) - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_after); - else - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_before); - - pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", - ntohl(sack->start_seq), new_start_seq, - ntohl(sack->end_seq), new_end_seq); - - inet_proto_csum_replace4(&tcph->check, skb, - sack->start_seq, new_start_seq, 0); - inet_proto_csum_replace4(&tcph->check, skb, - sack->end_seq, new_end_seq, 0); - sack->start_seq = new_start_seq; - sack->end_seq = new_end_seq; - sackoff += sizeof(*sack); - } -} - -/* TCP SACK sequence number adjustment */ -static inline unsigned int -nf_nat_sack_adjust(struct sk_buff *skb, - unsigned int protoff, - struct tcphdr *tcph, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - unsigned int dir, optoff, optend; - struct nf_conn_nat *nat = nfct_nat(ct); - - optoff = protoff + sizeof(struct tcphdr); - optend = protoff + tcph->doff * 4; - - if (!skb_make_writable(skb, optend)) - return 0; - - dir = CTINFO2DIR(ctinfo); - - while (optoff < optend) { - /* Usually: option, length. */ - unsigned char *op = skb->data + optoff; - - switch (op[0]) { - case TCPOPT_EOL: - return 1; - case TCPOPT_NOP: - optoff++; - continue; - default: - /* no partial options */ - if (optoff + 1 == optend || - optoff + op[1] > optend || - op[1] < 2) - return 0; - if (op[0] == TCPOPT_SACK && - op[1] >= 2+TCPOLEN_SACK_PERBLOCK && - ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) - sack_adjust(skb, tcph, optoff+2, - optoff+op[1], &nat->seq[!dir]); - optoff += op[1]; - } - } - return 1; -} - -/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ -int -nf_nat_seq_adjust(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff) -{ - struct tcphdr *tcph; - int dir; - __be32 newseq, newack; - s32 seqoff, ackoff; - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way, *other_way; - int res; - - dir = CTINFO2DIR(ctinfo); - - this_way = &nat->seq[dir]; - other_way = &nat->seq[!dir]; - - if (!skb_make_writable(skb, protoff + sizeof(*tcph))) - return 0; - - tcph = (void *)skb->data + protoff; - spin_lock_bh(&ct->lock); - if (after(ntohl(tcph->seq), this_way->correction_pos)) - seqoff = this_way->offset_after; - else - seqoff = this_way->offset_before; - - if (after(ntohl(tcph->ack_seq) - other_way->offset_before, - other_way->correction_pos)) - ackoff = other_way->offset_after; - else - ackoff = other_way->offset_before; - - newseq = htonl(ntohl(tcph->seq) + seqoff); - newack = htonl(ntohl(tcph->ack_seq) - ackoff); - - inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); - inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); - - pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", - ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), - ntohl(newack)); - - tcph->seq = newseq; - tcph->ack_seq = newack; - - res = nf_nat_sack_adjust(skb, protoff, tcph, ct, ctinfo); - spin_unlock_bh(&ct->lock); - - return res; -} - /* Setup NAT on this expected conntrack so it follows master. */ /* If we fail to get a free NAT slot, we'll get dropped on confirm */ void nf_nat_follow_master(struct nf_conn *ct, diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index dac11f73868..f9790405b7f 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -20,6 +20,7 @@ #include #include #include +#include #include MODULE_LICENSE("GPL"); @@ -308,7 +309,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff, return; th = (struct tcphdr *)(skb->data + protoff); - nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); + nf_ct_seqadj_set(ct, ctinfo, th->seq, off); } /* Handles expected signalling connections and media streams */ diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c index be893039966..96cac50e0d1 100644 --- a/net/netfilter/nfnetlink_queue_ct.c +++ b/net/netfilter/nfnetlink_queue_ct.c @@ -87,14 +87,14 @@ nla_put_failure: void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int diff) { - struct nfq_ct_nat_hook *nfq_nat_ct; + struct nfq_ct_hook *nfq_ct; - nfq_nat_ct = rcu_dereference(nfq_ct_nat_hook); - if (nfq_nat_ct == NULL) + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) return; if ((ct->status & IPS_NAT_MASK) && diff) - nfq_nat_ct->seq_adjust(skb, ct, ctinfo, diff); + nfq_ct->seq_adjust(skb, ct, ctinfo, diff); } int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, -- cgit v1.2.3-70-g09d2 From 0198230b7705eb2386e53778d944e307eef0cc71 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 27 Aug 2013 08:50:13 +0200 Subject: net: syncookies: export cookie_v4_init_sequence/cookie_v4_check Extract the local TCP stack independant parts of tcp_v4_init_sequence() and cookie_v4_check() and export them for use by the upcoming SYNPROXY target. Signed-off-by: Patrick McHardy Acked-by: David S. Miller Tested-by: Martin Topholm Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Pablo Neira Ayuso --- include/net/tcp.h | 4 ++++ net/ipv4/syncookies.c | 29 ++++++++++++++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 09cb5c11cee..8e2b63678a8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -476,9 +476,13 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; +extern int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, + u32 cookie); extern struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt); #ifdef CONFIG_SYN_COOKIES +extern u32 __cookie_v4_init_sequence(const struct iphdr *iph, + const struct tcphdr *th, u16 *mssp); extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss); #else diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b05c96e7af8..14a15c49129 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -160,26 +160,33 @@ static __u16 const msstab[] = { * Generate a syncookie. mssp points to the mss, which is returned * rounded down to the value encoded in the cookie. */ -__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, + u16 *mssp) { - const struct iphdr *iph = ip_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); int mssind; const __u16 mss = *mssp; - tcp_synq_overflow(sk); - for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), jiffies / (HZ * 60), mssind); } +EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); + +__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +{ + const struct iphdr *iph = ip_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + + tcp_synq_overflow(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + + return __cookie_v4_init_sequence(iph, th, mssp); +} /* * This (misnamed) value is the age of syncookie which is permitted. @@ -192,10 +199,9 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. */ -static inline int cookie_check(struct sk_buff *skb, __u32 cookie) +int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, + u32 cookie) { - const struct iphdr *iph = ip_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); __u32 seq = ntohl(th->seq) - 1; __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, th->source, th->dest, seq, @@ -204,6 +210,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } +EXPORT_SYMBOL_GPL(__cookie_v4_check); static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -284,7 +291,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, goto out; if (tcp_synq_no_recent_overflow(sk) || - (mss = cookie_check(skb, cookie)) == 0) { + (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } -- cgit v1.2.3-70-g09d2 From 48b1de4c110a7afa4b85862f6c75af817db26fad Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 27 Aug 2013 08:50:14 +0200 Subject: netfilter: add SYNPROXY core/target Add a SYNPROXY for netfilter. The code is split into two parts, the synproxy core with common functions and an address family specific target. The SYNPROXY receives the connection request from the client, responds with a SYN/ACK containing a SYN cookie and announcing a zero window and checks whether the final ACK from the client contains a valid cookie. It then establishes a connection to the original destination and, if successful, sends a window update to the client with the window size announced by the server. Support for timestamps, SACK, window scaling and MSS options can be statically configured as target parameters if the features of the server are known. If timestamps are used, the timestamp value sent back to the client in the SYN/ACK will be different from the real timestamp of the server. In order to now break PAWS, the timestamps are translated in the direction server->client. Signed-off-by: Patrick McHardy Tested-by: Martin Topholm Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_extend.h | 6 +- include/net/netfilter/nf_conntrack_seqadj.h | 2 + include/net/netfilter/nf_conntrack_synproxy.h | 77 +++++ include/uapi/linux/netfilter/xt_SYNPROXY.h | 16 + net/ipv4/netfilter/Kconfig | 13 + net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_SYNPROXY.c | 472 ++++++++++++++++++++++++++ net/netfilter/Kconfig | 3 + net/netfilter/Makefile | 3 + net/netfilter/nf_conntrack_core.c | 6 + net/netfilter/nf_conntrack_proto_tcp.c | 16 + net/netfilter/nf_conntrack_seqadj.c | 20 ++ net/netfilter/nf_synproxy_core.c | 432 +++++++++++++++++++++++ 13 files changed, 1066 insertions(+), 1 deletion(-) create mode 100644 include/net/netfilter/nf_conntrack_synproxy.h create mode 100644 include/uapi/linux/netfilter/xt_SYNPROXY.h create mode 100644 net/ipv4/netfilter/ipt_SYNPROXY.c create mode 100644 net/netfilter/nf_synproxy_core.c (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 2a22bcbfe6e..ff95434e50c 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -9,8 +9,8 @@ enum nf_ct_ext_id { NF_CT_EXT_HELPER, #if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE) NF_CT_EXT_NAT, - NF_CT_EXT_SEQADJ, #endif + NF_CT_EXT_SEQADJ, NF_CT_EXT_ACCT, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_CT_EXT_ECACHE, @@ -26,6 +26,9 @@ enum nf_ct_ext_id { #endif #ifdef CONFIG_NF_CONNTRACK_LABELS NF_CT_EXT_LABELS, +#endif +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + NF_CT_EXT_SYNPROXY, #endif NF_CT_EXT_NUM, }; @@ -39,6 +42,7 @@ enum nf_ct_ext_id { #define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp #define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout #define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels +#define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h index 30bfbbed9f4..f6177a5fe0c 100644 --- a/include/net/netfilter/nf_conntrack_seqadj.h +++ b/include/net/netfilter/nf_conntrack_seqadj.h @@ -30,6 +30,8 @@ static inline struct nf_conn_seqadj *nfct_seqadj_ext_add(struct nf_conn *ct) return nf_ct_ext_add(ct, NF_CT_EXT_SEQADJ, GFP_ATOMIC); } +extern int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off); extern int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, __be32 seq, s32 off); extern void nf_ct_tcp_seqadj_set(struct sk_buff *skb, diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h new file mode 100644 index 00000000000..806f54a290d --- /dev/null +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -0,0 +1,77 @@ +#ifndef _NF_CONNTRACK_SYNPROXY_H +#define _NF_CONNTRACK_SYNPROXY_H + +#include + +struct nf_conn_synproxy { + u32 isn; + u32 its; + u32 tsoff; +}; + +static inline struct nf_conn_synproxy *nfct_synproxy(const struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + return nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY); +#else + return NULL; +#endif +} + +static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + return nf_ct_ext_add(ct, NF_CT_EXT_SYNPROXY, GFP_ATOMIC); +#else + return NULL; +#endif +} + +struct synproxy_stats { + unsigned int syn_received; + unsigned int cookie_invalid; + unsigned int cookie_valid; + unsigned int cookie_retrans; + unsigned int conn_reopened; +}; + +struct synproxy_net { + struct nf_conn *tmpl; + struct synproxy_stats __percpu *stats; +}; + +extern int synproxy_net_id; +static inline struct synproxy_net *synproxy_pernet(struct net *net) +{ + return net_generic(net, synproxy_net_id); +} + +struct synproxy_options { + u8 options; + u8 wscale; + u16 mss; + u32 tsval; + u32 tsecr; +}; + +struct tcphdr; +struct xt_synproxy_info; +extern void synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, + const struct tcphdr *th, + struct synproxy_options *opts); +extern unsigned int synproxy_options_size(const struct synproxy_options *opts); +extern void synproxy_build_options(struct tcphdr *th, + const struct synproxy_options *opts); + +extern void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, + struct synproxy_options *opts); +extern void synproxy_check_timestamp_cookie(struct synproxy_options *opts); + +extern unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *th, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy); + +#endif /* _NF_CONNTRACK_SYNPROXY_H */ diff --git a/include/uapi/linux/netfilter/xt_SYNPROXY.h b/include/uapi/linux/netfilter/xt_SYNPROXY.h new file mode 100644 index 00000000000..2d59fbaa93c --- /dev/null +++ b/include/uapi/linux/netfilter/xt_SYNPROXY.h @@ -0,0 +1,16 @@ +#ifndef _XT_SYNPROXY_H +#define _XT_SYNPROXY_H + +#define XT_SYNPROXY_OPT_MSS 0x01 +#define XT_SYNPROXY_OPT_WSCALE 0x02 +#define XT_SYNPROXY_OPT_SACK_PERM 0x04 +#define XT_SYNPROXY_OPT_TIMESTAMP 0x08 +#define XT_SYNPROXY_OPT_ECN 0x10 + +struct xt_synproxy_info { + __u8 options; + __u8 wscale; + __u16 mss; +}; + +#endif /* _XT_SYNPROXY_H */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4e902801742..1657e39b291 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -110,6 +110,19 @@ config IP_NF_TARGET_REJECT To compile it as a module, choose M here. If unsure, say N. +config IP_NF_TARGET_SYNPROXY + tristate "SYNPROXY target support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY target allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This allows to avoid conntrack and server resource usage + during SYN-flood attacks. + + To compile it as a module, choose M here. If unsure, say N. + config IP_NF_TARGET_ULOG tristate "ULOG target support (obsolete)" default m if NETFILTER_ADVANCED=n diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 007b128eecc..3622b248b6d 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o +obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o # generic ARP tables diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c new file mode 100644 index 00000000000..94371db6aec --- /dev/null +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2013 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static struct iphdr * +synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) +{ + struct iphdr *iph; + + skb_reset_network_header(skb); + iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); + iph->version = 4; + iph->ihl = sizeof(*iph) / 4; + iph->tos = 0; + iph->id = 0; + iph->frag_off = htons(IP_DF); + iph->ttl = sysctl_ip_default_ttl; + iph->protocol = IPPROTO_TCP; + iph->check = 0; + iph->saddr = saddr; + iph->daddr = daddr; + + return iph; +} + +static void +synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct iphdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + skb_dst_set_noref(nskb, skb_dst(skb)); + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(nskb, RTN_UNSPEC)) + goto free_nskb; + + if (nfct) { + nskb->nfct = nfct; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nfct); + } + + ip_local_out(nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +static void +synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_syn(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(const struct synproxy_net *snet, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static bool +synproxy_recv_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + int mss; + + mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + return true; +} + +static unsigned int +synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_synproxy_info *info = par->targinfo; + struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_options opts = {}; + struct tcphdr *th, _th; + + if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) + return NF_DROP; + + th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + synproxy_parse_options(skb, par->thoff, th, &opts); + + if (th->syn && !th->ack) { + /* Initial SYN from client */ + this_cpu_inc(snet->stats->syn_received); + + if (th->ece && th->cwr) + opts.options |= XT_SYNPROXY_OPT_ECN; + + opts.options &= info->options; + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, &opts); + else + opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM | + XT_SYNPROXY_OPT_ECN); + + synproxy_send_client_synack(skb, th, &opts); + } else if (th->ack && !(th->fin || th->rst)) + /* ACK from client */ + synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + + return NF_DROP; +} + +static unsigned int ipv4_synproxy_hook(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + unsigned int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (synproxy == NULL) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + thoff = ip_hdrlen(skb); + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + synproxy_parse_options(skb, thoff, th, &opts); + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. + */ + if (synproxy_recv_client_ack(snet, skb, th, &opts, + ntohl(th->seq) + 1)) + this_cpu_inc(snet->stats->cookie_retrans); + + return NF_DROP; + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + synproxy_parse_options(skb, thoff, th, &opts); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->tsoff = opts.tsval - synproxy->its; + + opts.options &= ~(XT_SYNPROXY_OPT_MSS | + XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(snet, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(snet, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} + +static int synproxy_tg4_check(const struct xt_tgchk_param *par) +{ + const struct ipt_entry *e = par->entryinfo; + + if (e->ip.proto != IPPROTO_TCP || + e->ip.invflags & XT_INV_PROTO) + return -EINVAL; + + return nf_ct_l3proto_try_module_get(par->family); +} + +static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target synproxy_tg4_reg __read_mostly = { + .name = "SYNPROXY", + .family = NFPROTO_IPV4, + .target = synproxy_tg4, + .targetsize = sizeof(struct xt_synproxy_info), + .checkentry = synproxy_tg4_check, + .destroy = synproxy_tg4_destroy, + .me = THIS_MODULE, +}; + +static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { + { + .hook = ipv4_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv4_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +static int __init synproxy_tg4_init(void) +{ + int err; + + err = nf_register_hooks(ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); + if (err < 0) + goto err1; + + err = xt_register_target(&synproxy_tg4_reg); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); +err1: + return err; +} + +static void __exit synproxy_tg4_exit(void) +{ + xt_unregister_target(&synproxy_tg4_reg); + nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); +} + +module_init(synproxy_tg4_init); +module_exit(synproxy_tg4_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index c45fc1a60e0..62a171ab204 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -408,6 +408,9 @@ config NF_NAT_TFTP depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_TFTP +config NETFILTER_SYNPROXY + tristate + endif # NF_CONNTRACK config NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 89a9c1658f5..c3a0a12907f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -61,6 +61,9 @@ obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o +# SYNPROXY +obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o + # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 00a7a94d413..5d892febd64 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -799,6 +800,11 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; + if (tmpl && nfct_synproxy(tmpl)) { + nfct_seqadj_ext_add(ct); + nfct_synproxy_ext_add(ct); + } + timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; if (timeout_ext) timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 984a8d1a335..44d1ea32570 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -946,6 +947,21 @@ static int tcp_packet(struct nf_conn *ct, "state %s ", tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: + /* Special case for SYN proxy: when the SYN to the server or + * the SYN/ACK from the server is lost, the client may transmit + * a keep-alive packet while in SYN_SENT state. This needs to + * be associated with the original conntrack entry in order to + * generate a new SYN with the correct sequence number. + */ + if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT && + index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL && + ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL && + ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) { + pr_debug("nf_ct_tcp: SYN proxy client keep alive\n"); + spin_unlock_bh(&ct->lock); + return NF_ACCEPT; + } + /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index 483eb9ce321..5f9bfd060de 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -6,6 +6,26 @@ #include #include +int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_conn_seqadj *seqadj; + struct nf_ct_seqadj *this_way; + + if (off == 0) + return 0; + + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + + seqadj = nfct_seqadj(ct); + this_way = &seqadj->seq[dir]; + this_way->offset_before = off; + this_way->offset_after = off; + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); + int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, __be32 seq, s32 off) { diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c new file mode 100644 index 00000000000..d23dc791aca --- /dev/null +++ b/net/netfilter/nf_synproxy_core.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2013 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +int synproxy_net_id; +EXPORT_SYMBOL_GPL(synproxy_net_id); + +void +synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, + const struct tcphdr *th, struct synproxy_options *opts) +{ + int length = (th->doff * 4) - sizeof(*th); + u8 buf[40], *ptr; + + ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); + BUG_ON(ptr == NULL); + + opts->options = 0; + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) + return; + if (opsize > length) + return; + + switch (opcode) { + case TCPOPT_MSS: + if (opsize == TCPOLEN_MSS) { + opts->mss = get_unaligned_be16(ptr); + opts->options |= XT_SYNPROXY_OPT_MSS; + } + break; + case TCPOPT_WINDOW: + if (opsize == TCPOLEN_WINDOW) { + opts->wscale = *ptr; + if (opts->wscale > 14) + opts->wscale = 14; + opts->options |= XT_SYNPROXY_OPT_WSCALE; + } + break; + case TCPOPT_TIMESTAMP: + if (opsize == TCPOLEN_TIMESTAMP) { + opts->tsval = get_unaligned_be32(ptr); + opts->tsecr = get_unaligned_be32(ptr + 4); + opts->options |= XT_SYNPROXY_OPT_TIMESTAMP; + } + break; + case TCPOPT_SACK_PERM: + if (opsize == TCPOLEN_SACK_PERM) + opts->options |= XT_SYNPROXY_OPT_SACK_PERM; + break; + } + + ptr += opsize - 2; + length -= opsize; + } + } +} +EXPORT_SYMBOL_GPL(synproxy_parse_options); + +unsigned int synproxy_options_size(const struct synproxy_options *opts) +{ + unsigned int size = 0; + + if (opts->options & XT_SYNPROXY_OPT_MSS) + size += TCPOLEN_MSS_ALIGNED; + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + size += TCPOLEN_TSTAMP_ALIGNED; + else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + size += TCPOLEN_SACKPERM_ALIGNED; + if (opts->options & XT_SYNPROXY_OPT_WSCALE) + size += TCPOLEN_WSCALE_ALIGNED; + + return size; +} +EXPORT_SYMBOL_GPL(synproxy_options_size); + +void +synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) +{ + __be32 *ptr = (__be32 *)(th + 1); + u8 options = opts->options; + + if (options & XT_SYNPROXY_OPT_MSS) + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + + if (options & XT_SYNPROXY_OPT_TIMESTAMP) { + if (options & XT_SYNPROXY_OPT_SACK_PERM) + *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + else + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + + *ptr++ = htonl(opts->tsval); + *ptr++ = htonl(opts->tsecr); + } else if (options & XT_SYNPROXY_OPT_SACK_PERM) + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | + TCPOLEN_SACK_PERM); + + if (options & XT_SYNPROXY_OPT_WSCALE) + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + opts->wscale); +} +EXPORT_SYMBOL_GPL(synproxy_build_options); + +void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, + struct synproxy_options *opts) +{ + opts->tsecr = opts->tsval; + opts->tsval = tcp_time_stamp & ~0x3f; + + if (opts->options & XT_SYNPROXY_OPT_WSCALE) + opts->tsval |= info->wscale; + else + opts->tsval |= 0xf; + + if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + opts->tsval |= 1 << 4; + + if (opts->options & XT_SYNPROXY_OPT_ECN) + opts->tsval |= 1 << 5; +} +EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); + +void synproxy_check_timestamp_cookie(struct synproxy_options *opts) +{ + opts->wscale = opts->tsecr & 0xf; + if (opts->wscale != 0xf) + opts->options |= XT_SYNPROXY_OPT_WSCALE; + + opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0; + + opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0; +} +EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie); + +unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *th, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy) +{ + unsigned int optoff, optend; + u32 *ptr, old; + + if (synproxy->tsoff == 0) + return 1; + + optoff = protoff + sizeof(struct tcphdr); + optend = protoff + th->doff * 4; + + if (!skb_make_writable(skb, optend)) + return 0; + + while (optoff < optend) { + unsigned char *op = skb->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + if (optoff + 1 == optend || + optoff + op[1] > optend || + op[1] < 2) + return 0; + if (op[0] == TCPOPT_TIMESTAMP && + op[1] == TCPOLEN_TIMESTAMP) { + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + ptr = (u32 *)&op[2]; + old = *ptr; + *ptr = htonl(ntohl(*ptr) - + synproxy->tsoff); + } else { + ptr = (u32 *)&op[6]; + old = *ptr; + *ptr = htonl(ntohl(*ptr) + + synproxy->tsoff); + } + inet_proto_csum_replace4(&th->check, skb, + old, *ptr, 0); + return 1; + } + optoff += op[1]; + } + } + return 1; +} +EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust); + +static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = { + .len = sizeof(struct nf_conn_synproxy), + .align = __alignof__(struct nf_conn_synproxy), + .id = NF_CT_EXT_SYNPROXY, +}; + +#ifdef CONFIG_PROC_FS +static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(snet->stats, cpu); + } + + return NULL; +} + +static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(snet->stats, cpu); + } + + return NULL; +} + +static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v) +{ + return; +} + +static int synproxy_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct synproxy_stats *stats = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries\t\tsyn_received\t" + "cookie_invalid\tcookie_valid\t" + "cookie_retrans\tconn_reopened\n"); + return 0; + } + + seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0, + stats->syn_received, + stats->cookie_invalid, + stats->cookie_valid, + stats->cookie_retrans, + stats->conn_reopened); + + return 0; +} + +static const struct seq_operations synproxy_cpu_seq_ops = { + .start = synproxy_cpu_seq_start, + .next = synproxy_cpu_seq_next, + .stop = synproxy_cpu_seq_stop, + .show = synproxy_cpu_seq_show, +}; + +static int synproxy_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &synproxy_cpu_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations synproxy_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = synproxy_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init synproxy_proc_init(struct net *net) +{ + if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat, + &synproxy_cpu_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ + remove_proc_entry("synproxy", net->proc_net_stat); +} +#else +static int __net_init synproxy_proc_init(struct net *net) +{ + return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ + return; +} +#endif /* CONFIG_PROC_FS */ + +static int __net_init synproxy_net_init(struct net *net) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct nf_conntrack_tuple t; + struct nf_conn *ct; + int err = -ENOMEM; + + memset(&t, 0, sizeof(t)); + ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL); + if (IS_ERR(ct)) { + err = PTR_ERR(ct); + goto err1; + } + + __set_bit(IPS_TEMPLATE_BIT, &ct->status); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); + if (!nfct_seqadj_ext_add(ct)) + goto err2; + if (!nfct_synproxy_ext_add(ct)) + goto err2; + + snet->tmpl = ct; + + snet->stats = alloc_percpu(struct synproxy_stats); + if (snet->stats == NULL) + goto err2; + + err = synproxy_proc_init(net); + if (err < 0) + goto err3; + + return 0; + +err3: + free_percpu(snet->stats); +err2: + nf_conntrack_free(ct); +err1: + return err; +} + +static void __net_exit synproxy_net_exit(struct net *net) +{ + struct synproxy_net *snet = synproxy_pernet(net); + + nf_conntrack_free(snet->tmpl); + synproxy_proc_exit(net); + free_percpu(snet->stats); +} + +static struct pernet_operations synproxy_net_ops = { + .init = synproxy_net_init, + .exit = synproxy_net_exit, + .id = &synproxy_net_id, + .size = sizeof(struct synproxy_net), +}; + +static int __init synproxy_core_init(void) +{ + int err; + + err = nf_ct_extend_register(&nf_ct_synproxy_extend); + if (err < 0) + goto err1; + + err = register_pernet_subsys(&synproxy_net_ops); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_ct_extend_unregister(&nf_ct_synproxy_extend); +err1: + return err; +} + +static void __exit synproxy_core_exit(void) +{ + unregister_pernet_subsys(&synproxy_net_ops); + nf_ct_extend_unregister(&nf_ct_synproxy_extend); +} + +module_init(synproxy_core_init); +module_exit(synproxy_core_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); -- cgit v1.2.3-70-g09d2 From 81eb6a1487718a89621a0e0be7fafd0cd7c429a4 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 27 Aug 2013 08:50:15 +0200 Subject: net: syncookies: export cookie_v6_init_sequence/cookie_v6_check Extract the local TCP stack independant parts of tcp_v6_init_sequence() and cookie_v6_check() and export them for use by the upcoming IPv6 SYNPROXY target. Signed-off-by: Patrick McHardy Acked-by: David S. Miller Tested-by: Martin Topholm Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Pablo Neira Ayuso --- include/net/tcp.h | 4 ++++ net/ipv6/syncookies.c | 25 ++++++++++++++++--------- 2 files changed, 20 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8e2b63678a8..dd5e16f66f8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -499,8 +499,12 @@ extern bool cookie_check_timestamp(struct tcp_options_received *opt, struct net *net, bool *ecn_ok); /* From net/ipv6/syncookies.c */ +extern int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, + u32 cookie); extern struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES +extern u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, + const struct tcphdr *th, u16 *mssp); extern __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mss); #else diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index d5dda20bd71..bf63ac8a49b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -112,32 +112,38 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, & COOKIEMASK; } -__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp) +u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, + const struct tcphdr *th, __u16 *mssp) { - const struct ipv6hdr *iph = ipv6_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); int mssind; const __u16 mss = *mssp; - tcp_synq_overflow(sk); - for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, th->dest, ntohl(th->seq), jiffies / (HZ * 60), mssind); } +EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); -static inline int cookie_check(const struct sk_buff *skb, __u32 cookie) +__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); + + tcp_synq_overflow(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + + return __cookie_v6_init_sequence(iph, th, mssp); +} + +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, + __u32 cookie) +{ __u32 seq = ntohl(th->seq) - 1; __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, th->source, th->dest, seq, @@ -145,6 +151,7 @@ static inline int cookie_check(const struct sk_buff *skb, __u32 cookie) return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } +EXPORT_SYMBOL_GPL(__cookie_v6_check); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) { @@ -167,7 +174,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; if (tcp_synq_no_recent_overflow(sk) || - (mss = cookie_check(skb, cookie)) == 0) { + (mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie)) == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } -- cgit v1.2.3-70-g09d2 From 76bfd8984430be5b19727d09f6ab5e1d7a247f8c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 26 Aug 2013 16:34:00 +0200 Subject: net: sctp: reorder sctp_globals to reduce cacheline usage Reduce cacheline usage from 2 to 1 cacheline for sctp_globals structure. By reordering elements, we can close gaps and simply achieve the following: Current situation: /* size: 80, cachelines: 2, members: 10 */ /* sum members: 57, holes: 4, sum holes: 16 */ /* padding: 7 */ /* last cacheline: 16 bytes */ Afterwards: /* size: 64, cachelines: 1, members: 10 */ /* padding: 7 */ Signed-off-by: Daniel Borkmann Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 422db6cc321..2174d8da077 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -113,29 +113,27 @@ struct sctp_hashbucket { /* The SCTP globals structure. */ extern struct sctp_globals { - /* The following variables are implementation specific. */ - - /* Default initialization values to be applied to new associations. */ - __u16 max_instreams; - __u16 max_outstreams; - /* This is a list of groups of functions for each address * family that we support. */ struct list_head address_families; /* This is the hash of all endpoints. */ - int ep_hashsize; struct sctp_hashbucket *ep_hashtable; - /* This is the hash of all associations. */ - int assoc_hashsize; struct sctp_hashbucket *assoc_hashtable; - /* This is the sctp port control hash. */ - int port_hashsize; struct sctp_bind_hashbucket *port_hashtable; + /* Sizes of above hashtables. */ + int ep_hashsize; + int assoc_hashsize; + int port_hashsize; + + /* Default initialization values to be applied to new associations. */ + __u16 max_instreams; + __u16 max_outstreams; + /* Flag to indicate whether computing and verifying checksum * is disabled. */ bool checksum_disable; -- cgit v1.2.3-70-g09d2 From b800c3b966bcf004bd8592293a49ed5cb7ea67a9 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 27 Aug 2013 01:36:51 +0200 Subject: ipv6: drop fragmented ndisc packets by default (RFC 6980) This patch implements RFC6980: Drop fragmented ndisc packets by default. If a fragmented ndisc packet is received the user is informed that it is possible to disable the check. Cc: Fernando Gont Cc: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 6 ++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 17 +++++++++++++++++ 5 files changed, 35 insertions(+) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index debfe857d8f..a2be556032c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1349,6 +1349,12 @@ mldv2_unsolicited_report_interval - INTEGER MLDv2 report retransmit will take place. Default: 1000 (1 second) +suppress_frag_ndisc - INTEGER + Control RFC 6980 (Security Implications of IPv6 Fragmentation + with IPv6 Neighbor Discovery) behavior: + 1 - (default) discard fragmented neighbor discovery packets + 0 - allow fragmented neighbor discovery packets + icmp/*: ratelimit - INTEGER Limit the maximal rates for sending ICMPv6 packets. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 9ac5047062c..28ea3843931 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -50,6 +50,7 @@ struct ipv6_devconf { __s32 accept_dad; __s32 force_tllao; __s32 ndisc_notify; + __s32 suppress_frag_ndisc; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index d07ac6903e5..593b0e32d95 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -162,6 +162,7 @@ enum { DEVCONF_NDISC_NOTIFY, DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL, DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_SUPPRESS_FRAG_NDISC, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2d6d1793bbf..a7183fc9bbc 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -204,6 +204,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, .accept_dad = 1, + .suppress_frag_ndisc = 1, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -241,6 +242,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, .accept_dad = 1, + .suppress_frag_ndisc = 1, }; /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ @@ -4188,6 +4190,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; + array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; } static inline size_t inet6_ifla6_size(void) @@ -5001,6 +5004,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "suppress_frag_ndisc", + .data = &ipv6_devconf.suppress_frag_ndisc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 04d31c2fbef..41720feeaa6 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1519,10 +1519,27 @@ static void pndisc_redo(struct sk_buff *skb) kfree_skb(skb); } +static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) +{ + struct inet6_dev *idev = __in6_dev_get(skb->dev); + + if (!idev) + return true; + if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && + idev->cnf.suppress_frag_ndisc) { + net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n"); + return true; + } + return false; +} + int ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; + if (ndisc_suppress_frag_ndisc(skb)) + return 0; + if (skb_linearize(skb)) return 0; -- cgit v1.2.3-70-g09d2 From 95bd09eb27507691520d39ee1044d6ad831c1168 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Aug 2013 05:46:32 -0700 Subject: tcp: TSO packets automatic sizing After hearing many people over past years complaining against TSO being bursty or even buggy, we are proud to present automatic sizing of TSO packets. One part of the problem is that tcp_tso_should_defer() uses an heuristic relying on upcoming ACKS instead of a timer, but more generally, having big TSO packets makes little sense for low rates, as it tends to create micro bursts on the network, and general consensus is to reduce the buffering amount. This patch introduces a per socket sk_pacing_rate, that approximates the current sending rate, and allows us to size the TSO packets so that we try to send one packet every ms. This field could be set by other transports. Patch has no impact for high speed flows, where having large TSO packets makes sense to reach line rate. For other flows, this helps better packet scheduling and ACK clocking. This patch increases performance of TCP flows in lossy environments. A new sysctl (tcp_min_tso_segs) is added, to specify the minimal size of a TSO packet (default being 2). A follow-up patch will provide a new packet scheduler (FQ), using sk_pacing_rate as an input to perform optional per flow pacing. This explains why we chose to set sk_pacing_rate to twice the current rate, allowing 'slow start' ramp up. sk_pacing_rate = 2 * cwnd * mss / srtt v2: Neal Cardwell reported a suspect deferring of last two segments on initial write of 10 MSS, I had to change tcp_tso_should_defer() to take into account tp->xmit_size_goal_segs Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Van Jacobson Cc: Tom Herbert Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/net/sock.h | 2 ++ include/net/tcp.h | 1 + net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ net/ipv4/tcp.c | 28 +++++++++++++++++++++++----- net/ipv4/tcp_input.c | 32 +++++++++++++++++++++++++++++++- net/ipv4/tcp_output.c | 2 +- 7 files changed, 77 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a2be556032c..1cb3aeb4baf 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -482,6 +482,15 @@ tcp_syn_retries - INTEGER tcp_timestamps - BOOLEAN Enable timestamps as defined in RFC1323. +tcp_min_tso_segs - INTEGER + Minimal number of segments per TSO frame. + Since linux-3.12, TCP does an automatic sizing of TSO frames, + depending on flow rate, instead of filling 64Kbytes packets. + For specific usages, it's possible to force TCP to build big + TSO frames. Note that TCP stack might split too big TSO packets + if available window is too small. + Default: 2 + tcp_tso_win_divisor - INTEGER This allows control over what percentage of the congestion window can be consumed by a single TSO frame. diff --git a/include/net/sock.h b/include/net/sock.h index e4bbcbfd07e..6ba2e7b0e2b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -232,6 +232,7 @@ struct cg_proto; * @sk_napi_id: id of the last napi context to receive data for sk * @sk_ll_usec: usecs to busypoll when there is no data * @sk_allocation: allocation mode + * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) * @sk_sndbuf: size of send buffer in bytes * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings @@ -361,6 +362,7 @@ struct sock { kmemcheck_bitfield_end(flags); int sk_wmem_queued; gfp_t sk_allocation; + u32 sk_pacing_rate; /* bytes per second */ netdev_features_t sk_route_caps; netdev_features_t sk_route_nocaps; int sk_gso_type; diff --git a/include/net/tcp.h b/include/net/tcp.h index dd5e16f66f8..6a6a88db462 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -281,6 +281,7 @@ extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; extern unsigned int sysctl_tcp_notsent_lowat; +extern int sysctl_tcp_min_tso_segs; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 8ed7c32ae28..540279f4c53 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -29,6 +29,7 @@ static int zero; static int one = 1; static int four = 4; +static int gso_max_segs = GSO_MAX_SEGS; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -760,6 +761,15 @@ static struct ctl_table ipv4_table[] = { .extra1 = &zero, .extra2 = &four, }, + { + .procname = "tcp_min_tso_segs", + .data = &sysctl_tcp_min_tso_segs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &gso_max_segs, + }, { .procname = "udp_mem", .data = &sysctl_udp_mem, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e42c03859f..fdf74090a00 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -283,6 +283,8 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; +int sysctl_tcp_min_tso_segs __read_mostly = 2; + struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -785,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, xmit_size_goal = mss_now; if (large_allowed && sk_can_gso(sk)) { - xmit_size_goal = ((sk->sk_gso_max_size - 1) - - inet_csk(sk)->icsk_af_ops->net_header_len - - inet_csk(sk)->icsk_ext_hdr_len - - tp->tcp_header_len); + u32 gso_size, hlen; + + /* Maybe we should/could use sk->sk_prot->max_header here ? */ + hlen = inet_csk(sk)->icsk_af_ops->net_header_len + + inet_csk(sk)->icsk_ext_hdr_len + + tp->tcp_header_len; + + /* Goal is to send at least one packet per ms, + * not one big TSO packet every 100 ms. + * This preserves ACK clocking and is consistent + * with tcp_tso_should_defer() heuristic. + */ + gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); + gso_size = max_t(u32, gso_size, + sysctl_tcp_min_tso_segs * mss_now); + + xmit_size_goal = min_t(u32, gso_size, + sk->sk_gso_max_size - 1 - hlen); - /* TSQ : try to have two TSO segments in flight */ + /* TSQ : try to have at least two segments in flight + * (one in NIC TX ring, another in Qdisc) + */ xmit_size_goal = min_t(u32, xmit_size_goal, sysctl_tcp_limit_output_bytes >> 1); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ec492eae0cd..1a84fffe699 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) } } +/* Set the sk_pacing_rate to allow proper sizing of TSO packets. + * Note: TCP stack does not yet implement pacing. + * FQ packet scheduler can be used to implement cheap but effective + * TCP pacing, to smooth the burst on large writes when packets + * in flight is significantly lower than cwnd (or rwin) + */ +static void tcp_update_pacing_rate(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u64 rate; + + /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ + rate = (u64)tp->mss_cache * 2 * (HZ << 3); + + rate *= max(tp->snd_cwnd, tp->packets_out); + + /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), + * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) + * We probably need usec resolution in the future. + * Note: This also takes care of possible srtt=0 case, + * when tcp_rtt_estimator() was not yet called. + */ + if (tp->srtt > 8 + 2) + do_div(rate, tp->srtt); + + sk->sk_pacing_rate = min_t(u64, rate, ~0U); +} + /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ @@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_in_flight; + u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; u32 prior_fackets; int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; @@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); + if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) + tcp_update_pacing_rate(sk); return 1; no_queue: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 884efff5b53..e63ae4c9691 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1631,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) /* If a full-sized TSO skb can be sent, do it. */ if (limit >= min_t(unsigned int, sk->sk_gso_max_size, - sk->sk_gso_max_segs * tp->mss_cache)) + tp->xmit_size_goal_segs * tp->mss_cache)) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ -- cgit v1.2.3-70-g09d2 From 5d261913ca3daf6c2d21d38924235667b3d07c40 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:05 +0200 Subject: net: add lower_dev_list to net_device and make a full mesh This patch adds lower_dev_list list_head to net_device, which is the same as upper_dev_list, only for lower devices, and begins to use it in the same way as the upper list. It also changes the way the whole adjacent device lists work - now they contain *all* of upper/lower devices, not only the first level. The first level devices are distinguished by the bool neighbour field in netdev_adjacent, also added by this patch. There are cases when a device can be added several times to the adjacent list, the simplest would be: /---- eth0.10 ---\ eth0- --- bond0 \---- eth0.20 ---/ where both bond0 and eth0 'see' each other in the adjacent lists two times. To avoid duplication of netdev_adjacent structures ref_nr is being kept as the number of times the device was added to the list. The 'full view' is achieved by adding, on link creation, all of the upper_dev's upper_dev_list devices as upper devices to all of the lower_dev's lower_dev_list devices (and to the lower_dev itself), and vice versa. On unlink they are removed using the same logic. I've tested it with thousands vlans/bonds/bridges, everything works ok and no observable lags even on a huge number of interfaces. Memory footprint for 128 devices interconnected with each other via both upper and lower (which is impossible, but for the comparison) lists would be: 128*128*2*sizeof(netdev_adjacent) = 1.5MB but in the real world we usualy have at most several devices with slaves and a lot of vlans, so the footprint will be much lower. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 285 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 259 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 077363dcd86..5ccf5b73c37 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1125,6 +1125,7 @@ struct net_device { struct list_head napi_list; struct list_head unreg_list; struct list_head upper_dev_list; /* List of upper devices */ + struct list_head lower_dev_list; /* currently active device features */ diff --git a/net/core/dev.c b/net/core/dev.c index 5072e2c1a07..2aa914eee05 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4369,7 +4369,16 @@ softnet_break: struct netdev_adjacent { struct net_device *dev; + + /* upper master flag, there can only be one master device per list */ bool master; + + /* indicates that this dev is our first-level lower/upper device */ + bool neighbour; + + /* counter for the number of times this device was added to us */ + u16 ref_nr; + struct list_head list; struct rcu_head rcu; struct list_head search_list; @@ -4408,18 +4417,34 @@ static bool __netdev_search_upper_dev(struct net_device *dev, return ret; } -static struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, - struct net_device *upper_dev) +static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, + struct net_device *adj_dev, + bool upper) { - struct netdev_adjacent *upper; + struct netdev_adjacent *adj; + struct list_head *dev_list; - list_for_each_entry(upper, &dev->upper_dev_list, list) { - if (upper->dev == upper_dev) - return upper; + dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list; + + list_for_each_entry(adj, dev_list, list) { + if (adj->dev == adj_dev) + return adj; } return NULL; } +static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_find_adj(dev, udev, true); +} + +static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, + struct net_device *ldev) +{ + return __netdev_find_adj(dev, ldev, false); +} + /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device @@ -4496,10 +4521,149 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + bool neighbour, bool master, + bool upper) +{ + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(dev, adj_dev, upper); + + if (adj) { + BUG_ON(neighbour); + adj->ref_nr++; + return 0; + } + + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; + + adj->dev = adj_dev; + adj->master = master; + adj->neighbour = neighbour; + adj->ref_nr = 1; + INIT_LIST_HEAD(&adj->search_list); + + dev_hold(adj_dev); + pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", + adj_dev->name, upper ? "upper" : "lower", dev->name, + adj_dev->name); + + if (!upper) { + list_add_tail_rcu(&adj->list, &dev->lower_dev_list); + return 0; + } + + /* Ensure that master upper link is always the first item in list. */ + if (master) + list_add_rcu(&adj->list, &dev->upper_dev_list); + else + list_add_tail_rcu(&adj->list, &dev->upper_dev_list); + + return 0; +} + +static inline int __netdev_upper_dev_insert(struct net_device *dev, + struct net_device *udev, + bool master, bool neighbour) +{ + return __netdev_adjacent_dev_insert(dev, udev, neighbour, master, + true); +} + +static inline int __netdev_lower_dev_insert(struct net_device *dev, + struct net_device *ldev, + bool neighbour) +{ + return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false, + false); +} + +void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, bool upper) +{ + struct netdev_adjacent *adj; + + if (upper) + adj = __netdev_find_upper(dev, adj_dev); + else + adj = __netdev_find_lower(dev, adj_dev); + + if (!adj) + BUG(); + + if (adj->ref_nr > 1) { + adj->ref_nr--; + return; + } + + list_del_rcu(&adj->list); + pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", + adj_dev->name, upper ? "upper" : "lower", dev->name, + adj_dev->name); + dev_put(adj_dev); + kfree_rcu(adj, rcu); +} + +static inline void __netdev_upper_dev_remove(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_adjacent_dev_remove(dev, udev, true); +} + +static inline void __netdev_lower_dev_remove(struct net_device *dev, + struct net_device *ldev) +{ + return __netdev_adjacent_dev_remove(dev, ldev, false); +} + +int __netdev_adjacent_dev_insert_link(struct net_device *dev, + struct net_device *upper_dev, + bool master, bool neighbour) +{ + int ret; + + ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour); + if (ret) + return ret; + + ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour); + if (ret) { + __netdev_upper_dev_remove(dev, upper_dev); + return ret; + } + + return 0; +} + +static inline int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *udev) +{ + return __netdev_adjacent_dev_insert_link(dev, udev, false, false); +} + +static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *udev, + bool master) +{ + return __netdev_adjacent_dev_insert_link(dev, udev, master, true); +} + +void __netdev_adjacent_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_upper_dev_remove(dev, upper_dev); + __netdev_lower_dev_remove(upper_dev, dev); +} + + static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master) { - struct netdev_adjacent *upper; + struct netdev_adjacent *i, *j, *to_i, *to_j; + int ret = 0; ASSERT_RTNL(); @@ -4516,22 +4680,76 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - upper = kmalloc(sizeof(*upper), GFP_KERNEL); - if (!upper) - return -ENOMEM; + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master); + if (ret) + return ret; - upper->dev = upper_dev; - upper->master = master; - INIT_LIST_HEAD(&upper->search_list); + /* Now that we linked these devs, make all the upper_dev's + * upper_dev_list visible to every dev's lower_dev_list and vice + * versa, and don't forget the devices itself. All of these + * links are non-neighbours. + */ + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + list_for_each_entry(j, &dev->lower_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, j->dev); + if (ret) + goto rollback_mesh; + } + } + + /* add dev to every upper_dev's upper device */ + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + ret = __netdev_adjacent_dev_link(dev, i->dev); + if (ret) + goto rollback_upper_mesh; + } + + /* add upper_dev to every dev's lower device */ + list_for_each_entry(i, &dev->lower_dev_list, list) { + ret = __netdev_adjacent_dev_link(i->dev, upper_dev); + if (ret) + goto rollback_lower_mesh; + } - /* Ensure that master upper link is always the first item in list. */ - if (master) - list_add_rcu(&upper->list, &dev->upper_dev_list); - else - list_add_tail_rcu(&upper->list, &dev->upper_dev_list); - dev_hold(upper_dev); call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); return 0; + +rollback_lower_mesh: + to_i = i; + list_for_each_entry(i, &dev->lower_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + } + + i = NULL; + +rollback_upper_mesh: + to_i = i; + list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + if (i == to_i) + break; + __netdev_adjacent_dev_unlink(dev, i->dev); + } + + i = j = NULL; + +rollback_mesh: + to_i = i; + to_j = j; + list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + if (i == to_i && j == to_j) + break; + __netdev_adjacent_dev_unlink(i->dev, j->dev); + } + if (i == to_i) + break; + } + + __netdev_adjacent_dev_unlink(dev, upper_dev); + + return ret; } /** @@ -4580,16 +4798,28 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_adjacent *upper; - + struct netdev_adjacent *i, *j; ASSERT_RTNL(); - upper = __netdev_find_upper(dev, upper_dev); - if (!upper) - return; - list_del_rcu(&upper->list); - dev_put(upper_dev); - kfree_rcu(upper, rcu); + __netdev_adjacent_dev_unlink(dev, upper_dev); + + /* Here is the tricky part. We must remove all dev's lower + * devices from all upper_dev's upper devices and vice + * versa, to maintain the graph relationship. + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + list_for_each_entry(j, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, j->dev); + + /* remove also the devices itself from lower/upper device + * list + */ + list_for_each_entry(i, &dev->lower_dev_list, list) + __netdev_adjacent_dev_unlink(i->dev, upper_dev); + + list_for_each_entry(i, &upper_dev->upper_dev_list, list) + __netdev_adjacent_dev_unlink(dev, i->dev); + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -5850,6 +6080,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->upper_dev_list); + INIT_LIST_HEAD(&dev->lower_dev_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); -- cgit v1.2.3-70-g09d2 From 8b5be8561b804edf6b58fc27edbccf1d45863e08 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 28 Aug 2013 23:25:08 +0200 Subject: net: add netdev_for_each_upper_dev_rcu() The new macro netdev_for_each_upper_dev_rcu(dev, upper, iter) iterates through the dev->upper_dev_list starting from the first element, using the netdev_upper_get_next_dev_rcu(dev, &iter). Must be called under RCU read lock. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Alexander Duyck CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5ccf5b73c37..3ad49b833ea 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2768,6 +2768,16 @@ extern int bpf_jit_enable; extern bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); extern bool netdev_has_any_upper_dev(struct net_device *dev); +extern struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter); + +/* iterate through upper list, must be called under RCU read lock */ +#define netdev_for_each_upper_dev_rcu(dev, upper, iter) \ + for (iter = &(dev)->upper_dev_list, \ + upper = netdev_upper_get_next_dev_rcu(dev, &(iter)); \ + upper; \ + upper = netdev_upper_get_next_dev_rcu(dev, &(iter))) + extern struct net_device *netdev_master_upper_dev_get(struct net_device *dev); extern struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); extern int netdev_upper_dev_link(struct net_device *dev, -- cgit v1.2.3-70-g09d2 From 5df0ddfbc9209ffafc82236509ba0e975120e3c3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 28 Aug 2013 22:13:09 +0200 Subject: net: packet: add randomized fanout scheduler We currently allow for different fanout scheduling policies in pf_packet such as scheduling by skb's rxhash, round-robin, by cpu, and rollover. Also allow for a random, equidistributed selection of the socket from the fanout process group. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 1 + net/packet/af_packet.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index b950c02030c..dbf06667394 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -56,6 +56,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_LB 1 #define PACKET_FANOUT_CPU 2 #define PACKET_FANOUT_ROLLOVER 3 +#define PACKET_FANOUT_RND 4 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1fdf9ab91c3..bee9bfdc8d0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -88,7 +88,7 @@ #include #include #include - +#include #ifdef CONFIG_INET #include #endif @@ -1158,6 +1158,13 @@ static unsigned int fanout_demux_cpu(struct packet_fanout *f, return smp_processor_id() % num; } +static unsigned int fanout_demux_rnd(struct packet_fanout *f, + struct sk_buff *skb, + unsigned int num) +{ + return reciprocal_divide(prandom_u32(), num); +} + static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, unsigned int idx, unsigned int skip, @@ -1215,6 +1222,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, case PACKET_FANOUT_CPU: idx = fanout_demux_cpu(f, skb, num); break; + case PACKET_FANOUT_RND: + idx = fanout_demux_rnd(f, skb, num); + break; case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); break; @@ -1284,6 +1294,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: + case PACKET_FANOUT_RND: break; default: return -EINVAL; -- cgit v1.2.3-70-g09d2 From 391ac1282dd7ff1cb8245cccc5262e8e4173edc4 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 26 Aug 2013 15:05:36 +0200 Subject: can: gw: add a per rule limitation of frame hops Usually the received CAN frames can be processed/routed as much as 'max_hops' times (which is given at module load time of the can-gw module). Introduce a new configuration option to reduce the number of possible hops for a specific gateway rule to a value smaller then max_hops. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/gw.h | 9 ++++++++- net/can/gw.c | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/can/gw.h b/include/uapi/linux/can/gw.h index ae07bec74f4..4e27c82b564 100644 --- a/include/uapi/linux/can/gw.h +++ b/include/uapi/linux/can/gw.h @@ -45,6 +45,7 @@ enum { CGW_DST_IF, /* ifindex of destination network interface */ CGW_FILTER, /* specify struct can_filter on source CAN device */ CGW_DELETED, /* number of deleted CAN frames (see max_hops param) */ + CGW_LIM_HOPS, /* limit the number of hops of this specific rule */ __CGW_MAX }; @@ -116,13 +117,19 @@ enum { * Sets a CAN receive filter for the gateway job specified by the * struct can_filter described in include/linux/can.h * - * CGW_MOD_XXX (length 17 bytes): + * CGW_MOD_(AND|OR|XOR|SET) (length 17 bytes): * Specifies a modification that's done to a received CAN frame before it is * send out to the destination interface. * * data used as operator * affected CAN frame elements * + * CGW_LIM_HOPS (length 1 byte): + * Limit the number of hops of this specific rule. Usually the received CAN + * frame can be processed as much as 'max_hops' times (which is given at module + * load time of the can-gw module). This value is used to reduce the number of + * possible hops for this gateway rule to a value smaller then max_hops. + * * CGW_CS_XOR (length 4 bytes): * Set a simple XOR checksum starting with an initial value into * data[result-idx] using data[start-idx] .. data[end-idx] diff --git a/net/can/gw.c b/net/can/gw.c index 2f291f961a1..3f9b0f3a281 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -146,6 +146,7 @@ struct cgw_job { /* tbc */ }; u8 gwtype; + u8 limit_hops; u16 flags; }; @@ -402,6 +403,11 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) /* put the incremented hop counter in the cloned skb */ cgw_hops(nskb) = cgw_hops(skb) + 1; + + /* first processing of this CAN frame -> adjust to private hop limit */ + if (gwj->limit_hops && cgw_hops(nskb) == 1) + cgw_hops(nskb) = max_hops - gwj->limit_hops + 1; + nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ @@ -509,6 +515,11 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, /* check non default settings of attributes */ + if (gwj->limit_hops) { + if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0) + goto cancel; + } + if (gwj->mod.modtype.and) { memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.and; @@ -606,11 +617,12 @@ static const struct nla_policy cgw_policy[CGW_MAX+1] = { [CGW_SRC_IF] = { .type = NLA_U32 }, [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, + [CGW_LIM_HOPS] = { .type = NLA_U8 }, }; /* check for common and gwtype specific attributes */ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, - u8 gwtype, void *gwtypeattr) + u8 gwtype, void *gwtypeattr, u8 *limhops) { struct nlattr *tb[CGW_MAX+1]; struct cgw_frame_mod mb; @@ -625,6 +637,13 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, if (err < 0) return err; + if (tb[CGW_LIM_HOPS]) { + *limhops = nla_get_u8(tb[CGW_LIM_HOPS]); + + if (*limhops < 1 || *limhops > max_hops) + return -EINVAL; + } + /* check for AND/OR/XOR/SET modifications */ if (tb[CGW_MOD_AND]) { @@ -782,6 +801,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) { struct rtcanmsg *r; struct cgw_job *gwj; + u8 limhops = 0; int err = 0; if (!capable(CAP_NET_ADMIN)) @@ -808,7 +828,8 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) gwj->flags = r->flags; gwj->gwtype = r->gwtype; - err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw); + err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw, + &limhops); if (err < 0) goto out; @@ -836,6 +857,8 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (gwj->dst.dev->type != ARPHRD_CAN || gwj->dst.dev->header_ops) goto put_src_dst_out; + gwj->limit_hops = limhops; + ASSERT_RTNL(); err = cgw_register_filter(gwj); @@ -867,13 +890,14 @@ static void cgw_remove_all_jobs(void) } } -static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) +static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) { struct cgw_job *gwj = NULL; struct hlist_node *nx; struct rtcanmsg *r; struct cf_mod mod; struct can_can_gw ccgw; + u8 limhops = 0; int err = 0; if (!capable(CAP_NET_ADMIN)) @@ -890,7 +914,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; - err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw); + err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; @@ -910,6 +934,9 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (gwj->flags != r->flags) continue; + if (gwj->limit_hops != limhops) + continue; + if (memcmp(&gwj->mod, &mod, sizeof(mod))) continue; -- cgit v1.2.3-70-g09d2 From afe4fd062416b158a8a8538b23adc1930a9b88dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Aug 2013 15:49:55 -0700 Subject: pkt_sched: fq: Fair Queue packet scheduler - Uses perfect flow match (not stochastic hash like SFQ/FQ_codel) - Uses the new_flow/old_flow separation from FQ_codel - New flows get an initial credit allowing IW10 without added delay. - Special FIFO queue for high prio packets (no need for PRIO + FQ) - Uses a hash table of RB trees to locate the flows at enqueue() time - Smart on demand gc (at enqueue() time, RB tree lookup evicts old unused flows) - Dynamic memory allocations. - Designed to allow millions of concurrent flows per Qdisc. - Small memory footprint : ~8K per Qdisc, and 104 bytes per flow. - Single high resolution timer for throttled flows (if any). - One RB tree to link throttled flows. - Ability to have a max rate per flow. We might add a socket option to add per socket limitation. Attempts have been made to add TCP pacing in TCP stack, but this seems to add complex code to an already complex stack. TCP pacing is welcomed for flows having idle times, as the cwnd permits TCP stack to queue a possibly large number of packets. This removes the 'slow start after idle' choice, hitting badly large BDP flows, and applications delivering chunks of data as video streams. Nicely spaced packets : Here interface is 10Gbit, but flow bottleneck is ~20Mbit cwin is big, yet FQ avoids the typical bursts generated by TCP (as in netperf TCP_RR -- -r 100000,100000) 15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 15:01:23.545394 IP B > A: . ack 81089 win 3668 15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 15:01:23.546565 IP B > A: . ack 83985 win 3668 15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 15:01:23.547778 IP B > A: . ack 86881 win 3668 15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 15:01:23.548949 IP B > A: . ack 89777 win 3668 15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 15:01:23.550182 IP B > A: . ack 92673 win 3668 15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 15:01:23.551406 IP B > A: . ack 95569 win 3668 15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 15:01:23.552576 IP B > A: . ack 98465 win 3668 15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 15:01:23.554204 IP B > A: . ack 100001 win 3668 15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 TCP timestamps show that most packets from B were queued in the same ms timeframe (TSval 1159799{3,4}), but FQ managed to send them right in time to avoid a big burst. In slow start or steady state, very few packets are throttled [1] FQ gets a bunch of tunables as : limit : max number of packets on whole Qdisc (default 10000) flow_limit : max number of packets per flow (default 100) quantum : the credit per RR round (default is 2 MTU) initial_quantum : initial credit for new flows (default is 10 MTU) maxrate : max per flow rate (default : unlimited) buckets : number of RB trees (default : 1024) in hash table. (consumes 8 bytes per bucket) [no]pacing : disable/enable pacing (default is enable) All of them can be changed on a live qdisc. $ tc qd add dev eth0 root fq help Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ] [ quantum BYTES ] [ initial_quantum BYTES ] [ maxrate RATE ] [ buckets NUMBER ] [ [no]pacing ] $ tc -s -d qd qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140 Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14) backlog 0b 0p requeues 14 511 flows, 511 inactive, 0 throttled 110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit [1] Except if initial srtt is overestimated, as if using cached srtt in tcp metrics. We'll provide a fix for this issue. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 41 +++ net/sched/Kconfig | 14 + net/sched/Makefile | 1 + net/sched/sch_fq.c | 792 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 848 insertions(+) create mode 100644 net/sched/sch_fq.c (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 09d62b9228f..9b829134d42 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -744,4 +744,45 @@ struct tc_fq_codel_xstats { }; }; +/* FQ */ + +enum { + TCA_FQ_UNSPEC, + + TCA_FQ_PLIMIT, /* limit of total number of packets in queue */ + + TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */ + + TCA_FQ_QUANTUM, /* RR quantum */ + + TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */ + + TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */ + + TCA_FQ_FLOW_DEFAULT_RATE,/* for sockets with unspecified sk_rate, + * use the following rate + */ + + TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */ + + TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */ + __TCA_FQ_MAX +}; + +#define TCA_FQ_MAX (__TCA_FQ_MAX - 1) + +struct tc_fq_qd_stats { + __u64 gc_flows; + __u64 highprio_packets; + __u64 tcp_retrans; + __u64 throttled; + __u64 flows_plimit; + __u64 pkts_too_long; + __u64 allocation_errors; + __s64 time_next_delayed_flow; + __u32 flows; + __u32 inactive_flows; + __u32 throttled_flows; + __u32 pad; +}; #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 235e01acac5..c03a32a0418 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -272,6 +272,20 @@ config NET_SCH_FQ_CODEL If unsure, say N. +config NET_SCH_FQ + tristate "Fair Queue" + help + Say Y here if you want to use the FQ packet scheduling algorithm. + + FQ does flow separation, and is able to respect pacing requirements + set by TCP stack into sk->sk_pacing_rate (for localy generated + traffic) + + To compile this driver as a module, choose M here: the module + will be called sch_fq. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index 978cbf004e8..e5f9abe9a5d 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o +obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c new file mode 100644 index 00000000000..91ceca7bcb5 --- /dev/null +++ b/net/sched/sch_fq.c @@ -0,0 +1,792 @@ +/* + * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) + * + * Copyright (C) 2013 Eric Dumazet + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Meant to be mostly used for localy generated traffic : + * Fast classification depends on skb->sk being set before reaching us. + * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. + * All packets belonging to a socket are considered as a 'flow'. + * + * Flows are dynamically allocated and stored in a hash table of RB trees + * They are also part of one Round Robin 'queues' (new or old flows) + * + * Burst avoidance (aka pacing) capability : + * + * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a + * bunch of packets, and this packet scheduler adds delay between + * packets to respect rate limitation. + * + * enqueue() : + * - lookup one RB tree (out of 1024 or more) to find the flow. + * If non existent flow, create it, add it to the tree. + * Add skb to the per flow list of skb (fifo). + * - Use a special fifo for high prio packets + * + * dequeue() : serves flows in Round Robin + * Note : When a flow becomes empty, we do not immediately remove it from + * rb trees, for performance reasons (its expected to send additional packets, + * or SLAB cache will reuse socket for another flow) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Per flow structure, dynamically allocated + */ +struct fq_flow { + struct sk_buff *head; /* list of skbs for this flow : first skb */ + union { + struct sk_buff *tail; /* last skb in the list */ + unsigned long age; /* jiffies when flow was emptied, for gc */ + }; + struct rb_node fq_node; /* anchor in fq_root[] trees */ + struct sock *sk; + int qlen; /* number of packets in flow queue */ + int credit; + u32 socket_hash; /* sk_hash */ + struct fq_flow *next; /* next pointer in RR lists, or &detached */ + + struct rb_node rate_node; /* anchor in q->delayed tree */ + u64 time_next_packet; +}; + +struct fq_flow_head { + struct fq_flow *first; + struct fq_flow *last; +}; + +struct fq_sched_data { + struct fq_flow_head new_flows; + + struct fq_flow_head old_flows; + + struct rb_root delayed; /* for rate limited flows */ + u64 time_next_delayed_flow; + + struct fq_flow internal; /* for non classified or high prio packets */ + u32 quantum; + u32 initial_quantum; + u32 flow_default_rate;/* rate per flow : bytes per second */ + u32 flow_max_rate; /* optional max rate per flow */ + u32 flow_plimit; /* max packets per flow */ + struct rb_root *fq_root; + u8 rate_enable; + u8 fq_trees_log; + + u32 flows; + u32 inactive_flows; + u32 throttled_flows; + + u64 stat_gc_flows; + u64 stat_internal_packets; + u64 stat_tcp_retrans; + u64 stat_throttled; + u64 stat_flows_plimit; + u64 stat_pkts_too_long; + u64 stat_allocation_errors; + struct qdisc_watchdog watchdog; +}; + +/* special value to mark a detached flow (not on old/new list) */ +static struct fq_flow detached, throttled; + +static void fq_flow_set_detached(struct fq_flow *f) +{ + f->next = &detached; +} + +static bool fq_flow_is_detached(const struct fq_flow *f) +{ + return f->next == &detached; +} + +static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) +{ + struct rb_node **p = &q->delayed.rb_node, *parent = NULL; + + while (*p) { + struct fq_flow *aux; + + parent = *p; + aux = container_of(parent, struct fq_flow, rate_node); + if (f->time_next_packet >= aux->time_next_packet) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&f->rate_node, parent, p); + rb_insert_color(&f->rate_node, &q->delayed); + q->throttled_flows++; + q->stat_throttled++; + + f->next = &throttled; + if (q->time_next_delayed_flow > f->time_next_packet) + q->time_next_delayed_flow = f->time_next_packet; +} + + +static struct kmem_cache *fq_flow_cachep __read_mostly; + +static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) +{ + if (head->first) + head->last->next = flow; + else + head->first = flow; + head->last = flow; + flow->next = NULL; +} + +/* limit number of collected flows per round */ +#define FQ_GC_MAX 8 +#define FQ_GC_AGE (3*HZ) + +static bool fq_gc_candidate(const struct fq_flow *f) +{ + return fq_flow_is_detached(f) && + time_after(jiffies, f->age + FQ_GC_AGE); +} + +static void fq_gc(struct fq_sched_data *q, + struct rb_root *root, + struct sock *sk) +{ + struct fq_flow *f, *tofree[FQ_GC_MAX]; + struct rb_node **p, *parent; + int fcnt = 0; + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) + break; + + if (fq_gc_candidate(f)) { + tofree[fcnt++] = f; + if (fcnt == FQ_GC_MAX) + break; + } + + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; + while (fcnt) { + struct fq_flow *f = tofree[--fcnt]; + + rb_erase(&f->fq_node, root); + kmem_cache_free(fq_flow_cachep, f); + } +} + +static const u8 prio2band[TC_PRIO_MAX + 1] = { + 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 +}; + +static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) +{ + struct rb_node **p, *parent; + struct sock *sk = skb->sk; + struct rb_root *root; + struct fq_flow *f; + int band; + + /* warning: no starvation prevention... */ + band = prio2band[skb->priority & TC_PRIO_MAX]; + if (unlikely(band == 0)) + return &q->internal; + + if (unlikely(!sk)) { + /* By forcing low order bit to 1, we make sure to not + * collide with a local flow (socket pointers are word aligned) + */ + sk = (struct sock *)(skb_get_rxhash(skb) | 1L); + } + + root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; + + if (q->flows >= (2U << q->fq_trees_log) && + q->inactive_flows > q->flows/2) + fq_gc(q, root, sk); + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) { + /* socket might have been reallocated, so check + * if its sk_hash is the same. + * It not, we need to refill credit with + * initial quantum + */ + if (unlikely(skb->sk && + f->socket_hash != sk->sk_hash)) { + f->credit = q->initial_quantum; + f->socket_hash = sk->sk_hash; + } + return f; + } + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!f)) { + q->stat_allocation_errors++; + return &q->internal; + } + fq_flow_set_detached(f); + f->sk = sk; + if (skb->sk) + f->socket_hash = sk->sk_hash; + f->credit = q->initial_quantum; + + rb_link_node(&f->fq_node, parent, p); + rb_insert_color(&f->fq_node, root); + + q->flows++; + q->inactive_flows++; + return f; +} + + +/* remove one skb from head of flow queue */ +static struct sk_buff *fq_dequeue_head(struct fq_flow *flow) +{ + struct sk_buff *skb = flow->head; + + if (skb) { + flow->head = skb->next; + skb->next = NULL; + flow->qlen--; + } + return skb; +} + +/* We might add in the future detection of retransmits + * For the time being, just return false + */ +static bool skb_is_retransmit(struct sk_buff *skb) +{ + return false; +} + +/* add skb to flow queue + * flow queue is a linked list, kind of FIFO, except for TCP retransmits + * We special case tcp retransmits to be transmitted before other packets. + * We rely on fact that TCP retransmits are unlikely, so we do not waste + * a separate queue or a pointer. + * head-> [retrans pkt 1] + * [retrans pkt 2] + * [ normal pkt 1] + * [ normal pkt 2] + * [ normal pkt 3] + * tail-> [ normal pkt 4] + */ +static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) +{ + struct sk_buff *prev, *head = flow->head; + + skb->next = NULL; + if (!head) { + flow->head = skb; + flow->tail = skb; + return; + } + if (likely(!skb_is_retransmit(skb))) { + flow->tail->next = skb; + flow->tail = skb; + return; + } + + /* This skb is a tcp retransmit, + * find the last retrans packet in the queue + */ + prev = NULL; + while (skb_is_retransmit(head)) { + prev = head; + head = head->next; + if (!head) + break; + } + if (!prev) { /* no rtx packet in queue, become the new head */ + skb->next = flow->head; + flow->head = skb; + } else { + if (prev == flow->tail) + flow->tail = skb; + else + skb->next = prev->next; + prev->next = skb; + } +} + +static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct fq_flow *f; + + if (unlikely(sch->q.qlen >= sch->limit)) + return qdisc_drop(skb, sch); + + f = fq_classify(skb, q); + if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { + q->stat_flows_plimit++; + return qdisc_drop(skb, sch); + } + + f->qlen++; + flow_queue_add(f, skb); + if (skb_is_retransmit(skb)) + q->stat_tcp_retrans++; + sch->qstats.backlog += qdisc_pkt_len(skb); + if (fq_flow_is_detached(f)) { + fq_flow_add_tail(&q->new_flows, f); + if (q->quantum > f->credit) + f->credit = q->quantum; + q->inactive_flows--; + qdisc_unthrottled(sch); + } + if (unlikely(f == &q->internal)) { + q->stat_internal_packets++; + qdisc_unthrottled(sch); + } + sch->q.qlen++; + + return NET_XMIT_SUCCESS; +} + +static void fq_check_throttled(struct fq_sched_data *q, u64 now) +{ + struct rb_node *p; + + if (q->time_next_delayed_flow > now) + return; + + q->time_next_delayed_flow = ~0ULL; + while ((p = rb_first(&q->delayed)) != NULL) { + struct fq_flow *f = container_of(p, struct fq_flow, rate_node); + + if (f->time_next_packet > now) { + q->time_next_delayed_flow = f->time_next_packet; + break; + } + rb_erase(p, &q->delayed); + q->throttled_flows--; + fq_flow_add_tail(&q->old_flows, f); + } +} + +static struct sk_buff *fq_dequeue(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct fq_flow_head *head; + struct sk_buff *skb; + struct fq_flow *f; + + skb = fq_dequeue_head(&q->internal); + if (skb) + goto out; + fq_check_throttled(q, now); +begin: + head = &q->new_flows; + if (!head->first) { + head = &q->old_flows; + if (!head->first) { + if (q->time_next_delayed_flow != ~0ULL) + qdisc_watchdog_schedule_ns(&q->watchdog, + q->time_next_delayed_flow); + return NULL; + } + } + f = head->first; + + if (f->credit <= 0) { + f->credit += q->quantum; + head->first = f->next; + fq_flow_add_tail(&q->old_flows, f); + goto begin; + } + + if (unlikely(f->head && now < f->time_next_packet)) { + head->first = f->next; + fq_flow_set_throttled(q, f); + goto begin; + } + + skb = fq_dequeue_head(f); + if (!skb) { + head->first = f->next; + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && q->old_flows.first) { + fq_flow_add_tail(&q->old_flows, f); + } else { + fq_flow_set_detached(f); + f->age = jiffies; + q->inactive_flows++; + } + goto begin; + } + f->time_next_packet = now; + f->credit -= qdisc_pkt_len(skb); + + if (f->credit <= 0 && + q->rate_enable && + skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) { + u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate; + + rate = min(rate, q->flow_max_rate); + if (rate) { + u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC; + + do_div(len, rate); + /* Since socket rate can change later, + * clamp the delay to 125 ms. + * TODO: maybe segment the too big skb, as in commit + * e43ac79a4bc ("sch_tbf: segment too big GSO packets") + */ + if (unlikely(len > 125 * NSEC_PER_MSEC)) { + len = 125 * NSEC_PER_MSEC; + q->stat_pkts_too_long++; + } + + f->time_next_packet = now + len; + } + } +out: + prefetch(&skb->end); + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + qdisc_unthrottled(sch); + return skb; +} + +static void fq_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = fq_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void fq_rehash(struct fq_sched_data *q, + struct rb_root *old_array, u32 old_log, + struct rb_root *new_array, u32 new_log) +{ + struct rb_node *op, **np, *parent; + struct rb_root *oroot, *nroot; + struct fq_flow *of, *nf; + int fcnt = 0; + u32 idx; + + for (idx = 0; idx < (1U << old_log); idx++) { + oroot = &old_array[idx]; + while ((op = rb_first(oroot)) != NULL) { + rb_erase(op, oroot); + of = container_of(op, struct fq_flow, fq_node); + if (fq_gc_candidate(of)) { + fcnt++; + kmem_cache_free(fq_flow_cachep, of); + continue; + } + nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; + + np = &nroot->rb_node; + parent = NULL; + while (*np) { + parent = *np; + + nf = container_of(parent, struct fq_flow, fq_node); + BUG_ON(nf->sk == of->sk); + + if (nf->sk > of->sk) + np = &parent->rb_right; + else + np = &parent->rb_left; + } + + rb_link_node(&of->fq_node, parent, np); + rb_insert_color(&of->fq_node, nroot); + } + } + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; +} + +static int fq_resize(struct fq_sched_data *q, u32 log) +{ + struct rb_root *array; + u32 idx; + + if (q->fq_root && log == q->fq_trees_log) + return 0; + + array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL); + if (!array) + return -ENOMEM; + + for (idx = 0; idx < (1U << log); idx++) + array[idx] = RB_ROOT; + + if (q->fq_root) { + fq_rehash(q, q->fq_root, q->fq_trees_log, array, log); + kfree(q->fq_root); + } + q->fq_root = array; + q->fq_trees_log = log; + + return 0; +} + +static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { + [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, + [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, +}; + +static int fq_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_MAX + 1]; + int err, drop_count = 0; + u32 fq_log; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + fq_log = q->fq_trees_log; + + if (tb[TCA_FQ_BUCKETS_LOG]) { + u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); + + if (nval >= 1 && nval <= ilog2(256*1024)) + fq_log = nval; + else + err = -EINVAL; + } + if (tb[TCA_FQ_PLIMIT]) + sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); + + if (tb[TCA_FQ_FLOW_PLIMIT]) + q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); + + if (tb[TCA_FQ_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); + + if (tb[TCA_FQ_INITIAL_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + + if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) + q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]); + + if (tb[TCA_FQ_FLOW_MAX_RATE]) + q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + + if (tb[TCA_FQ_RATE_ENABLE]) { + u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); + + if (enable <= 1) + q->rate_enable = enable; + else + err = -EINVAL; + } + + if (!err) + err = fq_resize(q, fq_log); + + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_dequeue(sch); + + kfree_skb(skb); + drop_count++; + } + qdisc_tree_decrease_qlen(sch, drop_count); + + sch_tree_unlock(sch); + return err; +} + +static void fq_destroy(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct rb_root *root; + struct rb_node *p; + unsigned int idx; + + if (q->fq_root) { + for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { + root = &q->fq_root[idx]; + while ((p = rb_first(root)) != NULL) { + rb_erase(p, root); + kmem_cache_free(fq_flow_cachep, + container_of(p, struct fq_flow, fq_node)); + } + } + kfree(q->fq_root); + } + qdisc_watchdog_cancel(&q->watchdog); +} + +static int fq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + int err; + + sch->limit = 10000; + q->flow_plimit = 100; + q->quantum = 2 * psched_mtu(qdisc_dev(sch)); + q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); + q->flow_default_rate = 0; + q->flow_max_rate = ~0U; + q->rate_enable = 1; + q->new_flows.first = NULL; + q->old_flows.first = NULL; + q->delayed = RB_ROOT; + q->fq_root = NULL; + q->fq_trees_log = ilog2(1024); + qdisc_watchdog_init(&q->watchdog, sch); + + if (opt) + err = fq_change(sch, opt); + else + err = fq_resize(q, q->fq_trees_log); + + return err; +} + +static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || + nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || + nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || + nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || + nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) || + nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) + goto nla_put_failure; + + nla_nest_end(skb, opts); + return skb->len; + +nla_put_failure: + return -1; +} + +static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct tc_fq_qd_stats st = { + .gc_flows = q->stat_gc_flows, + .highprio_packets = q->stat_internal_packets, + .tcp_retrans = q->stat_tcp_retrans, + .throttled = q->stat_throttled, + .flows_plimit = q->stat_flows_plimit, + .pkts_too_long = q->stat_pkts_too_long, + .allocation_errors = q->stat_allocation_errors, + .flows = q->flows, + .inactive_flows = q->inactive_flows, + .throttled_flows = q->throttled_flows, + .time_next_delayed_flow = q->time_next_delayed_flow - now, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops fq_qdisc_ops __read_mostly = { + .id = "fq", + .priv_size = sizeof(struct fq_sched_data), + + .enqueue = fq_enqueue, + .dequeue = fq_dequeue, + .peek = qdisc_peek_dequeued, + .init = fq_init, + .reset = fq_reset, + .destroy = fq_destroy, + .change = fq_change, + .dump = fq_dump, + .dump_stats = fq_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_module_init(void) +{ + int ret; + + fq_flow_cachep = kmem_cache_create("fq_flow_cache", + sizeof(struct fq_flow), + 0, 0, NULL); + if (!fq_flow_cachep) + return -ENOMEM; + + ret = register_qdisc(&fq_qdisc_ops); + if (ret) + kmem_cache_destroy(fq_flow_cachep); + return ret; +} + +static void __exit fq_module_exit(void) +{ + unregister_qdisc(&fq_qdisc_ops); + kmem_cache_destroy(fq_flow_cachep); +} + +module_init(fq_module_init) +module_exit(fq_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-70-g09d2 From ede23fa8161c1a04aa1b3bf5447812ca14b3fef1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 26 Aug 2013 22:45:23 -0700 Subject: drivers:net: Convert dma_alloc_coherent(...__GFP_ZERO) to dma_zalloc_coherent __GFP_ZERO is an uncommon flag and perhaps is better not used. static inline dma_zalloc_coherent exists so convert the uses of dma_alloc_coherent with __GFP_ZERO to the more common kernel style with zalloc. Remove memset from the static inline dma_zalloc_coherent and add just one use of __GFP_ZERO instead. Trivially reduces the size of the existing uses of dma_zalloc_coherent. Realign arguments as appropriate. Signed-off-by: Joe Perches Acked-by: Neil Horman Acked-by: Jesse Brandeburg Acked-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/aeroflex/greth.c | 12 ++++----- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 6 ++--- drivers/net/ethernet/broadcom/bnx2.c | 5 ++-- drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 5 ++-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 +-- drivers/net/ethernet/broadcom/tg3.c | 23 ++++++++--------- drivers/net/ethernet/emulex/benet/be_main.c | 18 +++++++------- drivers/net/ethernet/faraday/ftgmac100.c | 7 +++--- drivers/net/ethernet/faraday/ftmac100.c | 8 +++--- drivers/net/ethernet/ibm/emac/mal.c | 4 +-- drivers/net/ethernet/intel/e1000/e1000_ethtool.c | 8 +++--- drivers/net/ethernet/intel/ixgb/ixgb_main.c | 4 +-- drivers/net/ethernet/marvell/pxa168_eth.c | 19 +++++++------- drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 6 ++--- .../net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 16 ++++++------ drivers/net/ethernet/pasemi/pasemi_mac.c | 7 +++--- drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c | 29 +++++++++++----------- drivers/net/ethernet/sfc/nic.c | 5 ++-- drivers/net/ethernet/sgi/meth.c | 5 ++-- drivers/net/ethernet/tundra/tsi108_eth.c | 8 +++--- drivers/net/ethernet/xilinx/ll_temac_main.c | 12 ++++----- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 14 +++++------ drivers/net/fddi/defxx.c | 6 ++--- drivers/net/irda/ali-ircc.c | 8 +++--- drivers/net/irda/nsc-ircc.c | 8 +++--- drivers/net/irda/smsc-ircc2.c | 8 +++--- drivers/net/irda/via-ircc.c | 8 +++--- drivers/net/irda/w83977af_ir.c | 8 +++--- drivers/net/wireless/b43/dma.c | 6 ++--- drivers/net/wireless/b43legacy/dma.c | 7 +++--- include/linux/dma-mapping.h | 5 ++-- 31 files changed, 135 insertions(+), 153 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/aeroflex/greth.c b/drivers/net/ethernet/aeroflex/greth.c index 7ff4b30d55e..e0669455514 100644 --- a/drivers/net/ethernet/aeroflex/greth.c +++ b/drivers/net/ethernet/aeroflex/greth.c @@ -1464,18 +1464,18 @@ static int greth_of_probe(struct platform_device *ofdev) } /* Allocate TX descriptor ring in coherent memory */ - greth->tx_bd_base = dma_alloc_coherent(greth->dev, 1024, - &greth->tx_bd_base_phys, - GFP_KERNEL | __GFP_ZERO); + greth->tx_bd_base = dma_zalloc_coherent(greth->dev, 1024, + &greth->tx_bd_base_phys, + GFP_KERNEL); if (!greth->tx_bd_base) { err = -ENOMEM; goto error3; } /* Allocate RX descriptor ring in coherent memory */ - greth->rx_bd_base = dma_alloc_coherent(greth->dev, 1024, - &greth->rx_bd_base_phys, - GFP_KERNEL | __GFP_ZERO); + greth->rx_bd_base = dma_zalloc_coherent(greth->dev, 1024, + &greth->rx_bd_base_phys, + GFP_KERNEL); if (!greth->rx_bd_base) { err = -ENOMEM; goto error4; diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index 190219e0240..98b68703c53 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -948,8 +948,7 @@ static int bcm_enet_open(struct net_device *dev) /* allocate rx dma ring */ size = priv->rx_ring_size * sizeof(struct bcm_enet_desc); - p = dma_alloc_coherent(kdev, size, &priv->rx_desc_dma, - GFP_KERNEL | __GFP_ZERO); + p = dma_zalloc_coherent(kdev, size, &priv->rx_desc_dma, GFP_KERNEL); if (!p) { ret = -ENOMEM; goto out_freeirq_tx; @@ -960,8 +959,7 @@ static int bcm_enet_open(struct net_device *dev) /* allocate tx dma ring */ size = priv->tx_ring_size * sizeof(struct bcm_enet_desc); - p = dma_alloc_coherent(kdev, size, &priv->tx_desc_dma, - GFP_KERNEL | __GFP_ZERO); + p = dma_zalloc_coherent(kdev, size, &priv->tx_desc_dma, GFP_KERNEL); if (!p) { ret = -ENOMEM; goto out_free_rx_ring; diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index 4148058cef8..e838a3f74b6 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -853,9 +853,8 @@ bnx2_alloc_mem(struct bnx2 *bp) bp->status_stats_size = status_blk_size + sizeof(struct statistics_block); - status_blk = dma_alloc_coherent(&bp->pdev->dev, bp->status_stats_size, - &bp->status_blk_mapping, - GFP_KERNEL | __GFP_ZERO); + status_blk = dma_zalloc_coherent(&bp->pdev->dev, bp->status_stats_size, + &bp->status_blk_mapping, GFP_KERNEL); if (status_blk == NULL) goto alloc_mem_err; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h index 12202f81735..3e77a1b1a44 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h @@ -2069,9 +2069,8 @@ static inline u32 reg_poll(struct bnx2x *bp, u32 reg, u32 expected, int ms, void bnx2x_igu_clear_sb_gen(struct bnx2x *bp, u8 func, u8 idu_sb_id, bool is_pf); -#define BNX2X_ILT_ZALLOC(x, y, size) \ - x = dma_alloc_coherent(&bp->pdev->dev, size, y, \ - GFP_KERNEL | __GFP_ZERO) +#define BNX2X_ILT_ZALLOC(x, y, size) \ + x = dma_zalloc_coherent(&bp->pdev->dev, size, y, GFP_KERNEL) #define BNX2X_ILT_FREE(x, y, size) \ do { \ diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index 38be494ffa6..affb7646241 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -51,8 +51,7 @@ extern int int_mode; #define BNX2X_PCI_ALLOC(x, y, size) \ do { \ - x = dma_alloc_coherent(&bp->pdev->dev, size, y, \ - GFP_KERNEL | __GFP_ZERO); \ + x = dma_zalloc_coherent(&bp->pdev->dev, size, y, GFP_KERNEL); \ if (x == NULL) \ goto alloc_mem_err; \ DP(NETIF_MSG_HW, "BNX2X_PCI_ALLOC: Physical %Lx Virtual %p\n", \ diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 95b8995187d..2e55ee29cf1 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -8591,10 +8591,10 @@ static int tg3_mem_rx_acquire(struct tg3 *tp) if (!i && tg3_flag(tp, ENABLE_RSS)) continue; - tnapi->rx_rcb = dma_alloc_coherent(&tp->pdev->dev, - TG3_RX_RCB_RING_BYTES(tp), - &tnapi->rx_rcb_mapping, - GFP_KERNEL | __GFP_ZERO); + tnapi->rx_rcb = dma_zalloc_coherent(&tp->pdev->dev, + TG3_RX_RCB_RING_BYTES(tp), + &tnapi->rx_rcb_mapping, + GFP_KERNEL); if (!tnapi->rx_rcb) goto err_out; } @@ -8643,10 +8643,9 @@ static int tg3_alloc_consistent(struct tg3 *tp) { int i; - tp->hw_stats = dma_alloc_coherent(&tp->pdev->dev, - sizeof(struct tg3_hw_stats), - &tp->stats_mapping, - GFP_KERNEL | __GFP_ZERO); + tp->hw_stats = dma_zalloc_coherent(&tp->pdev->dev, + sizeof(struct tg3_hw_stats), + &tp->stats_mapping, GFP_KERNEL); if (!tp->hw_stats) goto err_out; @@ -8654,10 +8653,10 @@ static int tg3_alloc_consistent(struct tg3 *tp) struct tg3_napi *tnapi = &tp->napi[i]; struct tg3_hw_status *sblk; - tnapi->hw_status = dma_alloc_coherent(&tp->pdev->dev, - TG3_HW_STATUS_SIZE, - &tnapi->status_mapping, - GFP_KERNEL | __GFP_ZERO); + tnapi->hw_status = dma_zalloc_coherent(&tp->pdev->dev, + TG3_HW_STATUS_SIZE, + &tnapi->status_mapping, + GFP_KERNEL); if (!tnapi->hw_status) goto err_out; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 50116f8ed57..39e0a7697a8 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -145,8 +145,8 @@ static int be_queue_alloc(struct be_adapter *adapter, struct be_queue_info *q, q->len = len; q->entry_size = entry_size; mem->size = len * entry_size; - mem->va = dma_alloc_coherent(&adapter->pdev->dev, mem->size, &mem->dma, - GFP_KERNEL | __GFP_ZERO); + mem->va = dma_zalloc_coherent(&adapter->pdev->dev, mem->size, &mem->dma, + GFP_KERNEL); if (!mem->va) return -ENOMEM; return 0; @@ -2642,8 +2642,8 @@ static int be_setup_wol(struct be_adapter *adapter, bool enable) memset(mac, 0, ETH_ALEN); cmd.size = sizeof(struct be_cmd_req_acpi_wol_magic_config); - cmd.va = dma_alloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma, - GFP_KERNEL | __GFP_ZERO); + cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma, + GFP_KERNEL); if (cmd.va == NULL) return -1; @@ -3946,9 +3946,9 @@ static int be_ctrl_init(struct be_adapter *adapter) memset(mbox_mem_align->va, 0, sizeof(struct be_mcc_mailbox)); rx_filter->size = sizeof(struct be_cmd_req_rx_filter); - rx_filter->va = dma_alloc_coherent(&adapter->pdev->dev, rx_filter->size, - &rx_filter->dma, - GFP_KERNEL | __GFP_ZERO); + rx_filter->va = dma_zalloc_coherent(&adapter->pdev->dev, + rx_filter->size, &rx_filter->dma, + GFP_KERNEL); if (rx_filter->va == NULL) { status = -ENOMEM; goto free_mbox; @@ -3994,8 +3994,8 @@ static int be_stats_init(struct be_adapter *adapter) /* BE3 and Skyhawk */ cmd->size = sizeof(struct be_cmd_req_get_stats_v1); - cmd->va = dma_alloc_coherent(&adapter->pdev->dev, cmd->size, &cmd->dma, - GFP_KERNEL | __GFP_ZERO); + cmd->va = dma_zalloc_coherent(&adapter->pdev->dev, cmd->size, &cmd->dma, + GFP_KERNEL); if (cmd->va == NULL) return -1; return 0; diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 934e1ae279f..212f44b3a77 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -778,10 +778,9 @@ static int ftgmac100_alloc_buffers(struct ftgmac100 *priv) { int i; - priv->descs = dma_alloc_coherent(priv->dev, - sizeof(struct ftgmac100_descs), - &priv->descs_dma_addr, - GFP_KERNEL | __GFP_ZERO); + priv->descs = dma_zalloc_coherent(priv->dev, + sizeof(struct ftgmac100_descs), + &priv->descs_dma_addr, GFP_KERNEL); if (!priv->descs) return -ENOMEM; diff --git a/drivers/net/ethernet/faraday/ftmac100.c b/drivers/net/ethernet/faraday/ftmac100.c index 4658f4cc196..8be5b40c0a1 100644 --- a/drivers/net/ethernet/faraday/ftmac100.c +++ b/drivers/net/ethernet/faraday/ftmac100.c @@ -732,10 +732,10 @@ static int ftmac100_alloc_buffers(struct ftmac100 *priv) { int i; - priv->descs = dma_alloc_coherent(priv->dev, - sizeof(struct ftmac100_descs), - &priv->descs_dma_addr, - GFP_KERNEL | __GFP_ZERO); + priv->descs = dma_zalloc_coherent(priv->dev, + sizeof(struct ftmac100_descs), + &priv->descs_dma_addr, + GFP_KERNEL); if (!priv->descs) return -ENOMEM; diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c index 856ea66c922..dac564c2544 100644 --- a/drivers/net/ethernet/ibm/emac/mal.c +++ b/drivers/net/ethernet/ibm/emac/mal.c @@ -637,8 +637,8 @@ static int mal_probe(struct platform_device *ofdev) bd_size = sizeof(struct mal_descriptor) * (NUM_TX_BUFF * mal->num_tx_chans + NUM_RX_BUFF * mal->num_rx_chans); - mal->bd_virt = dma_alloc_coherent(&ofdev->dev, bd_size, &mal->bd_dma, - GFP_KERNEL | __GFP_ZERO); + mal->bd_virt = dma_zalloc_coherent(&ofdev->dev, bd_size, &mal->bd_dma, + GFP_KERNEL); if (mal->bd_virt == NULL) { err = -ENOMEM; goto fail_unmap; diff --git a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c index 82a967c9559..73a8aeefb92 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c +++ b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c @@ -1019,8 +1019,8 @@ static int e1000_setup_desc_rings(struct e1000_adapter *adapter) txdr->size = txdr->count * sizeof(struct e1000_tx_desc); txdr->size = ALIGN(txdr->size, 4096); - txdr->desc = dma_alloc_coherent(&pdev->dev, txdr->size, &txdr->dma, - GFP_KERNEL | __GFP_ZERO); + txdr->desc = dma_zalloc_coherent(&pdev->dev, txdr->size, &txdr->dma, + GFP_KERNEL); if (!txdr->desc) { ret_val = 2; goto err_nomem; @@ -1077,8 +1077,8 @@ static int e1000_setup_desc_rings(struct e1000_adapter *adapter) } rxdr->size = rxdr->count * sizeof(struct e1000_rx_desc); - rxdr->desc = dma_alloc_coherent(&pdev->dev, rxdr->size, &rxdr->dma, - GFP_KERNEL | __GFP_ZERO); + rxdr->desc = dma_zalloc_coherent(&pdev->dev, rxdr->size, &rxdr->dma, + GFP_KERNEL); if (!rxdr->desc) { ret_val = 6; goto err_nomem; diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c index fce3e92f9d1..9f6b236828e 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c @@ -718,8 +718,8 @@ ixgb_setup_tx_resources(struct ixgb_adapter *adapter) txdr->size = txdr->count * sizeof(struct ixgb_tx_desc); txdr->size = ALIGN(txdr->size, 4096); - txdr->desc = dma_alloc_coherent(&pdev->dev, txdr->size, &txdr->dma, - GFP_KERNEL | __GFP_ZERO); + txdr->desc = dma_zalloc_coherent(&pdev->dev, txdr->size, &txdr->dma, + GFP_KERNEL); if (!txdr->desc) { vfree(txdr->buffer_info); return -ENOMEM; diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index db481477bcc..3e69febe377 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -583,10 +583,9 @@ static int init_hash_table(struct pxa168_eth_private *pep) * table is full. */ if (pep->htpr == NULL) { - pep->htpr = dma_alloc_coherent(pep->dev->dev.parent, - HASH_ADDR_TABLE_SIZE, - &pep->htpr_dma, - GFP_KERNEL | __GFP_ZERO); + pep->htpr = dma_zalloc_coherent(pep->dev->dev.parent, + HASH_ADDR_TABLE_SIZE, + &pep->htpr_dma, GFP_KERNEL); if (pep->htpr == NULL) return -ENOMEM; } else { @@ -1024,9 +1023,9 @@ static int rxq_init(struct net_device *dev) pep->rx_desc_count = 0; size = pep->rx_ring_size * sizeof(struct rx_desc); pep->rx_desc_area_size = size; - pep->p_rx_desc_area = dma_alloc_coherent(pep->dev->dev.parent, size, - &pep->rx_desc_dma, - GFP_KERNEL | __GFP_ZERO); + pep->p_rx_desc_area = dma_zalloc_coherent(pep->dev->dev.parent, size, + &pep->rx_desc_dma, + GFP_KERNEL); if (!pep->p_rx_desc_area) goto out; @@ -1085,9 +1084,9 @@ static int txq_init(struct net_device *dev) pep->tx_desc_count = 0; size = pep->tx_ring_size * sizeof(struct tx_desc); pep->tx_desc_area_size = size; - pep->p_tx_desc_area = dma_alloc_coherent(pep->dev->dev.parent, size, - &pep->tx_desc_dma, - GFP_KERNEL | __GFP_ZERO); + pep->p_tx_desc_area = dma_zalloc_coherent(pep->dev->dev.parent, size, + &pep->tx_desc_dma, + GFP_KERNEL); if (!pep->p_tx_desc_area) goto out; /* Initialize the next_desc_ptr links in the Tx descriptors ring */ diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 50a1d4a04eb..149355b52ad 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -3783,9 +3783,9 @@ static int myri10ge_alloc_slices(struct myri10ge_priv *mgp) for (i = 0; i < mgp->num_slices; i++) { ss = &mgp->ss[i]; bytes = mgp->max_intr_slots * sizeof(*ss->rx_done.entry); - ss->rx_done.entry = dma_alloc_coherent(&pdev->dev, bytes, - &ss->rx_done.bus, - GFP_KERNEL | __GFP_ZERO); + ss->rx_done.entry = dma_zalloc_coherent(&pdev->dev, bytes, + &ss->rx_done.bus, + GFP_KERNEL); if (ss->rx_done.entry == NULL) goto abort; bytes = sizeof(*ss->fw_stats); diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c index e19f1be60d5..5a0f04c2c81 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c @@ -1491,9 +1491,9 @@ pch_gbe_alloc_rx_buffers_pool(struct pch_gbe_adapter *adapter, bufsz = adapter->rx_buffer_len; size = rx_ring->count * bufsz + PCH_GBE_RESERVE_MEMORY; - rx_ring->rx_buff_pool = dma_alloc_coherent(&pdev->dev, size, - &rx_ring->rx_buff_pool_logic, - GFP_KERNEL | __GFP_ZERO); + rx_ring->rx_buff_pool = + dma_zalloc_coherent(&pdev->dev, size, + &rx_ring->rx_buff_pool_logic, GFP_KERNEL); if (!rx_ring->rx_buff_pool) return -ENOMEM; @@ -1807,9 +1807,8 @@ int pch_gbe_setup_tx_resources(struct pch_gbe_adapter *adapter, tx_ring->size = tx_ring->count * (int)sizeof(struct pch_gbe_tx_desc); - tx_ring->desc = dma_alloc_coherent(&pdev->dev, tx_ring->size, - &tx_ring->dma, - GFP_KERNEL | __GFP_ZERO); + tx_ring->desc = dma_zalloc_coherent(&pdev->dev, tx_ring->size, + &tx_ring->dma, GFP_KERNEL); if (!tx_ring->desc) { vfree(tx_ring->buffer_info); return -ENOMEM; @@ -1852,9 +1851,8 @@ int pch_gbe_setup_rx_resources(struct pch_gbe_adapter *adapter, return -ENOMEM; rx_ring->size = rx_ring->count * (int)sizeof(struct pch_gbe_rx_desc); - rx_ring->desc = dma_alloc_coherent(&pdev->dev, rx_ring->size, - &rx_ring->dma, - GFP_KERNEL | __GFP_ZERO); + rx_ring->desc = dma_zalloc_coherent(&pdev->dev, rx_ring->size, + &rx_ring->dma, GFP_KERNEL); if (!rx_ring->desc) { vfree(rx_ring->buffer_info); return -ENOMEM; diff --git a/drivers/net/ethernet/pasemi/pasemi_mac.c b/drivers/net/ethernet/pasemi/pasemi_mac.c index f21ae7b6c76..c498181a9aa 100644 --- a/drivers/net/ethernet/pasemi/pasemi_mac.c +++ b/drivers/net/ethernet/pasemi/pasemi_mac.c @@ -440,10 +440,9 @@ static int pasemi_mac_setup_rx_resources(const struct net_device *dev) if (pasemi_dma_alloc_ring(&ring->chan, RX_RING_SIZE)) goto out_ring_desc; - ring->buffers = dma_alloc_coherent(&mac->dma_pdev->dev, - RX_RING_SIZE * sizeof(u64), - &ring->buf_dma, - GFP_KERNEL | __GFP_ZERO); + ring->buffers = dma_zalloc_coherent(&mac->dma_pdev->dev, + RX_RING_SIZE * sizeof(u64), + &ring->buf_dma, GFP_KERNEL); if (!ring->buffers) goto out_ring_desc; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c index bf3b17eaf8b..86850dd633a 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c @@ -448,14 +448,14 @@ int qlcnic_82xx_fw_cmd_create_tx_ctx(struct qlcnic_adapter *adapter, *(tx_ring->hw_consumer) = 0; rq_size = SIZEOF_HOSTRQ_TX(struct qlcnic_hostrq_tx_ctx); - rq_addr = dma_alloc_coherent(&adapter->pdev->dev, rq_size, - &rq_phys_addr, GFP_KERNEL | __GFP_ZERO); + rq_addr = dma_zalloc_coherent(&adapter->pdev->dev, rq_size, + &rq_phys_addr, GFP_KERNEL); if (!rq_addr) return -ENOMEM; rsp_size = SIZEOF_CARDRSP_TX(struct qlcnic_cardrsp_tx_ctx); - rsp_addr = dma_alloc_coherent(&adapter->pdev->dev, rsp_size, - &rsp_phys_addr, GFP_KERNEL | __GFP_ZERO); + rsp_addr = dma_zalloc_coherent(&adapter->pdev->dev, rsp_size, + &rsp_phys_addr, GFP_KERNEL); if (!rsp_addr) { err = -ENOMEM; goto out_free_rq; @@ -865,8 +865,8 @@ int qlcnic_82xx_get_nic_info(struct qlcnic_adapter *adapter, struct qlcnic_cmd_args cmd; size_t nic_size = sizeof(struct qlcnic_info_le); - nic_info_addr = dma_alloc_coherent(&adapter->pdev->dev, nic_size, - &nic_dma_t, GFP_KERNEL | __GFP_ZERO); + nic_info_addr = dma_zalloc_coherent(&adapter->pdev->dev, nic_size, + &nic_dma_t, GFP_KERNEL); if (!nic_info_addr) return -ENOMEM; @@ -919,8 +919,8 @@ int qlcnic_82xx_set_nic_info(struct qlcnic_adapter *adapter, if (adapter->ahw->op_mode != QLCNIC_MGMT_FUNC) return err; - nic_info_addr = dma_alloc_coherent(&adapter->pdev->dev, nic_size, - &nic_dma_t, GFP_KERNEL | __GFP_ZERO); + nic_info_addr = dma_zalloc_coherent(&adapter->pdev->dev, nic_size, + &nic_dma_t, GFP_KERNEL); if (!nic_info_addr) return -ENOMEM; @@ -972,9 +972,8 @@ int qlcnic_82xx_get_pci_info(struct qlcnic_adapter *adapter, size_t npar_size = sizeof(struct qlcnic_pci_info_le); size_t pci_size = npar_size * QLCNIC_MAX_PCI_FUNC; - pci_info_addr = dma_alloc_coherent(&adapter->pdev->dev, pci_size, - &pci_info_dma_t, - GFP_KERNEL | __GFP_ZERO); + pci_info_addr = dma_zalloc_coherent(&adapter->pdev->dev, pci_size, + &pci_info_dma_t, GFP_KERNEL); if (!pci_info_addr) return -ENOMEM; @@ -1074,8 +1073,8 @@ int qlcnic_get_port_stats(struct qlcnic_adapter *adapter, const u8 func, return -EIO; } - stats_addr = dma_alloc_coherent(&adapter->pdev->dev, stats_size, - &stats_dma_t, GFP_KERNEL | __GFP_ZERO); + stats_addr = dma_zalloc_coherent(&adapter->pdev->dev, stats_size, + &stats_dma_t, GFP_KERNEL); if (!stats_addr) return -ENOMEM; @@ -1130,8 +1129,8 @@ int qlcnic_get_mac_stats(struct qlcnic_adapter *adapter, if (mac_stats == NULL) return -ENOMEM; - stats_addr = dma_alloc_coherent(&adapter->pdev->dev, stats_size, - &stats_dma_t, GFP_KERNEL | __GFP_ZERO); + stats_addr = dma_zalloc_coherent(&adapter->pdev->dev, stats_size, + &stats_dma_t, GFP_KERNEL); if (!stats_addr) return -ENOMEM; diff --git a/drivers/net/ethernet/sfc/nic.c b/drivers/net/ethernet/sfc/nic.c index b9b127743a0..78d41331205 100644 --- a/drivers/net/ethernet/sfc/nic.c +++ b/drivers/net/ethernet/sfc/nic.c @@ -33,9 +33,8 @@ int efx_nic_alloc_buffer(struct efx_nic *efx, struct efx_buffer *buffer, unsigned int len, gfp_t gfp_flags) { - buffer->addr = dma_alloc_coherent(&efx->pci_dev->dev, len, - &buffer->dma_addr, - gfp_flags | __GFP_ZERO); + buffer->addr = dma_zalloc_coherent(&efx->pci_dev->dev, len, + &buffer->dma_addr, gfp_flags); if (!buffer->addr) return -ENOMEM; buffer->len = len; diff --git a/drivers/net/ethernet/sgi/meth.c b/drivers/net/ethernet/sgi/meth.c index 9f5f35e041a..770036bc2d8 100644 --- a/drivers/net/ethernet/sgi/meth.c +++ b/drivers/net/ethernet/sgi/meth.c @@ -212,9 +212,8 @@ static void meth_check_link(struct net_device *dev) static int meth_init_tx_ring(struct meth_private *priv) { /* Init TX ring */ - priv->tx_ring = dma_alloc_coherent(NULL, TX_RING_BUFFER_SIZE, - &priv->tx_ring_dma, - GFP_ATOMIC | __GFP_ZERO); + priv->tx_ring = dma_zalloc_coherent(NULL, TX_RING_BUFFER_SIZE, + &priv->tx_ring_dma, GFP_ATOMIC); if (!priv->tx_ring) return -ENOMEM; diff --git a/drivers/net/ethernet/tundra/tsi108_eth.c b/drivers/net/ethernet/tundra/tsi108_eth.c index 01bdc6ca075..f2e76c9d736 100644 --- a/drivers/net/ethernet/tundra/tsi108_eth.c +++ b/drivers/net/ethernet/tundra/tsi108_eth.c @@ -1308,13 +1308,13 @@ static int tsi108_open(struct net_device *dev) data->id, dev->irq, dev->name); } - data->rxring = dma_alloc_coherent(NULL, rxring_size, &data->rxdma, - GFP_KERNEL | __GFP_ZERO); + data->rxring = dma_zalloc_coherent(NULL, rxring_size, &data->rxdma, + GFP_KERNEL); if (!data->rxring) return -ENOMEM; - data->txring = dma_alloc_coherent(NULL, txring_size, &data->txdma, - GFP_KERNEL | __GFP_ZERO); + data->txring = dma_zalloc_coherent(NULL, txring_size, &data->txdma, + GFP_KERNEL); if (!data->txring) { pci_free_consistent(0, rxring_size, data->rxring, data->rxdma); return -ENOMEM; diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 58eb4488bef..b88121f240c 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -243,15 +243,15 @@ static int temac_dma_bd_init(struct net_device *ndev) /* allocate the tx and rx ring buffer descriptors. */ /* returns a virtual address and a physical address. */ - lp->tx_bd_v = dma_alloc_coherent(ndev->dev.parent, - sizeof(*lp->tx_bd_v) * TX_BD_NUM, - &lp->tx_bd_p, GFP_KERNEL | __GFP_ZERO); + lp->tx_bd_v = dma_zalloc_coherent(ndev->dev.parent, + sizeof(*lp->tx_bd_v) * TX_BD_NUM, + &lp->tx_bd_p, GFP_KERNEL); if (!lp->tx_bd_v) goto out; - lp->rx_bd_v = dma_alloc_coherent(ndev->dev.parent, - sizeof(*lp->rx_bd_v) * RX_BD_NUM, - &lp->rx_bd_p, GFP_KERNEL | __GFP_ZERO); + lp->rx_bd_v = dma_zalloc_coherent(ndev->dev.parent, + sizeof(*lp->rx_bd_v) * RX_BD_NUM, + &lp->rx_bd_p, GFP_KERNEL); if (!lp->rx_bd_v) goto out; diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index fb7d1c28a2e..b2ff038d6d2 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -201,17 +201,15 @@ static int axienet_dma_bd_init(struct net_device *ndev) /* * Allocate the Tx and Rx buffer descriptors. */ - lp->tx_bd_v = dma_alloc_coherent(ndev->dev.parent, - sizeof(*lp->tx_bd_v) * TX_BD_NUM, - &lp->tx_bd_p, - GFP_KERNEL | __GFP_ZERO); + lp->tx_bd_v = dma_zalloc_coherent(ndev->dev.parent, + sizeof(*lp->tx_bd_v) * TX_BD_NUM, + &lp->tx_bd_p, GFP_KERNEL); if (!lp->tx_bd_v) goto out; - lp->rx_bd_v = dma_alloc_coherent(ndev->dev.parent, - sizeof(*lp->rx_bd_v) * RX_BD_NUM, - &lp->rx_bd_p, - GFP_KERNEL | __GFP_ZERO); + lp->rx_bd_v = dma_zalloc_coherent(ndev->dev.parent, + sizeof(*lp->rx_bd_v) * RX_BD_NUM, + &lp->rx_bd_p, GFP_KERNEL); if (!lp->rx_bd_v) goto out; diff --git a/drivers/net/fddi/defxx.c b/drivers/net/fddi/defxx.c index 4c8ddc944d5..0b40e1c46f0 100644 --- a/drivers/net/fddi/defxx.c +++ b/drivers/net/fddi/defxx.c @@ -1068,9 +1068,9 @@ static int dfx_driver_init(struct net_device *dev, const char *print_name, #endif sizeof(PI_CONSUMER_BLOCK) + (PI_ALIGN_K_DESC_BLK - 1); - bp->kmalloced = top_v = dma_alloc_coherent(bp->bus_dev, alloc_size, - &bp->kmalloced_dma, - GFP_ATOMIC | __GFP_ZERO); + bp->kmalloced = top_v = dma_zalloc_coherent(bp->bus_dev, alloc_size, + &bp->kmalloced_dma, + GFP_ATOMIC); if (top_v == NULL) return DFX_K_FAILURE; diff --git a/drivers/net/irda/ali-ircc.c b/drivers/net/irda/ali-ircc.c index 3adb43ce138..7bbd318bc93 100644 --- a/drivers/net/irda/ali-ircc.c +++ b/drivers/net/irda/ali-ircc.c @@ -351,16 +351,16 @@ static int ali_ircc_open(int i, chipio_t *info) /* Allocate memory if needed */ self->rx_buff.head = - dma_alloc_coherent(NULL, self->rx_buff.truesize, - &self->rx_buff_dma, GFP_KERNEL | __GFP_ZERO); + dma_zalloc_coherent(NULL, self->rx_buff.truesize, + &self->rx_buff_dma, GFP_KERNEL); if (self->rx_buff.head == NULL) { err = -ENOMEM; goto err_out2; } self->tx_buff.head = - dma_alloc_coherent(NULL, self->tx_buff.truesize, - &self->tx_buff_dma, GFP_KERNEL | __GFP_ZERO); + dma_zalloc_coherent(NULL, self->tx_buff.truesize, + &self->tx_buff_dma, GFP_KERNEL); if (self->tx_buff.head == NULL) { err = -ENOMEM; goto err_out3; diff --git a/drivers/net/irda/nsc-ircc.c b/drivers/net/irda/nsc-ircc.c index 9cf836b57c4..ceeb53737f8 100644 --- a/drivers/net/irda/nsc-ircc.c +++ b/drivers/net/irda/nsc-ircc.c @@ -430,8 +430,8 @@ static int __init nsc_ircc_open(chipio_t *info) /* Allocate memory if needed */ self->rx_buff.head = - dma_alloc_coherent(NULL, self->rx_buff.truesize, - &self->rx_buff_dma, GFP_KERNEL | __GFP_ZERO); + dma_zalloc_coherent(NULL, self->rx_buff.truesize, + &self->rx_buff_dma, GFP_KERNEL); if (self->rx_buff.head == NULL) { err = -ENOMEM; goto out2; @@ -439,8 +439,8 @@ static int __init nsc_ircc_open(chipio_t *info) } self->tx_buff.head = - dma_alloc_coherent(NULL, self->tx_buff.truesize, - &self->tx_buff_dma, GFP_KERNEL | __GFP_ZERO); + dma_zalloc_coherent(NULL, self->tx_buff.truesize, + &self->tx_buff_dma, GFP_KERNEL); if (self->tx_buff.head == NULL) { err = -ENOMEM; goto out3; diff --git a/drivers/net/irda/smsc-ircc2.c b/drivers/net/irda/smsc-ircc2.c index aa05dad7533..0dcdf1592f6 100644 --- a/drivers/net/irda/smsc-ircc2.c +++ b/drivers/net/irda/smsc-ircc2.c @@ -562,14 +562,14 @@ static int smsc_ircc_open(unsigned int fir_base, unsigned int sir_base, u8 dma, self->tx_buff.truesize = SMSC_IRCC2_TX_BUFF_TRUESIZE; self->rx_buff.head = - dma_alloc_coherent(NULL, self->rx_buff.truesize, - &self->rx_buff_dma, GFP_KERNEL | __GFP_ZERO); + dma_zalloc_coherent(NULL, self->rx_buff.truesize, + &self->rx_buff_dma, GFP_KERNEL); if (self->rx_buff.head == NULL) goto err_out2; self->tx_buff.head = - dma_alloc_coherent(NULL, self->tx_buff.truesize, - &self->tx_buff_dma, GFP_KERNEL | __GFP_ZERO); + dma_zalloc_coherent(NULL, self->tx_buff.truesize, + &self->tx_buff_dma, GFP_KERNEL); if (self->tx_buff.head == NULL) goto err_out3; diff --git a/drivers/net/irda/via-ircc.c b/drivers/net/irda/via-ircc.c index 2dcc60fb37f..9abaec27f96 100644 --- a/drivers/net/irda/via-ircc.c +++ b/drivers/net/irda/via-ircc.c @@ -361,16 +361,16 @@ static int via_ircc_open(struct pci_dev *pdev, chipio_t *info, unsigned int id) /* Allocate memory if needed */ self->rx_buff.head = - dma_alloc_coherent(&pdev->dev, self->rx_buff.truesize, - &self->rx_buff_dma, GFP_KERNEL | __GFP_ZERO); + dma_zalloc_coherent(&pdev->dev, self->rx_buff.truesize, + &self->rx_buff_dma, GFP_KERNEL); if (self->rx_buff.head == NULL) { err = -ENOMEM; goto err_out2; } self->tx_buff.head = - dma_alloc_coherent(&pdev->dev, self->tx_buff.truesize, - &self->tx_buff_dma, GFP_KERNEL | __GFP_ZERO); + dma_zalloc_coherent(&pdev->dev, self->tx_buff.truesize, + &self->tx_buff_dma, GFP_KERNEL); if (self->tx_buff.head == NULL) { err = -ENOMEM; goto err_out3; diff --git a/drivers/net/irda/w83977af_ir.c b/drivers/net/irda/w83977af_ir.c index bb8857a158a..e641bb24036 100644 --- a/drivers/net/irda/w83977af_ir.c +++ b/drivers/net/irda/w83977af_ir.c @@ -215,16 +215,16 @@ static int w83977af_open(int i, unsigned int iobase, unsigned int irq, /* Allocate memory if needed */ self->rx_buff.head = - dma_alloc_coherent(NULL, self->rx_buff.truesize, - &self->rx_buff_dma, GFP_KERNEL | __GFP_ZERO); + dma_zalloc_coherent(NULL, self->rx_buff.truesize, + &self->rx_buff_dma, GFP_KERNEL); if (self->rx_buff.head == NULL) { err = -ENOMEM; goto err_out1; } self->tx_buff.head = - dma_alloc_coherent(NULL, self->tx_buff.truesize, - &self->tx_buff_dma, GFP_KERNEL | __GFP_ZERO); + dma_zalloc_coherent(NULL, self->tx_buff.truesize, + &self->tx_buff_dma, GFP_KERNEL); if (self->tx_buff.head == NULL) { err = -ENOMEM; goto err_out2; diff --git a/drivers/net/wireless/b43/dma.c b/drivers/net/wireless/b43/dma.c index f7c70b3a6ea..c51d2dc489e 100644 --- a/drivers/net/wireless/b43/dma.c +++ b/drivers/net/wireless/b43/dma.c @@ -431,9 +431,9 @@ static int alloc_ringmemory(struct b43_dmaring *ring) u16 ring_mem_size = (ring->type == B43_DMA_64BIT) ? B43_DMA64_RINGMEMSIZE : B43_DMA32_RINGMEMSIZE; - ring->descbase = dma_alloc_coherent(ring->dev->dev->dma_dev, - ring_mem_size, &(ring->dmabase), - GFP_KERNEL | __GFP_ZERO); + ring->descbase = dma_zalloc_coherent(ring->dev->dev->dma_dev, + ring_mem_size, &(ring->dmabase), + GFP_KERNEL); if (!ring->descbase) return -ENOMEM; diff --git a/drivers/net/wireless/b43legacy/dma.c b/drivers/net/wireless/b43legacy/dma.c index faeafe219c5..42eb26c99e1 100644 --- a/drivers/net/wireless/b43legacy/dma.c +++ b/drivers/net/wireless/b43legacy/dma.c @@ -331,10 +331,9 @@ void free_descriptor_buffer(struct b43legacy_dmaring *ring, static int alloc_ringmemory(struct b43legacy_dmaring *ring) { /* GFP flags must match the flags in free_ringmemory()! */ - ring->descbase = dma_alloc_coherent(ring->dev->dev->dma_dev, - B43legacy_DMA_RINGMEMSIZE, - &(ring->dmabase), - GFP_KERNEL | __GFP_ZERO); + ring->descbase = dma_zalloc_coherent(ring->dev->dev->dma_dev, + B43legacy_DMA_RINGMEMSIZE, + &(ring->dmabase), GFP_KERNEL); if (!ring->descbase) return -ENOMEM; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 94af4185851..3a8d0a2af60 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -132,9 +132,8 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) static inline void *dma_zalloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { - void *ret = dma_alloc_coherent(dev, size, dma_handle, flag); - if (ret) - memset(ret, 0, size); + void *ret = dma_alloc_coherent(dev, size, dma_handle, + flag | __GFP_ZERO); return ret; } -- cgit v1.2.3-70-g09d2 From e2a240c7d3bcebf90936cc7c22c2729b3a4cec1f Mon Sep 17 00:00:00 2001 From: Sonic Zhang Date: Wed, 28 Aug 2013 18:55:39 +0800 Subject: driver:net:stmmac: Disable DMA store and forward mode if platform data force_thresh_dma_mode is set. Some synopsys ip implementation doesn't support DMA store and forward mode, such as BF60x. So, set force_thresh_dma_mode to use DMA thresholds only. Update document and devicetree as well. Signed-off-by: Sonic Zhang Acked-by: Giuseppe Cavallaro Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/stmmac.txt | 5 +++++ Documentation/networking/stmmac.txt | 3 +++ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +++- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 5 +++++ include/linux/stmmac.h | 1 + 5 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt index 261c563b5f0..eba0e5e59eb 100644 --- a/Documentation/devicetree/bindings/net/stmmac.txt +++ b/Documentation/devicetree/bindings/net/stmmac.txt @@ -22,6 +22,11 @@ Required properties: - snps,pbl Programmable Burst Length - snps,fixed-burst Program the DMA to use the fixed burst mode - snps,mixed-burst Program the DMA to use the mixed burst mode +- snps,force_thresh_dma_mode Force DMA to use the threshold mode for + both tx and rx +- snps,force_sf_dma_mode Force DMA to use the Store and Forward + mode for both tx and rx. This flag is + ignored if force_thresh_dma_mode is set. Optional properties: - mac-address: 6 bytes, mac address diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt index 654d2e55c8c..457b8bbafb0 100644 --- a/Documentation/networking/stmmac.txt +++ b/Documentation/networking/stmmac.txt @@ -123,6 +123,7 @@ struct plat_stmmacenet_data { int bugged_jumbo; int pmt; int force_sf_dma_mode; + int force_thresh_dma_mode; int riwt_off; void (*fix_mac_speed)(void *priv, unsigned int speed); void (*bus_setup)(void __iomem *ioaddr); @@ -159,6 +160,8 @@ Where: o pmt: core has the embedded power module (optional). o force_sf_dma_mode: force DMA to use the Store and Forward mode instead of the Threshold. + o force_thresh_dma_mode: force DMA to use the Shreshold mode other than + the Store and Forward mode. o riwt_off: force to disable the RX watchdog feature and switch to NAPI mode. o fix_mac_speed: this callback is used for modifying some syscfg registers (on ST SoCs) according to the link speed negotiated by the diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index be406911fd0..8d4ccd35a01 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1224,7 +1224,9 @@ static void free_dma_desc_resources(struct stmmac_priv *priv) */ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) { - if (priv->plat->force_sf_dma_mode || priv->plat->tx_coe) { + if (priv->plat->force_thresh_dma_mode) + priv->hw->dma->dma_mode(priv->ioaddr, tc, tc); + else if (priv->plat->force_sf_dma_mode || priv->plat->tx_coe) { /* * In case of GMAC, SF mode can be enabled * to perform the TX COE in HW. This depends on: diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index da8be6e6309..74a89a4aef8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -79,6 +79,11 @@ static int stmmac_probe_config_dt(struct platform_device *pdev, of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl); dma_cfg->fixed_burst = of_property_read_bool(np, "snps,fixed-burst"); dma_cfg->mixed_burst = of_property_read_bool(np, "snps,mixed-burst"); + plat->force_thresh_dma_mode = of_property_read_bool(np, "snps,force_thresh_dma_mode"); + if (plat->force_thresh_dma_mode) { + plat->force_sf_dma_mode = 0; + pr_warn("force_sf_dma_mode is ignored if force_thresh_dma_mode is set."); + } return 0; } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 9e495d31516..bb5deb0feb6 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -108,6 +108,7 @@ struct plat_stmmacenet_data { int bugged_jumbo; int pmt; int force_sf_dma_mode; + int force_thresh_dma_mode; int riwt_off; void (*fix_mac_speed)(void *priv, unsigned int speed); void (*bus_setup)(void __iomem *ioaddr); -- cgit v1.2.3-70-g09d2 From 6da7c8fcbcbdb50ec68c61b40d554c74850fdb91 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 27 Aug 2013 16:19:08 -0700 Subject: qdisc: allow setting default queuing discipline By default, the pfifo_fast queue discipline has been used by default for all devices. But we have better choices now. This patch allow setting the default queueing discipline with sysctl. This allows easy use of better queueing disciplines on all devices without having to use tc qdisc scripts. It is intended to allow an easy path for distributions to make fq_codel or sfq the default qdisc. This patch also makes pfifo_fast more of a first class qdisc, since it is now possible to manually override the default and explicitly use pfifo_fast. The behavior for systems who do not use the sysctl is unchanged, they still get pfifo_fast Also removes leftover random # in sysctl net core. Signed-off-by: Stephen Hemminger Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 13 ++++++++++ include/net/pkt_sched.h | 3 +++ include/net/sch_generic.h | 1 + net/core/sysctl_net_core.c | 30 ++++++++++++++++++++++- net/sched/sch_api.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ net/sched/sch_generic.c | 11 +++++---- net/sched/sch_mq.c | 2 +- net/sched/sch_mqprio.c | 2 +- 8 files changed, 112 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index d569f2a424d..9a0319a8247 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -50,6 +50,19 @@ The maximum number of packets that kernel can handle on a NAPI interrupt, it's a Per-CPU variable. Default: 64 +default_qdisc +-------------- + +The default queuing discipline to use for network devices. This allows +overriding the default queue discipline of pfifo_fast with an +alternative. Since the default queuing discipline is created with the +no additional parameters so is best suited to queuing disciplines that +work well without configuration like stochastic fair queue (sfq), +CoDel (codel) or fair queue CoDel (fq_codel). Don't use queuing disciplines +like Hierarchical Token Bucket or Deficit Round Robin which require setting +up classes and bandwidths. +Default: pfifo_fast + busy_read ---------------- Low latency busy poll timeout for socket reads. (needs CONFIG_NET_RX_BUSY_POLL) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index f7c24f8fbdc..59ec3cd15d6 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -85,6 +85,9 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, int register_qdisc(struct Qdisc_ops *qops); int unregister_qdisc(struct Qdisc_ops *qops); +void qdisc_get_default(char *id, size_t len); +int qdisc_set_default(const char *id); + void qdisc_list_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 76368c9d450..2e31bf14df0 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -316,6 +316,7 @@ extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; extern struct Qdisc_ops mq_qdisc_ops; +extern const struct Qdisc_ops *default_qdisc_ops; struct Qdisc_class_common { u32 classid; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 31107abd278..cca44419090 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -20,6 +20,7 @@ #include #include #include +#include static int zero = 0; static int one = 1; @@ -193,6 +194,26 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, } #endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_SCHED +static int set_default_qdisc(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char id[IFNAMSIZ]; + struct ctl_table tbl = { + .data = id, + .maxlen = IFNAMSIZ, + }; + int ret; + + qdisc_get_default(id, IFNAMSIZ); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = qdisc_set_default(id); + return ret; +} +#endif + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -315,7 +336,14 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -# +#endif +#ifdef CONFIG_NET_SCHED + { + .procname = "default_qdisc", + .mode = 0644, + .maxlen = IFNAMSIZ, + .proc_handler = set_default_qdisc + }, #endif #endif /* CONFIG_NET */ { diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 51b968d3feb..812e5790059 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -131,6 +131,11 @@ static DEFINE_RWLOCK(qdisc_mod_lock); ************************************************/ +/* Qdisc to use by default */ + +const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; +EXPORT_SYMBOL(default_qdisc_ops); + /* The list of all installed queueing disciplines. */ static struct Qdisc_ops *qdisc_base; @@ -200,6 +205,58 @@ int unregister_qdisc(struct Qdisc_ops *qops) } EXPORT_SYMBOL(unregister_qdisc); +/* Get default qdisc if not otherwise specified */ +void qdisc_get_default(char *name, size_t len) +{ + read_lock(&qdisc_mod_lock); + strlcpy(name, default_qdisc_ops->id, len); + read_unlock(&qdisc_mod_lock); +} + +static struct Qdisc_ops *qdisc_lookup_default(const char *name) +{ + struct Qdisc_ops *q = NULL; + + for (q = qdisc_base; q; q = q->next) { + if (!strcmp(name, q->id)) { + if (!try_module_get(q->owner)) + q = NULL; + break; + } + } + + return q; +} + +/* Set new default qdisc to use */ +int qdisc_set_default(const char *name) +{ + const struct Qdisc_ops *ops; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + write_lock(&qdisc_mod_lock); + ops = qdisc_lookup_default(name); + if (!ops) { + /* Not found, drop lock and try to load module */ + write_unlock(&qdisc_mod_lock); + request_module("sch_%s", name); + write_lock(&qdisc_mod_lock); + + ops = qdisc_lookup_default(name); + } + + if (ops) { + /* Set new default */ + module_put(default_qdisc_ops->owner); + default_qdisc_ops = ops; + } + write_unlock(&qdisc_mod_lock); + + return ops ? 0 : -ENOENT; +} + /* We know handle. Find qdisc among all qdisc's attached to device (root qdisc, all its children, children of children etc.) */ @@ -1854,6 +1911,7 @@ static int __init pktsched_init(void) return err; } + register_qdisc(&pfifo_fast_ops); register_qdisc(&pfifo_qdisc_ops); register_qdisc(&bfifo_qdisc_ops); register_qdisc(&pfifo_head_drop_qdisc_ops); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 48be3d5c0d9..5078e0c5db8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -530,7 +530,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { .dump = pfifo_fast_dump, .owner = THIS_MODULE, }; -EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; @@ -583,6 +582,9 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, { struct Qdisc *sch; + if (!try_module_get(ops->owner)) + goto errout; + sch = qdisc_alloc(dev_queue, ops); if (IS_ERR(sch)) goto errout; @@ -686,7 +688,7 @@ static void attach_one_default_qdisc(struct net_device *dev, if (dev->tx_queue_len) { qdisc = qdisc_create_dflt(dev_queue, - &pfifo_fast_ops, TC_H_ROOT); + default_qdisc_ops, TC_H_ROOT); if (!qdisc) { netdev_info(dev, "activation failed\n"); return; @@ -739,9 +741,8 @@ void dev_activate(struct net_device *dev) int need_watchdog; /* No queueing discipline is attached to device; - create default one i.e. pfifo_fast for devices, - which need queueing and noqueue_qdisc for - virtual interfaces + * create default one for devices, which need queueing + * and noqueue_qdisc for virtual interfaces */ if (dev->qdisc == &noop_qdisc) diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 5da78a19ac9..2e56185736d 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -57,7 +57,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); - qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1))); if (qdisc == NULL) diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index accec33c454..d44c868cb53 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -124,7 +124,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < dev->num_tx_queues; i++) { dev_queue = netdev_get_tx_queue(dev, i); - qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, + qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1))); if (qdisc == NULL) { -- cgit v1.2.3-70-g09d2 From d2a7f269f91299b8e5f6c7d2a41ba46248d73d24 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sat, 31 Aug 2013 10:15:50 -0700 Subject: qdisc: make args to qdisc_create_default const Fixes warnings introduced by the qdisc default patch. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/sch_generic.h | 4 ++-- net/sched/sch_generic.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 2e31bf14df0..f4eb365f7dc 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -370,9 +370,9 @@ void qdisc_reset(struct Qdisc *qdisc); void qdisc_destroy(struct Qdisc *qdisc); void qdisc_tree_decrease_qlen(struct Qdisc *qdisc, unsigned int n); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops); + const struct Qdisc_ops *ops); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops, u32 parentid); + const struct Qdisc_ops *ops, u32 parentid); void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab); void tcf_destroy(struct tcf_proto *tp); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5078e0c5db8..6b021106542 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -534,7 +534,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { static struct lock_class_key qdisc_tx_busylock; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops) + const struct Qdisc_ops *ops) { void *p; struct Qdisc *sch; @@ -578,7 +578,8 @@ errout: } struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, - struct Qdisc_ops *ops, unsigned int parentid) + const struct Qdisc_ops *ops, + unsigned int parentid) { struct Qdisc *sch; -- cgit v1.2.3-70-g09d2 From 3ce9b35ff6de8dfebb0b0045e667c000f632e563 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 31 Aug 2013 13:44:28 +0800 Subject: ipv6: move ip6_dst_hoplimit() into core kernel It will be used by vxlan, and may not be inlined. Cc: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/ip6_route.h | 2 -- include/net/ipv6.h | 2 ++ net/ipv6/output_core.c | 22 ++++++++++++++++++++++ net/ipv6/route.c | 19 ------------------- 4 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f667248202b..f525e7038cc 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -112,8 +112,6 @@ extern struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, const struct in6_addr *addr, bool anycast); -extern int ip6_dst_hoplimit(struct dst_entry *dst); - /* * support functions for ND * diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7bdff045506..bbf1c8fb851 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -658,6 +658,8 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add extern void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt); +extern int ip6_dst_hoplimit(struct dst_entry *dst); + /* * Header manipulation */ diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index ab92a3673fb..53a062187db 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -5,6 +5,7 @@ #include #include #include +#include void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) { @@ -75,3 +76,24 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) return offset; } EXPORT_SYMBOL(ip6_find_1stfragopt); + +#if IS_ENABLED(CONFIG_IPV6) +int ip6_dst_hoplimit(struct dst_entry *dst) +{ + int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + if (hoplimit == 0) { + struct net_device *dev = dst->dev; + struct inet6_dev *idev; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) + hoplimit = idev->cnf.hop_limit; + else + hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; + rcu_read_unlock(); + } + return hoplimit; +} +EXPORT_SYMBOL(ip6_dst_hoplimit); +#endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 55236a84c74..b770085ae36 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1354,25 +1354,6 @@ out: return entries > rt_max_size; } -int ip6_dst_hoplimit(struct dst_entry *dst) -{ - int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); - if (hoplimit == 0) { - struct net_device *dev = dst->dev; - struct inet6_dev *idev; - - rcu_read_lock(); - idev = __in6_dev_get(dev); - if (idev) - hoplimit = idev->cnf.hop_limit; - else - hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; - rcu_read_unlock(); - } - return hoplimit; -} -EXPORT_SYMBOL(ip6_dst_hoplimit); - /* * */ -- cgit v1.2.3-70-g09d2 From 5f81bd2e5d804ca93f3ec8873451b22d2f454721 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 31 Aug 2013 13:44:30 +0800 Subject: ipv6: export a stub for IPv6 symbols used by vxlan In case IPv6 is compiled as a module, introduce a stub for ipv6_sock_mc_join and ipv6_sock_mc_drop etc.. It will be used by vxlan module. Suggested by Ben. This is an ugly but easy solution for now. Cc: Ben Hutchings Cc: Stephen Hemminger Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/addrconf.h | 15 +++++++++++++++ net/ipv6/addrconf_core.c | 3 +++ net/ipv6/af_inet6.c | 11 +++++++++++ 3 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 43fa31a610b..5339cab356b 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -141,6 +141,21 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); void ipv6_mc_dad_complete(struct inet6_dev *idev); + +/* A stub used by vxlan module. This is ugly, ideally these + * symbols should be built into the core kernel. + */ +struct ipv6_stub { + int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex, + const struct in6_addr *addr); + int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex, + const struct in6_addr *addr); + int (*ipv6_dst_lookup)(struct sock *sk, struct dst_entry **dst, + struct flowi6 *fl6); + void (*udpv6_encap_enable)(void); +}; +extern const struct ipv6_stub *ipv6_stub __read_mostly; + /* * identify MLD packets for MLD filter exceptions */ diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index d2f87427244..93b24c6bcae 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -98,3 +98,6 @@ int inet6addr_notifier_call_chain(unsigned long val, void *v) return atomic_notifier_call_chain(&inet6addr_chain, val, v); } EXPORT_SYMBOL(inet6addr_notifier_call_chain); + +const struct ipv6_stub *ipv6_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 0d1a9b153fb..0c9c22f7487 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -810,6 +810,13 @@ static struct pernet_operations inet6_net_ops = { .exit = inet6_net_exit, }; +static const struct ipv6_stub ipv6_stub_impl = { + .ipv6_sock_mc_join = ipv6_sock_mc_join, + .ipv6_sock_mc_drop = ipv6_sock_mc_drop, + .ipv6_dst_lookup = ip6_dst_lookup, + .udpv6_encap_enable = udpv6_encap_enable, +}; + static int __init inet6_init(void) { struct list_head *r; @@ -884,6 +891,9 @@ static int __init inet6_init(void) err = igmp6_init(); if (err) goto igmp_fail; + + ipv6_stub = &ipv6_stub_impl; + err = ipv6_netfilter_init(); if (err) goto netfilter_fail; @@ -1040,6 +1050,7 @@ static void __exit inet6_exit(void) raw6_proc_exit(); #endif ipv6_netfilter_fini(); + ipv6_stub = NULL; igmp6_cleanup(); ndisc_cleanup(); ip6_mr_cleanup(); -- cgit v1.2.3-70-g09d2 From e4c7ed415387cf718ffbec305396c30cee092987 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 31 Aug 2013 13:44:33 +0800 Subject: vxlan: add ipv6 support This patch adds IPv6 support to vxlan device, as the new version RFC already mentions it: http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-03 Cc: David Stevens Cc: Stephen Hemminger Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 764 ++++++++++++++++++++++++++++++++++-------- include/net/vxlan.h | 2 +- include/uapi/linux/if_link.h | 2 + net/openvswitch/vport-vxlan.c | 2 +- 4 files changed, 622 insertions(+), 148 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3b21aca0c0c..faf131e02a7 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -6,9 +6,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * TODO - * - IPv6 (not in RFC) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -43,6 +40,11 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#include +#endif #define VXLAN_VERSION "0.1" @@ -59,6 +61,8 @@ #define VXLAN_VID_MASK (VXLAN_N_VID - 1) /* IP header + UDP + VXLAN + Ethernet header */ #define VXLAN_HEADROOM (20 + 8 + 8 + 14) +/* IPv6 header + UDP + VXLAN + Ethernet header */ +#define VXLAN6_HEADROOM (40 + 8 + 8 + 14) #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) #define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */ @@ -92,8 +96,14 @@ struct vxlan_net { spinlock_t sock_lock; }; +union vxlan_addr { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct sockaddr sa; +}; + struct vxlan_rdst { - __be32 remote_ip; + union vxlan_addr remote_ip; __be16 remote_port; u32 remote_vni; u32 remote_ifindex; @@ -120,7 +130,7 @@ struct vxlan_dev { struct vxlan_sock *vn_sock; /* listening socket */ struct net_device *dev; struct vxlan_rdst default_dst; /* default destination */ - __be32 saddr; /* source address */ + union vxlan_addr saddr; /* source address */ __be16 dst_port; __u16 port_min; /* source port range */ __u16 port_max; @@ -146,6 +156,7 @@ struct vxlan_dev { #define VXLAN_F_RSC 0x04 #define VXLAN_F_L2MISS 0x08 #define VXLAN_F_L3MISS 0x10 +#define VXLAN_F_IPV6 0x20 /* internal flag */ /* salt for hash table */ static u32 vxlan_salt __read_mostly; @@ -153,6 +164,96 @@ static struct workqueue_struct *vxlan_wq; static void vxlan_sock_work(struct work_struct *work); +#if IS_ENABLED(CONFIG_IPV6) +static inline +bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) +{ + if (a->sa.sa_family != b->sa.sa_family) + return false; + if (a->sa.sa_family == AF_INET6) + return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); + else + return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; +} + +static inline bool vxlan_addr_any(const union vxlan_addr *ipa) +{ + if (ipa->sa.sa_family == AF_INET6) + return ipv6_addr_any(&ipa->sin6.sin6_addr); + else + return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); +} + +static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) +{ + if (ipa->sa.sa_family == AF_INET6) + return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr); + else + return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); +} + +static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) +{ + if (nla_len(nla) >= sizeof(struct in6_addr)) { + nla_memcpy(&ip->sin6.sin6_addr, nla, sizeof(struct in6_addr)); + ip->sa.sa_family = AF_INET6; + return 0; + } else if (nla_len(nla) >= sizeof(__be32)) { + ip->sin.sin_addr.s_addr = nla_get_be32(nla); + ip->sa.sa_family = AF_INET; + return 0; + } else { + return -EAFNOSUPPORT; + } +} + +static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, + const union vxlan_addr *ip) +{ + if (ip->sa.sa_family == AF_INET6) + return nla_put(skb, attr, sizeof(struct in6_addr), &ip->sin6.sin6_addr); + else + return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr); +} + +#else /* !CONFIG_IPV6 */ + +static inline +bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) +{ + return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; +} + +static inline bool vxlan_addr_any(const union vxlan_addr *ipa) +{ + return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); +} + +static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) +{ + return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); +} + +static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) +{ + if (nla_len(nla) >= sizeof(struct in6_addr)) { + return -EAFNOSUPPORT; + } else if (nla_len(nla) >= sizeof(__be32)) { + ip->sin.sin_addr.s_addr = nla_get_be32(nla); + ip->sa.sa_family = AF_INET; + return 0; + } else { + return -EAFNOSUPPORT; + } +} + +static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, + const union vxlan_addr *ip) +{ + return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr); +} +#endif + /* Virtual Network hash table head */ static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id) { @@ -239,7 +340,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (type == RTM_GETNEIGH) { ndm->ndm_family = AF_INET; - send_ip = rdst->remote_ip != htonl(INADDR_ANY); + send_ip = !vxlan_addr_any(&rdst->remote_ip); send_eth = !is_zero_ether_addr(fdb->eth_addr); } else ndm->ndm_family = AF_BRIDGE; @@ -251,7 +352,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; - if (send_ip && nla_put_be32(skb, NDA_DST, rdst->remote_ip)) + if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) goto nla_put_failure; if (rdst->remote_port && rdst->remote_port != vxlan->dst_port && @@ -283,7 +384,7 @@ static inline size_t vxlan_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ - + nla_total_size(sizeof(__be32)) /* NDA_DST */ + + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ + nla_total_size(sizeof(__be16)) /* NDA_PORT */ + nla_total_size(sizeof(__be32)) /* NDA_VNI */ + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ @@ -317,14 +418,14 @@ errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } -static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) +static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb f = { .state = NUD_STALE, }; struct vxlan_rdst remote = { - .remote_ip = ipa, /* goes to NDA_DST */ + .remote_ip = *ipa, /* goes to NDA_DST */ .remote_vni = VXLAN_N_VID, }; @@ -397,13 +498,13 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, /* caller should hold vxlan->hash_lock */ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, - __be32 ip, __be16 port, + union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex) { struct vxlan_rdst *rd; list_for_each_entry(rd, &f->remotes, list) { - if (rd->remote_ip == ip && + if (vxlan_addr_equal(&rd->remote_ip, ip) && rd->remote_port == port && rd->remote_vni == vni && rd->remote_ifindex == ifindex) @@ -415,7 +516,7 @@ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, /* Replace destination of unicast mac */ static int vxlan_fdb_replace(struct vxlan_fdb *f, - __be32 ip, __be16 port, __u32 vni, __u32 ifindex) + union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex) { struct vxlan_rdst *rd; @@ -426,7 +527,7 @@ static int vxlan_fdb_replace(struct vxlan_fdb *f, rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); if (!rd) return 0; - rd->remote_ip = ip; + rd->remote_ip = *ip; rd->remote_port = port; rd->remote_vni = vni; rd->remote_ifindex = ifindex; @@ -435,7 +536,7 @@ static int vxlan_fdb_replace(struct vxlan_fdb *f, /* Add/update destinations for multicast */ static int vxlan_fdb_append(struct vxlan_fdb *f, - __be32 ip, __be16 port, __u32 vni, __u32 ifindex) + union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex) { struct vxlan_rdst *rd; @@ -446,7 +547,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) return -ENOBUFS; - rd->remote_ip = ip; + rd->remote_ip = *ip; rd->remote_port = port; rd->remote_vni = vni; rd->remote_ifindex = ifindex; @@ -458,7 +559,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, /* Add new entry to forwarding table -- assumes lock held */ static int vxlan_fdb_create(struct vxlan_dev *vxlan, - const u8 *mac, __be32 ip, + const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __u32 vni, __u32 ifindex, __u8 ndm_flags) @@ -517,7 +618,7 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))) return -EOPNOTSUPP; - netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip); + netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return -ENOMEM; @@ -565,17 +666,26 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) } static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, - __be32 *ip, __be16 *port, u32 *vni, u32 *ifindex) + union vxlan_addr *ip, __be16 *port, u32 *vni, u32 *ifindex) { struct net *net = dev_net(vxlan->dev); + int err; if (tb[NDA_DST]) { - if (nla_len(tb[NDA_DST]) != sizeof(__be32)) - return -EAFNOSUPPORT; - - *ip = nla_get_be32(tb[NDA_DST]); + err = vxlan_nla_get_addr(ip, tb[NDA_DST]); + if (err) + return err; } else { - *ip = htonl(INADDR_ANY); + union vxlan_addr *remote = &vxlan->default_dst.remote_ip; + if (remote->sa.sa_family == AF_INET) { + ip->sin.sin_addr.s_addr = htonl(INADDR_ANY); + ip->sa.sa_family = AF_INET; +#if IS_ENABLED(CONFIG_IPV6) + } else { + ip->sin6.sin6_addr = in6addr_any; + ip->sa.sa_family = AF_INET6; +#endif + } } if (tb[NDA_PORT]) { @@ -618,7 +728,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], { struct vxlan_dev *vxlan = netdev_priv(dev); /* struct net *net = dev_net(vxlan->dev); */ - __be32 ip; + union vxlan_addr ip; __be16 port; u32 vni, ifindex; int err; @@ -637,7 +747,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return err; spin_lock_bh(&vxlan->hash_lock); - err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags, + err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags, port, vni, ifindex, ndm->ndm_flags); spin_unlock_bh(&vxlan->hash_lock); @@ -652,7 +762,7 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; struct vxlan_rdst *rd = NULL; - __be32 ip; + union vxlan_addr ip; __be16 port; u32 vni, ifindex; int err; @@ -668,8 +778,8 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], if (!f) goto out; - if (ip != htonl(INADDR_ANY)) { - rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); + if (!vxlan_addr_any(&ip)) { + rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex); if (!rd) goto out; } @@ -732,7 +842,7 @@ out: * Return true if packet is bogus and should be droppped. */ static bool vxlan_snoop(struct net_device *dev, - __be32 src_ip, const u8 *src_mac) + union vxlan_addr *src_ip, const u8 *src_mac) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; @@ -741,7 +851,7 @@ static bool vxlan_snoop(struct net_device *dev, if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); - if (likely(rdst->remote_ip == src_ip)) + if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip))) return false; /* Don't migrate static entries, drop packets */ @@ -750,10 +860,10 @@ static bool vxlan_snoop(struct net_device *dev, if (net_ratelimit()) netdev_info(dev, - "%pM migrated from %pI4 to %pI4\n", + "%pM migrated from %pIS to %pIS\n", src_mac, &rdst->remote_ip, &src_ip); - rdst->remote_ip = src_ip; + rdst->remote_ip = *src_ip; f->updated = jiffies; vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH); } else { @@ -775,7 +885,7 @@ static bool vxlan_snoop(struct net_device *dev, } /* See if multicast group is already in use by other ID */ -static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip) +static bool vxlan_group_used(struct vxlan_net *vn, union vxlan_addr *remote_ip) { struct vxlan_dev *vxlan; @@ -783,7 +893,8 @@ static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip) if (!netif_running(vxlan->dev)) continue; - if (vxlan->default_dst.remote_ip == remote_ip) + if (vxlan_addr_equal(&vxlan->default_dst.remote_ip, + remote_ip)) return true; } @@ -819,13 +930,23 @@ static void vxlan_igmp_join(struct work_struct *work) struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; - struct ip_mreqn mreq = { - .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, - .imr_ifindex = vxlan->default_dst.remote_ifindex, - }; + union vxlan_addr *ip = &vxlan->default_dst.remote_ip; + int ifindex = vxlan->default_dst.remote_ifindex; lock_sock(sk); - ip_mc_join_group(sk, &mreq); + if (ip->sa.sa_family == AF_INET) { + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, + .imr_ifindex = ifindex, + }; + + ip_mc_join_group(sk, &mreq); +#if IS_ENABLED(CONFIG_IPV6) + } else { + ipv6_stub->ipv6_sock_mc_join(sk, ifindex, + &ip->sin6.sin6_addr); +#endif + } release_sock(sk); vxlan_sock_release(vs); @@ -838,13 +959,24 @@ static void vxlan_igmp_leave(struct work_struct *work) struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; - struct ip_mreqn mreq = { - .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, - .imr_ifindex = vxlan->default_dst.remote_ifindex, - }; + union vxlan_addr *ip = &vxlan->default_dst.remote_ip; + int ifindex = vxlan->default_dst.remote_ifindex; lock_sock(sk); - ip_mc_leave_group(sk, &mreq); + if (ip->sa.sa_family == AF_INET) { + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, + .imr_ifindex = ifindex, + }; + + ip_mc_leave_group(sk, &mreq); +#if IS_ENABLED(CONFIG_IPV6) + } else { + ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, + &ip->sin6.sin6_addr); +#endif + } + release_sock(sk); vxlan_sock_release(vs); @@ -896,11 +1028,14 @@ error: static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) { - struct iphdr *oip; + struct iphdr *oip = NULL; + struct ipv6hdr *oip6 = NULL; struct vxlan_dev *vxlan; struct pcpu_tstats *stats; + union vxlan_addr saddr; __u32 vni; - int err; + int err = 0; + union vxlan_addr *remote_ip; vni = ntohl(vx_vni) >> 8; /* Is this VNI defined? */ @@ -908,6 +1043,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, if (!vxlan) goto drop; + remote_ip = &vxlan->default_dst.remote_ip; skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, vxlan->dev); @@ -917,9 +1053,20 @@ static void vxlan_rcv(struct vxlan_sock *vs, goto drop; /* Re-examine inner Ethernet packet */ - oip = ip_hdr(skb); + if (remote_ip->sa.sa_family == AF_INET) { + oip = ip_hdr(skb); + saddr.sin.sin_addr.s_addr = oip->saddr; + saddr.sa.sa_family = AF_INET; +#if IS_ENABLED(CONFIG_IPV6) + } else { + oip6 = ipv6_hdr(skb); + saddr.sin6.sin6_addr = oip6->saddr; + saddr.sa.sa_family = AF_INET6; +#endif + } + if ((vxlan->flags & VXLAN_F_LEARN) && - vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source)) + vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) goto drop; skb_reset_network_header(skb); @@ -935,11 +1082,20 @@ static void vxlan_rcv(struct vxlan_sock *vs, skb->encapsulation = 0; - err = IP_ECN_decapsulate(oip, skb); + if (oip6) + err = IP6_ECN_decapsulate(oip6, skb); + if (oip) + err = IP_ECN_decapsulate(oip, skb); + if (unlikely(err)) { - if (log_ecn_error) - net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", - &oip->saddr, oip->tos); + if (log_ecn_error) { + if (oip6) + net_info_ratelimited("non-ECT from %pI6\n", + &oip6->saddr); + if (oip) + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", + &oip->saddr, oip->tos); + } if (err > 1) { ++vxlan->dev->stats.rx_frame_errors; ++vxlan->dev->stats.rx_errors; @@ -1009,7 +1165,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) } f = vxlan_find_mac(vxlan, n->ha); - if (f && first_remote_rcu(f)->remote_ip == htonl(INADDR_ANY)) { + if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); goto out; @@ -1027,8 +1183,14 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) if (netif_rx_ni(reply) == NET_RX_DROP) dev->stats.rx_dropped++; - } else if (vxlan->flags & VXLAN_F_L3MISS) - vxlan_ip_miss(dev, tip); + } else if (vxlan->flags & VXLAN_F_L3MISS) { + union vxlan_addr ipa = { + .sin.sin_addr.s_addr = tip, + .sa.sa_family = AF_INET, + }; + + vxlan_ip_miss(dev, &ipa); + } out: consume_skb(skb); return NETDEV_TX_OK; @@ -1050,6 +1212,16 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) return false; pip = ip_hdr(skb); n = neigh_lookup(&arp_tbl, &pip->daddr, dev); + if (!n && (vxlan->flags & VXLAN_F_L3MISS)) { + union vxlan_addr ipa = { + .sin.sin_addr.s_addr = pip->daddr, + .sa.sa_family = AF_INET, + }; + + vxlan_ip_miss(dev, &ipa); + return false; + } + break; default: return false; @@ -1066,8 +1238,8 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) } neigh_release(n); return diff; - } else if (vxlan->flags & VXLAN_F_L3MISS) - vxlan_ip_miss(dev, pip->daddr); + } + return false; } @@ -1118,6 +1290,102 @@ static int handle_offloads(struct sk_buff *skb) return 0; } +#if IS_ENABLED(CONFIG_IPV6) +static int vxlan6_xmit_skb(struct net *net, struct vxlan_sock *vs, + struct dst_entry *dst, struct sk_buff *skb, + struct net_device *dev, struct in6_addr *saddr, + struct in6_addr *daddr, __u8 prio, __u8 ttl, + __be16 src_port, __be16 dst_port, __be32 vni) +{ + struct ipv6hdr *ip6h; + struct vxlanhdr *vxh; + struct udphdr *uh; + int min_headroom; + int err; + + if (!skb->encapsulation) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + + VXLAN_HLEN + sizeof(struct ipv6hdr) + + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + /* Need space for new headers (invalidates iph ptr) */ + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) + return err; + + if (vlan_tx_tag_present(skb)) { + if (WARN_ON(!__vlan_put_tag(skb, + skb->vlan_proto, + vlan_tx_tag_get(skb)))) + return -ENOMEM; + + skb->vlan_tci = 0; + } + + vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); + vxh->vx_flags = htonl(VXLAN_FLAGS); + vxh->vx_vni = vni; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + + uh->len = htons(skb->len); + uh->check = 0; + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | + IPSKB_REROUTED); + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + if (!skb_is_gso(skb) && !(dst->dev->features & NETIF_F_IPV6_CSUM)) { + __wsum csum = skb_checksum(skb, 0, skb->len, 0); + skb->ip_summed = CHECKSUM_UNNECESSARY; + uh->check = csum_ipv6_magic(saddr, daddr, skb->len, + IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~csum_ipv6_magic(saddr, daddr, + skb->len, IPPROTO_UDP, 0); + } + + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6h->version = 6; + ip6h->priority = prio; + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + + vxlan_set_owner(vs->sock->sk, skb); + + err = handle_offloads(skb); + if (err) + return err; + + ip6tunnel_xmit(skb, dev); + return 0; +} +#endif + int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, @@ -1182,15 +1450,26 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, { struct pcpu_tstats *tx_stats = this_cpu_ptr(src_vxlan->dev->tstats); struct pcpu_tstats *rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats); + union vxlan_addr loopback; + union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb->dev = dst_vxlan->dev; __skb_pull(skb, skb_network_offset(skb)); + if (remote_ip->sa.sa_family == AF_INET) { + loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + loopback.sa.sa_family = AF_INET; +#if IS_ENABLED(CONFIG_IPV6) + } else { + loopback.sin6.sin6_addr = in6addr_loopback; + loopback.sa.sa_family = AF_INET6; +#endif + } + if (dst_vxlan->flags & VXLAN_F_LEARN) - vxlan_snoop(skb->dev, htonl(INADDR_LOOPBACK), - eth_hdr(skb)->h_source); + vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source); u64_stats_update_begin(&tx_stats->syncp); tx_stats->tx_packets++; @@ -1211,11 +1490,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct rtable *rt; + struct rtable *rt = NULL; const struct iphdr *old_iph; struct flowi4 fl4; - __be32 dst; - __be16 src_port, dst_port; + union vxlan_addr *dst; + __be16 src_port = 0, dst_port; u32 vni; __be16 df = 0; __u8 tos, ttl; @@ -1223,9 +1502,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; vni = rdst->remote_vni; - dst = rdst->remote_ip; + dst = &rdst->remote_ip; - if (!dst) { + if (vxlan_addr_any(dst)) { if (did_rsc) { /* short-circuited back to local bridge */ vxlan_encap_bypass(skb, vxlan, vxlan); @@ -1237,7 +1516,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, old_iph = ip_hdr(skb); ttl = vxlan->ttl; - if (!ttl && IN_MULTICAST(ntohl(dst))) + if (!ttl && vxlan_addr_multicast(dst)) ttl = 1; tos = vxlan->tos; @@ -1246,48 +1525,101 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, src_port = vxlan_src_port(vxlan->port_min, vxlan->port_max, skb); - memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = rdst->remote_ifindex; - fl4.flowi4_tos = RT_TOS(tos); - fl4.daddr = dst; - fl4.saddr = vxlan->saddr; - - rt = ip_route_output_key(dev_net(dev), &fl4); - if (IS_ERR(rt)) { - netdev_dbg(dev, "no route to %pI4\n", &dst); - dev->stats.tx_carrier_errors++; - goto tx_error; - } + if (dst->sa.sa_family == AF_INET) { + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_oif = rdst->remote_ifindex; + fl4.flowi4_tos = RT_TOS(tos); + fl4.daddr = dst->sin.sin_addr.s_addr; + fl4.saddr = vxlan->saddr.sin.sin_addr.s_addr; + + rt = ip_route_output_key(dev_net(dev), &fl4); + if (IS_ERR(rt)) { + netdev_dbg(dev, "no route to %pI4\n", + &dst->sin.sin_addr.s_addr); + dev->stats.tx_carrier_errors++; + goto tx_error; + } - if (rt->dst.dev == dev) { - netdev_dbg(dev, "circular route to %pI4\n", &dst); - dev->stats.collisions++; - goto rt_tx_error; - } + if (rt->dst.dev == dev) { + netdev_dbg(dev, "circular route to %pI4\n", + &dst->sin.sin_addr.s_addr); + dev->stats.collisions++; + goto tx_error; + } + + /* Bypass encapsulation if the destination is local */ + if (rt->rt_flags & RTCF_LOCAL && + !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + struct vxlan_dev *dst_vxlan; + + ip_rt_put(rt); + dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port); + if (!dst_vxlan) + goto tx_error; + vxlan_encap_bypass(skb, vxlan, dst_vxlan); + return; + } - /* Bypass encapsulation if the destination is local */ - if (rt->rt_flags & RTCF_LOCAL && - !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { - struct vxlan_dev *dst_vxlan; + tos = ip_tunnel_ecn_encap(tos, old_iph, skb); + ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - ip_rt_put(rt); - dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port); - if (!dst_vxlan) + err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb, + fl4.saddr, dst->sin.sin_addr.s_addr, + tos, ttl, df, src_port, dst_port, + htonl(vni << 8)); + + if (err < 0) + goto rt_tx_error; + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct sock *sk = vxlan->vn_sock->sock->sk; + struct dst_entry *ndst; + struct flowi6 fl6; + u32 flags; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = rdst->remote_ifindex; + fl6.daddr = dst->sin6.sin6_addr; + fl6.saddr = vxlan->saddr.sin6.sin6_addr; + fl6.flowi6_proto = skb->protocol; + + if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) { + netdev_dbg(dev, "no route to %pI6\n", + &dst->sin6.sin6_addr); + dev->stats.tx_carrier_errors++; goto tx_error; - vxlan_encap_bypass(skb, vxlan, dst_vxlan); - return; - } + } - tos = ip_tunnel_ecn_encap(tos, old_iph, skb); - ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); + if (ndst->dev == dev) { + netdev_dbg(dev, "circular route to %pI6\n", + &dst->sin6.sin6_addr); + dst_release(ndst); + dev->stats.collisions++; + goto tx_error; + } - err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb, - fl4.saddr, dst, tos, ttl, df, - src_port, dst_port, htonl(vni << 8)); + /* Bypass encapsulation if the destination is local */ + flags = ((struct rt6_info *)ndst)->rt6i_flags; + if (flags & RTF_LOCAL && + !(flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + struct vxlan_dev *dst_vxlan; + + dst_release(ndst); + dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port); + if (!dst_vxlan) + goto tx_error; + vxlan_encap_bypass(skb, vxlan, dst_vxlan); + return; + } - if (err < 0) - goto rt_tx_error; - iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + ttl = ttl ? : ip6_dst_hoplimit(ndst); + + err = vxlan6_xmit_skb(dev_net(dev), vxlan->vn_sock, ndst, skb, + dev, &fl6.saddr, &fl6.daddr, 0, ttl, + src_port, dst_port, htonl(vni << 8)); +#endif + } return; @@ -1464,8 +1796,8 @@ static int vxlan_open(struct net_device *dev) if (!vs) return -ENOTCONN; - if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) && - vxlan_group_used(vn, vxlan->default_dst.remote_ip)) { + if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && + vxlan_group_used(vn, &vxlan->default_dst.remote_ip)) { vxlan_sock_hold(vs); dev_hold(dev); queue_work(vxlan_wq, &vxlan->igmp_join); @@ -1503,8 +1835,8 @@ static int vxlan_stop(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_sock *vs = vxlan->vn_sock; - if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) && - ! vxlan_group_used(vn, vxlan->default_dst.remote_ip)) { + if (vs && vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && + ! vxlan_group_used(vn, &vxlan->default_dst.remote_ip)) { vxlan_sock_hold(vs); dev_hold(dev); queue_work(vxlan_wq, &vxlan->igmp_leave); @@ -1552,7 +1884,10 @@ static void vxlan_setup(struct net_device *dev) eth_hw_addr_random(dev); ether_setup(dev); - dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM; + if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET6) + dev->hard_header_len = ETH_HLEN + VXLAN6_HEADROOM; + else + dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM; dev->netdev_ops = &vxlan_netdev_ops; dev->destructor = free_netdev; @@ -1597,8 +1932,10 @@ static void vxlan_setup(struct net_device *dev) static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, [IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, @@ -1669,58 +2006,132 @@ static void vxlan_del_work(struct work_struct *work) kfree_rcu(vs, rcu); } -static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data) +#if IS_ENABLED(CONFIG_IPV6) +/* Create UDP socket for encapsulation receive. AF_INET6 socket + * could be used for both IPv4 and IPv6 communications, but + * users may set bindv6only=1. + */ +static int create_v6_sock(struct net *net, __be16 port, struct socket **psock) +{ + struct sock *sk; + struct socket *sock; + struct sockaddr_in6 vxlan_addr = { + .sin6_family = AF_INET6, + .sin6_port = port, + }; + int rc, val = 1; + + rc = sock_create_kern(AF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (rc < 0) { + pr_debug("UDPv6 socket create failed\n"); + return rc; + } + + /* Put in proper namespace */ + sk = sock->sk; + sk_change_net(sk, net); + + kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, + (char *)&val, sizeof(val)); + rc = kernel_bind(sock, (struct sockaddr *)&vxlan_addr, + sizeof(struct sockaddr_in6)); + if (rc < 0) { + pr_debug("bind for UDPv6 socket %pI6:%u (%d)\n", + &vxlan_addr.sin6_addr, ntohs(vxlan_addr.sin6_port), rc); + sk_release_kernel(sk); + return rc; + } + /* At this point, IPv6 module should have been loaded in + * sock_create_kern(). + */ + BUG_ON(!ipv6_stub); + + *psock = sock; + /* Disable multicast loopback */ + inet_sk(sk)->mc_loop = 0; + return 0; +} + +#else + +static int create_v6_sock(struct net *net, __be16 port, struct socket **psock) +{ + return -EPFNOSUPPORT; +} +#endif + +static int create_v4_sock(struct net *net, __be16 port, struct socket **psock) { - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - struct vxlan_sock *vs; struct sock *sk; + struct socket *sock; struct sockaddr_in vxlan_addr = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = port, }; int rc; - unsigned int h; - - vs = kmalloc(sizeof(*vs), GFP_KERNEL); - if (!vs) { - pr_debug("memory alocation failure\n"); - return ERR_PTR(-ENOMEM); - } - - for (h = 0; h < VNI_HASH_SIZE; ++h) - INIT_HLIST_HEAD(&vs->vni_list[h]); - - INIT_WORK(&vs->del_work, vxlan_del_work); /* Create UDP socket for encapsulation receive. */ - rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vs->sock); + rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (rc < 0) { pr_debug("UDP socket create failed\n"); - kfree(vs); - return ERR_PTR(rc); + return rc; } /* Put in proper namespace */ - sk = vs->sock->sk; + sk = sock->sk; sk_change_net(sk, net); - rc = kernel_bind(vs->sock, (struct sockaddr *) &vxlan_addr, + rc = kernel_bind(sock, (struct sockaddr *) &vxlan_addr, sizeof(vxlan_addr)); if (rc < 0) { pr_debug("bind for UDP socket %pI4:%u (%d)\n", &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc); sk_release_kernel(sk); + return rc; + } + + *psock = sock; + /* Disable multicast loopback */ + inet_sk(sk)->mc_loop = 0; + return 0; +} + +/* Create new listen socket if needed */ +static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, + vxlan_rcv_t *rcv, void *data, bool ipv6) +{ + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_sock *vs; + struct socket *sock; + struct sock *sk; + int rc = 0; + unsigned int h; + + vs = kmalloc(sizeof(*vs), GFP_KERNEL); + if (!vs) + return ERR_PTR(-ENOMEM); + + for (h = 0; h < VNI_HASH_SIZE; ++h) + INIT_HLIST_HEAD(&vs->vni_list[h]); + + INIT_WORK(&vs->del_work, vxlan_del_work); + + if (ipv6) + rc = create_v6_sock(net, port, &sock); + else + rc = create_v4_sock(net, port, &sock); + if (rc < 0) { kfree(vs); return ERR_PTR(rc); } + + vs->sock = sock; + sk = sock->sk; atomic_set(&vs->refcnt, 1); vs->rcv = rcv; vs->data = data; - /* Disable multicast loopback */ - inet_sk(sk)->mc_loop = 0; spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); spin_unlock(&vn->sock_lock); @@ -1728,18 +2139,24 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, /* Mark socket as an encapsulation socket. */ udp_sk(sk)->encap_type = 1; udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; - udp_encap_enable(); +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6) + ipv6_stub->udpv6_encap_enable(); + else +#endif + udp_encap_enable(); + return vs; } struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, vxlan_rcv_t *rcv, void *data, - bool no_share) + bool no_share, bool ipv6) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; - vs = vxlan_socket_create(net, port, rcv, data); + vs = vxlan_socket_create(net, port, rcv, data, ipv6); if (!IS_ERR(vs)) return vs; @@ -1772,7 +2189,7 @@ static void vxlan_sock_work(struct work_struct *work) __be16 port = vxlan->dst_port; struct vxlan_sock *nvs; - nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false); + nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags & VXLAN_F_IPV6); spin_lock(&vn->sock_lock); if (!IS_ERR(nvs)) vxlan_vs_add_dev(nvs, vxlan); @@ -1789,6 +2206,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, struct vxlan_rdst *dst = &vxlan->default_dst; __u32 vni; int err; + bool use_ipv6 = false; if (!data[IFLA_VXLAN_ID]) return -EINVAL; @@ -1796,11 +2214,32 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, vni = nla_get_u32(data[IFLA_VXLAN_ID]); dst->remote_vni = vni; - if (data[IFLA_VXLAN_GROUP]) - dst->remote_ip = nla_get_be32(data[IFLA_VXLAN_GROUP]); + if (data[IFLA_VXLAN_GROUP]) { + dst->remote_ip.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_GROUP]); + dst->remote_ip.sa.sa_family = AF_INET; + } else if (data[IFLA_VXLAN_GROUP6]) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; + + nla_memcpy(&dst->remote_ip.sin6.sin6_addr, data[IFLA_VXLAN_GROUP6], + sizeof(struct in6_addr)); + dst->remote_ip.sa.sa_family = AF_INET6; + use_ipv6 = true; + } - if (data[IFLA_VXLAN_LOCAL]) - vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]); + if (data[IFLA_VXLAN_LOCAL]) { + vxlan->saddr.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_LOCAL]); + vxlan->saddr.sa.sa_family = AF_INET; + } else if (data[IFLA_VXLAN_LOCAL6]) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; + + /* TODO: respect scope id */ + nla_memcpy(&vxlan->saddr.sin6.sin6_addr, data[IFLA_VXLAN_LOCAL6], + sizeof(struct in6_addr)); + vxlan->saddr.sa.sa_family = AF_INET6; + use_ipv6 = true; + } if (data[IFLA_VXLAN_LINK] && (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) { @@ -1812,12 +2251,23 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, return -ENODEV; } +#if IS_ENABLED(CONFIG_IPV6) + if (use_ipv6) { + struct inet6_dev *idev = __in6_dev_get(lowerdev); + if (idev && idev->cnf.disable_ipv6) { + pr_info("IPv6 is disabled via sysctl\n"); + return -EPERM; + } + vxlan->flags |= VXLAN_F_IPV6; + } +#endif + if (!tb[IFLA_MTU]) - dev->mtu = lowerdev->mtu - VXLAN_HEADROOM; + dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); /* update header length based on lower device */ dev->hard_header_len = lowerdev->hard_header_len + - VXLAN_HEADROOM; + (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); } if (data[IFLA_VXLAN_TOS]) @@ -1868,7 +2318,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, /* create an fdb entry for default destination */ err = vxlan_fdb_create(vxlan, all_zeros_mac, - vxlan->default_dst.remote_ip, + &vxlan->default_dst.remote_ip, NUD_REACHABLE|NUD_PERMANENT, NLM_F_EXCL|NLM_F_CREATE, vxlan->dst_port, vxlan->default_dst.remote_vni, @@ -1905,9 +2355,9 @@ static size_t vxlan_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ - nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */ + nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ - nla_total_size(sizeof(__be32))+ /* IFLA_VXLAN_LOCAL */ + nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ @@ -1934,14 +2384,36 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni)) goto nla_put_failure; - if (dst->remote_ip && nla_put_be32(skb, IFLA_VXLAN_GROUP, dst->remote_ip)) - goto nla_put_failure; + if (!vxlan_addr_any(&dst->remote_ip)) { + if (dst->remote_ip.sa.sa_family == AF_INET) { + if (nla_put_be32(skb, IFLA_VXLAN_GROUP, + dst->remote_ip.sin.sin_addr.s_addr)) + goto nla_put_failure; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (nla_put(skb, IFLA_VXLAN_GROUP6, sizeof(struct in6_addr), + &dst->remote_ip.sin6.sin6_addr)) + goto nla_put_failure; +#endif + } + } if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) goto nla_put_failure; - if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr)) - goto nla_put_failure; + if (!vxlan_addr_any(&vxlan->saddr)) { + if (vxlan->saddr.sa.sa_family == AF_INET) { + if (nla_put_be32(skb, IFLA_VXLAN_LOCAL, + vxlan->saddr.sin.sin_addr.s_addr)) + goto nla_put_failure; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr), + &vxlan->saddr.sin6.sin6_addr)) + goto nla_put_failure; +#endif + } + } if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) || nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) || diff --git a/include/net/vxlan.h b/include/net/vxlan.h index ad342e3688a..d2b88cafa7a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -25,7 +25,7 @@ struct vxlan_sock { struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, vxlan_rcv_t *rcv, void *data, - bool no_share); + bool no_share, bool ipv6); void vxlan_sock_release(struct vxlan_sock *vs); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 04c0e7a5d48..80394e8dc3a 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -314,6 +314,8 @@ enum { IFLA_VXLAN_L2MISS, IFLA_VXLAN_L3MISS, IFLA_VXLAN_PORT, /* destination port */ + IFLA_VXLAN_GROUP6, + IFLA_VXLAN_LOCAL6, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 36848bd54a7..a0060245b4e 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -123,7 +123,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) vxlan_port = vxlan_vport(vport); strncpy(vxlan_port->name, parms->name, IFNAMSIZ); - vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true); + vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, false); if (IS_ERR(vs)) { ovs_vport_free(vport); return (void *)vs; -- cgit v1.2.3-70-g09d2 From e15a00aafa4b7953ad717d3cb1ad7acf4ff76945 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 31 Aug 2013 13:44:34 +0800 Subject: vxlan: add ipv6 route short circuit support route short circuit only has IPv4 part, this patch adds the IPv6 part. nd_tbl will be needed. Cc: David S. Miller Cc: David Stevens Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 30 ++++++++++++++++++++++++++++-- include/net/addrconf.h | 1 + net/ipv6/af_inet6.c | 1 + 3 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index faf131e02a7..c833763fe5c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1200,7 +1200,6 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); struct neighbour *n; - struct iphdr *pip; if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) return false; @@ -1208,6 +1207,9 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) n = NULL; switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: + { + struct iphdr *pip; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) return false; pip = ip_hdr(skb); @@ -1223,6 +1225,29 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) } break; + } +#if IS_ENABLED(CONFIG_IPV6) + case ETH_P_IPV6: + { + struct ipv6hdr *pip6; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return false; + pip6 = ipv6_hdr(skb); + n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev); + if (!n && (vxlan->flags & VXLAN_F_L3MISS)) { + union vxlan_addr ipa = { + .sin6.sin6_addr = pip6->daddr, + .sa.sa_family = AF_INET6, + }; + + vxlan_ip_miss(dev, &ipa); + return false; + } + + break; + } +#endif default: return false; } @@ -1659,7 +1684,8 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) did_rsc = false; if (f && (f->flags & NTF_ROUTER) && (vxlan->flags & VXLAN_F_RSC) && - ntohs(eth->h_proto) == ETH_P_IP) { + (ntohs(eth->h_proto) == ETH_P_IP || + ntohs(eth->h_proto) == ETH_P_IPV6)) { did_rsc = route_shortcircuit(dev, skb); if (did_rsc) f = vxlan_find_mac(vxlan, eth->h_dest); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 5339cab356b..bcf957341b6 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -153,6 +153,7 @@ struct ipv6_stub { int (*ipv6_dst_lookup)(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); void (*udpv6_encap_enable)(void); + struct neigh_table *nd_tbl; }; extern const struct ipv6_stub *ipv6_stub __read_mostly; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 0c9c22f7487..1996a7c34f7 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -815,6 +815,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .ipv6_sock_mc_drop = ipv6_sock_mc_drop, .ipv6_dst_lookup = ip6_dst_lookup, .udpv6_encap_enable = udpv6_encap_enable, + .nd_tbl = &nd_tbl, }; static int __init inet6_init(void) -- cgit v1.2.3-70-g09d2 From f564f45c451809aa3b74f577754528520d315ac1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 31 Aug 2013 13:44:36 +0800 Subject: vxlan: add ipv6 proxy support This patch adds the IPv6 version of "arp_reduce", ndisc_send_na() will be needed. Cc: David S. Miller Cc: David Stevens Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++-- include/net/addrconf.h | 4 +++ include/net/ndisc.h | 5 +++ net/ipv6/af_inet6.c | 2 ++ net/ipv6/ndisc.c | 8 ++--- 5 files changed, 95 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index c833763fe5c..3ffb22d684a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1196,6 +1196,70 @@ out: return NETDEV_TX_OK; } +#if IS_ENABLED(CONFIG_IPV6) +static int neigh_reduce(struct net_device *dev, struct sk_buff *skb) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct neighbour *n; + union vxlan_addr ipa; + const struct ipv6hdr *iphdr; + const struct in6_addr *saddr, *daddr; + struct nd_msg *msg; + struct inet6_dev *in6_dev = NULL; + + in6_dev = __in6_dev_get(dev); + if (!in6_dev) + goto out; + + if (!pskb_may_pull(skb, skb->len)) + goto out; + + iphdr = ipv6_hdr(skb); + saddr = &iphdr->saddr; + daddr = &iphdr->daddr; + + if (ipv6_addr_loopback(daddr) || + ipv6_addr_is_multicast(daddr)) + goto out; + + msg = (struct nd_msg *)skb_transport_header(skb); + if (msg->icmph.icmp6_code != 0 || + msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) + goto out; + + n = neigh_lookup(ipv6_stub->nd_tbl, daddr, dev); + + if (n) { + struct vxlan_fdb *f; + + if (!(n->nud_state & NUD_CONNECTED)) { + neigh_release(n); + goto out; + } + + f = vxlan_find_mac(vxlan, n->ha); + if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { + /* bridge-local neighbor */ + neigh_release(n); + goto out; + } + + ipv6_stub->ndisc_send_na(dev, n, saddr, &msg->target, + !!in6_dev->cnf.forwarding, + true, false, false); + neigh_release(n); + } else if (vxlan->flags & VXLAN_F_L3MISS) { + ipa.sin6.sin6_addr = *daddr; + ipa.sa.sa_family = AF_INET6; + vxlan_ip_miss(dev, &ipa); + } + +out: + consume_skb(skb); + return NETDEV_TX_OK; +} +#endif + static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -1677,8 +1741,22 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); eth = eth_hdr(skb); - if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP) - return arp_reduce(dev, skb); + if ((vxlan->flags & VXLAN_F_PROXY)) { + if (ntohs(eth->h_proto) == ETH_P_ARP) + return arp_reduce(dev, skb); +#if IS_ENABLED(CONFIG_IPV6) + else if (ntohs(eth->h_proto) == ETH_P_IPV6 && + skb->len >= sizeof(struct ipv6hdr) + sizeof(struct nd_msg) && + ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { + struct nd_msg *msg; + + msg = (struct nd_msg *)skb_transport_header(skb); + if (msg->icmph.icmp6_code == 0 && + msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) + return neigh_reduce(dev, skb); + } +#endif + } f = vxlan_find_mac(vxlan, eth->h_dest); did_rsc = false; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index bcf957341b6..fb314de2b61 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -153,6 +153,10 @@ struct ipv6_stub { int (*ipv6_dst_lookup)(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); void (*udpv6_encap_enable)(void); + void (*ndisc_send_na)(struct net_device *dev, struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *solicited_addr, + bool router, bool solicited, bool override, bool inc_opt); struct neigh_table *nd_tbl; }; extern const struct ipv6_stub *ipv6_stub __read_mostly; diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 6fea32340ae..3c4211f0bed 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -204,6 +204,11 @@ extern void ndisc_send_ns(struct net_device *dev, extern void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); +extern void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *solicited_addr, + bool router, bool solicited, bool override, + bool inc_opt); extern void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1996a7c34f7..136fe55c1a4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -56,6 +56,7 @@ #include #include #include +#include #ifdef CONFIG_IPV6_TUNNEL #include #endif @@ -815,6 +816,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .ipv6_sock_mc_drop = ipv6_sock_mc_drop, .ipv6_dst_lookup = ip6_dst_lookup, .udpv6_encap_enable = udpv6_encap_enable, + .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, }; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index bb6fd95a5cd..14bd2f9d9db 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -461,10 +461,10 @@ static void ndisc_send_skb(struct sk_buff *skb, rcu_read_unlock(); } -static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, - const struct in6_addr *daddr, - const struct in6_addr *solicited_addr, - bool router, bool solicited, bool override, bool inc_opt) +void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *solicited_addr, + bool router, bool solicited, bool override, bool inc_opt) { struct sk_buff *skb; struct in6_addr tmpaddr; -- cgit v1.2.3-70-g09d2 From 5a17a390de7bdbcfff9b8f344273a886ca4cf8bf Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 2 Sep 2013 10:06:53 +0800 Subject: net: make snmp_mib_free static inline Fengguang reported: net/built-in.o: In function `in6_dev_finish_destroy': (.text+0x4ca7d): undefined reference to `snmp_mib_free' this is due to snmp_mib_free() is defined when CONFIG_INET is enabled, but in6_dev_finish_destroy() is now moved to core kernel. I think snmp_mib_free() is small enough to be inlined, so just make it static inline. Reported-by: kbuild test robot Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/ip.h | 12 +++++++++++- net/ipv4/af_inet.c | 12 ------------ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index a68f838a132..48f55979d84 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -194,7 +194,17 @@ static inline u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp } #endif extern int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align); -extern void snmp_mib_free(void __percpu *ptr[2]); + +static inline void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ]) +{ + int i; + + BUG_ON(ptr == NULL); + for (i = 0; i < SNMP_ARRAY_SZ; i++) { + free_percpu(ptr[i]); + ptr[i] = NULL; + } +} extern struct local_ports { seqlock_t lock; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b4d0be2b7ce..7a1874b7b8f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1532,18 +1532,6 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) } EXPORT_SYMBOL_GPL(snmp_mib_init); -void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ]) -{ - int i; - - BUG_ON(ptr == NULL); - for (i = 0; i < SNMP_ARRAY_SZ; i++) { - free_percpu(ptr[i]); - ptr[i] = NULL; - } -} -EXPORT_SYMBOL_GPL(snmp_mib_free); - #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, -- cgit v1.2.3-70-g09d2 From 83a093b486ecfd25e7b04b82f6a388f24bd274b2 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Fri, 30 Aug 2013 18:08:44 +0200 Subject: net: etherdevice: add address inherit helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some etherdevices inherit their address from a parent or master device. The addr_assign_type should be updated along with the address in these cases. Adding a helper function to simplify this. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index c623861964e..d8b512496e5 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -198,6 +198,21 @@ static inline void eth_hw_addr_random(struct net_device *dev) eth_random_addr(dev->dev_addr); } +/** + * eth_hw_addr_inherit - Copy dev_addr from another net_device + * @dst: pointer to net_device to copy dev_addr to + * @src: pointer to net_device to copy dev_addr from + * + * Copy the Ethernet address from one net_device to another along with + * the address attributes (addr_assign_type). + */ +static inline void eth_hw_addr_inherit(struct net_device *dst, + struct net_device *src) +{ + dst->addr_assign_type = src->addr_assign_type; + memcpy(dst->dev_addr, src->dev_addr, ETH_ALEN); +} + /** * compare_ether_addr - Compare two Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address -- cgit v1.2.3-70-g09d2 From 61e76b178dbe7145e8d6afa84bb4ccea71918994 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Fri, 30 Aug 2013 11:18:45 +0200 Subject: ICMPv6: treat dest unreachable codes 5 and 6 as EACCES, not EPROTO RFC 4443 has defined two additional codes for ICMPv6 type 1 (destination unreachable) messages: 5 - Source address failed ingress/egress policy 6 - Reject route to destination Now they are treated as protocol error and icmpv6_err_convert() converts them to EPROTO. RFC 4443 says: "Codes 5 and 6 are more informative subsets of code 1." Treat codes 5 and 6 as code 1 (EACCES) Btw, connect() returning -EPROTO confuses firefox, so that fallback to other/IPv4 addresses does not work: https://bugzilla.mozilla.org/show_bug.cgi?id=910773 Signed-off-by: Jiri Bohac Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/icmpv6.h | 2 ++ net/ipv6/icmp.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index e0133c73c30..590beda78ea 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -115,6 +115,8 @@ struct icmp6hdr { #define ICMPV6_NOT_NEIGHBOUR 2 #define ICMPV6_ADDR_UNREACH 3 #define ICMPV6_PORT_UNREACH 4 +#define ICMPV6_POLICY_FAIL 5 +#define ICMPV6_REJECT_ROUTE 6 /* * Codes for Time Exceeded diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 7cfc8d28487..67ae4e0d40b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -940,6 +940,14 @@ static const struct icmp6_err { .err = ECONNREFUSED, .fatal = 1, }, + { /* POLICY_FAIL */ + .err = EACCES, + .fatal = 1, + }, + { /* REJECT_ROUTE */ + .err = EACCES, + .fatal = 1, + }, }; int icmpv6_err_convert(u8 type, u8 code, int *err) @@ -951,7 +959,7 @@ int icmpv6_err_convert(u8 type, u8 code, int *err) switch (type) { case ICMPV6_DEST_UNREACH: fatal = 1; - if (code <= ICMPV6_PORT_UNREACH) { + if (code < ARRAY_SIZE(tab_unreach)) { *err = tab_unreach[code].err; fatal = tab_unreach[code].fatal; } -- cgit v1.2.3-70-g09d2 From bc6fc9fa0e1686d8a06aaa14005d3dbf7978d81f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 30 Aug 2013 15:36:14 +0100 Subject: net: fix comment typo for __skb_alloc_pages() The name of the function in the comment is __skb_alloc_page() while we are actually commenting __skb_alloc_pages(). Fix this typo and make it a valid kernel doc comment. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5ac96f31d54..6f1330af1eb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1905,8 +1905,8 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); } -/* - * __skb_alloc_page - allocate pages for ps-rx on a skb and preserve pfmemalloc data +/** + * __skb_alloc_pages - allocate pages for ps-rx on a skb and preserve pfmemalloc data * @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX * @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used * @order: size of the allocation -- cgit v1.2.3-70-g09d2 From 951fd874c3b014c4abf38a8e588d4687b98fedb4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 1 Sep 2013 13:11:55 -0700 Subject: llc: Use normal etherdevice.h tests Convert the llc_ static inlines to the equivalents from etherdevice.h and remove the llc_ static inline functions. llc_mac_null -> is_zero_ether_addr llc_mac_multicast -> is_multicast_ether_addr llc_mac_match -> ether_addr_equal Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/llc_if.h | 30 ------------------------------ net/llc/af_llc.c | 6 +++--- net/llc/llc_conn.c | 6 +++--- net/llc/llc_sap.c | 4 ++-- 4 files changed, 8 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/net/llc_if.h b/include/net/llc_if.h index b595a004d31..f0cb909b60e 100644 --- a/include/net/llc_if.h +++ b/include/net/llc_if.h @@ -62,36 +62,6 @@ #define LLC_STATUS_CONFLICT 7 /* disconnect conn */ #define LLC_STATUS_RESET_DONE 8 /* */ -/** - * llc_mac_null - determines if a address is a null mac address - * @mac: Mac address to test if null. - * - * Determines if a given address is a null mac address. Returns 0 if the - * address is not a null mac, 1 if the address is a null mac. - */ -static inline int llc_mac_null(const u8 *mac) -{ - return is_zero_ether_addr(mac); -} - -static inline int llc_mac_multicast(const u8 *mac) -{ - return is_multicast_ether_addr(mac); -} -/** - * llc_mac_match - determines if two mac addresses are the same - * @mac1: First mac address to compare. - * @mac2: Second mac address to compare. - * - * Determines if two given mac address are the same. Returns 0 if there - * is not a complete match up to len, 1 if a complete match up to len is - * found. - */ -static inline int llc_mac_match(const u8 *mac1, const u8 *mac2) -{ - return !compare_ether_addr(mac1, mac2); -} - extern int llc_establish_connection(struct sock *sk, u8 *lmac, u8 *dmac, u8 dsap); extern int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 48aaa89253e..6cba486353e 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -321,12 +321,12 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) if (llc->dev) { if (!addr->sllc_arphrd) addr->sllc_arphrd = llc->dev->type; - if (llc_mac_null(addr->sllc_mac)) + if (is_zero_ether_addr(addr->sllc_mac)) memcpy(addr->sllc_mac, llc->dev->dev_addr, IFHWADDRLEN); if (addr->sllc_arphrd != llc->dev->type || - !llc_mac_match(addr->sllc_mac, - llc->dev->dev_addr)) { + !ether_addr_equal(addr->sllc_mac, + llc->dev->dev_addr)) { rc = -EINVAL; llc->dev = NULL; } diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 0d0d416dfab..cd872417796 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -478,8 +478,8 @@ static inline bool llc_estab_match(const struct llc_sap *sap, return llc->laddr.lsap == laddr->lsap && llc->daddr.lsap == daddr->lsap && - llc_mac_match(llc->laddr.mac, laddr->mac) && - llc_mac_match(llc->daddr.mac, daddr->mac); + ether_addr_equal(llc->laddr.mac, laddr->mac) && + ether_addr_equal(llc->daddr.mac, daddr->mac); } /** @@ -550,7 +550,7 @@ static inline bool llc_listener_match(const struct llc_sap *sap, return sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN && llc->laddr.lsap == laddr->lsap && - llc_mac_match(llc->laddr.mac, laddr->mac); + ether_addr_equal(llc->laddr.mac, laddr->mac); } static struct sock *__llc_lookup_listener(struct llc_sap *sap, diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 78be45cda5c..e5850699098 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -302,7 +302,7 @@ static inline bool llc_dgram_match(const struct llc_sap *sap, return sk->sk_type == SOCK_DGRAM && llc->laddr.lsap == laddr->lsap && - llc_mac_match(llc->laddr.mac, laddr->mac); + ether_addr_equal(llc->laddr.mac, laddr->mac); } /** @@ -425,7 +425,7 @@ void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) llc_pdu_decode_da(skb, laddr.mac); llc_pdu_decode_dsap(skb, &laddr.lsap); - if (llc_mac_multicast(laddr.mac)) { + if (is_multicast_ether_addr(laddr.mac)) { llc_sap_mcast(sap, &laddr, skb); kfree_skb(skb); } else { -- cgit v1.2.3-70-g09d2 From 8b7ed2d91d6afb0b55ba75f94b66e51f70783a46 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 2 Sep 2013 15:34:54 +0200 Subject: iptunnels: remove net arg from iptunnel_xmit() This argument is not used, let's remove it. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 3 +-- include/net/ip_tunnels.h | 3 +-- net/ipv4/ip_tunnel.c | 3 +-- net/ipv4/ip_tunnel_core.c | 3 +-- net/ipv6/sit.c | 4 ++-- net/openvswitch/vport-gre.c | 2 +- 6 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 8f6d6c1153c..e25c97dfbac 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1528,8 +1528,7 @@ int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs, if (err) return err; - return iptunnel_xmit(net, rt, skb, src, dst, - IPPROTO_UDP, tos, ttl, df); + return iptunnel_xmit(rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df); } EXPORT_SYMBOL_GPL(vxlan_xmit_skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0ce316bb3c6..94fe8fd07bc 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -146,8 +146,7 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, } int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); -int iptunnel_xmit(struct net *net, struct rtable *rt, - struct sk_buff *skb, +int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 830de3f4e29..0a6cf0e6947 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -654,8 +654,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } } - err = iptunnel_xmit(tunnel->net, rt, skb, - fl4.saddr, fl4.daddr, protocol, + err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol, ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 850525b3489..e8204584235 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,8 +46,7 @@ #include #include -int iptunnel_xmit(struct net *net, struct rtable *rt, - struct sk_buff *skb, +int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df) { diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index f18f842ac89..1d1458a3b7c 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -888,8 +888,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, fl4.daddr, - IPPROTO_IPV6, tos, ttl, df); + err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos, + ttl, df); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 21d5073e148..9b3713ef831 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -176,7 +176,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) skb->local_df = 1; - return iptunnel_xmit(net, rt, skb, fl.saddr, + return iptunnel_xmit(rt, skb, fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, OVS_CB(skb)->tun_key->ipv4_tos, OVS_CB(skb)->tun_key->ipv4_ttl, df); -- cgit v1.2.3-70-g09d2 From 117961878cc1386923cfddcdd9016b777827c8dd Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 2 Sep 2013 15:34:55 +0200 Subject: vxlan: remove net arg from vxlan[6]_xmit_skb() This argument is not used, let's remove it. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 8 ++++---- include/net/vxlan.h | 2 +- net/openvswitch/vport-vxlan.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e25c97dfbac..a334bfb91c5 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1380,7 +1380,7 @@ static int handle_offloads(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_IPV6) -static int vxlan6_xmit_skb(struct net *net, struct vxlan_sock *vs, +static int vxlan6_xmit_skb(struct vxlan_sock *vs, struct dst_entry *dst, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, __u8 prio, __u8 ttl, @@ -1475,7 +1475,7 @@ static int vxlan6_xmit_skb(struct net *net, struct vxlan_sock *vs, } #endif -int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs, +int vxlan_xmit_skb(struct vxlan_sock *vs, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, __be32 vni) @@ -1651,7 +1651,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb, + err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb, fl4.saddr, dst->sin.sin_addr.s_addr, tos, ttl, df, src_port, dst_port, htonl(vni << 8)); @@ -1703,7 +1703,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ttl = ttl ? : ip6_dst_hoplimit(ndst); - err = vxlan6_xmit_skb(dev_net(dev), vxlan->vn_sock, ndst, skb, + err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb, dev, &fl6.saddr, &fl6.daddr, 0, ttl, src_port, dst_port, htonl(vni << 8)); #endif diff --git a/include/net/vxlan.h b/include/net/vxlan.h index d2b88cafa7a..e09c40b6802 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -29,7 +29,7 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, void vxlan_sock_release(struct vxlan_sock *vs); -int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs, +int vxlan_xmit_skb(struct vxlan_sock *vs, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, __be32 vni); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index a0060245b4e..a481c03e286 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -176,7 +176,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) inet_get_local_port_range(&port_min, &port_max); src_port = vxlan_src_port(port_min, port_max, skb); - err = vxlan_xmit_skb(net, vxlan_port->vs, rt, skb, + err = vxlan_xmit_skb(vxlan_port->vs, rt, skb, fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, OVS_CB(skb)->tun_key->ipv4_tos, OVS_CB(skb)->tun_key->ipv4_ttl, df, -- cgit v1.2.3-70-g09d2 From 8b27f27797cac5ed9b2f3e63dac89a7ae70e70a7 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 2 Sep 2013 15:34:56 +0200 Subject: skb: allow skb_scrub_packet() to be used by tunnels This function was only used when a packet was sent to another netns. Now, it can also be used after tunnel encapsulation or decapsulation. Only skb_orphan() should not be done when a packet is not crossing netns. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/dev.c | 2 +- net/core/skbuff.c | 19 ++++++++++++------- net/ipv4/ip_tunnel.c | 4 ++-- net/ipv6/ip6_tunnel.c | 4 ++-- net/ipv6/sit.c | 4 ++-- 6 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6f1330af1eb..2ddb48d9312 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2392,7 +2392,7 @@ extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); -extern void skb_scrub_packet(struct sk_buff *skb); +extern void skb_scrub_packet(struct sk_buff *skb, bool xnet); extern struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); diff --git a/net/core/dev.c b/net/core/dev.c index 6fbb0c90849..07684e880a5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1697,7 +1697,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) * call skb_scrub_packet() after it to clear pkt_type _after_ calling * eth_type_trans(). */ - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); return netif_rx(skb); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2c3d0f53d19..d81cff119f7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3500,17 +3500,22 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, EXPORT_SYMBOL(skb_try_coalesce); /** - * skb_scrub_packet - scrub an skb before sending it to another netns + * skb_scrub_packet - scrub an skb * * @skb: buffer to clean - * - * skb_scrub_packet can be used to clean an skb before injecting it in - * another namespace. We have to clear all information in the skb that - * could impact namespace isolation. + * @xnet: packet is crossing netns + * + * skb_scrub_packet can be used after encapsulating or decapsulting a packet + * into/from a tunnel. Some information have to be cleared during these + * operations. + * skb_scrub_packet can also be used to clean a skb before injecting it in + * another namespace (@xnet == true). We have to clear all information in the + * skb that could impact namespace isolation. */ -void skb_scrub_packet(struct sk_buff *skb) +void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb_orphan(skb); + if (xnet) + skb_orphan(skb); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0a6cf0e6947..b0e74e17088 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -462,7 +462,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, } if (!net_eq(tunnel->net, dev_net(tunnel->dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -615,7 +615,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } if (!net_eq(tunnel->net, dev_net(dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); if (tunnel->err_count > 0) { if (time_before(jiffies, diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index d6e00a39274..72372ac9015 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -830,7 +830,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, tstats->rx_bytes += skb->len; if (!net_eq(t->net, dev_net(t->dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); netif_rx(skb); @@ -1002,7 +1002,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, } if (!net_eq(t->net, dev_net(dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); /* * Okay, now see if we can stuff it in the buffer as-is. diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 1d1458a3b7c..b2e44f478e1 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -622,7 +622,7 @@ static int ipip6_rcv(struct sk_buff *skb) tstats->rx_bytes += skb->len; if (!net_eq(tunnel->net, dev_net(tunnel->dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); netif_rx(skb); return 0; @@ -861,7 +861,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } if (!net_eq(tunnel->net, dev_net(dev))) - skb_scrub_packet(skb); + skb_scrub_packet(skb, true); /* * Okay, now see if we can stuff it in the buffer as-is. -- cgit v1.2.3-70-g09d2 From 963a88b31ddbbe99f38502239b1a46601773d217 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 2 Sep 2013 15:34:57 +0200 Subject: tunnels: harmonize cleanup done on skb on xmit path The goal of this patch is to harmonize cleanup done on a skbuff on xmit path. Before this patch, behaviors were different depending of the tunnel type. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 6 ++++-- include/net/ip6_tunnel.h | 1 - include/net/ip_tunnels.h | 2 +- net/ipv4/ip_tunnel.c | 6 ++---- net/ipv4/ip_tunnel_core.c | 7 +++---- net/ipv6/ip6_gre.c | 4 ++-- net/ipv6/ip6_tunnel.c | 4 +--- net/ipv6/sit.c | 5 +---- net/openvswitch/vport-gre.c | 2 +- 9 files changed, 15 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a334bfb91c5..ebda3a1c2f3 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1397,6 +1397,8 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, skb->encapsulation = 1; } + skb_scrub_packet(skb, false); + min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + VXLAN_HLEN + sizeof(struct ipv6hdr) + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); @@ -1432,7 +1434,6 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); - skb_dst_drop(skb); skb_dst_set(skb, dst); if (!skb_is_gso(skb) && !(dst->dev->features & NETIF_F_IPV6_CSUM)) { @@ -1528,7 +1529,8 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, if (err) return err; - return iptunnel_xmit(rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df); + return iptunnel_xmit(rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, + false); } EXPORT_SYMBOL_GPL(vxlan_xmit_skb); diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 2265b0bf97e..6d1549c4893 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -75,7 +75,6 @@ static inline void ip6tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct net_device_stats *stats = &dev->stats; int pkt_len, err; - nf_reset(skb); pkt_len = skb->len; err = ip6_local_out(skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 94fe8fd07bc..a0a4a100f5c 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -148,7 +148,7 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, - __u8 tos, __u8 ttl, __be16 df); + __u8 tos, __u8 ttl, __be16 df, bool xnet); static inline void iptunnel_xmit_stats(int err, struct net_device_stats *err_stats, diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index b0e74e17088..88d7d7d1ecc 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -614,9 +614,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (!net_eq(tunnel->net, dev_net(dev))) - skb_scrub_packet(skb, true); - if (tunnel->err_count > 0) { if (time_before(jiffies, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { @@ -655,7 +652,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol, - ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df); + ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df, + !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index e8204584235..d6c856b17fd 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -48,16 +48,15 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, - __u8 tos, __u8 ttl, __be16 df) + __u8 tos, __u8 ttl, __be16 df, bool xnet) { int pkt_len = skb->len; struct iphdr *iph; int err; - nf_reset(skb); - secpath_reset(skb); + skb_scrub_packet(skb, xnet); + skb->rxhash = 0; - skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f2d0a42f805..f179ff1f56e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -694,6 +694,8 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, tunnel->err_count = 0; } + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + dst->header_len; if (skb_headroom(skb) < max_headroom || skb_shared(skb) || @@ -710,8 +712,6 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb = new_skb; } - skb_dst_drop(skb); - if (fl6->flowi6_mark) { skb_dst_set(skb, dst); ndst = NULL; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 72372ac9015..ecbcdbd4bc4 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1001,8 +1001,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, goto tx_err_dst_release; } - if (!net_eq(t->net, dev_net(dev))) - skb_scrub_packet(skb, true); + skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* * Okay, now see if we can stuff it in the buffer as-is. @@ -1021,7 +1020,6 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, consume_skb(skb); skb = new_skb; } - skb_dst_drop(skb); if (fl6->flowi6_mark) { skb_dst_set(skb, dst); ndst = NULL; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index b2e44f478e1..82b425b9b7d 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -860,9 +860,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, tunnel->err_count = 0; } - if (!net_eq(tunnel->net, dev_net(dev))) - skb_scrub_packet(skb, true); - /* * Okay, now see if we can stuff it in the buffer as-is. */ @@ -889,7 +886,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos, - ttl, df); + ttl, df, !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 9b3713ef831..c99dea543d6 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -179,7 +179,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) return iptunnel_xmit(rt, skb, fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, OVS_CB(skb)->tun_key->ipv4_tos, - OVS_CB(skb)->tun_key->ipv4_ttl, df); + OVS_CB(skb)->tun_key->ipv4_ttl, df, false); err_free_rt: ip_rt_put(rt); error: -- cgit v1.2.3-70-g09d2 From ea23192e8e577dfc51e0f4fc5ca113af334edff9 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 2 Sep 2013 15:34:58 +0200 Subject: tunnels: harmonize cleanup done on skb on rx path The goal of this patch is to harmonize cleanup done on a skbuff on rx path. Before this patch, behaviors were different depending of the tunnel type. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/dst.h | 12 +++++++----- net/ipv4/ip_tunnel.c | 3 +-- net/ipv4/ipmr.c | 3 +-- net/ipv6/ip6_gre.c | 5 +---- net/ipv6/ip6_tunnel.c | 7 +------ net/ipv6/ip6mr.c | 3 +-- net/ipv6/sit.c | 6 +----- 7 files changed, 13 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 1f8fd109e22..3bc4865f826 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -311,11 +311,13 @@ static inline void skb_dst_force(struct sk_buff *skb) * __skb_tunnel_rx - prepare skb for rx reinsert * @skb: buffer * @dev: tunnel device + * @net: netns for packet i/o * * After decapsulation, packet is going to re-enter (netif_rx()) our stack, * so make some cleanups. (no accounting done) */ -static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev) +static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, + struct net *net) { skb->dev = dev; @@ -327,8 +329,7 @@ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev) if (!skb->l4_rxhash) skb->rxhash = 0; skb_set_queue_mapping(skb, 0); - skb_dst_drop(skb); - nf_reset(skb); + skb_scrub_packet(skb, !net_eq(net, dev_net(dev))); } /** @@ -340,12 +341,13 @@ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev) * so make some cleanups, and perform accounting. * Note: this accounting is not SMP safe. */ -static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev) +static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, + struct net *net) { /* TODO : stats should be SMP safe */ dev->stats.rx_packets++; dev->stats.rx_bytes += skb->len; - __skb_tunnel_rx(skb, dev); + __skb_tunnel_rx(skb, dev, net); } /* Children define the path of the packet through the diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 88d7d7d1ecc..ac9fabe0300 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -461,8 +461,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, skb->dev = tunnel->dev; } - if (!net_eq(tunnel->net, dev_net(tunnel->dev))) - skb_scrub_packet(skb, true); + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); gro_cells_receive(&tunnel->gro_cells, skb); return 0; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index bacc0bcf48c..9ae54b09254 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2067,9 +2067,8 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); skb->ip_summed = CHECKSUM_NONE; - skb->pkt_type = PACKET_HOST; - skb_tunnel_rx(skb, reg_dev); + skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f179ff1f56e..db992a37301 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -509,8 +509,6 @@ static int ip6gre_rcv(struct sk_buff *skb) goto drop; } - secpath_reset(skb); - skb->protocol = gre_proto; /* WCCP version 1 and 2 protocol decoding. * - Change protocol to IP @@ -525,7 +523,6 @@ static int ip6gre_rcv(struct sk_buff *skb) skb->mac_header = skb->network_header; __pskb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); - skb->pkt_type = PACKET_HOST; if (((flags&GRE_CSUM) && csum) || (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { @@ -557,7 +554,7 @@ static int ip6gre_rcv(struct sk_buff *skb) skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } - __skb_tunnel_rx(skb, tunnel->dev); + __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); skb_reset_network_header(skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ecbcdbd4bc4..55999d923f2 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -802,14 +802,12 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, rcu_read_unlock(); goto discard; } - secpath_reset(skb); skb->mac_header = skb->network_header; skb_reset_network_header(skb); skb->protocol = htons(protocol); - skb->pkt_type = PACKET_HOST; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); - __skb_tunnel_rx(skb, t->dev); + __skb_tunnel_rx(skb, t->dev, t->net); err = dscp_ecn_decapsulate(t, ipv6h, skb); if (unlikely(err)) { @@ -829,9 +827,6 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, tstats->rx_packets++; tstats->rx_bytes += skb->len; - if (!net_eq(t->net, dev_net(t->dev))) - skb_scrub_packet(skb, true); - netif_rx(skb); rcu_read_unlock(); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index a60a84ef04f..f365310bfcc 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -672,9 +672,8 @@ static int pim6_rcv(struct sk_buff *skb) skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; - skb->pkt_type = PACKET_HOST; - skb_tunnel_rx(skb, reg_dev); + skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 82b425b9b7d..19abcc9d6a1 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -581,12 +581,10 @@ static int ipip6_rcv(struct sk_buff *skb) tunnel->parms.iph.protocol != 0) goto out; - secpath_reset(skb); skb->mac_header = skb->network_header; skb_reset_network_header(skb); IPCB(skb)->flags = 0; skb->protocol = htons(ETH_P_IPV6); - skb->pkt_type = PACKET_HOST; if (tunnel->dev->priv_flags & IFF_ISATAP) { if (!isatap_chksrc(skb, iph, tunnel)) { @@ -603,7 +601,7 @@ static int ipip6_rcv(struct sk_buff *skb) } } - __skb_tunnel_rx(skb, tunnel->dev); + __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { @@ -621,8 +619,6 @@ static int ipip6_rcv(struct sk_buff *skb) tstats->rx_packets++; tstats->rx_bytes += skb->len; - if (!net_eq(tunnel->net, dev_net(tunnel->dev))) - skb_scrub_packet(skb, true); netif_rx(skb); return 0; -- cgit v1.2.3-70-g09d2 From c995ae2259ee36caf48bbfacf40111998dacd4af Mon Sep 17 00:00:00 2001 From: Vijay Subramanian Date: Tue, 3 Sep 2013 12:23:22 -0700 Subject: tcp: Change return value of tcp_rcv_established() tcp_rcv_established() returns only one value namely 0. We change the return value to void (as suggested by David Miller). After commit 0c24604b (tcp: implement RFC 5961 4.2), we no longer send RSTs in response to SYNs. We can remove the check and processing on the return value of tcp_rcv_established(). We also fix jtcp_rcv_established() in tcp_probe.c to match that of tcp_rcv_established(). Signed-off-by: Vijay Subramanian Signed-off-by: David S. Miller --- include/net/tcp.h | 4 ++-- net/ipv4/tcp_input.c | 13 ++++++------- net/ipv4/tcp_ipv4.c | 5 +---- net/ipv4/tcp_probe.c | 5 ++--- net/ipv6/tcp_ipv6.c | 3 +-- 5 files changed, 12 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6a6a88db462..b1aa324c5e6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -371,8 +371,8 @@ extern void tcp_delack_timer_handler(struct sock *sk); extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len); -extern int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len); +extern void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, unsigned int len); extern void tcp_rcv_space_adjust(struct sock *sk); extern void tcp_cleanup_rbuf(struct sock *sk, int copied); extern int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1a84fffe699..93d7e9de414 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5049,8 +5049,8 @@ discard: * the rest is checked inline. Fast processing is turned on in * tcp_data_queue when everything is OK. */ -int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); @@ -5127,7 +5127,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_ack(sk, skb, 0); __kfree_skb(skb); tcp_data_snd_check(sk); - return 0; + return; } else { /* Header too small */ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -5220,7 +5220,7 @@ no_ack: if (eaten) kfree_skb_partial(skb, fragstolen); sk->sk_data_ready(sk, 0); - return 0; + return; } } @@ -5236,7 +5236,7 @@ slow_path: */ if (!tcp_validate_incoming(sk, skb, th, 1)) - return 0; + return; step5: if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) @@ -5252,7 +5252,7 @@ step5: tcp_data_snd_check(sk); tcp_ack_snd_check(sk); - return 0; + return; csum_error: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); @@ -5260,7 +5260,6 @@ csum_error: discard: __kfree_skb(skb); - return 0; } EXPORT_SYMBOL(tcp_rcv_established); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 09d45d71897..b14266bb91e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1799,10 +1799,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { - rsk = sk; - goto reset; - } + tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); return 0; } diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 1f6aa543b64..611beab38a0 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -122,8 +122,8 @@ static inline int tcp_probe_avail(void) * Hook inserted to be called before each receive packet. * Note: arguments must match tcp_rcv_established()! */ -static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) +static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, unsigned int len) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_sock *inet = inet_sk(sk); @@ -172,7 +172,6 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, } jprobe_return(); - return 0; } static struct jprobe tcp_jprobe = { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5bcfadf09e9..9acdcedf9a1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1360,8 +1360,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } } - if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) - goto reset; + tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); if (opt_skb) goto ipv6_pktoptions; return 0; -- cgit v1.2.3-70-g09d2 From cfd280c91253cc28e4919e349fa7a813b63e71e8 Mon Sep 17 00:00:00 2001 From: Carlos O'Donell Date: Thu, 15 Aug 2013 17:28:10 +0800 Subject: net: sync some IP headers with glibc Solution: ========= - Synchronize linux's `include/uapi/linux/in6.h' with glibc's `inet/netinet/in.h'. - Synchronize glibc's `inet/netinet/in.h with linux's `include/uapi/linux/in6.h'. - Allow including the headers in either other. - First header included defines the structures and macros. Details: ======== The kernel promises not to break the UAPI ABI so I don't see why we can't just have the two userspace headers coordinate? If you include the kernel headers first you get those, and if you include the glibc headers first you get those, and the following patch arranges a coordination and synchronization between the two. Let's handle `include/uapi/linux/in6.h' from linux, and `inet/netinet/in.h' from glibc and ensure they compile in any order and preserve the required ABI. These two patches pass the following compile tests: cat >> test1.c <> test2.c < Cc: Thomas Backlund Cc: libc-alpha@sourceware.org Cc: YOSHIFUJI Hideaki Cc: David S. Miller Tested-by: Cong Wang Signed-off-by: Carlos O'Donell Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/Kbuild | 1 + include/uapi/linux/in.h | 49 ++++++++++++++----- include/uapi/linux/in6.h | 36 +++++++++++--- include/uapi/linux/libc-compat.h | 103 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 169 insertions(+), 20 deletions(-) create mode 100644 include/uapi/linux/libc-compat.h (limited to 'include') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 997f9f2f096..e7c94eeb947 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -227,6 +227,7 @@ header-y += kvm_para.h endif header-y += l2tp.h +header-y += libc-compat.h header-y += limits.h header-y += llc.h header-y += loop.h diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 9edb441df82..f9e8e496ae5 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -24,30 +24,53 @@ /* Standard well-defined IP protocols. */ enum { IPPROTO_IP = 0, /* Dummy protocol for TCP */ +#define IPPROTO_IP IPPROTO_IP IPPROTO_ICMP = 1, /* Internet Control Message Protocol */ +#define IPPROTO_ICMP IPPROTO_ICMP IPPROTO_IGMP = 2, /* Internet Group Management Protocol */ +#define IPPROTO_IGMP IPPROTO_IGMP IPPROTO_IPIP = 4, /* IPIP tunnels (older KA9Q tunnels use 94) */ +#define IPPROTO_IPIP IPPROTO_IPIP IPPROTO_TCP = 6, /* Transmission Control Protocol */ +#define IPPROTO_TCP IPPROTO_TCP IPPROTO_EGP = 8, /* Exterior Gateway Protocol */ +#define IPPROTO_EGP IPPROTO_EGP IPPROTO_PUP = 12, /* PUP protocol */ +#define IPPROTO_PUP IPPROTO_PUP IPPROTO_UDP = 17, /* User Datagram Protocol */ +#define IPPROTO_UDP IPPROTO_UDP IPPROTO_IDP = 22, /* XNS IDP protocol */ +#define IPPROTO_IDP IPPROTO_IDP + IPPROTO_TP = 29, /* SO Transport Protocol Class 4 */ +#define IPPROTO_TP IPPROTO_TP IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */ - IPPROTO_RSVP = 46, /* RSVP protocol */ +#define IPPROTO_DCCP IPPROTO_DCCP + IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */ +#define IPPROTO_IPV6 IPPROTO_IPV6 + IPPROTO_RSVP = 46, /* RSVP Protocol */ +#define IPPROTO_RSVP IPPROTO_RSVP IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */ - - IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */ - - IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */ - IPPROTO_AH = 51, /* Authentication Header protocol */ - IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */ - IPPROTO_PIM = 103, /* Protocol Independent Multicast */ - - IPPROTO_COMP = 108, /* Compression Header protocol */ - IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */ +#define IPPROTO_GRE IPPROTO_GRE + IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */ +#define IPPROTO_ESP IPPROTO_ESP + IPPROTO_AH = 51, /* Authentication Header protocol */ +#define IPPROTO_AH IPPROTO_AH + IPPROTO_MTP = 92, /* Multicast Transport Protocol */ +#define IPPROTO_MTP IPPROTO_MTP + IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */ +#define IPPROTO_BEETPH IPPROTO_BEETPH + IPPROTO_ENCAP = 98, /* Encapsulation Header */ +#define IPPROTO_ENCAP IPPROTO_ENCAP + IPPROTO_PIM = 103, /* Protocol Independent Multicast */ +#define IPPROTO_PIM IPPROTO_PIM + IPPROTO_COMP = 108, /* Compression Header Protocol */ +#define IPPROTO_COMP IPPROTO_COMP + IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */ +#define IPPROTO_SCTP IPPROTO_SCTP IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */ - - IPPROTO_RAW = 255, /* Raw IP packets */ +#define IPPROTO_UDPLITE IPPROTO_UDPLITE + IPPROTO_RAW = 255, /* Raw IP packets */ +#define IPPROTO_RAW IPPROTO_RAW IPPROTO_MAX }; diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 53b1d56a6e7..440d5c47914 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -22,22 +22,30 @@ #define _UAPI_LINUX_IN6_H #include +#include /* * IPv6 address structure */ +#if __UAPI_DEF_IN6_ADDR struct in6_addr { union { __u8 u6_addr8[16]; +#if __UAPI_DEF_IN6_ADDR_ALT __be16 u6_addr16[8]; __be32 u6_addr32[4]; +#endif } in6_u; #define s6_addr in6_u.u6_addr8 +#if __UAPI_DEF_IN6_ADDR_ALT #define s6_addr16 in6_u.u6_addr16 #define s6_addr32 in6_u.u6_addr32 +#endif }; +#endif /* __UAPI_DEF_IN6_ADDR */ +#if __UAPI_DEF_SOCKADDR_IN6 struct sockaddr_in6 { unsigned short int sin6_family; /* AF_INET6 */ __be16 sin6_port; /* Transport layer port # */ @@ -45,7 +53,9 @@ struct sockaddr_in6 { struct in6_addr sin6_addr; /* IPv6 address */ __u32 sin6_scope_id; /* scope id (new in RFC2553) */ }; +#endif /* __UAPI_DEF_SOCKADDR_IN6 */ +#if __UAPI_DEF_IPV6_MREQ struct ipv6_mreq { /* IPv6 multicast address of group */ struct in6_addr ipv6mr_multiaddr; @@ -53,6 +63,7 @@ struct ipv6_mreq { /* local IPv6 address of interface */ int ipv6mr_ifindex; }; +#endif /* __UAPI_DEF_IVP6_MREQ */ #define ipv6mr_acaddr ipv6mr_multiaddr @@ -114,13 +125,24 @@ struct in6_flowlabel_req { /* * IPV6 extension headers */ -#define IPPROTO_HOPOPTS 0 /* IPv6 hop-by-hop options */ -#define IPPROTO_ROUTING 43 /* IPv6 routing header */ -#define IPPROTO_FRAGMENT 44 /* IPv6 fragmentation header */ -#define IPPROTO_ICMPV6 58 /* ICMPv6 */ -#define IPPROTO_NONE 59 /* IPv6 no next header */ -#define IPPROTO_DSTOPTS 60 /* IPv6 destination options */ -#define IPPROTO_MH 135 /* IPv6 mobility header */ +#if __UAPI_DEF_IPPROTO_V6 +enum { + IPPROTO_HOPOPTS = 0, /* IPv6 hop-by-hop options */ +#define IPPROTO_HOPOPTS IPPROTO_HOPOPTS + IPPROTO_ROUTING = 43, /* IPv6 routing header */ +#define IPPROTO_ROUTING IPPROTO_ROUTING + IPPROTO_FRAGMENT = 44, /* IPv6 fragmentation header */ +#define IPPROTO_FRAGMENT IPPROTO_FRAGMENT + IPPROTO_ICMPV6 = 58, /* ICMPv6 */ +#define IPPROTO_ICMPV6 IPPROTO_ICMPV6 + IPPROTO_NONE = 59, /* IPv6 no next header */ +#define IPPROTO_NONE IPPROTO_NONE + IPPROTO_DSTOPTS = 60, /* IPv6 destination options */ +#define IPPROTO_DSTOPTS IPPROTO_DSTOPTS + IPPROTO_MH = 135, /* IPv6 mobility header */ +#define IPPROTO_MH IPPROTO_MH +}; +#endif /* __UAPI_DEF_IPPROTO_V6 */ /* * IPv6 TLV options. diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h new file mode 100644 index 00000000000..335e8a7cad3 --- /dev/null +++ b/include/uapi/linux/libc-compat.h @@ -0,0 +1,103 @@ +/* + * Compatibility interface for userspace libc header coordination: + * + * Define compatibility macros that are used to control the inclusion or + * exclusion of UAPI structures and definitions in coordination with another + * userspace C library. + * + * This header is intended to solve the problem of UAPI definitions that + * conflict with userspace definitions. If a UAPI header has such conflicting + * definitions then the solution is as follows: + * + * * Synchronize the UAPI header and the libc headers so either one can be + * used and such that the ABI is preserved. If this is not possible then + * no simple compatibility interface exists (you need to write translating + * wrappers and rename things) and you can't use this interface. + * + * Then follow this process: + * + * (a) Include libc-compat.h in the UAPI header. + * e.g. #include + * This include must be as early as possible. + * + * (b) In libc-compat.h add enough code to detect that the comflicting + * userspace libc header has been included first. + * + * (c) If the userspace libc header has been included first define a set of + * guard macros of the form __UAPI_DEF_FOO and set their values to 1, else + * set their values to 0. + * + * (d) Back in the UAPI header with the conflicting definitions, guard the + * definitions with: + * #if __UAPI_DEF_FOO + * ... + * #endif + * + * This fixes the situation where the linux headers are included *after* the + * libc headers. To fix the problem with the inclusion in the other order the + * userspace libc headers must be fixed like this: + * + * * For all definitions that conflict with kernel definitions wrap those + * defines in the following: + * #if !__UAPI_DEF_FOO + * ... + * #endif + * + * This prevents the redefinition of a construct already defined by the kernel. + */ +#ifndef _UAPI_LIBC_COMPAT_H +#define _UAPI_LIBC_COMPAT_H + +/* We have included glibc headers... */ +#if defined(__GLIBC__) + +/* Coordinate with glibc netinet/in.h header. */ +#if defined(_NETINET_IN_H) + +/* GLIBC headers included first so don't define anything + * that would already be defined. */ +#define __UAPI_DEF_IN6_ADDR 0 +/* The exception is the in6_addr macros which must be defined + * if the glibc code didn't define them. This guard matches + * the guard in glibc/inet/netinet/in.h which defines the + * additional in6_addr macros e.g. s6_addr16, and s6_addr32. */ +#if defined(__USE_MISC) || defined (__USE_GNU) +#define __UAPI_DEF_IN6_ADDR_ALT 0 +#else +#define __UAPI_DEF_IN6_ADDR_ALT 1 +#endif +#define __UAPI_DEF_SOCKADDR_IN6 0 +#define __UAPI_DEF_IPV6_MREQ 0 +#define __UAPI_DEF_IPPROTO_V6 0 + +#else + +/* Linux headers included first, and we must define everything + * we need. The expectation is that glibc will check the + * __UAPI_DEF_* defines and adjust appropriately. */ +#define __UAPI_DEF_IN6_ADDR 1 +/* We unconditionally define the in6_addr macros and glibc must + * coordinate. */ +#define __UAPI_DEF_IN6_ADDR_ALT 1 +#define __UAPI_DEF_SOCKADDR_IN6 1 +#define __UAPI_DEF_IPV6_MREQ 1 +#define __UAPI_DEF_IPPROTO_V6 1 + +#endif /* _NETINET_IN_H */ + + +/* If we did not see any headers from any supported C libraries, + * or we are being included in the kernel, then define everything + * that we need. */ +#else /* !defined(__GLIBC__) */ + +/* Definitions for in6.h */ +#define __UAPI_DEF_IN6_ADDR 1 +#define __UAPI_DEF_IN6_ADDR_ALT 1 +#define __UAPI_DEF_SOCKADDR_IN6 1 +#define __UAPI_DEF_IPV6_MREQ 1 +#define __UAPI_DEF_IPPROTO_V6 1 + +#endif /* __GLIBC__ */ + +#endif /* _UAPI_LIBC_COMPAT_H */ -- cgit v1.2.3-70-g09d2 From 89225d1ce6af3916bf32aecbe9d83f571a098588 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 4 Sep 2013 00:19:37 +0200 Subject: net: ipv6: mld: fix v1/v2 switchback timeout to rfc3810, 9.12. i) RFC3810, 9.2. Query Interval [QI] says: The Query Interval variable denotes the interval between General Queries sent by the Querier. Default value: 125 seconds. [...] ii) RFC3810, 9.3. Query Response Interval [QRI] says: The Maximum Response Delay used to calculate the Maximum Response Code inserted into the periodic General Queries. Default value: 10000 (10 seconds) [...] The number of seconds represented by the [Query Response Interval] must be less than the [Query Interval]. iii) RFC3810, 9.12. Older Version Querier Present Timeout [OVQPT] says: The Older Version Querier Present Timeout is the time-out for transitioning a host back to MLDv2 Host Compatibility Mode. When an MLDv1 query is received, MLDv2 hosts set their Older Version Querier Present Timer to [Older Version Querier Present Timeout]. This value MUST be ([Robustness Variable] times (the [Query Interval] in the last Query received)) plus ([Query Response Interval]). Hence, on *default* the timeout results in: [RV] = 2, [QI] = 125sec, [QRI] = 10sec [OVQPT] = [RV] * [QI] + [QRI] = 260sec Having that said, we currently calculate [OVQPT] (here given as 'switchback' variable) as ... switchback = (idev->mc_qrv + 1) * max_delay RFC3810, 9.12. says "the [Query Interval] in the last Query received". In section "9.14. Configuring timers", it is said: This section is meant to provide advice to network administrators on how to tune these settings to their network. Ambitious router implementations might tune these settings dynamically based upon changing characteristics of the network. [...] iv) RFC38010, 9.14.2. Query Interval: The overall level of periodic MLD traffic is inversely proportional to the Query Interval. A longer Query Interval results in a lower overall level of MLD traffic. The value of the Query Interval MUST be equal to or greater than the Maximum Response Delay used to calculate the Maximum Response Code inserted in General Query messages. I assume that was why switchback is calculated as is (3 * max_delay), although this setting seems to be meant for routers only to configure their [QI] interval for non-default intervals. So usage here like this is clearly wrong. Concluding, the current behaviour in IPv6's multicast code is not conform to the RFC as switch back is calculated wrongly. That is, it has a too small value, so MLDv2 hosts switch back again to MLDv2 way too early, i.e. ~30secs instead of ~260secs on default. Hence, introduce necessary helper functions and fix this up properly as it should be. Introduced in 06da92283 ("[IPV6]: Add MLDv2 support."). Credits to Hannes Frederic Sowa who also had a hand in this as well. Also thanks to Hangbin Liu who did initial testing. Signed-off-by: Daniel Borkmann Cc: David Stevens Cc: Hannes Frederic Sowa Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/if_inet6.h | 9 +++- include/net/mld.h | 27 +++++++++++- net/ipv6/mcast.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 143 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 736b5fb9547..02ef7727bb5 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -171,12 +171,17 @@ struct inet6_dev { struct ifmcaddr6 *mc_list; struct ifmcaddr6 *mc_tomb; spinlock_t mc_lock; - unsigned char mc_qrv; + + unsigned char mc_qrv; /* Query Robustness Variable */ unsigned char mc_gq_running; unsigned char mc_ifc_count; unsigned char mc_dad_count; - unsigned long mc_v1_seen; + + unsigned long mc_v1_seen; /* Max time we stay in MLDv1 mode */ + unsigned long mc_qi; /* Query Interval */ + unsigned long mc_qri; /* Query Response Interval */ unsigned long mc_maxdelay; + struct timer_list mc_gq_timer; /* general query timer */ struct timer_list mc_ifc_timer; /* interface change timer */ struct timer_list mc_dad_timer; /* dad complete mc timer */ diff --git a/include/net/mld.h b/include/net/mld.h index 467143cd4e2..2b5421f6a7c 100644 --- a/include/net/mld.h +++ b/include/net/mld.h @@ -63,7 +63,7 @@ struct mld2_query { #define mld2q_mrc mld2q_hdr.icmp6_maxdelay #define mld2q_resv1 mld2q_hdr.icmp6_dataun.un_data16[1] -/* Max Response Code */ +/* Max Response Code, TODO: transform this to use the below */ #define MLDV2_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value)) #define MLDV2_EXP(thresh, nbmant, nbexp, value) \ ((value) < (thresh) ? (value) : \ @@ -72,4 +72,29 @@ struct mld2_query { #define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value) +/* RFC3810, 5.1.3. Maximum Response Code: + * + * If Maximum Response Code >= 32768, Maximum Response Code represents a + * floating-point value as follows: + * + * 0 1 2 3 4 5 6 7 8 9 A B C D E F + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |1| exp | mant | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +#define MLDV2_MRC_EXP(value) (((value) >> 12) & 0x0007) +#define MLDV2_MRC_MAN(value) ((value) & 0x0fff) + +/* RFC3810, 5.1.9. QQIC (Querier's Query Interval Code): + * + * If QQIC >= 128, QQIC represents a floating-point value as follows: + * + * 0 1 2 3 4 5 6 7 + * +-+-+-+-+-+-+-+-+ + * |1| exp | mant | + * +-+-+-+-+-+-+-+-+ + */ +#define MLDV2_QQIC_EXP(value) (((value) >> 4) & 0x07) +#define MLDV2_QQIC_MAN(value) ((value) & 0x0f) + #endif diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 98ead2b1a66..8992ff26278 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -108,6 +108,10 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); #define MLD_QRV_DEFAULT 2 +/* RFC3810, 9.2. Query Interval */ +#define MLD_QI_DEFAULT (125 * HZ) +/* RFC3810, 9.3. Query Response Interval */ +#define MLD_QRI_DEFAULT (10 * HZ) /* RFC3810, 8.1 Query Version Distinctions */ #define MLD_V1_QUERY_LEN 24 @@ -1112,6 +1116,93 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, return true; } +static void mld_set_v1_mode(struct inet6_dev *idev) +{ + /* RFC3810, relevant sections: + * - 9.1. Robustness Variable + * - 9.2. Query Interval + * - 9.3. Query Response Interval + * - 9.12. Older Version Querier Present Timeout + */ + unsigned long switchback; + + switchback = (idev->mc_qrv * idev->mc_qi) + idev->mc_qri; + + idev->mc_v1_seen = jiffies + switchback; +} + +static void mld_update_qrv(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.8. QRV (Querier's Robustness Variable) + * - 9.1. Robustness Variable + */ + + /* The value of the Robustness Variable MUST NOT be zero, + * and SHOULD NOT be one. Catch this here if we ever run + * into such a case in future. + */ + WARN_ON(idev->mc_qrv == 0); + + if (mlh2->mld2q_qrv > 0) + idev->mc_qrv = mlh2->mld2q_qrv; + + if (unlikely(idev->mc_qrv < 2)) { + net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n", + idev->mc_qrv, MLD_QRV_DEFAULT); + idev->mc_qrv = MLD_QRV_DEFAULT; + } +} + +static void mld_update_qi(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.9. QQIC (Querier's Query Interval Code) + * - 9.2. Query Interval + * - 9.12. Older Version Querier Present Timeout + * (the [Query Interval] in the last Query received) + */ + unsigned long mc_qqi; + + if (mlh2->mld2q_qqic < 128) { + mc_qqi = mlh2->mld2q_qqic; + } else { + unsigned long mc_man, mc_exp; + + mc_exp = MLDV2_QQIC_EXP(mlh2->mld2q_qqic); + mc_man = MLDV2_QQIC_MAN(mlh2->mld2q_qqic); + + mc_qqi = (mc_man | 0x10) << (mc_exp + 3); + } + + idev->mc_qi = mc_qqi * HZ; +} + +static void mld_update_qri(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.3. Maximum Response Code + * - 9.3. Query Response Interval + */ + unsigned long mc_qri, mc_mrc = ntohs(mlh2->mld2q_mrc); + + if (mc_mrc < 32768) { + mc_qri = mc_mrc; + } else { + unsigned long mc_man, mc_exp; + + mc_exp = MLDV2_MRC_EXP(mc_mrc); + mc_man = MLDV2_MRC_MAN(mc_mrc); + + mc_qri = (mc_man | 0x1000) << (mc_exp + 3); + } + + idev->mc_qri = msecs_to_jiffies(mc_qri); +} + /* called with rcu_read_lock() */ int igmp6_event_query(struct sk_buff *skb) { @@ -1150,12 +1241,15 @@ int igmp6_event_query(struct sk_buff *skb) return -EINVAL; if (len == MLD_V1_QUERY_LEN) { - int switchback; /* MLDv1 router present */ - max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay)); - switchback = (idev->mc_qrv + 1) * max_delay; - idev->mc_v1_seen = jiffies + switchback; + + mld_set_v1_mode(idev); + + /* cancel MLDv2 report timer */ + idev->mc_gq_running = 0; + if (del_timer(&idev->mc_gq_timer)) + __in6_dev_put(idev); /* cancel the interface change timer */ idev->mc_ifc_count = 0; @@ -1166,6 +1260,10 @@ int igmp6_event_query(struct sk_buff *skb) } else if (len >= MLD_V2_QUERY_LEN_MIN) { int srcs_offset = sizeof(struct mld2_query) - sizeof(struct icmp6hdr); + + /* hosts need to stay in MLDv1 mode, discard MLDv2 queries */ + if (MLD_V1_SEEN(idev)) + return 0; if (!pskb_may_pull(skb, srcs_offset)) return -EINVAL; @@ -1175,8 +1273,10 @@ int igmp6_event_query(struct sk_buff *skb) idev->mc_maxdelay = max_delay; - if (mlh2->mld2q_qrv) - idev->mc_qrv = mlh2->mld2q_qrv; + mld_update_qrv(idev, mlh2); + mld_update_qi(idev, mlh2); + mld_update_qri(idev, mlh2); + if (group_type == IPV6_ADDR_ANY) { /* general query */ if (mlh2->mld2q_nsrcs) return -EINVAL; /* no sources allowed */ @@ -2337,7 +2437,11 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) (unsigned long)idev); setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire, (unsigned long)idev); + idev->mc_qrv = MLD_QRV_DEFAULT; + idev->mc_qi = MLD_QI_DEFAULT; + idev->mc_qri = MLD_QRI_DEFAULT; + idev->mc_maxdelay = unsolicited_report_interval(idev); idev->mc_v1_seen = 0; write_unlock_bh(&idev->lock); -- cgit v1.2.3-70-g09d2 From e3f5b17047dec4acd8957dad053e70d87f18d97e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 4 Sep 2013 00:19:39 +0200 Subject: net: ipv6: mld: get rid of MLDV2_MRC and simplify calculation Get rid of MLDV2_MRC and use our new macros for mantisse and exponent to calculate Maximum Response Delay out of the Maximum Response Code. Signed-off-by: Daniel Borkmann Cc: Hannes Frederic Sowa Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/mld.h | 28 +++++++++++++++++++--------- net/bridge/br_multicast.c | 3 ++- net/ipv6/mcast.c | 18 ++---------------- 3 files changed, 23 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/mld.h b/include/net/mld.h index 2b5421f6a7c..faa1d161bf2 100644 --- a/include/net/mld.h +++ b/include/net/mld.h @@ -63,15 +63,6 @@ struct mld2_query { #define mld2q_mrc mld2q_hdr.icmp6_maxdelay #define mld2q_resv1 mld2q_hdr.icmp6_dataun.un_data16[1] -/* Max Response Code, TODO: transform this to use the below */ -#define MLDV2_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value)) -#define MLDV2_EXP(thresh, nbmant, nbexp, value) \ - ((value) < (thresh) ? (value) : \ - ((MLDV2_MASK(value, nbmant) | (1<<(nbmant))) << \ - (MLDV2_MASK((value) >> (nbmant), nbexp) + (nbexp)))) - -#define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value) - /* RFC3810, 5.1.3. Maximum Response Code: * * If Maximum Response Code >= 32768, Maximum Response Code represents a @@ -97,4 +88,23 @@ struct mld2_query { #define MLDV2_QQIC_EXP(value) (((value) >> 4) & 0x07) #define MLDV2_QQIC_MAN(value) ((value) & 0x0f) +static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2) +{ + /* RFC3810, 5.1.3. Maximum Response Code */ + unsigned long ret, mc_mrc = ntohs(mlh2->mld2q_mrc); + + if (mc_mrc < 32768) { + ret = mc_mrc; + } else { + unsigned long mc_man, mc_exp; + + mc_exp = MLDV2_MRC_EXP(mc_mrc); + mc_man = MLDV2_MRC_MAN(mc_mrc); + + ret = (mc_man | 0x1000) << (mc_exp + 3); + } + + return ret; +} + #endif diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 08e576ada0b..4accd0de6e8 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1203,7 +1203,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; - max_delay = mld2q->mld2q_mrc ? MLDV2_MRC(ntohs(mld2q->mld2q_mrc)) : 1; + + max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL); } br_multicast_query_received(br, port, !ipv6_addr_any(&ip6h->saddr), diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index f86e26b7296..005b22fd6bc 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1195,20 +1195,7 @@ static void mld_update_qri(struct inet6_dev *idev, * - 5.1.3. Maximum Response Code * - 9.3. Query Response Interval */ - unsigned long mc_qri, mc_mrc = ntohs(mlh2->mld2q_mrc); - - if (mc_mrc < 32768) { - mc_qri = mc_mrc; - } else { - unsigned long mc_man, mc_exp; - - mc_exp = MLDV2_MRC_EXP(mc_mrc); - mc_man = MLDV2_MRC_MAN(mc_mrc); - - mc_qri = (mc_man | 0x1000) << (mc_exp + 3); - } - - idev->mc_qri = msecs_to_jiffies(mc_qri); + idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2)); } /* called with rcu_read_lock() */ @@ -1277,8 +1264,7 @@ int igmp6_event_query(struct sk_buff *skb) mlh2 = (struct mld2_query *)skb_transport_header(skb); - max_delay = max(msecs_to_jiffies(MLDV2_MRC(ntohs(mlh2->mld2q_mrc))), 1UL); - + max_delay = max(msecs_to_jiffies(mldv2_mrc(mlh2)), 1UL); idev->mc_maxdelay = max_delay; mld_update_qrv(idev, mlh2); -- cgit v1.2.3-70-g09d2 From 53cf527513eed6e7170e9dceacd198f9267171b0 Mon Sep 17 00:00:00 2001 From: Joseph Gasparakis Date: Wed, 4 Sep 2013 02:13:38 -0700 Subject: vxlan: Notify drivers for listening UDP port changes This patch adds two more ndo ops: ndo_add_rx_vxlan_port() and ndo_del_rx_vxlan_port(). Drivers can get notifications through the above functions about changes of the UDP listening port of VXLAN. Also, when physical ports come up, now they can call vxlan_get_rx_port() in order to obtain the port number(s) of the existing VXLAN interface in case they already up before them. This information about the listening UDP port would be used for VXLAN related offloads. A big thank you to John Fastabend (john.r.fastabend@intel.com) for his input and his suggestions on this patch set. CC: John Fastabend CC: Stephen Hemminger Signed-off-by: Joseph Gasparakis Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++- include/linux/netdevice.h | 19 +++++++++++++ include/net/vxlan.h | 1 + 3 files changed, 87 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index bd35d2dfc50..bf64b4191dc 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -558,6 +558,40 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, return 1; } +/* Notify netdevs that UDP port started listening */ +static void vxlan_notify_add_rx_port(struct sock *sk) +{ + struct net_device *dev; + struct net *net = sock_net(sk); + sa_family_t sa_family = sk->sk_family; + u16 port = htons(inet_sk(sk)->inet_sport); + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (dev->netdev_ops->ndo_add_vxlan_port) + dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, + port); + } + rcu_read_unlock(); +} + +/* Notify netdevs that UDP port is no more listening */ +static void vxlan_notify_del_rx_port(struct sock *sk) +{ + struct net_device *dev; + struct net *net = sock_net(sk); + sa_family_t sa_family = sk->sk_family; + u16 port = htons(inet_sk(sk)->inet_sport); + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (dev->netdev_ops->ndo_del_vxlan_port) + dev->netdev_ops->ndo_del_vxlan_port(dev, sa_family, + port); + } + rcu_read_unlock(); +} + /* Add new entry to forwarding table -- assumes lock held */ static int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, @@ -909,7 +943,9 @@ static void vxlan_sock_hold(struct vxlan_sock *vs) void vxlan_sock_release(struct vxlan_sock *vs) { - struct vxlan_net *vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); + struct sock *sk = vs->sock->sk; + struct net *net = sock_net(sk); + struct vxlan_net *vn = net_generic(net, vxlan_net_id); if (!atomic_dec_and_test(&vs->refcnt)) return; @@ -918,6 +954,7 @@ void vxlan_sock_release(struct vxlan_sock *vs) hlist_del_rcu(&vs->hlist); smp_wmb(); vs->sock->sk->sk_user_data = NULL; + vxlan_notify_del_rx_port(sk); spin_unlock(&vn->sock_lock); queue_work(vxlan_wq, &vs->del_work); @@ -1983,6 +2020,34 @@ static struct device_type vxlan_type = { .name = "vxlan", }; +/* Calls the ndo_add_vxlan_port of the caller in order to + * supply the listening VXLAN udp ports. + */ +void vxlan_get_rx_port(struct net_device *dev) +{ + struct vxlan_sock *vs; + struct net *net = dev_net(dev); + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + sa_family_t sa_family; + u16 port; + int i; + + if (!dev || !dev->netdev_ops || !dev->netdev_ops->ndo_add_vxlan_port) + return; + + spin_lock(&vn->sock_lock); + for (i = 0; i < PORT_HASH_SIZE; ++i) { + hlist_for_each_entry_rcu(vs, vs_head(net, i), hlist) { + port = htons(inet_sk(vs->sock->sk)->inet_sport); + sa_family = vs->sock->sk->sk_family; + dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, + port); + } + } + spin_unlock(&vn->sock_lock); +} +EXPORT_SYMBOL_GPL(vxlan_get_rx_port); + /* Initialize the device structure. */ static void vxlan_setup(struct net_device *dev) { @@ -2244,6 +2309,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); + vxlan_notify_add_rx_port(sk); spin_unlock(&vn->sock_lock); /* Mark socket as an encapsulation socket. */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ad49b833ea..8ed4ae94305 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -948,6 +948,19 @@ struct netdev_phys_port_id { * Called to get ID of physical port of this device. If driver does * not implement this, it is assumed that the hw is not able to have * multiple net devices on single physical port. + * + * void (*ndo_add_vxlan_port)(struct net_device *dev, + * sa_family_t sa_family, __u16 port); + * Called by vxlan to notiy a driver about the UDP port and socket + * address family that vxlan is listnening to. It is called only when + * a new port starts listening. The operation is protected by the + * vxlan_net->sock_lock. + * + * void (*ndo_del_vxlan_port)(struct net_device *dev, + * sa_family_t sa_family, __u16 port); + * Called by vxlan to notify the driver about a UDP port and socket + * address family that vxlan is not listening to anymore. The operation + * is protected by the vxlan_net->sock_lock. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1078,6 +1091,12 @@ struct net_device_ops { bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, struct netdev_phys_port_id *ppid); + void (*ndo_add_vxlan_port)(struct net_device *dev, + sa_family_t sa_family, + __u16 port); + void (*ndo_del_vxlan_port)(struct net_device *dev, + sa_family_t sa_family, + __u16 port); }; /* diff --git a/include/net/vxlan.h b/include/net/vxlan.h index e09c40b6802..2d64d3cd499 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -36,4 +36,5 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, __be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb); +void vxlan_get_rx_port(struct net_device *netdev); #endif -- cgit v1.2.3-70-g09d2