From 6cb6a27c45cec9184302c2e350b3593c64bc7f6c Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Sat, 2 Apr 2011 22:48:47 -0700 Subject: net: Call netdev_features_change() from netdev_update_features() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue FEAT_CHANGE notification when features are changed by netdev_update_features(). This will allow changes made by extra constraints on e.g. MTU change to be properly propagated like changes via ethtool. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5eeb2cd3631..423a5447d2e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2550,6 +2550,7 @@ static inline u32 netdev_get_wanted_features(struct net_device *dev) } u32 netdev_increment_features(u32 all, u32 one, u32 mask); u32 netdev_fix_features(struct net_device *dev, u32 features); +int __netdev_update_features(struct net_device *dev); void netdev_update_features(struct net_device *dev); void netif_stacked_transfer_operstate(const struct net_device *rootdev, -- cgit v1.2.3-70-g09d2 From c6e1a0d12ca7b4f22c58e55a16beacfb7d3d8462 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 4 Apr 2011 22:30:30 -0700 Subject: net: Allow no-cache copy from user on transmit This patch uses __copy_from_user_nocache on transmit to bypass data cache for a performance improvement. skb_add_data_nocache and skb_copy_to_page_nocache can be called by sendmsg functions to use this feature, initial support is in tcp_sendmsg. This functionality is configurable per device using ethtool. Presumably, this feature would only be useful when the driver does not touch the data. The feature is turned on by default if a device indicates that it does some form of checksum offload; it is off by default for devices that do no checksum offload or indicate no checksum is necessary. For the former case copy-checksum is probably done anyway, in the latter case the device is likely loopback in which case the no cache copy is probably not beneficial. This patch was tested using 200 instances of netperf TCP_RR with 1400 byte request and one byte reply. Platform is 16 core AMD x86. No-cache copy disabled: 672703 tps, 97.13% utilization 50/90/99% latency:244.31 484.205 1028.41 No-cache copy enabled: 702113 tps, 96.16% utilization, 50/90/99% latency 238.56 467.56 956.955 Using 14000 byte request and response sizes demonstrate the effects more dramatically: No-cache copy disabled: 79571 tps, 34.34 %utlization 50/90/95% latency 1584.46 2319.59 5001.76 No-cache copy enabled: 83856 tps, 34.81% utilization 50/90/95% latency 2508.42 2622.62 2735.88 Note especially the effect on latency tail (95th percentile). This seems to provide a nice performance improvement and is consistent in the tests I ran. Presumably, this would provide the greatest benfits in the presence of an application workload stressing the cache and a lot of transmit data happening. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- include/linux/netdevice.h | 3 ++- include/net/sock.h | 53 +++++++++++++++++++++++++++++++++++++++++ net/core/dev.c | 12 ++++++++++ net/core/ethtool.c | 2 +- net/ipv4/tcp.c | 7 +++--- 6 files changed, 73 insertions(+), 6 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 16d6fe95469..b51e021354b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1407,7 +1407,7 @@ static int bond_compute_features(struct bonding *bond) int i; features &= ~(NETIF_F_ALL_CSUM | BOND_VLAN_FEATURES); - features |= NETIF_F_GSO_MASK | NETIF_F_NO_CSUM; + features |= NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_NOCACHE_COPY; if (!bond->first_slave) goto done; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a4664cc68e2..09d26241576 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1066,6 +1066,7 @@ struct net_device { #define NETIF_F_NTUPLE (1 << 27) /* N-tuple filters supported */ #define NETIF_F_RXHASH (1 << 28) /* Receive hashing offload */ #define NETIF_F_RXCSUM (1 << 29) /* Receive checksumming offload */ +#define NETIF_F_NOCACHE_COPY (1 << 30) /* Use no-cache copyfromuser */ /* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 @@ -1081,7 +1082,7 @@ struct net_device { /* = all defined minus driver/device-class-related */ #define NETIF_F_NEVER_CHANGE (NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | \ NETIF_F_LLTX | NETIF_F_NETNS_LOCAL) -#define NETIF_F_ETHTOOL_BITS (0x3f3fffff & ~NETIF_F_NEVER_CHANGE) +#define NETIF_F_ETHTOOL_BITS (0x7f3fffff & ~NETIF_F_NEVER_CHANGE) /* List of features with software fallbacks. */ #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | \ diff --git a/include/net/sock.h b/include/net/sock.h index da0534d3401..43bd515e92f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -1389,6 +1390,58 @@ static inline void sk_nocaps_add(struct sock *sk, int flags) sk->sk_route_caps &= ~flags; } +static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, + char __user *from, char *to, + int copy) +{ + if (skb->ip_summed == CHECKSUM_NONE) { + int err = 0; + __wsum csum = csum_and_copy_from_user(from, to, copy, 0, &err); + if (err) + return err; + skb->csum = csum_block_add(skb->csum, csum, skb->len); + } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { + if (!access_ok(VERIFY_READ, from, copy) || + __copy_from_user_nocache(to, from, copy)) + return -EFAULT; + } else if (copy_from_user(to, from, copy)) + return -EFAULT; + + return 0; +} + +static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, + char __user *from, int copy) +{ + int err; + + err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), copy); + if (err) + __skb_trim(skb, skb->len); + + return err; +} + +static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from, + struct sk_buff *skb, + struct page *page, + int off, int copy) +{ + int err; + + err = skb_do_copy_data_nocache(sk, skb, from, + page_address(page) + off, copy); + if (err) + return err; + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk->sk_wmem_queued += copy; + sk_mem_charge(sk, copy); + return 0; +} + static inline int skb_copy_to_page(struct sock *sk, char __user *from, struct sk_buff *skb, struct page *page, int off, int copy) diff --git a/net/core/dev.c b/net/core/dev.c index 02f56376fe9..5d0b4f6f1a7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5425,6 +5425,14 @@ int register_netdevice(struct net_device *dev) dev->features &= ~NETIF_F_GSO; } + /* Turn on no cache copy if HW is doing checksum */ + dev->hw_features |= NETIF_F_NOCACHE_COPY; + if ((dev->features & NETIF_F_ALL_CSUM) && + !(dev->features & NETIF_F_NO_CSUM)) { + dev->wanted_features |= NETIF_F_NOCACHE_COPY; + dev->features |= NETIF_F_NOCACHE_COPY; + } + /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, * vlan_dev_init() will do the dev->features check, so these features * are enabled only if supported by underlying device. @@ -6182,6 +6190,10 @@ u32 netdev_increment_features(u32 all, u32 one, u32 mask) } } + /* If device can't no cache copy, don't do for all */ + if (!(one & NETIF_F_NOCACHE_COPY)) + all &= ~NETIF_F_NOCACHE_COPY; + one |= NETIF_F_ALL_CSUM; one |= all & NETIF_F_ONE_FOR_ALL; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 439e4b0e131..719670ae199 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -359,7 +359,7 @@ static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GS /* NETIF_F_NTUPLE */ "rx-ntuple-filter", /* NETIF_F_RXHASH */ "rx-hashing", /* NETIF_F_RXCSUM */ "rx-checksum", - "", + /* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy" "", }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b22d4501054..054a59d21eb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -999,7 +999,8 @@ new_segment: /* We have some space in skb head. Superb! */ if (copy > skb_tailroom(skb)) copy = skb_tailroom(skb); - if ((err = skb_add_data(skb, from, copy)) != 0) + err = skb_add_data_nocache(sk, skb, from, copy); + if (err) goto do_fault; } else { int merge = 0; @@ -1042,8 +1043,8 @@ new_segment: /* Time to copy data. We are close to * the end! */ - err = skb_copy_to_page(sk, from, skb, page, - off, copy); + err = skb_copy_to_page_nocache(sk, from, skb, + page, off, copy); if (err) { /* If this page was new, give it to the * socket so it does not get leaked. -- cgit v1.2.3-70-g09d2 From 1aac62671686e6234c91b5f6fc4caaa850419d5d Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Tue, 12 Apr 2011 04:07:39 +0000 Subject: net: vlan_features comment clarification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 09d26241576..cb8178ab3c5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1035,7 +1035,7 @@ struct net_device { u32 hw_features; /* user-requested features */ u32 wanted_features; - /* VLAN feature mask */ + /* mask of features inheritable by VLAN devices */ u32 vlan_features; /* Net device feature bits; if you change something, -- cgit v1.2.3-70-g09d2 From 0a14842f5a3c0e88a1e59fac5c3025db39721f74 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 Apr 2011 09:27:32 +0000 Subject: net: filter: Just In Time compiler for x86-64 In order to speedup packet filtering, here is an implementation of a JIT compiler for x86_64 It is disabled by default, and must be enabled by the admin. echo 1 >/proc/sys/net/core/bpf_jit_enable It uses module_alloc() and module_free() to get memory in the 2GB text kernel range since we call helpers functions from the generated code. EAX : BPF A accumulator EBX : BPF X accumulator RDI : pointer to skb (first argument given to JIT function) RBP : frame pointer (even if CONFIG_FRAME_POINTER=n) r9d : skb->len - skb->data_len (headlen) r8 : skb->data To get a trace of generated code, use : echo 2 >/proc/sys/net/core/bpf_jit_enable Example of generated code : # tcpdump -p -n -s 0 -i eth1 host 192.168.20.0/24 flen=18 proglen=147 pass=3 image=ffffffffa00b5000 JIT code: ffffffffa00b5000: 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 8b 4f 60 JIT code: ffffffffa00b5010: 44 2b 4f 64 4c 8b 87 b8 00 00 00 be 0c 00 00 00 JIT code: ffffffffa00b5020: e8 24 7b f7 e0 3d 00 08 00 00 75 28 be 1a 00 00 JIT code: ffffffffa00b5030: 00 e8 fe 7a f7 e0 24 00 3d 00 14 a8 c0 74 49 be JIT code: ffffffffa00b5040: 1e 00 00 00 e8 eb 7a f7 e0 24 00 3d 00 14 a8 c0 JIT code: ffffffffa00b5050: 74 36 eb 3b 3d 06 08 00 00 74 07 3d 35 80 00 00 JIT code: ffffffffa00b5060: 75 2d be 1c 00 00 00 e8 c8 7a f7 e0 24 00 3d 00 JIT code: ffffffffa00b5070: 14 a8 c0 74 13 be 26 00 00 00 e8 b5 7a f7 e0 24 JIT code: ffffffffa00b5080: 00 3d 00 14 a8 c0 75 07 b8 ff ff 00 00 eb 02 31 JIT code: ffffffffa00b5090: c0 c9 c3 BPF program is 144 bytes long, so native program is almost same size ;) (000) ldh [12] (001) jeq #0x800 jt 2 jf 8 (002) ld [26] (003) and #0xffffff00 (004) jeq #0xc0a81400 jt 16 jf 5 (005) ld [30] (006) and #0xffffff00 (007) jeq #0xc0a81400 jt 16 jf 17 (008) jeq #0x806 jt 10 jf 9 (009) jeq #0x8035 jt 10 jf 17 (010) ld [28] (011) and #0xffffff00 (012) jeq #0xc0a81400 jt 16 jf 13 (013) ld [38] (014) and #0xffffff00 (015) jeq #0xc0a81400 jt 16 jf 17 (016) ret #65535 (017) ret #0 Signed-off-by: Eric Dumazet Cc: Arnaldo Carvalho de Melo Cc: Ben Hutchings Cc: Hagen Paul Pfeifer Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 11 + MAINTAINERS | 1 + arch/x86/Kbuild | 1 + arch/x86/Kconfig | 1 + arch/x86/net/Makefile | 4 + arch/x86/net/bpf_jit.S | 140 +++++++++ arch/x86/net/bpf_jit_comp.c | 654 +++++++++++++++++++++++++++++++++++++++++++ include/linux/filter.h | 76 +++++ include/linux/netdevice.h | 1 + include/linux/skbuff.h | 2 +- net/Kconfig | 13 + net/core/filter.c | 65 +---- net/core/sysctl_net_core.c | 9 + net/packet/af_packet.c | 2 +- 14 files changed, 918 insertions(+), 62 deletions(-) create mode 100644 arch/x86/net/Makefile create mode 100644 arch/x86/net/bpf_jit.S create mode 100644 arch/x86/net/bpf_jit_comp.c (limited to 'include/linux/netdevice.h') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index cbd05ffc606..3201a7097e4 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -32,6 +32,17 @@ Table : Subdirectories in /proc/sys/net 1. /proc/sys/net/core - Network core options ------------------------------------------------------- +bpf_jit_enable +-------------- + +This enables Berkeley Packet Filter Just in Time compiler. +Currently supported on x86_64 architecture, bpf_jit provides a framework +to speed packet filtering, the one used by tcpdump/libpcap for example. +Values : + 0 - disable the JIT (default value) + 1 - enable the JIT + 2 - enable the JIT and ask the compiler to emit traces on kernel log. + rmem_default ------------ diff --git a/MAINTAINERS b/MAINTAINERS index b5266ad5016..17c0917a26e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4372,6 +4372,7 @@ S: Maintained F: net/ipv4/ F: net/ipv6/ F: include/net/ip* +F: arch/x86/net/* NETWORKING [LABELED] (NetLabel, CIPSO, Labeled IPsec, SECMARK) M: Paul Moore diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 0e103236b75..0e9dec6cadd 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -15,3 +15,4 @@ obj-y += vdso/ obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ +obj-y += net/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a95bf..855a1bdc437 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -72,6 +72,7 @@ config X86 select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select ARCH_NO_SYSDEV_OPS + select HAVE_BPF_JIT if X86_64 config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile new file mode 100644 index 00000000000..90568c33ddb --- /dev/null +++ b/arch/x86/net/Makefile @@ -0,0 +1,4 @@ +# +# Arch-specific network modules +# +obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S new file mode 100644 index 00000000000..66870223f8c --- /dev/null +++ b/arch/x86/net/bpf_jit.S @@ -0,0 +1,140 @@ +/* bpf_jit.S : BPF JIT helper functions + * + * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include +#include + +/* + * Calling convention : + * rdi : skb pointer + * esi : offset of byte(s) to fetch in skb (can be scratched) + * r8 : copy of skb->data + * r9d : hlen = skb->len - skb->data_len + */ +#define SKBDATA %r8 + +sk_load_word_ind: + .globl sk_load_word_ind + + add %ebx,%esi /* offset += X */ +# test %esi,%esi /* if (offset < 0) goto bpf_error; */ + js bpf_error + +sk_load_word: + .globl sk_load_word + + mov %r9d,%eax # hlen + sub %esi,%eax # hlen - offset + cmp $3,%eax + jle bpf_slow_path_word + mov (SKBDATA,%rsi),%eax + bswap %eax /* ntohl() */ + ret + + +sk_load_half_ind: + .globl sk_load_half_ind + + add %ebx,%esi /* offset += X */ + js bpf_error + +sk_load_half: + .globl sk_load_half + + mov %r9d,%eax + sub %esi,%eax # hlen - offset + cmp $1,%eax + jle bpf_slow_path_half + movzwl (SKBDATA,%rsi),%eax + rol $8,%ax # ntohs() + ret + +sk_load_byte_ind: + .globl sk_load_byte_ind + add %ebx,%esi /* offset += X */ + js bpf_error + +sk_load_byte: + .globl sk_load_byte + + cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */ + jle bpf_slow_path_byte + movzbl (SKBDATA,%rsi),%eax + ret + +/** + * sk_load_byte_msh - BPF_S_LDX_B_MSH helper + * + * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf) + * Must preserve A accumulator (%eax) + * Inputs : %esi is the offset value, already known positive + */ +ENTRY(sk_load_byte_msh) + CFI_STARTPROC + cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */ + jle bpf_slow_path_byte_msh + movzbl (SKBDATA,%rsi),%ebx + and $15,%bl + shl $2,%bl + ret + CFI_ENDPROC +ENDPROC(sk_load_byte_msh) + +bpf_error: +# force a return 0 from jit handler + xor %eax,%eax + mov -8(%rbp),%rbx + leaveq + ret + +/* rsi contains offset and can be scratched */ +#define bpf_slow_path_common(LEN) \ + push %rdi; /* save skb */ \ + push %r9; \ + push SKBDATA; \ +/* rsi already has offset */ \ + mov $LEN,%ecx; /* len */ \ + lea -12(%rbp),%rdx; \ + call skb_copy_bits; \ + test %eax,%eax; \ + pop SKBDATA; \ + pop %r9; \ + pop %rdi + + +bpf_slow_path_word: + bpf_slow_path_common(4) + js bpf_error + mov -12(%rbp),%eax + bswap %eax + ret + +bpf_slow_path_half: + bpf_slow_path_common(2) + js bpf_error + mov -12(%rbp),%ax + rol $8,%ax + movzwl %ax,%eax + ret + +bpf_slow_path_byte: + bpf_slow_path_common(1) + js bpf_error + movzbl -12(%rbp),%eax + ret + +bpf_slow_path_byte_msh: + xchg %eax,%ebx /* dont lose A , X is about to be scratched */ + bpf_slow_path_common(1) + js bpf_error + movzbl -12(%rbp),%eax + and $15,%al + shl $2,%al + xchg %eax,%ebx + ret diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c new file mode 100644 index 00000000000..bfab3fa10ed --- /dev/null +++ b/arch/x86/net/bpf_jit_comp.c @@ -0,0 +1,654 @@ +/* bpf_jit_comp.c : BPF JIT compiler + * + * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include +#include +#include +#include + +/* + * Conventions : + * EAX : BPF A accumulator + * EBX : BPF X accumulator + * RDI : pointer to skb (first argument given to JIT function) + * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n) + * ECX,EDX,ESI : scratch registers + * r9d : skb->len - skb->data_len (headlen) + * r8 : skb->data + * -8(RBP) : saved RBX value + * -16(RBP)..-80(RBP) : BPF_MEMWORDS values + */ +int bpf_jit_enable __read_mostly; + +/* + * assembly code in arch/x86/net/bpf_jit.S + */ +extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[]; +extern u8 sk_load_word_ind[], sk_load_half_ind[], sk_load_byte_ind[]; + +static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) +{ + if (len == 1) + *ptr = bytes; + else if (len == 2) + *(u16 *)ptr = bytes; + else { + *(u32 *)ptr = bytes; + barrier(); + } + return ptr + len; +} + +#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0) + +#define EMIT1(b1) EMIT(b1, 1) +#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) +#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) +#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) +#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0) + +#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */ +#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */ + +static inline bool is_imm8(int value) +{ + return value <= 127 && value >= -128; +} + +static inline bool is_near(int offset) +{ + return offset <= 127 && offset >= -128; +} + +#define EMIT_JMP(offset) \ +do { \ + if (offset) { \ + if (is_near(offset)) \ + EMIT2(0xeb, offset); /* jmp .+off8 */ \ + else \ + EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \ + } \ +} while (0) + +/* list of x86 cond jumps opcodes (. + s8) + * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32) + */ +#define X86_JB 0x72 +#define X86_JAE 0x73 +#define X86_JE 0x74 +#define X86_JNE 0x75 +#define X86_JBE 0x76 +#define X86_JA 0x77 + +#define EMIT_COND_JMP(op, offset) \ +do { \ + if (is_near(offset)) \ + EMIT2(op, offset); /* jxx .+off8 */ \ + else { \ + EMIT2(0x0f, op + 0x10); \ + EMIT(offset, 4); /* jxx .+off32 */ \ + } \ +} while (0) + +#define COND_SEL(CODE, TOP, FOP) \ + case CODE: \ + t_op = TOP; \ + f_op = FOP; \ + goto cond_branch + + +#define SEEN_DATAREF 1 /* might call external helpers */ +#define SEEN_XREG 2 /* ebx is used */ +#define SEEN_MEM 4 /* use mem[] for temporary storage */ + +static inline void bpf_flush_icache(void *start, void *end) +{ + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + smp_wmb(); + flush_icache_range((unsigned long)start, (unsigned long)end); + set_fs(old_fs); +} + + +void bpf_jit_compile(struct sk_filter *fp) +{ + u8 temp[64]; + u8 *prog; + unsigned int proglen, oldproglen = 0; + int ilen, i; + int t_offset, f_offset; + u8 t_op, f_op, seen = 0, pass; + u8 *image = NULL; + u8 *func; + int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */ + unsigned int cleanup_addr; /* epilogue code offset */ + unsigned int *addrs; + const struct sock_filter *filter = fp->insns; + int flen = fp->len; + + if (!bpf_jit_enable) + return; + + addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL); + if (addrs == NULL) + return; + + /* Before first pass, make a rough estimation of addrs[] + * each bpf instruction is translated to less than 64 bytes + */ + for (proglen = 0, i = 0; i < flen; i++) { + proglen += 64; + addrs[i] = proglen; + } + cleanup_addr = proglen; /* epilogue address */ + + for (pass = 0; pass < 10; pass++) { + /* no prologue/epilogue for trivial filters (RET something) */ + proglen = 0; + prog = temp; + + if (seen) { + EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */ + EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */ + /* note : must save %rbx in case bpf_error is hit */ + if (seen & (SEEN_XREG | SEEN_DATAREF)) + EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */ + if (seen & SEEN_XREG) + CLEAR_X(); /* make sure we dont leek kernel memory */ + + /* + * If this filter needs to access skb data, + * loads r9 and r8 with : + * r9 = skb->len - skb->data_len + * r8 = skb->data + */ + if (seen & SEEN_DATAREF) { + if (offsetof(struct sk_buff, len) <= 127) + /* mov off8(%rdi),%r9d */ + EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len)); + else { + /* mov off32(%rdi),%r9d */ + EMIT3(0x44, 0x8b, 0x8f); + EMIT(offsetof(struct sk_buff, len), 4); + } + if (is_imm8(offsetof(struct sk_buff, data_len))) + /* sub off8(%rdi),%r9d */ + EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len)); + else { + EMIT3(0x44, 0x2b, 0x8f); + EMIT(offsetof(struct sk_buff, data_len), 4); + } + + if (is_imm8(offsetof(struct sk_buff, data))) + /* mov off8(%rdi),%r8 */ + EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data)); + else { + /* mov off32(%rdi),%r8 */ + EMIT3(0x4c, 0x8b, 0x87); + EMIT(offsetof(struct sk_buff, data), 4); + } + } + } + + switch (filter[0].code) { + case BPF_S_RET_K: + case BPF_S_LD_W_LEN: + case BPF_S_ANC_PROTOCOL: + case BPF_S_ANC_IFINDEX: + case BPF_S_ANC_MARK: + case BPF_S_ANC_RXHASH: + case BPF_S_ANC_CPU: + case BPF_S_ANC_QUEUE: + case BPF_S_LD_W_ABS: + case BPF_S_LD_H_ABS: + case BPF_S_LD_B_ABS: + /* first instruction sets A register (or is RET 'constant') */ + break; + default: + /* make sure we dont leak kernel information to user */ + CLEAR_A(); /* A = 0 */ + } + + for (i = 0; i < flen; i++) { + unsigned int K = filter[i].k; + + switch (filter[i].code) { + case BPF_S_ALU_ADD_X: /* A += X; */ + seen |= SEEN_XREG; + EMIT2(0x01, 0xd8); /* add %ebx,%eax */ + break; + case BPF_S_ALU_ADD_K: /* A += K; */ + if (!K) + break; + if (is_imm8(K)) + EMIT3(0x83, 0xc0, K); /* add imm8,%eax */ + else + EMIT1_off32(0x05, K); /* add imm32,%eax */ + break; + case BPF_S_ALU_SUB_X: /* A -= X; */ + seen |= SEEN_XREG; + EMIT2(0x29, 0xd8); /* sub %ebx,%eax */ + break; + case BPF_S_ALU_SUB_K: /* A -= K */ + if (!K) + break; + if (is_imm8(K)) + EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */ + else + EMIT1_off32(0x2d, K); /* sub imm32,%eax */ + break; + case BPF_S_ALU_MUL_X: /* A *= X; */ + seen |= SEEN_XREG; + EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */ + break; + case BPF_S_ALU_MUL_K: /* A *= K */ + if (is_imm8(K)) + EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */ + else { + EMIT2(0x69, 0xc0); /* imul imm32,%eax */ + EMIT(K, 4); + } + break; + case BPF_S_ALU_DIV_X: /* A /= X; */ + seen |= SEEN_XREG; + EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ + if (pc_ret0 != -1) + EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4)); + else { + EMIT_COND_JMP(X86_JNE, 2 + 5); + CLEAR_A(); + EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */ + } + EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */ + break; + case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */ + EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */ + EMIT(K, 4); + EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */ + break; + case BPF_S_ALU_AND_X: + seen |= SEEN_XREG; + EMIT2(0x21, 0xd8); /* and %ebx,%eax */ + break; + case BPF_S_ALU_AND_K: + if (K >= 0xFFFFFF00) { + EMIT2(0x24, K & 0xFF); /* and imm8,%al */ + } else if (K >= 0xFFFF0000) { + EMIT2(0x66, 0x25); /* and imm16,%ax */ + EMIT2(K, 2); + } else { + EMIT1_off32(0x25, K); /* and imm32,%eax */ + } + break; + case BPF_S_ALU_OR_X: + seen |= SEEN_XREG; + EMIT2(0x09, 0xd8); /* or %ebx,%eax */ + break; + case BPF_S_ALU_OR_K: + if (is_imm8(K)) + EMIT3(0x83, 0xc8, K); /* or imm8,%eax */ + else + EMIT1_off32(0x0d, K); /* or imm32,%eax */ + break; + case BPF_S_ALU_LSH_X: /* A <<= X; */ + seen |= SEEN_XREG; + EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ + break; + case BPF_S_ALU_LSH_K: + if (K == 0) + break; + else if (K == 1) + EMIT2(0xd1, 0xe0); /* shl %eax */ + else + EMIT3(0xc1, 0xe0, K); + break; + case BPF_S_ALU_RSH_X: /* A >>= X; */ + seen |= SEEN_XREG; + EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */ + break; + case BPF_S_ALU_RSH_K: /* A >>= K; */ + if (K == 0) + break; + else if (K == 1) + EMIT2(0xd1, 0xe8); /* shr %eax */ + else + EMIT3(0xc1, 0xe8, K); + break; + case BPF_S_ALU_NEG: + EMIT2(0xf7, 0xd8); /* neg %eax */ + break; + case BPF_S_RET_K: + if (!K) { + if (pc_ret0 == -1) + pc_ret0 = i; + CLEAR_A(); + } else { + EMIT1_off32(0xb8, K); /* mov $imm32,%eax */ + } + /* fallinto */ + case BPF_S_RET_A: + if (seen) { + if (i != flen - 1) { + EMIT_JMP(cleanup_addr - addrs[i]); + break; + } + if (seen & SEEN_XREG) + EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */ + EMIT1(0xc9); /* leaveq */ + } + EMIT1(0xc3); /* ret */ + break; + case BPF_S_MISC_TAX: /* X = A */ + seen |= SEEN_XREG; + EMIT2(0x89, 0xc3); /* mov %eax,%ebx */ + break; + case BPF_S_MISC_TXA: /* A = X */ + seen |= SEEN_XREG; + EMIT2(0x89, 0xd8); /* mov %ebx,%eax */ + break; + case BPF_S_LD_IMM: /* A = K */ + if (!K) + CLEAR_A(); + else + EMIT1_off32(0xb8, K); /* mov $imm32,%eax */ + break; + case BPF_S_LDX_IMM: /* X = K */ + seen |= SEEN_XREG; + if (!K) + CLEAR_X(); + else + EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */ + break; + case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */ + seen |= SEEN_MEM; + EMIT3(0x8b, 0x45, 0xf0 - K*4); + break; + case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */ + seen |= SEEN_XREG | SEEN_MEM; + EMIT3(0x8b, 0x5d, 0xf0 - K*4); + break; + case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */ + seen |= SEEN_MEM; + EMIT3(0x89, 0x45, 0xf0 - K*4); + break; + case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */ + seen |= SEEN_XREG | SEEN_MEM; + EMIT3(0x89, 0x5d, 0xf0 - K*4); + break; + case BPF_S_LD_W_LEN: /* A = skb->len; */ + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); + if (is_imm8(offsetof(struct sk_buff, len))) + /* mov off8(%rdi),%eax */ + EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len)); + else { + EMIT2(0x8b, 0x87); + EMIT(offsetof(struct sk_buff, len), 4); + } + break; + case BPF_S_LDX_W_LEN: /* X = skb->len; */ + seen |= SEEN_XREG; + if (is_imm8(offsetof(struct sk_buff, len))) + /* mov off8(%rdi),%ebx */ + EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len)); + else { + EMIT2(0x8b, 0x9f); + EMIT(offsetof(struct sk_buff, len), 4); + } + break; + case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */ + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + if (is_imm8(offsetof(struct sk_buff, protocol))) { + /* movzwl off8(%rdi),%eax */ + EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol)); + } else { + EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ + EMIT(offsetof(struct sk_buff, protocol), 4); + } + EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */ + break; + case BPF_S_ANC_IFINDEX: + if (is_imm8(offsetof(struct sk_buff, dev))) { + /* movq off8(%rdi),%rax */ + EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev)); + } else { + EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */ + EMIT(offsetof(struct sk_buff, dev), 4); + } + EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */ + EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6)); + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */ + EMIT(offsetof(struct net_device, ifindex), 4); + break; + case BPF_S_ANC_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + if (is_imm8(offsetof(struct sk_buff, mark))) { + /* mov off8(%rdi),%eax */ + EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark)); + } else { + EMIT2(0x8b, 0x87); + EMIT(offsetof(struct sk_buff, mark), 4); + } + break; + case BPF_S_ANC_RXHASH: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4); + if (is_imm8(offsetof(struct sk_buff, rxhash))) { + /* mov off8(%rdi),%eax */ + EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash)); + } else { + EMIT2(0x8b, 0x87); + EMIT(offsetof(struct sk_buff, rxhash), 4); + } + break; + case BPF_S_ANC_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + if (is_imm8(offsetof(struct sk_buff, queue_mapping))) { + /* movzwl off8(%rdi),%eax */ + EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping)); + } else { + EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ + EMIT(offsetof(struct sk_buff, queue_mapping), 4); + } + break; + case BPF_S_ANC_CPU: +#ifdef CONFIG_SMP + EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */ + EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */ +#else + CLEAR_A(); +#endif + break; + case BPF_S_LD_W_ABS: + func = sk_load_word; +common_load: seen |= SEEN_DATAREF; + if ((int)K < 0) + goto out; + t_offset = func - (image + addrs[i]); + EMIT1_off32(0xbe, K); /* mov imm32,%esi */ + EMIT1_off32(0xe8, t_offset); /* call */ + break; + case BPF_S_LD_H_ABS: + func = sk_load_half; + goto common_load; + case BPF_S_LD_B_ABS: + func = sk_load_byte; + goto common_load; + case BPF_S_LDX_B_MSH: + if ((int)K < 0) { + if (pc_ret0 != -1) { + EMIT_JMP(addrs[pc_ret0] - addrs[i]); + break; + } + CLEAR_A(); + EMIT_JMP(cleanup_addr - addrs[i]); + break; + } + seen |= SEEN_DATAREF | SEEN_XREG; + t_offset = sk_load_byte_msh - (image + addrs[i]); + EMIT1_off32(0xbe, K); /* mov imm32,%esi */ + EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */ + break; + case BPF_S_LD_W_IND: + func = sk_load_word_ind; +common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; + t_offset = func - (image + addrs[i]); + EMIT1_off32(0xbe, K); /* mov imm32,%esi */ + EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */ + break; + case BPF_S_LD_H_IND: + func = sk_load_half_ind; + goto common_load_ind; + case BPF_S_LD_B_IND: + func = sk_load_byte_ind; + goto common_load_ind; + case BPF_S_JMP_JA: + t_offset = addrs[i + K] - addrs[i]; + EMIT_JMP(t_offset); + break; + COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE); + COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB); + COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE); + COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE); + COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE); + COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB); + COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE); + COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE); + +cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; + t_offset = addrs[i + filter[i].jt] - addrs[i]; + + /* same targets, can avoid doing the test :) */ + if (filter[i].jt == filter[i].jf) { + EMIT_JMP(t_offset); + break; + } + + switch (filter[i].code) { + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JEQ_X: + seen |= SEEN_XREG; + EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */ + break; + case BPF_S_JMP_JSET_X: + seen |= SEEN_XREG; + EMIT2(0x85, 0xd8); /* test %ebx,%eax */ + break; + case BPF_S_JMP_JEQ_K: + if (K == 0) { + EMIT2(0x85, 0xc0); /* test %eax,%eax */ + break; + } + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGE_K: + if (K <= 127) + EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */ + else + EMIT1_off32(0x3d, K); /* cmp imm32,%eax */ + break; + case BPF_S_JMP_JSET_K: + if (K <= 0xFF) + EMIT2(0xa8, K); /* test imm8,%al */ + else if (!(K & 0xFFFF00FF)) + EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */ + else if (K <= 0xFFFF) { + EMIT2(0x66, 0xa9); /* test imm16,%ax */ + EMIT(K, 2); + } else { + EMIT1_off32(0xa9, K); /* test imm32,%eax */ + } + break; + } + if (filter[i].jt != 0) { + if (filter[i].jf) + t_offset += is_near(f_offset) ? 2 : 6; + EMIT_COND_JMP(t_op, t_offset); + if (filter[i].jf) + EMIT_JMP(f_offset); + break; + } + EMIT_COND_JMP(f_op, f_offset); + break; + default: + /* hmm, too complex filter, give up with jit compiler */ + goto out; + } + ilen = prog - temp; + if (image) { + if (unlikely(proglen + ilen > oldproglen)) { + pr_err("bpb_jit_compile fatal error\n"); + kfree(addrs); + module_free(NULL, image); + return; + } + memcpy(image + proglen, temp, ilen); + } + proglen += ilen; + addrs[i] = proglen; + prog = temp; + } + /* last bpf instruction is always a RET : + * use it to give the cleanup instruction(s) addr + */ + cleanup_addr = proglen - 1; /* ret */ + if (seen) + cleanup_addr -= 1; /* leaveq */ + if (seen & SEEN_XREG) + cleanup_addr -= 4; /* mov -8(%rbp),%rbx */ + + if (image) { + WARN_ON(proglen != oldproglen); + break; + } + if (proglen == oldproglen) { + image = module_alloc(max_t(unsigned int, + proglen, + sizeof(struct work_struct))); + if (!image) + goto out; + } + oldproglen = proglen; + } + if (bpf_jit_enable > 1) + pr_err("flen=%d proglen=%u pass=%d image=%p\n", + flen, proglen, pass, image); + + if (image) { + if (bpf_jit_enable > 1) + print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS, + 16, 1, image, proglen, false); + + bpf_flush_icache(image, image + proglen); + + fp->bpf_func = (void *)image; + } +out: + kfree(addrs); + return; +} + +static void jit_free_defer(struct work_struct *arg) +{ + module_free(NULL, arg); +} + +/* run from softirq, we must use a work_struct to call + * module_free() from process context + */ +void bpf_jit_free(struct sk_filter *fp) +{ + if (fp->bpf_func != sk_run_filter) { + struct work_struct *work = (struct work_struct *)fp->bpf_func; + + INIT_WORK(work, jit_free_defer); + schedule_work(work); + } +} diff --git a/include/linux/filter.h b/include/linux/filter.h index 45266b75409..4609b85e559 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -135,6 +135,8 @@ struct sk_filter { atomic_t refcnt; unsigned int len; /* Number of filter blocks */ + unsigned int (*bpf_func)(const struct sk_buff *skb, + const struct sock_filter *filter); struct rcu_head rcu; struct sock_filter insns[0]; }; @@ -153,6 +155,80 @@ extern unsigned int sk_run_filter(const struct sk_buff *skb, extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); extern int sk_detach_filter(struct sock *sk); extern int sk_chk_filter(struct sock_filter *filter, int flen); + +#ifdef CONFIG_BPF_JIT +extern void bpf_jit_compile(struct sk_filter *fp); +extern void bpf_jit_free(struct sk_filter *fp); +#define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns) +#else +static inline void bpf_jit_compile(struct sk_filter *fp) +{ +} +static inline void bpf_jit_free(struct sk_filter *fp) +{ +} +#define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns) +#endif + +enum { + BPF_S_RET_K = 1, + BPF_S_RET_A, + BPF_S_ALU_ADD_K, + BPF_S_ALU_ADD_X, + BPF_S_ALU_SUB_K, + BPF_S_ALU_SUB_X, + BPF_S_ALU_MUL_K, + BPF_S_ALU_MUL_X, + BPF_S_ALU_DIV_X, + BPF_S_ALU_AND_K, + BPF_S_ALU_AND_X, + BPF_S_ALU_OR_K, + BPF_S_ALU_OR_X, + BPF_S_ALU_LSH_K, + BPF_S_ALU_LSH_X, + BPF_S_ALU_RSH_K, + BPF_S_ALU_RSH_X, + BPF_S_ALU_NEG, + BPF_S_LD_W_ABS, + BPF_S_LD_H_ABS, + BPF_S_LD_B_ABS, + BPF_S_LD_W_LEN, + BPF_S_LD_W_IND, + BPF_S_LD_H_IND, + BPF_S_LD_B_IND, + BPF_S_LD_IMM, + BPF_S_LDX_W_LEN, + BPF_S_LDX_B_MSH, + BPF_S_LDX_IMM, + BPF_S_MISC_TAX, + BPF_S_MISC_TXA, + BPF_S_ALU_DIV_K, + BPF_S_LD_MEM, + BPF_S_LDX_MEM, + BPF_S_ST, + BPF_S_STX, + BPF_S_JMP_JA, + BPF_S_JMP_JEQ_K, + BPF_S_JMP_JEQ_X, + BPF_S_JMP_JGE_K, + BPF_S_JMP_JGE_X, + BPF_S_JMP_JGT_K, + BPF_S_JMP_JGT_X, + BPF_S_JMP_JSET_K, + BPF_S_JMP_JSET_X, + /* Ancillary data */ + BPF_S_ANC_PROTOCOL, + BPF_S_ANC_PKTTYPE, + BPF_S_ANC_IFINDEX, + BPF_S_ANC_NLATTR, + BPF_S_ANC_NLATTR_NEST, + BPF_S_ANC_MARK, + BPF_S_ANC_QUEUE, + BPF_S_ANC_HATYPE, + BPF_S_ANC_RXHASH, + BPF_S_ANC_CPU, +}; + #endif /* __KERNEL__ */ #endif /* __LINUX_FILTER_H__ */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cb8178ab3c5..364bcf212f7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2514,6 +2514,7 @@ extern struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; extern int weight_p; +extern int bpf_jit_enable; extern int netdev_set_master(struct net_device *dev, struct net_device *master); extern int netdev_set_bond_master(struct net_device *dev, struct net_device *master); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d0ae90af0b4..79aafbbf430 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -391,8 +391,8 @@ struct sk_buff { __u32 rxhash; + __u16 queue_mapping; kmemcheck_bitfield_begin(flags2); - __u16 queue_mapping:16; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif diff --git a/net/Kconfig b/net/Kconfig index 79cabf1ee68..745fb02d2fd 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -232,6 +232,19 @@ config XPS depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS default y +config HAVE_BPF_JIT + bool + +config BPF_JIT + bool "enable BPF Just In Time compiler" + depends on HAVE_BPF_JIT + ---help--- + Berkeley Packet Filter filtering capabilities are normally handled + by an interpreter. This option allows kernel to generate a native + code when filter is loaded in memory. This should speedup + packet sniffing (libpcap/tcpdump). Note : Admin should enable + this feature changing /proc/sys/net/core/bpf_jit_enable + menu "Network testing" config NET_PKTGEN diff --git a/net/core/filter.c b/net/core/filter.c index afb8afb066b..0eb8c4466ea 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -39,65 +39,6 @@ #include #include -enum { - BPF_S_RET_K = 1, - BPF_S_RET_A, - BPF_S_ALU_ADD_K, - BPF_S_ALU_ADD_X, - BPF_S_ALU_SUB_K, - BPF_S_ALU_SUB_X, - BPF_S_ALU_MUL_K, - BPF_S_ALU_MUL_X, - BPF_S_ALU_DIV_X, - BPF_S_ALU_AND_K, - BPF_S_ALU_AND_X, - BPF_S_ALU_OR_K, - BPF_S_ALU_OR_X, - BPF_S_ALU_LSH_K, - BPF_S_ALU_LSH_X, - BPF_S_ALU_RSH_K, - BPF_S_ALU_RSH_X, - BPF_S_ALU_NEG, - BPF_S_LD_W_ABS, - BPF_S_LD_H_ABS, - BPF_S_LD_B_ABS, - BPF_S_LD_W_LEN, - BPF_S_LD_W_IND, - BPF_S_LD_H_IND, - BPF_S_LD_B_IND, - BPF_S_LD_IMM, - BPF_S_LDX_W_LEN, - BPF_S_LDX_B_MSH, - BPF_S_LDX_IMM, - BPF_S_MISC_TAX, - BPF_S_MISC_TXA, - BPF_S_ALU_DIV_K, - BPF_S_LD_MEM, - BPF_S_LDX_MEM, - BPF_S_ST, - BPF_S_STX, - BPF_S_JMP_JA, - BPF_S_JMP_JEQ_K, - BPF_S_JMP_JEQ_X, - BPF_S_JMP_JGE_K, - BPF_S_JMP_JGE_X, - BPF_S_JMP_JGT_K, - BPF_S_JMP_JGT_X, - BPF_S_JMP_JSET_K, - BPF_S_JMP_JSET_X, - /* Ancillary data */ - BPF_S_ANC_PROTOCOL, - BPF_S_ANC_PKTTYPE, - BPF_S_ANC_IFINDEX, - BPF_S_ANC_NLATTR, - BPF_S_ANC_NLATTR_NEST, - BPF_S_ANC_MARK, - BPF_S_ANC_QUEUE, - BPF_S_ANC_HATYPE, - BPF_S_ANC_RXHASH, - BPF_S_ANC_CPU, -}; - /* No hurry in this branch */ static void *__load_pointer(const struct sk_buff *skb, int k, unsigned int size) { @@ -145,7 +86,7 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { - unsigned int pkt_len = sk_run_filter(skb, filter->insns); + unsigned int pkt_len = SK_RUN_FILTER(filter, skb); err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; } @@ -638,6 +579,7 @@ void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); + bpf_jit_free(fp); kfree(fp); } EXPORT_SYMBOL(sk_filter_release_rcu); @@ -672,6 +614,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) atomic_set(&fp->refcnt, 1); fp->len = fprog->len; + fp->bpf_func = sk_run_filter; err = sk_chk_filter(fp->insns, fp->len); if (err) { @@ -679,6 +622,8 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) return err; } + bpf_jit_compile(fp); + old_fp = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk)); rcu_assign_pointer(sk->sk_filter, fp); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 385b6095fdc..a829e3f60ae 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -122,6 +122,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_BPF_JIT + { + .procname = "bpf_jit_enable", + .data = &bpf_jit_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif { .procname = "netdev_tstamp_prequeue", .data = &netdev_tstamp_prequeue, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b5362e96022..549527bca87 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -538,7 +538,7 @@ static inline unsigned int run_filter(const struct sk_buff *skb, rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) - res = sk_run_filter(skb, filter->insns); + res = SK_RUN_FILTER(filter, skb); rcu_read_unlock(); return res; -- cgit v1.2.3-70-g09d2 From 1742f183fc218798dab6fcf0ded25b6608fc0a48 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Fri, 22 Apr 2011 06:31:16 +0000 Subject: net: fix netdev_increment_features() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify and fix netdev_increment_features() to conform to what is stated in netdevice.h comments about NETIF_F_ONE_FOR_ALL. Include FCoE segmentation and VLAN-challedged flags in computation. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++++- net/core/dev.c | 35 +++++++++++------------------------ 2 files changed, 17 insertions(+), 25 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 364bcf212f7..b7d0304762a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1106,7 +1106,12 @@ struct net_device { */ #define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \ NETIF_F_SG | NETIF_F_HIGHDMA | \ - NETIF_F_FRAGLIST) + NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED) + /* + * If one device doesn't support one of these features, then disable it + * for all in netdev_increment_features. + */ +#define NETIF_F_ALL_FOR_ALL (NETIF_F_NOCACHE_COPY | NETIF_F_FSO) /* changeable features with no special hardware requirements */ #define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO) diff --git a/net/core/dev.c b/net/core/dev.c index 3bbb4c2ce92..7db99b52679 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6164,33 +6164,20 @@ static int dev_cpu_callback(struct notifier_block *nfb, */ u32 netdev_increment_features(u32 all, u32 one, u32 mask) { - /* If device needs checksumming, downgrade to it. */ - if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) - all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); - else if (mask & NETIF_F_ALL_CSUM) { - /* If one device supports v4/v6 checksumming, set for all. */ - if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) && - !(all & NETIF_F_GEN_CSUM)) { - all &= ~NETIF_F_ALL_CSUM; - all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); - } - - /* If one device supports hw checksumming, set for all. */ - if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { - all &= ~NETIF_F_ALL_CSUM; - all |= NETIF_F_HW_CSUM; - } - } + if (mask & NETIF_F_GEN_CSUM) + mask |= NETIF_F_ALL_CSUM; + mask |= NETIF_F_VLAN_CHALLENGED; - /* If device can't no cache copy, don't do for all */ - if (!(one & NETIF_F_NOCACHE_COPY)) - all &= ~NETIF_F_NOCACHE_COPY; + all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; + all &= one | ~NETIF_F_ALL_FOR_ALL; - one |= NETIF_F_ALL_CSUM; + /* If device needs checksumming, downgrade to it. */ + if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM)) + all &= ~NETIF_F_NO_CSUM; - one |= all & NETIF_F_ONE_FOR_ALL; - all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; - all |= one & mask & NETIF_F_ONE_FOR_ALL; + /* If one device supports hw checksumming, set for all. */ + if (all & NETIF_F_GEN_CSUM) + all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); return all; } -- cgit v1.2.3-70-g09d2 From fa2bd7ff9247f4218dfc907db14d000cd7edd862 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Fri, 22 Apr 2011 06:31:16 +0000 Subject: net: allow user to change NETIF_F_HIGHDMA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NETIF_F_HIGHDMA is like any other TX offloads, so allow user to toggle it. This is needed later for bridge and bonding convertsion to hw_features. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b7d0304762a..e03af35843b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1080,7 +1080,7 @@ struct net_device { /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ -#define NETIF_F_NEVER_CHANGE (NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | \ +#define NETIF_F_NEVER_CHANGE (NETIF_F_VLAN_CHALLENGED | \ NETIF_F_LLTX | NETIF_F_NETNS_LOCAL) #define NETIF_F_ETHTOOL_BITS (0x7f3fffff & ~NETIF_F_NEVER_CHANGE) @@ -1098,6 +1098,7 @@ struct net_device { #define NETIF_F_ALL_TX_OFFLOADS (NETIF_F_ALL_CSUM | NETIF_F_SG | \ NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \ + NETIF_F_HIGHDMA | \ NETIF_F_SCTP_CSUM | NETIF_F_FCOE_CRC) /* -- cgit v1.2.3-70-g09d2 From 8ae6daca85c8bbd6a32c382db5e2a2a989f8bed2 Mon Sep 17 00:00:00 2001 From: David Decotigny Date: Wed, 27 Apr 2011 18:32:38 +0000 Subject: ethtool: Call ethtool's get/set_settings callbacks with cleaned data This makes sure that when a driver calls the ethtool's get/set_settings() callback of another driver, the data passed to it is clean. This guarantees that speed_hi will be zeroed correctly if the called callback doesn't explicitely set it: we are sure we don't get a corrupted speed from the underlying driver. We also take care of setting the cmd field appropriately (ETHTOOL_GSET/SSET). This applies to dev_ethtool_get_settings(), which now makes sure it sets up that ethtool command parameter correctly before passing it to drivers. This also means that whoever calls dev_ethtool_get_settings() does not have to clean the ethtool command parameter. This function also becomes an exported symbol instead of an inline. All drivers visible to make allyesconfig under x86_64 have been updated. Signed-off-by: David Decotigny Signed-off-by: David S. Miller --- arch/mips/txx9/generic/setup_tx4939.c | 21 ++++++++------------- drivers/net/e100.c | 2 +- drivers/net/mdio.c | 3 +++ drivers/net/mii.c | 3 +++ drivers/net/pch_gbe/pch_gbe_main.c | 6 +++--- drivers/net/pch_gbe/pch_gbe_phy.c | 2 +- drivers/net/pcnet32.c | 16 ++++++++-------- drivers/net/sfc/mdio_10g.c | 4 ++-- drivers/net/stmmac/stmmac_ethtool.c | 5 ++--- drivers/net/usb/asix.c | 28 +++++++++++++++------------- drivers/net/usb/dm9601.c | 6 +++--- drivers/net/usb/smsc75xx.c | 7 ++++--- drivers/net/usb/smsc95xx.c | 7 ++++--- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 11 +++++++---- drivers/scsi/fcoe/fcoe.c | 11 +++++++---- include/linux/ethtool.h | 4 +++- include/linux/netdevice.h | 9 ++------- include/rdma/ib_addr.h | 13 +++++++------ net/core/dev.c | 24 ++++++++++++++++++++++++ net/core/net-sysfs.c | 24 ++++++++++-------------- 20 files changed, 117 insertions(+), 89 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/arch/mips/txx9/generic/setup_tx4939.c b/arch/mips/txx9/generic/setup_tx4939.c index 3dc19f48295..e9f95dcde37 100644 --- a/arch/mips/txx9/generic/setup_tx4939.c +++ b/arch/mips/txx9/generic/setup_tx4939.c @@ -318,19 +318,15 @@ void __init tx4939_sio_init(unsigned int sclk, unsigned int cts_mask) } #if defined(CONFIG_TC35815) || defined(CONFIG_TC35815_MODULE) -static int tx4939_get_eth_speed(struct net_device *dev) +static u32 tx4939_get_eth_speed(struct net_device *dev) { - struct ethtool_cmd cmd = { ETHTOOL_GSET }; - int speed = 100; /* default 100Mbps */ - int err; - if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) - return speed; - err = dev->ethtool_ops->get_settings(dev, &cmd); - if (err < 0) - return speed; - speed = cmd.speed == SPEED_100 ? 100 : 10; - return speed; + struct ethtool_cmd cmd; + if (dev_ethtool_get_settings(dev, &cmd)) + return 100; /* default 100Mbps */ + + return ethtool_cmd_speed(&cmd); } + static int tx4939_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -343,8 +339,7 @@ static int tx4939_netdev_event(struct notifier_block *this, else if (dev->irq == TXX9_IRQ_BASE + TX4939_IR_ETH(1)) bit = TX4939_PCFG_SPEED1; if (bit) { - int speed = tx4939_get_eth_speed(dev); - if (speed == 100) + if (tx4939_get_eth_speed(dev) == 100) txx9_set64(&tx4939_ccfgptr->pcfg, bit); else txx9_clear64(&tx4939_ccfgptr->pcfg, bit); diff --git a/drivers/net/e100.c b/drivers/net/e100.c index b0aa9e68990..66ba596a4d3 100644 --- a/drivers/net/e100.c +++ b/drivers/net/e100.c @@ -1668,7 +1668,7 @@ static void e100_adjust_adaptive_ifs(struct nic *nic, int speed, int duplex) static void e100_watchdog(unsigned long data) { struct nic *nic = (struct nic *)data; - struct ethtool_cmd cmd; + struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET }; netif_printk(nic, timer, KERN_DEBUG, nic->netdev, "right now = %ld\n", jiffies); diff --git a/drivers/net/mdio.c b/drivers/net/mdio.c index e85bf04cf81..f2d10abd040 100644 --- a/drivers/net/mdio.c +++ b/drivers/net/mdio.c @@ -176,6 +176,9 @@ static u32 mdio45_get_an(const struct mdio_if_info *mdio, u16 addr) * @npage_adv: Modes currently advertised on next pages * @npage_lpa: Modes advertised by link partner on next pages * + * The @ecmd parameter is expected to have been cleared before calling + * mdio45_ethtool_gset_npage(). + * * Since the CSRs for auto-negotiation using next pages are not fully * standardised, this function does not attempt to decode them. The * caller must pass them in. diff --git a/drivers/net/mii.c b/drivers/net/mii.c index 0a6c6a2e755..05acca78f63 100644 --- a/drivers/net/mii.c +++ b/drivers/net/mii.c @@ -58,6 +58,9 @@ static u32 mii_get_an(struct mii_if_info *mii, u16 addr) * @mii: MII interface * @ecmd: requested ethtool_cmd * + * The @ecmd parameter is expected to have been cleared before calling + * mii_ethtool_gset(). + * * Returns 0 for success, negative on error. */ int mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd) diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c index 4cc9872f5ec..f3e4b0adae9 100644 --- a/drivers/net/pch_gbe/pch_gbe_main.c +++ b/drivers/net/pch_gbe/pch_gbe_main.c @@ -888,12 +888,12 @@ static void pch_gbe_watchdog(unsigned long data) struct pch_gbe_adapter *adapter = (struct pch_gbe_adapter *)data; struct net_device *netdev = adapter->netdev; struct pch_gbe_hw *hw = &adapter->hw; - struct ethtool_cmd cmd; pr_debug("right now = %ld\n", jiffies); pch_gbe_update_stats(adapter); if ((mii_link_ok(&adapter->mii)) && (!netif_carrier_ok(netdev))) { + struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET }; netdev->tx_queue_len = adapter->tx_queue_len; /* mii library handles link maintenance tasks */ if (mii_ethtool_gset(&adapter->mii, &cmd)) { @@ -903,7 +903,7 @@ static void pch_gbe_watchdog(unsigned long data) PCH_GBE_WATCHDOG_PERIOD)); return; } - hw->mac.link_speed = cmd.speed; + hw->mac.link_speed = ethtool_cmd_speed(&cmd); hw->mac.link_duplex = cmd.duplex; /* Set the RGMII control. */ pch_gbe_set_rgmii_ctrl(adapter, hw->mac.link_speed, @@ -913,7 +913,7 @@ static void pch_gbe_watchdog(unsigned long data) hw->mac.link_duplex); netdev_dbg(netdev, "Link is Up %d Mbps %s-Duplex\n", - cmd.speed, + hw->mac.link_speed, cmd.duplex == DUPLEX_FULL ? "Full" : "Half"); netif_carrier_on(netdev); netif_wake_queue(netdev); diff --git a/drivers/net/pch_gbe/pch_gbe_phy.c b/drivers/net/pch_gbe/pch_gbe_phy.c index 923a687acd3..9a8207f686f 100644 --- a/drivers/net/pch_gbe/pch_gbe_phy.c +++ b/drivers/net/pch_gbe/pch_gbe_phy.c @@ -247,7 +247,7 @@ inline void pch_gbe_phy_set_rgmii(struct pch_gbe_hw *hw) void pch_gbe_phy_init_setting(struct pch_gbe_hw *hw) { struct pch_gbe_adapter *adapter; - struct ethtool_cmd cmd; + struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET }; int ret; u16 mii_reg; diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index 0a1efbae1bc..b48aba9e422 100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -2099,7 +2099,7 @@ static int pcnet32_open(struct net_device *dev) int first_phy = -1; u16 bmcr; u32 bcr9; - struct ethtool_cmd ecmd; + struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; /* * There is really no good other way to handle multiple PHYs @@ -2115,9 +2115,9 @@ static int pcnet32_open(struct net_device *dev) ecmd.port = PORT_MII; ecmd.transceiver = XCVR_INTERNAL; ecmd.autoneg = AUTONEG_DISABLE; - ecmd.speed = - lp-> - options & PCNET32_PORT_100 ? SPEED_100 : SPEED_10; + ethtool_cmd_speed_set(&ecmd, + (lp->options & PCNET32_PORT_100) ? + SPEED_100 : SPEED_10); bcr9 = lp->a.read_bcr(ioaddr, 9); if (lp->options & PCNET32_PORT_FD) { @@ -2763,11 +2763,11 @@ static void pcnet32_check_media(struct net_device *dev, int verbose) netif_carrier_on(dev); if (lp->mii) { if (netif_msg_link(lp)) { - struct ethtool_cmd ecmd; + struct ethtool_cmd ecmd = { + .cmd = ETHTOOL_GSET }; mii_ethtool_gset(&lp->mii_if, &ecmd); - netdev_info(dev, "link up, %sMbps, %s-duplex\n", - (ecmd.speed == SPEED_100) - ? "100" : "10", + netdev_info(dev, "link up, %uMbps, %s-duplex\n", + ethtool_cmd_speed(&ecmd), (ecmd.duplex == DUPLEX_FULL) ? "full" : "half"); } diff --git a/drivers/net/sfc/mdio_10g.c b/drivers/net/sfc/mdio_10g.c index 19e68c26d10..71159145b4b 100644 --- a/drivers/net/sfc/mdio_10g.c +++ b/drivers/net/sfc/mdio_10g.c @@ -232,12 +232,12 @@ void efx_mdio_set_mmds_lpower(struct efx_nic *efx, */ int efx_mdio_set_settings(struct efx_nic *efx, struct ethtool_cmd *ecmd) { - struct ethtool_cmd prev; + struct ethtool_cmd prev = { .cmd = ETHTOOL_GSET }; efx->phy_op->get_settings(efx, &prev); if (ecmd->advertising == prev.advertising && - ecmd->speed == prev.speed && + ethtool_cmd_speed(ecmd) == ethtool_cmd_speed(&prev) && ecmd->duplex == prev.duplex && ecmd->port == prev.port && ecmd->autoneg == prev.autoneg) diff --git a/drivers/net/stmmac/stmmac_ethtool.c b/drivers/net/stmmac/stmmac_ethtool.c index 0e61ac8707c..6f5aaeb986f 100644 --- a/drivers/net/stmmac/stmmac_ethtool.c +++ b/drivers/net/stmmac/stmmac_ethtool.c @@ -237,13 +237,12 @@ stmmac_set_pauseparam(struct net_device *netdev, if (phy->autoneg) { if (netif_running(netdev)) { - struct ethtool_cmd cmd; + struct ethtool_cmd cmd = { .cmd = ETHTOOL_SSET }; /* auto-negotiation automatically restarted */ - cmd.cmd = ETHTOOL_NWAY_RST; cmd.supported = phy->supported; cmd.advertising = phy->advertising; cmd.autoneg = phy->autoneg; - cmd.speed = phy->speed; + ethtool_cmd_speed_set(&cmd, phy->speed); cmd.duplex = phy->duplex; cmd.phy_address = phy->addr; ret = phy_ethtool_sset(phy, &cmd); diff --git a/drivers/net/usb/asix.c b/drivers/net/usb/asix.c index 6140b56cce5..6998aa6b7bb 100644 --- a/drivers/net/usb/asix.c +++ b/drivers/net/usb/asix.c @@ -847,7 +847,7 @@ static void ax88172_set_multicast(struct net_device *net) static int ax88172_link_reset(struct usbnet *dev) { u8 mode; - struct ethtool_cmd ecmd; + struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); @@ -856,8 +856,8 @@ static int ax88172_link_reset(struct usbnet *dev) if (ecmd.duplex != DUPLEX_FULL) mode |= ~AX88172_MEDIUM_FD; - netdev_dbg(dev->net, "ax88172_link_reset() speed: %d duplex: %d setting mode to 0x%04x\n", - ecmd.speed, ecmd.duplex, mode); + netdev_dbg(dev->net, "ax88172_link_reset() speed: %u duplex: %d setting mode to 0x%04x\n", + ethtool_cmd_speed(&ecmd), ecmd.duplex, mode); asix_write_medium_mode(dev, mode); @@ -947,20 +947,20 @@ static const struct ethtool_ops ax88772_ethtool_ops = { static int ax88772_link_reset(struct usbnet *dev) { u16 mode; - struct ethtool_cmd ecmd; + struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); mode = AX88772_MEDIUM_DEFAULT; - if (ecmd.speed != SPEED_100) + if (ethtool_cmd_speed(&ecmd) != SPEED_100) mode &= ~AX_MEDIUM_PS; if (ecmd.duplex != DUPLEX_FULL) mode &= ~AX_MEDIUM_FD; - netdev_dbg(dev->net, "ax88772_link_reset() speed: %d duplex: %d setting mode to 0x%04x\n", - ecmd.speed, ecmd.duplex, mode); + netdev_dbg(dev->net, "ax88772_link_reset() speed: %u duplex: %d setting mode to 0x%04x\n", + ethtool_cmd_speed(&ecmd), ecmd.duplex, mode); asix_write_medium_mode(dev, mode); @@ -1173,18 +1173,20 @@ static int marvell_led_status(struct usbnet *dev, u16 speed) static int ax88178_link_reset(struct usbnet *dev) { u16 mode; - struct ethtool_cmd ecmd; + struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; struct asix_data *data = (struct asix_data *)&dev->data; + u32 speed; netdev_dbg(dev->net, "ax88178_link_reset()\n"); mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); mode = AX88178_MEDIUM_DEFAULT; + speed = ethtool_cmd_speed(&ecmd); - if (ecmd.speed == SPEED_1000) + if (speed == SPEED_1000) mode |= AX_MEDIUM_GM; - else if (ecmd.speed == SPEED_100) + else if (speed == SPEED_100) mode |= AX_MEDIUM_PS; else mode &= ~(AX_MEDIUM_PS | AX_MEDIUM_GM); @@ -1196,13 +1198,13 @@ static int ax88178_link_reset(struct usbnet *dev) else mode &= ~AX_MEDIUM_FD; - netdev_dbg(dev->net, "ax88178_link_reset() speed: %d duplex: %d setting mode to 0x%04x\n", - ecmd.speed, ecmd.duplex, mode); + netdev_dbg(dev->net, "ax88178_link_reset() speed: %u duplex: %d setting mode to 0x%04x\n", + speed, ecmd.duplex, mode); asix_write_medium_mode(dev, mode); if (data->phymode == PHY_MODE_MARVELL && data->ledmode) - marvell_led_status(dev, ecmd.speed); + marvell_led_status(dev, speed); return 0; } diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c index 5002f5be47b..1d93133e9b7 100644 --- a/drivers/net/usb/dm9601.c +++ b/drivers/net/usb/dm9601.c @@ -599,13 +599,13 @@ static void dm9601_status(struct usbnet *dev, struct urb *urb) static int dm9601_link_reset(struct usbnet *dev) { - struct ethtool_cmd ecmd; + struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); - netdev_dbg(dev->net, "link_reset() speed: %d duplex: %d\n", - ecmd.speed, ecmd.duplex); + netdev_dbg(dev->net, "link_reset() speed: %u duplex: %d\n", + ethtool_cmd_speed(&ecmd), ecmd.duplex); return 0; } diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index 860a20c938b..15b3d6888ae 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -503,7 +503,7 @@ static int smsc75xx_update_flowcontrol(struct usbnet *dev, u8 duplex, static int smsc75xx_link_reset(struct usbnet *dev) { struct mii_if_info *mii = &dev->mii; - struct ethtool_cmd ecmd; + struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; u16 lcladv, rmtadv; int ret; @@ -519,8 +519,9 @@ static int smsc75xx_link_reset(struct usbnet *dev) lcladv = smsc75xx_mdio_read(dev->net, mii->phy_id, MII_ADVERTISE); rmtadv = smsc75xx_mdio_read(dev->net, mii->phy_id, MII_LPA); - netif_dbg(dev, link, dev->net, "speed: %d duplex: %d lcladv: %04x" - " rmtadv: %04x", ecmd.speed, ecmd.duplex, lcladv, rmtadv); + netif_dbg(dev, link, dev->net, "speed: %u duplex: %d lcladv: %04x" + " rmtadv: %04x", ethtool_cmd_speed(&ecmd), + ecmd.duplex, lcladv, rmtadv); return smsc75xx_update_flowcontrol(dev, ecmd.duplex, lcladv, rmtadv); } diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index 24f4b3739dd..b374a999790 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -457,7 +457,7 @@ static int smsc95xx_link_reset(struct usbnet *dev) { struct smsc95xx_priv *pdata = (struct smsc95xx_priv *)(dev->data[0]); struct mii_if_info *mii = &dev->mii; - struct ethtool_cmd ecmd; + struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; unsigned long flags; u16 lcladv, rmtadv; u32 intdata; @@ -472,8 +472,9 @@ static int smsc95xx_link_reset(struct usbnet *dev) lcladv = smsc95xx_mdio_read(dev->net, mii->phy_id, MII_ADVERTISE); rmtadv = smsc95xx_mdio_read(dev->net, mii->phy_id, MII_LPA); - netif_dbg(dev, link, dev->net, "speed: %d duplex: %d lcladv: %04x rmtadv: %04x\n", - ecmd.speed, ecmd.duplex, lcladv, rmtadv); + netif_dbg(dev, link, dev->net, + "speed: %u duplex: %d lcladv: %04x rmtadv: %04x\n", + ethtool_cmd_speed(&ecmd), ecmd.duplex, lcladv, rmtadv); spin_lock_irqsave(&pdata->mac_cr_lock, flags); if (ecmd.duplex != DUPLEX_FULL) { diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index e2e647509a7..cd050196a16 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -664,7 +664,7 @@ static void bnx2fc_link_speed_update(struct fc_lport *lport) struct fcoe_port *port = lport_priv(lport); struct bnx2fc_hba *hba = port->priv; struct net_device *netdev = hba->netdev; - struct ethtool_cmd ecmd = { ETHTOOL_GSET }; + struct ethtool_cmd ecmd; if (!dev_ethtool_get_settings(netdev, &ecmd)) { lport->link_supported_speeds &= @@ -675,12 +675,15 @@ static void bnx2fc_link_speed_update(struct fc_lport *lport) if (ecmd.supported & SUPPORTED_10000baseT_Full) lport->link_supported_speeds |= FC_PORTSPEED_10GBIT; - if (ecmd.speed == SPEED_1000) + switch (ethtool_cmd_speed(&ecmd)) { + case SPEED_1000: lport->link_speed = FC_PORTSPEED_1GBIT; - if (ecmd.speed == SPEED_10000) + break; + case SPEED_10000: lport->link_speed = FC_PORTSPEED_10GBIT; + break; + } } - return; } static int bnx2fc_link_ok(struct fc_lport *lport) { diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index bde6ee5333e..04f346b562d 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -2026,7 +2026,7 @@ out_nodev: int fcoe_link_speed_update(struct fc_lport *lport) { struct net_device *netdev = fcoe_netdev(lport); - struct ethtool_cmd ecmd = { ETHTOOL_GSET }; + struct ethtool_cmd ecmd; if (!dev_ethtool_get_settings(netdev, &ecmd)) { lport->link_supported_speeds &= @@ -2037,11 +2037,14 @@ int fcoe_link_speed_update(struct fc_lport *lport) if (ecmd.supported & SUPPORTED_10000baseT_Full) lport->link_supported_speeds |= FC_PORTSPEED_10GBIT; - if (ecmd.speed == SPEED_1000) + switch (ethtool_cmd_speed(&ecmd)) { + case SPEED_1000: lport->link_speed = FC_PORTSPEED_1GBIT; - if (ecmd.speed == SPEED_10000) + break; + case SPEED_10000: lport->link_speed = FC_PORTSPEED_10GBIT; - + break; + } return 0; } return -1; diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 7e6e0a89ca2..4194a2067a1 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -744,7 +744,9 @@ bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported); /** * struct ethtool_ops - optional netdev operations * @get_settings: Get various device settings including Ethernet link - * settings. Returns a negative error code or zero. + * settings. The @cmd parameter is expected to have been cleared + * before get_settings is called. Returns a negative error code or + * zero. * @set_settings: Set various device settings including Ethernet link * settings. Returns a negative error code or zero. * @get_drvinfo: Report driver/device information. Should only set the diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e03af35843b..d5de66af46f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2597,13 +2597,8 @@ static inline int netif_is_bond_slave(struct net_device *dev) extern struct pernet_operations __net_initdata loopback_net_ops; -static inline int dev_ethtool_get_settings(struct net_device *dev, - struct ethtool_cmd *cmd) -{ - if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) - return -EOPNOTSUPP; - return dev->ethtool_ops->get_settings(dev, cmd); -} +int dev_ethtool_get_settings(struct net_device *dev, + struct ethtool_cmd *cmd); static inline u32 dev_ethtool_get_rx_csum(struct net_device *dev) { diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index b5fc9f39122..ae8c68f30f1 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -217,18 +217,19 @@ static inline enum ib_mtu iboe_get_mtu(int mtu) static inline int iboe_get_rate(struct net_device *dev) { struct ethtool_cmd cmd; + u32 speed; - if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings || - dev->ethtool_ops->get_settings(dev, &cmd)) + if (dev_ethtool_get_settings(dev, &cmd)) return IB_RATE_PORT_CURRENT; - if (cmd.speed >= 40000) + speed = ethtool_cmd_speed(&cmd); + if (speed >= 40000) return IB_RATE_40_GBPS; - else if (cmd.speed >= 30000) + else if (speed >= 30000) return IB_RATE_30_GBPS; - else if (cmd.speed >= 20000) + else if (speed >= 20000) return IB_RATE_20_GBPS; - else if (cmd.speed >= 10000) + else if (speed >= 10000) return IB_RATE_10_GBPS; else return IB_RATE_PORT_CURRENT; diff --git a/net/core/dev.c b/net/core/dev.c index 7db99b52679..e95dc30110e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4495,6 +4495,30 @@ void dev_set_rx_mode(struct net_device *dev) netif_addr_unlock_bh(dev); } +/** + * dev_ethtool_get_settings - call device's ethtool_ops::get_settings() + * @dev: device + * @cmd: memory area for ethtool_ops::get_settings() result + * + * The cmd arg is initialized properly (cleared and + * ethtool_cmd::cmd field set to ETHTOOL_GSET). + * + * Return device's ethtool_ops::get_settings() result value or + * -EOPNOTSUPP when device doesn't expose + * ethtool_ops::get_settings() operation. + */ +int dev_ethtool_get_settings(struct net_device *dev, + struct ethtool_cmd *cmd) +{ + if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + memset(cmd, 0, sizeof(struct ethtool_cmd)); + cmd->cmd = ETHTOOL_GSET; + return dev->ethtool_ops->get_settings(dev, cmd); +} +EXPORT_SYMBOL(dev_ethtool_get_settings); + /** * dev_get_flags - get flags reported to userspace * @dev: device diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5ceb257e860..381813eae46 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -28,6 +28,7 @@ static const char fmt_hex[] = "%#x\n"; static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; +static const char fmt_udec[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -145,13 +146,10 @@ static ssize_t show_speed(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev) && - netdev->ethtool_ops && - netdev->ethtool_ops->get_settings) { - struct ethtool_cmd cmd = { ETHTOOL_GSET }; - - if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) - ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); + if (netif_running(netdev)) { + struct ethtool_cmd cmd; + if (!dev_ethtool_get_settings(netdev, &cmd)) + ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); } rtnl_unlock(); return ret; @@ -166,13 +164,11 @@ static ssize_t show_duplex(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev) && - netdev->ethtool_ops && - netdev->ethtool_ops->get_settings) { - struct ethtool_cmd cmd = { ETHTOOL_GSET }; - - if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) - ret = sprintf(buf, "%s\n", cmd.duplex ? "full" : "half"); + if (netif_running(netdev)) { + struct ethtool_cmd cmd; + if (!dev_ethtool_get_settings(netdev, &cmd)) + ret = sprintf(buf, "%s\n", + cmd.duplex ? "full" : "half"); } rtnl_unlock(); return ret; -- cgit v1.2.3-70-g09d2 From eed2a12f1ed9aabf0676f4d0db34aad51976c5c6 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Wed, 4 May 2011 15:30:11 +0000 Subject: net: Allow ethtool to set interface in loopback mode. This patch enables ethtool to set the loopback mode on a given interface. By configuring the interface in loopback mode in conjunction with a policy route / rule, a userland application can stress the egress / ingress path exposing the flows of the change in progress and potentially help developer(s) understand the impact of those changes without even sending a packet out on the network. Following set of commands illustrates one such example - a) ip -4 addr add 192.168.1.1/24 dev eth1 b) ip -4 rule add from all iif eth1 lookup 250 c) ip -4 route add local 0/0 dev lo proto kernel scope host table 250 d) arp -Ds 192.168.1.100 eth1 e) arp -Ds 192.168.1.200 eth1 f) sysctl -w net.ipv4.ip_nonlocal_bind=1 g) sysctl -w net.ipv4.conf.all.accept_local=1 # Assuming that the machine has 8 cores h) taskset 000f netserver -L 192.168.1.200 i) taskset 00f0 netperf -t TCP_CRR -L 192.168.1.100 -H 192.168.1.200 -l 30 Signed-off-by: Mahesh Bandewar Acked-by: Ben Hutchings Signed-off-by: David S. Miller --- drivers/net/loopback.c | 3 ++- include/linux/netdevice.h | 3 ++- net/core/ethtool.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index d70fb76edb7..4ce9e5f2c06 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -174,7 +174,8 @@ static void loopback_setup(struct net_device *dev) | NETIF_F_HIGHDMA | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL - | NETIF_F_VLAN_CHALLENGED; + | NETIF_F_VLAN_CHALLENGED + | NETIF_F_LOOPBACK; dev->ethtool_ops = &loopback_ethtool_ops; dev->header_ops = ð_header_ops; dev->netdev_ops = &loopback_ops; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d5de66af46f..e7244ed1f9a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1067,6 +1067,7 @@ struct net_device { #define NETIF_F_RXHASH (1 << 28) /* Receive hashing offload */ #define NETIF_F_RXCSUM (1 << 29) /* Receive checksumming offload */ #define NETIF_F_NOCACHE_COPY (1 << 30) /* Use no-cache copyfromuser */ +#define NETIF_F_LOOPBACK (1 << 31) /* Enable loopback */ /* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 @@ -1082,7 +1083,7 @@ struct net_device { /* = all defined minus driver/device-class-related */ #define NETIF_F_NEVER_CHANGE (NETIF_F_VLAN_CHALLENGED | \ NETIF_F_LLTX | NETIF_F_NETNS_LOCAL) -#define NETIF_F_ETHTOOL_BITS (0x7f3fffff & ~NETIF_F_NEVER_CHANGE) +#define NETIF_F_ETHTOOL_BITS (0xff3fffff & ~NETIF_F_NEVER_CHANGE) /* List of features with software fallbacks. */ #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | \ diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 927819d9224..b6f40588853 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -362,7 +362,7 @@ static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GS /* NETIF_F_RXHASH */ "rx-hashing", /* NETIF_F_RXCSUM */ "rx-checksum", /* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy" - "", + /* NETIF_F_LOOPBACK */ "loopback", }; static int __ethtool_get_sset_count(struct net_device *dev, int sset) -- cgit v1.2.3-70-g09d2 From 7f267051bd7a280265b1b5ead58e9c6e4e1ac3a4 Mon Sep 17 00:00:00 2001 From: Yi Zou Date: Mon, 9 May 2011 11:53:27 +0000 Subject: net: group FCoE related feature flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Michał Mirosław's patch (http://patchwork.ozlabs.org/patch/94421/) fixes the issue (http://patchwork.ozlabs.org/patch/94188/) about not populating FCoE related flags correctly on vlan devices. However, only NETIF_F_FCOE_CRC is part of the NETIF_F_ALL_TX_OFFLOADS right now, where weed NETIF_F_FCOE_MTU and NETIF_F_FSO as well. Therefore, add NETIF_F_ALL_FCOE to indicate feature flags used by FCoE TX offloads. These include NETIF_F_FCOE_CRC, NETIF_F_FCOE_MTU, and NETIF_F_FSO and add them to be part of NETIF_F_ALL_TX_OFFLOADS. This would eventually make sure all FCoE needed flags are populated properly to vlan devices. Signed-off-by: Yi Zou Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e7244ed1f9a..00d650c7480 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1097,10 +1097,14 @@ struct net_device { #define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) +#define NETIF_F_ALL_FCOE (NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \ + NETIF_F_FSO) + #define NETIF_F_ALL_TX_OFFLOADS (NETIF_F_ALL_CSUM | NETIF_F_SG | \ NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \ NETIF_F_HIGHDMA | \ - NETIF_F_SCTP_CSUM | NETIF_F_FCOE_CRC) + NETIF_F_SCTP_CSUM | \ + NETIF_F_ALL_FCOE) /* * If one device supports one of these features, then enable them -- cgit v1.2.3-70-g09d2 From afe12cc86b0ba545a01ad8716539ab07ab6e9e89 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Sat, 7 May 2011 03:22:17 +0000 Subject: net: introduce netdev_change_features() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It will be needed by bonding and other drivers changing vlan_features after ndo_init callback. As a bonus, this includes kernel-doc for netdev_update_features(). Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 00d650c7480..1d9696a9ee4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2565,6 +2565,7 @@ u32 netdev_increment_features(u32 all, u32 one, u32 mask); u32 netdev_fix_features(struct net_device *dev, u32 features); int __netdev_update_features(struct net_device *dev); void netdev_update_features(struct net_device *dev); +void netdev_change_features(struct net_device *dev); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index 75898a32c03..ea23353e625 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5289,6 +5289,14 @@ int __netdev_update_features(struct net_device *dev) return 1; } +/** + * netdev_update_features - recalculate device features + * @dev: the device to check + * + * Recalculate dev->features set and send notifications if it + * has changed. Should be called after driver or hardware dependent + * conditions might have changed that influence the features. + */ void netdev_update_features(struct net_device *dev) { if (__netdev_update_features(dev)) @@ -5296,6 +5304,23 @@ void netdev_update_features(struct net_device *dev) } EXPORT_SYMBOL(netdev_update_features); +/** + * netdev_change_features - recalculate device features + * @dev: the device to check + * + * Recalculate dev->features set and send notifications even + * if they have not changed. Should be called instead of + * netdev_update_features() if also dev->vlan_features might + * have changed to allow the changes to be propagated to stacked + * VLAN devices. + */ +void netdev_change_features(struct net_device *dev) +{ + __netdev_update_features(dev); + netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_change_features); + /** * netif_stacked_transfer_operstate - transfer operstate * @rootdev: the root or lower level device to transfer state from -- cgit v1.2.3-70-g09d2 From bdc220da3209d50b8c295490dec94845c88670a2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 9 May 2011 17:42:46 +0000 Subject: netdevice.h: Align struct net_device members Save a bit of space. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1d9696a9ee4..a134d809125 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1020,9 +1020,6 @@ struct net_device { * part of the usual set specified in Space.c. */ - unsigned char if_port; /* Selectable AUI, TP,..*/ - unsigned char dma; /* DMA channel */ - unsigned long state; struct list_head dev_list; @@ -1146,13 +1143,16 @@ struct net_device { const struct header_ops *header_ops; unsigned int flags; /* interface flags (a la BSD) */ + unsigned int priv_flags; /* Like 'flags' but invisible to userspace. */ unsigned short gflags; - unsigned int priv_flags; /* Like 'flags' but invisible to userspace. */ unsigned short padded; /* How much padding added by alloc_netdev() */ unsigned char operstate; /* RFC2863 operstate */ unsigned char link_mode; /* mapping policy to operstate */ + unsigned char if_port; /* Selectable AUI, TP,..*/ + unsigned char dma; /* DMA channel */ + unsigned int mtu; /* interface MTU value */ unsigned short type; /* interface hardware type */ unsigned short hard_header_len; /* hardware hdr length */ -- cgit v1.2.3-70-g09d2 From 449f4544267e73d5db372971da63634707c32299 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 May 2011 12:24:16 +0000 Subject: macvlan: remove one synchronize_rcu() call When one macvlan device is dismantled, we can avoid one synchronize_rcu() call done after deletion from hash list, since caller will perform a synchronize_net() call after its ndo_stop() call. Add a new netdev->dismantle field to signal this dismantle intent. Reduces RTNL hold time. Signed-off-by: Eric Dumazet CC: Patrick McHardy CC: Ben Greear Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 9 +++++---- include/linux/netdevice.h | 4 +++- net/core/dev.c | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d7c0bc62da7..07bcb8084d7 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -70,16 +70,17 @@ static void macvlan_hash_add(struct macvlan_dev *vlan) hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[addr[5]]); } -static void macvlan_hash_del(struct macvlan_dev *vlan) +static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync) { hlist_del_rcu(&vlan->hlist); - synchronize_rcu(); + if (sync) + synchronize_rcu(); } static void macvlan_hash_change_addr(struct macvlan_dev *vlan, const unsigned char *addr) { - macvlan_hash_del(vlan); + macvlan_hash_del(vlan, true); /* Now that we are unhashed it is safe to change the device * address without confusing packet delivery. */ @@ -345,7 +346,7 @@ static int macvlan_stop(struct net_device *dev) dev_uc_del(lowerdev, dev->dev_addr); hash_del: - macvlan_hash_del(vlan); + macvlan_hash_del(vlan, !dev->dismantle); return 0; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a134d809125..ca333e79e10 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1293,7 +1293,9 @@ struct net_device { NETREG_UNREGISTERED, /* completed unregister todo */ NETREG_RELEASED, /* called free_netdev */ NETREG_DUMMY, /* dummy device for NAPI poll */ - } reg_state:16; + } reg_state:8; + + bool dismantle; /* device is going do be freed */ enum { RTNL_LINK_INITIALIZED, diff --git a/net/core/dev.c b/net/core/dev.c index 155de2094e7..d94537914a7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5126,7 +5126,7 @@ static void rollback_registered_many(struct list_head *head) list_del(&dev->unreg_list); continue; } - + dev->dismantle = true; BUG_ON(dev->reg_state != NETREG_REGISTERED); } -- cgit v1.2.3-70-g09d2 From 3019de124b9f5b1526cb3668b74af14371e21795 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 6 Jun 2011 16:41:33 -0700 Subject: net: Rework netdev_drivername() to avoid warning. This interface uses a temporary buffer, but for no real reason. And now can generate warnings like: net/sched/sch_generic.c: In function dev_watchdog net/sched/sch_generic.c:254:10: warning: unused variable drivername Just return driver->name directly or "". Reported-by: Connor Hansen Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/dev.c | 16 +++++----------- net/sched/sch_generic.c | 3 +-- 3 files changed, 7 insertions(+), 14 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ca333e79e10..54b8b4d7b68 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2555,7 +2555,7 @@ extern void netdev_class_remove_file(struct class_attribute *class_attr); extern struct kobj_ns_type_operations net_ns_type_operations; -extern char *netdev_drivername(const struct net_device *dev, char *buffer, int len); +extern const char *netdev_drivername(const struct net_device *dev); extern void linkwatch_run_queue(void); diff --git a/net/core/dev.c b/net/core/dev.c index 939307891e7..1af6cb27f67 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6264,29 +6264,23 @@ err_name: /** * netdev_drivername - network driver for the device * @dev: network device - * @buffer: buffer for resulting name - * @len: size of buffer * * Determine network driver for device. */ -char *netdev_drivername(const struct net_device *dev, char *buffer, int len) +const char *netdev_drivername(const struct net_device *dev) { const struct device_driver *driver; const struct device *parent; - - if (len <= 0 || !buffer) - return buffer; - buffer[0] = 0; + const char *empty = ""; parent = dev->dev.parent; - if (!parent) - return buffer; + return empty; driver = parent->driver; if (driver && driver->name) - strlcpy(buffer, driver->name, len); - return buffer; + return driver->name; + return empty; } static int __netdev_printk(const char *level, const struct net_device *dev, diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index b1721d71c27..b4c680900d7 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -251,9 +251,8 @@ static void dev_watchdog(unsigned long arg) } if (some_queue_timedout) { - char drivername[64]; WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", - dev->name, netdev_drivername(dev, drivername, 64), i); + dev->name, netdev_drivername(dev), i); dev->netdev_ops->ndo_tx_timeout(dev); } if (!mod_timer(&dev->watchdog_timer, -- cgit v1.2.3-70-g09d2