summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan.c2
-rw-r--r--net/8021q/vlan_dev.c14
-rw-r--r--net/9p/trans_rdma.c1
-rw-r--r--net/Kconfig6
-rw-r--r--net/Makefile4
-rw-r--r--net/atm/clip.c8
-rw-r--r--net/batman-adv/Makefile2
-rw-r--r--net/batman-adv/aggregation.c10
-rw-r--r--net/batman-adv/aggregation.h6
-rw-r--r--net/batman-adv/bat_debugfs.c6
-rw-r--r--net/batman-adv/bat_debugfs.h2
-rw-r--r--net/batman-adv/bat_sysfs.c53
-rw-r--r--net/batman-adv/bat_sysfs.h2
-rw-r--r--net/batman-adv/bitarray.c2
-rw-r--r--net/batman-adv/bitarray.h2
-rw-r--r--net/batman-adv/gateway_client.c142
-rw-r--r--net/batman-adv/gateway_client.h2
-rw-r--r--net/batman-adv/gateway_common.c2
-rw-r--r--net/batman-adv/gateway_common.h2
-rw-r--r--net/batman-adv/hard-interface.c420
-rw-r--r--net/batman-adv/hard-interface.h21
-rw-r--r--net/batman-adv/hash.c28
-rw-r--r--net/batman-adv/hash.h119
-rw-r--r--net/batman-adv/icmp_socket.c43
-rw-r--r--net/batman-adv/icmp_socket.h4
-rw-r--r--net/batman-adv/main.c16
-rw-r--r--net/batman-adv/main.h35
-rw-r--r--net/batman-adv/originator.c254
-rw-r--r--net/batman-adv/originator.h52
-rw-r--r--net/batman-adv/packet.h17
-rw-r--r--net/batman-adv/ring_buffer.c2
-rw-r--r--net/batman-adv/ring_buffer.h2
-rw-r--r--net/batman-adv/routing.c1000
-rw-r--r--net/batman-adv/routing.h30
-rw-r--r--net/batman-adv/send.c110
-rw-r--r--net/batman-adv/send.h12
-rw-r--r--net/batman-adv/soft-interface.c77
-rw-r--r--net/batman-adv/soft-interface.h5
-rw-r--r--net/batman-adv/translation-table.c208
-rw-r--r--net/batman-adv/translation-table.h4
-rw-r--r--net/batman-adv/types.h54
-rw-r--r--net/batman-adv/unicast.c142
-rw-r--r--net/batman-adv/unicast.h27
-rw-r--r--net/batman-adv/vis.c208
-rw-r--r--net/batman-adv/vis.h2
-rw-r--r--net/bridge/Kconfig1
-rw-r--r--net/bridge/br_device.c21
-rw-r--r--net/bridge/br_fdb.c4
-rw-r--r--net/bridge/br_if.c15
-rw-r--r--net/bridge/br_input.c2
-rw-r--r--net/bridge/br_multicast.c42
-rw-r--r--net/bridge/br_netfilter.c14
-rw-r--r--net/bridge/br_private.h5
-rw-r--r--net/bridge/br_stp.c35
-rw-r--r--net/bridge/br_stp_timer.c1
-rw-r--r--net/bridge/netfilter/ebt_ip6.c46
-rw-r--r--net/bridge/netfilter/ebtables.c3
-rw-r--r--net/caif/cfcnfg.c11
-rw-r--r--net/caif/cfdgml.c1
-rw-r--r--net/caif/cfserl.c1
-rw-r--r--net/caif/cfutill.c2
-rw-r--r--net/caif/cfveil.c2
-rw-r--r--net/caif/chnl_net.c4
-rw-r--r--net/can/bcm.c3
-rw-r--r--net/can/raw.c3
-rw-r--r--net/ceph/messenger.c133
-rw-r--r--net/ceph/pagevec.c18
-rw-r--r--net/core/dev.c514
-rw-r--r--net/core/dev_addr_lists.c2
-rw-r--r--net/core/dst.c43
-rw-r--r--net/core/ethtool.c606
-rw-r--r--net/core/fib_rules.c6
-rw-r--r--net/core/filter.c6
-rw-r--r--net/core/flow.c14
-rw-r--r--net/core/neighbour.c13
-rw-r--r--net/core/net-sysfs.c17
-rw-r--r--net/core/netpoll.c13
-rw-r--r--net/core/pktgen.c235
-rw-r--r--net/core/rtnetlink.c94
-rw-r--r--net/core/skbuff.c16
-rw-r--r--net/dcb/dcbnl.c172
-rw-r--r--net/dccp/ccids/ccid2.c9
-rw-r--r--net/dccp/input.c7
-rw-r--r--net/dccp/ipv4.c50
-rw-r--r--net/dccp/ipv6.c188
-rw-r--r--net/decnet/af_decnet.c16
-rw-r--r--net/decnet/dn_fib.c23
-rw-r--r--net/decnet/dn_nsp_out.c16
-rw-r--r--net/decnet/dn_route.c300
-rw-r--r--net/decnet/dn_rules.c17
-rw-r--r--net/decnet/dn_table.c7
-rw-r--r--net/dns_resolver/dns_key.c20
-rw-r--r--net/dsa/dsa.c2
-rw-r--r--net/dsa/mv88e6060.c7
-rw-r--r--net/econet/af_econet.c4
-rw-r--r--net/ipv4/Kconfig42
-rw-r--r--net/ipv4/Makefile4
-rw-r--r--net/ipv4/af_inet.c62
-rw-r--r--net/ipv4/ah4.c27
-rw-r--r--net/ipv4/arp.c36
-rw-r--r--net/ipv4/datagram.c11
-rw-r--r--net/ipv4/devinet.c110
-rw-r--r--net/ipv4/esp4.c104
-rw-r--r--net/ipv4/fib_frontend.c105
-rw-r--r--net/ipv4/fib_hash.c1133
-rw-r--r--net/ipv4/fib_lookup.h10
-rw-r--r--net/ipv4/fib_rules.c25
-rw-r--r--net/ipv4/fib_semantics.c257
-rw-r--r--net/ipv4/fib_trie.c272
-rw-r--r--net/ipv4/icmp.c240
-rw-r--r--net/ipv4/igmp.c45
-rw-r--r--net/ipv4/inet_connection_sock.c27
-rw-r--r--net/ipv4/inet_diag.c2
-rw-r--r--net/ipv4/inet_timewait_sock.c2
-rw-r--r--net/ipv4/inetpeer.c150
-rw-r--r--net/ipv4/ip_gre.c57
-rw-r--r--net/ipv4/ip_input.c2
-rw-r--r--net/ipv4/ip_output.c345
-rw-r--r--net/ipv4/ipip.c41
-rw-r--r--net/ipv4/ipmr.c155
-rw-r--r--net/ipv4/netfilter.c36
-rw-r--r--net/ipv4/netfilter/Kconfig3
-rw-r--r--net/ipv4/netfilter/arp_tables.c2
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c6
-rw-r--r--net/ipv4/netfilter/ip_tables.c2
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c7
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c3
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c2
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c17
-rw-r--r--net/ipv4/netfilter/nf_nat_amanda.c8
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c33
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_standalone.c9
-rw-r--r--net/ipv4/raw.c58
-rw-r--r--net/ipv4/route.c1186
-rw-r--r--net/ipv4/syncookies.c25
-rw-r--r--net/ipv4/tcp.c20
-rw-r--r--net/ipv4/tcp_bic.c2
-rw-r--r--net/ipv4/tcp_cubic.c2
-rw-r--r--net/ipv4/tcp_highspeed.c2
-rw-r--r--net/ipv4/tcp_htcp.c2
-rw-r--r--net/ipv4/tcp_hybla.c2
-rw-r--r--net/ipv4/tcp_illinois.c2
-rw-r--r--net/ipv4/tcp_input.c9
-rw-r--r--net/ipv4/tcp_ipv4.c38
-rw-r--r--net/ipv4/tcp_lp.c2
-rw-r--r--net/ipv4/tcp_output.c2
-rw-r--r--net/ipv4/tcp_scalable.c2
-rw-r--r--net/ipv4/tcp_timer.c3
-rw-r--r--net/ipv4/tcp_vegas.c2
-rw-r--r--net/ipv4/tcp_veno.c2
-rw-r--r--net/ipv4/tcp_westwood.c2
-rw-r--r--net/ipv4/tcp_yeah.c2
-rw-r--r--net/ipv4/udp.c139
-rw-r--r--net/ipv4/xfrm4_policy.c74
-rw-r--r--net/ipv4/xfrm4_state.c20
-rw-r--r--net/ipv6/addrconf.c87
-rw-r--r--net/ipv6/af_inet6.c49
-rw-r--r--net/ipv6/ah6.c2
-rw-r--r--net/ipv6/datagram.c88
-rw-r--r--net/ipv6/esp6.c109
-rw-r--r--net/ipv6/exthdrs.c12
-rw-r--r--net/ipv6/fib6_rules.c19
-rw-r--r--net/ipv6/icmp.c226
-rw-r--r--net/ipv6/inet6_connection_sock.c81
-rw-r--r--net/ipv6/ip6_fib.c4
-rw-r--r--net/ipv6/ip6_flowlabel.c6
-rw-r--r--net/ipv6/ip6_output.c156
-rw-r--r--net/ipv6/ip6_tunnel.c83
-rw-r--r--net/ipv6/ip6mr.c131
-rw-r--r--net/ipv6/ipv6_sockglue.c10
-rw-r--r--net/ipv6/mcast.c27
-rw-r--r--net/ipv6/mip6.c16
-rw-r--r--net/ipv6/ndisc.c22
-rw-r--r--net/ipv6/netfilter.c19
-rw-r--r--net/ipv6/netfilter/ip6_tables.c2
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c5
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c21
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c3
-rw-r--r--net/ipv6/raw.c125
-rw-r--r--net/ipv6/route.c223
-rw-r--r--net/ipv6/sit.c58
-rw-r--r--net/ipv6/syncookies.c31
-rw-r--r--net/ipv6/sysctl_net_ipv6.c9
-rw-r--r--net/ipv6/tcp_ipv6.c169
-rw-r--r--net/ipv6/udp.c91
-rw-r--r--net/ipv6/xfrm6_policy.c55
-rw-r--r--net/ipv6/xfrm6_state.c20
-rw-r--r--net/key/af_key.c243
-rw-r--r--net/l2tp/l2tp_ip.c36
-rw-r--r--net/llc/llc_input.c25
-rw-r--r--net/mac80211/Kconfig6
-rw-r--r--net/mac80211/iface.c1
-rw-r--r--net/netfilter/Kconfig66
-rw-r--r--net/netfilter/Makefile9
-rw-r--r--net/netfilter/core.c23
-rw-r--r--net/netfilter/ipset/Kconfig122
-rw-r--r--net/netfilter/ipset/Makefile24
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c587
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c652
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c515
-rw-r--r--net/netfilter/ipset/ip_set_core.c1671
-rw-r--r--net/netfilter/ipset/ip_set_getport.c141
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c464
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c544
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c562
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c628
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c458
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c578
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c584
-rw-r--r--net/netfilter/ipset/pfxlen.c291
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c98
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c237
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c400
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c904
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c134
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c61
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c82
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c99
-rw-r--r--net/netfilter/ipvs/ip_vs_lc.c20
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c17
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c129
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c45
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c153
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c142
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c110
-rw-r--r--net/netfilter/ipvs/ip_vs_rr.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c25
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c1239
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c22
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c14
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c117
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c82
-rw-r--r--net/netfilter/nf_conntrack_core.c68
-rw-r--r--net/netfilter/nf_conntrack_ecache.c3
-rw-r--r--net/netfilter/nf_conntrack_expect.c34
-rw-r--r--net/netfilter/nf_conntrack_extend.c11
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c32
-rw-r--r--net/netfilter/nf_conntrack_helper.c20
-rw-r--r--net/netfilter/nf_conntrack_netbios_ns.c74
-rw-r--r--net/netfilter/nf_conntrack_netlink.c54
-rw-r--r--net/netfilter/nf_conntrack_proto.c24
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c3
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c1
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c18
-rw-r--r--net/netfilter/nf_conntrack_snmp.c77
-rw-r--r--net/netfilter/nf_conntrack_standalone.c45
-rw-r--r--net/netfilter/nf_conntrack_timestamp.c120
-rw-r--r--net/netfilter/nf_log.c10
-rw-r--r--net/netfilter/nf_queue.c82
-rw-r--r--net/netfilter/nf_tproxy_core.c27
-rw-r--r--net/netfilter/nfnetlink_log.c9
-rw-r--r--net/netfilter/nfnetlink_queue.c22
-rw-r--r--net/netfilter/x_tables.c98
-rw-r--r--net/netfilter/xt_AUDIT.c204
-rw-r--r--net/netfilter/xt_CLASSIFY.c36
-rw-r--r--net/netfilter/xt_IDLETIMER.c2
-rw-r--r--net/netfilter/xt_LED.c2
-rw-r--r--net/netfilter/xt_NFQUEUE.c34
-rw-r--r--net/netfilter/xt_TCPMSS.c15
-rw-r--r--net/netfilter/xt_TEE.c27
-rw-r--r--net/netfilter/xt_TPROXY.c22
-rw-r--r--net/netfilter/xt_connlimit.c62
-rw-r--r--net/netfilter/xt_conntrack.c80
-rw-r--r--net/netfilter/xt_cpu.c2
-rw-r--r--net/netfilter/xt_devgroup.c82
-rw-r--r--net/netfilter/xt_iprange.c34
-rw-r--r--net/netfilter/xt_ipvs.c2
-rw-r--r--net/netfilter/xt_set.c359
-rw-r--r--net/netfilter/xt_socket.c13
-rw-r--r--net/netlabel/netlabel_user.h6
-rw-r--r--net/netlink/af_netlink.c27
-rw-r--r--net/netlink/genetlink.c2
-rw-r--r--net/packet/af_packet.c41
-rw-r--r--net/phonet/Kconfig12
-rw-r--r--net/phonet/af_phonet.c32
-rw-r--r--net/phonet/pep.c832
-rw-r--r--net/phonet/socket.c126
-rw-r--r--net/rds/ib_send.c5
-rw-r--r--net/rds/loop.c11
-rw-r--r--net/rds/rds.h1
-rw-r--r--net/rfkill/Kconfig4
-rw-r--r--net/rose/af_rose.c7
-rw-r--r--net/rose/rose_route.c28
-rw-r--r--net/rxrpc/ar-input.c1
-rw-r--r--net/rxrpc/ar-key.c8
-rw-r--r--net/rxrpc/ar-peer.c28
-rw-r--r--net/sched/Kconfig39
-rw-r--r--net/sched/Makefile4
-rw-r--r--net/sched/act_api.c46
-rw-r--r--net/sched/act_csum.c2
-rw-r--r--net/sched/act_gact.c8
-rw-r--r--net/sched/act_ipt.c16
-rw-r--r--net/sched/act_mirred.c4
-rw-r--r--net/sched/act_nat.c2
-rw-r--r--net/sched/act_pedit.c10
-rw-r--r--net/sched/act_police.c9
-rw-r--r--net/sched/act_simple.c10
-rw-r--r--net/sched/act_skbedit.c8
-rw-r--r--net/sched/cls_api.c33
-rw-r--r--net/sched/cls_basic.c17
-rw-r--r--net/sched/cls_cgroup.c8
-rw-r--r--net/sched/cls_flow.c6
-rw-r--r--net/sched/cls_fw.c38
-rw-r--r--net/sched/cls_route.c126
-rw-r--r--net/sched/cls_rsvp.h95
-rw-r--r--net/sched/cls_tcindex.c2
-rw-r--r--net/sched/cls_u32.c89
-rw-r--r--net/sched/em_cmp.c47
-rw-r--r--net/sched/em_meta.c48
-rw-r--r--net/sched/em_nbyte.c3
-rw-r--r--net/sched/em_text.c3
-rw-r--r--net/sched/em_u32.c2
-rw-r--r--net/sched/ematch.c37
-rw-r--r--net/sched/sch_api.c173
-rw-r--r--net/sched/sch_atm.c16
-rw-r--r--net/sched/sch_cbq.c365
-rw-r--r--net/sched/sch_choke.c688
-rw-r--r--net/sched/sch_drr.c2
-rw-r--r--net/sched/sch_dsmark.c23
-rw-r--r--net/sched/sch_fifo.c55
-rw-r--r--net/sched/sch_generic.c59
-rw-r--r--net/sched/sch_gred.c85
-rw-r--r--net/sched/sch_hfsc.c39
-rw-r--r--net/sched/sch_htb.c118
-rw-r--r--net/sched/sch_mq.c1
-rw-r--r--net/sched/sch_mqprio.c418
-rw-r--r--net/sched/sch_multiq.c10
-rw-r--r--net/sched/sch_netem.c412
-rw-r--r--net/sched/sch_prio.c36
-rw-r--r--net/sched/sch_red.c72
-rw-r--r--net/sched/sch_sfb.c709
-rw-r--r--net/sched/sch_sfq.c72
-rw-r--r--net/sched/sch_tbf.c41
-rw-r--r--net/sched/sch_teql.c39
-rw-r--r--net/sctp/associola.c2
-rw-r--r--net/sctp/input.c3
-rw-r--r--net/sctp/ipv6.c42
-rw-r--r--net/sctp/outqueue.c2
-rw-r--r--net/sctp/protocol.c33
-rw-r--r--net/sctp/sm_make_chunk.c13
-rw-r--r--net/sctp/socket.c15
-rw-r--r--net/sctp/tsnmap.c2
-rw-r--r--net/sctp/ulpqueue.c7
-rw-r--r--net/socket.c31
-rw-r--r--net/sunrpc/svcsock.c36
-rw-r--r--net/tipc/Kconfig12
-rw-r--r--net/tipc/addr.c15
-rw-r--r--net/tipc/addr.h17
-rw-r--r--net/tipc/bcast.c47
-rw-r--r--net/tipc/bcast.h3
-rw-r--r--net/tipc/bearer.c116
-rw-r--r--net/tipc/bearer.h73
-rw-r--r--net/tipc/config.c31
-rw-r--r--net/tipc/core.c9
-rw-r--r--net/tipc/core.h4
-rw-r--r--net/tipc/discover.c140
-rw-r--r--net/tipc/discover.h9
-rw-r--r--net/tipc/link.c130
-rw-r--r--net/tipc/link.h29
-rw-r--r--net/tipc/msg.c41
-rw-r--r--net/tipc/msg.h64
-rw-r--r--net/tipc/name_distr.c18
-rw-r--r--net/tipc/net.c32
-rw-r--r--net/tipc/net.h19
-rw-r--r--net/tipc/node.c125
-rw-r--r--net/tipc/node.h36
-rw-r--r--net/tipc/node_subscr.c21
-rw-r--r--net/tipc/node_subscr.h3
-rw-r--r--net/tipc/port.c306
-rw-r--r--net/tipc/port.h73
-rw-r--r--net/tipc/socket.c76
-rw-r--r--net/tipc/subscr.c13
-rw-r--r--net/unix/af_unix.c87
-rw-r--r--net/wanrouter/wanmain.c2
-rw-r--r--net/wireless/Kconfig2
-rw-r--r--net/wireless/wext-compat.c4
-rw-r--r--net/x25/x25_facilities.c28
-rw-r--r--net/x25/x25_in.c14
-rw-r--r--net/x25/x25_link.c5
-rw-r--r--net/xfrm/Makefile2
-rw-r--r--net/xfrm/xfrm_algo.c8
-rw-r--r--net/xfrm/xfrm_hash.h32
-rw-r--r--net/xfrm/xfrm_input.c13
-rw-r--r--net/xfrm/xfrm_output.c15
-rw-r--r--net/xfrm/xfrm_policy.c221
-rw-r--r--net/xfrm/xfrm_replay.c534
-rw-r--r--net/xfrm/xfrm_state.c175
-rw-r--r--net/xfrm/xfrm_user.c211
394 files changed, 24534 insertions, 10870 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 6e64f7c6a2e..7850412f52b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -327,7 +327,7 @@ static void vlan_sync_address(struct net_device *dev,
static void vlan_transfer_features(struct net_device *dev,
struct net_device *vlandev)
{
- unsigned long old_features = vlandev->features;
+ u32 old_features = vlandev->features;
vlandev->features &= ~dev->vlan_features;
vlandev->features |= dev->features & dev->vlan_features;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index be737539f34..ae610f046de 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -625,6 +625,19 @@ static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type)
rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type);
return rc;
}
+
+static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid,
+ struct scatterlist *sgl, unsigned int sgc)
+{
+ struct net_device *real_dev = vlan_dev_info(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int rc = 0;
+
+ if (ops->ndo_fcoe_ddp_target)
+ rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc);
+
+ return rc;
+}
#endif
static void vlan_dev_change_rx_flags(struct net_device *dev, int change)
@@ -858,6 +871,7 @@ static const struct net_device_ops vlan_netdev_ops = {
.ndo_fcoe_enable = vlan_dev_fcoe_enable,
.ndo_fcoe_disable = vlan_dev_fcoe_disable,
.ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn,
+ .ndo_fcoe_ddp_target = vlan_dev_fcoe_ddp_target,
#endif
};
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 17c5ba7551a..29a54ccd213 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -59,7 +59,6 @@
* safely advertise a maxsize
* of 64k */
-#define P9_RDMA_MAX_SGE (P9_RDMA_MAXSIZE >> PAGE_SHIFT)
/**
* struct p9_trans_rdma - RDMA transport instance
*
diff --git a/net/Kconfig b/net/Kconfig
index 72840626284..79cabf1ee68 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -221,6 +221,12 @@ config RPS
depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
default y
+config RFS_ACCEL
+ boolean
+ depends on RPS && GENERIC_HARDIRQS
+ select CPU_RMAP
+ default y
+
config XPS
boolean
depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
diff --git a/net/Makefile b/net/Makefile
index a3330ebe2c5..a51d9465e62 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -19,9 +19,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_INET) += ipv4/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX) += unix/
-ifneq ($(CONFIG_IPV6),)
-obj-y += ipv6/
-endif
+obj-$(CONFIG_NET) += ipv6/
obj-$(CONFIG_PACKET) += packet/
obj-$(CONFIG_NET_KEY) += key/
obj-$(CONFIG_BRIDGE) += bridge/
diff --git a/net/atm/clip.c b/net/atm/clip.c
index d257da50fcf..1d4be60e139 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -502,8 +502,6 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
struct atmarp_entry *entry;
int error;
struct clip_vcc *clip_vcc;
- struct flowi fl = { .fl4_dst = ip,
- .fl4_tos = 1 };
struct rtable *rt;
if (vcc->push != clip_push) {
@@ -520,9 +518,9 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
unlink_clip_vcc(clip_vcc);
return 0;
}
- error = ip_route_output_key(&init_net, &rt, &fl);
- if (error)
- return error;
+ rt = ip_route_output(&init_net, ip, 0, 1, 0);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
neigh = __neigh_lookup(&clip_tbl, &ip, rt->dst.dev, 1);
ip_rt_put(rt);
if (!neigh)
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index d936aeccd19..2de93d00631 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
#
# Marek Lindner, Simon Wunderlich
#
diff --git a/net/batman-adv/aggregation.c b/net/batman-adv/aggregation.c
index 3850a3ecf94..af45d6b2031 100644
--- a/net/batman-adv/aggregation.c
+++ b/net/batman-adv/aggregation.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -35,7 +35,7 @@ static bool can_aggregate_with(struct batman_packet *new_batman_packet,
int packet_len,
unsigned long send_time,
bool directlink,
- struct batman_if *if_incoming,
+ struct hard_iface *if_incoming,
struct forw_packet *forw_packet)
{
struct batman_packet *batman_packet =
@@ -99,7 +99,7 @@ static bool can_aggregate_with(struct batman_packet *new_batman_packet,
/* create a new aggregated packet and add this packet to it */
static void new_aggregated_packet(unsigned char *packet_buff, int packet_len,
unsigned long send_time, bool direct_link,
- struct batman_if *if_incoming,
+ struct hard_iface *if_incoming,
int own_packet)
{
struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
@@ -188,7 +188,7 @@ static void aggregate(struct forw_packet *forw_packet_aggr,
void add_bat_packet_to_list(struct bat_priv *bat_priv,
unsigned char *packet_buff, int packet_len,
- struct batman_if *if_incoming, char own_packet,
+ struct hard_iface *if_incoming, char own_packet,
unsigned long send_time)
{
/**
@@ -247,7 +247,7 @@ void add_bat_packet_to_list(struct bat_priv *bat_priv,
/* unpack the aggregated packets and process them one by one */
void receive_aggr_bat_packet(struct ethhdr *ethhdr, unsigned char *packet_buff,
- int packet_len, struct batman_if *if_incoming)
+ int packet_len, struct hard_iface *if_incoming)
{
struct batman_packet *batman_packet;
int buff_pos = 0;
diff --git a/net/batman-adv/aggregation.h b/net/batman-adv/aggregation.h
index 71a91b3da91..062204289d1 100644
--- a/net/batman-adv/aggregation.h
+++ b/net/batman-adv/aggregation.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -35,9 +35,9 @@ static inline int aggregated_packet(int buff_pos, int packet_len, int num_hna)
void add_bat_packet_to_list(struct bat_priv *bat_priv,
unsigned char *packet_buff, int packet_len,
- struct batman_if *if_incoming, char own_packet,
+ struct hard_iface *if_incoming, char own_packet,
unsigned long send_time);
void receive_aggr_bat_packet(struct ethhdr *ethhdr, unsigned char *packet_buff,
- int packet_len, struct batman_if *if_incoming);
+ int packet_len, struct hard_iface *if_incoming);
#endif /* _NET_BATMAN_ADV_AGGREGATION_H_ */
diff --git a/net/batman-adv/bat_debugfs.c b/net/batman-adv/bat_debugfs.c
index 0ae81d07f10..0e9d4350993 100644
--- a/net/batman-adv/bat_debugfs.c
+++ b/net/batman-adv/bat_debugfs.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -52,7 +52,6 @@ static void emit_log_char(struct debug_log *debug_log, char c)
static int fdebug_log(struct debug_log *debug_log, char *fmt, ...)
{
- int printed_len;
va_list args;
static char debug_log_buf[256];
char *p;
@@ -62,8 +61,7 @@ static int fdebug_log(struct debug_log *debug_log, char *fmt, ...)
spin_lock_bh(&debug_log->lock);
va_start(args, fmt);
- printed_len = vscnprintf(debug_log_buf, sizeof(debug_log_buf),
- fmt, args);
+ vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args);
va_end(args);
for (p = debug_log_buf; *p != 0; p++)
diff --git a/net/batman-adv/bat_debugfs.h b/net/batman-adv/bat_debugfs.h
index 72df532b7d5..bc9cda3f01e 100644
--- a/net/batman-adv/bat_debugfs.h
+++ b/net/batman-adv/bat_debugfs.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/bat_sysfs.c b/net/batman-adv/bat_sysfs.c
index cd7bb51825f..e449bf6353e 100644
--- a/net/batman-adv/bat_sysfs.c
+++ b/net/batman-adv/bat_sysfs.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -441,16 +441,16 @@ static ssize_t show_mesh_iface(struct kobject *kobj, struct attribute *attr,
char *buff)
{
struct net_device *net_dev = kobj_to_netdev(kobj);
- struct batman_if *batman_if = get_batman_if_by_netdev(net_dev);
+ struct hard_iface *hard_iface = hardif_get_by_netdev(net_dev);
ssize_t length;
- if (!batman_if)
+ if (!hard_iface)
return 0;
- length = sprintf(buff, "%s\n", batman_if->if_status == IF_NOT_IN_USE ?
- "none" : batman_if->soft_iface->name);
+ length = sprintf(buff, "%s\n", hard_iface->if_status == IF_NOT_IN_USE ?
+ "none" : hard_iface->soft_iface->name);
- kref_put(&batman_if->refcount, hardif_free_ref);
+ hardif_free_ref(hard_iface);
return length;
}
@@ -459,11 +459,11 @@ static ssize_t store_mesh_iface(struct kobject *kobj, struct attribute *attr,
char *buff, size_t count)
{
struct net_device *net_dev = kobj_to_netdev(kobj);
- struct batman_if *batman_if = get_batman_if_by_netdev(net_dev);
+ struct hard_iface *hard_iface = hardif_get_by_netdev(net_dev);
int status_tmp = -1;
- int ret;
+ int ret = count;
- if (!batman_if)
+ if (!hard_iface)
return count;
if (buff[count - 1] == '\n')
@@ -472,7 +472,7 @@ static ssize_t store_mesh_iface(struct kobject *kobj, struct attribute *attr,
if (strlen(buff) >= IFNAMSIZ) {
pr_err("Invalid parameter for 'mesh_iface' setting received: "
"interface name too long '%s'\n", buff);
- kref_put(&batman_if->refcount, hardif_free_ref);
+ hardif_free_ref(hard_iface);
return -EINVAL;
}
@@ -481,30 +481,31 @@ static ssize_t store_mesh_iface(struct kobject *kobj, struct attribute *attr,
else
status_tmp = IF_I_WANT_YOU;
- if ((batman_if->if_status == status_tmp) || ((batman_if->soft_iface) &&
- (strncmp(batman_if->soft_iface->name, buff, IFNAMSIZ) == 0))) {
- kref_put(&batman_if->refcount, hardif_free_ref);
- return count;
- }
+ if (hard_iface->if_status == status_tmp)
+ goto out;
+
+ if ((hard_iface->soft_iface) &&
+ (strncmp(hard_iface->soft_iface->name, buff, IFNAMSIZ) == 0))
+ goto out;
if (status_tmp == IF_NOT_IN_USE) {
rtnl_lock();
- hardif_disable_interface(batman_if);
+ hardif_disable_interface(hard_iface);
rtnl_unlock();
- kref_put(&batman_if->refcount, hardif_free_ref);
- return count;
+ goto out;
}
/* if the interface already is in use */
- if (batman_if->if_status != IF_NOT_IN_USE) {
+ if (hard_iface->if_status != IF_NOT_IN_USE) {
rtnl_lock();
- hardif_disable_interface(batman_if);
+ hardif_disable_interface(hard_iface);
rtnl_unlock();
}
- ret = hardif_enable_interface(batman_if, buff);
- kref_put(&batman_if->refcount, hardif_free_ref);
+ ret = hardif_enable_interface(hard_iface, buff);
+out:
+ hardif_free_ref(hard_iface);
return ret;
}
@@ -512,13 +513,13 @@ static ssize_t show_iface_status(struct kobject *kobj, struct attribute *attr,
char *buff)
{
struct net_device *net_dev = kobj_to_netdev(kobj);
- struct batman_if *batman_if = get_batman_if_by_netdev(net_dev);
+ struct hard_iface *hard_iface = hardif_get_by_netdev(net_dev);
ssize_t length;
- if (!batman_if)
+ if (!hard_iface)
return 0;
- switch (batman_if->if_status) {
+ switch (hard_iface->if_status) {
case IF_TO_BE_REMOVED:
length = sprintf(buff, "disabling\n");
break;
@@ -537,7 +538,7 @@ static ssize_t show_iface_status(struct kobject *kobj, struct attribute *attr,
break;
}
- kref_put(&batman_if->refcount, hardif_free_ref);
+ hardif_free_ref(hard_iface);
return length;
}
diff --git a/net/batman-adv/bat_sysfs.h b/net/batman-adv/bat_sysfs.h
index 7f186c007b4..02f1fa7aadf 100644
--- a/net/batman-adv/bat_sysfs.h
+++ b/net/batman-adv/bat_sysfs.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index bbcd8f744cd..ad2ca925b3e 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index ac54017601b..769c246d1fc 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 0065ffb8d96..3cc43558cf9 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -28,58 +28,75 @@
#include <linux/udp.h>
#include <linux/if_vlan.h>
-static void gw_node_free_ref(struct kref *refcount)
+static void gw_node_free_rcu(struct rcu_head *rcu)
{
struct gw_node *gw_node;
- gw_node = container_of(refcount, struct gw_node, refcount);
+ gw_node = container_of(rcu, struct gw_node, rcu);
kfree(gw_node);
}
-static void gw_node_free_rcu(struct rcu_head *rcu)
+static void gw_node_free_ref(struct gw_node *gw_node)
{
- struct gw_node *gw_node;
-
- gw_node = container_of(rcu, struct gw_node, rcu);
- kref_put(&gw_node->refcount, gw_node_free_ref);
+ if (atomic_dec_and_test(&gw_node->refcount))
+ call_rcu(&gw_node->rcu, gw_node_free_rcu);
}
void *gw_get_selected(struct bat_priv *bat_priv)
{
- struct gw_node *curr_gateway_tmp = bat_priv->curr_gw;
+ struct gw_node *curr_gateway_tmp;
+ struct orig_node *orig_node = NULL;
+ rcu_read_lock();
+ curr_gateway_tmp = rcu_dereference(bat_priv->curr_gw);
if (!curr_gateway_tmp)
- return NULL;
+ goto out;
+
+ orig_node = curr_gateway_tmp->orig_node;
+ if (!orig_node)
+ goto out;
- return curr_gateway_tmp->orig_node;
+ if (!atomic_inc_not_zero(&orig_node->refcount))
+ orig_node = NULL;
+
+out:
+ rcu_read_unlock();
+ return orig_node;
}
void gw_deselect(struct bat_priv *bat_priv)
{
- struct gw_node *gw_node = bat_priv->curr_gw;
+ struct gw_node *gw_node;
- bat_priv->curr_gw = NULL;
+ spin_lock_bh(&bat_priv->gw_list_lock);
+ gw_node = rcu_dereference(bat_priv->curr_gw);
+ rcu_assign_pointer(bat_priv->curr_gw, NULL);
+ spin_unlock_bh(&bat_priv->gw_list_lock);
if (gw_node)
- kref_put(&gw_node->refcount, gw_node_free_ref);
+ gw_node_free_ref(gw_node);
}
-static struct gw_node *gw_select(struct bat_priv *bat_priv,
- struct gw_node *new_gw_node)
+static void gw_select(struct bat_priv *bat_priv, struct gw_node *new_gw_node)
{
- struct gw_node *curr_gw_node = bat_priv->curr_gw;
+ struct gw_node *curr_gw_node;
- if (new_gw_node)
- kref_get(&new_gw_node->refcount);
+ if (new_gw_node && !atomic_inc_not_zero(&new_gw_node->refcount))
+ new_gw_node = NULL;
+
+ spin_lock_bh(&bat_priv->gw_list_lock);
+ curr_gw_node = rcu_dereference(bat_priv->curr_gw);
+ rcu_assign_pointer(bat_priv->curr_gw, new_gw_node);
+ spin_unlock_bh(&bat_priv->gw_list_lock);
- bat_priv->curr_gw = new_gw_node;
- return curr_gw_node;
+ if (curr_gw_node)
+ gw_node_free_ref(curr_gw_node);
}
void gw_election(struct bat_priv *bat_priv)
{
struct hlist_node *node;
- struct gw_node *gw_node, *curr_gw_tmp = NULL, *old_gw_node = NULL;
+ struct gw_node *gw_node, *curr_gw, *curr_gw_tmp = NULL;
uint8_t max_tq = 0;
uint32_t max_gw_factor = 0, tmp_gw_factor = 0;
int down, up;
@@ -93,19 +110,23 @@ void gw_election(struct bat_priv *bat_priv)
if (atomic_read(&bat_priv->gw_mode) != GW_MODE_CLIENT)
return;
- if (bat_priv->curr_gw)
+ rcu_read_lock();
+ curr_gw = rcu_dereference(bat_priv->curr_gw);
+ if (curr_gw) {
+ rcu_read_unlock();
return;
+ }
- rcu_read_lock();
if (hlist_empty(&bat_priv->gw_list)) {
- rcu_read_unlock();
- if (bat_priv->curr_gw) {
+ if (curr_gw) {
+ rcu_read_unlock();
bat_dbg(DBG_BATMAN, bat_priv,
"Removing selected gateway - "
"no gateway in range\n");
gw_deselect(bat_priv);
- }
+ } else
+ rcu_read_unlock();
return;
}
@@ -154,12 +175,12 @@ void gw_election(struct bat_priv *bat_priv)
max_gw_factor = tmp_gw_factor;
}
- if (bat_priv->curr_gw != curr_gw_tmp) {
- if ((bat_priv->curr_gw) && (!curr_gw_tmp))
+ if (curr_gw != curr_gw_tmp) {
+ if ((curr_gw) && (!curr_gw_tmp))
bat_dbg(DBG_BATMAN, bat_priv,
"Removing selected gateway - "
"no gateway in range\n");
- else if ((!bat_priv->curr_gw) && (curr_gw_tmp))
+ else if ((!curr_gw) && (curr_gw_tmp))
bat_dbg(DBG_BATMAN, bat_priv,
"Adding route to gateway %pM "
"(gw_flags: %i, tq: %i)\n",
@@ -174,43 +195,43 @@ void gw_election(struct bat_priv *bat_priv)
curr_gw_tmp->orig_node->gw_flags,
curr_gw_tmp->orig_node->router->tq_avg);
- old_gw_node = gw_select(bat_priv, curr_gw_tmp);
+ gw_select(bat_priv, curr_gw_tmp);
}
rcu_read_unlock();
-
- /* the kfree() has to be outside of the rcu lock */
- if (old_gw_node)
- kref_put(&old_gw_node->refcount, gw_node_free_ref);
}
void gw_check_election(struct bat_priv *bat_priv, struct orig_node *orig_node)
{
- struct gw_node *curr_gateway_tmp = bat_priv->curr_gw;
+ struct gw_node *curr_gateway_tmp;
uint8_t gw_tq_avg, orig_tq_avg;
+ rcu_read_lock();
+ curr_gateway_tmp = rcu_dereference(bat_priv->curr_gw);
if (!curr_gateway_tmp)
- return;
+ goto out_rcu;
if (!curr_gateway_tmp->orig_node)
- goto deselect;
+ goto deselect_rcu;
if (!curr_gateway_tmp->orig_node->router)
- goto deselect;
+ goto deselect_rcu;
/* this node already is the gateway */
if (curr_gateway_tmp->orig_node == orig_node)
- return;
+ goto out_rcu;
if (!orig_node->router)
- return;
+ goto out_rcu;
gw_tq_avg = curr_gateway_tmp->orig_node->router->tq_avg;
+ rcu_read_unlock();
+
orig_tq_avg = orig_node->router->tq_avg;
/* the TQ value has to be better */
if (orig_tq_avg < gw_tq_avg)
- return;
+ goto out;
/**
* if the routing class is greater than 3 the value tells us how much
@@ -218,15 +239,23 @@ void gw_check_election(struct bat_priv *bat_priv, struct orig_node *orig_node)
**/
if ((atomic_read(&bat_priv->gw_sel_class) > 3) &&
(orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw_sel_class)))
- return;
+ goto out;
bat_dbg(DBG_BATMAN, bat_priv,
"Restarting gateway selection: better gateway found (tq curr: "
"%i, tq new: %i)\n",
gw_tq_avg, orig_tq_avg);
+ goto deselect;
+out_rcu:
+ rcu_read_unlock();
+ goto out;
+deselect_rcu:
+ rcu_read_unlock();
deselect:
gw_deselect(bat_priv);
+out:
+ return;
}
static void gw_node_add(struct bat_priv *bat_priv,
@@ -242,7 +271,7 @@ static void gw_node_add(struct bat_priv *bat_priv,
memset(gw_node, 0, sizeof(struct gw_node));
INIT_HLIST_NODE(&gw_node->list);
gw_node->orig_node = orig_node;
- kref_init(&gw_node->refcount);
+ atomic_set(&gw_node->refcount, 1);
spin_lock_bh(&bat_priv->gw_list_lock);
hlist_add_head_rcu(&gw_node->list, &bat_priv->gw_list);
@@ -283,7 +312,7 @@ void gw_node_update(struct bat_priv *bat_priv,
"Gateway %pM removed from gateway list\n",
orig_node->orig);
- if (gw_node == bat_priv->curr_gw) {
+ if (gw_node == rcu_dereference(bat_priv->curr_gw)) {
rcu_read_unlock();
gw_deselect(bat_priv);
return;
@@ -321,11 +350,11 @@ void gw_node_purge(struct bat_priv *bat_priv)
atomic_read(&bat_priv->mesh_state) == MESH_ACTIVE)
continue;
- if (bat_priv->curr_gw == gw_node)
+ if (rcu_dereference(bat_priv->curr_gw) == gw_node)
gw_deselect(bat_priv);
hlist_del_rcu(&gw_node->list);
- call_rcu(&gw_node->rcu, gw_node_free_rcu);
+ gw_node_free_ref(gw_node);
}
@@ -335,12 +364,16 @@ void gw_node_purge(struct bat_priv *bat_priv)
static int _write_buffer_text(struct bat_priv *bat_priv,
struct seq_file *seq, struct gw_node *gw_node)
{
- int down, up;
+ struct gw_node *curr_gw;
+ int down, up, ret;
gw_bandwidth_to_kbit(gw_node->orig_node->gw_flags, &down, &up);
- return seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %3i - %i%s/%i%s\n",
- (bat_priv->curr_gw == gw_node ? "=>" : " "),
+ rcu_read_lock();
+ curr_gw = rcu_dereference(bat_priv->curr_gw);
+
+ ret = seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %3i - %i%s/%i%s\n",
+ (curr_gw == gw_node ? "=>" : " "),
gw_node->orig_node->orig,
gw_node->orig_node->router->tq_avg,
gw_node->orig_node->router->addr,
@@ -350,6 +383,9 @@ static int _write_buffer_text(struct bat_priv *bat_priv,
(down > 2048 ? "MBit" : "KBit"),
(up > 2048 ? up / 1024 : up),
(up > 2048 ? "MBit" : "KBit"));
+
+ rcu_read_unlock();
+ return ret;
}
int gw_client_seq_print_text(struct seq_file *seq, void *offset)
@@ -470,8 +506,12 @@ int gw_is_target(struct bat_priv *bat_priv, struct sk_buff *skb)
if (atomic_read(&bat_priv->gw_mode) == GW_MODE_SERVER)
return -1;
- if (!bat_priv->curr_gw)
+ rcu_read_lock();
+ if (!rcu_dereference(bat_priv->curr_gw)) {
+ rcu_read_unlock();
return 0;
+ }
+ rcu_read_unlock();
return 1;
}
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 4585e654984..2aa439124ee 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index b962982f017..50d3a59a3d7 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 5e728d0b795..55e527a489f 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 4f95777ce08..b3058e46ee6 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -31,36 +31,40 @@
#include <linux/if_arp.h>
-/* protect update critical side of if_list - but not the content */
-static DEFINE_SPINLOCK(if_list_lock);
+/* protect update critical side of hardif_list - but not the content */
+static DEFINE_SPINLOCK(hardif_list_lock);
-static void hardif_free_rcu(struct rcu_head *rcu)
+
+static int batman_skb_recv(struct sk_buff *skb,
+ struct net_device *dev,
+ struct packet_type *ptype,
+ struct net_device *orig_dev);
+
+void hardif_free_rcu(struct rcu_head *rcu)
{
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
- batman_if = container_of(rcu, struct batman_if, rcu);
- dev_put(batman_if->net_dev);
- kref_put(&batman_if->refcount, hardif_free_ref);
+ hard_iface = container_of(rcu, struct hard_iface, rcu);
+ dev_put(hard_iface->net_dev);
+ kfree(hard_iface);
}
-struct batman_if *get_batman_if_by_netdev(struct net_device *net_dev)
+struct hard_iface *hardif_get_by_netdev(struct net_device *net_dev)
{
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
rcu_read_lock();
- list_for_each_entry_rcu(batman_if, &if_list, list) {
- if (batman_if->net_dev == net_dev)
+ list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+ if (hard_iface->net_dev == net_dev &&
+ atomic_inc_not_zero(&hard_iface->refcount))
goto out;
}
- batman_if = NULL;
+ hard_iface = NULL;
out:
- if (batman_if)
- kref_get(&batman_if->refcount);
-
rcu_read_unlock();
- return batman_if;
+ return hard_iface;
}
static int is_valid_iface(struct net_device *net_dev)
@@ -75,13 +79,8 @@ static int is_valid_iface(struct net_device *net_dev)
return 0;
/* no batman over batman */
-#ifdef HAVE_NET_DEVICE_OPS
- if (net_dev->netdev_ops->ndo_start_xmit == interface_tx)
- return 0;
-#else
- if (net_dev->hard_start_xmit == interface_tx)
+ if (softif_is_valid(net_dev))
return 0;
-#endif
/* Device is being bridged */
/* if (net_dev->priv_flags & IFF_BRIDGE_PORT)
@@ -90,27 +89,25 @@ static int is_valid_iface(struct net_device *net_dev)
return 1;
}
-static struct batman_if *get_active_batman_if(struct net_device *soft_iface)
+static struct hard_iface *hardif_get_active(struct net_device *soft_iface)
{
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
rcu_read_lock();
- list_for_each_entry_rcu(batman_if, &if_list, list) {
- if (batman_if->soft_iface != soft_iface)
+ list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+ if (hard_iface->soft_iface != soft_iface)
continue;
- if (batman_if->if_status == IF_ACTIVE)
+ if (hard_iface->if_status == IF_ACTIVE &&
+ atomic_inc_not_zero(&hard_iface->refcount))
goto out;
}
- batman_if = NULL;
+ hard_iface = NULL;
out:
- if (batman_if)
- kref_get(&batman_if->refcount);
-
rcu_read_unlock();
- return batman_if;
+ return hard_iface;
}
static void update_primary_addr(struct bat_priv *bat_priv)
@@ -126,24 +123,24 @@ static void update_primary_addr(struct bat_priv *bat_priv)
}
static void set_primary_if(struct bat_priv *bat_priv,
- struct batman_if *batman_if)
+ struct hard_iface *hard_iface)
{
struct batman_packet *batman_packet;
- struct batman_if *old_if;
+ struct hard_iface *old_if;
- if (batman_if)
- kref_get(&batman_if->refcount);
+ if (hard_iface && !atomic_inc_not_zero(&hard_iface->refcount))
+ hard_iface = NULL;
old_if = bat_priv->primary_if;
- bat_priv->primary_if = batman_if;
+ bat_priv->primary_if = hard_iface;
if (old_if)
- kref_put(&old_if->refcount, hardif_free_ref);
+ hardif_free_ref(old_if);
if (!bat_priv->primary_if)
return;
- batman_packet = (struct batman_packet *)(batman_if->packet_buff);
+ batman_packet = (struct batman_packet *)(hard_iface->packet_buff);
batman_packet->flags = PRIMARIES_FIRST_HOP;
batman_packet->ttl = TTL;
@@ -156,42 +153,42 @@ static void set_primary_if(struct bat_priv *bat_priv,
atomic_set(&bat_priv->hna_local_changed, 1);
}
-static bool hardif_is_iface_up(struct batman_if *batman_if)
+static bool hardif_is_iface_up(struct hard_iface *hard_iface)
{
- if (batman_if->net_dev->flags & IFF_UP)
+ if (hard_iface->net_dev->flags & IFF_UP)
return true;
return false;
}
-static void update_mac_addresses(struct batman_if *batman_if)
+static void update_mac_addresses(struct hard_iface *hard_iface)
{
- memcpy(((struct batman_packet *)(batman_if->packet_buff))->orig,
- batman_if->net_dev->dev_addr, ETH_ALEN);
- memcpy(((struct batman_packet *)(batman_if->packet_buff))->prev_sender,
- batman_if->net_dev->dev_addr, ETH_ALEN);
+ memcpy(((struct batman_packet *)(hard_iface->packet_buff))->orig,
+ hard_iface->net_dev->dev_addr, ETH_ALEN);
+ memcpy(((struct batman_packet *)(hard_iface->packet_buff))->prev_sender,
+ hard_iface->net_dev->dev_addr, ETH_ALEN);
}
static void check_known_mac_addr(struct net_device *net_dev)
{
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
rcu_read_lock();
- list_for_each_entry_rcu(batman_if, &if_list, list) {
- if ((batman_if->if_status != IF_ACTIVE) &&
- (batman_if->if_status != IF_TO_BE_ACTIVATED))
+ list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+ if ((hard_iface->if_status != IF_ACTIVE) &&
+ (hard_iface->if_status != IF_TO_BE_ACTIVATED))
continue;
- if (batman_if->net_dev == net_dev)
+ if (hard_iface->net_dev == net_dev)
continue;
- if (!compare_orig(batman_if->net_dev->dev_addr,
- net_dev->dev_addr))
+ if (!compare_eth(hard_iface->net_dev->dev_addr,
+ net_dev->dev_addr))
continue;
pr_warning("The newly added mac address (%pM) already exists "
"on: %s\n", net_dev->dev_addr,
- batman_if->net_dev->name);
+ hard_iface->net_dev->name);
pr_warning("It is strongly recommended to keep mac addresses "
"unique to avoid problems!\n");
}
@@ -201,7 +198,7 @@ static void check_known_mac_addr(struct net_device *net_dev)
int hardif_min_mtu(struct net_device *soft_iface)
{
struct bat_priv *bat_priv = netdev_priv(soft_iface);
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
/* allow big frames if all devices are capable to do so
* (have MTU > 1500 + BAT_HEADER_LEN) */
int min_mtu = ETH_DATA_LEN;
@@ -210,15 +207,15 @@ int hardif_min_mtu(struct net_device *soft_iface)
goto out;
rcu_read_lock();
- list_for_each_entry_rcu(batman_if, &if_list, list) {
- if ((batman_if->if_status != IF_ACTIVE) &&
- (batman_if->if_status != IF_TO_BE_ACTIVATED))
+ list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+ if ((hard_iface->if_status != IF_ACTIVE) &&
+ (hard_iface->if_status != IF_TO_BE_ACTIVATED))
continue;
- if (batman_if->soft_iface != soft_iface)
+ if (hard_iface->soft_iface != soft_iface)
continue;
- min_mtu = min_t(int, batman_if->net_dev->mtu - BAT_HEADER_LEN,
+ min_mtu = min_t(int, hard_iface->net_dev->mtu - BAT_HEADER_LEN,
min_mtu);
}
rcu_read_unlock();
@@ -236,77 +233,95 @@ void update_min_mtu(struct net_device *soft_iface)
soft_iface->mtu = min_mtu;
}
-static void hardif_activate_interface(struct batman_if *batman_if)
+static void hardif_activate_interface(struct hard_iface *hard_iface)
{
struct bat_priv *bat_priv;
- if (batman_if->if_status != IF_INACTIVE)
+ if (hard_iface->if_status != IF_INACTIVE)
return;
- bat_priv = netdev_priv(batman_if->soft_iface);
+ bat_priv = netdev_priv(hard_iface->soft_iface);
- update_mac_addresses(batman_if);
- batman_if->if_status = IF_TO_BE_ACTIVATED;
+ update_mac_addresses(hard_iface);
+ hard_iface->if_status = IF_TO_BE_ACTIVATED;
/**
* the first active interface becomes our primary interface or
* the next active interface after the old primay interface was removed
*/
if (!bat_priv->primary_if)
- set_primary_if(bat_priv, batman_if);
+ set_primary_if(bat_priv, hard_iface);
- bat_info(batman_if->soft_iface, "Interface activated: %s\n",
- batman_if->net_dev->name);
+ bat_info(hard_iface->soft_iface, "Interface activated: %s\n",
+ hard_iface->net_dev->name);
- update_min_mtu(batman_if->soft_iface);
+ update_min_mtu(hard_iface->soft_iface);
return;
}
-static void hardif_deactivate_interface(struct batman_if *batman_if)
+static void hardif_deactivate_interface(struct hard_iface *hard_iface)
{
- if ((batman_if->if_status != IF_ACTIVE) &&
- (batman_if->if_status != IF_TO_BE_ACTIVATED))
+ if ((hard_iface->if_status != IF_ACTIVE) &&
+ (hard_iface->if_status != IF_TO_BE_ACTIVATED))
return;
- batman_if->if_status = IF_INACTIVE;
+ hard_iface->if_status = IF_INACTIVE;
- bat_info(batman_if->soft_iface, "Interface deactivated: %s\n",
- batman_if->net_dev->name);
+ bat_info(hard_iface->soft_iface, "Interface deactivated: %s\n",
+ hard_iface->net_dev->name);
- update_min_mtu(batman_if->soft_iface);
+ update_min_mtu(hard_iface->soft_iface);
}
-int hardif_enable_interface(struct batman_if *batman_if, char *iface_name)
+int hardif_enable_interface(struct hard_iface *hard_iface, char *iface_name)
{
struct bat_priv *bat_priv;
struct batman_packet *batman_packet;
+ struct net_device *soft_iface;
+ int ret;
- if (batman_if->if_status != IF_NOT_IN_USE)
+ if (hard_iface->if_status != IF_NOT_IN_USE)
goto out;
- batman_if->soft_iface = dev_get_by_name(&init_net, iface_name);
+ if (!atomic_inc_not_zero(&hard_iface->refcount))
+ goto out;
- if (!batman_if->soft_iface) {
- batman_if->soft_iface = softif_create(iface_name);
+ soft_iface = dev_get_by_name(&init_net, iface_name);
- if (!batman_if->soft_iface)
+ if (!soft_iface) {
+ soft_iface = softif_create(iface_name);
+
+ if (!soft_iface) {
+ ret = -ENOMEM;
goto err;
+ }
/* dev_get_by_name() increases the reference counter for us */
- dev_hold(batman_if->soft_iface);
+ dev_hold(soft_iface);
+ }
+
+ if (!softif_is_valid(soft_iface)) {
+ pr_err("Can't create batman mesh interface %s: "
+ "already exists as regular interface\n",
+ soft_iface->name);
+ dev_put(soft_iface);
+ ret = -EINVAL;
+ goto err;
}
- bat_priv = netdev_priv(batman_if->soft_iface);
- batman_if->packet_len = BAT_PACKET_LEN;
- batman_if->packet_buff = kmalloc(batman_if->packet_len, GFP_ATOMIC);
+ hard_iface->soft_iface = soft_iface;
+ bat_priv = netdev_priv(hard_iface->soft_iface);
+ hard_iface->packet_len = BAT_PACKET_LEN;
+ hard_iface->packet_buff = kmalloc(hard_iface->packet_len, GFP_ATOMIC);
- if (!batman_if->packet_buff) {
- bat_err(batman_if->soft_iface, "Can't add interface packet "
- "(%s): out of memory\n", batman_if->net_dev->name);
+ if (!hard_iface->packet_buff) {
+ bat_err(hard_iface->soft_iface, "Can't add interface packet "
+ "(%s): out of memory\n", hard_iface->net_dev->name);
+ ret = -ENOMEM;
goto err;
}
- batman_packet = (struct batman_packet *)(batman_if->packet_buff);
+ batman_packet = (struct batman_packet *)(hard_iface->packet_buff);
batman_packet->packet_type = BAT_PACKET;
batman_packet->version = COMPAT_VERSION;
batman_packet->flags = 0;
@@ -314,107 +329,107 @@ int hardif_enable_interface(struct batman_if *batman_if, char *iface_name)
batman_packet->tq = TQ_MAX_VALUE;
batman_packet->num_hna = 0;
- batman_if->if_num = bat_priv->num_ifaces;
+ hard_iface->if_num = bat_priv->num_ifaces;
bat_priv->num_ifaces++;
- batman_if->if_status = IF_INACTIVE;
- orig_hash_add_if(batman_if, bat_priv->num_ifaces);
+ hard_iface->if_status = IF_INACTIVE;
+ orig_hash_add_if(hard_iface, bat_priv->num_ifaces);
- batman_if->batman_adv_ptype.type = __constant_htons(ETH_P_BATMAN);
- batman_if->batman_adv_ptype.func = batman_skb_recv;
- batman_if->batman_adv_ptype.dev = batman_if->net_dev;
- kref_get(&batman_if->refcount);
- dev_add_pack(&batman_if->batman_adv_ptype);
+ hard_iface->batman_adv_ptype.type = __constant_htons(ETH_P_BATMAN);
+ hard_iface->batman_adv_ptype.func = batman_skb_recv;
+ hard_iface->batman_adv_ptype.dev = hard_iface->net_dev;
+ dev_add_pack(&hard_iface->batman_adv_ptype);
- atomic_set(&batman_if->seqno, 1);
- atomic_set(&batman_if->frag_seqno, 1);
- bat_info(batman_if->soft_iface, "Adding interface: %s\n",
- batman_if->net_dev->name);
+ atomic_set(&hard_iface->seqno, 1);
+ atomic_set(&hard_iface->frag_seqno, 1);
+ bat_info(hard_iface->soft_iface, "Adding interface: %s\n",
+ hard_iface->net_dev->name);
- if (atomic_read(&bat_priv->fragmentation) && batman_if->net_dev->mtu <
+ if (atomic_read(&bat_priv->fragmentation) && hard_iface->net_dev->mtu <
ETH_DATA_LEN + BAT_HEADER_LEN)
- bat_info(batman_if->soft_iface,
+ bat_info(hard_iface->soft_iface,
"The MTU of interface %s is too small (%i) to handle "
"the transport of batman-adv packets. Packets going "
"over this interface will be fragmented on layer2 "
"which could impact the performance. Setting the MTU "
"to %zi would solve the problem.\n",
- batman_if->net_dev->name, batman_if->net_dev->mtu,
+ hard_iface->net_dev->name, hard_iface->net_dev->mtu,
ETH_DATA_LEN + BAT_HEADER_LEN);
- if (!atomic_read(&bat_priv->fragmentation) && batman_if->net_dev->mtu <
+ if (!atomic_read(&bat_priv->fragmentation) && hard_iface->net_dev->mtu <
ETH_DATA_LEN + BAT_HEADER_LEN)
- bat_info(batman_if->soft_iface,
+ bat_info(hard_iface->soft_iface,
"The MTU of interface %s is too small (%i) to handle "
"the transport of batman-adv packets. If you experience"
" problems getting traffic through try increasing the "
"MTU to %zi.\n",
- batman_if->net_dev->name, batman_if->net_dev->mtu,
+ hard_iface->net_dev->name, hard_iface->net_dev->mtu,
ETH_DATA_LEN + BAT_HEADER_LEN);
- if (hardif_is_iface_up(batman_if))
- hardif_activate_interface(batman_if);
+ if (hardif_is_iface_up(hard_iface))
+ hardif_activate_interface(hard_iface);
else
- bat_err(batman_if->soft_iface, "Not using interface %s "
+ bat_err(hard_iface->soft_iface, "Not using interface %s "
"(retrying later): interface not active\n",
- batman_if->net_dev->name);
+ hard_iface->net_dev->name);
/* begin scheduling originator messages on that interface */
- schedule_own_packet(batman_if);
+ schedule_own_packet(hard_iface);
out:
return 0;
err:
- return -ENOMEM;
+ hardif_free_ref(hard_iface);
+ return ret;
}
-void hardif_disable_interface(struct batman_if *batman_if)
+void hardif_disable_interface(struct hard_iface *hard_iface)
{
- struct bat_priv *bat_priv = netdev_priv(batman_if->soft_iface);
+ struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
- if (batman_if->if_status == IF_ACTIVE)
- hardif_deactivate_interface(batman_if);
+ if (hard_iface->if_status == IF_ACTIVE)
+ hardif_deactivate_interface(hard_iface);
- if (batman_if->if_status != IF_INACTIVE)
+ if (hard_iface->if_status != IF_INACTIVE)
return;
- bat_info(batman_if->soft_iface, "Removing interface: %s\n",
- batman_if->net_dev->name);
- dev_remove_pack(&batman_if->batman_adv_ptype);
- kref_put(&batman_if->refcount, hardif_free_ref);
+ bat_info(hard_iface->soft_iface, "Removing interface: %s\n",
+ hard_iface->net_dev->name);
+ dev_remove_pack(&hard_iface->batman_adv_ptype);
bat_priv->num_ifaces--;
- orig_hash_del_if(batman_if, bat_priv->num_ifaces);
+ orig_hash_del_if(hard_iface, bat_priv->num_ifaces);
- if (batman_if == bat_priv->primary_if) {
- struct batman_if *new_if;
+ if (hard_iface == bat_priv->primary_if) {
+ struct hard_iface *new_if;
- new_if = get_active_batman_if(batman_if->soft_iface);
+ new_if = hardif_get_active(hard_iface->soft_iface);
set_primary_if(bat_priv, new_if);
if (new_if)
- kref_put(&new_if->refcount, hardif_free_ref);
+ hardif_free_ref(new_if);
}
- kfree(batman_if->packet_buff);
- batman_if->packet_buff = NULL;
- batman_if->if_status = IF_NOT_IN_USE;
+ kfree(hard_iface->packet_buff);
+ hard_iface->packet_buff = NULL;
+ hard_iface->if_status = IF_NOT_IN_USE;
- /* delete all references to this batman_if */
+ /* delete all references to this hard_iface */
purge_orig_ref(bat_priv);
- purge_outstanding_packets(bat_priv, batman_if);
- dev_put(batman_if->soft_iface);
+ purge_outstanding_packets(bat_priv, hard_iface);
+ dev_put(hard_iface->soft_iface);
/* nobody uses this interface anymore */
if (!bat_priv->num_ifaces)
- softif_destroy(batman_if->soft_iface);
+ softif_destroy(hard_iface->soft_iface);
- batman_if->soft_iface = NULL;
+ hard_iface->soft_iface = NULL;
+ hardif_free_ref(hard_iface);
}
-static struct batman_if *hardif_add_interface(struct net_device *net_dev)
+static struct hard_iface *hardif_add_interface(struct net_device *net_dev)
{
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
int ret;
ret = is_valid_iface(net_dev);
@@ -423,73 +438,73 @@ static struct batman_if *hardif_add_interface(struct net_device *net_dev)
dev_hold(net_dev);
- batman_if = kmalloc(sizeof(struct batman_if), GFP_ATOMIC);
- if (!batman_if) {
+ hard_iface = kmalloc(sizeof(struct hard_iface), GFP_ATOMIC);
+ if (!hard_iface) {
pr_err("Can't add interface (%s): out of memory\n",
net_dev->name);
goto release_dev;
}
- ret = sysfs_add_hardif(&batman_if->hardif_obj, net_dev);
+ ret = sysfs_add_hardif(&hard_iface->hardif_obj, net_dev);
if (ret)
goto free_if;
- batman_if->if_num = -1;
- batman_if->net_dev = net_dev;
- batman_if->soft_iface = NULL;
- batman_if->if_status = IF_NOT_IN_USE;
- INIT_LIST_HEAD(&batman_if->list);
- kref_init(&batman_if->refcount);
+ hard_iface->if_num = -1;
+ hard_iface->net_dev = net_dev;
+ hard_iface->soft_iface = NULL;
+ hard_iface->if_status = IF_NOT_IN_USE;
+ INIT_LIST_HEAD(&hard_iface->list);
+ /* extra reference for return */
+ atomic_set(&hard_iface->refcount, 2);
- check_known_mac_addr(batman_if->net_dev);
+ check_known_mac_addr(hard_iface->net_dev);
- spin_lock(&if_list_lock);
- list_add_tail_rcu(&batman_if->list, &if_list);
- spin_unlock(&if_list_lock);
+ spin_lock(&hardif_list_lock);
+ list_add_tail_rcu(&hard_iface->list, &hardif_list);
+ spin_unlock(&hardif_list_lock);
- /* extra reference for return */
- kref_get(&batman_if->refcount);
- return batman_if;
+ return hard_iface;
free_if:
- kfree(batman_if);
+ kfree(hard_iface);
release_dev:
dev_put(net_dev);
out:
return NULL;
}
-static void hardif_remove_interface(struct batman_if *batman_if)
+static void hardif_remove_interface(struct hard_iface *hard_iface)
{
/* first deactivate interface */
- if (batman_if->if_status != IF_NOT_IN_USE)
- hardif_disable_interface(batman_if);
+ if (hard_iface->if_status != IF_NOT_IN_USE)
+ hardif_disable_interface(hard_iface);
- if (batman_if->if_status != IF_NOT_IN_USE)
+ if (hard_iface->if_status != IF_NOT_IN_USE)
return;
- batman_if->if_status = IF_TO_BE_REMOVED;
- sysfs_del_hardif(&batman_if->hardif_obj);
- call_rcu(&batman_if->rcu, hardif_free_rcu);
+ hard_iface->if_status = IF_TO_BE_REMOVED;
+ sysfs_del_hardif(&hard_iface->hardif_obj);
+ hardif_free_ref(hard_iface);
}
void hardif_remove_interfaces(void)
{
- struct batman_if *batman_if, *batman_if_tmp;
+ struct hard_iface *hard_iface, *hard_iface_tmp;
struct list_head if_queue;
INIT_LIST_HEAD(&if_queue);
- spin_lock(&if_list_lock);
- list_for_each_entry_safe(batman_if, batman_if_tmp, &if_list, list) {
- list_del_rcu(&batman_if->list);
- list_add_tail(&batman_if->list, &if_queue);
+ spin_lock(&hardif_list_lock);
+ list_for_each_entry_safe(hard_iface, hard_iface_tmp,
+ &hardif_list, list) {
+ list_del_rcu(&hard_iface->list);
+ list_add_tail(&hard_iface->list, &if_queue);
}
- spin_unlock(&if_list_lock);
+ spin_unlock(&hardif_list_lock);
rtnl_lock();
- list_for_each_entry_safe(batman_if, batman_if_tmp, &if_queue, list) {
- hardif_remove_interface(batman_if);
+ list_for_each_entry_safe(hard_iface, hard_iface_tmp, &if_queue, list) {
+ hardif_remove_interface(hard_iface);
}
rtnl_unlock();
}
@@ -498,43 +513,43 @@ static int hard_if_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *net_dev = (struct net_device *)ptr;
- struct batman_if *batman_if = get_batman_if_by_netdev(net_dev);
+ struct hard_iface *hard_iface = hardif_get_by_netdev(net_dev);
struct bat_priv *bat_priv;
- if (!batman_if && event == NETDEV_REGISTER)
- batman_if = hardif_add_interface(net_dev);
+ if (!hard_iface && event == NETDEV_REGISTER)
+ hard_iface = hardif_add_interface(net_dev);
- if (!batman_if)
+ if (!hard_iface)
goto out;
switch (event) {
case NETDEV_UP:
- hardif_activate_interface(batman_if);
+ hardif_activate_interface(hard_iface);
break;
case NETDEV_GOING_DOWN:
case NETDEV_DOWN:
- hardif_deactivate_interface(batman_if);
+ hardif_deactivate_interface(hard_iface);
break;
case NETDEV_UNREGISTER:
- spin_lock(&if_list_lock);
- list_del_rcu(&batman_if->list);
- spin_unlock(&if_list_lock);
+ spin_lock(&hardif_list_lock);
+ list_del_rcu(&hard_iface->list);
+ spin_unlock(&hardif_list_lock);
- hardif_remove_interface(batman_if);
+ hardif_remove_interface(hard_iface);
break;
case NETDEV_CHANGEMTU:
- if (batman_if->soft_iface)
- update_min_mtu(batman_if->soft_iface);
+ if (hard_iface->soft_iface)
+ update_min_mtu(hard_iface->soft_iface);
break;
case NETDEV_CHANGEADDR:
- if (batman_if->if_status == IF_NOT_IN_USE)
+ if (hard_iface->if_status == IF_NOT_IN_USE)
goto hardif_put;
- check_known_mac_addr(batman_if->net_dev);
- update_mac_addresses(batman_if);
+ check_known_mac_addr(hard_iface->net_dev);
+ update_mac_addresses(hard_iface);
- bat_priv = netdev_priv(batman_if->soft_iface);
- if (batman_if == bat_priv->primary_if)
+ bat_priv = netdev_priv(hard_iface->soft_iface);
+ if (hard_iface == bat_priv->primary_if)
update_primary_addr(bat_priv);
break;
default:
@@ -542,22 +557,23 @@ static int hard_if_event(struct notifier_block *this,
};
hardif_put:
- kref_put(&batman_if->refcount, hardif_free_ref);
+ hardif_free_ref(hard_iface);
out:
return NOTIFY_DONE;
}
/* receive a packet with the batman ethertype coming on a hard
* interface */
-int batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *ptype, struct net_device *orig_dev)
+static int batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *ptype,
+ struct net_device *orig_dev)
{
struct bat_priv *bat_priv;
struct batman_packet *batman_packet;
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
int ret;
- batman_if = container_of(ptype, struct batman_if, batman_adv_ptype);
+ hard_iface = container_of(ptype, struct hard_iface, batman_adv_ptype);
skb = skb_share_check(skb, GFP_ATOMIC);
/* skb was released by skb_share_check() */
@@ -573,16 +589,16 @@ int batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
|| !skb_mac_header(skb)))
goto err_free;
- if (!batman_if->soft_iface)
+ if (!hard_iface->soft_iface)
goto err_free;
- bat_priv = netdev_priv(batman_if->soft_iface);
+ bat_priv = netdev_priv(hard_iface->soft_iface);
if (atomic_read(&bat_priv->mesh_state) != MESH_ACTIVE)
goto err_free;
/* discard frames on not active interfaces */
- if (batman_if->if_status != IF_ACTIVE)
+ if (hard_iface->if_status != IF_ACTIVE)
goto err_free;
batman_packet = (struct batman_packet *)skb->data;
@@ -600,32 +616,32 @@ int batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
switch (batman_packet->packet_type) {
/* batman originator packet */
case BAT_PACKET:
- ret = recv_bat_packet(skb, batman_if);
+ ret = recv_bat_packet(skb, hard_iface);
break;
/* batman icmp packet */
case BAT_ICMP:
- ret = recv_icmp_packet(skb, batman_if);
+ ret = recv_icmp_packet(skb, hard_iface);
break;
/* unicast packet */
case BAT_UNICAST:
- ret = recv_unicast_packet(skb, batman_if);
+ ret = recv_unicast_packet(skb, hard_iface);
break;
/* fragmented unicast packet */
case BAT_UNICAST_FRAG:
- ret = recv_ucast_frag_packet(skb, batman_if);
+ ret = recv_ucast_frag_packet(skb, hard_iface);
break;
/* broadcast packet */
case BAT_BCAST:
- ret = recv_bcast_packet(skb, batman_if);
+ ret = recv_bcast_packet(skb, hard_iface);
break;
/* vis packet */
case BAT_VIS:
- ret = recv_vis_packet(skb, batman_if);
+ ret = recv_vis_packet(skb, hard_iface);
break;
default:
ret = NET_RX_DROP;
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 30ec3b8db45..a9ddf36e51c 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -31,23 +31,18 @@
extern struct notifier_block hard_if_notifier;
-struct batman_if *get_batman_if_by_netdev(struct net_device *net_dev);
-int hardif_enable_interface(struct batman_if *batman_if, char *iface_name);
-void hardif_disable_interface(struct batman_if *batman_if);
+struct hard_iface *hardif_get_by_netdev(struct net_device *net_dev);
+int hardif_enable_interface(struct hard_iface *hard_iface, char *iface_name);
+void hardif_disable_interface(struct hard_iface *hard_iface);
void hardif_remove_interfaces(void);
-int batman_skb_recv(struct sk_buff *skb,
- struct net_device *dev,
- struct packet_type *ptype,
- struct net_device *orig_dev);
int hardif_min_mtu(struct net_device *soft_iface);
void update_min_mtu(struct net_device *soft_iface);
+void hardif_free_rcu(struct rcu_head *rcu);
-static inline void hardif_free_ref(struct kref *refcount)
+static inline void hardif_free_ref(struct hard_iface *hard_iface)
{
- struct batman_if *batman_if;
-
- batman_if = container_of(refcount, struct batman_if, refcount);
- kfree(batman_if);
+ if (atomic_dec_and_test(&hard_iface->refcount))
+ call_rcu(&hard_iface->rcu, hardif_free_rcu);
}
#endif /* _NET_BATMAN_ADV_HARD_INTERFACE_H_ */
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index 26e623eb9de..c5213d8f2cc 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
@@ -27,13 +27,16 @@ static void hash_init(struct hashtable_t *hash)
{
int i;
- for (i = 0 ; i < hash->size; i++)
+ for (i = 0 ; i < hash->size; i++) {
INIT_HLIST_HEAD(&hash->table[i]);
+ spin_lock_init(&hash->list_locks[i]);
+ }
}
/* free only the hashtable and the hash itself. */
void hash_destroy(struct hashtable_t *hash)
{
+ kfree(hash->list_locks);
kfree(hash->table);
kfree(hash);
}
@@ -43,20 +46,25 @@ struct hashtable_t *hash_new(int size)
{
struct hashtable_t *hash;
- hash = kmalloc(sizeof(struct hashtable_t) , GFP_ATOMIC);
-
+ hash = kmalloc(sizeof(struct hashtable_t), GFP_ATOMIC);
if (!hash)
return NULL;
- hash->size = size;
hash->table = kmalloc(sizeof(struct element_t *) * size, GFP_ATOMIC);
+ if (!hash->table)
+ goto free_hash;
- if (!hash->table) {
- kfree(hash);
- return NULL;
- }
+ hash->list_locks = kmalloc(sizeof(spinlock_t) * size, GFP_ATOMIC);
+ if (!hash->list_locks)
+ goto free_table;
+ hash->size = size;
hash_init(hash);
-
return hash;
+
+free_table:
+ kfree(hash->table);
+free_hash:
+ kfree(hash);
+ return NULL;
}
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 09216ade16f..434822b2747 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
@@ -28,32 +28,23 @@
* compare 2 element datas for their keys,
* return 0 if same and not 0 if not
* same */
-typedef int (*hashdata_compare_cb)(void *, void *);
+typedef int (*hashdata_compare_cb)(struct hlist_node *, void *);
/* the hashfunction, should return an index
* based on the key in the data of the first
* argument and the size the second */
typedef int (*hashdata_choose_cb)(void *, int);
-typedef void (*hashdata_free_cb)(void *, void *);
-
-struct element_t {
- void *data; /* pointer to the data */
- struct hlist_node hlist; /* bucket list pointer */
-};
+typedef void (*hashdata_free_cb)(struct hlist_node *, void *);
struct hashtable_t {
- struct hlist_head *table; /* the hashtable itself, with the buckets */
+ struct hlist_head *table; /* the hashtable itself with the buckets */
+ spinlock_t *list_locks; /* spinlock for each hash list entry */
int size; /* size of hashtable */
};
/* allocates and clears the hash */
struct hashtable_t *hash_new(int size);
-/* remove element if you already found the element you want to delete and don't
- * need the overhead to find it again with hash_remove(). But usually, you
- * don't want to use this function, as it fiddles with hash-internals. */
-void *hash_remove_element(struct hashtable_t *hash, struct element_t *elem);
-
/* free only the hashtable and the hash itself. */
void hash_destroy(struct hashtable_t *hash);
@@ -64,21 +55,22 @@ static inline void hash_delete(struct hashtable_t *hash,
hashdata_free_cb free_cb, void *arg)
{
struct hlist_head *head;
- struct hlist_node *walk, *safe;
- struct element_t *bucket;
+ struct hlist_node *node, *node_tmp;
+ spinlock_t *list_lock; /* spinlock to protect write access */
int i;
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
- hlist_for_each_safe(walk, safe, head) {
- bucket = hlist_entry(walk, struct element_t, hlist);
- if (free_cb)
- free_cb(bucket->data, arg);
+ spin_lock_bh(list_lock);
+ hlist_for_each_safe(node, node_tmp, head) {
+ hlist_del_rcu(node);
- hlist_del(walk);
- kfree(bucket);
+ if (free_cb)
+ free_cb(node, arg);
}
+ spin_unlock_bh(list_lock);
}
hash_destroy(hash);
@@ -87,35 +79,41 @@ static inline void hash_delete(struct hashtable_t *hash,
/* adds data to the hashtable. returns 0 on success, -1 on error */
static inline int hash_add(struct hashtable_t *hash,
hashdata_compare_cb compare,
- hashdata_choose_cb choose, void *data)
+ hashdata_choose_cb choose,
+ void *data, struct hlist_node *data_node)
{
int index;
struct hlist_head *head;
- struct hlist_node *walk, *safe;
- struct element_t *bucket;
+ struct hlist_node *node;
+ spinlock_t *list_lock; /* spinlock to protect write access */
if (!hash)
- return -1;
+ goto err;
index = choose(data, hash->size);
head = &hash->table[index];
+ list_lock = &hash->list_locks[index];
+
+ rcu_read_lock();
+ __hlist_for_each_rcu(node, head) {
+ if (!compare(node, data))
+ continue;
- hlist_for_each_safe(walk, safe, head) {
- bucket = hlist_entry(walk, struct element_t, hlist);
- if (compare(bucket->data, data))
- return -1;
+ goto err_unlock;
}
+ rcu_read_unlock();
/* no duplicate found in list, add new element */
- bucket = kmalloc(sizeof(struct element_t), GFP_ATOMIC);
-
- if (!bucket)
- return -1;
-
- bucket->data = data;
- hlist_add_head(&bucket->hlist, head);
+ spin_lock_bh(list_lock);
+ hlist_add_head_rcu(data_node, head);
+ spin_unlock_bh(list_lock);
return 0;
+
+err_unlock:
+ rcu_read_unlock();
+err:
+ return -1;
}
/* removes data from hash, if found. returns pointer do data on success, so you
@@ -127,50 +125,25 @@ static inline void *hash_remove(struct hashtable_t *hash,
hashdata_choose_cb choose, void *data)
{
size_t index;
- struct hlist_node *walk;
- struct element_t *bucket;
+ struct hlist_node *node;
struct hlist_head *head;
- void *data_save;
+ void *data_save = NULL;
index = choose(data, hash->size);
head = &hash->table[index];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- if (compare(bucket->data, data)) {
- data_save = bucket->data;
- hlist_del(walk);
- kfree(bucket);
- return data_save;
- }
- }
-
- return NULL;
-}
-
-/* finds data, based on the key in keydata. returns the found data on success,
- * or NULL on error */
-static inline void *hash_find(struct hashtable_t *hash,
- hashdata_compare_cb compare,
- hashdata_choose_cb choose, void *keydata)
-{
- int index;
- struct hlist_head *head;
- struct hlist_node *walk;
- struct element_t *bucket;
-
- if (!hash)
- return NULL;
-
- index = choose(keydata , hash->size);
- head = &hash->table[index];
+ spin_lock_bh(&hash->list_locks[index]);
+ hlist_for_each(node, head) {
+ if (!compare(node, data))
+ continue;
- hlist_for_each(walk, head) {
- bucket = hlist_entry(walk, struct element_t, hlist);
- if (compare(bucket->data, keydata))
- return bucket->data;
+ data_save = node;
+ hlist_del_rcu(node);
+ break;
}
+ spin_unlock_bh(&hash->list_locks[index]);
- return NULL;
+ return data_save;
}
#endif /* _NET_BATMAN_ADV_HASH_H_ */
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index ecf6d7ffab2..34ce56c358e 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -24,7 +24,6 @@
#include <linux/slab.h>
#include "icmp_socket.h"
#include "send.h"
-#include "types.h"
#include "hash.h"
#include "originator.h"
#include "hard-interface.h"
@@ -157,10 +156,9 @@ static ssize_t bat_socket_write(struct file *file, const char __user *buff,
struct sk_buff *skb;
struct icmp_packet_rr *icmp_packet;
- struct orig_node *orig_node;
- struct batman_if *batman_if;
+ struct orig_node *orig_node = NULL;
+ struct neigh_node *neigh_node = NULL;
size_t packet_len = sizeof(struct icmp_packet);
- uint8_t dstaddr[ETH_ALEN];
if (len < sizeof(struct icmp_packet)) {
bat_dbg(DBG_BATMAN, bat_priv,
@@ -220,47 +218,52 @@ static ssize_t bat_socket_write(struct file *file, const char __user *buff,
if (atomic_read(&bat_priv->mesh_state) != MESH_ACTIVE)
goto dst_unreach;
- spin_lock_bh(&bat_priv->orig_hash_lock);
- orig_node = ((struct orig_node *)hash_find(bat_priv->orig_hash,
- compare_orig, choose_orig,
- icmp_packet->dst));
+ rcu_read_lock();
+ orig_node = orig_hash_find(bat_priv, icmp_packet->dst);
if (!orig_node)
goto unlock;
- if (!orig_node->router)
+ neigh_node = orig_node->router;
+
+ if (!neigh_node)
goto unlock;
- batman_if = orig_node->router->if_incoming;
- memcpy(dstaddr, orig_node->router->addr, ETH_ALEN);
+ if (!atomic_inc_not_zero(&neigh_node->refcount)) {
+ neigh_node = NULL;
+ goto unlock;
+ }
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ rcu_read_unlock();
- if (!batman_if)
+ if (!neigh_node->if_incoming)
goto dst_unreach;
- if (batman_if->if_status != IF_ACTIVE)
+ if (neigh_node->if_incoming->if_status != IF_ACTIVE)
goto dst_unreach;
memcpy(icmp_packet->orig,
bat_priv->primary_if->net_dev->dev_addr, ETH_ALEN);
if (packet_len == sizeof(struct icmp_packet_rr))
- memcpy(icmp_packet->rr, batman_if->net_dev->dev_addr, ETH_ALEN);
-
-
- send_skb_packet(skb, batman_if, dstaddr);
+ memcpy(icmp_packet->rr,
+ neigh_node->if_incoming->net_dev->dev_addr, ETH_ALEN);
+ send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
goto out;
unlock:
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ rcu_read_unlock();
dst_unreach:
icmp_packet->msg_type = DESTINATION_UNREACHABLE;
bat_socket_add_packet(socket_client, icmp_packet, packet_len);
free_skb:
kfree_skb(skb);
out:
+ if (neigh_node)
+ neigh_node_free_ref(neigh_node);
+ if (orig_node)
+ orig_node_free_ref(orig_node);
return len;
}
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index bf9b348cde2..462b190fa10 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -22,8 +22,6 @@
#ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_
#define _NET_BATMAN_ADV_ICMP_SOCKET_H_
-#include "types.h"
-
#define ICMP_SOCKET "socket"
void bat_socket_init(void);
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index b827f6a158c..709b33bbdf4 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -30,11 +30,10 @@
#include "translation-table.h"
#include "hard-interface.h"
#include "gateway_client.h"
-#include "types.h"
#include "vis.h"
#include "hash.h"
-struct list_head if_list;
+struct list_head hardif_list;
unsigned char broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
@@ -42,7 +41,7 @@ struct workqueue_struct *bat_event_workqueue;
static int __init batman_init(void)
{
- INIT_LIST_HEAD(&if_list);
+ INIT_LIST_HEAD(&hardif_list);
/* the name should not be longer than 10 chars - see
* http://lwn.net/Articles/23634/ */
@@ -80,7 +79,6 @@ int mesh_init(struct net_device *soft_iface)
{
struct bat_priv *bat_priv = netdev_priv(soft_iface);
- spin_lock_init(&bat_priv->orig_hash_lock);
spin_lock_init(&bat_priv->forw_bat_list_lock);
spin_lock_init(&bat_priv->forw_bcast_list_lock);
spin_lock_init(&bat_priv->hna_lhash_lock);
@@ -155,14 +153,14 @@ void dec_module_count(void)
int is_my_mac(uint8_t *addr)
{
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
rcu_read_lock();
- list_for_each_entry_rcu(batman_if, &if_list, list) {
- if (batman_if->if_status != IF_ACTIVE)
+ list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+ if (hard_iface->if_status != IF_ACTIVE)
continue;
- if (compare_orig(batman_if->net_dev->dev_addr, addr)) {
+ if (compare_eth(hard_iface->net_dev->dev_addr, addr)) {
rcu_read_unlock();
return 1;
}
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index d4d9926c220..dc248697de7 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,9 +22,6 @@
#ifndef _NET_BATMAN_ADV_MAIN_H_
#define _NET_BATMAN_ADV_MAIN_H_
-/* Kernel Programming */
-#define LINUX
-
#define DRIVER_AUTHOR "Marek Lindner <lindner_marek@yahoo.de>, " \
"Simon Wunderlich <siwu@hrz.tu-chemnitz.de>"
#define DRIVER_DESC "B.A.T.M.A.N. advanced"
@@ -54,7 +51,6 @@
#define NUM_WORDS (TQ_LOCAL_WINDOW_SIZE / WORD_BIT_SIZE)
-#define PACKBUFF_SIZE 2000
#define LOG_BUF_LEN 8192 /* has to be a power of 2 */
#define VIS_INTERVAL 5000 /* 5 seconds */
@@ -96,15 +92,11 @@
#define DBG_ROUTES 2 /* route or hna added / changed / deleted */
#define DBG_ALL 3
-#define LOG_BUF_LEN 8192 /* has to be a power of 2 */
-
/*
* Vis
*/
-/* #define VIS_SUBCLUSTERS_DISABLED */
-
/*
* Kernel headers
*/
@@ -130,7 +122,7 @@
#define REVISION_VERSION_STR " "REVISION_VERSION
#endif
-extern struct list_head if_list;
+extern struct list_head hardif_list;
extern unsigned char broadcast_addr[];
extern struct workqueue_struct *bat_event_workqueue;
@@ -151,20 +143,13 @@ int debug_log(struct bat_priv *bat_priv, char *fmt, ...);
} \
while (0)
#else /* !CONFIG_BATMAN_ADV_DEBUG */
-static inline void bat_dbg(char type __attribute__((unused)),
- struct bat_priv *bat_priv __attribute__((unused)),
- char *fmt __attribute__((unused)), ...)
+static inline void bat_dbg(char type __always_unused,
+ struct bat_priv *bat_priv __always_unused,
+ char *fmt __always_unused, ...)
{
}
#endif
-#define bat_warning(net_dev, fmt, arg...) \
- do { \
- struct net_device *_netdev = (net_dev); \
- struct bat_priv *_batpriv = netdev_priv(_netdev); \
- bat_dbg(DBG_ALL, _batpriv, fmt, ## arg); \
- pr_warning("%s: " fmt, _netdev->name, ## arg); \
- } while (0)
#define bat_info(net_dev, fmt, arg...) \
do { \
struct net_device *_netdev = (net_dev); \
@@ -180,4 +165,14 @@ static inline void bat_dbg(char type __attribute__((unused)),
pr_err("%s: " fmt, _netdev->name, ## arg); \
} while (0)
+/**
+ * returns 1 if they are the same ethernet addr
+ *
+ * note: can't use compare_ether_addr() as it requires aligned memory
+ */
+static inline int compare_eth(void *data1, void *data2)
+{
+ return (memcmp(data1, data2, ETH_ALEN) == 0 ? 1 : 0);
+}
+
#endif /* _NET_BATMAN_ADV_MAIN_H_ */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 6b7fb6b7e6f..0b9133022d2 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -44,24 +44,36 @@ int originator_init(struct bat_priv *bat_priv)
if (bat_priv->orig_hash)
return 1;
- spin_lock_bh(&bat_priv->orig_hash_lock);
bat_priv->orig_hash = hash_new(1024);
if (!bat_priv->orig_hash)
goto err;
- spin_unlock_bh(&bat_priv->orig_hash_lock);
start_purge_timer(bat_priv);
return 1;
err:
- spin_unlock_bh(&bat_priv->orig_hash_lock);
return 0;
}
-struct neigh_node *
-create_neighbor(struct orig_node *orig_node, struct orig_node *orig_neigh_node,
- uint8_t *neigh, struct batman_if *if_incoming)
+static void neigh_node_free_rcu(struct rcu_head *rcu)
+{
+ struct neigh_node *neigh_node;
+
+ neigh_node = container_of(rcu, struct neigh_node, rcu);
+ kfree(neigh_node);
+}
+
+void neigh_node_free_ref(struct neigh_node *neigh_node)
+{
+ if (atomic_dec_and_test(&neigh_node->refcount))
+ call_rcu(&neigh_node->rcu, neigh_node_free_rcu);
+}
+
+struct neigh_node *create_neighbor(struct orig_node *orig_node,
+ struct orig_node *orig_neigh_node,
+ uint8_t *neigh,
+ struct hard_iface *if_incoming)
{
struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
struct neigh_node *neigh_node;
@@ -73,50 +85,94 @@ create_neighbor(struct orig_node *orig_node, struct orig_node *orig_neigh_node,
if (!neigh_node)
return NULL;
- INIT_LIST_HEAD(&neigh_node->list);
+ INIT_HLIST_NODE(&neigh_node->list);
+ INIT_LIST_HEAD(&neigh_node->bonding_list);
memcpy(neigh_node->addr, neigh, ETH_ALEN);
neigh_node->orig_node = orig_neigh_node;
neigh_node->if_incoming = if_incoming;
- list_add_tail(&neigh_node->list, &orig_node->neigh_list);
+ /* extra reference for return */
+ atomic_set(&neigh_node->refcount, 2);
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
+ spin_unlock_bh(&orig_node->neigh_list_lock);
return neigh_node;
}
-static void free_orig_node(void *data, void *arg)
+static void orig_node_free_rcu(struct rcu_head *rcu)
{
- struct list_head *list_pos, *list_pos_tmp;
- struct neigh_node *neigh_node;
- struct orig_node *orig_node = (struct orig_node *)data;
- struct bat_priv *bat_priv = (struct bat_priv *)arg;
+ struct hlist_node *node, *node_tmp;
+ struct neigh_node *neigh_node, *tmp_neigh_node;
+ struct orig_node *orig_node;
- /* for all neighbors towards this originator ... */
- list_for_each_safe(list_pos, list_pos_tmp, &orig_node->neigh_list) {
- neigh_node = list_entry(list_pos, struct neigh_node, list);
+ orig_node = container_of(rcu, struct orig_node, rcu);
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
+ /* for all bonding members ... */
+ list_for_each_entry_safe(neigh_node, tmp_neigh_node,
+ &orig_node->bond_list, bonding_list) {
+ list_del_rcu(&neigh_node->bonding_list);
+ neigh_node_free_ref(neigh_node);
+ }
- list_del(list_pos);
- kfree(neigh_node);
+ /* for all neighbors towards this originator ... */
+ hlist_for_each_entry_safe(neigh_node, node, node_tmp,
+ &orig_node->neigh_list, list) {
+ hlist_del_rcu(&neigh_node->list);
+ neigh_node_free_ref(neigh_node);
}
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
frag_list_free(&orig_node->frag_list);
- hna_global_del_orig(bat_priv, orig_node, "originator timed out");
+ hna_global_del_orig(orig_node->bat_priv, orig_node,
+ "originator timed out");
kfree(orig_node->bcast_own);
kfree(orig_node->bcast_own_sum);
kfree(orig_node);
}
+void orig_node_free_ref(struct orig_node *orig_node)
+{
+ if (atomic_dec_and_test(&orig_node->refcount))
+ call_rcu(&orig_node->rcu, orig_node_free_rcu);
+}
+
void originator_free(struct bat_priv *bat_priv)
{
- if (!bat_priv->orig_hash)
+ struct hashtable_t *hash = bat_priv->orig_hash;
+ struct hlist_node *node, *node_tmp;
+ struct hlist_head *head;
+ spinlock_t *list_lock; /* spinlock to protect write access */
+ struct orig_node *orig_node;
+ int i;
+
+ if (!hash)
return;
cancel_delayed_work_sync(&bat_priv->orig_work);
- spin_lock_bh(&bat_priv->orig_hash_lock);
- hash_delete(bat_priv->orig_hash, free_orig_node, bat_priv);
bat_priv->orig_hash = NULL;
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(orig_node, node, node_tmp,
+ head, hash_entry) {
+
+ hlist_del_rcu(node);
+ orig_node_free_ref(orig_node);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ hash_destroy(hash);
}
/* this function finds or creates an originator entry for the given
@@ -127,10 +183,7 @@ struct orig_node *get_orig_node(struct bat_priv *bat_priv, uint8_t *addr)
int size;
int hash_added;
- orig_node = ((struct orig_node *)hash_find(bat_priv->orig_hash,
- compare_orig, choose_orig,
- addr));
-
+ orig_node = orig_hash_find(bat_priv, addr);
if (orig_node)
return orig_node;
@@ -141,8 +194,16 @@ struct orig_node *get_orig_node(struct bat_priv *bat_priv, uint8_t *addr)
if (!orig_node)
return NULL;
- INIT_LIST_HEAD(&orig_node->neigh_list);
+ INIT_HLIST_HEAD(&orig_node->neigh_list);
+ INIT_LIST_HEAD(&orig_node->bond_list);
+ spin_lock_init(&orig_node->ogm_cnt_lock);
+ spin_lock_init(&orig_node->bcast_seqno_lock);
+ spin_lock_init(&orig_node->neigh_list_lock);
+
+ /* extra reference for return */
+ atomic_set(&orig_node->refcount, 2);
+ orig_node->bat_priv = bat_priv;
memcpy(orig_node->orig, addr, ETH_ALEN);
orig_node->router = NULL;
orig_node->hna_buff = NULL;
@@ -151,6 +212,8 @@ struct orig_node *get_orig_node(struct bat_priv *bat_priv, uint8_t *addr)
orig_node->batman_seqno_reset = jiffies - 1
- msecs_to_jiffies(RESET_PROTECTION_MS);
+ atomic_set(&orig_node->bond_candidates, 0);
+
size = bat_priv->num_ifaces * sizeof(unsigned long) * NUM_WORDS;
orig_node->bcast_own = kzalloc(size, GFP_ATOMIC);
@@ -166,8 +229,8 @@ struct orig_node *get_orig_node(struct bat_priv *bat_priv, uint8_t *addr)
if (!orig_node->bcast_own_sum)
goto free_bcast_own;
- hash_added = hash_add(bat_priv->orig_hash, compare_orig, choose_orig,
- orig_node);
+ hash_added = hash_add(bat_priv->orig_hash, compare_orig,
+ choose_orig, orig_node, &orig_node->hash_entry);
if (hash_added < 0)
goto free_bcast_own_sum;
@@ -185,23 +248,30 @@ static bool purge_orig_neighbors(struct bat_priv *bat_priv,
struct orig_node *orig_node,
struct neigh_node **best_neigh_node)
{
- struct list_head *list_pos, *list_pos_tmp;
+ struct hlist_node *node, *node_tmp;
struct neigh_node *neigh_node;
bool neigh_purged = false;
*best_neigh_node = NULL;
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
/* for all neighbors towards this originator ... */
- list_for_each_safe(list_pos, list_pos_tmp, &orig_node->neigh_list) {
- neigh_node = list_entry(list_pos, struct neigh_node, list);
+ hlist_for_each_entry_safe(neigh_node, node, node_tmp,
+ &orig_node->neigh_list, list) {
if ((time_after(jiffies,
neigh_node->last_valid + PURGE_TIMEOUT * HZ)) ||
(neigh_node->if_incoming->if_status == IF_INACTIVE) ||
+ (neigh_node->if_incoming->if_status == IF_NOT_IN_USE) ||
(neigh_node->if_incoming->if_status == IF_TO_BE_REMOVED)) {
- if (neigh_node->if_incoming->if_status ==
- IF_TO_BE_REMOVED)
+ if ((neigh_node->if_incoming->if_status ==
+ IF_INACTIVE) ||
+ (neigh_node->if_incoming->if_status ==
+ IF_NOT_IN_USE) ||
+ (neigh_node->if_incoming->if_status ==
+ IF_TO_BE_REMOVED))
bat_dbg(DBG_BATMAN, bat_priv,
"neighbor purge: originator %pM, "
"neighbor: %pM, iface: %s\n",
@@ -215,14 +285,18 @@ static bool purge_orig_neighbors(struct bat_priv *bat_priv,
(neigh_node->last_valid / HZ));
neigh_purged = true;
- list_del(list_pos);
- kfree(neigh_node);
+
+ hlist_del_rcu(&neigh_node->list);
+ bonding_candidate_del(orig_node, neigh_node);
+ neigh_node_free_ref(neigh_node);
} else {
if ((!*best_neigh_node) ||
(neigh_node->tq_avg > (*best_neigh_node)->tq_avg))
*best_neigh_node = neigh_node;
}
}
+
+ spin_unlock_bh(&orig_node->neigh_list_lock);
return neigh_purged;
}
@@ -245,9 +319,6 @@ static bool purge_orig_node(struct bat_priv *bat_priv,
best_neigh_node,
orig_node->hna_buff,
orig_node->hna_buff_len);
- /* update bonding candidates, we could have lost
- * some candidates. */
- update_bonding_candidates(bat_priv, orig_node);
}
}
@@ -257,40 +328,38 @@ static bool purge_orig_node(struct bat_priv *bat_priv,
static void _purge_orig(struct bat_priv *bat_priv)
{
struct hashtable_t *hash = bat_priv->orig_hash;
- struct hlist_node *walk, *safe;
+ struct hlist_node *node, *node_tmp;
struct hlist_head *head;
- struct element_t *bucket;
+ spinlock_t *list_lock; /* spinlock to protect write access */
struct orig_node *orig_node;
int i;
if (!hash)
return;
- spin_lock_bh(&bat_priv->orig_hash_lock);
-
/* for all origins... */
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
- hlist_for_each_entry_safe(bucket, walk, safe, head, hlist) {
- orig_node = bucket->data;
-
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(orig_node, node, node_tmp,
+ head, hash_entry) {
if (purge_orig_node(bat_priv, orig_node)) {
if (orig_node->gw_flags)
gw_node_delete(bat_priv, orig_node);
- hlist_del(walk);
- kfree(bucket);
- free_orig_node(orig_node, bat_priv);
+ hlist_del_rcu(node);
+ orig_node_free_ref(orig_node);
+ continue;
}
if (time_after(jiffies, orig_node->last_frag_packet +
msecs_to_jiffies(FRAG_TIMEOUT)))
frag_list_free(&orig_node->frag_list);
}
+ spin_unlock_bh(list_lock);
}
- spin_unlock_bh(&bat_priv->orig_hash_lock);
-
gw_node_purge(bat_priv);
gw_election(bat_priv);
@@ -318,9 +387,8 @@ int orig_seq_print_text(struct seq_file *seq, void *offset)
struct net_device *net_dev = (struct net_device *)seq->private;
struct bat_priv *bat_priv = netdev_priv(net_dev);
struct hashtable_t *hash = bat_priv->orig_hash;
- struct hlist_node *walk;
+ struct hlist_node *node, *node_tmp;
struct hlist_head *head;
- struct element_t *bucket;
struct orig_node *orig_node;
struct neigh_node *neigh_node;
int batman_count = 0;
@@ -348,14 +416,11 @@ int orig_seq_print_text(struct seq_file *seq, void *offset)
"Originator", "last-seen", "#", TQ_MAX_VALUE, "Nexthop",
"outgoingIF", "Potential nexthops");
- spin_lock_bh(&bat_priv->orig_hash_lock);
-
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- orig_node = bucket->data;
-
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) {
if (!orig_node->router)
continue;
@@ -374,8 +439,8 @@ int orig_seq_print_text(struct seq_file *seq, void *offset)
neigh_node->addr,
neigh_node->if_incoming->net_dev->name);
- list_for_each_entry(neigh_node, &orig_node->neigh_list,
- list) {
+ hlist_for_each_entry_rcu(neigh_node, node_tmp,
+ &orig_node->neigh_list, list) {
seq_printf(seq, " %pM (%3i)", neigh_node->addr,
neigh_node->tq_avg);
}
@@ -383,10 +448,9 @@ int orig_seq_print_text(struct seq_file *seq, void *offset)
seq_printf(seq, "\n");
batman_count++;
}
+ rcu_read_unlock();
}
- spin_unlock_bh(&bat_priv->orig_hash_lock);
-
if ((batman_count == 0))
seq_printf(seq, "No batman nodes in range ...\n");
@@ -423,36 +487,36 @@ static int orig_node_add_if(struct orig_node *orig_node, int max_if_num)
return 0;
}
-int orig_hash_add_if(struct batman_if *batman_if, int max_if_num)
+int orig_hash_add_if(struct hard_iface *hard_iface, int max_if_num)
{
- struct bat_priv *bat_priv = netdev_priv(batman_if->soft_iface);
+ struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
struct hashtable_t *hash = bat_priv->orig_hash;
- struct hlist_node *walk;
+ struct hlist_node *node;
struct hlist_head *head;
- struct element_t *bucket;
struct orig_node *orig_node;
- int i;
+ int i, ret;
/* resize all orig nodes because orig_node->bcast_own(_sum) depend on
* if_num */
- spin_lock_bh(&bat_priv->orig_hash_lock);
-
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- orig_node = bucket->data;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) {
+ spin_lock_bh(&orig_node->ogm_cnt_lock);
+ ret = orig_node_add_if(orig_node, max_if_num);
+ spin_unlock_bh(&orig_node->ogm_cnt_lock);
- if (orig_node_add_if(orig_node, max_if_num) == -1)
+ if (ret == -1)
goto err;
}
+ rcu_read_unlock();
}
- spin_unlock_bh(&bat_priv->orig_hash_lock);
return 0;
err:
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ rcu_read_unlock();
return -ENOMEM;
}
@@ -508,57 +572,55 @@ free_own_sum:
return 0;
}
-int orig_hash_del_if(struct batman_if *batman_if, int max_if_num)
+int orig_hash_del_if(struct hard_iface *hard_iface, int max_if_num)
{
- struct bat_priv *bat_priv = netdev_priv(batman_if->soft_iface);
+ struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
struct hashtable_t *hash = bat_priv->orig_hash;
- struct hlist_node *walk;
+ struct hlist_node *node;
struct hlist_head *head;
- struct element_t *bucket;
- struct batman_if *batman_if_tmp;
+ struct hard_iface *hard_iface_tmp;
struct orig_node *orig_node;
int i, ret;
/* resize all orig nodes because orig_node->bcast_own(_sum) depend on
* if_num */
- spin_lock_bh(&bat_priv->orig_hash_lock);
-
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- orig_node = bucket->data;
-
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) {
+ spin_lock_bh(&orig_node->ogm_cnt_lock);
ret = orig_node_del_if(orig_node, max_if_num,
- batman_if->if_num);
+ hard_iface->if_num);
+ spin_unlock_bh(&orig_node->ogm_cnt_lock);
if (ret == -1)
goto err;
}
+ rcu_read_unlock();
}
/* renumber remaining batman interfaces _inside_ of orig_hash_lock */
rcu_read_lock();
- list_for_each_entry_rcu(batman_if_tmp, &if_list, list) {
- if (batman_if_tmp->if_status == IF_NOT_IN_USE)
+ list_for_each_entry_rcu(hard_iface_tmp, &hardif_list, list) {
+ if (hard_iface_tmp->if_status == IF_NOT_IN_USE)
continue;
- if (batman_if == batman_if_tmp)
+ if (hard_iface == hard_iface_tmp)
continue;
- if (batman_if->soft_iface != batman_if_tmp->soft_iface)
+ if (hard_iface->soft_iface != hard_iface_tmp->soft_iface)
continue;
- if (batman_if_tmp->if_num > batman_if->if_num)
- batman_if_tmp->if_num--;
+ if (hard_iface_tmp->if_num > hard_iface->if_num)
+ hard_iface_tmp->if_num--;
}
rcu_read_unlock();
- batman_if->if_num = -1;
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ hard_iface->if_num = -1;
return 0;
err:
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ rcu_read_unlock();
return -ENOMEM;
}
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index d474ceb2a4e..5cc011057da 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,21 +22,28 @@
#ifndef _NET_BATMAN_ADV_ORIGINATOR_H_
#define _NET_BATMAN_ADV_ORIGINATOR_H_
+#include "hash.h"
+
int originator_init(struct bat_priv *bat_priv);
void originator_free(struct bat_priv *bat_priv);
void purge_orig_ref(struct bat_priv *bat_priv);
+void orig_node_free_ref(struct orig_node *orig_node);
struct orig_node *get_orig_node(struct bat_priv *bat_priv, uint8_t *addr);
-struct neigh_node *
-create_neighbor(struct orig_node *orig_node, struct orig_node *orig_neigh_node,
- uint8_t *neigh, struct batman_if *if_incoming);
+struct neigh_node *create_neighbor(struct orig_node *orig_node,
+ struct orig_node *orig_neigh_node,
+ uint8_t *neigh,
+ struct hard_iface *if_incoming);
+void neigh_node_free_ref(struct neigh_node *neigh_node);
int orig_seq_print_text(struct seq_file *seq, void *offset);
-int orig_hash_add_if(struct batman_if *batman_if, int max_if_num);
-int orig_hash_del_if(struct batman_if *batman_if, int max_if_num);
+int orig_hash_add_if(struct hard_iface *hard_iface, int max_if_num);
+int orig_hash_del_if(struct hard_iface *hard_iface, int max_if_num);
/* returns 1 if they are the same originator */
-static inline int compare_orig(void *data1, void *data2)
+static inline int compare_orig(struct hlist_node *node, void *data2)
{
+ void *data1 = container_of(node, struct orig_node, hash_entry);
+
return (memcmp(data1, data2, ETH_ALEN) == 0 ? 1 : 0);
}
@@ -61,4 +68,35 @@ static inline int choose_orig(void *data, int32_t size)
return hash % size;
}
+static inline struct orig_node *orig_hash_find(struct bat_priv *bat_priv,
+ void *data)
+{
+ struct hashtable_t *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct orig_node *orig_node, *orig_node_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ index = choose_orig(data, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) {
+ if (!compare_eth(orig_node, data))
+ continue;
+
+ if (!atomic_inc_not_zero(&orig_node->refcount))
+ continue;
+
+ orig_node_tmp = orig_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_node_tmp;
+}
+
#endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index b49fdf70a6d..e7571879af3 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -50,6 +50,7 @@
/* fragmentation defines */
#define UNI_FRAG_HEAD 0x01
+#define UNI_FRAG_LARGETAIL 0x02
struct batman_packet {
uint8_t packet_type;
@@ -63,7 +64,7 @@ struct batman_packet {
uint8_t num_hna;
uint8_t gw_flags; /* flags related to gateway class */
uint8_t align;
-} __attribute__((packed));
+} __packed;
#define BAT_PACKET_LEN sizeof(struct batman_packet)
@@ -76,7 +77,7 @@ struct icmp_packet {
uint8_t orig[6];
uint16_t seqno;
uint8_t uid;
-} __attribute__((packed));
+} __packed;
#define BAT_RR_LEN 16
@@ -93,14 +94,14 @@ struct icmp_packet_rr {
uint8_t uid;
uint8_t rr_cur;
uint8_t rr[BAT_RR_LEN][ETH_ALEN];
-} __attribute__((packed));
+} __packed;
struct unicast_packet {
uint8_t packet_type;
uint8_t version; /* batman version field */
uint8_t dest[6];
uint8_t ttl;
-} __attribute__((packed));
+} __packed;
struct unicast_frag_packet {
uint8_t packet_type;
@@ -110,7 +111,7 @@ struct unicast_frag_packet {
uint8_t flags;
uint8_t orig[6];
uint16_t seqno;
-} __attribute__((packed));
+} __packed;
struct bcast_packet {
uint8_t packet_type;
@@ -118,7 +119,7 @@ struct bcast_packet {
uint8_t orig[6];
uint8_t ttl;
uint32_t seqno;
-} __attribute__((packed));
+} __packed;
struct vis_packet {
uint8_t packet_type;
@@ -131,6 +132,6 @@ struct vis_packet {
* neighbors */
uint8_t target_orig[6]; /* who should receive this packet */
uint8_t sender_orig[6]; /* who sent or rebroadcasted this packet */
-} __attribute__((packed));
+} __packed;
#endif /* _NET_BATMAN_ADV_PACKET_H_ */
diff --git a/net/batman-adv/ring_buffer.c b/net/batman-adv/ring_buffer.c
index defd37c9be1..5bb6a619afe 100644
--- a/net/batman-adv/ring_buffer.c
+++ b/net/batman-adv/ring_buffer.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/ring_buffer.h b/net/batman-adv/ring_buffer.h
index 6b0cb9aaeba..0395b274186 100644
--- a/net/batman-adv/ring_buffer.h
+++ b/net/batman-adv/ring_buffer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 8828eddd3f7..c172f5d0e05 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -28,7 +28,6 @@
#include "icmp_socket.h"
#include "translation-table.h"
#include "originator.h"
-#include "types.h"
#include "ring_buffer.h"
#include "vis.h"
#include "aggregation.h"
@@ -36,35 +35,33 @@
#include "gateway_client.h"
#include "unicast.h"
-void slide_own_bcast_window(struct batman_if *batman_if)
+void slide_own_bcast_window(struct hard_iface *hard_iface)
{
- struct bat_priv *bat_priv = netdev_priv(batman_if->soft_iface);
+ struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
struct hashtable_t *hash = bat_priv->orig_hash;
- struct hlist_node *walk;
+ struct hlist_node *node;
struct hlist_head *head;
- struct element_t *bucket;
struct orig_node *orig_node;
unsigned long *word;
int i;
size_t word_index;
- spin_lock_bh(&bat_priv->orig_hash_lock);
-
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- orig_node = bucket->data;
- word_index = batman_if->if_num * NUM_WORDS;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) {
+ spin_lock_bh(&orig_node->ogm_cnt_lock);
+ word_index = hard_iface->if_num * NUM_WORDS;
word = &(orig_node->bcast_own[word_index]);
bit_get_packet(bat_priv, word, 1, 0);
- orig_node->bcast_own_sum[batman_if->if_num] =
+ orig_node->bcast_own_sum[hard_iface->if_num] =
bit_packet_count(word);
+ spin_unlock_bh(&orig_node->ogm_cnt_lock);
}
+ rcu_read_unlock();
}
-
- spin_unlock_bh(&bat_priv->orig_hash_lock);
}
static void update_HNA(struct bat_priv *bat_priv, struct orig_node *orig_node,
@@ -90,6 +87,8 @@ static void update_route(struct bat_priv *bat_priv,
struct neigh_node *neigh_node,
unsigned char *hna_buff, int hna_buff_len)
{
+ struct neigh_node *neigh_node_tmp;
+
/* route deleted */
if ((orig_node->router) && (!neigh_node)) {
@@ -116,7 +115,12 @@ static void update_route(struct bat_priv *bat_priv,
orig_node->router->addr);
}
+ if (neigh_node && !atomic_inc_not_zero(&neigh_node->refcount))
+ neigh_node = NULL;
+ neigh_node_tmp = orig_node->router;
orig_node->router = neigh_node;
+ if (neigh_node_tmp)
+ neigh_node_free_ref(neigh_node_tmp);
}
@@ -139,73 +143,93 @@ void update_routes(struct bat_priv *bat_priv, struct orig_node *orig_node,
static int is_bidirectional_neigh(struct orig_node *orig_node,
struct orig_node *orig_neigh_node,
struct batman_packet *batman_packet,
- struct batman_if *if_incoming)
+ struct hard_iface *if_incoming)
{
struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
- struct neigh_node *neigh_node = NULL, *tmp_neigh_node = NULL;
+ struct neigh_node *neigh_node = NULL, *tmp_neigh_node;
+ struct hlist_node *node;
unsigned char total_count;
+ uint8_t orig_eq_count, neigh_rq_count, tq_own;
+ int tq_asym_penalty, ret = 0;
if (orig_node == orig_neigh_node) {
- list_for_each_entry(tmp_neigh_node,
- &orig_node->neigh_list,
- list) {
-
- if (compare_orig(tmp_neigh_node->addr,
- orig_neigh_node->orig) &&
- (tmp_neigh_node->if_incoming == if_incoming))
- neigh_node = tmp_neigh_node;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_node, node,
+ &orig_node->neigh_list, list) {
+
+ if (!compare_eth(tmp_neigh_node->addr,
+ orig_neigh_node->orig))
+ continue;
+
+ if (tmp_neigh_node->if_incoming != if_incoming)
+ continue;
+
+ if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+ continue;
+
+ neigh_node = tmp_neigh_node;
}
+ rcu_read_unlock();
if (!neigh_node)
neigh_node = create_neighbor(orig_node,
orig_neigh_node,
orig_neigh_node->orig,
if_incoming);
- /* create_neighbor failed, return 0 */
if (!neigh_node)
- return 0;
+ goto out;
neigh_node->last_valid = jiffies;
} else {
/* find packet count of corresponding one hop neighbor */
- list_for_each_entry(tmp_neigh_node,
- &orig_neigh_node->neigh_list, list) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_node, node,
+ &orig_neigh_node->neigh_list, list) {
- if (compare_orig(tmp_neigh_node->addr,
- orig_neigh_node->orig) &&
- (tmp_neigh_node->if_incoming == if_incoming))
- neigh_node = tmp_neigh_node;
+ if (!compare_eth(tmp_neigh_node->addr,
+ orig_neigh_node->orig))
+ continue;
+
+ if (tmp_neigh_node->if_incoming != if_incoming)
+ continue;
+
+ if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+ continue;
+
+ neigh_node = tmp_neigh_node;
}
+ rcu_read_unlock();
if (!neigh_node)
neigh_node = create_neighbor(orig_neigh_node,
orig_neigh_node,
orig_neigh_node->orig,
if_incoming);
- /* create_neighbor failed, return 0 */
if (!neigh_node)
- return 0;
+ goto out;
}
orig_node->last_valid = jiffies;
+ spin_lock_bh(&orig_node->ogm_cnt_lock);
+ orig_eq_count = orig_neigh_node->bcast_own_sum[if_incoming->if_num];
+ neigh_rq_count = neigh_node->real_packet_count;
+ spin_unlock_bh(&orig_node->ogm_cnt_lock);
+
/* pay attention to not get a value bigger than 100 % */
- total_count = (orig_neigh_node->bcast_own_sum[if_incoming->if_num] >
- neigh_node->real_packet_count ?
- neigh_node->real_packet_count :
- orig_neigh_node->bcast_own_sum[if_incoming->if_num]);
+ total_count = (orig_eq_count > neigh_rq_count ?
+ neigh_rq_count : orig_eq_count);
/* if we have too few packets (too less data) we set tq_own to zero */
/* if we receive too few packets it is not considered bidirectional */
if ((total_count < TQ_LOCAL_BIDRECT_SEND_MINIMUM) ||
- (neigh_node->real_packet_count < TQ_LOCAL_BIDRECT_RECV_MINIMUM))
- orig_neigh_node->tq_own = 0;
+ (neigh_rq_count < TQ_LOCAL_BIDRECT_RECV_MINIMUM))
+ tq_own = 0;
else
/* neigh_node->real_packet_count is never zero as we
* only purge old information when getting new
* information */
- orig_neigh_node->tq_own = (TQ_MAX_VALUE * total_count) /
- neigh_node->real_packet_count;
+ tq_own = (TQ_MAX_VALUE * total_count) / neigh_rq_count;
/*
* 1 - ((1-x) ** 3), normalized to TQ_MAX_VALUE this does
@@ -213,20 +237,16 @@ static int is_bidirectional_neigh(struct orig_node *orig_node,
* punishes asymmetric links more. This will give a value
* between 0 and TQ_MAX_VALUE
*/
- orig_neigh_node->tq_asym_penalty =
- TQ_MAX_VALUE -
- (TQ_MAX_VALUE *
- (TQ_LOCAL_WINDOW_SIZE - neigh_node->real_packet_count) *
- (TQ_LOCAL_WINDOW_SIZE - neigh_node->real_packet_count) *
- (TQ_LOCAL_WINDOW_SIZE - neigh_node->real_packet_count)) /
- (TQ_LOCAL_WINDOW_SIZE *
- TQ_LOCAL_WINDOW_SIZE *
- TQ_LOCAL_WINDOW_SIZE);
-
- batman_packet->tq = ((batman_packet->tq *
- orig_neigh_node->tq_own *
- orig_neigh_node->tq_asym_penalty) /
- (TQ_MAX_VALUE * TQ_MAX_VALUE));
+ tq_asym_penalty = TQ_MAX_VALUE - (TQ_MAX_VALUE *
+ (TQ_LOCAL_WINDOW_SIZE - neigh_rq_count) *
+ (TQ_LOCAL_WINDOW_SIZE - neigh_rq_count) *
+ (TQ_LOCAL_WINDOW_SIZE - neigh_rq_count)) /
+ (TQ_LOCAL_WINDOW_SIZE *
+ TQ_LOCAL_WINDOW_SIZE *
+ TQ_LOCAL_WINDOW_SIZE);
+
+ batman_packet->tq = ((batman_packet->tq * tq_own * tq_asym_penalty) /
+ (TQ_MAX_VALUE * TQ_MAX_VALUE));
bat_dbg(DBG_BATMAN, bat_priv,
"bidirectional: "
@@ -234,34 +254,141 @@ static int is_bidirectional_neigh(struct orig_node *orig_node,
"real recv = %2i, local tq: %3i, asym_penalty: %3i, "
"total tq: %3i\n",
orig_node->orig, orig_neigh_node->orig, total_count,
- neigh_node->real_packet_count, orig_neigh_node->tq_own,
- orig_neigh_node->tq_asym_penalty, batman_packet->tq);
+ neigh_rq_count, tq_own, tq_asym_penalty, batman_packet->tq);
/* if link has the minimum required transmission quality
* consider it bidirectional */
if (batman_packet->tq >= TQ_TOTAL_BIDRECT_LIMIT)
- return 1;
+ ret = 1;
- return 0;
+out:
+ if (neigh_node)
+ neigh_node_free_ref(neigh_node);
+ return ret;
+}
+
+/* caller must hold the neigh_list_lock */
+void bonding_candidate_del(struct orig_node *orig_node,
+ struct neigh_node *neigh_node)
+{
+ /* this neighbor is not part of our candidate list */
+ if (list_empty(&neigh_node->bonding_list))
+ goto out;
+
+ list_del_rcu(&neigh_node->bonding_list);
+ INIT_LIST_HEAD(&neigh_node->bonding_list);
+ neigh_node_free_ref(neigh_node);
+ atomic_dec(&orig_node->bond_candidates);
+
+out:
+ return;
+}
+
+static void bonding_candidate_add(struct orig_node *orig_node,
+ struct neigh_node *neigh_node)
+{
+ struct hlist_node *node;
+ struct neigh_node *tmp_neigh_node;
+ uint8_t best_tq, interference_candidate = 0;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
+ /* only consider if it has the same primary address ... */
+ if (!compare_eth(orig_node->orig,
+ neigh_node->orig_node->primary_addr))
+ goto candidate_del;
+
+ if (!orig_node->router)
+ goto candidate_del;
+
+ best_tq = orig_node->router->tq_avg;
+
+ /* ... and is good enough to be considered */
+ if (neigh_node->tq_avg < best_tq - BONDING_TQ_THRESHOLD)
+ goto candidate_del;
+
+ /**
+ * check if we have another candidate with the same mac address or
+ * interface. If we do, we won't select this candidate because of
+ * possible interference.
+ */
+ hlist_for_each_entry_rcu(tmp_neigh_node, node,
+ &orig_node->neigh_list, list) {
+
+ if (tmp_neigh_node == neigh_node)
+ continue;
+
+ /* we only care if the other candidate is even
+ * considered as candidate. */
+ if (list_empty(&tmp_neigh_node->bonding_list))
+ continue;
+
+ if ((neigh_node->if_incoming == tmp_neigh_node->if_incoming) ||
+ (compare_eth(neigh_node->addr, tmp_neigh_node->addr))) {
+ interference_candidate = 1;
+ break;
+ }
+ }
+
+ /* don't care further if it is an interference candidate */
+ if (interference_candidate)
+ goto candidate_del;
+
+ /* this neighbor already is part of our candidate list */
+ if (!list_empty(&neigh_node->bonding_list))
+ goto out;
+
+ if (!atomic_inc_not_zero(&neigh_node->refcount))
+ goto out;
+
+ list_add_rcu(&neigh_node->bonding_list, &orig_node->bond_list);
+ atomic_inc(&orig_node->bond_candidates);
+ goto out;
+
+candidate_del:
+ bonding_candidate_del(orig_node, neigh_node);
+
+out:
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+ return;
+}
+
+/* copy primary address for bonding */
+static void bonding_save_primary(struct orig_node *orig_node,
+ struct orig_node *orig_neigh_node,
+ struct batman_packet *batman_packet)
+{
+ if (!(batman_packet->flags & PRIMARIES_FIRST_HOP))
+ return;
+
+ memcpy(orig_neigh_node->primary_addr, orig_node->orig, ETH_ALEN);
}
static void update_orig(struct bat_priv *bat_priv,
struct orig_node *orig_node,
struct ethhdr *ethhdr,
struct batman_packet *batman_packet,
- struct batman_if *if_incoming,
+ struct hard_iface *if_incoming,
unsigned char *hna_buff, int hna_buff_len,
char is_duplicate)
{
struct neigh_node *neigh_node = NULL, *tmp_neigh_node = NULL;
+ struct orig_node *orig_node_tmp;
+ struct hlist_node *node;
int tmp_hna_buff_len;
+ uint8_t bcast_own_sum_orig, bcast_own_sum_neigh;
bat_dbg(DBG_BATMAN, bat_priv, "update_originator(): "
"Searching and updating originator entry of received packet\n");
- list_for_each_entry(tmp_neigh_node, &orig_node->neigh_list, list) {
- if (compare_orig(tmp_neigh_node->addr, ethhdr->h_source) &&
- (tmp_neigh_node->if_incoming == if_incoming)) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_node, node,
+ &orig_node->neigh_list, list) {
+ if (compare_eth(tmp_neigh_node->addr, ethhdr->h_source) &&
+ (tmp_neigh_node->if_incoming == if_incoming) &&
+ atomic_inc_not_zero(&tmp_neigh_node->refcount)) {
+ if (neigh_node)
+ neigh_node_free_ref(neigh_node);
neigh_node = tmp_neigh_node;
continue;
}
@@ -280,16 +407,20 @@ static void update_orig(struct bat_priv *bat_priv,
orig_tmp = get_orig_node(bat_priv, ethhdr->h_source);
if (!orig_tmp)
- return;
+ goto unlock;
neigh_node = create_neighbor(orig_node, orig_tmp,
ethhdr->h_source, if_incoming);
+
+ orig_node_free_ref(orig_tmp);
if (!neigh_node)
- return;
+ goto unlock;
} else
bat_dbg(DBG_BATMAN, bat_priv,
"Updating existing last-hop neighbor of originator\n");
+ rcu_read_unlock();
+
orig_node->flags = batman_packet->flags;
neigh_node->last_valid = jiffies;
@@ -303,6 +434,8 @@ static void update_orig(struct bat_priv *bat_priv,
neigh_node->last_ttl = batman_packet->ttl;
}
+ bonding_candidate_add(orig_node, neigh_node);
+
tmp_hna_buff_len = (hna_buff_len > batman_packet->num_hna * ETH_ALEN ?
batman_packet->num_hna * ETH_ALEN : hna_buff_len);
@@ -319,10 +452,22 @@ static void update_orig(struct bat_priv *bat_priv,
/* if the TQ is the same and the link not more symetric we
* won't consider it either */
if ((orig_node->router) &&
- ((neigh_node->tq_avg == orig_node->router->tq_avg) &&
- (orig_node->router->orig_node->bcast_own_sum[if_incoming->if_num]
- >= neigh_node->orig_node->bcast_own_sum[if_incoming->if_num])))
- goto update_hna;
+ (neigh_node->tq_avg == orig_node->router->tq_avg)) {
+ orig_node_tmp = orig_node->router->orig_node;
+ spin_lock_bh(&orig_node_tmp->ogm_cnt_lock);
+ bcast_own_sum_orig =
+ orig_node_tmp->bcast_own_sum[if_incoming->if_num];
+ spin_unlock_bh(&orig_node_tmp->ogm_cnt_lock);
+
+ orig_node_tmp = neigh_node->orig_node;
+ spin_lock_bh(&orig_node_tmp->ogm_cnt_lock);
+ bcast_own_sum_neigh =
+ orig_node_tmp->bcast_own_sum[if_incoming->if_num];
+ spin_unlock_bh(&orig_node_tmp->ogm_cnt_lock);
+
+ if (bcast_own_sum_orig >= bcast_own_sum_neigh)
+ goto update_hna;
+ }
update_routes(bat_priv, orig_node, neigh_node,
hna_buff, tmp_hna_buff_len);
@@ -343,6 +488,14 @@ update_gw:
(atomic_read(&bat_priv->gw_mode) == GW_MODE_CLIENT) &&
(atomic_read(&bat_priv->gw_sel_class) > 2))
gw_check_election(bat_priv, orig_node);
+
+ goto out;
+
+unlock:
+ rcu_read_unlock();
+out:
+ if (neigh_node)
+ neigh_node_free_ref(neigh_node);
}
/* checks whether the host restarted and is in the protection time.
@@ -380,34 +533,38 @@ static int window_protected(struct bat_priv *bat_priv,
*/
static char count_real_packets(struct ethhdr *ethhdr,
struct batman_packet *batman_packet,
- struct batman_if *if_incoming)
+ struct hard_iface *if_incoming)
{
struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
struct orig_node *orig_node;
struct neigh_node *tmp_neigh_node;
+ struct hlist_node *node;
char is_duplicate = 0;
int32_t seq_diff;
int need_update = 0;
- int set_mark;
+ int set_mark, ret = -1;
orig_node = get_orig_node(bat_priv, batman_packet->orig);
if (!orig_node)
return 0;
+ spin_lock_bh(&orig_node->ogm_cnt_lock);
seq_diff = batman_packet->seqno - orig_node->last_real_seqno;
/* signalize caller that the packet is to be dropped. */
if (window_protected(bat_priv, seq_diff,
&orig_node->batman_seqno_reset))
- return -1;
+ goto out;
- list_for_each_entry(tmp_neigh_node, &orig_node->neigh_list, list) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_node, node,
+ &orig_node->neigh_list, list) {
is_duplicate |= get_bit_status(tmp_neigh_node->real_bits,
orig_node->last_real_seqno,
batman_packet->seqno);
- if (compare_orig(tmp_neigh_node->addr, ethhdr->h_source) &&
+ if (compare_eth(tmp_neigh_node->addr, ethhdr->h_source) &&
(tmp_neigh_node->if_incoming == if_incoming))
set_mark = 1;
else
@@ -421,6 +578,7 @@ static char count_real_packets(struct ethhdr *ethhdr,
tmp_neigh_node->real_packet_count =
bit_packet_count(tmp_neigh_node->real_bits);
}
+ rcu_read_unlock();
if (need_update) {
bat_dbg(DBG_BATMAN, bat_priv,
@@ -429,123 +587,21 @@ static char count_real_packets(struct ethhdr *ethhdr,
orig_node->last_real_seqno = batman_packet->seqno;
}
- return is_duplicate;
-}
-
-/* copy primary address for bonding */
-static void mark_bonding_address(struct bat_priv *bat_priv,
- struct orig_node *orig_node,
- struct orig_node *orig_neigh_node,
- struct batman_packet *batman_packet)
+ ret = is_duplicate;
-{
- if (batman_packet->flags & PRIMARIES_FIRST_HOP)
- memcpy(orig_neigh_node->primary_addr,
- orig_node->orig, ETH_ALEN);
-
- return;
-}
-
-/* mark possible bond.candidates in the neighbor list */
-void update_bonding_candidates(struct bat_priv *bat_priv,
- struct orig_node *orig_node)
-{
- int candidates;
- int interference_candidate;
- int best_tq;
- struct neigh_node *tmp_neigh_node, *tmp_neigh_node2;
- struct neigh_node *first_candidate, *last_candidate;
-
- /* update the candidates for this originator */
- if (!orig_node->router) {
- orig_node->bond.candidates = 0;
- return;
- }
-
- best_tq = orig_node->router->tq_avg;
-
- /* update bond.candidates */
-
- candidates = 0;
-
- /* mark other nodes which also received "PRIMARIES FIRST HOP" packets
- * as "bonding partner" */
-
- /* first, zero the list */
- list_for_each_entry(tmp_neigh_node, &orig_node->neigh_list, list) {
- tmp_neigh_node->next_bond_candidate = NULL;
- }
-
- first_candidate = NULL;
- last_candidate = NULL;
- list_for_each_entry(tmp_neigh_node, &orig_node->neigh_list, list) {
-
- /* only consider if it has the same primary address ... */
- if (memcmp(orig_node->orig,
- tmp_neigh_node->orig_node->primary_addr,
- ETH_ALEN) != 0)
- continue;
-
- /* ... and is good enough to be considered */
- if (tmp_neigh_node->tq_avg < best_tq - BONDING_TQ_THRESHOLD)
- continue;
-
- /* check if we have another candidate with the same
- * mac address or interface. If we do, we won't
- * select this candidate because of possible interference. */
-
- interference_candidate = 0;
- list_for_each_entry(tmp_neigh_node2,
- &orig_node->neigh_list, list) {
-
- if (tmp_neigh_node2 == tmp_neigh_node)
- continue;
-
- /* we only care if the other candidate is even
- * considered as candidate. */
- if (!tmp_neigh_node2->next_bond_candidate)
- continue;
-
-
- if ((tmp_neigh_node->if_incoming ==
- tmp_neigh_node2->if_incoming)
- || (memcmp(tmp_neigh_node->addr,
- tmp_neigh_node2->addr, ETH_ALEN) == 0)) {
-
- interference_candidate = 1;
- break;
- }
- }
- /* don't care further if it is an interference candidate */
- if (interference_candidate)
- continue;
-
- if (!first_candidate) {
- first_candidate = tmp_neigh_node;
- tmp_neigh_node->next_bond_candidate = first_candidate;
- } else
- tmp_neigh_node->next_bond_candidate = last_candidate;
-
- last_candidate = tmp_neigh_node;
-
- candidates++;
- }
-
- if (candidates > 0) {
- first_candidate->next_bond_candidate = last_candidate;
- orig_node->bond.selected = first_candidate;
- }
-
- orig_node->bond.candidates = candidates;
+out:
+ spin_unlock_bh(&orig_node->ogm_cnt_lock);
+ orig_node_free_ref(orig_node);
+ return ret;
}
void receive_bat_packet(struct ethhdr *ethhdr,
- struct batman_packet *batman_packet,
- unsigned char *hna_buff, int hna_buff_len,
- struct batman_if *if_incoming)
+ struct batman_packet *batman_packet,
+ unsigned char *hna_buff, int hna_buff_len,
+ struct hard_iface *if_incoming)
{
struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
struct orig_node *orig_neigh_node, *orig_node;
char has_directlink_flag;
char is_my_addr = 0, is_my_orig = 0, is_my_oldorig = 0;
@@ -573,8 +629,8 @@ void receive_bat_packet(struct ethhdr *ethhdr,
has_directlink_flag = (batman_packet->flags & DIRECTLINK ? 1 : 0);
- is_single_hop_neigh = (compare_orig(ethhdr->h_source,
- batman_packet->orig) ? 1 : 0);
+ is_single_hop_neigh = (compare_eth(ethhdr->h_source,
+ batman_packet->orig) ? 1 : 0);
bat_dbg(DBG_BATMAN, bat_priv,
"Received BATMAN packet via NB: %pM, IF: %s [%pM] "
@@ -587,26 +643,26 @@ void receive_bat_packet(struct ethhdr *ethhdr,
has_directlink_flag);
rcu_read_lock();
- list_for_each_entry_rcu(batman_if, &if_list, list) {
- if (batman_if->if_status != IF_ACTIVE)
+ list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+ if (hard_iface->if_status != IF_ACTIVE)
continue;
- if (batman_if->soft_iface != if_incoming->soft_iface)
+ if (hard_iface->soft_iface != if_incoming->soft_iface)
continue;
- if (compare_orig(ethhdr->h_source,
- batman_if->net_dev->dev_addr))
+ if (compare_eth(ethhdr->h_source,
+ hard_iface->net_dev->dev_addr))
is_my_addr = 1;
- if (compare_orig(batman_packet->orig,
- batman_if->net_dev->dev_addr))
+ if (compare_eth(batman_packet->orig,
+ hard_iface->net_dev->dev_addr))
is_my_orig = 1;
- if (compare_orig(batman_packet->prev_sender,
- batman_if->net_dev->dev_addr))
+ if (compare_eth(batman_packet->prev_sender,
+ hard_iface->net_dev->dev_addr))
is_my_oldorig = 1;
- if (compare_orig(ethhdr->h_source, broadcast_addr))
+ if (compare_eth(ethhdr->h_source, broadcast_addr))
is_broadcast = 1;
}
rcu_read_unlock();
@@ -638,7 +694,6 @@ void receive_bat_packet(struct ethhdr *ethhdr,
int offset;
orig_neigh_node = get_orig_node(bat_priv, ethhdr->h_source);
-
if (!orig_neigh_node)
return;
@@ -647,18 +702,22 @@ void receive_bat_packet(struct ethhdr *ethhdr,
/* if received seqno equals last send seqno save new
* seqno for bidirectional check */
if (has_directlink_flag &&
- compare_orig(if_incoming->net_dev->dev_addr,
- batman_packet->orig) &&
+ compare_eth(if_incoming->net_dev->dev_addr,
+ batman_packet->orig) &&
(batman_packet->seqno - if_incoming_seqno + 2 == 0)) {
offset = if_incoming->if_num * NUM_WORDS;
+
+ spin_lock_bh(&orig_neigh_node->ogm_cnt_lock);
word = &(orig_neigh_node->bcast_own[offset]);
bit_mark(word, 0);
orig_neigh_node->bcast_own_sum[if_incoming->if_num] =
bit_packet_count(word);
+ spin_unlock_bh(&orig_neigh_node->ogm_cnt_lock);
}
bat_dbg(DBG_BATMAN, bat_priv, "Drop packet: "
"originator packet from myself (via neighbor)\n");
+ orig_node_free_ref(orig_neigh_node);
return;
}
@@ -679,27 +738,27 @@ void receive_bat_packet(struct ethhdr *ethhdr,
bat_dbg(DBG_BATMAN, bat_priv,
"Drop packet: packet within seqno protection time "
"(sender: %pM)\n", ethhdr->h_source);
- return;
+ goto out;
}
if (batman_packet->tq == 0) {
bat_dbg(DBG_BATMAN, bat_priv,
"Drop packet: originator packet with tq equal 0\n");
- return;
+ goto out;
}
/* avoid temporary routing loops */
if ((orig_node->router) &&
(orig_node->router->orig_node->router) &&
- (compare_orig(orig_node->router->addr,
- batman_packet->prev_sender)) &&
- !(compare_orig(batman_packet->orig, batman_packet->prev_sender)) &&
- (compare_orig(orig_node->router->addr,
- orig_node->router->orig_node->router->addr))) {
+ (compare_eth(orig_node->router->addr,
+ batman_packet->prev_sender)) &&
+ !(compare_eth(batman_packet->orig, batman_packet->prev_sender)) &&
+ (compare_eth(orig_node->router->addr,
+ orig_node->router->orig_node->router->addr))) {
bat_dbg(DBG_BATMAN, bat_priv,
"Drop packet: ignoring all rebroadcast packets that "
"may make me loop (sender: %pM)\n", ethhdr->h_source);
- return;
+ goto out;
}
/* if sender is a direct neighbor the sender mac equals
@@ -708,19 +767,21 @@ void receive_bat_packet(struct ethhdr *ethhdr,
orig_node :
get_orig_node(bat_priv, ethhdr->h_source));
if (!orig_neigh_node)
- return;
+ goto out;
/* drop packet if sender is not a direct neighbor and if we
* don't route towards it */
if (!is_single_hop_neigh && (!orig_neigh_node->router)) {
bat_dbg(DBG_BATMAN, bat_priv,
"Drop packet: OGM via unknown neighbor!\n");
- return;
+ goto out_neigh;
}
is_bidirectional = is_bidirectional_neigh(orig_node, orig_neigh_node,
batman_packet, if_incoming);
+ bonding_save_primary(orig_node, orig_neigh_node, batman_packet);
+
/* update ranking if it is not a duplicate or has the same
* seqno and similar ttl as the non-duplicate */
if (is_bidirectional &&
@@ -730,10 +791,6 @@ void receive_bat_packet(struct ethhdr *ethhdr,
update_orig(bat_priv, orig_node, ethhdr, batman_packet,
if_incoming, hna_buff, hna_buff_len, is_duplicate);
- mark_bonding_address(bat_priv, orig_node,
- orig_neigh_node, batman_packet);
- update_bonding_candidates(bat_priv, orig_node);
-
/* is single hop (direct) neighbor */
if (is_single_hop_neigh) {
@@ -743,31 +800,36 @@ void receive_bat_packet(struct ethhdr *ethhdr,
bat_dbg(DBG_BATMAN, bat_priv, "Forwarding packet: "
"rebroadcast neighbor packet with direct link flag\n");
- return;
+ goto out_neigh;
}
/* multihop originator */
if (!is_bidirectional) {
bat_dbg(DBG_BATMAN, bat_priv,
"Drop packet: not received via bidirectional link\n");
- return;
+ goto out_neigh;
}
if (is_duplicate) {
bat_dbg(DBG_BATMAN, bat_priv,
"Drop packet: duplicate packet received\n");
- return;
+ goto out_neigh;
}
bat_dbg(DBG_BATMAN, bat_priv,
"Forwarding packet: rebroadcast originator packet\n");
schedule_forward_packet(orig_node, ethhdr, batman_packet,
0, hna_buff_len, if_incoming);
+
+out_neigh:
+ if ((orig_neigh_node) && (!is_single_hop_neigh))
+ orig_node_free_ref(orig_neigh_node);
+out:
+ orig_node_free_ref(orig_node);
}
-int recv_bat_packet(struct sk_buff *skb, struct batman_if *batman_if)
+int recv_bat_packet(struct sk_buff *skb, struct hard_iface *hard_iface)
{
- struct bat_priv *bat_priv = netdev_priv(batman_if->soft_iface);
struct ethhdr *ethhdr;
/* drop packet if it has not necessary minimum size */
@@ -794,12 +856,10 @@ int recv_bat_packet(struct sk_buff *skb, struct batman_if *batman_if)
ethhdr = (struct ethhdr *)skb_mac_header(skb);
- spin_lock_bh(&bat_priv->orig_hash_lock);
receive_aggr_bat_packet(ethhdr,
skb->data,
skb_headlen(skb),
- batman_if);
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ hard_iface);
kfree_skb(skb);
return NET_RX_SUCCESS;
@@ -808,135 +868,144 @@ int recv_bat_packet(struct sk_buff *skb, struct batman_if *batman_if)
static int recv_my_icmp_packet(struct bat_priv *bat_priv,
struct sk_buff *skb, size_t icmp_len)
{
- struct orig_node *orig_node;
+ struct orig_node *orig_node = NULL;
+ struct neigh_node *neigh_node = NULL;
struct icmp_packet_rr *icmp_packet;
- struct ethhdr *ethhdr;
- struct batman_if *batman_if;
- int ret;
- uint8_t dstaddr[ETH_ALEN];
+ int ret = NET_RX_DROP;
icmp_packet = (struct icmp_packet_rr *)skb->data;
- ethhdr = (struct ethhdr *)skb_mac_header(skb);
/* add data to device queue */
if (icmp_packet->msg_type != ECHO_REQUEST) {
bat_socket_receive_packet(icmp_packet, icmp_len);
- return NET_RX_DROP;
+ goto out;
}
if (!bat_priv->primary_if)
- return NET_RX_DROP;
+ goto out;
/* answer echo request (ping) */
/* get routing information */
- spin_lock_bh(&bat_priv->orig_hash_lock);
- orig_node = ((struct orig_node *)hash_find(bat_priv->orig_hash,
- compare_orig, choose_orig,
- icmp_packet->orig));
- ret = NET_RX_DROP;
-
- if ((orig_node) && (orig_node->router)) {
-
- /* don't lock while sending the packets ... we therefore
- * copy the required data before sending */
- batman_if = orig_node->router->if_incoming;
- memcpy(dstaddr, orig_node->router->addr, ETH_ALEN);
- spin_unlock_bh(&bat_priv->orig_hash_lock);
-
- /* create a copy of the skb, if needed, to modify it. */
- if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
- return NET_RX_DROP;
+ rcu_read_lock();
+ orig_node = orig_hash_find(bat_priv, icmp_packet->orig);
- icmp_packet = (struct icmp_packet_rr *)skb->data;
- ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ if (!orig_node)
+ goto unlock;
- memcpy(icmp_packet->dst, icmp_packet->orig, ETH_ALEN);
- memcpy(icmp_packet->orig,
- bat_priv->primary_if->net_dev->dev_addr, ETH_ALEN);
- icmp_packet->msg_type = ECHO_REPLY;
- icmp_packet->ttl = TTL;
+ neigh_node = orig_node->router;
- send_skb_packet(skb, batman_if, dstaddr);
- ret = NET_RX_SUCCESS;
+ if (!neigh_node)
+ goto unlock;
- } else
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ if (!atomic_inc_not_zero(&neigh_node->refcount)) {
+ neigh_node = NULL;
+ goto unlock;
+ }
+
+ rcu_read_unlock();
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
+ goto out;
+
+ icmp_packet = (struct icmp_packet_rr *)skb->data;
+
+ memcpy(icmp_packet->dst, icmp_packet->orig, ETH_ALEN);
+ memcpy(icmp_packet->orig,
+ bat_priv->primary_if->net_dev->dev_addr, ETH_ALEN);
+ icmp_packet->msg_type = ECHO_REPLY;
+ icmp_packet->ttl = TTL;
+ send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+ ret = NET_RX_SUCCESS;
+ goto out;
+
+unlock:
+ rcu_read_unlock();
+out:
+ if (neigh_node)
+ neigh_node_free_ref(neigh_node);
+ if (orig_node)
+ orig_node_free_ref(orig_node);
return ret;
}
static int recv_icmp_ttl_exceeded(struct bat_priv *bat_priv,
- struct sk_buff *skb, size_t icmp_len)
+ struct sk_buff *skb)
{
- struct orig_node *orig_node;
+ struct orig_node *orig_node = NULL;
+ struct neigh_node *neigh_node = NULL;
struct icmp_packet *icmp_packet;
- struct ethhdr *ethhdr;
- struct batman_if *batman_if;
- int ret;
- uint8_t dstaddr[ETH_ALEN];
+ int ret = NET_RX_DROP;
icmp_packet = (struct icmp_packet *)skb->data;
- ethhdr = (struct ethhdr *)skb_mac_header(skb);
/* send TTL exceeded if packet is an echo request (traceroute) */
if (icmp_packet->msg_type != ECHO_REQUEST) {
pr_debug("Warning - can't forward icmp packet from %pM to "
"%pM: ttl exceeded\n", icmp_packet->orig,
icmp_packet->dst);
- return NET_RX_DROP;
+ goto out;
}
if (!bat_priv->primary_if)
- return NET_RX_DROP;
+ goto out;
/* get routing information */
- spin_lock_bh(&bat_priv->orig_hash_lock);
- orig_node = ((struct orig_node *)
- hash_find(bat_priv->orig_hash, compare_orig, choose_orig,
- icmp_packet->orig));
- ret = NET_RX_DROP;
-
- if ((orig_node) && (orig_node->router)) {
-
- /* don't lock while sending the packets ... we therefore
- * copy the required data before sending */
- batman_if = orig_node->router->if_incoming;
- memcpy(dstaddr, orig_node->router->addr, ETH_ALEN);
- spin_unlock_bh(&bat_priv->orig_hash_lock);
-
- /* create a copy of the skb, if needed, to modify it. */
- if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
- return NET_RX_DROP;
+ rcu_read_lock();
+ orig_node = orig_hash_find(bat_priv, icmp_packet->orig);
- icmp_packet = (struct icmp_packet *) skb->data;
- ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ if (!orig_node)
+ goto unlock;
- memcpy(icmp_packet->dst, icmp_packet->orig, ETH_ALEN);
- memcpy(icmp_packet->orig,
- bat_priv->primary_if->net_dev->dev_addr, ETH_ALEN);
- icmp_packet->msg_type = TTL_EXCEEDED;
- icmp_packet->ttl = TTL;
+ neigh_node = orig_node->router;
- send_skb_packet(skb, batman_if, dstaddr);
- ret = NET_RX_SUCCESS;
+ if (!neigh_node)
+ goto unlock;
- } else
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ if (!atomic_inc_not_zero(&neigh_node->refcount)) {
+ neigh_node = NULL;
+ goto unlock;
+ }
+ rcu_read_unlock();
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
+ goto out;
+
+ icmp_packet = (struct icmp_packet *)skb->data;
+
+ memcpy(icmp_packet->dst, icmp_packet->orig, ETH_ALEN);
+ memcpy(icmp_packet->orig,
+ bat_priv->primary_if->net_dev->dev_addr, ETH_ALEN);
+ icmp_packet->msg_type = TTL_EXCEEDED;
+ icmp_packet->ttl = TTL;
+
+ send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+ ret = NET_RX_SUCCESS;
+ goto out;
+
+unlock:
+ rcu_read_unlock();
+out:
+ if (neigh_node)
+ neigh_node_free_ref(neigh_node);
+ if (orig_node)
+ orig_node_free_ref(orig_node);
return ret;
}
-int recv_icmp_packet(struct sk_buff *skb, struct batman_if *recv_if)
+int recv_icmp_packet(struct sk_buff *skb, struct hard_iface *recv_if)
{
struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface);
struct icmp_packet_rr *icmp_packet;
struct ethhdr *ethhdr;
- struct orig_node *orig_node;
- struct batman_if *batman_if;
+ struct orig_node *orig_node = NULL;
+ struct neigh_node *neigh_node = NULL;
int hdr_size = sizeof(struct icmp_packet);
- int ret;
- uint8_t dstaddr[ETH_ALEN];
+ int ret = NET_RX_DROP;
/**
* we truncate all incoming icmp packets if they don't match our size
@@ -946,21 +1015,21 @@ int recv_icmp_packet(struct sk_buff *skb, struct batman_if *recv_if)
/* drop packet if it has not necessary minimum size */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
- return NET_RX_DROP;
+ goto out;
ethhdr = (struct ethhdr *)skb_mac_header(skb);
/* packet with unicast indication but broadcast recipient */
if (is_broadcast_ether_addr(ethhdr->h_dest))
- return NET_RX_DROP;
+ goto out;
/* packet with broadcast sender address */
if (is_broadcast_ether_addr(ethhdr->h_source))
- return NET_RX_DROP;
+ goto out;
/* not for me */
if (!is_my_mac(ethhdr->h_dest))
- return NET_RX_DROP;
+ goto out;
icmp_packet = (struct icmp_packet_rr *)skb->data;
@@ -978,53 +1047,61 @@ int recv_icmp_packet(struct sk_buff *skb, struct batman_if *recv_if)
/* TTL exceeded */
if (icmp_packet->ttl < 2)
- return recv_icmp_ttl_exceeded(bat_priv, skb, hdr_size);
-
- ret = NET_RX_DROP;
+ return recv_icmp_ttl_exceeded(bat_priv, skb);
/* get routing information */
- spin_lock_bh(&bat_priv->orig_hash_lock);
- orig_node = ((struct orig_node *)
- hash_find(bat_priv->orig_hash, compare_orig, choose_orig,
- icmp_packet->dst));
+ rcu_read_lock();
+ orig_node = orig_hash_find(bat_priv, icmp_packet->dst);
- if ((orig_node) && (orig_node->router)) {
+ if (!orig_node)
+ goto unlock;
- /* don't lock while sending the packets ... we therefore
- * copy the required data before sending */
- batman_if = orig_node->router->if_incoming;
- memcpy(dstaddr, orig_node->router->addr, ETH_ALEN);
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ neigh_node = orig_node->router;
- /* create a copy of the skb, if needed, to modify it. */
- if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
- return NET_RX_DROP;
+ if (!neigh_node)
+ goto unlock;
- icmp_packet = (struct icmp_packet_rr *)skb->data;
- ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ if (!atomic_inc_not_zero(&neigh_node->refcount)) {
+ neigh_node = NULL;
+ goto unlock;
+ }
- /* decrement ttl */
- icmp_packet->ttl--;
+ rcu_read_unlock();
- /* route it */
- send_skb_packet(skb, batman_if, dstaddr);
- ret = NET_RX_SUCCESS;
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
+ goto out;
- } else
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ icmp_packet = (struct icmp_packet_rr *)skb->data;
+
+ /* decrement ttl */
+ icmp_packet->ttl--;
+ /* route it */
+ send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+ ret = NET_RX_SUCCESS;
+ goto out;
+
+unlock:
+ rcu_read_unlock();
+out:
+ if (neigh_node)
+ neigh_node_free_ref(neigh_node);
+ if (orig_node)
+ orig_node_free_ref(orig_node);
return ret;
}
/* find a suitable router for this originator, and use
- * bonding if possible. */
+ * bonding if possible. increases the found neighbor's
+ * refcount. */
struct neigh_node *find_router(struct bat_priv *bat_priv,
struct orig_node *orig_node,
- struct batman_if *recv_if)
+ struct hard_iface *recv_if)
{
struct orig_node *primary_orig_node;
struct orig_node *router_orig;
- struct neigh_node *router, *first_candidate, *best_router;
+ struct neigh_node *router, *first_candidate, *tmp_neigh_node;
static uint8_t zero_mac[ETH_ALEN] = {0, 0, 0, 0, 0, 0};
int bonding_enabled;
@@ -1036,78 +1113,128 @@ struct neigh_node *find_router(struct bat_priv *bat_priv,
/* without bonding, the first node should
* always choose the default router. */
-
bonding_enabled = atomic_read(&bat_priv->bonding);
- if ((!recv_if) && (!bonding_enabled))
- return orig_node->router;
-
+ rcu_read_lock();
+ /* select default router to output */
+ router = orig_node->router;
router_orig = orig_node->router->orig_node;
+ if (!router_orig || !atomic_inc_not_zero(&router->refcount)) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ if ((!recv_if) && (!bonding_enabled))
+ goto return_router;
/* if we have something in the primary_addr, we can search
* for a potential bonding candidate. */
- if (memcmp(router_orig->primary_addr, zero_mac, ETH_ALEN) == 0)
- return orig_node->router;
+ if (compare_eth(router_orig->primary_addr, zero_mac))
+ goto return_router;
/* find the orig_node which has the primary interface. might
* even be the same as our router_orig in many cases */
- if (memcmp(router_orig->primary_addr,
- router_orig->orig, ETH_ALEN) == 0) {
+ if (compare_eth(router_orig->primary_addr, router_orig->orig)) {
primary_orig_node = router_orig;
} else {
- primary_orig_node = hash_find(bat_priv->orig_hash, compare_orig,
- choose_orig,
- router_orig->primary_addr);
-
+ primary_orig_node = orig_hash_find(bat_priv,
+ router_orig->primary_addr);
if (!primary_orig_node)
- return orig_node->router;
+ goto return_router;
+
+ orig_node_free_ref(primary_orig_node);
}
/* with less than 2 candidates, we can't do any
* bonding and prefer the original router. */
-
- if (primary_orig_node->bond.candidates < 2)
- return orig_node->router;
+ if (atomic_read(&primary_orig_node->bond_candidates) < 2)
+ goto return_router;
/* all nodes between should choose a candidate which
* is not on the interface where the packet came
* in. */
- first_candidate = primary_orig_node->bond.selected;
- router = first_candidate;
+
+ neigh_node_free_ref(router);
+ first_candidate = NULL;
+ router = NULL;
if (bonding_enabled) {
/* in the bonding case, send the packets in a round
* robin fashion over the remaining interfaces. */
- do {
+
+ list_for_each_entry_rcu(tmp_neigh_node,
+ &primary_orig_node->bond_list, bonding_list) {
+ if (!first_candidate)
+ first_candidate = tmp_neigh_node;
/* recv_if == NULL on the first node. */
- if (router->if_incoming != recv_if)
+ if (tmp_neigh_node->if_incoming != recv_if &&
+ atomic_inc_not_zero(&tmp_neigh_node->refcount)) {
+ router = tmp_neigh_node;
break;
+ }
+ }
+
+ /* use the first candidate if nothing was found. */
+ if (!router && first_candidate &&
+ atomic_inc_not_zero(&first_candidate->refcount))
+ router = first_candidate;
- router = router->next_bond_candidate;
- } while (router != first_candidate);
+ if (!router) {
+ rcu_read_unlock();
+ return NULL;
+ }
- primary_orig_node->bond.selected = router->next_bond_candidate;
+ /* selected should point to the next element
+ * after the current router */
+ spin_lock_bh(&primary_orig_node->neigh_list_lock);
+ /* this is a list_move(), which unfortunately
+ * does not exist as rcu version */
+ list_del_rcu(&primary_orig_node->bond_list);
+ list_add_rcu(&primary_orig_node->bond_list,
+ &router->bonding_list);
+ spin_unlock_bh(&primary_orig_node->neigh_list_lock);
} else {
/* if bonding is disabled, use the best of the
* remaining candidates which are not using
* this interface. */
- best_router = first_candidate;
+ list_for_each_entry_rcu(tmp_neigh_node,
+ &primary_orig_node->bond_list, bonding_list) {
+ if (!first_candidate)
+ first_candidate = tmp_neigh_node;
- do {
/* recv_if == NULL on the first node. */
- if ((router->if_incoming != recv_if) &&
- (router->tq_avg > best_router->tq_avg))
- best_router = router;
+ if (tmp_neigh_node->if_incoming == recv_if)
+ continue;
- router = router->next_bond_candidate;
- } while (router != first_candidate);
+ if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+ continue;
- router = best_router;
- }
+ /* if we don't have a router yet
+ * or this one is better, choose it. */
+ if ((!router) ||
+ (tmp_neigh_node->tq_avg > router->tq_avg)) {
+ /* decrement refcount of
+ * previously selected router */
+ if (router)
+ neigh_node_free_ref(router);
+
+ router = tmp_neigh_node;
+ atomic_inc_not_zero(&router->refcount);
+ }
+
+ neigh_node_free_ref(tmp_neigh_node);
+ }
+ /* use the first candidate if nothing was found. */
+ if (!router && first_candidate &&
+ atomic_inc_not_zero(&first_candidate->refcount))
+ router = first_candidate;
+ }
+return_router:
+ rcu_read_unlock();
return router;
}
@@ -1136,17 +1263,14 @@ static int check_unicast_packet(struct sk_buff *skb, int hdr_size)
return 0;
}
-int route_unicast_packet(struct sk_buff *skb, struct batman_if *recv_if,
- int hdr_size)
+int route_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if)
{
struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface);
- struct orig_node *orig_node;
- struct neigh_node *router;
- struct batman_if *batman_if;
- uint8_t dstaddr[ETH_ALEN];
+ struct orig_node *orig_node = NULL;
+ struct neigh_node *neigh_node = NULL;
struct unicast_packet *unicast_packet;
struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
- int ret;
+ int ret = NET_RX_DROP;
struct sk_buff *new_skb;
unicast_packet = (struct unicast_packet *)skb->data;
@@ -1156,53 +1280,51 @@ int route_unicast_packet(struct sk_buff *skb, struct batman_if *recv_if,
pr_debug("Warning - can't forward unicast packet from %pM to "
"%pM: ttl exceeded\n", ethhdr->h_source,
unicast_packet->dest);
- return NET_RX_DROP;
+ goto out;
}
/* get routing information */
- spin_lock_bh(&bat_priv->orig_hash_lock);
- orig_node = ((struct orig_node *)
- hash_find(bat_priv->orig_hash, compare_orig, choose_orig,
- unicast_packet->dest));
-
- router = find_router(bat_priv, orig_node, recv_if);
+ rcu_read_lock();
+ orig_node = orig_hash_find(bat_priv, unicast_packet->dest);
- if (!router) {
- spin_unlock_bh(&bat_priv->orig_hash_lock);
- return NET_RX_DROP;
- }
+ if (!orig_node)
+ goto unlock;
- /* don't lock while sending the packets ... we therefore
- * copy the required data before sending */
+ rcu_read_unlock();
- batman_if = router->if_incoming;
- memcpy(dstaddr, router->addr, ETH_ALEN);
+ /* find_router() increases the neigh_node's refcount if found. */
+ neigh_node = find_router(bat_priv, orig_node, recv_if);
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ if (!neigh_node)
+ goto out;
/* create a copy of the skb, if needed, to modify it. */
if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
- return NET_RX_DROP;
+ goto out;
unicast_packet = (struct unicast_packet *)skb->data;
if (unicast_packet->packet_type == BAT_UNICAST &&
atomic_read(&bat_priv->fragmentation) &&
- skb->len > batman_if->net_dev->mtu)
- return frag_send_skb(skb, bat_priv, batman_if,
- dstaddr);
+ skb->len > neigh_node->if_incoming->net_dev->mtu) {
+ ret = frag_send_skb(skb, bat_priv,
+ neigh_node->if_incoming, neigh_node->addr);
+ goto out;
+ }
if (unicast_packet->packet_type == BAT_UNICAST_FRAG &&
- 2 * skb->len - hdr_size <= batman_if->net_dev->mtu) {
+ frag_can_reassemble(skb, neigh_node->if_incoming->net_dev->mtu)) {
ret = frag_reassemble_skb(skb, bat_priv, &new_skb);
if (ret == NET_RX_DROP)
- return NET_RX_DROP;
+ goto out;
/* packet was buffered for late merge */
- if (!new_skb)
- return NET_RX_SUCCESS;
+ if (!new_skb) {
+ ret = NET_RX_SUCCESS;
+ goto out;
+ }
skb = new_skb;
unicast_packet = (struct unicast_packet *)skb->data;
@@ -1212,12 +1334,21 @@ int route_unicast_packet(struct sk_buff *skb, struct batman_if *recv_if,
unicast_packet->ttl--;
/* route it */
- send_skb_packet(skb, batman_if, dstaddr);
+ send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+ ret = NET_RX_SUCCESS;
+ goto out;
- return NET_RX_SUCCESS;
+unlock:
+ rcu_read_unlock();
+out:
+ if (neigh_node)
+ neigh_node_free_ref(neigh_node);
+ if (orig_node)
+ orig_node_free_ref(orig_node);
+ return ret;
}
-int recv_unicast_packet(struct sk_buff *skb, struct batman_if *recv_if)
+int recv_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if)
{
struct unicast_packet *unicast_packet;
int hdr_size = sizeof(struct unicast_packet);
@@ -1233,10 +1364,10 @@ int recv_unicast_packet(struct sk_buff *skb, struct batman_if *recv_if)
return NET_RX_SUCCESS;
}
- return route_unicast_packet(skb, recv_if, hdr_size);
+ return route_unicast_packet(skb, recv_if);
}
-int recv_ucast_frag_packet(struct sk_buff *skb, struct batman_if *recv_if)
+int recv_ucast_frag_packet(struct sk_buff *skb, struct hard_iface *recv_if)
{
struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface);
struct unicast_frag_packet *unicast_packet;
@@ -1266,89 +1397,96 @@ int recv_ucast_frag_packet(struct sk_buff *skb, struct batman_if *recv_if)
return NET_RX_SUCCESS;
}
- return route_unicast_packet(skb, recv_if, hdr_size);
+ return route_unicast_packet(skb, recv_if);
}
-int recv_bcast_packet(struct sk_buff *skb, struct batman_if *recv_if)
+int recv_bcast_packet(struct sk_buff *skb, struct hard_iface *recv_if)
{
struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface);
- struct orig_node *orig_node;
+ struct orig_node *orig_node = NULL;
struct bcast_packet *bcast_packet;
struct ethhdr *ethhdr;
int hdr_size = sizeof(struct bcast_packet);
+ int ret = NET_RX_DROP;
int32_t seq_diff;
/* drop packet if it has not necessary minimum size */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
- return NET_RX_DROP;
+ goto out;
ethhdr = (struct ethhdr *)skb_mac_header(skb);
/* packet with broadcast indication but unicast recipient */
if (!is_broadcast_ether_addr(ethhdr->h_dest))
- return NET_RX_DROP;
+ goto out;
/* packet with broadcast sender address */
if (is_broadcast_ether_addr(ethhdr->h_source))
- return NET_RX_DROP;
+ goto out;
/* ignore broadcasts sent by myself */
if (is_my_mac(ethhdr->h_source))
- return NET_RX_DROP;
+ goto out;
bcast_packet = (struct bcast_packet *)skb->data;
/* ignore broadcasts originated by myself */
if (is_my_mac(bcast_packet->orig))
- return NET_RX_DROP;
+ goto out;
if (bcast_packet->ttl < 2)
- return NET_RX_DROP;
+ goto out;
- spin_lock_bh(&bat_priv->orig_hash_lock);
- orig_node = ((struct orig_node *)
- hash_find(bat_priv->orig_hash, compare_orig, choose_orig,
- bcast_packet->orig));
+ rcu_read_lock();
+ orig_node = orig_hash_find(bat_priv, bcast_packet->orig);
- if (!orig_node) {
- spin_unlock_bh(&bat_priv->orig_hash_lock);
- return NET_RX_DROP;
- }
+ if (!orig_node)
+ goto rcu_unlock;
+
+ rcu_read_unlock();
+
+ spin_lock_bh(&orig_node->bcast_seqno_lock);
/* check whether the packet is a duplicate */
- if (get_bit_status(orig_node->bcast_bits,
- orig_node->last_bcast_seqno,
- ntohl(bcast_packet->seqno))) {
- spin_unlock_bh(&bat_priv->orig_hash_lock);
- return NET_RX_DROP;
- }
+ if (get_bit_status(orig_node->bcast_bits, orig_node->last_bcast_seqno,
+ ntohl(bcast_packet->seqno)))
+ goto spin_unlock;
seq_diff = ntohl(bcast_packet->seqno) - orig_node->last_bcast_seqno;
/* check whether the packet is old and the host just restarted. */
if (window_protected(bat_priv, seq_diff,
- &orig_node->bcast_seqno_reset)) {
- spin_unlock_bh(&bat_priv->orig_hash_lock);
- return NET_RX_DROP;
- }
+ &orig_node->bcast_seqno_reset))
+ goto spin_unlock;
/* mark broadcast in flood history, update window position
* if required. */
if (bit_get_packet(bat_priv, orig_node->bcast_bits, seq_diff, 1))
orig_node->last_bcast_seqno = ntohl(bcast_packet->seqno);
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ spin_unlock_bh(&orig_node->bcast_seqno_lock);
+
/* rebroadcast packet */
add_bcast_packet_to_list(bat_priv, skb);
/* broadcast for me */
interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size);
+ ret = NET_RX_SUCCESS;
+ goto out;
- return NET_RX_SUCCESS;
+rcu_unlock:
+ rcu_read_unlock();
+ goto out;
+spin_unlock:
+ spin_unlock_bh(&orig_node->bcast_seqno_lock);
+out:
+ if (orig_node)
+ orig_node_free_ref(orig_node);
+ return ret;
}
-int recv_vis_packet(struct sk_buff *skb, struct batman_if *recv_if)
+int recv_vis_packet(struct sk_buff *skb, struct hard_iface *recv_if)
{
struct vis_packet *vis_packet;
struct ethhdr *ethhdr;
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index f108f230bfd..b5a064c88a4 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,27 +22,25 @@
#ifndef _NET_BATMAN_ADV_ROUTING_H_
#define _NET_BATMAN_ADV_ROUTING_H_
-#include "types.h"
-
-void slide_own_bcast_window(struct batman_if *batman_if);
+void slide_own_bcast_window(struct hard_iface *hard_iface);
void receive_bat_packet(struct ethhdr *ethhdr,
struct batman_packet *batman_packet,
unsigned char *hna_buff, int hna_buff_len,
- struct batman_if *if_incoming);
+ struct hard_iface *if_incoming);
void update_routes(struct bat_priv *bat_priv, struct orig_node *orig_node,
struct neigh_node *neigh_node, unsigned char *hna_buff,
int hna_buff_len);
-int route_unicast_packet(struct sk_buff *skb, struct batman_if *recv_if,
- int hdr_size);
-int recv_icmp_packet(struct sk_buff *skb, struct batman_if *recv_if);
-int recv_unicast_packet(struct sk_buff *skb, struct batman_if *recv_if);
-int recv_ucast_frag_packet(struct sk_buff *skb, struct batman_if *recv_if);
-int recv_bcast_packet(struct sk_buff *skb, struct batman_if *recv_if);
-int recv_vis_packet(struct sk_buff *skb, struct batman_if *recv_if);
-int recv_bat_packet(struct sk_buff *skb, struct batman_if *recv_if);
+int route_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_icmp_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_ucast_frag_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_bcast_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_vis_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_bat_packet(struct sk_buff *skb, struct hard_iface *recv_if);
struct neigh_node *find_router(struct bat_priv *bat_priv,
- struct orig_node *orig_node, struct batman_if *recv_if);
-void update_bonding_candidates(struct bat_priv *bat_priv,
- struct orig_node *orig_node);
+ struct orig_node *orig_node,
+ struct hard_iface *recv_if);
+void bonding_candidate_del(struct orig_node *orig_node,
+ struct neigh_node *neigh_node);
#endif /* _NET_BATMAN_ADV_ROUTING_H_ */
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index b89b9f7709a..d49e54d932a 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -25,7 +25,6 @@
#include "translation-table.h"
#include "soft-interface.h"
#include "hard-interface.h"
-#include "types.h"
#include "vis.h"
#include "aggregation.h"
#include "gateway_common.h"
@@ -49,7 +48,7 @@ static unsigned long own_send_time(struct bat_priv *bat_priv)
}
/* when do we schedule a forwarded packet to be sent */
-static unsigned long forward_send_time(struct bat_priv *bat_priv)
+static unsigned long forward_send_time(void)
{
return jiffies + msecs_to_jiffies(random32() % (JITTER/2));
}
@@ -57,20 +56,20 @@ static unsigned long forward_send_time(struct bat_priv *bat_priv)
/* send out an already prepared packet to the given address via the
* specified batman interface */
int send_skb_packet(struct sk_buff *skb,
- struct batman_if *batman_if,
+ struct hard_iface *hard_iface,
uint8_t *dst_addr)
{
struct ethhdr *ethhdr;
- if (batman_if->if_status != IF_ACTIVE)
+ if (hard_iface->if_status != IF_ACTIVE)
goto send_skb_err;
- if (unlikely(!batman_if->net_dev))
+ if (unlikely(!hard_iface->net_dev))
goto send_skb_err;
- if (!(batman_if->net_dev->flags & IFF_UP)) {
+ if (!(hard_iface->net_dev->flags & IFF_UP)) {
pr_warning("Interface %s is not up - can't send packet via "
- "that interface!\n", batman_if->net_dev->name);
+ "that interface!\n", hard_iface->net_dev->name);
goto send_skb_err;
}
@@ -81,7 +80,7 @@ int send_skb_packet(struct sk_buff *skb,
skb_reset_mac_header(skb);
ethhdr = (struct ethhdr *) skb_mac_header(skb);
- memcpy(ethhdr->h_source, batman_if->net_dev->dev_addr, ETH_ALEN);
+ memcpy(ethhdr->h_source, hard_iface->net_dev->dev_addr, ETH_ALEN);
memcpy(ethhdr->h_dest, dst_addr, ETH_ALEN);
ethhdr->h_proto = __constant_htons(ETH_P_BATMAN);
@@ -89,7 +88,7 @@ int send_skb_packet(struct sk_buff *skb,
skb->priority = TC_PRIO_CONTROL;
skb->protocol = __constant_htons(ETH_P_BATMAN);
- skb->dev = batman_if->net_dev;
+ skb->dev = hard_iface->net_dev;
/* dev_queue_xmit() returns a negative result on error. However on
* congestion and traffic shaping, it drops and returns NET_XMIT_DROP
@@ -103,16 +102,16 @@ send_skb_err:
/* Send a packet to a given interface */
static void send_packet_to_if(struct forw_packet *forw_packet,
- struct batman_if *batman_if)
+ struct hard_iface *hard_iface)
{
- struct bat_priv *bat_priv = netdev_priv(batman_if->soft_iface);
+ struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
char *fwd_str;
uint8_t packet_num;
int16_t buff_pos;
struct batman_packet *batman_packet;
struct sk_buff *skb;
- if (batman_if->if_status != IF_ACTIVE)
+ if (hard_iface->if_status != IF_ACTIVE)
return;
packet_num = 0;
@@ -127,7 +126,7 @@ static void send_packet_to_if(struct forw_packet *forw_packet,
/* we might have aggregated direct link packets with an
* ordinary base packet */
if ((forw_packet->direct_link_flags & (1 << packet_num)) &&
- (forw_packet->if_incoming == batman_if))
+ (forw_packet->if_incoming == hard_iface))
batman_packet->flags |= DIRECTLINK;
else
batman_packet->flags &= ~DIRECTLINK;
@@ -143,7 +142,8 @@ static void send_packet_to_if(struct forw_packet *forw_packet,
batman_packet->tq, batman_packet->ttl,
(batman_packet->flags & DIRECTLINK ?
"on" : "off"),
- batman_if->net_dev->name, batman_if->net_dev->dev_addr);
+ hard_iface->net_dev->name,
+ hard_iface->net_dev->dev_addr);
buff_pos += sizeof(struct batman_packet) +
(batman_packet->num_hna * ETH_ALEN);
@@ -155,13 +155,13 @@ static void send_packet_to_if(struct forw_packet *forw_packet,
/* create clone because function is called more than once */
skb = skb_clone(forw_packet->skb, GFP_ATOMIC);
if (skb)
- send_skb_packet(skb, batman_if, broadcast_addr);
+ send_skb_packet(skb, hard_iface, broadcast_addr);
}
/* send a batman packet */
static void send_packet(struct forw_packet *forw_packet)
{
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
struct net_device *soft_iface;
struct bat_priv *bat_priv;
struct batman_packet *batman_packet =
@@ -205,17 +205,17 @@ static void send_packet(struct forw_packet *forw_packet)
/* broadcast on every interface */
rcu_read_lock();
- list_for_each_entry_rcu(batman_if, &if_list, list) {
- if (batman_if->soft_iface != soft_iface)
+ list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+ if (hard_iface->soft_iface != soft_iface)
continue;
- send_packet_to_if(forw_packet, batman_if);
+ send_packet_to_if(forw_packet, hard_iface);
}
rcu_read_unlock();
}
static void rebuild_batman_packet(struct bat_priv *bat_priv,
- struct batman_if *batman_if)
+ struct hard_iface *hard_iface)
{
int new_len;
unsigned char *new_buff;
@@ -227,7 +227,7 @@ static void rebuild_batman_packet(struct bat_priv *bat_priv,
/* keep old buffer if kmalloc should fail */
if (new_buff) {
- memcpy(new_buff, batman_if->packet_buff,
+ memcpy(new_buff, hard_iface->packet_buff,
sizeof(struct batman_packet));
batman_packet = (struct batman_packet *)new_buff;
@@ -235,21 +235,21 @@ static void rebuild_batman_packet(struct bat_priv *bat_priv,
new_buff + sizeof(struct batman_packet),
new_len - sizeof(struct batman_packet));
- kfree(batman_if->packet_buff);
- batman_if->packet_buff = new_buff;
- batman_if->packet_len = new_len;
+ kfree(hard_iface->packet_buff);
+ hard_iface->packet_buff = new_buff;
+ hard_iface->packet_len = new_len;
}
}
-void schedule_own_packet(struct batman_if *batman_if)
+void schedule_own_packet(struct hard_iface *hard_iface)
{
- struct bat_priv *bat_priv = netdev_priv(batman_if->soft_iface);
+ struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
unsigned long send_time;
struct batman_packet *batman_packet;
int vis_server;
- if ((batman_if->if_status == IF_NOT_IN_USE) ||
- (batman_if->if_status == IF_TO_BE_REMOVED))
+ if ((hard_iface->if_status == IF_NOT_IN_USE) ||
+ (hard_iface->if_status == IF_TO_BE_REMOVED))
return;
vis_server = atomic_read(&bat_priv->vis_mode);
@@ -261,51 +261,51 @@ void schedule_own_packet(struct batman_if *batman_if)
* outdated packets (especially uninitialized mac addresses) in the
* packet queue
*/
- if (batman_if->if_status == IF_TO_BE_ACTIVATED)
- batman_if->if_status = IF_ACTIVE;
+ if (hard_iface->if_status == IF_TO_BE_ACTIVATED)
+ hard_iface->if_status = IF_ACTIVE;
/* if local hna has changed and interface is a primary interface */
if ((atomic_read(&bat_priv->hna_local_changed)) &&
- (batman_if == bat_priv->primary_if))
- rebuild_batman_packet(bat_priv, batman_if);
+ (hard_iface == bat_priv->primary_if))
+ rebuild_batman_packet(bat_priv, hard_iface);
/**
* NOTE: packet_buff might just have been re-allocated in
* rebuild_batman_packet()
*/
- batman_packet = (struct batman_packet *)batman_if->packet_buff;
+ batman_packet = (struct batman_packet *)hard_iface->packet_buff;
/* change sequence number to network order */
batman_packet->seqno =
- htonl((uint32_t)atomic_read(&batman_if->seqno));
+ htonl((uint32_t)atomic_read(&hard_iface->seqno));
if (vis_server == VIS_TYPE_SERVER_SYNC)
batman_packet->flags |= VIS_SERVER;
else
batman_packet->flags &= ~VIS_SERVER;
- if ((batman_if == bat_priv->primary_if) &&
+ if ((hard_iface == bat_priv->primary_if) &&
(atomic_read(&bat_priv->gw_mode) == GW_MODE_SERVER))
batman_packet->gw_flags =
(uint8_t)atomic_read(&bat_priv->gw_bandwidth);
else
batman_packet->gw_flags = 0;
- atomic_inc(&batman_if->seqno);
+ atomic_inc(&hard_iface->seqno);
- slide_own_bcast_window(batman_if);
+ slide_own_bcast_window(hard_iface);
send_time = own_send_time(bat_priv);
add_bat_packet_to_list(bat_priv,
- batman_if->packet_buff,
- batman_if->packet_len,
- batman_if, 1, send_time);
+ hard_iface->packet_buff,
+ hard_iface->packet_len,
+ hard_iface, 1, send_time);
}
void schedule_forward_packet(struct orig_node *orig_node,
struct ethhdr *ethhdr,
struct batman_packet *batman_packet,
uint8_t directlink, int hna_buff_len,
- struct batman_if *if_incoming)
+ struct hard_iface *if_incoming)
{
struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
unsigned char in_tq, in_ttl, tq_avg = 0;
@@ -327,7 +327,7 @@ void schedule_forward_packet(struct orig_node *orig_node,
if ((orig_node->router) && (orig_node->router->tq_avg != 0)) {
/* rebroadcast ogm of best ranking neighbor as is */
- if (!compare_orig(orig_node->router->addr, ethhdr->h_source)) {
+ if (!compare_eth(orig_node->router->addr, ethhdr->h_source)) {
batman_packet->tq = orig_node->router->tq_avg;
if (orig_node->router->last_ttl)
@@ -356,7 +356,7 @@ void schedule_forward_packet(struct orig_node *orig_node,
else
batman_packet->flags &= ~DIRECTLINK;
- send_time = forward_send_time(bat_priv);
+ send_time = forward_send_time();
add_bat_packet_to_list(bat_priv,
(unsigned char *)batman_packet,
sizeof(struct batman_packet) + hna_buff_len,
@@ -444,7 +444,7 @@ out:
static void send_outstanding_bcast_packet(struct work_struct *work)
{
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
struct delayed_work *delayed_work =
container_of(work, struct delayed_work, work);
struct forw_packet *forw_packet =
@@ -462,14 +462,14 @@ static void send_outstanding_bcast_packet(struct work_struct *work)
/* rebroadcast packet */
rcu_read_lock();
- list_for_each_entry_rcu(batman_if, &if_list, list) {
- if (batman_if->soft_iface != soft_iface)
+ list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+ if (hard_iface->soft_iface != soft_iface)
continue;
/* send a copy of the saved skb */
skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC);
if (skb1)
- send_skb_packet(skb1, batman_if, broadcast_addr);
+ send_skb_packet(skb1, hard_iface, broadcast_addr);
}
rcu_read_unlock();
@@ -522,15 +522,15 @@ out:
}
void purge_outstanding_packets(struct bat_priv *bat_priv,
- struct batman_if *batman_if)
+ struct hard_iface *hard_iface)
{
struct forw_packet *forw_packet;
struct hlist_node *tmp_node, *safe_tmp_node;
- if (batman_if)
+ if (hard_iface)
bat_dbg(DBG_BATMAN, bat_priv,
"purge_outstanding_packets(): %s\n",
- batman_if->net_dev->name);
+ hard_iface->net_dev->name);
else
bat_dbg(DBG_BATMAN, bat_priv,
"purge_outstanding_packets()\n");
@@ -544,8 +544,8 @@ void purge_outstanding_packets(struct bat_priv *bat_priv,
* if purge_outstanding_packets() was called with an argument
* we delete only packets belonging to the given interface
*/
- if ((batman_if) &&
- (forw_packet->if_incoming != batman_if))
+ if ((hard_iface) &&
+ (forw_packet->if_incoming != hard_iface))
continue;
spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
@@ -568,8 +568,8 @@ void purge_outstanding_packets(struct bat_priv *bat_priv,
* if purge_outstanding_packets() was called with an argument
* we delete only packets belonging to the given interface
*/
- if ((batman_if) &&
- (forw_packet->if_incoming != batman_if))
+ if ((hard_iface) &&
+ (forw_packet->if_incoming != hard_iface))
continue;
spin_unlock_bh(&bat_priv->forw_bat_list_lock);
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index c4cefa8e4f8..7b2ff19c05e 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,20 +22,18 @@
#ifndef _NET_BATMAN_ADV_SEND_H_
#define _NET_BATMAN_ADV_SEND_H_
-#include "types.h"
-
int send_skb_packet(struct sk_buff *skb,
- struct batman_if *batman_if,
+ struct hard_iface *hard_iface,
uint8_t *dst_addr);
-void schedule_own_packet(struct batman_if *batman_if);
+void schedule_own_packet(struct hard_iface *hard_iface);
void schedule_forward_packet(struct orig_node *orig_node,
struct ethhdr *ethhdr,
struct batman_packet *batman_packet,
uint8_t directlink, int hna_buff_len,
- struct batman_if *if_outgoing);
+ struct hard_iface *if_outgoing);
int add_bcast_packet_to_list(struct bat_priv *bat_priv, struct sk_buff *skb);
void send_outstanding_bat_packet(struct work_struct *work);
void purge_outstanding_packets(struct bat_priv *bat_priv,
- struct batman_if *batman_if);
+ struct hard_iface *hard_iface);
#endif /* _NET_BATMAN_ADV_SEND_H_ */
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index e89ede192ed..9ed26140a26 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -26,18 +26,15 @@
#include "send.h"
#include "bat_debugfs.h"
#include "translation-table.h"
-#include "types.h"
#include "hash.h"
#include "gateway_common.h"
#include "gateway_client.h"
-#include "send.h"
#include "bat_sysfs.h"
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
#include "unicast.h"
-#include "routing.h"
static int bat_get_settings(struct net_device *dev, struct ethtool_cmd *cmd);
@@ -79,20 +76,18 @@ int my_skb_head_push(struct sk_buff *skb, unsigned int len)
return 0;
}
-static void softif_neigh_free_ref(struct kref *refcount)
+static void softif_neigh_free_rcu(struct rcu_head *rcu)
{
struct softif_neigh *softif_neigh;
- softif_neigh = container_of(refcount, struct softif_neigh, refcount);
+ softif_neigh = container_of(rcu, struct softif_neigh, rcu);
kfree(softif_neigh);
}
-static void softif_neigh_free_rcu(struct rcu_head *rcu)
+static void softif_neigh_free_ref(struct softif_neigh *softif_neigh)
{
- struct softif_neigh *softif_neigh;
-
- softif_neigh = container_of(rcu, struct softif_neigh, rcu);
- kref_put(&softif_neigh->refcount, softif_neigh_free_ref);
+ if (atomic_dec_and_test(&softif_neigh->refcount))
+ call_rcu(&softif_neigh->rcu, softif_neigh_free_rcu);
}
void softif_neigh_purge(struct bat_priv *bat_priv)
@@ -119,11 +114,10 @@ void softif_neigh_purge(struct bat_priv *bat_priv)
softif_neigh->addr, softif_neigh->vid);
softif_neigh_tmp = bat_priv->softif_neigh;
bat_priv->softif_neigh = NULL;
- kref_put(&softif_neigh_tmp->refcount,
- softif_neigh_free_ref);
+ softif_neigh_free_ref(softif_neigh_tmp);
}
- call_rcu(&softif_neigh->rcu, softif_neigh_free_rcu);
+ softif_neigh_free_ref(softif_neigh);
}
spin_unlock_bh(&bat_priv->softif_neigh_lock);
@@ -138,14 +132,17 @@ static struct softif_neigh *softif_neigh_get(struct bat_priv *bat_priv,
rcu_read_lock();
hlist_for_each_entry_rcu(softif_neigh, node,
&bat_priv->softif_neigh_list, list) {
- if (memcmp(softif_neigh->addr, addr, ETH_ALEN) != 0)
+ if (!compare_eth(softif_neigh->addr, addr))
continue;
if (softif_neigh->vid != vid)
continue;
+ if (!atomic_inc_not_zero(&softif_neigh->refcount))
+ continue;
+
softif_neigh->last_seen = jiffies;
- goto found;
+ goto out;
}
softif_neigh = kzalloc(sizeof(struct softif_neigh), GFP_ATOMIC);
@@ -155,15 +152,14 @@ static struct softif_neigh *softif_neigh_get(struct bat_priv *bat_priv,
memcpy(softif_neigh->addr, addr, ETH_ALEN);
softif_neigh->vid = vid;
softif_neigh->last_seen = jiffies;
- kref_init(&softif_neigh->refcount);
+ /* initialize with 2 - caller decrements counter by one */
+ atomic_set(&softif_neigh->refcount, 2);
INIT_HLIST_NODE(&softif_neigh->list);
spin_lock_bh(&bat_priv->softif_neigh_lock);
hlist_add_head_rcu(&softif_neigh->list, &bat_priv->softif_neigh_list);
spin_unlock_bh(&bat_priv->softif_neigh_lock);
-found:
- kref_get(&softif_neigh->refcount);
out:
rcu_read_unlock();
return softif_neigh;
@@ -175,8 +171,6 @@ int softif_neigh_seq_print_text(struct seq_file *seq, void *offset)
struct bat_priv *bat_priv = netdev_priv(net_dev);
struct softif_neigh *softif_neigh;
struct hlist_node *node;
- size_t buf_size, pos;
- char *buff;
if (!bat_priv->primary_if) {
return seq_printf(seq, "BATMAN mesh %s disabled - "
@@ -186,33 +180,15 @@ int softif_neigh_seq_print_text(struct seq_file *seq, void *offset)
seq_printf(seq, "Softif neighbor list (%s)\n", net_dev->name);
- buf_size = 1;
- /* Estimate length for: " xx:xx:xx:xx:xx:xx\n" */
rcu_read_lock();
hlist_for_each_entry_rcu(softif_neigh, node,
&bat_priv->softif_neigh_list, list)
- buf_size += 30;
- rcu_read_unlock();
-
- buff = kmalloc(buf_size, GFP_ATOMIC);
- if (!buff)
- return -ENOMEM;
-
- buff[0] = '\0';
- pos = 0;
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(softif_neigh, node,
- &bat_priv->softif_neigh_list, list) {
- pos += snprintf(buff + pos, 31, "%s %pM (vid: %d)\n",
+ seq_printf(seq, "%s %pM (vid: %d)\n",
bat_priv->softif_neigh == softif_neigh
? "=>" : " ", softif_neigh->addr,
softif_neigh->vid);
- }
rcu_read_unlock();
- seq_printf(seq, "%s", buff);
- kfree(buff);
return 0;
}
@@ -267,7 +243,7 @@ static void softif_batman_recv(struct sk_buff *skb, struct net_device *dev,
softif_neigh->addr, softif_neigh->vid);
softif_neigh_tmp = bat_priv->softif_neigh;
bat_priv->softif_neigh = softif_neigh;
- kref_put(&softif_neigh_tmp->refcount, softif_neigh_free_ref);
+ softif_neigh_free_ref(softif_neigh_tmp);
/* we need to hold the additional reference */
goto err;
}
@@ -285,7 +261,7 @@ static void softif_batman_recv(struct sk_buff *skb, struct net_device *dev,
}
out:
- kref_put(&softif_neigh->refcount, softif_neigh_free_ref);
+ softif_neigh_free_ref(softif_neigh);
err:
kfree_skb(skb);
return;
@@ -438,7 +414,7 @@ end:
}
void interface_rx(struct net_device *soft_iface,
- struct sk_buff *skb, struct batman_if *recv_if,
+ struct sk_buff *skb, struct hard_iface *recv_if,
int hdr_size)
{
struct bat_priv *bat_priv = netdev_priv(soft_iface);
@@ -486,7 +462,7 @@ void interface_rx(struct net_device *soft_iface,
memcpy(unicast_packet->dest,
bat_priv->softif_neigh->addr, ETH_ALEN);
- ret = route_unicast_packet(skb, recv_if, hdr_size);
+ ret = route_unicast_packet(skb, recv_if);
if (ret == NET_RX_DROP)
goto dropped;
@@ -646,6 +622,19 @@ void softif_destroy(struct net_device *soft_iface)
unregister_netdevice(soft_iface);
}
+int softif_is_valid(struct net_device *net_dev)
+{
+#ifdef HAVE_NET_DEVICE_OPS
+ if (net_dev->netdev_ops->ndo_start_xmit == interface_tx)
+ return 1;
+#else
+ if (net_dev->hard_start_xmit == interface_tx)
+ return 1;
+#endif
+
+ return 0;
+}
+
/* ethtool */
static int bat_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 02b77334d10..4789b6f2a0b 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -27,9 +27,10 @@ int softif_neigh_seq_print_text(struct seq_file *seq, void *offset);
void softif_neigh_purge(struct bat_priv *bat_priv);
int interface_tx(struct sk_buff *skb, struct net_device *soft_iface);
void interface_rx(struct net_device *soft_iface,
- struct sk_buff *skb, struct batman_if *recv_if,
+ struct sk_buff *skb, struct hard_iface *recv_if,
int hdr_size);
struct net_device *softif_create(char *name);
void softif_destroy(struct net_device *soft_iface);
+int softif_is_valid(struct net_device *net_dev);
#endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index a633b5a435e..8d15b48d169 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,7 +22,6 @@
#include "main.h"
#include "translation-table.h"
#include "soft-interface.h"
-#include "types.h"
#include "hash.h"
#include "originator.h"
@@ -31,12 +30,85 @@ static void _hna_global_del_orig(struct bat_priv *bat_priv,
struct hna_global_entry *hna_global_entry,
char *message);
+/* returns 1 if they are the same mac addr */
+static int compare_lhna(struct hlist_node *node, void *data2)
+{
+ void *data1 = container_of(node, struct hna_local_entry, hash_entry);
+
+ return (memcmp(data1, data2, ETH_ALEN) == 0 ? 1 : 0);
+}
+
+/* returns 1 if they are the same mac addr */
+static int compare_ghna(struct hlist_node *node, void *data2)
+{
+ void *data1 = container_of(node, struct hna_global_entry, hash_entry);
+
+ return (memcmp(data1, data2, ETH_ALEN) == 0 ? 1 : 0);
+}
+
static void hna_local_start_timer(struct bat_priv *bat_priv)
{
INIT_DELAYED_WORK(&bat_priv->hna_work, hna_local_purge);
queue_delayed_work(bat_event_workqueue, &bat_priv->hna_work, 10 * HZ);
}
+static struct hna_local_entry *hna_local_hash_find(struct bat_priv *bat_priv,
+ void *data)
+{
+ struct hashtable_t *hash = bat_priv->hna_local_hash;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct hna_local_entry *hna_local_entry, *hna_local_entry_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ index = choose_orig(data, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(hna_local_entry, node, head, hash_entry) {
+ if (!compare_eth(hna_local_entry, data))
+ continue;
+
+ hna_local_entry_tmp = hna_local_entry;
+ break;
+ }
+ rcu_read_unlock();
+
+ return hna_local_entry_tmp;
+}
+
+static struct hna_global_entry *hna_global_hash_find(struct bat_priv *bat_priv,
+ void *data)
+{
+ struct hashtable_t *hash = bat_priv->hna_global_hash;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct hna_global_entry *hna_global_entry;
+ struct hna_global_entry *hna_global_entry_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ index = choose_orig(data, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(hna_global_entry, node, head, hash_entry) {
+ if (!compare_eth(hna_global_entry, data))
+ continue;
+
+ hna_global_entry_tmp = hna_global_entry;
+ break;
+ }
+ rcu_read_unlock();
+
+ return hna_global_entry_tmp;
+}
+
int hna_local_init(struct bat_priv *bat_priv)
{
if (bat_priv->hna_local_hash)
@@ -61,10 +133,7 @@ void hna_local_add(struct net_device *soft_iface, uint8_t *addr)
int required_bytes;
spin_lock_bh(&bat_priv->hna_lhash_lock);
- hna_local_entry =
- ((struct hna_local_entry *)hash_find(bat_priv->hna_local_hash,
- compare_orig, choose_orig,
- addr));
+ hna_local_entry = hna_local_hash_find(bat_priv, addr);
spin_unlock_bh(&bat_priv->hna_lhash_lock);
if (hna_local_entry) {
@@ -100,15 +169,15 @@ void hna_local_add(struct net_device *soft_iface, uint8_t *addr)
hna_local_entry->last_seen = jiffies;
/* the batman interface mac address should never be purged */
- if (compare_orig(addr, soft_iface->dev_addr))
+ if (compare_eth(addr, soft_iface->dev_addr))
hna_local_entry->never_purge = 1;
else
hna_local_entry->never_purge = 0;
spin_lock_bh(&bat_priv->hna_lhash_lock);
- hash_add(bat_priv->hna_local_hash, compare_orig, choose_orig,
- hna_local_entry);
+ hash_add(bat_priv->hna_local_hash, compare_lhna, choose_orig,
+ hna_local_entry, &hna_local_entry->hash_entry);
bat_priv->num_local_hna++;
atomic_set(&bat_priv->hna_local_changed, 1);
@@ -117,9 +186,7 @@ void hna_local_add(struct net_device *soft_iface, uint8_t *addr)
/* remove address from global hash if present */
spin_lock_bh(&bat_priv->hna_ghash_lock);
- hna_global_entry = ((struct hna_global_entry *)
- hash_find(bat_priv->hna_global_hash,
- compare_orig, choose_orig, addr));
+ hna_global_entry = hna_global_hash_find(bat_priv, addr);
if (hna_global_entry)
_hna_global_del_orig(bat_priv, hna_global_entry,
@@ -133,28 +200,27 @@ int hna_local_fill_buffer(struct bat_priv *bat_priv,
{
struct hashtable_t *hash = bat_priv->hna_local_hash;
struct hna_local_entry *hna_local_entry;
- struct element_t *bucket;
- int i;
- struct hlist_node *walk;
+ struct hlist_node *node;
struct hlist_head *head;
- int count = 0;
+ int i, count = 0;
spin_lock_bh(&bat_priv->hna_lhash_lock);
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
-
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(hna_local_entry, node,
+ head, hash_entry) {
if (buff_len < (count + 1) * ETH_ALEN)
break;
- hna_local_entry = bucket->data;
memcpy(buff + (count * ETH_ALEN), hna_local_entry->addr,
ETH_ALEN);
count++;
}
+ rcu_read_unlock();
}
/* if we did not get all new local hnas see you next time ;-) */
@@ -171,12 +237,11 @@ int hna_local_seq_print_text(struct seq_file *seq, void *offset)
struct bat_priv *bat_priv = netdev_priv(net_dev);
struct hashtable_t *hash = bat_priv->hna_local_hash;
struct hna_local_entry *hna_local_entry;
- int i;
- struct hlist_node *walk;
+ struct hlist_node *node;
struct hlist_head *head;
- struct element_t *bucket;
size_t buf_size, pos;
char *buff;
+ int i;
if (!bat_priv->primary_if) {
return seq_printf(seq, "BATMAN mesh %s disabled - "
@@ -195,8 +260,10 @@ int hna_local_seq_print_text(struct seq_file *seq, void *offset)
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each(walk, head)
+ rcu_read_lock();
+ __hlist_for_each_rcu(node, head)
buf_size += 21;
+ rcu_read_unlock();
}
buff = kmalloc(buf_size, GFP_ATOMIC);
@@ -204,18 +271,20 @@ int hna_local_seq_print_text(struct seq_file *seq, void *offset)
spin_unlock_bh(&bat_priv->hna_lhash_lock);
return -ENOMEM;
}
+
buff[0] = '\0';
pos = 0;
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- hna_local_entry = bucket->data;
-
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(hna_local_entry, node,
+ head, hash_entry) {
pos += snprintf(buff + pos, 22, " * %pM\n",
hna_local_entry->addr);
}
+ rcu_read_unlock();
}
spin_unlock_bh(&bat_priv->hna_lhash_lock);
@@ -225,9 +294,10 @@ int hna_local_seq_print_text(struct seq_file *seq, void *offset)
return 0;
}
-static void _hna_local_del(void *data, void *arg)
+static void _hna_local_del(struct hlist_node *node, void *arg)
{
struct bat_priv *bat_priv = (struct bat_priv *)arg;
+ void *data = container_of(node, struct hna_local_entry, hash_entry);
kfree(data);
bat_priv->num_local_hna--;
@@ -241,9 +311,9 @@ static void hna_local_del(struct bat_priv *bat_priv,
bat_dbg(DBG_ROUTES, bat_priv, "Deleting local hna entry (%pM): %s\n",
hna_local_entry->addr, message);
- hash_remove(bat_priv->hna_local_hash, compare_orig, choose_orig,
+ hash_remove(bat_priv->hna_local_hash, compare_lhna, choose_orig,
hna_local_entry->addr);
- _hna_local_del(hna_local_entry, bat_priv);
+ _hna_local_del(&hna_local_entry->hash_entry, bat_priv);
}
void hna_local_remove(struct bat_priv *bat_priv,
@@ -253,9 +323,7 @@ void hna_local_remove(struct bat_priv *bat_priv,
spin_lock_bh(&bat_priv->hna_lhash_lock);
- hna_local_entry = (struct hna_local_entry *)
- hash_find(bat_priv->hna_local_hash, compare_orig, choose_orig,
- addr);
+ hna_local_entry = hna_local_hash_find(bat_priv, addr);
if (hna_local_entry)
hna_local_del(bat_priv, hna_local_entry, message);
@@ -271,27 +339,29 @@ static void hna_local_purge(struct work_struct *work)
container_of(delayed_work, struct bat_priv, hna_work);
struct hashtable_t *hash = bat_priv->hna_local_hash;
struct hna_local_entry *hna_local_entry;
- int i;
- struct hlist_node *walk, *safe;
+ struct hlist_node *node, *node_tmp;
struct hlist_head *head;
- struct element_t *bucket;
unsigned long timeout;
+ int i;
spin_lock_bh(&bat_priv->hna_lhash_lock);
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry_safe(bucket, walk, safe, head, hlist) {
- hna_local_entry = bucket->data;
+ hlist_for_each_entry_safe(hna_local_entry, node, node_tmp,
+ head, hash_entry) {
+ if (hna_local_entry->never_purge)
+ continue;
timeout = hna_local_entry->last_seen;
timeout += LOCAL_HNA_TIMEOUT * HZ;
- if ((!hna_local_entry->never_purge) &&
- time_after(jiffies, timeout))
- hna_local_del(bat_priv, hna_local_entry,
- "address timed out");
+ if (time_before(jiffies, timeout))
+ continue;
+
+ hna_local_del(bat_priv, hna_local_entry,
+ "address timed out");
}
}
@@ -335,9 +405,7 @@ void hna_global_add_orig(struct bat_priv *bat_priv,
spin_lock_bh(&bat_priv->hna_ghash_lock);
hna_ptr = hna_buff + (hna_buff_count * ETH_ALEN);
- hna_global_entry = (struct hna_global_entry *)
- hash_find(bat_priv->hna_global_hash, compare_orig,
- choose_orig, hna_ptr);
+ hna_global_entry = hna_global_hash_find(bat_priv, hna_ptr);
if (!hna_global_entry) {
spin_unlock_bh(&bat_priv->hna_ghash_lock);
@@ -357,8 +425,9 @@ void hna_global_add_orig(struct bat_priv *bat_priv,
hna_global_entry->addr, orig_node->orig);
spin_lock_bh(&bat_priv->hna_ghash_lock);
- hash_add(bat_priv->hna_global_hash, compare_orig,
- choose_orig, hna_global_entry);
+ hash_add(bat_priv->hna_global_hash, compare_ghna,
+ choose_orig, hna_global_entry,
+ &hna_global_entry->hash_entry);
}
@@ -369,9 +438,7 @@ void hna_global_add_orig(struct bat_priv *bat_priv,
spin_lock_bh(&bat_priv->hna_lhash_lock);
hna_ptr = hna_buff + (hna_buff_count * ETH_ALEN);
- hna_local_entry = (struct hna_local_entry *)
- hash_find(bat_priv->hna_local_hash, compare_orig,
- choose_orig, hna_ptr);
+ hna_local_entry = hna_local_hash_find(bat_priv, hna_ptr);
if (hna_local_entry)
hna_local_del(bat_priv, hna_local_entry,
@@ -401,12 +468,11 @@ int hna_global_seq_print_text(struct seq_file *seq, void *offset)
struct bat_priv *bat_priv = netdev_priv(net_dev);
struct hashtable_t *hash = bat_priv->hna_global_hash;
struct hna_global_entry *hna_global_entry;
- int i;
- struct hlist_node *walk;
+ struct hlist_node *node;
struct hlist_head *head;
- struct element_t *bucket;
size_t buf_size, pos;
char *buff;
+ int i;
if (!bat_priv->primary_if) {
return seq_printf(seq, "BATMAN mesh %s disabled - "
@@ -424,8 +490,10 @@ int hna_global_seq_print_text(struct seq_file *seq, void *offset)
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each(walk, head)
+ rcu_read_lock();
+ __hlist_for_each_rcu(node, head)
buf_size += 43;
+ rcu_read_unlock();
}
buff = kmalloc(buf_size, GFP_ATOMIC);
@@ -439,14 +507,15 @@ int hna_global_seq_print_text(struct seq_file *seq, void *offset)
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- hna_global_entry = bucket->data;
-
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(hna_global_entry, node,
+ head, hash_entry) {
pos += snprintf(buff + pos, 44,
" * %pM via %pM\n",
hna_global_entry->addr,
hna_global_entry->orig_node->orig);
}
+ rcu_read_unlock();
}
spin_unlock_bh(&bat_priv->hna_ghash_lock);
@@ -465,7 +534,7 @@ static void _hna_global_del_orig(struct bat_priv *bat_priv,
hna_global_entry->addr, hna_global_entry->orig_node->orig,
message);
- hash_remove(bat_priv->hna_global_hash, compare_orig, choose_orig,
+ hash_remove(bat_priv->hna_global_hash, compare_ghna, choose_orig,
hna_global_entry->addr);
kfree(hna_global_entry);
}
@@ -484,9 +553,7 @@ void hna_global_del_orig(struct bat_priv *bat_priv,
while ((hna_buff_count + 1) * ETH_ALEN <= orig_node->hna_buff_len) {
hna_ptr = orig_node->hna_buff + (hna_buff_count * ETH_ALEN);
- hna_global_entry = (struct hna_global_entry *)
- hash_find(bat_priv->hna_global_hash, compare_orig,
- choose_orig, hna_ptr);
+ hna_global_entry = hna_global_hash_find(bat_priv, hna_ptr);
if ((hna_global_entry) &&
(hna_global_entry->orig_node == orig_node))
@@ -503,8 +570,10 @@ void hna_global_del_orig(struct bat_priv *bat_priv,
orig_node->hna_buff = NULL;
}
-static void hna_global_del(void *data, void *arg)
+static void hna_global_del(struct hlist_node *node, void *arg)
{
+ void *data = container_of(node, struct hna_global_entry, hash_entry);
+
kfree(data);
}
@@ -520,15 +589,20 @@ void hna_global_free(struct bat_priv *bat_priv)
struct orig_node *transtable_search(struct bat_priv *bat_priv, uint8_t *addr)
{
struct hna_global_entry *hna_global_entry;
+ struct orig_node *orig_node = NULL;
spin_lock_bh(&bat_priv->hna_ghash_lock);
- hna_global_entry = (struct hna_global_entry *)
- hash_find(bat_priv->hna_global_hash,
- compare_orig, choose_orig, addr);
- spin_unlock_bh(&bat_priv->hna_ghash_lock);
+ hna_global_entry = hna_global_hash_find(bat_priv, addr);
if (!hna_global_entry)
- return NULL;
+ goto out;
- return hna_global_entry->orig_node;
+ if (!atomic_inc_not_zero(&hna_global_entry->orig_node->refcount))
+ goto out;
+
+ orig_node = hna_global_entry->orig_node;
+
+out:
+ spin_unlock_bh(&bat_priv->hna_ghash_lock);
+ return orig_node;
}
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 10c4c5c319b..f19931ca145 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,8 +22,6 @@
#ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
#define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
-#include "types.h"
-
int hna_local_init(struct bat_priv *bat_priv);
void hna_local_add(struct net_device *soft_iface, uint8_t *addr);
void hna_local_remove(struct bat_priv *bat_priv,
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 97cb23dd3e6..83445cf0cc9 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -33,7 +33,7 @@
sizeof(struct bcast_packet))))
-struct batman_if {
+struct hard_iface {
struct list_head list;
int16_t if_num;
char if_status;
@@ -43,7 +43,7 @@ struct batman_if {
unsigned char *packet_buff;
int packet_len;
struct kobject *hardif_obj;
- struct kref refcount;
+ atomic_t refcount;
struct packet_type batman_adv_ptype;
struct net_device *soft_iface;
struct rcu_head rcu;
@@ -70,8 +70,6 @@ struct orig_node {
struct neigh_node *router;
unsigned long *bcast_own;
uint8_t *bcast_own_sum;
- uint8_t tq_own;
- int tq_asym_penalty;
unsigned long last_valid;
unsigned long bcast_seqno_reset;
unsigned long batman_seqno_reset;
@@ -83,20 +81,28 @@ struct orig_node {
uint8_t last_ttl;
unsigned long bcast_bits[NUM_WORDS];
uint32_t last_bcast_seqno;
- struct list_head neigh_list;
+ struct hlist_head neigh_list;
struct list_head frag_list;
+ spinlock_t neigh_list_lock; /* protects neighbor list */
+ atomic_t refcount;
+ struct rcu_head rcu;
+ struct hlist_node hash_entry;
+ struct bat_priv *bat_priv;
unsigned long last_frag_packet;
- struct {
- uint8_t candidates;
- struct neigh_node *selected;
- } bond;
+ spinlock_t ogm_cnt_lock; /* protects: bcast_own, bcast_own_sum,
+ * neigh_node->real_bits,
+ * neigh_node->real_packet_count */
+ spinlock_t bcast_seqno_lock; /* protects bcast_bits,
+ * last_bcast_seqno */
+ atomic_t bond_candidates;
+ struct list_head bond_list;
};
struct gw_node {
struct hlist_node list;
struct orig_node *orig_node;
unsigned long deleted;
- struct kref refcount;
+ atomic_t refcount;
struct rcu_head rcu;
};
@@ -105,18 +111,20 @@ struct gw_node {
* @last_valid: when last packet via this neighbor was received
*/
struct neigh_node {
- struct list_head list;
+ struct hlist_node list;
uint8_t addr[ETH_ALEN];
uint8_t real_packet_count;
uint8_t tq_recv[TQ_GLOBAL_WINDOW_SIZE];
uint8_t tq_index;
uint8_t tq_avg;
uint8_t last_ttl;
- struct neigh_node *next_bond_candidate;
+ struct list_head bonding_list;
unsigned long last_valid;
unsigned long real_bits[NUM_WORDS];
+ atomic_t refcount;
+ struct rcu_head rcu;
struct orig_node *orig_node;
- struct batman_if *if_incoming;
+ struct hard_iface *if_incoming;
};
@@ -140,7 +148,7 @@ struct bat_priv {
struct hlist_head softif_neigh_list;
struct softif_neigh *softif_neigh;
struct debug_log *debug_log;
- struct batman_if *primary_if;
+ struct hard_iface *primary_if;
struct kobject *mesh_obj;
struct dentry *debug_dir;
struct hlist_head forw_bat_list;
@@ -151,12 +159,11 @@ struct bat_priv {
struct hashtable_t *hna_local_hash;
struct hashtable_t *hna_global_hash;
struct hashtable_t *vis_hash;
- spinlock_t orig_hash_lock; /* protects orig_hash */
spinlock_t forw_bat_list_lock; /* protects forw_bat_list */
spinlock_t forw_bcast_list_lock; /* protects */
spinlock_t hna_lhash_lock; /* protects hna_local_hash */
spinlock_t hna_ghash_lock; /* protects hna_global_hash */
- spinlock_t gw_list_lock; /* protects gw_list */
+ spinlock_t gw_list_lock; /* protects gw_list and curr_gw */
spinlock_t vis_hash_lock; /* protects vis_hash */
spinlock_t vis_list_lock; /* protects vis_info::recv_list */
spinlock_t softif_neigh_lock; /* protects soft-interface neigh list */
@@ -165,7 +172,7 @@ struct bat_priv {
struct delayed_work hna_work;
struct delayed_work orig_work;
struct delayed_work vis_work;
- struct gw_node *curr_gw;
+ struct gw_node __rcu *curr_gw; /* rcu protected pointer */
struct vis_info *my_vis_info;
};
@@ -188,11 +195,13 @@ struct hna_local_entry {
uint8_t addr[ETH_ALEN];
unsigned long last_seen;
char never_purge;
+ struct hlist_node hash_entry;
};
struct hna_global_entry {
uint8_t addr[ETH_ALEN];
struct orig_node *orig_node;
+ struct hlist_node hash_entry;
};
/**
@@ -208,7 +217,7 @@ struct forw_packet {
uint32_t direct_link_flags;
uint8_t num_packets;
struct delayed_work delayed_work;
- struct batman_if *if_incoming;
+ struct hard_iface *if_incoming;
};
/* While scanning for vis-entries of a particular vis-originator
@@ -242,17 +251,18 @@ struct vis_info {
* from. we should not reply to them. */
struct list_head send_list;
struct kref refcount;
+ struct hlist_node hash_entry;
struct bat_priv *bat_priv;
/* this packet might be part of the vis send queue. */
struct sk_buff *skb_packet;
/* vis_info may follow here*/
-} __attribute__((packed));
+} __packed;
struct vis_info_entry {
uint8_t src[ETH_ALEN];
uint8_t dest[ETH_ALEN];
uint8_t quality; /* quality = 0 means HNA */
-} __attribute__((packed));
+} __packed;
struct recvlist_node {
struct list_head list;
@@ -264,7 +274,7 @@ struct softif_neigh {
uint8_t addr[ETH_ALEN];
unsigned long last_seen;
short vid;
- struct kref refcount;
+ atomic_t refcount;
struct rcu_head rcu;
};
diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c
index dc2e28bed84..19f84bd443a 100644
--- a/net/batman-adv/unicast.c
+++ b/net/batman-adv/unicast.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Andreas Langer
*
@@ -39,8 +39,8 @@ static struct sk_buff *frag_merge_packet(struct list_head *head,
(struct unicast_frag_packet *)skb->data;
struct sk_buff *tmp_skb;
struct unicast_packet *unicast_packet;
- int hdr_len = sizeof(struct unicast_packet),
- uni_diff = sizeof(struct unicast_frag_packet) - hdr_len;
+ int hdr_len = sizeof(struct unicast_packet);
+ int uni_diff = sizeof(struct unicast_frag_packet) - hdr_len;
/* set skb to the first part and tmp_skb to the second part */
if (up->flags & UNI_FRAG_HEAD) {
@@ -50,12 +50,12 @@ static struct sk_buff *frag_merge_packet(struct list_head *head,
skb = tfp->skb;
}
+ if (skb_linearize(skb) < 0 || skb_linearize(tmp_skb) < 0)
+ goto err;
+
skb_pull(tmp_skb, sizeof(struct unicast_frag_packet));
- if (pskb_expand_head(skb, 0, tmp_skb->len, GFP_ATOMIC) < 0) {
- /* free buffered skb, skb will be freed later */
- kfree_skb(tfp->skb);
- return NULL;
- }
+ if (pskb_expand_head(skb, 0, tmp_skb->len, GFP_ATOMIC) < 0)
+ goto err;
/* move free entry to end */
tfp->skb = NULL;
@@ -70,6 +70,11 @@ static struct sk_buff *frag_merge_packet(struct list_head *head,
unicast_packet->packet_type = BAT_UNICAST;
return skb;
+
+err:
+ /* free buffered skb, skb will be freed later */
+ kfree_skb(tfp->skb);
+ return NULL;
}
static void frag_create_entry(struct list_head *head, struct sk_buff *skb)
@@ -178,15 +183,10 @@ int frag_reassemble_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
(struct unicast_frag_packet *)skb->data;
*new_skb = NULL;
- spin_lock_bh(&bat_priv->orig_hash_lock);
- orig_node = ((struct orig_node *)
- hash_find(bat_priv->orig_hash, compare_orig, choose_orig,
- unicast_packet->orig));
- if (!orig_node) {
- pr_debug("couldn't find originator in orig_hash\n");
+ orig_node = orig_hash_find(bat_priv, unicast_packet->orig);
+ if (!orig_node)
goto out;
- }
orig_node->last_frag_packet = jiffies;
@@ -210,30 +210,36 @@ int frag_reassemble_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
/* if not, merge failed */
if (*new_skb)
ret = NET_RX_SUCCESS;
-out:
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+out:
+ if (orig_node)
+ orig_node_free_ref(orig_node);
return ret;
}
int frag_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
- struct batman_if *batman_if, uint8_t dstaddr[])
+ struct hard_iface *hard_iface, uint8_t dstaddr[])
{
struct unicast_packet tmp_uc, *unicast_packet;
struct sk_buff *frag_skb;
struct unicast_frag_packet *frag1, *frag2;
int uc_hdr_len = sizeof(struct unicast_packet);
int ucf_hdr_len = sizeof(struct unicast_frag_packet);
- int data_len = skb->len;
+ int data_len = skb->len - uc_hdr_len;
+ int large_tail = 0;
+ uint16_t seqno;
if (!bat_priv->primary_if)
goto dropped;
- unicast_packet = (struct unicast_packet *) skb->data;
+ frag_skb = dev_alloc_skb(data_len - (data_len / 2) + ucf_hdr_len);
+ if (!frag_skb)
+ goto dropped;
+ skb_reserve(frag_skb, ucf_hdr_len);
+ unicast_packet = (struct unicast_packet *) skb->data;
memcpy(&tmp_uc, unicast_packet, uc_hdr_len);
- frag_skb = dev_alloc_skb(data_len - (data_len / 2) + ucf_hdr_len);
- skb_split(skb, frag_skb, data_len / 2);
+ skb_split(skb, frag_skb, data_len / 2 + uc_hdr_len);
if (my_skb_head_push(skb, ucf_hdr_len - uc_hdr_len) < 0 ||
my_skb_head_push(frag_skb, ucf_hdr_len) < 0)
@@ -251,16 +257,18 @@ int frag_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
memcpy(frag1->orig, bat_priv->primary_if->net_dev->dev_addr, ETH_ALEN);
memcpy(frag2, frag1, sizeof(struct unicast_frag_packet));
- frag1->flags |= UNI_FRAG_HEAD;
- frag2->flags &= ~UNI_FRAG_HEAD;
+ if (data_len & 1)
+ large_tail = UNI_FRAG_LARGETAIL;
+
+ frag1->flags = UNI_FRAG_HEAD | large_tail;
+ frag2->flags = large_tail;
- frag1->seqno = htons((uint16_t)atomic_inc_return(
- &batman_if->frag_seqno));
- frag2->seqno = htons((uint16_t)atomic_inc_return(
- &batman_if->frag_seqno));
+ seqno = atomic_add_return(2, &hard_iface->frag_seqno);
+ frag1->seqno = htons(seqno - 1);
+ frag2->seqno = htons(seqno);
- send_skb_packet(skb, batman_if, dstaddr);
- send_skb_packet(frag_skb, batman_if, dstaddr);
+ send_skb_packet(skb, hard_iface, dstaddr);
+ send_skb_packet(frag_skb, hard_iface, dstaddr);
return NET_RX_SUCCESS;
drop_frag:
@@ -275,44 +283,36 @@ int unicast_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv)
struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
struct unicast_packet *unicast_packet;
struct orig_node *orig_node;
- struct batman_if *batman_if;
- struct neigh_node *router;
+ struct neigh_node *neigh_node;
int data_len = skb->len;
- uint8_t dstaddr[6];
-
- spin_lock_bh(&bat_priv->orig_hash_lock);
+ int ret = 1;
/* get routing information */
- if (is_multicast_ether_addr(ethhdr->h_dest))
+ if (is_multicast_ether_addr(ethhdr->h_dest)) {
orig_node = (struct orig_node *)gw_get_selected(bat_priv);
- else
- orig_node = ((struct orig_node *)hash_find(bat_priv->orig_hash,
- compare_orig,
- choose_orig,
- ethhdr->h_dest));
-
- /* check for hna host */
- if (!orig_node)
- orig_node = transtable_search(bat_priv, ethhdr->h_dest);
-
- router = find_router(bat_priv, orig_node, NULL);
-
- if (!router)
- goto unlock;
+ if (orig_node)
+ goto find_router;
+ }
- /* don't lock while sending the packets ... we therefore
- * copy the required data before sending */
+ /* check for hna host - increases orig_node refcount */
+ orig_node = transtable_search(bat_priv, ethhdr->h_dest);
- batman_if = router->if_incoming;
- memcpy(dstaddr, router->addr, ETH_ALEN);
+find_router:
+ /**
+ * find_router():
+ * - if orig_node is NULL it returns NULL
+ * - increases neigh_nodes refcount if found.
+ */
+ neigh_node = find_router(bat_priv, orig_node, NULL);
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ if (!neigh_node)
+ goto out;
- if (batman_if->if_status != IF_ACTIVE)
- goto dropped;
+ if (neigh_node->if_incoming->if_status != IF_ACTIVE)
+ goto out;
if (my_skb_head_push(skb, sizeof(struct unicast_packet)) < 0)
- goto dropped;
+ goto out;
unicast_packet = (struct unicast_packet *)skb->data;
@@ -326,18 +326,24 @@ int unicast_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv)
if (atomic_read(&bat_priv->fragmentation) &&
data_len + sizeof(struct unicast_packet) >
- batman_if->net_dev->mtu) {
+ neigh_node->if_incoming->net_dev->mtu) {
/* send frag skb decreases ttl */
unicast_packet->ttl++;
- return frag_send_skb(skb, bat_priv, batman_if,
- dstaddr);
+ ret = frag_send_skb(skb, bat_priv,
+ neigh_node->if_incoming, neigh_node->addr);
+ goto out;
}
- send_skb_packet(skb, batman_if, dstaddr);
- return 0;
-unlock:
- spin_unlock_bh(&bat_priv->orig_hash_lock);
-dropped:
- kfree_skb(skb);
- return 1;
+ send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+ ret = 0;
+ goto out;
+
+out:
+ if (neigh_node)
+ neigh_node_free_ref(neigh_node);
+ if (orig_node)
+ orig_node_free_ref(orig_node);
+ if (ret == 1)
+ kfree_skb(skb);
+ return ret;
}
diff --git a/net/batman-adv/unicast.h b/net/batman-adv/unicast.h
index e32b7867a9a..16ad7a9242b 100644
--- a/net/batman-adv/unicast.h
+++ b/net/batman-adv/unicast.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
*
* Andreas Langer
*
@@ -22,6 +22,8 @@
#ifndef _NET_BATMAN_ADV_UNICAST_H_
#define _NET_BATMAN_ADV_UNICAST_H_
+#include "packet.h"
+
#define FRAG_TIMEOUT 10000 /* purge frag list entrys after time in ms */
#define FRAG_BUFFER_SIZE 6 /* number of list elements in buffer */
@@ -30,6 +32,27 @@ int frag_reassemble_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
void frag_list_free(struct list_head *head);
int unicast_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv);
int frag_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
- struct batman_if *batman_if, uint8_t dstaddr[]);
+ struct hard_iface *hard_iface, uint8_t dstaddr[]);
+
+static inline int frag_can_reassemble(struct sk_buff *skb, int mtu)
+{
+ struct unicast_frag_packet *unicast_packet;
+ int uneven_correction = 0;
+ unsigned int merged_size;
+
+ unicast_packet = (struct unicast_frag_packet *)skb->data;
+
+ if (unicast_packet->flags & UNI_FRAG_LARGETAIL) {
+ if (unicast_packet->flags & UNI_FRAG_HEAD)
+ uneven_correction = 1;
+ else
+ uneven_correction = -1;
+ }
+
+ merged_size = (skb->len - sizeof(struct unicast_frag_packet)) * 2;
+ merged_size += sizeof(struct unicast_packet) + uneven_correction;
+
+ return merged_size <= mtu;
+}
#endif /* _NET_BATMAN_ADV_UNICAST_H_ */
diff --git a/net/batman-adv/vis.c b/net/batman-adv/vis.c
index cd4c4231fa4..f90212f4208 100644
--- a/net/batman-adv/vis.c
+++ b/net/batman-adv/vis.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2008-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
*
@@ -64,18 +64,20 @@ static void free_info(struct kref *ref)
spin_unlock_bh(&bat_priv->vis_list_lock);
kfree_skb(info->skb_packet);
+ kfree(info);
}
/* Compare two vis packets, used by the hashing algorithm */
-static int vis_info_cmp(void *data1, void *data2)
+static int vis_info_cmp(struct hlist_node *node, void *data2)
{
struct vis_info *d1, *d2;
struct vis_packet *p1, *p2;
- d1 = data1;
+
+ d1 = container_of(node, struct vis_info, hash_entry);
d2 = data2;
p1 = (struct vis_packet *)d1->skb_packet->data;
p2 = (struct vis_packet *)d2->skb_packet->data;
- return compare_orig(p1->vis_orig, p2->vis_orig);
+ return compare_eth(p1->vis_orig, p2->vis_orig);
}
/* hash function to choose an entry in a hash table of given size */
@@ -103,6 +105,34 @@ static int vis_info_choose(void *data, int size)
return hash % size;
}
+static struct vis_info *vis_hash_find(struct bat_priv *bat_priv,
+ void *data)
+{
+ struct hashtable_t *hash = bat_priv->vis_hash;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct vis_info *vis_info, *vis_info_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ index = vis_info_choose(data, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vis_info, node, head, hash_entry) {
+ if (!vis_info_cmp(node, data))
+ continue;
+
+ vis_info_tmp = vis_info;
+ break;
+ }
+ rcu_read_unlock();
+
+ return vis_info_tmp;
+}
+
/* insert interface to the list of interfaces of one originator, if it
* does not already exist in the list */
static void vis_data_insert_interface(const uint8_t *interface,
@@ -113,7 +143,7 @@ static void vis_data_insert_interface(const uint8_t *interface,
struct hlist_node *pos;
hlist_for_each_entry(entry, pos, if_list, list) {
- if (compare_orig(entry->addr, (void *)interface))
+ if (compare_eth(entry->addr, (void *)interface))
return;
}
@@ -165,7 +195,7 @@ static ssize_t vis_data_read_entry(char *buff, struct vis_info_entry *entry,
/* maximal length: max(4+17+2, 3+17+1+3+2) == 26 */
if (primary && entry->quality == 0)
return sprintf(buff, "HNA %pM, ", entry->dest);
- else if (compare_orig(entry->src, src))
+ else if (compare_eth(entry->src, src))
return sprintf(buff, "TQ %pM %d, ", entry->dest,
entry->quality);
@@ -174,9 +204,8 @@ static ssize_t vis_data_read_entry(char *buff, struct vis_info_entry *entry,
int vis_seq_print_text(struct seq_file *seq, void *offset)
{
- struct hlist_node *walk;
+ struct hlist_node *node;
struct hlist_head *head;
- struct element_t *bucket;
struct vis_info *info;
struct vis_packet *packet;
struct vis_info_entry *entries;
@@ -202,8 +231,8 @@ int vis_seq_print_text(struct seq_file *seq, void *offset)
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- info = bucket->data;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(info, node, head, hash_entry) {
packet = (struct vis_packet *)info->skb_packet->data;
entries = (struct vis_info_entry *)
((char *)packet + sizeof(struct vis_packet));
@@ -212,7 +241,7 @@ int vis_seq_print_text(struct seq_file *seq, void *offset)
if (entries[j].quality == 0)
continue;
compare =
- compare_orig(entries[j].src, packet->vis_orig);
+ compare_eth(entries[j].src, packet->vis_orig);
vis_data_insert_interface(entries[j].src,
&vis_if_list,
compare);
@@ -222,7 +251,7 @@ int vis_seq_print_text(struct seq_file *seq, void *offset)
buf_size += 18 + 26 * packet->entries;
/* add primary/secondary records */
- if (compare_orig(entry->addr, packet->vis_orig))
+ if (compare_eth(entry->addr, packet->vis_orig))
buf_size +=
vis_data_count_prim_sec(&vis_if_list);
@@ -235,6 +264,7 @@ int vis_seq_print_text(struct seq_file *seq, void *offset)
kfree(entry);
}
}
+ rcu_read_unlock();
}
buff = kmalloc(buf_size, GFP_ATOMIC);
@@ -248,8 +278,8 @@ int vis_seq_print_text(struct seq_file *seq, void *offset)
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- info = bucket->data;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(info, node, head, hash_entry) {
packet = (struct vis_packet *)info->skb_packet->data;
entries = (struct vis_info_entry *)
((char *)packet + sizeof(struct vis_packet));
@@ -258,7 +288,7 @@ int vis_seq_print_text(struct seq_file *seq, void *offset)
if (entries[j].quality == 0)
continue;
compare =
- compare_orig(entries[j].src, packet->vis_orig);
+ compare_eth(entries[j].src, packet->vis_orig);
vis_data_insert_interface(entries[j].src,
&vis_if_list,
compare);
@@ -268,15 +298,15 @@ int vis_seq_print_text(struct seq_file *seq, void *offset)
buff_pos += sprintf(buff + buff_pos, "%pM,",
entry->addr);
- for (i = 0; i < packet->entries; i++)
+ for (j = 0; j < packet->entries; j++)
buff_pos += vis_data_read_entry(
buff + buff_pos,
- &entries[i],
+ &entries[j],
entry->addr,
entry->primary);
/* add primary/secondary records */
- if (compare_orig(entry->addr, packet->vis_orig))
+ if (compare_eth(entry->addr, packet->vis_orig))
buff_pos +=
vis_data_read_prim_sec(buff + buff_pos,
&vis_if_list);
@@ -290,6 +320,7 @@ int vis_seq_print_text(struct seq_file *seq, void *offset)
kfree(entry);
}
}
+ rcu_read_unlock();
}
spin_unlock_bh(&bat_priv->vis_hash_lock);
@@ -344,7 +375,7 @@ static int recv_list_is_in(struct bat_priv *bat_priv,
spin_lock_bh(&bat_priv->vis_list_lock);
list_for_each_entry(entry, recv_list, list) {
- if (memcmp(entry->mac, mac, ETH_ALEN) == 0) {
+ if (compare_eth(entry->mac, mac)) {
spin_unlock_bh(&bat_priv->vis_list_lock);
return 1;
}
@@ -380,8 +411,7 @@ static struct vis_info *add_packet(struct bat_priv *bat_priv,
sizeof(struct vis_packet));
memcpy(search_packet->vis_orig, vis_packet->vis_orig, ETH_ALEN);
- old_info = hash_find(bat_priv->vis_hash, vis_info_cmp, vis_info_choose,
- &search_elem);
+ old_info = vis_hash_find(bat_priv, &search_elem);
kfree_skb(search_elem.skb_packet);
if (old_info) {
@@ -441,10 +471,10 @@ static struct vis_info *add_packet(struct bat_priv *bat_priv,
/* try to add it */
hash_added = hash_add(bat_priv->vis_hash, vis_info_cmp, vis_info_choose,
- info);
+ info, &info->hash_entry);
if (hash_added < 0) {
/* did not work (for some reason) */
- kref_put(&old_info->refcount, free_info);
+ kref_put(&info->refcount, free_info);
info = NULL;
}
@@ -528,9 +558,8 @@ static int find_best_vis_server(struct bat_priv *bat_priv,
struct vis_info *info)
{
struct hashtable_t *hash = bat_priv->orig_hash;
- struct hlist_node *walk;
+ struct hlist_node *node;
struct hlist_head *head;
- struct element_t *bucket;
struct orig_node *orig_node;
struct vis_packet *packet;
int best_tq = -1, i;
@@ -540,16 +569,17 @@ static int find_best_vis_server(struct bat_priv *bat_priv,
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- orig_node = bucket->data;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) {
if ((orig_node) && (orig_node->router) &&
- (orig_node->flags & VIS_SERVER) &&
- (orig_node->router->tq_avg > best_tq)) {
+ (orig_node->flags & VIS_SERVER) &&
+ (orig_node->router->tq_avg > best_tq)) {
best_tq = orig_node->router->tq_avg;
memcpy(packet->target_orig, orig_node->orig,
ETH_ALEN);
}
}
+ rcu_read_unlock();
}
return best_tq;
@@ -572,9 +602,8 @@ static bool vis_packet_full(struct vis_info *info)
static int generate_vis_packet(struct bat_priv *bat_priv)
{
struct hashtable_t *hash = bat_priv->orig_hash;
- struct hlist_node *walk;
+ struct hlist_node *node;
struct hlist_head *head;
- struct element_t *bucket;
struct orig_node *orig_node;
struct neigh_node *neigh_node;
struct vis_info *info = (struct vis_info *)bat_priv->my_vis_info;
@@ -586,7 +615,6 @@ static int generate_vis_packet(struct bat_priv *bat_priv)
info->first_seen = jiffies;
packet->vis_type = atomic_read(&bat_priv->vis_mode);
- spin_lock_bh(&bat_priv->orig_hash_lock);
memcpy(packet->target_orig, broadcast_addr, ETH_ALEN);
packet->ttl = TTL;
packet->seqno = htonl(ntohl(packet->seqno) + 1);
@@ -596,23 +624,21 @@ static int generate_vis_packet(struct bat_priv *bat_priv)
if (packet->vis_type == VIS_TYPE_CLIENT_UPDATE) {
best_tq = find_best_vis_server(bat_priv, info);
- if (best_tq < 0) {
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ if (best_tq < 0)
return -1;
- }
}
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- orig_node = bucket->data;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) {
neigh_node = orig_node->router;
if (!neigh_node)
continue;
- if (!compare_orig(neigh_node->addr, orig_node->orig))
+ if (!compare_eth(neigh_node->addr, orig_node->orig))
continue;
if (neigh_node->if_incoming->if_status != IF_ACTIVE)
@@ -631,23 +657,19 @@ static int generate_vis_packet(struct bat_priv *bat_priv)
entry->quality = neigh_node->tq_avg;
packet->entries++;
- if (vis_packet_full(info)) {
- spin_unlock_bh(&bat_priv->orig_hash_lock);
- return 0;
- }
+ if (vis_packet_full(info))
+ goto unlock;
}
+ rcu_read_unlock();
}
- spin_unlock_bh(&bat_priv->orig_hash_lock);
-
hash = bat_priv->hna_local_hash;
spin_lock_bh(&bat_priv->hna_lhash_lock);
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- hna_local_entry = bucket->data;
+ hlist_for_each_entry(hna_local_entry, node, head, hash_entry) {
entry = (struct vis_info_entry *)
skb_put(info->skb_packet,
sizeof(*entry));
@@ -665,6 +687,10 @@ static int generate_vis_packet(struct bat_priv *bat_priv)
spin_unlock_bh(&bat_priv->hna_lhash_lock);
return 0;
+
+unlock:
+ rcu_read_unlock();
+ return 0;
}
/* free old vis packets. Must be called with this vis_hash_lock
@@ -673,25 +699,22 @@ static void purge_vis_packets(struct bat_priv *bat_priv)
{
int i;
struct hashtable_t *hash = bat_priv->vis_hash;
- struct hlist_node *walk, *safe;
+ struct hlist_node *node, *node_tmp;
struct hlist_head *head;
- struct element_t *bucket;
struct vis_info *info;
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry_safe(bucket, walk, safe, head, hlist) {
- info = bucket->data;
-
+ hlist_for_each_entry_safe(info, node, node_tmp,
+ head, hash_entry) {
/* never purge own data. */
if (info == bat_priv->my_vis_info)
continue;
if (time_after(jiffies,
info->first_seen + VIS_TIMEOUT * HZ)) {
- hlist_del(walk);
- kfree(bucket);
+ hlist_del(node);
send_list_del(info);
kref_put(&info->refcount, free_info);
}
@@ -703,27 +726,24 @@ static void broadcast_vis_packet(struct bat_priv *bat_priv,
struct vis_info *info)
{
struct hashtable_t *hash = bat_priv->orig_hash;
- struct hlist_node *walk;
+ struct hlist_node *node;
struct hlist_head *head;
- struct element_t *bucket;
struct orig_node *orig_node;
struct vis_packet *packet;
struct sk_buff *skb;
- struct batman_if *batman_if;
+ struct hard_iface *hard_iface;
uint8_t dstaddr[ETH_ALEN];
int i;
- spin_lock_bh(&bat_priv->orig_hash_lock);
packet = (struct vis_packet *)info->skb_packet->data;
/* send to all routers in range. */
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
- hlist_for_each_entry(bucket, walk, head, hlist) {
- orig_node = bucket->data;
-
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) {
/* if it's a vis server and reachable, send it. */
if ((!orig_node) || (!orig_node->router))
continue;
@@ -736,54 +756,61 @@ static void broadcast_vis_packet(struct bat_priv *bat_priv,
continue;
memcpy(packet->target_orig, orig_node->orig, ETH_ALEN);
- batman_if = orig_node->router->if_incoming;
+ hard_iface = orig_node->router->if_incoming;
memcpy(dstaddr, orig_node->router->addr, ETH_ALEN);
- spin_unlock_bh(&bat_priv->orig_hash_lock);
skb = skb_clone(info->skb_packet, GFP_ATOMIC);
if (skb)
- send_skb_packet(skb, batman_if, dstaddr);
+ send_skb_packet(skb, hard_iface, dstaddr);
- spin_lock_bh(&bat_priv->orig_hash_lock);
}
-
+ rcu_read_unlock();
}
-
- spin_unlock_bh(&bat_priv->orig_hash_lock);
}
static void unicast_vis_packet(struct bat_priv *bat_priv,
struct vis_info *info)
{
struct orig_node *orig_node;
+ struct neigh_node *neigh_node = NULL;
struct sk_buff *skb;
struct vis_packet *packet;
- struct batman_if *batman_if;
- uint8_t dstaddr[ETH_ALEN];
- spin_lock_bh(&bat_priv->orig_hash_lock);
packet = (struct vis_packet *)info->skb_packet->data;
- orig_node = ((struct orig_node *)hash_find(bat_priv->orig_hash,
- compare_orig, choose_orig,
- packet->target_orig));
- if ((!orig_node) || (!orig_node->router))
- goto out;
+ rcu_read_lock();
+ orig_node = orig_hash_find(bat_priv, packet->target_orig);
- /* don't lock while sending the packets ... we therefore
- * copy the required data before sending */
- batman_if = orig_node->router->if_incoming;
- memcpy(dstaddr, orig_node->router->addr, ETH_ALEN);
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ if (!orig_node)
+ goto unlock;
+
+ neigh_node = orig_node->router;
+
+ if (!neigh_node)
+ goto unlock;
+
+ if (!atomic_inc_not_zero(&neigh_node->refcount)) {
+ neigh_node = NULL;
+ goto unlock;
+ }
+
+ rcu_read_unlock();
skb = skb_clone(info->skb_packet, GFP_ATOMIC);
if (skb)
- send_skb_packet(skb, batman_if, dstaddr);
+ send_skb_packet(skb, neigh_node->if_incoming,
+ neigh_node->addr);
- return;
+ goto out;
+unlock:
+ rcu_read_unlock();
out:
- spin_unlock_bh(&bat_priv->orig_hash_lock);
+ if (neigh_node)
+ neigh_node_free_ref(neigh_node);
+ if (orig_node)
+ orig_node_free_ref(orig_node);
+ return;
}
/* only send one vis packet. called from send_vis_packets() */
@@ -815,7 +842,7 @@ static void send_vis_packets(struct work_struct *work)
container_of(work, struct delayed_work, work);
struct bat_priv *bat_priv =
container_of(delayed_work, struct bat_priv, vis_work);
- struct vis_info *info, *temp;
+ struct vis_info *info;
spin_lock_bh(&bat_priv->vis_hash_lock);
purge_vis_packets(bat_priv);
@@ -825,8 +852,9 @@ static void send_vis_packets(struct work_struct *work)
send_list_add(bat_priv, bat_priv->my_vis_info);
}
- list_for_each_entry_safe(info, temp, &bat_priv->vis_send_list,
- send_list) {
+ while (!list_empty(&bat_priv->vis_send_list)) {
+ info = list_first_entry(&bat_priv->vis_send_list,
+ typeof(*info), send_list);
kref_get(&info->refcount);
spin_unlock_bh(&bat_priv->vis_hash_lock);
@@ -894,7 +922,8 @@ int vis_init(struct bat_priv *bat_priv)
INIT_LIST_HEAD(&bat_priv->vis_send_list);
hash_added = hash_add(bat_priv->vis_hash, vis_info_cmp, vis_info_choose,
- bat_priv->my_vis_info);
+ bat_priv->my_vis_info,
+ &bat_priv->my_vis_info->hash_entry);
if (hash_added < 0) {
pr_err("Can't add own vis packet into hash\n");
/* not in hash, need to remove it manually. */
@@ -916,10 +945,11 @@ err:
}
/* Decrease the reference count on a hash item info */
-static void free_info_ref(void *data, void *arg)
+static void free_info_ref(struct hlist_node *node, void *arg)
{
- struct vis_info *info = data;
+ struct vis_info *info;
+ info = container_of(node, struct vis_info, hash_entry);
send_list_del(info);
kref_put(&info->refcount, free_info);
}
diff --git a/net/batman-adv/vis.h b/net/batman-adv/vis.h
index 2c3b33089a9..31b820d07f2 100644
--- a/net/batman-adv/vis.h
+++ b/net/batman-adv/vis.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008-2010 B.A.T.M.A.N. contributors:
+ * Copyright (C) 2008-2011 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
index 9190ae462cb..6dee7bf648a 100644
--- a/net/bridge/Kconfig
+++ b/net/bridge/Kconfig
@@ -6,6 +6,7 @@ config BRIDGE
tristate "802.1d Ethernet Bridging"
select LLC
select STP
+ depends on IPV6 || IPV6=n
---help---
If you say Y here, then your Linux box will be able to act as an
Ethernet bridge, which means that the different Ethernet segments it
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 556443566e9..21e5901186e 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -78,6 +78,8 @@ static int br_dev_open(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
+ netif_carrier_off(dev);
+
br_features_recompute(br);
netif_start_queue(dev);
br_stp_enable_bridge(br);
@@ -94,6 +96,8 @@ static int br_dev_stop(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
+ netif_carrier_off(dev);
+
br_stp_disable_bridge(br);
br_multicast_stop(br);
@@ -297,6 +301,21 @@ void br_netpoll_disable(struct net_bridge_port *p)
#endif
+static int br_add_slave(struct net_device *dev, struct net_device *slave_dev)
+
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ return br_add_if(br, slave_dev);
+}
+
+static int br_del_slave(struct net_device *dev, struct net_device *slave_dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ return br_del_if(br, slave_dev);
+}
+
static const struct ethtool_ops br_ethtool_ops = {
.get_drvinfo = br_getinfo,
.get_link = ethtool_op_get_link,
@@ -326,6 +345,8 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_netpoll_cleanup = br_netpoll_cleanup,
.ndo_poll_controller = br_poll_controller,
#endif
+ .ndo_add_slave = br_add_slave,
+ .ndo_del_slave = br_del_slave,
};
static void br_dev_free(struct net_device *dev)
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 2872393b293..88485cc74dc 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -328,12 +328,12 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
if (fdb) {
memcpy(fdb->addr.addr, addr, ETH_ALEN);
- hlist_add_head_rcu(&fdb->hlist, head);
-
fdb->dst = source;
fdb->is_local = is_local;
fdb->is_static = is_local;
fdb->ageing_timer = jiffies;
+
+ hlist_add_head_rcu(&fdb->hlist, head);
}
return fdb;
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index d9d1e2bac1d..dce8f0009a1 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -148,6 +148,8 @@ static void del_nbp(struct net_bridge_port *p)
netdev_rx_handler_unregister(dev);
+ netdev_set_master(dev, NULL);
+
br_multicast_del_port(p);
kobject_uevent(&p->kobj, KOBJ_REMOVE);
@@ -365,7 +367,7 @@ int br_min_mtu(const struct net_bridge *br)
void br_features_recompute(struct net_bridge *br)
{
struct net_bridge_port *p;
- unsigned long features, mask;
+ u32 features, mask;
features = mask = br->feature_mask;
if (list_empty(&br->port_list))
@@ -379,7 +381,7 @@ void br_features_recompute(struct net_bridge *br)
}
done:
- br->dev->features = netdev_fix_features(features, NULL);
+ br->dev->features = netdev_fix_features(br->dev, features);
}
/* called with RTNL */
@@ -429,10 +431,14 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
if (br_netpoll_info(br) && ((err = br_netpoll_enable(p))))
goto err3;
- err = netdev_rx_handler_register(dev, br_handle_frame, p);
+ err = netdev_set_master(dev, br->dev);
if (err)
goto err3;
+ err = netdev_rx_handler_register(dev, br_handle_frame, p);
+ if (err)
+ goto err4;
+
dev->priv_flags |= IFF_BRIDGE_PORT;
dev_disable_lro(dev);
@@ -455,6 +461,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
kobject_uevent(&p->kobj, KOBJ_ADD);
return 0;
+
+err4:
+ netdev_set_master(dev, NULL);
err3:
sysfs_remove_link(br->ifobj, p->dev->name);
err2:
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 6f6d8e1b776..88e4aa9cb1f 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -80,7 +80,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
if (is_multicast_ether_addr(dest)) {
mdst = br_mdb_get(br, skb);
if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
- if ((mdst && !hlist_unhashed(&mdst->mglist)) ||
+ if ((mdst && mdst->mglist) ||
br_multicast_is_router(br))
skb2 = skb;
br_multicast_forward(mdst, skb, skb2);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f701a21acb3..030a002ff8e 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -37,10 +37,9 @@
rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static inline int ipv6_is_local_multicast(const struct in6_addr *addr)
+static inline int ipv6_is_transient_multicast(const struct in6_addr *addr)
{
- if (ipv6_addr_is_multicast(addr) &&
- IPV6_ADDR_MC_SCOPE(addr) <= IPV6_ADDR_SCOPE_LINKLOCAL)
+ if (ipv6_addr_is_multicast(addr) && IPV6_ADDR_MC_FLAG_TRANSIENT(addr))
return 1;
return 0;
}
@@ -232,8 +231,7 @@ static void br_multicast_group_expired(unsigned long data)
if (!netif_running(br->dev) || timer_pending(&mp->timer))
goto out;
- if (!hlist_unhashed(&mp->mglist))
- hlist_del_init(&mp->mglist);
+ mp->mglist = false;
if (mp->ports)
goto out;
@@ -276,7 +274,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
del_timer(&p->query_timer);
call_rcu_bh(&p->rcu, br_multicast_free_pg);
- if (!mp->ports && hlist_unhashed(&mp->mglist) &&
+ if (!mp->ports && !mp->mglist &&
netif_running(br->dev))
mod_timer(&mp->timer, jiffies);
@@ -436,7 +434,6 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
eth = eth_hdr(skb);
memcpy(eth->h_source, br->dev->dev_addr, 6);
- ipv6_eth_mc_map(group, eth->h_dest);
eth->h_proto = htons(ETH_P_IPV6);
skb_put(skb, sizeof(*eth));
@@ -448,8 +445,10 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
ip6h->payload_len = htons(8 + sizeof(*mldq));
ip6h->nexthdr = IPPROTO_HOPOPTS;
ip6h->hop_limit = 1;
- ipv6_addr_set(&ip6h->saddr, 0, 0, 0, 0);
+ ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
+ &ip6h->saddr);
ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
+ ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
hopopt = (u8 *)(ip6h + 1);
hopopt[0] = IPPROTO_ICMPV6; /* next hdr */
@@ -528,7 +527,7 @@ static void br_multicast_group_query_expired(unsigned long data)
struct net_bridge *br = mp->br;
spin_lock(&br->multicast_lock);
- if (!netif_running(br->dev) || hlist_unhashed(&mp->mglist) ||
+ if (!netif_running(br->dev) || !mp->mglist ||
mp->queries_sent >= br->multicast_last_member_count)
goto out;
@@ -719,7 +718,7 @@ static int br_multicast_add_group(struct net_bridge *br,
goto err;
if (!port) {
- hlist_add_head(&mp->mglist, &br->mglist);
+ mp->mglist = true;
mod_timer(&mp->timer, now + br->multicast_membership_interval);
goto out;
}
@@ -781,11 +780,11 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
{
struct br_ip br_group;
- if (ipv6_is_local_multicast(group))
+ if (!ipv6_is_transient_multicast(group))
return 0;
ipv6_addr_copy(&br_group.u.ip6, group);
- br_group.proto = htons(ETH_P_IP);
+ br_group.proto = htons(ETH_P_IPV6);
return br_multicast_add_group(br, port, &br_group);
}
@@ -1014,18 +1013,19 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
nsrcs = skb_header_pointer(skb,
len + offsetof(struct mld2_grec,
- grec_mca),
+ grec_nsrcs),
sizeof(_nsrcs), &_nsrcs);
if (!nsrcs)
return -EINVAL;
if (!pskb_may_pull(skb,
len + sizeof(*grec) +
- sizeof(struct in6_addr) * (*nsrcs)))
+ sizeof(struct in6_addr) * ntohs(*nsrcs)))
return -EINVAL;
grec = (struct mld2_grec *)(skb->data + len);
- len += sizeof(*grec) + sizeof(struct in6_addr) * (*nsrcs);
+ len += sizeof(*grec) +
+ sizeof(struct in6_addr) * ntohs(*nsrcs);
/* We treat these as MLDv1 reports for now. */
switch (grec->grec_type) {
@@ -1165,7 +1165,7 @@ static int br_ip4_multicast_query(struct net_bridge *br,
max_delay *= br->multicast_last_member_count;
- if (!hlist_unhashed(&mp->mglist) &&
+ if (mp->mglist &&
(timer_pending(&mp->timer) ?
time_after(mp->timer.expires, now + max_delay) :
try_to_del_timer_sync(&mp->timer) >= 0))
@@ -1177,7 +1177,7 @@ static int br_ip4_multicast_query(struct net_bridge *br,
if (timer_pending(&p->timer) ?
time_after(p->timer.expires, now + max_delay) :
try_to_del_timer_sync(&p->timer) >= 0)
- mod_timer(&mp->timer, now + max_delay);
+ mod_timer(&p->timer, now + max_delay);
}
out:
@@ -1236,7 +1236,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
goto out;
max_delay *= br->multicast_last_member_count;
- if (!hlist_unhashed(&mp->mglist) &&
+ if (mp->mglist &&
(timer_pending(&mp->timer) ?
time_after(mp->timer.expires, now + max_delay) :
try_to_del_timer_sync(&mp->timer) >= 0))
@@ -1248,7 +1248,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
if (timer_pending(&p->timer) ?
time_after(p->timer.expires, now + max_delay) :
try_to_del_timer_sync(&p->timer) >= 0)
- mod_timer(&mp->timer, now + max_delay);
+ mod_timer(&p->timer, now + max_delay);
}
out:
@@ -1283,7 +1283,7 @@ static void br_multicast_leave_group(struct net_bridge *br,
br->multicast_last_member_interval;
if (!port) {
- if (!hlist_unhashed(&mp->mglist) &&
+ if (mp->mglist &&
(timer_pending(&mp->timer) ?
time_after(mp->timer.expires, time) :
try_to_del_timer_sync(&mp->timer) >= 0)) {
@@ -1341,7 +1341,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
{
struct br_ip br_group;
- if (ipv6_is_local_multicast(group))
+ if (!ipv6_is_transient_multicast(group))
return;
ipv6_addr_copy(&br_group.u.ip6, group);
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 4b5b66d07bb..f97af5590ba 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -412,10 +412,6 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
if (dnat_took_place(skb)) {
if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
- struct flowi fl = {
- .fl4_dst = iph->daddr,
- .fl4_tos = RT_TOS(iph->tos),
- };
struct in_device *in_dev = __in_dev_get_rcu(dev);
/* If err equals -EHOSTUNREACH the error is due to a
@@ -428,14 +424,16 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
goto free_skb;
- if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
+ rt = ip_route_output(dev_net(dev), iph->daddr, 0,
+ RT_TOS(iph->tos), 0);
+ if (!IS_ERR(rt)) {
/* - Bridged-and-DNAT'ed traffic doesn't
* require ip_forwarding. */
- if (((struct dst_entry *)rt)->dev == dev) {
- skb_dst_set(skb, (struct dst_entry *)rt);
+ if (rt->dst.dev == dev) {
+ skb_dst_set(skb, &rt->dst);
goto bridged_dnat;
}
- dst_release((struct dst_entry *)rt);
+ ip_rt_put(rt);
}
free_skb:
kfree_skb(skb);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 84aac7734bf..f7afc364d77 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -84,13 +84,13 @@ struct net_bridge_port_group {
struct net_bridge_mdb_entry
{
struct hlist_node hlist[2];
- struct hlist_node mglist;
struct net_bridge *br;
struct net_bridge_port_group __rcu *ports;
struct rcu_head rcu;
struct timer_list timer;
struct timer_list query_timer;
struct br_ip addr;
+ bool mglist;
u32 queries_sent;
};
@@ -182,7 +182,7 @@ struct net_bridge
struct br_cpu_netstats __percpu *stats;
spinlock_t hash_lock;
struct hlist_head hash[BR_HASH_SIZE];
- unsigned long feature_mask;
+ u32 feature_mask;
#ifdef CONFIG_BRIDGE_NETFILTER
struct rtable fake_rtable;
bool nf_call_iptables;
@@ -238,7 +238,6 @@ struct net_bridge
spinlock_t multicast_lock;
struct net_bridge_mdb_htable __rcu *mdb;
struct hlist_head router_list;
- struct hlist_head mglist;
struct timer_list multicast_router_timer;
struct timer_list multicast_querier_timer;
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 57186d84d2b..a5badd0f822 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -397,28 +397,37 @@ static void br_make_forwarding(struct net_bridge_port *p)
void br_port_state_selection(struct net_bridge *br)
{
struct net_bridge_port *p;
+ unsigned int liveports = 0;
/* Don't change port states if userspace is handling STP */
if (br->stp_enabled == BR_USER_STP)
return;
list_for_each_entry(p, &br->port_list, list) {
- if (p->state != BR_STATE_DISABLED) {
- if (p->port_no == br->root_port) {
- p->config_pending = 0;
- p->topology_change_ack = 0;
- br_make_forwarding(p);
- } else if (br_is_designated_port(p)) {
- del_timer(&p->message_age_timer);
- br_make_forwarding(p);
- } else {
- p->config_pending = 0;
- p->topology_change_ack = 0;
- br_make_blocking(p);
- }
+ if (p->state == BR_STATE_DISABLED)
+ continue;
+
+ if (p->port_no == br->root_port) {
+ p->config_pending = 0;
+ p->topology_change_ack = 0;
+ br_make_forwarding(p);
+ } else if (br_is_designated_port(p)) {
+ del_timer(&p->message_age_timer);
+ br_make_forwarding(p);
+ } else {
+ p->config_pending = 0;
+ p->topology_change_ack = 0;
+ br_make_blocking(p);
}
+ if (p->state == BR_STATE_FORWARDING)
+ ++liveports;
}
+
+ if (liveports == 0)
+ netif_carrier_off(br->dev);
+ else
+ netif_carrier_on(br->dev);
}
/* called under bridge lock */
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index 7b22456023c..3e965140051 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -94,6 +94,7 @@ static void br_forward_delay_timer_expired(unsigned long arg)
p->state = BR_STATE_FORWARDING;
if (br_is_designated_for_some_port(br))
br_topology_change_detection(br);
+ netif_carrier_on(br->dev);
}
br_log_state(p);
spin_unlock(&br->lock);
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 50a46afc2bc..2ed0056a39a 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -22,9 +22,15 @@
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_bridge/ebt_ip6.h>
-struct tcpudphdr {
- __be16 src;
- __be16 dst;
+union pkthdr {
+ struct {
+ __be16 src;
+ __be16 dst;
+ } tcpudphdr;
+ struct {
+ u8 type;
+ u8 code;
+ } icmphdr;
};
static bool
@@ -33,8 +39,8 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct ebt_ip6_info *info = par->matchinfo;
const struct ipv6hdr *ih6;
struct ipv6hdr _ip6h;
- const struct tcpudphdr *pptr;
- struct tcpudphdr _ports;
+ const union pkthdr *pptr;
+ union pkthdr _pkthdr;
ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h);
if (ih6 == NULL)
@@ -56,26 +62,34 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
return false;
if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO))
return false;
- if (!(info->bitmask & EBT_IP6_DPORT) &&
- !(info->bitmask & EBT_IP6_SPORT))
+ if (!(info->bitmask & ( EBT_IP6_DPORT |
+ EBT_IP6_SPORT | EBT_IP6_ICMP6)))
return true;
- pptr = skb_header_pointer(skb, offset_ph, sizeof(_ports),
- &_ports);
+
+ /* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */
+ pptr = skb_header_pointer(skb, offset_ph, sizeof(_pkthdr),
+ &_pkthdr);
if (pptr == NULL)
return false;
if (info->bitmask & EBT_IP6_DPORT) {
- u32 dst = ntohs(pptr->dst);
+ u16 dst = ntohs(pptr->tcpudphdr.dst);
if (FWINV(dst < info->dport[0] ||
dst > info->dport[1], EBT_IP6_DPORT))
return false;
}
if (info->bitmask & EBT_IP6_SPORT) {
- u32 src = ntohs(pptr->src);
+ u16 src = ntohs(pptr->tcpudphdr.src);
if (FWINV(src < info->sport[0] ||
src > info->sport[1], EBT_IP6_SPORT))
return false;
}
- return true;
+ if ((info->bitmask & EBT_IP6_ICMP6) &&
+ FWINV(pptr->icmphdr.type < info->icmpv6_type[0] ||
+ pptr->icmphdr.type > info->icmpv6_type[1] ||
+ pptr->icmphdr.code < info->icmpv6_code[0] ||
+ pptr->icmphdr.code > info->icmpv6_code[1],
+ EBT_IP6_ICMP6))
+ return false;
}
return true;
}
@@ -103,6 +117,14 @@ static int ebt_ip6_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
if (info->bitmask & EBT_IP6_SPORT && info->sport[0] > info->sport[1])
return -EINVAL;
+ if (info->bitmask & EBT_IP6_ICMP6) {
+ if ((info->invflags & EBT_IP6_PROTO) ||
+ info->protocol != IPPROTO_ICMPV6)
+ return -EINVAL;
+ if (info->icmpv6_type[0] > info->icmpv6_type[1] ||
+ info->icmpv6_code[0] > info->icmpv6_code[1])
+ return -EINVAL;
+ }
return 0;
}
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 16df0532d4b..893669caa8d 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1107,6 +1107,8 @@ static int do_replace(struct net *net, const void __user *user,
if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
return -ENOMEM;
+ tmp.name[sizeof(tmp.name) - 1] = 0;
+
countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
newinfo = vmalloc(sizeof(*newinfo) + countersize);
if (!newinfo)
@@ -1764,6 +1766,7 @@ static int compat_table_info(const struct ebt_table_info *info,
newinfo->entries_size = size;
+ xt_compat_init_offsets(AF_INET, info->nentries);
return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info,
entries, newinfo);
}
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index 21ede141018..f1f98d967d8 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -23,10 +23,8 @@
#include <asm/atomic.h>
#define MAX_PHY_LAYERS 7
-#define PHY_NAME_LEN 20
#define container_obj(layr) container_of(layr, struct cfcnfg, layer)
-#define RFM_FRAGMENT_SIZE 4030
/* Information about CAIF physical interfaces held by Config Module in order
* to manage physical interfaces
@@ -191,6 +189,7 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
struct cflayer *servl = NULL;
struct cfcnfg_phyinfo *phyinfo = NULL;
u8 phyid = 0;
+
caif_assert(adap_layer != NULL);
channel_id = adap_layer->id;
if (adap_layer->dn == NULL || channel_id == 0) {
@@ -199,16 +198,16 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
goto end;
}
servl = cfmuxl_remove_uplayer(cnfg->mux, channel_id);
- if (servl == NULL)
- goto end;
- layer_set_up(servl, NULL);
- ret = cfctrl_linkdown_req(cnfg->ctrl, channel_id, adap_layer);
if (servl == NULL) {
pr_err("PROTOCOL ERROR - Error removing service_layer Channel_Id(%d)",
channel_id);
ret = -EINVAL;
goto end;
}
+ layer_set_up(servl, NULL);
+ ret = cfctrl_linkdown_req(cnfg->ctrl, channel_id, adap_layer);
+ if (ret)
+ goto end;
caif_assert(channel_id == servl->id);
if (adap_layer->dn != NULL) {
phyid = cfsrvl_getphyid(adap_layer->dn);
diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c
index d3ed264ad6c..27dab26ad3b 100644
--- a/net/caif/cfdgml.c
+++ b/net/caif/cfdgml.c
@@ -18,7 +18,6 @@
#define DGM_CMD_BIT 0x80
#define DGM_FLOW_OFF 0x81
#define DGM_FLOW_ON 0x80
-#define DGM_CTRL_PKT_SIZE 1
#define DGM_MTU 1500
static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt);
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
index 9297f7dea9d..8303fe3ebf8 100644
--- a/net/caif/cfserl.c
+++ b/net/caif/cfserl.c
@@ -25,7 +25,6 @@ struct cfserl {
spinlock_t sync;
bool usestx;
};
-#define STXLEN(layr) (layr->usestx ? 1 : 0)
static int cfserl_receive(struct cflayer *layr, struct cfpkt *pkt);
static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt);
diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c
index efad410e4c8..315c0d60136 100644
--- a/net/caif/cfutill.c
+++ b/net/caif/cfutill.c
@@ -20,7 +20,7 @@
#define UTIL_REMOTE_SHUTDOWN 0x82
#define UTIL_FLOW_OFF 0x81
#define UTIL_FLOW_ON 0x80
-#define UTIL_CTRL_PKT_SIZE 1
+
static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt);
static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt);
diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c
index 3b425b189a9..c3b1dec4acf 100644
--- a/net/caif/cfveil.c
+++ b/net/caif/cfveil.c
@@ -17,7 +17,7 @@
#define VEI_FLOW_OFF 0x81
#define VEI_FLOW_ON 0x80
#define VEI_SET_PIN 0x82
-#define VEI_CTRL_PKT_SIZE 1
+
#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt);
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index fa9dab372b6..6008d6dc18a 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -394,9 +394,7 @@ static void ipcaif_net_setup(struct net_device *dev)
priv->conn_req.sockaddr.u.dgm.connection_id = -1;
priv->flowenabled = false;
- ASSERT_RTNL();
init_waitqueue_head(&priv->netmgmt_wq);
- list_add(&priv->list_field, &chnl_net_list);
}
@@ -453,6 +451,8 @@ static int ipcaif_newlink(struct net *src_net, struct net_device *dev,
ret = register_netdevice(dev);
if (ret)
pr_warn("device rtml registration failed\n");
+ else
+ list_add(&caifdev->list_field, &chnl_net_list);
return ret;
}
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 9d5e8accfab..092dc88a7c6 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1256,6 +1256,9 @@ static int bcm_sendmsg(struct kiocb *iocb, struct socket *sock,
struct sockaddr_can *addr =
(struct sockaddr_can *)msg->msg_name;
+ if (msg->msg_namelen < sizeof(*addr))
+ return -EINVAL;
+
if (addr->can_family != AF_CAN)
return -EINVAL;
diff --git a/net/can/raw.c b/net/can/raw.c
index e88f610fdb7..883e9d74fdd 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -649,6 +649,9 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock,
struct sockaddr_can *addr =
(struct sockaddr_can *)msg->msg_name;
+ if (msg->msg_namelen < sizeof(*addr))
+ return -EINVAL;
+
if (addr->can_family != AF_CAN)
return -EINVAL;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index dff633d62e5..05f357828a2 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -252,8 +252,12 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
{
struct kvec iov = {buf, len};
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
- return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+ r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
}
/*
@@ -264,13 +268,17 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
size_t kvlen, size_t len, int more)
{
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
if (more)
msg.msg_flags |= MSG_MORE;
else
msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
- return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+ r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
}
@@ -328,7 +336,6 @@ static void reset_connection(struct ceph_connection *con)
ceph_msg_put(con->out_msg);
con->out_msg = NULL;
}
- con->out_keepalive_pending = false;
con->in_seq = 0;
con->in_seq_acked = 0;
}
@@ -847,6 +854,8 @@ static int write_partial_msg_pages(struct ceph_connection *con)
(msg->pages || msg->pagelist || msg->bio || in_trail))
kunmap(page);
+ if (ret == -EAGAIN)
+ ret = 0;
if (ret <= 0)
goto out;
@@ -1238,8 +1247,6 @@ static int process_connect(struct ceph_connection *con)
con->auth_retry);
if (con->auth_retry == 2) {
con->error_msg = "connect authorization failure";
- reset_connection(con);
- set_bit(CLOSED, &con->state);
return -1;
}
con->auth_retry = 1;
@@ -1705,14 +1712,6 @@ more:
/* open the socket first? */
if (con->sock == NULL) {
- /*
- * if we were STANDBY and are reconnecting _this_
- * connection, bump connect_seq now. Always bump
- * global_seq.
- */
- if (test_and_clear_bit(STANDBY, &con->state))
- con->connect_seq++;
-
prepare_write_banner(msgr, con);
prepare_write_connect(msgr, con, 1);
prepare_read_banner(con);
@@ -1737,16 +1736,12 @@ more_kvec:
if (con->out_skip) {
ret = write_partial_skip(con);
if (ret <= 0)
- goto done;
- if (ret < 0) {
- dout("try_write write_partial_skip err %d\n", ret);
- goto done;
- }
+ goto out;
}
if (con->out_kvec_left) {
ret = write_partial_kvec(con);
if (ret <= 0)
- goto done;
+ goto out;
}
/* msg pages? */
@@ -1761,11 +1756,11 @@ more_kvec:
if (ret == 1)
goto more_kvec; /* we need to send the footer, too! */
if (ret == 0)
- goto done;
+ goto out;
if (ret < 0) {
dout("try_write write_partial_msg_pages err %d\n",
ret);
- goto done;
+ goto out;
}
}
@@ -1789,10 +1784,9 @@ do_next:
/* Nothing to do! */
clear_bit(WRITE_PENDING, &con->state);
dout("try_write nothing else to write.\n");
-done:
ret = 0;
out:
- dout("try_write done on %p\n", con);
+ dout("try_write done on %p ret %d\n", con, ret);
return ret;
}
@@ -1821,19 +1815,17 @@ more:
dout("try_read connecting\n");
ret = read_partial_banner(con);
if (ret <= 0)
- goto done;
- if (process_banner(con) < 0) {
- ret = -1;
goto out;
- }
+ ret = process_banner(con);
+ if (ret < 0)
+ goto out;
}
ret = read_partial_connect(con);
if (ret <= 0)
- goto done;
- if (process_connect(con) < 0) {
- ret = -1;
goto out;
- }
+ ret = process_connect(con);
+ if (ret < 0)
+ goto out;
goto more;
}
@@ -1848,7 +1840,7 @@ more:
dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
ret = ceph_tcp_recvmsg(con->sock, buf, skip);
if (ret <= 0)
- goto done;
+ goto out;
con->in_base_pos += ret;
if (con->in_base_pos)
goto more;
@@ -1859,7 +1851,7 @@ more:
*/
ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
if (ret <= 0)
- goto done;
+ goto out;
dout("try_read got tag %d\n", (int)con->in_tag);
switch (con->in_tag) {
case CEPH_MSGR_TAG_MSG:
@@ -1870,7 +1862,7 @@ more:
break;
case CEPH_MSGR_TAG_CLOSE:
set_bit(CLOSED, &con->state); /* fixme */
- goto done;
+ goto out;
default:
goto bad_tag;
}
@@ -1882,13 +1874,12 @@ more:
case -EBADMSG:
con->error_msg = "bad crc";
ret = -EIO;
- goto out;
+ break;
case -EIO:
con->error_msg = "io error";
- goto out;
- default:
- goto done;
+ break;
}
+ goto out;
}
if (con->in_tag == CEPH_MSGR_TAG_READY)
goto more;
@@ -1898,15 +1889,13 @@ more:
if (con->in_tag == CEPH_MSGR_TAG_ACK) {
ret = read_partial_ack(con);
if (ret <= 0)
- goto done;
+ goto out;
process_ack(con);
goto more;
}
-done:
- ret = 0;
out:
- dout("try_read done on %p\n", con);
+ dout("try_read done on %p ret %d\n", con, ret);
return ret;
bad_tag:
@@ -1951,7 +1940,24 @@ static void con_work(struct work_struct *work)
work.work);
mutex_lock(&con->mutex);
+ if (test_and_clear_bit(BACKOFF, &con->state)) {
+ dout("con_work %p backing off\n", con);
+ if (queue_delayed_work(ceph_msgr_wq, &con->work,
+ round_jiffies_relative(con->delay))) {
+ dout("con_work %p backoff %lu\n", con, con->delay);
+ mutex_unlock(&con->mutex);
+ return;
+ } else {
+ con->ops->put(con);
+ dout("con_work %p FAILED to back off %lu\n", con,
+ con->delay);
+ }
+ }
+ if (test_bit(STANDBY, &con->state)) {
+ dout("con_work %p STANDBY\n", con);
+ goto done;
+ }
if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
dout("con_work CLOSED\n");
con_close_socket(con);
@@ -2008,10 +2014,12 @@ static void ceph_fault(struct ceph_connection *con)
/* Requeue anything that hasn't been acked */
list_splice_init(&con->out_sent, &con->out_queue);
- /* If there are no messages in the queue, place the connection
- * in a STANDBY state (i.e., don't try to reconnect just yet). */
- if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
- dout("fault setting STANDBY\n");
+ /* If there are no messages queued or keepalive pending, place
+ * the connection in a STANDBY state */
+ if (list_empty(&con->out_queue) &&
+ !test_bit(KEEPALIVE_PENDING, &con->state)) {
+ dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
+ clear_bit(WRITE_PENDING, &con->state);
set_bit(STANDBY, &con->state);
} else {
/* retry after a delay. */
@@ -2019,11 +2027,24 @@ static void ceph_fault(struct ceph_connection *con)
con->delay = BASE_DELAY_INTERVAL;
else if (con->delay < MAX_DELAY_INTERVAL)
con->delay *= 2;
- dout("fault queueing %p delay %lu\n", con, con->delay);
con->ops->get(con);
if (queue_delayed_work(ceph_msgr_wq, &con->work,
- round_jiffies_relative(con->delay)) == 0)
+ round_jiffies_relative(con->delay))) {
+ dout("fault queued %p delay %lu\n", con, con->delay);
+ } else {
con->ops->put(con);
+ dout("fault failed to queue %p delay %lu, backoff\n",
+ con, con->delay);
+ /*
+ * In many cases we see a socket state change
+ * while con_work is running and end up
+ * queuing (non-delayed) work, such that we
+ * can't backoff with a delay. Set a flag so
+ * that when con_work restarts we schedule the
+ * delay then.
+ */
+ set_bit(BACKOFF, &con->state);
+ }
}
out_unlock:
@@ -2094,6 +2115,19 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr)
}
EXPORT_SYMBOL(ceph_messenger_destroy);
+static void clear_standby(struct ceph_connection *con)
+{
+ /* come back from STANDBY? */
+ if (test_and_clear_bit(STANDBY, &con->state)) {
+ mutex_lock(&con->mutex);
+ dout("clear_standby %p and ++connect_seq\n", con);
+ con->connect_seq++;
+ WARN_ON(test_bit(WRITE_PENDING, &con->state));
+ WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state));
+ mutex_unlock(&con->mutex);
+ }
+}
+
/*
* Queue up an outgoing message on the given connection.
*/
@@ -2126,6 +2160,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
/* if there wasn't anything waiting to send before, queue
* new work */
+ clear_standby(con);
if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
queue_con(con);
}
@@ -2191,6 +2226,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
*/
void ceph_con_keepalive(struct ceph_connection *con)
{
+ dout("con_keepalive %p\n", con);
+ clear_standby(con);
if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
test_and_set_bit(WRITE_PENDING, &con->state) == 0)
queue_con(con);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 1a040e64c69..cd9c21df87d 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -16,22 +16,30 @@ struct page **ceph_get_direct_page_vector(const char __user *data,
int num_pages, bool write_page)
{
struct page **pages;
- int rc;
+ int got = 0;
+ int rc = 0;
pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
if (!pages)
return ERR_PTR(-ENOMEM);
down_read(&current->mm->mmap_sem);
- rc = get_user_pages(current, current->mm, (unsigned long)data,
- num_pages, write_page, 0, pages, NULL);
+ while (got < num_pages) {
+ rc = get_user_pages(current, current->mm,
+ (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
+ num_pages - got, write_page, 0, pages + got, NULL);
+ if (rc < 0)
+ break;
+ BUG_ON(rc == 0);
+ got += rc;
+ }
up_read(&current->mm->mmap_sem);
- if (rc < num_pages)
+ if (rc < 0)
goto fail;
return pages;
fail:
- ceph_put_page_vector(pages, rc > 0 ? rc : 0, false);
+ ceph_put_page_vector(pages, got, false);
return ERR_PTR(rc);
}
EXPORT_SYMBOL(ceph_get_direct_page_vector);
diff --git a/net/core/dev.c b/net/core/dev.c
index 54277df0f73..0d39032e962 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -132,6 +132,7 @@
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
#include "net-sysfs.h"
@@ -749,7 +750,8 @@ EXPORT_SYMBOL(dev_get_by_index);
* @ha: hardware address
*
* Search for an interface by MAC address. Returns NULL if the device
- * is not found or a pointer to the device. The caller must hold RCU
+ * is not found or a pointer to the device.
+ * The caller must hold RCU or RTNL.
* The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
@@ -1113,13 +1115,21 @@ EXPORT_SYMBOL(netdev_bonding_change);
void dev_load(struct net *net, const char *name)
{
struct net_device *dev;
+ int no_module;
rcu_read_lock();
dev = dev_get_by_name_rcu(net, name);
rcu_read_unlock();
- if (!dev && capable(CAP_NET_ADMIN))
- request_module("%s", name);
+ no_module = !dev;
+ if (no_module && capable(CAP_NET_ADMIN))
+ no_module = request_module("netdev-%s", name);
+ if (no_module && capable(CAP_SYS_MODULE)) {
+ if (!request_module("%s", name))
+ pr_err("Loading kernel module for a network device "
+"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s "
+"instead\n", name);
+ }
}
EXPORT_SYMBOL(dev_load);
@@ -1279,13 +1289,16 @@ static int __dev_close_many(struct list_head *head)
static int __dev_close(struct net_device *dev)
{
+ int retval;
LIST_HEAD(single);
list_add(&dev->unreg_list, &single);
- return __dev_close_many(&single);
+ retval = __dev_close_many(&single);
+ list_del(&single);
+ return retval;
}
-int dev_close_many(struct list_head *head)
+static int dev_close_many(struct list_head *head)
{
struct net_device *dev, *tmp;
LIST_HEAD(tmp_list);
@@ -1324,7 +1337,7 @@ int dev_close(struct net_device *dev)
list_add(&dev->unreg_list, &single);
dev_close_many(&single);
-
+ list_del(&single);
return 0;
}
EXPORT_SYMBOL(dev_close);
@@ -1593,6 +1606,48 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
rcu_read_unlock();
}
+/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this verify the tc mapping remains valid and if
+ * not NULL the mapping. With no priorities mapping to this
+ * offset/count pair it will no longer be used. In the worst case TC0
+ * is invalid nothing can be done so disable priority mappings. If is
+ * expected that drivers will fix this mapping if they can before
+ * calling netif_set_real_num_tx_queues.
+ */
+static void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+ int i;
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+ /* If TC0 is invalidated disable TC mapping */
+ if (tc->offset + tc->count > txq) {
+ pr_warning("Number of in use tx queues changed "
+ "invalidating tc mappings. Priority "
+ "traffic classification disabled!\n");
+ dev->num_tc = 0;
+ return;
+ }
+
+ /* Invalidated prio to tc mappings set to TC0 */
+ for (i = 1; i < TC_BITMASK + 1; i++) {
+ int q = netdev_get_prio_tc_map(dev, i);
+
+ tc = &dev->tc_to_txq[q];
+ if (tc->offset + tc->count > txq) {
+ pr_warning("Number of in use tx queues "
+ "changed. Priority %i to tc "
+ "mapping %i is no longer valid "
+ "setting map to 0\n",
+ i, q);
+ netdev_set_prio_tc_map(dev, i, 0);
+ }
+ }
+}
+
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -1604,7 +1659,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (txq < 1 || txq > dev->num_tx_queues)
return -EINVAL;
- if (dev->reg_state == NETREG_REGISTERED) {
+ if (dev->reg_state == NETREG_REGISTERED ||
+ dev->reg_state == NETREG_UNREGISTERING) {
ASSERT_RTNL();
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
@@ -1612,6 +1668,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (rc)
return rc;
+ if (dev->num_tc)
+ netif_setup_tc(dev, txq);
+
if (txq < dev->real_num_tx_queues)
qdisc_reset_all_tx_gt(dev, txq);
}
@@ -1811,7 +1870,7 @@ EXPORT_SYMBOL(skb_checksum_help);
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*/
-struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_type *ptype;
@@ -1999,9 +2058,9 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
protocol == htons(ETH_P_FCOE)));
}
-static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
+static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
{
- if (!can_checksum_protocol(protocol, features)) {
+ if (!can_checksum_protocol(features, protocol)) {
features &= ~NETIF_F_ALL_CSUM;
features &= ~NETIF_F_SG;
} else if (illegal_highdma(skb->dev, skb)) {
@@ -2011,10 +2070,10 @@ static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features
return features;
}
-int netif_skb_features(struct sk_buff *skb)
+u32 netif_skb_features(struct sk_buff *skb)
{
__be16 protocol = skb->protocol;
- int features = skb->dev->features;
+ u32 features = skb->dev->features;
if (protocol == htons(ETH_P_8021Q)) {
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
@@ -2023,13 +2082,13 @@ int netif_skb_features(struct sk_buff *skb)
return harmonize_features(skb, protocol, features);
}
- features &= skb->dev->vlan_features;
+ features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
if (protocol != htons(ETH_P_8021Q)) {
return harmonize_features(skb, protocol, features);
} else {
features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
- NETIF_F_GEN_CSUM;
+ NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
return harmonize_features(skb, protocol, features);
}
}
@@ -2059,7 +2118,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
int rc = NETDEV_TX_OK;
if (likely(!skb->next)) {
- int features;
+ u32 features;
/*
* If device doesnt need skb->dst, release it right now while
@@ -2161,6 +2220,8 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
unsigned int num_tx_queues)
{
u32 hash;
+ u16 qoffset = 0;
+ u16 qcount = num_tx_queues;
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
@@ -2169,13 +2230,19 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
return hash;
}
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+ qoffset = dev->tc_to_txq[tc].offset;
+ qcount = dev->tc_to_txq[tc].count;
+ }
+
if (skb->sk && skb->sk->sk_hash)
hash = skb->sk->sk_hash;
else
hash = (__force u16) skb->protocol ^ skb->rxhash;
hash = jhash_1word(hash, hashrnd);
- return (u16) (((u64) hash * num_tx_queues) >> 32);
+ return (u16) (((u64) hash * qcount) >> 32) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);
@@ -2272,15 +2339,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct netdev_queue *txq)
{
spinlock_t *root_lock = qdisc_lock(q);
- bool contended = qdisc_is_running(q);
+ bool contended;
int rc;
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+ qdisc_calculate_pkt_len(skb, q);
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
* This permits __QDISC_STATE_RUNNING owner to get the lock more often
* and dequeue packets faster.
*/
+ contended = qdisc_is_running(q);
if (unlikely(contended))
spin_lock(&q->busylock);
@@ -2298,7 +2368,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
skb_dst_force(skb);
- qdisc_skb_cb(skb)->pkt_len = skb->len;
qdisc_bstats_update(q, skb);
if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
@@ -2313,7 +2382,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_SUCCESS;
} else {
skb_dst_force(skb);
- rc = qdisc_enqueue_root(skb, q);
+ rc = q->enqueue(skb, q) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -2532,6 +2601,54 @@ EXPORT_SYMBOL(__skb_get_rxhash);
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow *rflow, u16 next_cpu)
+{
+ u16 tcpu;
+
+ tcpu = rflow->cpu = next_cpu;
+ if (tcpu != RPS_NO_CPU) {
+#ifdef CONFIG_RFS_ACCEL
+ struct netdev_rx_queue *rxqueue;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *old_rflow;
+ u32 flow_id;
+ u16 rxq_index;
+ int rc;
+
+ /* Should we steer this flow to a different hardware queue? */
+ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
+ !(dev->features & NETIF_F_NTUPLE))
+ goto out;
+ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+ if (rxq_index == skb_get_rx_queue(skb))
+ goto out;
+
+ rxqueue = dev->_rx + rxq_index;
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (!flow_table)
+ goto out;
+ flow_id = skb->rxhash & flow_table->mask;
+ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
+ rxq_index, flow_id);
+ if (rc < 0)
+ goto out;
+ old_rflow = rflow;
+ rflow = &flow_table->flows[flow_id];
+ rflow->cpu = next_cpu;
+ rflow->filter = rc;
+ if (old_rflow->filter == rflow->filter)
+ old_rflow->filter = RPS_NO_FILTER;
+ out:
+#endif
+ rflow->last_qtail =
+ per_cpu(softnet_data, tcpu).input_queue_head;
+ }
+
+ return rflow;
+}
+
/*
* get_rps_cpu is called from netif_receive_skb and returns the target
* CPU from the RPS map of the receiving queue for a given skb.
@@ -2562,7 +2679,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
map = rcu_dereference(rxqueue->rps_map);
if (map) {
- if (map->len == 1) {
+ if (map->len == 1 &&
+ !rcu_dereference_raw(rxqueue->rps_flow_table)) {
tcpu = map->cpus[0];
if (cpu_online(tcpu))
cpu = tcpu;
@@ -2602,12 +2720,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (unlikely(tcpu != next_cpu) &&
(tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
- rflow->last_qtail)) >= 0)) {
- tcpu = rflow->cpu = next_cpu;
- if (tcpu != RPS_NO_CPU)
- rflow->last_qtail = per_cpu(softnet_data,
- tcpu).input_queue_head;
- }
+ rflow->last_qtail)) >= 0))
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+
if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
@@ -2628,6 +2743,46 @@ done:
return cpu;
}
+#ifdef CONFIG_RFS_ACCEL
+
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id)
+{
+ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *rflow;
+ bool expire = true;
+ int cpu;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (flow_table && flow_id <= flow_table->mask) {
+ rflow = &flow_table->flows[flow_id];
+ cpu = ACCESS_ONCE(rflow->cpu);
+ if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+ ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+ rflow->last_qtail) <
+ (int)(10 * flow_table->mask)))
+ expire = false;
+ }
+ rcu_read_unlock();
+ return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
+
/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
@@ -2949,64 +3104,31 @@ void netdev_rx_handler_unregister(struct net_device *dev)
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
-static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
- struct net_device *master)
-{
- if (skb->pkt_type == PACKET_HOST) {
- u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
-
- memcpy(dest, master->dev_addr, ETH_ALEN);
- }
-}
-
-/* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
- */
-int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
+static void vlan_on_bond_hook(struct sk_buff *skb)
{
- struct net_device *dev = skb->dev;
-
- if (master->priv_flags & IFF_MASTER_ARPMON)
- dev->last_rx = jiffies;
-
- if ((master->priv_flags & IFF_MASTER_ALB) &&
- (master->priv_flags & IFF_BRIDGE_PORT)) {
- /* Do address unmangle. The local destination address
- * will be always the one master has. Provides the right
- * functionality in a bridge.
- */
- skb_bond_set_mac_by_master(skb, master);
- }
-
- if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
- if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
- skb->protocol == __cpu_to_be16(ETH_P_ARP))
- return 0;
-
- if (master->priv_flags & IFF_MASTER_ALB) {
- if (skb->pkt_type != PACKET_BROADCAST &&
- skb->pkt_type != PACKET_MULTICAST)
- return 0;
- }
- if (master->priv_flags & IFF_MASTER_8023AD &&
- skb->protocol == __cpu_to_be16(ETH_P_SLOW))
- return 0;
+ /*
+ * Make sure ARP frames received on VLAN interfaces stacked on
+ * bonding interfaces still make their way to any base bonding
+ * device that may have registered for a specific ptype.
+ */
+ if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
+ vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
+ skb->protocol == htons(ETH_P_ARP)) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
- return 1;
+ if (!skb2)
+ return;
+ skb2->dev = vlan_dev_real_dev(skb->dev);
+ netif_rx(skb2);
}
- return 0;
}
-EXPORT_SYMBOL(__skb_bond_should_drop);
static int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
- struct net_device *master;
- struct net_device *null_or_orig;
- struct net_device *orig_or_bond;
+ struct net_device *null_or_dev;
int ret = NET_RX_DROP;
__be16 type;
@@ -3021,28 +3143,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
if (!skb->skb_iif)
skb->skb_iif = skb->dev->ifindex;
-
- /*
- * bonding note: skbs received on inactive slaves should only
- * be delivered to pkt handlers that are exact matches. Also
- * the deliver_no_wcard flag will be set. If packet handlers
- * are sensitive to duplicate packets these skbs will need to
- * be dropped at the handler.
- */
- null_or_orig = NULL;
orig_dev = skb->dev;
- master = ACCESS_ONCE(orig_dev->master);
- if (skb->deliver_no_wcard)
- null_or_orig = orig_dev;
- else if (master) {
- if (skb_bond_should_drop(skb, master)) {
- skb->deliver_no_wcard = 1;
- null_or_orig = orig_dev; /* deliver only exact match */
- } else
- skb->dev = master;
- }
- __this_cpu_inc(softnet_data.processed);
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb->mac_len = skb->network_header - skb->mac_header;
@@ -3051,6 +3153,10 @@ static int __netif_receive_skb(struct sk_buff *skb)
rcu_read_lock();
+another_round:
+
+ __this_cpu_inc(softnet_data.processed);
+
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -3059,8 +3165,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
#endif
list_for_each_entry_rcu(ptype, &ptype_all, list) {
- if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
- ptype->dev == orig_dev) {
+ if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
@@ -3074,16 +3179,20 @@ static int __netif_receive_skb(struct sk_buff *skb)
ncls:
#endif
- /* Handle special case of bridge or macvlan */
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
+ struct net_device *prev_dev;
+
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
+ prev_dev = skb->dev;
skb = rx_handler(skb);
if (!skb)
goto out;
+ if (skb->dev != prev_dev)
+ goto another_round;
}
if (vlan_tx_tag_present(skb)) {
@@ -3098,24 +3207,17 @@ ncls:
goto out;
}
- /*
- * Make sure frames received on VLAN interfaces stacked on
- * bonding interfaces still make their way to any base bonding
- * device that may have registered for a specific ptype. The
- * handler may have to adjust skb->dev and orig_dev.
- */
- orig_or_bond = orig_dev;
- if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
- (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
- orig_or_bond = vlan_dev_real_dev(skb->dev);
- }
+ vlan_on_bond_hook(skb);
+
+ /* deliver only exact match when indicated */
+ null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL;
type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
- if (ptype->type == type && (ptype->dev == null_or_orig ||
- ptype->dev == skb->dev || ptype->dev == orig_dev ||
- ptype->dev == orig_or_bond)) {
+ if (ptype->type == type &&
+ (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
+ ptype->dev == orig_dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
@@ -3423,6 +3525,8 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
__skb_pull(skb, skb_headlen(skb));
skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
skb->vlan_tci = 0;
+ skb->dev = napi->dev;
+ skb->skb_iif = 0;
napi->skb = skb;
}
@@ -3910,12 +4014,15 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos)
void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct net_device *dev = (v == SEQ_START_TOKEN) ?
- first_net_device(seq_file_net(seq)) :
- next_net_device((struct net_device *)v);
+ struct net_device *dev = v;
+
+ if (v == SEQ_START_TOKEN)
+ dev = first_net_device_rcu(seq_file_net(seq));
+ else
+ dev = next_net_device_rcu(dev);
++*pos;
- return rcu_dereference(dev);
+ return dev;
}
void dev_seq_stop(struct seq_file *seq, void *v)
@@ -4199,15 +4306,14 @@ static int __init dev_proc_init(void)
/**
- * netdev_set_master - set up master/slave pair
+ * netdev_set_master - set up master pointer
* @slave: slave device
* @master: new master device
*
* Changes the master device of the slave. Pass %NULL to break the
* bonding. The caller must hold the RTNL semaphore. On a failure
* a negative errno code is returned. On success the reference counts
- * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
- * function returns zero.
+ * are adjusted and the function returns zero.
*/
int netdev_set_master(struct net_device *slave, struct net_device *master)
{
@@ -4227,6 +4333,29 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
synchronize_net();
dev_put(old);
}
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_master);
+
+/**
+ * netdev_set_bond_master - set up bonding master/slave pair
+ * @slave: slave device
+ * @master: new master device
+ *
+ * Changes the master device of the slave. Pass %NULL to break the
+ * bonding. The caller must hold the RTNL semaphore. On a failure
+ * a negative errno code is returned. On success %RTM_NEWLINK is sent
+ * to the routing socket and the function returns zero.
+ */
+int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = netdev_set_master(slave, master);
+ if (err)
+ return err;
if (master)
slave->flags |= IFF_SLAVE;
else
@@ -4235,7 +4364,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
return 0;
}
-EXPORT_SYMBOL(netdev_set_master);
+EXPORT_SYMBOL(netdev_set_bond_master);
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
@@ -4572,6 +4701,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
EXPORT_SYMBOL(dev_set_mtu);
/**
+ * dev_set_group - Change group this device belongs to
+ * @dev: device
+ * @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+ dev->group = new_group;
+}
+EXPORT_SYMBOL(dev_set_group);
+
+/**
* dev_set_mac_address - Change Media Access Control Address
* @dev: device
* @sa: new address
@@ -5059,43 +5199,58 @@ static void rollback_registered(struct net_device *dev)
list_add(&dev->unreg_list, &single);
rollback_registered_many(&single);
+ list_del(&single);
}
-unsigned long netdev_fix_features(unsigned long features, const char *name)
+u32 netdev_fix_features(struct net_device *dev, u32 features)
{
+ /* Fix illegal checksum combinations */
+ if ((features & NETIF_F_HW_CSUM) &&
+ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_info(dev, "mixed HW and IP checksum settings.\n");
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+ }
+
+ if ((features & NETIF_F_NO_CSUM) &&
+ (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_info(dev, "mixed no checksumming and other settings.\n");
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
+ }
+
/* Fix illegal SG+CSUM combinations. */
if ((features & NETIF_F_SG) &&
!(features & NETIF_F_ALL_CSUM)) {
- if (name)
- printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
- "checksum feature.\n", name);
+ netdev_info(dev,
+ "Dropping NETIF_F_SG since no checksum feature.\n");
features &= ~NETIF_F_SG;
}
/* TSO requires that SG is present as well. */
if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
- if (name)
- printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
- "SG feature.\n", name);
+ netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
features &= ~NETIF_F_TSO;
}
+ /* Software GSO depends on SG. */
+ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
+ netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
+ features &= ~NETIF_F_GSO;
+ }
+
+ /* UFO needs SG and checksumming */
if (features & NETIF_F_UFO) {
/* maybe split UFO into V4 and V6? */
if (!((features & NETIF_F_GEN_CSUM) ||
(features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
== (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- if (name)
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
- "since no checksum offload features.\n",
- name);
+ netdev_info(dev,
+ "Dropping NETIF_F_UFO since no checksum offload features.\n");
features &= ~NETIF_F_UFO;
}
if (!(features & NETIF_F_SG)) {
- if (name)
- printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
- "since no NETIF_F_SG feature.\n", name);
+ netdev_info(dev,
+ "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
features &= ~NETIF_F_UFO;
}
}
@@ -5104,6 +5259,37 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
}
EXPORT_SYMBOL(netdev_fix_features);
+void netdev_update_features(struct net_device *dev)
+{
+ u32 features;
+ int err = 0;
+
+ features = netdev_get_wanted_features(dev);
+
+ if (dev->netdev_ops->ndo_fix_features)
+ features = dev->netdev_ops->ndo_fix_features(dev, features);
+
+ /* driver might be less strict about feature dependencies */
+ features = netdev_fix_features(dev, features);
+
+ if (dev->features == features)
+ return;
+
+ netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
+ dev->features, features);
+
+ if (dev->netdev_ops->ndo_set_features)
+ err = dev->netdev_ops->ndo_set_features(dev, features);
+
+ if (!err)
+ dev->features = features;
+ else if (err < 0)
+ netdev_err(dev,
+ "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
+ err, features, dev->features);
+}
+EXPORT_SYMBOL(netdev_update_features);
+
/**
* netif_stacked_transfer_operstate - transfer operstate
* @rootdev: the root or lower level device to transfer state from
@@ -5238,27 +5424,19 @@ int register_netdevice(struct net_device *dev)
if (dev->iflink == -1)
dev->iflink = dev->ifindex;
- /* Fix illegal checksum combinations */
- if ((dev->features & NETIF_F_HW_CSUM) &&
- (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
- dev->name);
- dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
- }
+ /* Transfer changeable features to wanted_features and enable
+ * software offloads (GSO and GRO).
+ */
+ dev->hw_features |= NETIF_F_SOFT_FEATURES;
+ dev->features |= NETIF_F_SOFT_FEATURES;
+ dev->wanted_features = dev->features & dev->hw_features;
- if ((dev->features & NETIF_F_NO_CSUM) &&
- (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
- dev->name);
- dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
+ /* Avoid warning from netdev_fix_features() for GSO without SG */
+ if (!(dev->wanted_features & NETIF_F_SG)) {
+ dev->wanted_features &= ~NETIF_F_GSO;
+ dev->features &= ~NETIF_F_GSO;
}
- dev->features = netdev_fix_features(dev->features, dev->name);
-
- /* Enable software GSO if SG is supported. */
- if (dev->features & NETIF_F_SG)
- dev->features |= NETIF_F_GSO;
-
/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
* vlan_dev_init() will do the dev->features check, so these features
* are enabled only if supported by underlying device.
@@ -5275,6 +5453,8 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
dev->reg_state = NETREG_REGISTERED;
+ netdev_update_features(dev);
+
/*
* Default initial state at registry is that the
* device is present.
@@ -5656,30 +5836,36 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
+ dev->gso_max_size = GSO_MAX_SIZE;
+
+ INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
+ dev->ethtool_ntuple_list.count = 0;
+ INIT_LIST_HEAD(&dev->napi_list);
+ INIT_LIST_HEAD(&dev->unreg_list);
+ INIT_LIST_HEAD(&dev->link_watch_list);
+ dev->priv_flags = IFF_XMIT_DST_RELEASE;
+ setup(dev);
+
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
if (netif_alloc_netdev_queues(dev))
- goto free_pcpu;
+ goto free_all;
#ifdef CONFIG_RPS
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
- goto free_pcpu;
+ goto free_all;
#endif
- dev->gso_max_size = GSO_MAX_SIZE;
-
- INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
- dev->ethtool_ntuple_list.count = 0;
- INIT_LIST_HEAD(&dev->napi_list);
- INIT_LIST_HEAD(&dev->unreg_list);
- INIT_LIST_HEAD(&dev->link_watch_list);
- dev->priv_flags = IFF_XMIT_DST_RELEASE;
- setup(dev);
strcpy(dev->name, name);
+ dev->group = INIT_NETDEV_GROUP;
return dev;
+free_all:
+ free_netdev(dev);
+ return NULL;
+
free_pcpu:
free_percpu(dev->pcpu_refcnt);
kfree(dev->_tx);
@@ -5988,8 +6174,7 @@ static int dev_cpu_callback(struct notifier_block *nfb,
* @one to the master device with current feature set @all. Will not
* enable anything that is off in @mask. Returns the new feature set.
*/
-unsigned long netdev_increment_features(unsigned long all, unsigned long one,
- unsigned long mask)
+u32 netdev_increment_features(u32 all, u32 one, u32 mask)
{
/* If device needs checksumming, downgrade to it. */
if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
@@ -6207,6 +6392,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
}
}
unregister_netdevice_many(&dev_kill_list);
+ list_del(&dev_kill_list);
rtnl_unlock();
}
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 508f9c18992..133fd22ea28 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -144,7 +144,7 @@ void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
list_for_each_entry(ha, &from_list->list, list) {
type = addr_type ? addr_type : ha->type;
- __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
+ __hw_addr_del(to_list, ha->addr, addr_len, type);
}
}
EXPORT_SYMBOL(__hw_addr_del_multiple);
diff --git a/net/core/dst.c b/net/core/dst.c
index b99c7c7ffce..91104d35de7 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -164,7 +164,9 @@ int dst_discard(struct sk_buff *skb)
}
EXPORT_SYMBOL(dst_discard);
-void *dst_alloc(struct dst_ops *ops)
+const u32 dst_default_metrics[RTAX_MAX];
+
+void *dst_alloc(struct dst_ops *ops, int initial_ref)
{
struct dst_entry *dst;
@@ -175,11 +177,12 @@ void *dst_alloc(struct dst_ops *ops)
dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC);
if (!dst)
return NULL;
- atomic_set(&dst->__refcnt, 0);
+ atomic_set(&dst->__refcnt, initial_ref);
dst->ops = ops;
dst->lastuse = jiffies;
dst->path = dst;
dst->input = dst->output = dst_discard;
+ dst_init_metrics(dst, dst_default_metrics, true);
#if RT_CACHE_DEBUG >= 2
atomic_inc(&dst_total);
#endif
@@ -282,6 +285,42 @@ void dst_release(struct dst_entry *dst)
}
EXPORT_SYMBOL(dst_release);
+u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+
+ if (p) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ kfree(p);
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ }
+ }
+ return p;
+}
+EXPORT_SYMBOL(dst_cow_metrics_generic);
+
+/* Caller asserts that dst_metrics_read_only(dst) is false. */
+void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ unsigned long prev, new;
+
+ new = (unsigned long) dst_default_metrics;
+ prev = cmpxchg(&dst->_metrics, old, new);
+ if (prev == old)
+ kfree(__DST_METRICS_PTR(old));
+}
+EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+
/**
* skb_dst_set_noref - sets skb dst, without a reference
* @skb: buffer
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 17741782a34..c1a71bb738d 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -34,12 +34,6 @@ u32 ethtool_op_get_link(struct net_device *dev)
}
EXPORT_SYMBOL(ethtool_op_get_link);
-u32 ethtool_op_get_rx_csum(struct net_device *dev)
-{
- return (dev->features & NETIF_F_ALL_CSUM) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_rx_csum);
-
u32 ethtool_op_get_tx_csum(struct net_device *dev)
{
return (dev->features & NETIF_F_ALL_CSUM) != 0;
@@ -55,6 +49,7 @@ int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
return 0;
}
+EXPORT_SYMBOL(ethtool_op_set_tx_csum);
int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
{
@@ -171,6 +166,381 @@ EXPORT_SYMBOL(ethtool_ntuple_flush);
/* Handlers for each ethtool command */
+#define ETHTOOL_DEV_FEATURE_WORDS 1
+
+static void ethtool_get_features_compat(struct net_device *dev,
+ struct ethtool_get_features_block *features)
+{
+ if (!dev->ethtool_ops)
+ return;
+
+ /* getting RX checksum */
+ if (dev->ethtool_ops->get_rx_csum)
+ if (dev->ethtool_ops->get_rx_csum(dev))
+ features[0].active |= NETIF_F_RXCSUM;
+
+ /* mark legacy-changeable features */
+ if (dev->ethtool_ops->set_sg)
+ features[0].available |= NETIF_F_SG;
+ if (dev->ethtool_ops->set_tx_csum)
+ features[0].available |= NETIF_F_ALL_CSUM;
+ if (dev->ethtool_ops->set_tso)
+ features[0].available |= NETIF_F_ALL_TSO;
+ if (dev->ethtool_ops->set_rx_csum)
+ features[0].available |= NETIF_F_RXCSUM;
+ if (dev->ethtool_ops->set_flags)
+ features[0].available |= flags_dup_features;
+}
+
+static int ethtool_set_feature_compat(struct net_device *dev,
+ int (*legacy_set)(struct net_device *, u32),
+ struct ethtool_set_features_block *features, u32 mask)
+{
+ u32 do_set;
+
+ if (!legacy_set)
+ return 0;
+
+ if (!(features[0].valid & mask))
+ return 0;
+
+ features[0].valid &= ~mask;
+
+ do_set = !!(features[0].requested & mask);
+
+ if (legacy_set(dev, do_set) < 0)
+ netdev_info(dev,
+ "Legacy feature change (%s) failed for 0x%08x\n",
+ do_set ? "set" : "clear", mask);
+
+ return 1;
+}
+
+static int ethtool_set_features_compat(struct net_device *dev,
+ struct ethtool_set_features_block *features)
+{
+ int compat;
+
+ if (!dev->ethtool_ops)
+ return 0;
+
+ compat = ethtool_set_feature_compat(dev, dev->ethtool_ops->set_sg,
+ features, NETIF_F_SG);
+ compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tx_csum,
+ features, NETIF_F_ALL_CSUM);
+ compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tso,
+ features, NETIF_F_ALL_TSO);
+ compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_rx_csum,
+ features, NETIF_F_RXCSUM);
+ compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_flags,
+ features, flags_dup_features);
+
+ return compat;
+}
+
+static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_gfeatures cmd = {
+ .cmd = ETHTOOL_GFEATURES,
+ .size = ETHTOOL_DEV_FEATURE_WORDS,
+ };
+ struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = {
+ {
+ .available = dev->hw_features,
+ .requested = dev->wanted_features,
+ .active = dev->features,
+ .never_changed = NETIF_F_NEVER_CHANGE,
+ },
+ };
+ u32 __user *sizeaddr;
+ u32 copy_size;
+
+ ethtool_get_features_compat(dev, features);
+
+ sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
+ if (get_user(copy_size, sizeaddr))
+ return -EFAULT;
+
+ if (copy_size > ETHTOOL_DEV_FEATURE_WORDS)
+ copy_size = ETHTOOL_DEV_FEATURE_WORDS;
+
+ if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+ if (copy_to_user(useraddr, features, copy_size * sizeof(*features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_sfeatures cmd;
+ struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ int ret = 0;
+
+ if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+
+ if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS)
+ return -EINVAL;
+
+ if (copy_from_user(features, useraddr, sizeof(features)))
+ return -EFAULT;
+
+ if (features[0].valid & ~NETIF_F_ETHTOOL_BITS)
+ return -EINVAL;
+
+ if (ethtool_set_features_compat(dev, features))
+ ret |= ETHTOOL_F_COMPAT;
+
+ if (features[0].valid & ~dev->hw_features) {
+ features[0].valid &= dev->hw_features;
+ ret |= ETHTOOL_F_UNSUPPORTED;
+ }
+
+ dev->wanted_features &= ~features[0].valid;
+ dev->wanted_features |= features[0].valid & features[0].requested;
+ netdev_update_features(dev);
+
+ if ((dev->wanted_features ^ dev->features) & features[0].valid)
+ ret |= ETHTOOL_F_WISH;
+
+ return ret;
+}
+
+static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = {
+ /* NETIF_F_SG */ "tx-scatter-gather",
+ /* NETIF_F_IP_CSUM */ "tx-checksum-ipv4",
+ /* NETIF_F_NO_CSUM */ "tx-checksum-unneeded",
+ /* NETIF_F_HW_CSUM */ "tx-checksum-ip-generic",
+ /* NETIF_F_IPV6_CSUM */ "tx_checksum-ipv6",
+ /* NETIF_F_HIGHDMA */ "highdma",
+ /* NETIF_F_FRAGLIST */ "tx-scatter-gather-fraglist",
+ /* NETIF_F_HW_VLAN_TX */ "tx-vlan-hw-insert",
+
+ /* NETIF_F_HW_VLAN_RX */ "rx-vlan-hw-parse",
+ /* NETIF_F_HW_VLAN_FILTER */ "rx-vlan-filter",
+ /* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged",
+ /* NETIF_F_GSO */ "tx-generic-segmentation",
+ /* NETIF_F_LLTX */ "tx-lockless",
+ /* NETIF_F_NETNS_LOCAL */ "netns-local",
+ /* NETIF_F_GRO */ "rx-gro",
+ /* NETIF_F_LRO */ "rx-lro",
+
+ /* NETIF_F_TSO */ "tx-tcp-segmentation",
+ /* NETIF_F_UFO */ "tx-udp-fragmentation",
+ /* NETIF_F_GSO_ROBUST */ "tx-gso-robust",
+ /* NETIF_F_TSO_ECN */ "tx-tcp-ecn-segmentation",
+ /* NETIF_F_TSO6 */ "tx-tcp6-segmentation",
+ /* NETIF_F_FSO */ "tx-fcoe-segmentation",
+ "",
+ "",
+
+ /* NETIF_F_FCOE_CRC */ "tx-checksum-fcoe-crc",
+ /* NETIF_F_SCTP_CSUM */ "tx-checksum-sctp",
+ /* NETIF_F_FCOE_MTU */ "fcoe-mtu",
+ /* NETIF_F_NTUPLE */ "rx-ntuple-filter",
+ /* NETIF_F_RXHASH */ "rx-hashing",
+ /* NETIF_F_RXCSUM */ "rx-checksum",
+ "",
+ "",
+};
+
+static int __ethtool_get_sset_count(struct net_device *dev, int sset)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (sset == ETH_SS_FEATURES)
+ return ARRAY_SIZE(netdev_features_strings);
+
+ if (ops && ops->get_sset_count && ops->get_strings)
+ return ops->get_sset_count(dev, sset);
+ else
+ return -EOPNOTSUPP;
+}
+
+static void __ethtool_get_strings(struct net_device *dev,
+ u32 stringset, u8 *data)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (stringset == ETH_SS_FEATURES)
+ memcpy(data, netdev_features_strings,
+ sizeof(netdev_features_strings));
+ else
+ /* ops->get_strings is valid because checked earlier */
+ ops->get_strings(dev, stringset, data);
+}
+
+static u32 ethtool_get_feature_mask(u32 eth_cmd)
+{
+ /* feature masks of legacy discrete ethtool ops */
+
+ switch (eth_cmd) {
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_STXCSUM:
+ return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM;
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_SRXCSUM:
+ return NETIF_F_RXCSUM;
+ case ETHTOOL_GSG:
+ case ETHTOOL_SSG:
+ return NETIF_F_SG;
+ case ETHTOOL_GTSO:
+ case ETHTOOL_STSO:
+ return NETIF_F_ALL_TSO;
+ case ETHTOOL_GUFO:
+ case ETHTOOL_SUFO:
+ return NETIF_F_UFO;
+ case ETHTOOL_GGSO:
+ case ETHTOOL_SGSO:
+ return NETIF_F_GSO;
+ case ETHTOOL_GGRO:
+ case ETHTOOL_SGRO:
+ return NETIF_F_GRO;
+ default:
+ BUG();
+ }
+}
+
+static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops)
+ return NULL;
+
+ switch (ethcmd) {
+ case ETHTOOL_GTXCSUM:
+ return ops->get_tx_csum;
+ case ETHTOOL_GRXCSUM:
+ return ops->get_rx_csum;
+ case ETHTOOL_SSG:
+ return ops->get_sg;
+ case ETHTOOL_STSO:
+ return ops->get_tso;
+ case ETHTOOL_SUFO:
+ return ops->get_ufo;
+ default:
+ return NULL;
+ }
+}
+
+static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev)
+{
+ return !!(dev->features & NETIF_F_ALL_CSUM);
+}
+
+static int ethtool_get_one_feature(struct net_device *dev,
+ char __user *useraddr, u32 ethcmd)
+{
+ u32 mask = ethtool_get_feature_mask(ethcmd);
+ struct ethtool_value edata = {
+ .cmd = ethcmd,
+ .data = !!(dev->features & mask),
+ };
+
+ /* compatibility with discrete get_ ops */
+ if (!(dev->hw_features & mask)) {
+ u32 (*actor)(struct net_device *);
+
+ actor = __ethtool_get_one_feature_actor(dev, ethcmd);
+
+ /* bug compatibility with old get_rx_csum */
+ if (ethcmd == ETHTOOL_GRXCSUM && !actor)
+ actor = __ethtool_get_rx_csum_oldbug;
+
+ if (actor)
+ edata.data = actor(dev);
+ }
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int __ethtool_set_tx_csum(struct net_device *dev, u32 data);
+static int __ethtool_set_rx_csum(struct net_device *dev, u32 data);
+static int __ethtool_set_sg(struct net_device *dev, u32 data);
+static int __ethtool_set_tso(struct net_device *dev, u32 data);
+static int __ethtool_set_ufo(struct net_device *dev, u32 data);
+
+static int ethtool_set_one_feature(struct net_device *dev,
+ void __user *useraddr, u32 ethcmd)
+{
+ struct ethtool_value edata;
+ u32 mask;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ mask = ethtool_get_feature_mask(ethcmd);
+ mask &= dev->hw_features;
+ if (mask) {
+ if (edata.data)
+ dev->wanted_features |= mask;
+ else
+ dev->wanted_features &= ~mask;
+
+ netdev_update_features(dev);
+ return 0;
+ }
+
+ /* Driver is not converted to ndo_fix_features or does not
+ * support changing this offload. In the latter case it won't
+ * have corresponding ethtool_ops field set.
+ *
+ * Following part is to be removed after all drivers advertise
+ * their changeable features in netdev->hw_features and stop
+ * using discrete offload setting ops.
+ */
+
+ switch (ethcmd) {
+ case ETHTOOL_STXCSUM:
+ return __ethtool_set_tx_csum(dev, edata.data);
+ case ETHTOOL_SRXCSUM:
+ return __ethtool_set_rx_csum(dev, edata.data);
+ case ETHTOOL_SSG:
+ return __ethtool_set_sg(dev, edata.data);
+ case ETHTOOL_STSO:
+ return __ethtool_set_tso(dev, edata.data);
+ case ETHTOOL_SUFO:
+ return __ethtool_set_ufo(dev, edata.data);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
+{
+ u32 changed;
+
+ if (data & ~flags_dup_features)
+ return -EINVAL;
+
+ /* legacy set_flags() op */
+ if (dev->ethtool_ops->set_flags) {
+ if (unlikely(dev->hw_features & flags_dup_features))
+ netdev_warn(dev,
+ "driver BUG: mixed hw_features and set_flags()\n");
+ return dev->ethtool_ops->set_flags(dev, data);
+ }
+
+ /* allow changing only bits set in hw_features */
+ changed = (data ^ dev->wanted_features) & flags_dup_features;
+ if (changed & ~dev->hw_features)
+ return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
+
+ dev->wanted_features =
+ (dev->wanted_features & ~changed) | data;
+
+ netdev_update_features(dev);
+
+ return 0;
+}
+
static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
{
struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET };
@@ -251,14 +621,10 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
void __user *useraddr)
{
struct ethtool_sset_info info;
- const struct ethtool_ops *ops = dev->ethtool_ops;
u64 sset_mask;
int i, idx = 0, n_bits = 0, ret, rc;
u32 *info_buf = NULL;
- if (!ops->get_sset_count)
- return -EOPNOTSUPP;
-
if (copy_from_user(&info, useraddr, sizeof(info)))
return -EFAULT;
@@ -285,7 +651,7 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
if (!(sset_mask & (1ULL << i)))
continue;
- rc = ops->get_sset_count(dev, i);
+ rc = __ethtool_get_sset_count(dev, i);
if (rc >= 0) {
info.sset_mask |= (1ULL << i);
info_buf[idx++] = rc;
@@ -817,7 +1183,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
if (regs.len > reglen)
regs.len = reglen;
- regbuf = vmalloc(reglen);
+ regbuf = vzalloc(reglen);
if (!regbuf)
return -ENOMEM;
@@ -1091,6 +1457,9 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data)
{
int err;
+ if (data && !(dev->features & NETIF_F_ALL_CSUM))
+ return -EINVAL;
+
if (!data && dev->ethtool_ops->set_tso) {
err = dev->ethtool_ops->set_tso(dev, 0);
if (err)
@@ -1105,145 +1474,55 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data)
return dev->ethtool_ops->set_sg(dev, data);
}
-static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr)
+static int __ethtool_set_tx_csum(struct net_device *dev, u32 data)
{
- struct ethtool_value edata;
int err;
if (!dev->ethtool_ops->set_tx_csum)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (!edata.data && dev->ethtool_ops->set_sg) {
+ if (!data && dev->ethtool_ops->set_sg) {
err = __ethtool_set_sg(dev, 0);
if (err)
return err;
}
- return dev->ethtool_ops->set_tx_csum(dev, edata.data);
+ return dev->ethtool_ops->set_tx_csum(dev, data);
}
-EXPORT_SYMBOL(ethtool_op_set_tx_csum);
-static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr)
+static int __ethtool_set_rx_csum(struct net_device *dev, u32 data)
{
- struct ethtool_value edata;
-
if (!dev->ethtool_ops->set_rx_csum)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (!edata.data && dev->ethtool_ops->set_sg)
+ if (!data)
dev->features &= ~NETIF_F_GRO;
- return dev->ethtool_ops->set_rx_csum(dev, edata.data);
+ return dev->ethtool_ops->set_rx_csum(dev, data);
}
-static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
+static int __ethtool_set_tso(struct net_device *dev, u32 data)
{
- struct ethtool_value edata;
-
- if (!dev->ethtool_ops->set_sg)
- return -EOPNOTSUPP;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (edata.data &&
- !(dev->features & NETIF_F_ALL_CSUM))
- return -EINVAL;
-
- return __ethtool_set_sg(dev, edata.data);
-}
-
-static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
if (!dev->ethtool_ops->set_tso)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (edata.data && !(dev->features & NETIF_F_SG))
+ if (data && !(dev->features & NETIF_F_SG))
return -EINVAL;
- return dev->ethtool_ops->set_tso(dev, edata.data);
+ return dev->ethtool_ops->set_tso(dev, data);
}
-static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
+static int __ethtool_set_ufo(struct net_device *dev, u32 data)
{
- struct ethtool_value edata;
-
if (!dev->ethtool_ops->set_ufo)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
- if (edata.data && !(dev->features & NETIF_F_SG))
+ if (data && !(dev->features & NETIF_F_SG))
return -EINVAL;
- if (edata.data && !((dev->features & NETIF_F_GEN_CSUM) ||
+ if (data && !((dev->features & NETIF_F_GEN_CSUM) ||
(dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
== (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)))
return -EINVAL;
- return dev->ethtool_ops->set_ufo(dev, edata.data);
-}
-
-static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GGSO };
-
- edata.data = dev->features & NETIF_F_GSO;
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
- if (edata.data)
- dev->features |= NETIF_F_GSO;
- else
- dev->features &= ~NETIF_F_GSO;
- return 0;
-}
-
-static int ethtool_get_gro(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata = { ETHTOOL_GGRO };
-
- edata.data = dev->features & NETIF_F_GRO;
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
- return -EFAULT;
- return 0;
-}
-
-static int ethtool_set_gro(struct net_device *dev, char __user *useraddr)
-{
- struct ethtool_value edata;
-
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
- return -EFAULT;
-
- if (edata.data) {
- u32 rxcsum = dev->ethtool_ops->get_rx_csum ?
- dev->ethtool_ops->get_rx_csum(dev) :
- ethtool_op_get_rx_csum(dev);
-
- if (!rxcsum)
- return -EINVAL;
- dev->features |= NETIF_F_GRO;
- } else
- dev->features &= ~NETIF_F_GRO;
-
- return 0;
+ return dev->ethtool_ops->set_ufo(dev, data);
}
static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
@@ -1287,17 +1566,13 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gstrings gstrings;
- const struct ethtool_ops *ops = dev->ethtool_ops;
u8 *data;
int ret;
- if (!ops->get_strings || !ops->get_sset_count)
- return -EOPNOTSUPP;
-
if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
return -EFAULT;
- ret = ops->get_sset_count(dev, gstrings.string_set);
+ ret = __ethtool_get_sset_count(dev, gstrings.string_set);
if (ret < 0)
return ret;
@@ -1307,7 +1582,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
if (!data)
return -ENOMEM;
- ops->get_strings(dev, gstrings.string_set, data);
+ __ethtool_get_strings(dev, gstrings.string_set, data);
ret = -EFAULT;
if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
@@ -1317,7 +1592,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
goto out;
ret = 0;
- out:
+out:
kfree(data);
return ret;
}
@@ -1458,7 +1733,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
void __user *useraddr = ifr->ifr_data;
u32 ethcmd;
int rc;
- unsigned long old_features;
+ u32 old_features;
if (!dev || !netif_device_present(dev))
return -ENODEV;
@@ -1500,6 +1775,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GRXCLSRLCNT:
case ETHTOOL_GRXCLSRULE:
case ETHTOOL_GRXCLSRLALL:
+ case ETHTOOL_GFEATURES:
break;
default:
if (!capable(CAP_NET_ADMIN))
@@ -1570,42 +1846,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SPAUSEPARAM:
rc = ethtool_set_pauseparam(dev, useraddr);
break;
- case ETHTOOL_GRXCSUM:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_rx_csum ?
- dev->ethtool_ops->get_rx_csum :
- ethtool_op_get_rx_csum));
- break;
- case ETHTOOL_SRXCSUM:
- rc = ethtool_set_rx_csum(dev, useraddr);
- break;
- case ETHTOOL_GTXCSUM:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_tx_csum ?
- dev->ethtool_ops->get_tx_csum :
- ethtool_op_get_tx_csum));
- break;
- case ETHTOOL_STXCSUM:
- rc = ethtool_set_tx_csum(dev, useraddr);
- break;
- case ETHTOOL_GSG:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_sg ?
- dev->ethtool_ops->get_sg :
- ethtool_op_get_sg));
- break;
- case ETHTOOL_SSG:
- rc = ethtool_set_sg(dev, useraddr);
- break;
- case ETHTOOL_GTSO:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_tso ?
- dev->ethtool_ops->get_tso :
- ethtool_op_get_tso));
- break;
- case ETHTOOL_STSO:
- rc = ethtool_set_tso(dev, useraddr);
- break;
case ETHTOOL_TEST:
rc = ethtool_self_test(dev, useraddr);
break;
@@ -1621,21 +1861,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GPERMADDR:
rc = ethtool_get_perm_addr(dev, useraddr);
break;
- case ETHTOOL_GUFO:
- rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_ufo ?
- dev->ethtool_ops->get_ufo :
- ethtool_op_get_ufo));
- break;
- case ETHTOOL_SUFO:
- rc = ethtool_set_ufo(dev, useraddr);
- break;
- case ETHTOOL_GGSO:
- rc = ethtool_get_gso(dev, useraddr);
- break;
- case ETHTOOL_SGSO:
- rc = ethtool_set_gso(dev, useraddr);
- break;
case ETHTOOL_GFLAGS:
rc = ethtool_get_value(dev, useraddr, ethcmd,
(dev->ethtool_ops->get_flags ?
@@ -1643,8 +1868,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
ethtool_op_get_flags));
break;
case ETHTOOL_SFLAGS:
- rc = ethtool_set_value(dev, useraddr,
- dev->ethtool_ops->set_flags);
+ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
break;
case ETHTOOL_GPFLAGS:
rc = ethtool_get_value(dev, useraddr, ethcmd,
@@ -1666,12 +1890,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SRXCLSRLINS:
rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
break;
- case ETHTOOL_GGRO:
- rc = ethtool_get_gro(dev, useraddr);
- break;
- case ETHTOOL_SGRO:
- rc = ethtool_set_gro(dev, useraddr);
- break;
case ETHTOOL_FLASHDEV:
rc = ethtool_flash_device(dev, useraddr);
break;
@@ -1693,6 +1911,30 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SRXFHINDIR:
rc = ethtool_set_rxfh_indir(dev, useraddr);
break;
+ case ETHTOOL_GFEATURES:
+ rc = ethtool_get_features(dev, useraddr);
+ break;
+ case ETHTOOL_SFEATURES:
+ rc = ethtool_set_features(dev, useraddr);
+ break;
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GTSO:
+ case ETHTOOL_GUFO:
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GGRO:
+ rc = ethtool_get_one_feature(dev, useraddr, ethcmd);
+ break;
+ case ETHTOOL_STXCSUM:
+ case ETHTOOL_SRXCSUM:
+ case ETHTOOL_SSG:
+ case ETHTOOL_STSO:
+ case ETHTOOL_SUFO:
+ case ETHTOOL_SGSO:
+ case ETHTOOL_SGRO:
+ rc = ethtool_set_one_feature(dev, useraddr, ethcmd);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index a20e5d3bbfa..8248ebb5891 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -181,13 +181,13 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
{
int ret = 0;
- if (rule->iifindex && (rule->iifindex != fl->iif))
+ if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
goto out;
- if (rule->oifindex && (rule->oifindex != fl->oif))
+ if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
goto out;
- if ((rule->mark ^ fl->mark) & rule->mark_mask)
+ if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
goto out;
ret = ops->match(rule, fl, flags);
diff --git a/net/core/filter.c b/net/core/filter.c
index afc58374ca9..232b1873bb2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -142,14 +142,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
if (err)
return err;
- rcu_read_lock_bh();
- filter = rcu_dereference_bh(sk->sk_filter);
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
if (filter) {
unsigned int pkt_len = sk_run_filter(skb, filter->insns);
err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return err;
}
diff --git a/net/core/flow.c b/net/core/flow.c
index 127c8a7ffd6..990703b8863 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -172,9 +172,9 @@ static void flow_new_hash_rnd(struct flow_cache *fc,
static u32 flow_hash_code(struct flow_cache *fc,
struct flow_cache_percpu *fcp,
- struct flowi *key)
+ const struct flowi *key)
{
- u32 *k = (u32 *) key;
+ const u32 *k = (const u32 *) key;
return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
& (flow_cache_hash_size(fc) - 1);
@@ -186,17 +186,17 @@ typedef unsigned long flow_compare_t;
* important assumptions that we can here, such as alignment and
* constant size.
*/
-static int flow_key_compare(struct flowi *key1, struct flowi *key2)
+static int flow_key_compare(const struct flowi *key1, const struct flowi *key2)
{
- flow_compare_t *k1, *k1_lim, *k2;
+ const flow_compare_t *k1, *k1_lim, *k2;
const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);
BUILD_BUG_ON(sizeof(struct flowi) % sizeof(flow_compare_t));
- k1 = (flow_compare_t *) key1;
+ k1 = (const flow_compare_t *) key1;
k1_lim = k1 + n_elem;
- k2 = (flow_compare_t *) key2;
+ k2 = (const flow_compare_t *) key2;
do {
if (*k1++ != *k2++)
@@ -207,7 +207,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
}
struct flow_cache_object *
-flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
+flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
flow_resolve_t resolver, void *ctx)
{
struct flow_cache *fc = &flow_cache_global;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 60a90291342..799f06e03a2 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -316,7 +316,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
{
size_t size = entries * sizeof(struct neighbour *);
struct neigh_hash_table *ret;
- struct neighbour **buckets;
+ struct neighbour __rcu **buckets;
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
@@ -324,14 +324,14 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
if (size <= PAGE_SIZE)
buckets = kzalloc(size, GFP_ATOMIC);
else
- buckets = (struct neighbour **)
+ buckets = (struct neighbour __rcu **)
__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
get_order(size));
if (!buckets) {
kfree(ret);
return NULL;
}
- rcu_assign_pointer(ret->hash_buckets, buckets);
+ ret->hash_buckets = buckets;
ret->hash_mask = entries - 1;
get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
return ret;
@@ -343,7 +343,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
struct neigh_hash_table,
rcu);
size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *);
- struct neighbour **buckets = nht->hash_buckets;
+ struct neighbour __rcu **buckets = nht->hash_buckets;
if (size <= PAGE_SIZE)
kfree(buckets);
@@ -1540,7 +1540,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
panic("cannot create neighbour proc dir entry");
#endif
- tbl->nht = neigh_hash_alloc(8);
+ RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(8));
phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
@@ -1602,7 +1602,8 @@ int neigh_table_clear(struct neigh_table *tbl)
}
write_unlock(&neigh_tbl_lock);
- call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu);
+ call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
+ neigh_hash_free_rcu);
tbl->nht = NULL;
kfree(tbl->phash_buckets);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e23c01be5a5..5ceb257e860 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -99,7 +99,7 @@ NETDEVICE_SHOW(addr_assign_type, fmt_dec);
NETDEVICE_SHOW(addr_len, fmt_dec);
NETDEVICE_SHOW(iflink, fmt_dec);
NETDEVICE_SHOW(ifindex, fmt_dec);
-NETDEVICE_SHOW(features, fmt_long_hex);
+NETDEVICE_SHOW(features, fmt_hex);
NETDEVICE_SHOW(type, fmt_dec);
NETDEVICE_SHOW(link_mode, fmt_dec);
@@ -295,6 +295,20 @@ static ssize_t show_ifalias(struct device *dev,
return ret;
}
+NETDEVICE_SHOW(group, fmt_dec);
+
+static int change_group(struct net_device *net, unsigned long new_group)
+{
+ dev_set_group(net, (int) new_group);
+ return 0;
+}
+
+static ssize_t store_group(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_group);
+}
+
static struct device_attribute net_class_attributes[] = {
__ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL),
__ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
@@ -316,6 +330,7 @@ static struct device_attribute net_class_attributes[] = {
__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
store_tx_queue_len),
+ __ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group),
{}
};
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 02dc2cbcbe8..06be2431753 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -193,6 +193,17 @@ void netpoll_poll_dev(struct net_device *dev)
poll_napi(dev);
+ if (dev->priv_flags & IFF_SLAVE) {
+ if (dev->npinfo) {
+ struct net_device *bond_dev = dev->master;
+ struct sk_buff *skb;
+ while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) {
+ skb->dev = bond_dev;
+ skb_queue_tail(&bond_dev->npinfo->arp_tx, skb);
+ }
+ }
+ }
+
service_arp_queue(dev->npinfo);
zap_completion_queue();
@@ -313,9 +324,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
tries > 0; --tries) {
if (__netif_tx_trylock(txq)) {
if (!netif_tx_queue_stopped(txq)) {
- dev->priv_flags |= IFF_IN_NETPOLL;
status = ops->ndo_start_xmit(skb, dev);
- dev->priv_flags &= ~IFF_IN_NETPOLL;
if (status == NETDEV_TX_OK)
txq_trans_update(txq);
}
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index a9e7fc4c461..0c55eaa70e3 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -251,6 +251,7 @@ struct pktgen_dev {
int max_pkt_size; /* = ETH_ZLEN; */
int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
int nfrags;
+ struct page *page;
u64 delay; /* nano-seconds */
__u64 count; /* Default No packets to send */
@@ -1134,6 +1135,10 @@ static ssize_t pktgen_if_write(struct file *file,
if (node_possible(value)) {
pkt_dev->node = value;
sprintf(pg_result, "OK: node=%d", pkt_dev->node);
+ if (pkt_dev->page) {
+ put_page(pkt_dev->page);
+ pkt_dev->page = NULL;
+ }
}
else
sprintf(pg_result, "ERROR: node not possible");
@@ -2605,6 +2610,89 @@ static inline __be16 build_tci(unsigned int id, unsigned int cfi,
return htons(id | (cfi << 12) | (prio << 13));
}
+static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
+ int datalen)
+{
+ struct timeval timestamp;
+ struct pktgen_hdr *pgh;
+
+ pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh));
+ datalen -= sizeof(*pgh);
+
+ if (pkt_dev->nfrags <= 0) {
+ memset(skb_put(skb, datalen), 0, datalen);
+ } else {
+ int frags = pkt_dev->nfrags;
+ int i, len;
+
+
+ if (frags > MAX_SKB_FRAGS)
+ frags = MAX_SKB_FRAGS;
+ len = datalen - frags * PAGE_SIZE;
+ if (len > 0) {
+ memset(skb_put(skb, len), 0, len);
+ datalen = frags * PAGE_SIZE;
+ }
+
+ i = 0;
+ while (datalen > 0) {
+ if (unlikely(!pkt_dev->page)) {
+ int node = numa_node_id();
+
+ if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE))
+ node = pkt_dev->node;
+ pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!pkt_dev->page)
+ break;
+ }
+ skb_shinfo(skb)->frags[i].page = pkt_dev->page;
+ get_page(pkt_dev->page);
+ skb_shinfo(skb)->frags[i].page_offset = 0;
+ skb_shinfo(skb)->frags[i].size =
+ (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
+ datalen -= skb_shinfo(skb)->frags[i].size;
+ skb->len += skb_shinfo(skb)->frags[i].size;
+ skb->data_len += skb_shinfo(skb)->frags[i].size;
+ i++;
+ skb_shinfo(skb)->nr_frags = i;
+ }
+
+ while (i < frags) {
+ int rem;
+
+ if (i == 0)
+ break;
+
+ rem = skb_shinfo(skb)->frags[i - 1].size / 2;
+ if (rem == 0)
+ break;
+
+ skb_shinfo(skb)->frags[i - 1].size -= rem;
+
+ skb_shinfo(skb)->frags[i] =
+ skb_shinfo(skb)->frags[i - 1];
+ get_page(skb_shinfo(skb)->frags[i].page);
+ skb_shinfo(skb)->frags[i].page =
+ skb_shinfo(skb)->frags[i - 1].page;
+ skb_shinfo(skb)->frags[i].page_offset +=
+ skb_shinfo(skb)->frags[i - 1].size;
+ skb_shinfo(skb)->frags[i].size = rem;
+ i++;
+ skb_shinfo(skb)->nr_frags = i;
+ }
+ }
+
+ /* Stamp the time, and sequence number,
+ * convert them to network byte order
+ */
+ pgh->pgh_magic = htonl(PKTGEN_MAGIC);
+ pgh->seq_num = htonl(pkt_dev->seq_num);
+
+ do_gettimeofday(&timestamp);
+ pgh->tv_sec = htonl(timestamp.tv_sec);
+ pgh->tv_usec = htonl(timestamp.tv_usec);
+}
+
static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
struct pktgen_dev *pkt_dev)
{
@@ -2613,7 +2701,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
struct udphdr *udph;
int datalen, iplen;
struct iphdr *iph;
- struct pktgen_hdr *pgh = NULL;
__be16 protocol = htons(ETH_P_IP);
__be32 *mpls;
__be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
@@ -2729,76 +2816,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
pkt_dev->pkt_overhead);
skb->dev = odev;
skb->pkt_type = PACKET_HOST;
-
- if (pkt_dev->nfrags <= 0) {
- pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
- memset(pgh + 1, 0, datalen - sizeof(struct pktgen_hdr));
- } else {
- int frags = pkt_dev->nfrags;
- int i, len;
-
- pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
-
- if (frags > MAX_SKB_FRAGS)
- frags = MAX_SKB_FRAGS;
- if (datalen > frags * PAGE_SIZE) {
- len = datalen - frags * PAGE_SIZE;
- memset(skb_put(skb, len), 0, len);
- datalen = frags * PAGE_SIZE;
- }
-
- i = 0;
- while (datalen > 0) {
- struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
- skb_shinfo(skb)->frags[i].page = page;
- skb_shinfo(skb)->frags[i].page_offset = 0;
- skb_shinfo(skb)->frags[i].size =
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
- datalen -= skb_shinfo(skb)->frags[i].size;
- skb->len += skb_shinfo(skb)->frags[i].size;
- skb->data_len += skb_shinfo(skb)->frags[i].size;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
-
- while (i < frags) {
- int rem;
-
- if (i == 0)
- break;
-
- rem = skb_shinfo(skb)->frags[i - 1].size / 2;
- if (rem == 0)
- break;
-
- skb_shinfo(skb)->frags[i - 1].size -= rem;
-
- skb_shinfo(skb)->frags[i] =
- skb_shinfo(skb)->frags[i - 1];
- get_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->frags[i].page =
- skb_shinfo(skb)->frags[i - 1].page;
- skb_shinfo(skb)->frags[i].page_offset +=
- skb_shinfo(skb)->frags[i - 1].size;
- skb_shinfo(skb)->frags[i].size = rem;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
- }
-
- /* Stamp the time, and sequence number,
- * convert them to network byte order
- */
- if (pgh) {
- struct timeval timestamp;
-
- pgh->pgh_magic = htonl(PKTGEN_MAGIC);
- pgh->seq_num = htonl(pkt_dev->seq_num);
-
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
- }
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
#ifdef CONFIG_XFRM
if (!process_ipsec(pkt_dev, skb, protocol))
@@ -2980,7 +2998,6 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
struct udphdr *udph;
int datalen;
struct ipv6hdr *iph;
- struct pktgen_hdr *pgh = NULL;
__be16 protocol = htons(ETH_P_IPV6);
__be32 *mpls;
__be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
@@ -3083,75 +3100,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
skb->dev = odev;
skb->pkt_type = PACKET_HOST;
- if (pkt_dev->nfrags <= 0)
- pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
- else {
- int frags = pkt_dev->nfrags;
- int i;
-
- pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
-
- if (frags > MAX_SKB_FRAGS)
- frags = MAX_SKB_FRAGS;
- if (datalen > frags * PAGE_SIZE) {
- skb_put(skb, datalen - frags * PAGE_SIZE);
- datalen = frags * PAGE_SIZE;
- }
-
- i = 0;
- while (datalen > 0) {
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- skb_shinfo(skb)->frags[i].page = page;
- skb_shinfo(skb)->frags[i].page_offset = 0;
- skb_shinfo(skb)->frags[i].size =
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
- datalen -= skb_shinfo(skb)->frags[i].size;
- skb->len += skb_shinfo(skb)->frags[i].size;
- skb->data_len += skb_shinfo(skb)->frags[i].size;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
-
- while (i < frags) {
- int rem;
-
- if (i == 0)
- break;
-
- rem = skb_shinfo(skb)->frags[i - 1].size / 2;
- if (rem == 0)
- break;
-
- skb_shinfo(skb)->frags[i - 1].size -= rem;
-
- skb_shinfo(skb)->frags[i] =
- skb_shinfo(skb)->frags[i - 1];
- get_page(skb_shinfo(skb)->frags[i].page);
- skb_shinfo(skb)->frags[i].page =
- skb_shinfo(skb)->frags[i - 1].page;
- skb_shinfo(skb)->frags[i].page_offset +=
- skb_shinfo(skb)->frags[i - 1].size;
- skb_shinfo(skb)->frags[i].size = rem;
- i++;
- skb_shinfo(skb)->nr_frags = i;
- }
- }
-
- /* Stamp the time, and sequence number,
- * convert them to network byte order
- * should we update cloned packets too ?
- */
- if (pgh) {
- struct timeval timestamp;
-
- pgh->pgh_magic = htonl(PKTGEN_MAGIC);
- pgh->seq_num = htonl(pkt_dev->seq_num);
-
- do_gettimeofday(&timestamp);
- pgh->tv_sec = htonl(timestamp.tv_sec);
- pgh->tv_usec = htonl(timestamp.tv_usec);
- }
- /* pkt_dev->seq_num++; FF: you really mean this? */
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
return skb;
}
@@ -3321,7 +3270,7 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
pkt_dev->started_at);
ktime_t idle = ns_to_ktime(pkt_dev->idle_acc);
- p += sprintf(p, "OK: %llu(c%llu+d%llu) nsec, %llu (%dbyte,%dfrags)\n",
+ p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
(unsigned long long)ktime_to_us(elapsed),
(unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)),
(unsigned long long)ktime_to_us(idle),
@@ -3884,6 +3833,8 @@ static int pktgen_remove_device(struct pktgen_thread *t,
free_SAs(pkt_dev);
#endif
vfree(pkt_dev->flows);
+ if (pkt_dev->page)
+ put_page(pkt_dev->page);
kfree(pkt_dev);
return 0;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a5f7535aab5..49f7ea5b4c7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -868,6 +868,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+ NLA_PUT_U32(skb, IFLA_GROUP, dev->group);
if (dev->ifindex != dev->iflink)
NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
@@ -1035,6 +1036,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) },
[IFLA_MTU] = { .type = NLA_U32 },
[IFLA_LINK] = { .type = NLA_U32 },
+ [IFLA_MASTER] = { .type = NLA_U32 },
[IFLA_TXQLEN] = { .type = NLA_U32 },
[IFLA_WEIGHT] = { .type = NLA_U32 },
[IFLA_OPERSTATE] = { .type = NLA_U8 },
@@ -1121,8 +1123,7 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
return -EOPNOTSUPP;
if (af_ops->validate_link_af) {
- err = af_ops->validate_link_af(dev,
- tb[IFLA_AF_SPEC]);
+ err = af_ops->validate_link_af(dev, af);
if (err < 0)
return err;
}
@@ -1178,6 +1179,41 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
return err;
}
+static int do_set_master(struct net_device *dev, int ifindex)
+{
+ struct net_device *master_dev;
+ const struct net_device_ops *ops;
+ int err;
+
+ if (dev->master) {
+ if (dev->master->ifindex == ifindex)
+ return 0;
+ ops = dev->master->netdev_ops;
+ if (ops->ndo_del_slave) {
+ err = ops->ndo_del_slave(dev->master, dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+
+ if (ifindex) {
+ master_dev = __dev_get_by_index(dev_net(dev), ifindex);
+ if (!master_dev)
+ return -EINVAL;
+ ops = master_dev->netdev_ops;
+ if (ops->ndo_add_slave) {
+ err = ops->ndo_add_slave(master_dev, dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
struct nlattr **tb, char *ifname, int modified)
{
@@ -1265,6 +1301,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
modified = 1;
}
+ if (tb[IFLA_GROUP]) {
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ modified = 1;
+ }
+
/*
* Interface selected by interface index but interface
* name provided implies that a name change has been
@@ -1296,6 +1337,13 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
goto errout;
}
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
+ if (err)
+ goto errout;
+ modified = 1;
+ }
+
if (tb[IFLA_TXQLEN])
dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
@@ -1542,6 +1590,8 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
if (tb[IFLA_LINKMODE])
dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+ if (tb[IFLA_GROUP])
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
return dev;
@@ -1552,6 +1602,24 @@ err:
}
EXPORT_SYMBOL(rtnl_create_link);
+static int rtnl_group_changelink(struct net *net, int group,
+ struct ifinfomsg *ifm,
+ struct nlattr **tb)
+{
+ struct net_device *dev;
+ int err;
+
+ for_each_netdev(net, dev) {
+ if (dev->group == group) {
+ err = do_setlink(dev, ifm, tb, NULL, 0);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
struct net *net = sock_net(skb->sk);
@@ -1579,10 +1647,12 @@ replay:
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
- else if (ifname[0])
- dev = __dev_get_by_name(net, ifname);
- else
- dev = NULL;
+ else {
+ if (ifname[0])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ dev = NULL;
+ }
err = validate_linkmsg(dev, tb);
if (err < 0)
@@ -1646,8 +1716,13 @@ replay:
return do_setlink(dev, ifm, tb, ifname, modified);
}
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
+ return rtnl_group_changelink(net,
+ nla_get_u32(tb[IFLA_GROUP]),
+ ifm, tb);
return -ENODEV;
+ }
if (ifm->ifi_index)
return -EOPNOTSUPP;
@@ -1672,6 +1747,9 @@ replay:
snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
dest_net = rtnl_link_get_net(net, tb);
+ if (IS_ERR(dest_net))
+ return PTR_ERR(dest_net);
+
dev = rtnl_create_link(net, dest_net, ifname, ops, tb);
if (IS_ERR(dev))
@@ -1820,7 +1898,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN))
return -EPERM;
- if (kind == 2 && (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
+ if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
struct sock *rtnl;
rtnl_dumpit_func dumpit;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d31bb36ae0d..1eb526a848f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -210,6 +210,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
shinfo = skb_shinfo(skb);
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(&shinfo->dataref, 1);
+ kmemcheck_annotate_variable(shinfo->destructor_arg);
if (fclone) {
struct sk_buff *child = skb + 1;
@@ -2433,8 +2434,6 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
return -ENOMEM;
/* initialize the next frag */
- sk->sk_sndmsg_page = page;
- sk->sk_sndmsg_off = 0;
skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
skb->truesize += PAGE_SIZE;
atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
@@ -2454,7 +2453,6 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
return -EFAULT;
/* copy was successful so update the size parameters */
- sk->sk_sndmsg_off += copy;
frag->size += copy;
skb->len += copy;
skb->data_len += copy;
@@ -2497,7 +2495,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);
* a pointer to the first in a list of new skbs for the segments.
* In case of error it returns ERR_PTR(err).
*/
-struct sk_buff *skb_segment(struct sk_buff *skb, int features)
+struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
@@ -2507,7 +2505,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
unsigned int offset = doffset;
unsigned int headroom;
unsigned int len;
- int sg = features & NETIF_F_SG;
+ int sg = !!(features & NETIF_F_SG);
int nfrags = skb_shinfo(skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
@@ -2744,8 +2742,12 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
merge:
if (offset > headlen) {
- skbinfo->frags[0].page_offset += offset - headlen;
- skbinfo->frags[0].size -= offset - headlen;
+ unsigned int eat = offset - headlen;
+
+ skbinfo->frags[0].page_offset += eat;
+ skbinfo->frags[0].size -= eat;
+ skb->data_len -= eat;
+ skb->len -= eat;
offset = headlen;
}
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index d900ab99814..3609eacaf4c 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2008, Intel Corporation.
+ * Copyright (c) 2008-2011, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -583,7 +583,7 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlattr **tb,
u8 up, idtype;
int ret = -EINVAL;
- if (!tb[DCB_ATTR_APP] || !netdev->dcbnl_ops->getapp)
+ if (!tb[DCB_ATTR_APP])
goto out;
ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP],
@@ -604,7 +604,16 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlattr **tb,
goto out;
id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]);
- up = netdev->dcbnl_ops->getapp(netdev, idtype, id);
+
+ if (netdev->dcbnl_ops->getapp) {
+ up = netdev->dcbnl_ops->getapp(netdev, idtype, id);
+ } else {
+ struct dcb_app app = {
+ .selector = idtype,
+ .protocol = id,
+ };
+ up = dcb_getapp(netdev, &app);
+ }
/* send this back */
dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
@@ -617,6 +626,9 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlattr **tb,
dcb->cmd = DCB_CMD_GAPP;
app_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_APP);
+ if (!app_nest)
+ goto out_cancel;
+
ret = nla_put_u8(dcbnl_skb, DCB_APP_ATTR_IDTYPE, idtype);
if (ret)
goto out_cancel;
@@ -1181,7 +1193,7 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlattr **tb,
goto err;
}
- if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setets) {
+ if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) {
struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]);
err = ops->ieee_setpfc(netdev, pfc);
if (err)
@@ -1212,6 +1224,59 @@ err:
return err;
}
+static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb,
+ int app_nested_type, int app_info_type,
+ int app_entry_type)
+{
+ struct dcb_peer_app_info info;
+ struct dcb_app *table = NULL;
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ u16 app_count;
+ int err;
+
+
+ /**
+ * retrieve the peer app configuration from the driver. If the driver
+ * handlers fail exit without doing anything
+ */
+ err = ops->peer_getappinfo(netdev, &info, &app_count);
+ if (!err && app_count) {
+ table = kmalloc(sizeof(struct dcb_app) * app_count, GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ err = ops->peer_getapptable(netdev, table);
+ }
+
+ if (!err) {
+ u16 i;
+ struct nlattr *app;
+
+ /**
+ * build the message, from here on the only possible failure
+ * is due to the skb size
+ */
+ err = -EMSGSIZE;
+
+ app = nla_nest_start(skb, app_nested_type);
+ if (!app)
+ goto nla_put_failure;
+
+ if (app_info_type)
+ NLA_PUT(skb, app_info_type, sizeof(info), &info);
+
+ for (i = 0; i < app_count; i++)
+ NLA_PUT(skb, app_entry_type, sizeof(struct dcb_app),
+ &table[i]);
+
+ nla_nest_end(skb, app);
+ }
+ err = 0;
+
+nla_put_failure:
+ kfree(table);
+ return err;
+}
/* Handle IEEE 802.1Qaz GET commands. */
static int dcbnl_ieee_get(struct net_device *netdev, struct nlattr **tb,
@@ -1276,6 +1341,30 @@ static int dcbnl_ieee_get(struct net_device *netdev, struct nlattr **tb,
spin_unlock(&dcb_lock);
nla_nest_end(skb, app);
+ /* get peer info if available */
+ if (ops->ieee_peer_getets) {
+ struct ieee_ets ets;
+ err = ops->ieee_peer_getets(netdev, &ets);
+ if (!err)
+ NLA_PUT(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets);
+ }
+
+ if (ops->ieee_peer_getpfc) {
+ struct ieee_pfc pfc;
+ err = ops->ieee_peer_getpfc(netdev, &pfc);
+ if (!err)
+ NLA_PUT(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc);
+ }
+
+ if (ops->peer_getappinfo && ops->peer_getapptable) {
+ err = dcbnl_build_peer_app(netdev, skb,
+ DCB_ATTR_IEEE_PEER_APP,
+ DCB_ATTR_IEEE_APP_UNSPEC,
+ DCB_ATTR_IEEE_APP);
+ if (err)
+ goto nla_put_failure;
+ }
+
nla_nest_end(skb, ieee);
nlmsg_end(skb, nlh);
@@ -1429,6 +1518,71 @@ err:
return ret;
}
+/* Handle CEE DCBX GET commands. */
+static int dcbnl_cee_get(struct net_device *netdev, struct nlattr **tb,
+ u32 pid, u32 seq, u16 flags)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ struct dcbmsg *dcb;
+ struct nlattr *cee;
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ int err;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ nlh = NLMSG_NEW(skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+ dcb = NLMSG_DATA(nlh);
+ dcb->dcb_family = AF_UNSPEC;
+ dcb->cmd = DCB_CMD_CEE_GET;
+
+ NLA_PUT_STRING(skb, DCB_ATTR_IFNAME, netdev->name);
+
+ cee = nla_nest_start(skb, DCB_ATTR_CEE);
+ if (!cee)
+ goto nla_put_failure;
+
+ /* get peer info if available */
+ if (ops->cee_peer_getpg) {
+ struct cee_pg pg;
+ err = ops->cee_peer_getpg(netdev, &pg);
+ if (!err)
+ NLA_PUT(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg);
+ }
+
+ if (ops->cee_peer_getpfc) {
+ struct cee_pfc pfc;
+ err = ops->cee_peer_getpfc(netdev, &pfc);
+ if (!err)
+ NLA_PUT(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc);
+ }
+
+ if (ops->peer_getappinfo && ops->peer_getapptable) {
+ err = dcbnl_build_peer_app(netdev, skb,
+ DCB_ATTR_CEE_PEER_APP_TABLE,
+ DCB_ATTR_CEE_PEER_APP_INFO,
+ DCB_ATTR_CEE_PEER_APP);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, cee);
+ nlmsg_end(skb, nlh);
+
+ return rtnl_unicast(skb, &init_net, pid);
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+nlmsg_failure:
+ kfree_skb(skb);
+ return -1;
+}
+
static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
struct net *net = sock_net(skb->sk);
@@ -1558,6 +1712,10 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
ret = dcbnl_setfeatcfg(netdev, tb, pid, nlh->nlmsg_seq,
nlh->nlmsg_flags);
goto out;
+ case DCB_CMD_CEE_GET:
+ ret = dcbnl_cee_get(netdev, tb, pid, nlh->nlmsg_seq,
+ nlh->nlmsg_flags);
+ goto out;
default:
goto errout;
}
@@ -1604,6 +1762,10 @@ EXPORT_SYMBOL(dcb_getapp);
u8 dcb_setapp(struct net_device *dev, struct dcb_app *new)
{
struct dcb_app_type *itr;
+ struct dcb_app_type event;
+
+ memcpy(&event.name, dev->name, sizeof(event.name));
+ memcpy(&event.app, new, sizeof(event.app));
spin_lock(&dcb_lock);
/* Search for existing match and replace */
@@ -1635,7 +1797,7 @@ u8 dcb_setapp(struct net_device *dev, struct dcb_app *new)
}
out:
spin_unlock(&dcb_lock);
- call_dcbevent_notifiers(DCB_APP_EVENT, new);
+ call_dcbevent_notifiers(DCB_APP_EVENT, &event);
return 0;
}
EXPORT_SYMBOL(dcb_setapp);
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index e96d5e81003..fadecd20d75 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -583,6 +583,15 @@ done:
dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
}
+/*
+ * Convert RFC 3390 larger initial window into an equivalent number of packets.
+ * This is based on the numbers specified in RFC 5681, 3.1.
+ */
+static inline u32 rfc3390_bytes_to_packets(const u32 smss)
+{
+ return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
+}
+
static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
{
struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 8cde009e8b8..4222e7a654b 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -614,6 +614,9 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Caller (dccp_v4_do_rcv) will send Reset */
dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
return 1;
+ } else if (sk->sk_state == DCCP_CLOSED) {
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+ return 1;
}
if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) {
@@ -668,10 +671,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
}
switch (sk->sk_state) {
- case DCCP_CLOSED:
- dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- return 1;
-
case DCCP_REQUESTING:
queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
if (queued >= 0)
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 45a434f9416..ae451c6d83b 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -43,9 +43,9 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct inet_sock *inet = inet_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ __be16 orig_sport, orig_dport;
struct rtable *rt;
__be32 daddr, nexthop;
- int tmp;
int err;
dp->dccps_role = DCCP_ROLE_CLIENT;
@@ -63,12 +63,14 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
nexthop = inet->opt->faddr;
}
- tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
- IPPROTO_DCCP,
- inet->inet_sport, usin->sin_port, sk, 1);
- if (tmp < 0)
- return tmp;
+ orig_sport = inet->inet_sport;
+ orig_dport = usin->sin_port;
+ rt = ip_route_connect(nexthop, inet->inet_saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_DCCP,
+ orig_sport, orig_dport, sk, true);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
@@ -99,11 +101,13 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (err != 0)
goto failure;
- err = ip_route_newports(&rt, IPPROTO_DCCP, inet->inet_sport,
- inet->inet_dport, sk);
- if (err != 0)
+ rt = ip_route_newports(rt, IPPROTO_DCCP,
+ orig_sport, orig_dport,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt)) {
+ rt = NULL;
goto failure;
-
+ }
/* OK, now commit destination to socket. */
sk_setup_caps(sk, &rt->dst);
@@ -461,17 +465,19 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
struct rtable *rt;
- struct flowi fl = { .oif = skb_rtable(skb)->rt_iif,
- .fl4_dst = ip_hdr(skb)->saddr,
- .fl4_src = ip_hdr(skb)->daddr,
- .fl4_tos = RT_CONN_FLAGS(sk),
- .proto = sk->sk_protocol,
- .fl_ip_sport = dccp_hdr(skb)->dccph_dport,
- .fl_ip_dport = dccp_hdr(skb)->dccph_sport
- };
-
- security_skb_classify_flow(skb, &fl);
- if (ip_route_output_flow(net, &rt, &fl, sk, 0)) {
+ struct flowi4 fl4 = {
+ .flowi4_oif = skb_rtable(skb)->rt_iif,
+ .daddr = ip_hdr(skb)->saddr,
+ .saddr = ip_hdr(skb)->daddr,
+ .flowi4_tos = RT_CONN_FLAGS(sk),
+ .flowi4_proto = sk->sk_protocol,
+ .fl4_sport = dccp_hdr(skb)->dccph_dport,
+ .fl4_dport = dccp_hdr(skb)->dccph_sport,
+ };
+
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt)) {
IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index dca711df9b6..de1b7e37ad5 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -147,30 +147,24 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
dst = __sk_dst_check(sk, np->dst_cookie);
if (dst == NULL) {
struct inet_sock *inet = inet_sk(sk);
- struct flowi fl;
+ struct flowi6 fl6;
/* BUGGG_FUTURE: Again, it is not clear how
to handle rthdr case. Ignore this complexity
for now.
*/
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet->inet_dport;
- fl.fl_ip_sport = inet->inet_sport;
- security_sk_classify_flow(sk, &fl);
-
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err) {
- sk->sk_err_soft = -err;
- goto out;
- }
-
- err = xfrm_lookup(net, &dst, &fl, sk, 0);
- if (err < 0) {
- sk->sk_err_soft = -err;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl6.daddr, &np->daddr);
+ ipv6_addr_copy(&fl6.saddr, &np->saddr);
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.fl6_dport = inet->inet_dport;
+ fl6.fl6_sport = inet->inet_sport;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
+ if (IS_ERR(dst)) {
+ sk->sk_err_soft = -PTR_ERR(dst);
goto out;
}
} else
@@ -249,34 +243,30 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
struct sk_buff *skb;
struct ipv6_txoptions *opt = NULL;
struct in6_addr *final_p, final;
- struct flowi fl;
+ struct flowi6 fl6;
int err = -1;
struct dst_entry *dst;
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
- fl.fl6_flowlabel = 0;
- fl.oif = ireq6->iif;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_rsk(req)->loc_port;
- security_req_classify_flow(req, &fl);
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr);
+ ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr);
+ fl6.flowlabel = 0;
+ fl6.flowi6_oif = ireq6->iif;
+ fl6.fl6_dport = inet_rsk(req)->rmt_port;
+ fl6.fl6_sport = inet_rsk(req)->loc_port;
+ security_req_classify_flow(req, flowi6_to_flowi(&fl6));
opt = np->opt;
- final_p = fl6_update_dst(&fl, opt, &final);
+ final_p = fl6_update_dst(&fl6, opt, &final);
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto done;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0);
- if (err < 0)
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
goto done;
+ }
skb = dccp_make_response(sk, dst, req);
if (skb != NULL) {
@@ -285,8 +275,8 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
dh->dccph_checksum = dccp_v6_csum_finish(skb,
&ireq6->loc_addr,
&ireq6->rmt_addr);
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- err = ip6_xmit(sk, skb, &fl, opt);
+ ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr);
+ err = ip6_xmit(sk, skb, &fl6, opt);
err = net_xmit_eval(err);
}
@@ -308,7 +298,7 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
{
struct ipv6hdr *rxip6h;
struct sk_buff *skb;
- struct flowi fl;
+ struct flowi6 fl6;
struct net *net = dev_net(skb_dst(rxskb)->dev);
struct sock *ctl_sk = net->dccp.v6_ctl_sk;
struct dst_entry *dst;
@@ -327,25 +317,24 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr,
&rxip6h->daddr);
- memset(&fl, 0, sizeof(fl));
- ipv6_addr_copy(&fl.fl6_dst, &rxip6h->saddr);
- ipv6_addr_copy(&fl.fl6_src, &rxip6h->daddr);
+ memset(&fl6, 0, sizeof(fl6));
+ ipv6_addr_copy(&fl6.daddr, &rxip6h->saddr);
+ ipv6_addr_copy(&fl6.saddr, &rxip6h->daddr);
- fl.proto = IPPROTO_DCCP;
- fl.oif = inet6_iif(rxskb);
- fl.fl_ip_dport = dccp_hdr(skb)->dccph_dport;
- fl.fl_ip_sport = dccp_hdr(skb)->dccph_sport;
- security_skb_classify_flow(rxskb, &fl);
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.flowi6_oif = inet6_iif(rxskb);
+ fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
+ fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
+ security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
/* sk = NULL, but it is safe for now. RST socket required. */
- if (!ip6_dst_lookup(ctl_sk, &dst, &fl)) {
- if (xfrm_lookup(net, &dst, &fl, NULL, 0) >= 0) {
- skb_dst_set(skb, dst);
- ip6_xmit(ctl_sk, skb, &fl, NULL);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
- return;
- }
+ dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
+ if (!IS_ERR(dst)) {
+ skb_dst_set(skb, dst);
+ ip6_xmit(ctl_sk, skb, &fl6, NULL);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ return;
}
kfree_skb(skb);
@@ -484,7 +473,6 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
struct inet6_request_sock *ireq6 = inet6_rsk(req);
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct inet_sock *newinet;
- struct dccp_sock *newdp;
struct dccp6_sock *newdp6;
struct sock *newsk;
struct ipv6_txoptions *opt;
@@ -498,7 +486,6 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
return NULL;
newdp6 = (struct dccp6_sock *)newsk;
- newdp = dccp_sk(newsk);
newinet = inet_sk(newsk);
newinet->pinet6 = &newdp6->inet6;
newnp = inet6_sk(newsk);
@@ -540,25 +527,20 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
if (dst == NULL) {
struct in6_addr *final_p, final;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- final_p = fl6_update_dst(&fl, opt, &final);
- ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_rsk(req)->loc_port;
- security_sk_classify_flow(sk, &fl);
-
- if (ip6_dst_lookup(sk, &dst, &fl))
- goto out;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0)
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr);
+ final_p = fl6_update_dst(&fl6, opt, &final);
+ ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr);
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.fl6_dport = inet_rsk(req)->rmt_port;
+ fl6.fl6_sport = inet_rsk(req)->loc_port;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+ if (IS_ERR(dst))
goto out;
}
@@ -578,7 +560,6 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
newdp6 = (struct dccp6_sock *)newsk;
newinet = inet_sk(newsk);
newinet->pinet6 = &newdp6->inet6;
- newdp = dccp_sk(newsk);
newnp = inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
@@ -878,7 +859,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct ipv6_pinfo *np = inet6_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
struct in6_addr *saddr = NULL, *final_p, final;
- struct flowi fl;
+ struct flowi6 fl6;
struct dst_entry *dst;
int addr_type;
int err;
@@ -891,14 +872,14 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (usin->sin6_family != AF_INET6)
return -EAFNOSUPPORT;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
if (np->sndflow) {
- fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
- IP6_ECN_flow_init(fl.fl6_flowlabel);
- if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
+ fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl6.flowlabel);
+ if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
struct ip6_flowlabel *flowlabel;
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
if (flowlabel == NULL)
return -EINVAL;
ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
@@ -935,7 +916,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
}
ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
- np->flow_label = fl.fl6_flowlabel;
+ np->flow_label = fl6.flowlabel;
/*
* DCCP over IPv4
@@ -972,33 +953,24 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (!ipv6_addr_any(&np->rcv_saddr))
saddr = &np->rcv_saddr;
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = usin->sin6_port;
- fl.fl_ip_sport = inet->inet_sport;
- security_sk_classify_flow(sk, &fl);
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ ipv6_addr_copy(&fl6.daddr, &np->daddr);
+ ipv6_addr_copy(&fl6.saddr, saddr ? saddr : &np->saddr);
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.fl6_dport = usin->sin6_port;
+ fl6.fl6_sport = inet->inet_sport;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- final_p = fl6_update_dst(&fl, np->opt, &final);
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
goto failure;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT);
- if (err < 0) {
- if (err == -EREMOTE)
- err = ip6_dst_blackhole(sk, &dst, &fl);
- if (err < 0)
- goto failure;
}
if (saddr == NULL) {
- saddr = &fl.fl6_src;
+ saddr = &fl6.saddr;
ipv6_addr_copy(&np->rcv_saddr, saddr);
}
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 2af15b15d1f..ea3b6ee21fc 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -908,7 +908,7 @@ static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen,
struct socket *sock = sk->sk_socket;
struct dn_scp *scp = DN_SK(sk);
int err = -EISCONN;
- struct flowi fl;
+ struct flowidn fld;
if (sock->state == SS_CONNECTED)
goto out;
@@ -947,13 +947,13 @@ static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen,
memcpy(&scp->peer, addr, sizeof(struct sockaddr_dn));
err = -EHOSTUNREACH;
- memset(&fl, 0, sizeof(fl));
- fl.oif = sk->sk_bound_dev_if;
- fl.fld_dst = dn_saddr2dn(&scp->peer);
- fl.fld_src = dn_saddr2dn(&scp->addr);
- dn_sk_ports_copy(&fl, scp);
- fl.proto = DNPROTO_NSP;
- if (dn_route_output_sock(&sk->sk_dst_cache, &fl, sk, flags) < 0)
+ memset(&fld, 0, sizeof(fld));
+ fld.flowidn_oif = sk->sk_bound_dev_if;
+ fld.daddr = dn_saddr2dn(&scp->peer);
+ fld.saddr = dn_saddr2dn(&scp->addr);
+ dn_sk_ports_copy(&fld, scp);
+ fld.flowidn_proto = DNPROTO_NSP;
+ if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, flags) < 0)
goto out;
sk->sk_route_caps = sk->sk_dst_cache->dev->features;
sock->state = SS_CONNECTING;
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 0ef0a81bcd7..1c74ed36ce8 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -201,7 +201,7 @@ static int dn_fib_check_nh(const struct rtmsg *r, struct dn_fib_info *fi, struct
int err;
if (nh->nh_gw) {
- struct flowi fl;
+ struct flowidn fld;
struct dn_fib_res res;
if (nh->nh_flags&RTNH_F_ONLINK) {
@@ -221,15 +221,15 @@ static int dn_fib_check_nh(const struct rtmsg *r, struct dn_fib_info *fi, struct
return 0;
}
- memset(&fl, 0, sizeof(fl));
- fl.fld_dst = nh->nh_gw;
- fl.oif = nh->nh_oif;
- fl.fld_scope = r->rtm_scope + 1;
+ memset(&fld, 0, sizeof(fld));
+ fld.daddr = nh->nh_gw;
+ fld.flowidn_oif = nh->nh_oif;
+ fld.flowidn_scope = r->rtm_scope + 1;
- if (fl.fld_scope < RT_SCOPE_LINK)
- fl.fld_scope = RT_SCOPE_LINK;
+ if (fld.flowidn_scope < RT_SCOPE_LINK)
+ fld.flowidn_scope = RT_SCOPE_LINK;
- if ((err = dn_fib_lookup(&fl, &res)) != 0)
+ if ((err = dn_fib_lookup(&fld, &res)) != 0)
return err;
err = -EINVAL;
@@ -404,7 +404,7 @@ failure:
return NULL;
}
-int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowi *fl, struct dn_fib_res *res)
+int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowidn *fld, struct dn_fib_res *res)
{
int err = dn_fib_props[type].error;
@@ -424,7 +424,8 @@ int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowi *
for_nexthops(fi) {
if (nh->nh_flags & RTNH_F_DEAD)
continue;
- if (!fl->oif || fl->oif == nh->nh_oif)
+ if (!fld->flowidn_oif ||
+ fld->flowidn_oif == nh->nh_oif)
break;
}
if (nhsel < fi->fib_nhs) {
@@ -445,7 +446,7 @@ int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowi *
return err;
}
-void dn_fib_select_multipath(const struct flowi *fl, struct dn_fib_res *res)
+void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res)
{
struct dn_fib_info *fi = res->fi;
int w;
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 2ef115277be..bd78836a81e 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -78,7 +78,7 @@ static void dn_nsp_send(struct sk_buff *skb)
struct sock *sk = skb->sk;
struct dn_scp *scp = DN_SK(sk);
struct dst_entry *dst;
- struct flowi fl;
+ struct flowidn fld;
skb_reset_transport_header(skb);
scp->stamp = jiffies;
@@ -91,13 +91,13 @@ try_again:
return;
}
- memset(&fl, 0, sizeof(fl));
- fl.oif = sk->sk_bound_dev_if;
- fl.fld_src = dn_saddr2dn(&scp->addr);
- fl.fld_dst = dn_saddr2dn(&scp->peer);
- dn_sk_ports_copy(&fl, scp);
- fl.proto = DNPROTO_NSP;
- if (dn_route_output_sock(&sk->sk_dst_cache, &fl, sk, 0) == 0) {
+ memset(&fld, 0, sizeof(fld));
+ fld.flowidn_oif = sk->sk_bound_dev_if;
+ fld.saddr = dn_saddr2dn(&scp->addr);
+ fld.daddr = dn_saddr2dn(&scp->peer);
+ dn_sk_ports_copy(&fld, scp);
+ fld.flowidn_proto = DNPROTO_NSP;
+ if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, 0) == 0) {
dst = sk_dst_get(sk);
sk->sk_route_caps = dst->dev->features;
goto try_again;
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 5e636365d33..9f09d4fc288 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -112,6 +112,7 @@ static int dn_dst_gc(struct dst_ops *ops);
static struct dst_entry *dn_dst_check(struct dst_entry *, __u32);
static unsigned int dn_dst_default_advmss(const struct dst_entry *dst);
static unsigned int dn_dst_default_mtu(const struct dst_entry *dst);
+static void dn_dst_destroy(struct dst_entry *);
static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
static void dn_dst_link_failure(struct sk_buff *);
static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
@@ -133,11 +134,18 @@ static struct dst_ops dn_dst_ops = {
.check = dn_dst_check,
.default_advmss = dn_dst_default_advmss,
.default_mtu = dn_dst_default_mtu,
+ .cow_metrics = dst_cow_metrics_generic,
+ .destroy = dn_dst_destroy,
.negative_advice = dn_dst_negative_advice,
.link_failure = dn_dst_link_failure,
.update_pmtu = dn_dst_update_pmtu,
};
+static void dn_dst_destroy(struct dst_entry *dst)
+{
+ dst_destroy_metrics_generic(dst);
+}
+
static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
{
__u16 tmp = (__u16 __force)(src ^ dst);
@@ -274,14 +282,14 @@ static void dn_dst_link_failure(struct sk_buff *skb)
{
}
-static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
+static inline int compare_keys(struct flowidn *fl1, struct flowidn *fl2)
{
- return ((fl1->fld_dst ^ fl2->fld_dst) |
- (fl1->fld_src ^ fl2->fld_src) |
- (fl1->mark ^ fl2->mark) |
- (fl1->fld_scope ^ fl2->fld_scope) |
- (fl1->oif ^ fl2->oif) |
- (fl1->iif ^ fl2->iif)) == 0;
+ return ((fl1->daddr ^ fl2->daddr) |
+ (fl1->saddr ^ fl2->saddr) |
+ (fl1->flowidn_mark ^ fl2->flowidn_mark) |
+ (fl1->flowidn_scope ^ fl2->flowidn_scope) |
+ (fl1->flowidn_oif ^ fl2->flowidn_oif) |
+ (fl1->flowidn_iif ^ fl2->flowidn_iif)) == 0;
}
static int dn_insert_route(struct dn_route *rt, unsigned hash, struct dn_route **rp)
@@ -295,7 +303,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned hash, struct dn_route *
spin_lock_bh(&dn_rt_hash_table[hash].lock);
while ((rth = rcu_dereference_protected(*rthp,
lockdep_is_held(&dn_rt_hash_table[hash].lock))) != NULL) {
- if (compare_keys(&rth->fl, &rt->fl)) {
+ if (compare_keys(&rth->fld, &rt->fld)) {
/* Put it first */
*rthp = rth->dst.dn_next;
rcu_assign_pointer(rth->dst.dn_next,
@@ -814,14 +822,14 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
{
struct dn_fib_info *fi = res->fi;
struct net_device *dev = rt->dst.dev;
+ unsigned int mss_metric;
struct neighbour *n;
- unsigned int metric;
if (fi) {
if (DN_FIB_RES_GW(*res) &&
DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = DN_FIB_RES_GW(*res);
- dst_import_metrics(&rt->dst, fi->fib_metrics);
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
}
rt->rt_type = res->type;
@@ -834,10 +842,10 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu)
dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu);
- metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
- if (metric) {
+ mss_metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
+ if (mss_metric) {
unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst));
- if (metric > mss)
+ if (mss_metric > mss)
dst_metric_set(&rt->dst, RTAX_ADVMSS, mss);
}
return 0;
@@ -895,14 +903,16 @@ static inline __le16 dn_fib_rules_map_destination(__le16 daddr, struct dn_fib_re
return (daddr&~mask)|res->fi->fib_nh->nh_gw;
}
-static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *oldflp, int try_hard)
+static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *oldflp, int try_hard)
{
- struct flowi fl = { .fld_dst = oldflp->fld_dst,
- .fld_src = oldflp->fld_src,
- .fld_scope = RT_SCOPE_UNIVERSE,
- .mark = oldflp->mark,
- .iif = init_net.loopback_dev->ifindex,
- .oif = oldflp->oif };
+ struct flowidn fld = {
+ .daddr = oldflp->daddr,
+ .saddr = oldflp->saddr,
+ .flowidn_scope = RT_SCOPE_UNIVERSE,
+ .flowidn_mark = oldflp->flowidn_mark,
+ .flowidn_iif = init_net.loopback_dev->ifindex,
+ .flowidn_oif = oldflp->flowidn_oif,
+ };
struct dn_route *rt = NULL;
struct net_device *dev_out = NULL, *dev;
struct neighbour *neigh = NULL;
@@ -916,13 +926,14 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *old
if (decnet_debug_level & 16)
printk(KERN_DEBUG
"dn_route_output_slow: dst=%04x src=%04x mark=%d"
- " iif=%d oif=%d\n", le16_to_cpu(oldflp->fld_dst),
- le16_to_cpu(oldflp->fld_src),
- oldflp->mark, init_net.loopback_dev->ifindex, oldflp->oif);
+ " iif=%d oif=%d\n", le16_to_cpu(oldflp->daddr),
+ le16_to_cpu(oldflp->saddr),
+ oldflp->flowidn_mark, init_net.loopback_dev->ifindex,
+ oldflp->flowidn_oif);
/* If we have an output interface, verify its a DECnet device */
- if (oldflp->oif) {
- dev_out = dev_get_by_index(&init_net, oldflp->oif);
+ if (oldflp->flowidn_oif) {
+ dev_out = dev_get_by_index(&init_net, oldflp->flowidn_oif);
err = -ENODEV;
if (dev_out && dev_out->dn_ptr == NULL) {
dev_put(dev_out);
@@ -933,11 +944,11 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *old
}
/* If we have a source address, verify that its a local address */
- if (oldflp->fld_src) {
+ if (oldflp->saddr) {
err = -EADDRNOTAVAIL;
if (dev_out) {
- if (dn_dev_islocal(dev_out, oldflp->fld_src))
+ if (dn_dev_islocal(dev_out, oldflp->saddr))
goto source_ok;
dev_put(dev_out);
goto out;
@@ -946,11 +957,11 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *old
for_each_netdev_rcu(&init_net, dev) {
if (!dev->dn_ptr)
continue;
- if (!dn_dev_islocal(dev, oldflp->fld_src))
+ if (!dn_dev_islocal(dev, oldflp->saddr))
continue;
if ((dev->flags & IFF_LOOPBACK) &&
- oldflp->fld_dst &&
- !dn_dev_islocal(dev, oldflp->fld_dst))
+ oldflp->daddr &&
+ !dn_dev_islocal(dev, oldflp->daddr))
continue;
dev_out = dev;
@@ -965,22 +976,22 @@ source_ok:
}
/* No destination? Assume its local */
- if (!fl.fld_dst) {
- fl.fld_dst = fl.fld_src;
+ if (!fld.daddr) {
+ fld.daddr = fld.saddr;
err = -EADDRNOTAVAIL;
if (dev_out)
dev_put(dev_out);
dev_out = init_net.loopback_dev;
dev_hold(dev_out);
- if (!fl.fld_dst) {
- fl.fld_dst =
- fl.fld_src = dnet_select_source(dev_out, 0,
+ if (!fld.daddr) {
+ fld.daddr =
+ fld.saddr = dnet_select_source(dev_out, 0,
RT_SCOPE_HOST);
- if (!fl.fld_dst)
+ if (!fld.daddr)
goto out;
}
- fl.oif = init_net.loopback_dev->ifindex;
+ fld.flowidn_oif = init_net.loopback_dev->ifindex;
res.type = RTN_LOCAL;
goto make_route;
}
@@ -989,8 +1000,8 @@ source_ok:
printk(KERN_DEBUG
"dn_route_output_slow: initial checks complete."
" dst=%o4x src=%04x oif=%d try_hard=%d\n",
- le16_to_cpu(fl.fld_dst), le16_to_cpu(fl.fld_src),
- fl.oif, try_hard);
+ le16_to_cpu(fld.daddr), le16_to_cpu(fld.saddr),
+ fld.flowidn_oif, try_hard);
/*
* N.B. If the kernel is compiled without router support then
@@ -998,7 +1009,7 @@ source_ok:
* will always be executed.
*/
err = -ESRCH;
- if (try_hard || (err = dn_fib_lookup(&fl, &res)) != 0) {
+ if (try_hard || (err = dn_fib_lookup(&fld, &res)) != 0) {
struct dn_dev *dn_db;
if (err != -ESRCH)
goto out;
@@ -1013,19 +1024,19 @@ source_ok:
* here
*/
if (!try_hard) {
- neigh = neigh_lookup_nodev(&dn_neigh_table, &init_net, &fl.fld_dst);
+ neigh = neigh_lookup_nodev(&dn_neigh_table, &init_net, &fld.daddr);
if (neigh) {
- if ((oldflp->oif &&
- (neigh->dev->ifindex != oldflp->oif)) ||
- (oldflp->fld_src &&
+ if ((oldflp->flowidn_oif &&
+ (neigh->dev->ifindex != oldflp->flowidn_oif)) ||
+ (oldflp->saddr &&
(!dn_dev_islocal(neigh->dev,
- oldflp->fld_src)))) {
+ oldflp->saddr)))) {
neigh_release(neigh);
neigh = NULL;
} else {
if (dev_out)
dev_put(dev_out);
- if (dn_dev_islocal(neigh->dev, fl.fld_dst)) {
+ if (dn_dev_islocal(neigh->dev, fld.daddr)) {
dev_out = init_net.loopback_dev;
res.type = RTN_LOCAL;
} else {
@@ -1045,7 +1056,7 @@ source_ok:
goto out;
dn_db = rcu_dereference_raw(dev_out->dn_ptr);
/* Possible improvement - check all devices for local addr */
- if (dn_dev_islocal(dev_out, fl.fld_dst)) {
+ if (dn_dev_islocal(dev_out, fld.daddr)) {
dev_put(dev_out);
dev_out = init_net.loopback_dev;
dev_hold(dev_out);
@@ -1061,16 +1072,16 @@ select_source:
if (neigh)
gateway = ((struct dn_neigh *)neigh)->addr;
if (gateway == 0)
- gateway = fl.fld_dst;
- if (fl.fld_src == 0) {
- fl.fld_src = dnet_select_source(dev_out, gateway,
- res.type == RTN_LOCAL ?
- RT_SCOPE_HOST :
- RT_SCOPE_LINK);
- if (fl.fld_src == 0 && res.type != RTN_LOCAL)
+ gateway = fld.daddr;
+ if (fld.saddr == 0) {
+ fld.saddr = dnet_select_source(dev_out, gateway,
+ res.type == RTN_LOCAL ?
+ RT_SCOPE_HOST :
+ RT_SCOPE_LINK);
+ if (fld.saddr == 0 && res.type != RTN_LOCAL)
goto e_addr;
}
- fl.oif = dev_out->ifindex;
+ fld.flowidn_oif = dev_out->ifindex;
goto make_route;
}
free_res = 1;
@@ -1079,61 +1090,61 @@ select_source:
goto e_inval;
if (res.type == RTN_LOCAL) {
- if (!fl.fld_src)
- fl.fld_src = fl.fld_dst;
+ if (!fld.saddr)
+ fld.saddr = fld.daddr;
if (dev_out)
dev_put(dev_out);
dev_out = init_net.loopback_dev;
dev_hold(dev_out);
- fl.oif = dev_out->ifindex;
+ fld.flowidn_oif = dev_out->ifindex;
if (res.fi)
dn_fib_info_put(res.fi);
res.fi = NULL;
goto make_route;
}
- if (res.fi->fib_nhs > 1 && fl.oif == 0)
- dn_fib_select_multipath(&fl, &res);
+ if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0)
+ dn_fib_select_multipath(&fld, &res);
/*
* We could add some logic to deal with default routes here and
* get rid of some of the special casing above.
*/
- if (!fl.fld_src)
- fl.fld_src = DN_FIB_RES_PREFSRC(res);
+ if (!fld.saddr)
+ fld.saddr = DN_FIB_RES_PREFSRC(res);
if (dev_out)
dev_put(dev_out);
dev_out = DN_FIB_RES_DEV(res);
dev_hold(dev_out);
- fl.oif = dev_out->ifindex;
+ fld.flowidn_oif = dev_out->ifindex;
gateway = DN_FIB_RES_GW(res);
make_route:
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- rt = dst_alloc(&dn_dst_ops);
+ rt = dst_alloc(&dn_dst_ops, 0);
if (rt == NULL)
goto e_nobufs;
atomic_set(&rt->dst.__refcnt, 1);
rt->dst.flags = DST_HOST;
- rt->fl.fld_src = oldflp->fld_src;
- rt->fl.fld_dst = oldflp->fld_dst;
- rt->fl.oif = oldflp->oif;
- rt->fl.iif = 0;
- rt->fl.mark = oldflp->mark;
+ rt->fld.saddr = oldflp->saddr;
+ rt->fld.daddr = oldflp->daddr;
+ rt->fld.flowidn_oif = oldflp->flowidn_oif;
+ rt->fld.flowidn_iif = 0;
+ rt->fld.flowidn_mark = oldflp->flowidn_mark;
- rt->rt_saddr = fl.fld_src;
- rt->rt_daddr = fl.fld_dst;
- rt->rt_gateway = gateway ? gateway : fl.fld_dst;
- rt->rt_local_src = fl.fld_src;
+ rt->rt_saddr = fld.saddr;
+ rt->rt_daddr = fld.daddr;
+ rt->rt_gateway = gateway ? gateway : fld.daddr;
+ rt->rt_local_src = fld.saddr;
- rt->rt_dst_map = fl.fld_dst;
- rt->rt_src_map = fl.fld_src;
+ rt->rt_dst_map = fld.daddr;
+ rt->rt_src_map = fld.saddr;
rt->dst.dev = dev_out;
dev_hold(dev_out);
@@ -1151,7 +1162,7 @@ make_route:
if (err)
goto e_neighbour;
- hash = dn_hash(rt->fl.fld_src, rt->fl.fld_dst);
+ hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
dn_insert_route(rt, hash, (struct dn_route **)pprt);
done:
@@ -1182,20 +1193,20 @@ e_neighbour:
/*
* N.B. The flags may be moved into the flowi at some future stage.
*/
-static int __dn_route_output_key(struct dst_entry **pprt, const struct flowi *flp, int flags)
+static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn *flp, int flags)
{
- unsigned hash = dn_hash(flp->fld_src, flp->fld_dst);
+ unsigned hash = dn_hash(flp->saddr, flp->daddr);
struct dn_route *rt = NULL;
if (!(flags & MSG_TRYHARD)) {
rcu_read_lock_bh();
for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt;
rt = rcu_dereference_bh(rt->dst.dn_next)) {
- if ((flp->fld_dst == rt->fl.fld_dst) &&
- (flp->fld_src == rt->fl.fld_src) &&
- (flp->mark == rt->fl.mark) &&
+ if ((flp->daddr == rt->fld.daddr) &&
+ (flp->saddr == rt->fld.saddr) &&
+ (flp->flowidn_mark == rt->fld.flowidn_mark) &&
dn_is_output_route(rt) &&
- (rt->fl.oif == flp->oif)) {
+ (rt->fld.flowidn_oif == flp->flowidn_oif)) {
dst_use(&rt->dst, jiffies);
rcu_read_unlock_bh();
*pprt = &rt->dst;
@@ -1208,25 +1219,36 @@ static int __dn_route_output_key(struct dst_entry **pprt, const struct flowi *fl
return dn_route_output_slow(pprt, flp, flags);
}
-static int dn_route_output_key(struct dst_entry **pprt, struct flowi *flp, int flags)
+static int dn_route_output_key(struct dst_entry **pprt, struct flowidn *flp, int flags)
{
int err;
err = __dn_route_output_key(pprt, flp, flags);
- if (err == 0 && flp->proto) {
- err = xfrm_lookup(&init_net, pprt, flp, NULL, 0);
+ if (err == 0 && flp->flowidn_proto) {
+ *pprt = xfrm_lookup(&init_net, *pprt,
+ flowidn_to_flowi(flp), NULL, 0);
+ if (IS_ERR(*pprt)) {
+ err = PTR_ERR(*pprt);
+ *pprt = NULL;
+ }
}
return err;
}
-int dn_route_output_sock(struct dst_entry **pprt, struct flowi *fl, struct sock *sk, int flags)
+int dn_route_output_sock(struct dst_entry **pprt, struct flowidn *fl, struct sock *sk, int flags)
{
int err;
err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD);
- if (err == 0 && fl->proto) {
- err = xfrm_lookup(&init_net, pprt, fl, sk,
- (flags & MSG_DONTWAIT) ? 0 : XFRM_LOOKUP_WAIT);
+ if (err == 0 && fl->flowidn_proto) {
+ if (!(flags & MSG_DONTWAIT))
+ fl->flowidn_flags |= FLOWI_FLAG_CAN_SLEEP;
+ *pprt = xfrm_lookup(&init_net, *pprt,
+ flowidn_to_flowi(fl), sk, 0);
+ if (IS_ERR(*pprt)) {
+ err = PTR_ERR(*pprt);
+ *pprt = NULL;
+ }
}
return err;
}
@@ -1243,11 +1265,13 @@ static int dn_route_input_slow(struct sk_buff *skb)
int flags = 0;
__le16 gateway = 0;
__le16 local_src = 0;
- struct flowi fl = { .fld_dst = cb->dst,
- .fld_src = cb->src,
- .fld_scope = RT_SCOPE_UNIVERSE,
- .mark = skb->mark,
- .iif = skb->dev->ifindex };
+ struct flowidn fld = {
+ .daddr = cb->dst,
+ .saddr = cb->src,
+ .flowidn_scope = RT_SCOPE_UNIVERSE,
+ .flowidn_mark = skb->mark,
+ .flowidn_iif = skb->dev->ifindex,
+ };
struct dn_fib_res res = { .fi = NULL, .type = RTN_UNREACHABLE };
int err = -EINVAL;
int free_res = 0;
@@ -1258,7 +1282,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
goto out;
/* Zero source addresses are not allowed */
- if (fl.fld_src == 0)
+ if (fld.saddr == 0)
goto out;
/*
@@ -1272,7 +1296,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
if (dn_dev_islocal(in_dev, cb->src))
goto out;
- err = dn_fib_lookup(&fl, &res);
+ err = dn_fib_lookup(&fld, &res);
if (err) {
if (err != -ESRCH)
goto out;
@@ -1284,7 +1308,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
res.type = RTN_LOCAL;
} else {
- __le16 src_map = fl.fld_src;
+ __le16 src_map = fld.saddr;
free_res = 1;
out_dev = DN_FIB_RES_DEV(res);
@@ -1297,22 +1321,22 @@ static int dn_route_input_slow(struct sk_buff *skb)
dev_hold(out_dev);
if (res.r)
- src_map = fl.fld_src; /* no NAT support for now */
+ src_map = fld.saddr; /* no NAT support for now */
gateway = DN_FIB_RES_GW(res);
if (res.type == RTN_NAT) {
- fl.fld_dst = dn_fib_rules_map_destination(fl.fld_dst, &res);
+ fld.daddr = dn_fib_rules_map_destination(fld.daddr, &res);
dn_fib_res_put(&res);
free_res = 0;
- if (dn_fib_lookup(&fl, &res))
+ if (dn_fib_lookup(&fld, &res))
goto e_inval;
free_res = 1;
if (res.type != RTN_UNICAST)
goto e_inval;
flags |= RTCF_DNAT;
- gateway = fl.fld_dst;
+ gateway = fld.daddr;
}
- fl.fld_src = src_map;
+ fld.saddr = src_map;
}
switch(res.type) {
@@ -1326,8 +1350,8 @@ static int dn_route_input_slow(struct sk_buff *skb)
if (dn_db->parms.forwarding == 0)
goto e_inval;
- if (res.fi->fib_nhs > 1 && fl.oif == 0)
- dn_fib_select_multipath(&fl, &res);
+ if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0)
+ dn_fib_select_multipath(&fld, &res);
/*
* Check for out_dev == in_dev. We use the RTCF_DOREDIRECT
@@ -1345,8 +1369,8 @@ static int dn_route_input_slow(struct sk_buff *skb)
break;
case RTN_LOCAL:
flags |= RTCF_LOCAL;
- fl.fld_src = cb->dst;
- fl.fld_dst = cb->src;
+ fld.saddr = cb->dst;
+ fld.daddr = cb->src;
/* Routing tables gave us a gateway */
if (gateway)
@@ -1375,25 +1399,25 @@ static int dn_route_input_slow(struct sk_buff *skb)
}
make_route:
- rt = dst_alloc(&dn_dst_ops);
+ rt = dst_alloc(&dn_dst_ops, 0);
if (rt == NULL)
goto e_nobufs;
- rt->rt_saddr = fl.fld_src;
- rt->rt_daddr = fl.fld_dst;
- rt->rt_gateway = fl.fld_dst;
+ rt->rt_saddr = fld.saddr;
+ rt->rt_daddr = fld.daddr;
+ rt->rt_gateway = fld.daddr;
if (gateway)
rt->rt_gateway = gateway;
rt->rt_local_src = local_src ? local_src : rt->rt_saddr;
- rt->rt_dst_map = fl.fld_dst;
- rt->rt_src_map = fl.fld_src;
+ rt->rt_dst_map = fld.daddr;
+ rt->rt_src_map = fld.saddr;
- rt->fl.fld_src = cb->src;
- rt->fl.fld_dst = cb->dst;
- rt->fl.oif = 0;
- rt->fl.iif = in_dev->ifindex;
- rt->fl.mark = fl.mark;
+ rt->fld.saddr = cb->src;
+ rt->fld.daddr = cb->dst;
+ rt->fld.flowidn_oif = 0;
+ rt->fld.flowidn_iif = in_dev->ifindex;
+ rt->fld.flowidn_mark = fld.flowidn_mark;
rt->dst.flags = DST_HOST;
rt->dst.neighbour = neigh;
@@ -1423,7 +1447,7 @@ make_route:
if (err)
goto e_neighbour;
- hash = dn_hash(rt->fl.fld_src, rt->fl.fld_dst);
+ hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
dn_insert_route(rt, hash, &rt);
skb_dst_set(skb, &rt->dst);
@@ -1463,11 +1487,11 @@ static int dn_route_input(struct sk_buff *skb)
rcu_read_lock();
for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL;
rt = rcu_dereference(rt->dst.dn_next)) {
- if ((rt->fl.fld_src == cb->src) &&
- (rt->fl.fld_dst == cb->dst) &&
- (rt->fl.oif == 0) &&
- (rt->fl.mark == skb->mark) &&
- (rt->fl.iif == cb->iif)) {
+ if ((rt->fld.saddr == cb->src) &&
+ (rt->fld.daddr == cb->dst) &&
+ (rt->fld.flowidn_oif == 0) &&
+ (rt->fld.flowidn_mark == skb->mark) &&
+ (rt->fld.flowidn_iif == cb->iif)) {
dst_use(&rt->dst, jiffies);
rcu_read_unlock();
skb_dst_set(skb, (struct dst_entry *)rt);
@@ -1503,9 +1527,9 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
if (rt->rt_flags & RTCF_NOTIFY)
r->rtm_flags |= RTM_F_NOTIFY;
RTA_PUT(skb, RTA_DST, 2, &rt->rt_daddr);
- if (rt->fl.fld_src) {
+ if (rt->fld.saddr) {
r->rtm_src_len = 16;
- RTA_PUT(skb, RTA_SRC, 2, &rt->fl.fld_src);
+ RTA_PUT(skb, RTA_SRC, 2, &rt->fld.saddr);
}
if (rt->dst.dev)
RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->dst.dev->ifindex);
@@ -1524,7 +1548,7 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
rt->dst.error) < 0)
goto rtattr_failure;
if (dn_is_input_route(rt))
- RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
+ RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fld.flowidn_iif);
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
return skb->len;
@@ -1547,13 +1571,13 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
struct dn_skb_cb *cb;
int err;
struct sk_buff *skb;
- struct flowi fl;
+ struct flowidn fld;
if (!net_eq(net, &init_net))
return -EINVAL;
- memset(&fl, 0, sizeof(fl));
- fl.proto = DNPROTO_NSP;
+ memset(&fld, 0, sizeof(fld));
+ fld.flowidn_proto = DNPROTO_NSP;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (skb == NULL)
@@ -1562,15 +1586,15 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
cb = DN_SKB_CB(skb);
if (rta[RTA_SRC-1])
- memcpy(&fl.fld_src, RTA_DATA(rta[RTA_SRC-1]), 2);
+ memcpy(&fld.saddr, RTA_DATA(rta[RTA_SRC-1]), 2);
if (rta[RTA_DST-1])
- memcpy(&fl.fld_dst, RTA_DATA(rta[RTA_DST-1]), 2);
+ memcpy(&fld.daddr, RTA_DATA(rta[RTA_DST-1]), 2);
if (rta[RTA_IIF-1])
- memcpy(&fl.iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
+ memcpy(&fld.flowidn_iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
- if (fl.iif) {
+ if (fld.flowidn_iif) {
struct net_device *dev;
- if ((dev = dev_get_by_index(&init_net, fl.iif)) == NULL) {
+ if ((dev = dev_get_by_index(&init_net, fld.flowidn_iif)) == NULL) {
kfree_skb(skb);
return -ENODEV;
}
@@ -1581,8 +1605,8 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
}
skb->protocol = htons(ETH_P_DNA_RT);
skb->dev = dev;
- cb->src = fl.fld_src;
- cb->dst = fl.fld_dst;
+ cb->src = fld.saddr;
+ cb->dst = fld.daddr;
local_bh_disable();
err = dn_route_input(skb);
local_bh_enable();
@@ -1594,8 +1618,8 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
int oif = 0;
if (rta[RTA_OIF - 1])
memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
- fl.oif = oif;
- err = dn_route_output_key((struct dst_entry **)&rt, &fl, 0);
+ fld.flowidn_oif = oif;
+ err = dn_route_output_key((struct dst_entry **)&rt, &fld, 0);
}
if (skb->dev)
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 6eb91df3c55..f0efb0ccfec 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -49,14 +49,15 @@ struct dn_fib_rule
};
-int dn_fib_lookup(struct flowi *flp, struct dn_fib_res *res)
+int dn_fib_lookup(struct flowidn *flp, struct dn_fib_res *res)
{
struct fib_lookup_arg arg = {
.result = res,
};
int err;
- err = fib_rules_lookup(dn_fib_rules_ops, flp, 0, &arg);
+ err = fib_rules_lookup(dn_fib_rules_ops,
+ flowidn_to_flowi(flp), 0, &arg);
res->r = arg.rule;
return err;
@@ -65,6 +66,7 @@ int dn_fib_lookup(struct flowi *flp, struct dn_fib_res *res)
static int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
{
+ struct flowidn *fld = &flp->u.dn;
int err = -EAGAIN;
struct dn_fib_table *tbl;
@@ -90,7 +92,7 @@ static int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp,
if (tbl == NULL)
goto errout;
- err = tbl->lookup(tbl, flp, (struct dn_fib_res *)arg->result);
+ err = tbl->lookup(tbl, fld, (struct dn_fib_res *)arg->result);
if (err > 0)
err = -EAGAIN;
errout:
@@ -104,8 +106,9 @@ static const struct nla_policy dn_fib_rule_policy[FRA_MAX+1] = {
static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
- __le16 daddr = fl->fld_dst;
- __le16 saddr = fl->fld_src;
+ struct flowidn *fld = &fl->u.dn;
+ __le16 daddr = fld->daddr;
+ __le16 saddr = fld->saddr;
if (((saddr ^ r->src) & r->srcmask) ||
((daddr ^ r->dst) & r->dstmask))
@@ -175,7 +178,7 @@ static int dn_fib_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
unsigned dnet_addr_type(__le16 addr)
{
- struct flowi fl = { .fld_dst = addr };
+ struct flowidn fld = { .daddr = addr };
struct dn_fib_res res;
unsigned ret = RTN_UNICAST;
struct dn_fib_table *tb = dn_fib_get_table(RT_TABLE_LOCAL, 0);
@@ -183,7 +186,7 @@ unsigned dnet_addr_type(__le16 addr)
res.r = NULL;
if (tb) {
- if (!tb->lookup(tb, &fl, &res)) {
+ if (!tb->lookup(tb, &fld, &res)) {
ret = res.type;
dn_fib_res_put(&res);
}
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index f2abd375569..99d8d3a4099 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -59,7 +59,6 @@ struct dn_hash
};
#define dz_key_0(key) ((key).datum = 0)
-#define dz_prefix(key,dz) ((key).datum)
#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\
for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
@@ -765,7 +764,7 @@ static int dn_fib_table_flush(struct dn_fib_table *tb)
return found;
}
-static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowi *flp, struct dn_fib_res *res)
+static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowidn *flp, struct dn_fib_res *res)
{
int err;
struct dn_zone *dz;
@@ -774,7 +773,7 @@ static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowi *flp,
read_lock(&dn_fib_tables_lock);
for(dz = t->dh_zone_list; dz; dz = dz->dz_next) {
struct dn_fib_node *f;
- dn_fib_key_t k = dz_key(flp->fld_dst, dz);
+ dn_fib_key_t k = dz_key(flp->daddr, dz);
for(f = dz_chain(k, dz); f; f = f->fn_next) {
if (!dn_key_eq(k, f->fn_key)) {
@@ -789,7 +788,7 @@ static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowi *flp,
if (f->fn_state&DN_S_ZOMBIE)
continue;
- if (f->fn_scope < flp->fld_scope)
+ if (f->fn_scope < flp->flowidn_scope)
continue;
err = dn_fib_semantic_match(f->fn_type, DN_FIB_INFO(f), flp, res);
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 739435a6af3..cfa7a5e1c5c 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -67,8 +67,9 @@ dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen)
size_t result_len = 0;
const char *data = _data, *end, *opt;
- kenter("%%%d,%s,'%s',%zu",
- key->serial, key->description, data, datalen);
+ kenter("%%%d,%s,'%*.*s',%zu",
+ key->serial, key->description,
+ (int)datalen, (int)datalen, data, datalen);
if (datalen <= 1 || !data || data[datalen - 1] != '\0')
return -EINVAL;
@@ -217,6 +218,19 @@ static void dns_resolver_describe(const struct key *key, struct seq_file *m)
seq_printf(m, ": %u", key->datalen);
}
+/*
+ * read the DNS data
+ * - the key's semaphore is read-locked
+ * - if key->type_data.x[0] is non-zero it is returned as the result and
+ *   the user buffer is not touched (presumably a stored error code;
+ *   TODO confirm against dns_resolver_instantiate)
+ * - otherwise the read is delegated to user_read()
+ */
+static long dns_resolver_read(const struct key *key,
+ char __user *buffer, size_t buflen)
+{
+ if (key->type_data.x[0])
+ return key->type_data.x[0];
+
+ return user_read(key, buffer, buflen);
+}
+
struct key_type key_type_dns_resolver = {
.name = "dns_resolver",
.instantiate = dns_resolver_instantiate,
@@ -224,7 +238,7 @@ struct key_type key_type_dns_resolver = {
.revoke = user_revoke,
.destroy = user_destroy,
.describe = dns_resolver_describe,
- .read = user_read,
+ .read = dns_resolver_read,
};
static int __init init_dns_resolver(void)
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 0c877a74e1f..3fb14b7c13c 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -428,7 +428,7 @@ static void __exit dsa_cleanup_module(void)
}
module_exit(dsa_cleanup_module);
-MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>")
+MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>");
MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips");
MODULE_LICENSE("GPL");
MODULE_ALIAS("platform:dsa");
diff --git a/net/dsa/mv88e6060.c b/net/dsa/mv88e6060.c
index 83277f463af..8f4ff5a2c81 100644
--- a/net/dsa/mv88e6060.c
+++ b/net/dsa/mv88e6060.c
@@ -18,7 +18,7 @@
static int reg_read(struct dsa_switch *ds, int addr, int reg)
{
- return mdiobus_read(ds->master_mii_bus, addr, reg);
+ return mdiobus_read(ds->master_mii_bus, ds->pd->sw_addr + addr, reg); /* addr is now an offset from the switch's sw_addr */
}
#define REG_READ(addr, reg) \
@@ -34,7 +34,8 @@ static int reg_read(struct dsa_switch *ds, int addr, int reg)
static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val)
{
- return mdiobus_write(ds->master_mii_bus, addr, reg, val);
+ return mdiobus_write(ds->master_mii_bus, ds->pd->sw_addr + addr, /* addr is now an offset from the switch's sw_addr, as in reg_read() */
+ reg, val);
}
#define REG_WRITE(addr, reg, val) \
@@ -50,7 +51,7 @@ static char *mv88e6060_probe(struct mii_bus *bus, int sw_addr)
{
int ret;
- ret = mdiobus_read(bus, REG_PORT(0), 0x03);
+ ret = mdiobus_read(bus, sw_addr + REG_PORT(0), 0x03);
if (ret >= 0) {
ret &= 0xfff0;
if (ret == 0x0600)
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 15dcc1a586b..0c282633791 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -265,13 +265,13 @@ static void ec_tx_done(struct sk_buff *skb, int result)
static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t len)
{
- struct sock *sk = sock->sk;
struct sockaddr_ec *saddr=(struct sockaddr_ec *)msg->msg_name;
struct net_device *dev;
struct ec_addr addr;
int err;
unsigned char port, cb;
#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE)
+ struct sock *sk = sock->sk;
struct sk_buff *skb;
struct ec_cb *eb;
#endif
@@ -488,10 +488,10 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
error_free_buf:
vfree(userbuf);
+error:
#else
err = -EPROTOTYPE;
#endif
- error:
mutex_unlock(&econet_mutex);
return err;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index a5a1050595d..cbb505ba932 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER
If unsure, say N here.
-choice
- prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
- depends on IP_ADVANCED_ROUTER
- default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
- bool "FIB_HASH"
- ---help---
- Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
- bool "FIB_TRIE"
- ---help---
- Use new experimental LC-trie as FIB lookup algorithm.
- This improves lookup performance if you have a large
- number of routes.
-
- LC-trie is a longest matching prefix lookup algorithm which
- performs better than FIB_HASH for large routing tables.
- But, it consumes more memory and is more complex.
-
- LC-trie is described in:
-
- IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
- IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
- June 1999
-
- An experimental study of compression methods for dynamic tries
- Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
-
-endchoice
-
-config IP_FIB_HASH
- def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
-
config IP_FIB_TRIE_STATS
bool "FIB TRIE statistics"
- depends on IP_FIB_TRIE
+ depends on IP_ADVANCED_ROUTER
---help---
Keep track of statistics on structure of FIB TRIE table.
Useful for testing and measuring TRIE performance.
@@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE
handled by the klogd daemon which is responsible for kernel messages
("man klogd").
+config IP_ROUTE_CLASSID
+ bool
+
config IP_PNP
bool "IP: kernel level autoconfiguration"
help
@@ -657,4 +624,3 @@ config TCP_MD5SIG
on the Internet.
If unsure, say N.
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4978d22f9a7..0dc772d0d12 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_minisocks.o tcp_cong.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o igmp.o \
- fib_frontend.o fib_semantics.o \
+ fib_frontend.o fib_semantics.o fib_trie.o \
inet_fragment.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f2b61107df6..807d83c02ef 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -880,6 +880,19 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
EXPORT_SYMBOL(inet_ioctl);
+#ifdef CONFIG_COMPAT
+int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = -ENOIOCTLCMD; /* returned unchanged when the protocol has no compat_ioctl hook */
+
+ if (sk->sk_prot->compat_ioctl) /* the hook is optional per protocol */
+ err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
+
+ return err;
+}
+#endif
+
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
@@ -903,6 +916,7 @@ const struct proto_ops inet_stream_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);
@@ -929,6 +943,7 @@ const struct proto_ops inet_dgram_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);
@@ -959,6 +974,7 @@ static const struct proto_ops inet_sockraw_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
@@ -1085,23 +1101,20 @@ int sysctl_ip_dynaddr __read_mostly;
static int inet_sk_reselect_saddr(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
- int err;
- struct rtable *rt;
__be32 old_saddr = inet->inet_saddr;
- __be32 new_saddr;
__be32 daddr = inet->inet_daddr;
+ struct rtable *rt;
+ __be32 new_saddr;
if (inet->opt && inet->opt->srr)
daddr = inet->opt->faddr;
/* Query new route. */
- err = ip_route_connect(&rt, daddr, 0,
- RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if,
- sk->sk_protocol,
- inet->inet_sport, inet->inet_dport, sk, 0);
- if (err)
- return err;
+ rt = ip_route_connect(daddr, 0, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if, sk->sk_protocol,
+ inet->inet_sport, inet->inet_dport, sk, false);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
sk_setup_caps(sk, &rt->dst);
@@ -1144,25 +1157,16 @@ int inet_sk_rebuild_header(struct sock *sk)
daddr = inet->inet_daddr;
if (inet->opt && inet->opt->srr)
daddr = inet->opt->faddr;
-{
- struct flowi fl = {
- .oif = sk->sk_bound_dev_if,
- .mark = sk->sk_mark,
- .fl4_dst = daddr,
- .fl4_src = inet->inet_saddr,
- .fl4_tos = RT_CONN_FLAGS(sk),
- .proto = sk->sk_protocol,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = inet->inet_sport,
- .fl_ip_dport = inet->inet_dport,
- };
-
- security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
-}
- if (!err)
+ rt = ip_route_output_ports(sock_net(sk), sk, daddr, inet->inet_saddr,
+ inet->inet_dport, inet->inet_sport,
+ sk->sk_protocol, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (!IS_ERR(rt)) {
+ err = 0;
sk_setup_caps(sk, &rt->dst);
- else {
+ } else {
+ err = PTR_ERR(rt);
+
/* Routing failed... */
sk->sk_route_caps = 0;
/*
@@ -1215,7 +1219,7 @@ out:
return err;
}
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct iphdr *iph;
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 86961bec70a..4286fd3cc0e 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -201,11 +201,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->ttl = 0;
top_iph->check = 0;
- ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ else
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
ah->reserved = 0;
ah->spi = x->id.spi;
- ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+ ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg, 0, skb->len);
@@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
nexthdr = ah->nexthdr;
ah_hlen = (ah->hdrlen + 2) << 2;
- if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
- ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
- goto out;
+ if (x->props.flags & XFRM_STATE_ALIGN4) {
+ if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ } else {
+ if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ }
if (!pskb_may_pull(skb, ah_hlen))
goto out;
@@ -450,8 +459,12 @@ static int ah_init_state(struct xfrm_state *x)
BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
- x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
- ahp->icv_trunc_len);
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ else
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
if (x->props.mode == XFRM_MODE_TUNNEL)
x->props.header_len += sizeof(struct iphdr);
x->data = ahp;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 04c8b69fd42..090d273d786 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -433,14 +433,13 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
{
- struct flowi fl = { .fl4_dst = sip,
- .fl4_src = tip };
struct rtable *rt;
int flag = 0;
/*unsigned long now; */
struct net *net = dev_net(dev);
- if (ip_route_output_key(net, &rt, &fl) < 0)
+ rt = ip_route_output(net, sip, tip, 0, 0);
+ if (IS_ERR(rt))
return 1;
if (rt->dst.dev != dev) {
NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
@@ -1017,14 +1016,13 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
return 0;
}
- if (__in_dev_get_rcu(dev)) {
- IN_DEV_CONF_SET(__in_dev_get_rcu(dev), PROXY_ARP, on);
+ if (__in_dev_get_rtnl(dev)) {
+ IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);
return 0;
}
return -ENXIO;
}
-/* must be called with rcu_read_lock() */
static int arp_req_set_public(struct net *net, struct arpreq *r,
struct net_device *dev)
{
@@ -1062,12 +1060,10 @@ static int arp_req_set(struct net *net, struct arpreq *r,
if (r->arp_flags & ATF_PERM)
r->arp_flags |= ATF_COM;
if (dev == NULL) {
- struct flowi fl = { .fl4_dst = ip,
- .fl4_tos = RTO_ONLINK };
- struct rtable *rt;
- err = ip_route_output_key(net, &rt, &fl);
- if (err != 0)
- return err;
+ struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
dev = rt->dst.dev;
ip_rt_put(rt);
if (!dev)
@@ -1178,7 +1174,6 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
static int arp_req_delete(struct net *net, struct arpreq *r,
struct net_device *dev)
{
- int err;
__be32 ip;
if (r->arp_flags & ATF_PUBL)
@@ -1186,12 +1181,9 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
if (dev == NULL) {
- struct flowi fl = { .fl4_dst = ip,
- .fl4_tos = RTO_ONLINK };
- struct rtable *rt;
- err = ip_route_output_key(net, &rt, &fl);
- if (err != 0)
- return err;
+ struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
dev = rt->dst.dev;
ip_rt_put(rt);
if (!dev)
@@ -1233,10 +1225,10 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (!(r.arp_flags & ATF_NETMASK))
((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
htonl(0xFFFFFFFFUL);
- rcu_read_lock();
+ rtnl_lock();
if (r.arp_dev[0]) {
err = -ENODEV;
- dev = dev_get_by_name_rcu(net, r.arp_dev);
+ dev = __dev_get_by_name(net, r.arp_dev);
if (dev == NULL)
goto out;
@@ -1263,7 +1255,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
break;
}
out:
- rcu_read_unlock();
+ rtnl_unlock();
if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
err = -EFAULT;
return err;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 174be6caa5c..85bd24ca4f6 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -46,11 +46,12 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (!saddr)
saddr = inet->mc_addr;
}
- err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
- RT_CONN_FLAGS(sk), oif,
- sk->sk_protocol,
- inet->inet_sport, usin->sin_port, sk, 1);
- if (err) {
+ rt = ip_route_connect(usin->sin_addr.s_addr, saddr,
+ RT_CONN_FLAGS(sk), oif,
+ sk->sk_protocol,
+ inet->inet_sport, usin->sin_port, sk, true);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
if (err == -ENETUNREACH)
IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
return err;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 748cb5b337b..6d85800daeb 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -51,6 +51,7 @@
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/slab.h>
+#include <linux/hash.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
@@ -92,6 +93,71 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
};
+/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
+ * value. So if you change this define, make appropriate changes to
+ * inet_addr_hash as well.
+ */
+#define IN4_ADDR_HSIZE 256
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+static DEFINE_SPINLOCK(inet_addr_hash_lock);
+
+static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+{
+ u32 val = (__force u32) addr ^ hash_ptr(net, 8); /* mix an 8-bit hash of the namespace pointer into the address */
+
+ return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) & /* XOR-fold all four bytes into the low byte */
+ (IN4_ADDR_HSIZE - 1)); /* then mask down to a bucket index of inet_addr_lst */
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+ unsigned int hash = inet_addr_hash(net, ifa->ifa_local); /* bucket is keyed on the local address and namespace */
+
+ spin_lock(&inet_addr_hash_lock); /* list updates take the spinlock; __ip_dev_find() walks the list under RCU */
+ hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+ spin_unlock(&inet_addr_hash_lock);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+ spin_lock(&inet_addr_hash_lock); /* same lock that inet_hash_insert() takes around its list update */
+ hlist_del_init_rcu(&ifa->hash); /* unlink from whichever inet_addr_lst bucket holds the node */
+ spin_unlock(&inet_addr_hash_lock);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ *
+ * Walks the inet_addr_lst bucket selected by inet_addr_hash(@net, @addr)
+ * and returns the device of the first entry whose device belongs to @net
+ * and whose ifa_local equals @addr, or NULL when no entry matches.
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+ unsigned int hash = inet_addr_hash(net, addr);
+ struct net_device *result = NULL;
+ struct in_ifaddr *ifa;
+ struct hlist_node *node;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
+ struct net_device *dev = ifa->ifa_dev->dev;
+
+ if (!net_eq(dev_net(dev), net)) /* the table is shared by all namespaces, so filter on @net */
+ continue;
+ if (ifa->ifa_local == addr) {
+ result = dev;
+ break;
+ }
+ }
+ if (result && devref) /* the reference is taken before leaving the RCU read section */
+ dev_hold(result);
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
@@ -265,6 +331,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
if (!do_promote) {
+ inet_hash_remove(ifa);
*ifap1 = ifa->ifa_next;
rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
@@ -281,6 +348,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
/* 2. Unlink it */
*ifap = ifa1->ifa_next;
+ inet_hash_remove(ifa1);
/* 3. Announce address deletion */
@@ -368,6 +436,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
ifa->ifa_next = *ifap;
*ifap = ifa;
+ inet_hash_insert(dev_net(in_dev->dev), ifa);
+
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
listeners of netlink will know about new ifaddr */
@@ -521,6 +591,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
if (tb[IFA_ADDRESS] == NULL)
tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+ INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_prefixlen = ifm->ifa_prefixlen;
ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
ifa->ifa_flags = ifm->ifa_flags;
@@ -670,7 +741,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
ifap = &ifa->ifa_next) {
if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
sin_orig.sin_addr.s_addr ==
- ifa->ifa_address) {
+ ifa->ifa_local) {
break; /* found */
}
}
@@ -728,6 +799,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (!ifa) {
ret = -ENOBUFS;
ifa = inet_alloc_ifa();
if (!ifa)
break;
+ /* Initialise the hash node only once the allocation is known to have succeeded. */
+ INIT_HLIST_NODE(&ifa->hash);
if (colon)
@@ -1030,6 +1102,21 @@ static inline bool inetdev_valid_mtu(unsigned mtu)
return mtu >= 68;
}
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+ struct in_device *in_dev)
+
+{
+ struct in_ifaddr *ifa = in_dev->ifa_list; /* only the first address on the device's list is announced */
+
+ if (!ifa) /* nothing to announce when the device has no address */
+ return;
+
+ arp_send(ARPOP_REQUEST, ETH_P_ARP,
+ ifa->ifa_local, dev, /* the same local address is passed in both address arguments */
+ ifa->ifa_local, NULL,
+ dev->dev_addr, NULL);
+}
+
/* Called only under RTNL semaphore */
static int inetdev_event(struct notifier_block *this, unsigned long event,
@@ -1069,6 +1156,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
struct in_ifaddr *ifa = inet_alloc_ifa();
if (ifa) {
+ INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_local =
ifa->ifa_address = htonl(INADDR_LOOPBACK);
ifa->ifa_prefixlen = 8;
@@ -1082,18 +1170,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
}
ip_mc_up(in_dev);
/* fall through */
- case NETDEV_NOTIFY_PEERS:
case NETDEV_CHANGEADDR:
+ if (!IN_DEV_ARP_NOTIFY(in_dev))
+ break;
+ /* fall through */
+ case NETDEV_NOTIFY_PEERS:
/* Send gratuitous ARP to notify of link change */
- if (IN_DEV_ARP_NOTIFY(in_dev)) {
- struct in_ifaddr *ifa = in_dev->ifa_list;
-
- if (ifa)
- arp_send(ARPOP_REQUEST, ETH_P_ARP,
- ifa->ifa_address, dev,
- ifa->ifa_address, NULL,
- dev->dev_addr, NULL);
- }
+ inetdev_send_gratuitous_arp(dev, in_dev);
break;
case NETDEV_DOWN:
ip_mc_down(in_dev);
@@ -1710,6 +1793,11 @@ static struct rtnl_af_ops inet_af_ops = {
void __init devinet_init(void)
{
+ int i;
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
register_pernet_subsys(&devinet_ops);
register_gifconf(PF_INET, inet_gifconf);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index e42a905180f..03f994bcf7d 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -33,11 +33,14 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
*
* TODO: Use spare space in skb for this where possible.
*/
-static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
{
unsigned int len;
- len = crypto_aead_ivsize(aead);
+ len = seqhilen;
+
+ len += crypto_aead_ivsize(aead);
+
if (len) {
len += crypto_aead_alignmask(aead) &
~(crypto_tfm_ctx_alignment() - 1);
@@ -52,10 +55,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
return kmalloc(len, GFP_ATOMIC);
}
-static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp)
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{
+ return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); /* the seqhi word sits at the (aligned) start of tmp */
+}
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
{
return crypto_aead_ivsize(aead) ?
- PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp;
+ PTR_ALIGN((u8 *)tmp + seqhilen,
+ crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; /* the IV now starts after the seqhilen bytes at the front of tmp */
}
static inline struct aead_givcrypt_request *esp_tmp_givreq(
@@ -122,6 +130,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int plen;
int tfclen;
int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ __be32 *seqhi;
/* skb is pure payload to encrypt */
@@ -151,14 +163,25 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
goto error;
nfrags = err;
- tmp = esp_alloc_tmp(aead, nfrags + 1);
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
if (!tmp)
goto error;
- iv = esp_tmp_iv(aead, tmp);
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_givreq(aead, iv);
asg = esp_givreq_sg(aead, req);
- sg = asg + 1;
+ sg = asg + sglists;
/* Fill padding... */
tail = skb_tail_pointer(trailer);
@@ -215,19 +238,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
}
esph->spi = x->id.spi;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
clen + alen);
- sg_init_one(asg, esph, sizeof(*esph));
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
- aead_givcrypt_set_assoc(req, asg, sizeof(*esph));
+ aead_givcrypt_set_assoc(req, asg, assoclen);
aead_givcrypt_set_giv(req, esph->enc_data,
- XFRM_SKB_CB(skb)->seq.output);
+ XFRM_SKB_CB(skb)->seq.output.low);
ESP_SKB_CB(skb)->tmp = tmp;
err = crypto_aead_givencrypt(req);
@@ -346,6 +377,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
struct sk_buff *trailer;
int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ __be32 *seqhi;
void *tmp;
u8 *iv;
struct scatterlist *sg;
@@ -362,16 +397,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
goto out;
nfrags = err;
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
err = -ENOMEM;
- tmp = esp_alloc_tmp(aead, nfrags + 1);
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
if (!tmp)
goto out;
ESP_SKB_CB(skb)->tmp = tmp;
- iv = esp_tmp_iv(aead, tmp);
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_req(aead, iv);
asg = esp_req_sg(aead, req);
- sg = asg + 1;
+ sg = asg + sglists;
skb->ip_summed = CHECKSUM_NONE;
@@ -382,11 +428,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
- sg_init_one(asg, esph, sizeof(*esph));
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
aead_request_set_callback(req, 0, esp_input_done, skb);
aead_request_set_crypt(req, sg, sg, elen, iv);
- aead_request_set_assoc(req, asg, sizeof(*esph));
+ aead_request_set_assoc(req, asg, assoclen);
err = crypto_aead_decrypt(req);
if (err == -EINPROGRESS)
@@ -500,10 +554,20 @@ static int esp_init_authenc(struct xfrm_state *x)
goto error;
err = -ENAMETOOLONG;
- if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)",
- x->aalg ? x->aalg->alg_name : "digest_null",
- x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
- goto error;
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authencesn(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ } else {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authenc(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ }
aead = crypto_alloc_aead(authenc_name, 0, 0);
err = PTR_ERR(aead);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1d2cdd43a87..a373a259253 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -51,11 +51,11 @@ static int __net_init fib4_rules_init(struct net *net)
{
struct fib_table *local_table, *main_table;
- local_table = fib_hash_table(RT_TABLE_LOCAL);
+ local_table = fib_trie_table(RT_TABLE_LOCAL);
if (local_table == NULL)
return -ENOMEM;
- main_table = fib_hash_table(RT_TABLE_MAIN);
+ main_table = fib_trie_table(RT_TABLE_MAIN);
if (main_table == NULL)
goto fail;
@@ -82,7 +82,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
if (tb)
return tb;
- tb = fib_hash_table(id);
+ tb = fib_trie_table(id);
if (!tb)
return NULL;
h = id & (FIB_TABLE_HASHSZ - 1);
@@ -114,21 +114,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
-void fib_select_default(struct net *net,
- const struct flowi *flp, struct fib_result *res)
-{
- struct fib_table *tb;
- int table = RT_TABLE_MAIN;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
- return;
- table = res->r->table;
-#endif
- tb = fib_get_table(net, table);
- if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
- fib_table_select_default(tb, flp, res);
-}
-
static void fib_flush(struct net *net)
{
int flushed = 0;
@@ -147,46 +132,6 @@ static void fib_flush(struct net *net)
rt_cache_flush(net, -1);
}
-/**
- * __ip_dev_find - find the first device with a given source address.
- * @net: the net namespace
- * @addr: the source address
- * @devref: if true, take a reference on the found device
- *
- * If a caller uses devref=false, it should be protected by RCU, or RTNL
- */
-struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
-{
- struct flowi fl = {
- .fl4_dst = addr,
- };
- struct fib_result res = { 0 };
- struct net_device *dev = NULL;
- struct fib_table *local_table;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
- rcu_read_lock();
- local_table = fib_get_table(net, RT_TABLE_LOCAL);
- if (!local_table ||
- fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
- rcu_read_unlock();
- return NULL;
- }
- if (res.type != RTN_LOCAL)
- goto out;
- dev = FIB_RES_DEV(res);
-
- if (dev && devref)
- dev_hold(dev);
-out:
- rcu_read_unlock();
- return dev;
-}
-EXPORT_SYMBOL(__ip_dev_find);
-
/*
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
@@ -195,7 +140,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
const struct net_device *dev,
__be32 addr)
{
- struct flowi fl = { .fl4_dst = addr };
+ struct flowi4 fl4 = { .daddr = addr };
struct fib_result res;
unsigned ret = RTN_BROADCAST;
struct fib_table *local_table;
@@ -213,7 +158,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
if (local_table) {
ret = RTN_UNICAST;
rcu_read_lock();
- if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
+ if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
}
@@ -248,19 +193,21 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
u32 *itag, u32 mark)
{
struct in_device *in_dev;
- struct flowi fl = {
- .fl4_dst = src,
- .fl4_src = dst,
- .fl4_tos = tos,
- .mark = mark,
- .iif = oif
- };
+ struct flowi4 fl4;
struct fib_result res;
int no_addr, rpf, accept_local;
bool dev_match;
int ret;
struct net *net;
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = oif;
+ fl4.flowi4_mark = mark;
+ fl4.daddr = src;
+ fl4.saddr = dst;
+ fl4.flowi4_tos = tos;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+
no_addr = rpf = accept_local = 0;
in_dev = __in_dev_get_rcu(dev);
if (in_dev) {
@@ -268,14 +215,14 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
rpf = IN_DEV_RPFILTER(in_dev);
accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
if (mark && !IN_DEV_SRC_VMARK(in_dev))
- fl.mark = 0;
+ fl4.flowi4_mark = 0;
}
if (in_dev == NULL)
goto e_inval;
net = dev_net(dev);
- if (fib_lookup(net, &fl, &res))
+ if (fib_lookup(net, &fl4, &res))
goto last_resort;
if (res.type != RTN_UNICAST) {
if (res.type != RTN_LOCAL || !accept_local)
@@ -306,10 +253,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
goto last_resort;
if (rpf == 1)
goto e_rpf;
- fl.oif = dev->ifindex;
+ fl4.flowi4_oif = dev->ifindex;
ret = 0;
- if (fib_lookup(net, &fl, &res) == 0) {
+ if (fib_lookup(net, &fl4, &res) == 0) {
if (res.type == RTN_UNICAST) {
*spec_dst = FIB_RES_PREFSRC(res);
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
@@ -849,11 +796,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
{
struct fib_result res;
- struct flowi fl = {
- .mark = frn->fl_mark,
- .fl4_dst = frn->fl_addr,
- .fl4_tos = frn->fl_tos,
- .fl4_scope = frn->fl_scope,
+ struct flowi4 fl4 = {
+ .flowi4_mark = frn->fl_mark,
+ .daddr = frn->fl_addr,
+ .flowi4_tos = frn->fl_tos,
+ .flowi4_scope = frn->fl_scope,
};
#ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -866,7 +813,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
frn->tb_id = tb->tb_id;
rcu_read_lock();
- frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
+ frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
if (!frn->err) {
frn->prefixlen = res.prefixlen;
@@ -945,10 +892,12 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
#ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev);
#endif
+ fib_update_nh_saddrs(dev);
rt_cache_flush(dev_net(dev), -1);
break;
case NETDEV_DOWN:
fib_del_ifaddr(ifa);
+ fib_update_nh_saddrs(dev);
if (ifa->ifa_dev->ifa_list == NULL) {
/* Last address was deleted from this interface.
* Disable IP.
@@ -1101,5 +1050,5 @@ void __init ip_fib_init(void)
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
- fib_hash_init();
+ fib_trie_init();
}
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index b3acb0417b2..00000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1133 +0,0 @@
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * IPv4 FIB: lookup engine and maintenance routines.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/errno.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/inetdevice.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-
-#include <net/net_namespace.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <net/route.h>
-#include <net/tcp.h>
-#include <net/sock.h>
-#include <net/ip_fib.h>
-
-#include "fib_lookup.h"
-
-static struct kmem_cache *fn_hash_kmem __read_mostly;
-static struct kmem_cache *fn_alias_kmem __read_mostly;
-
-struct fib_node {
- struct hlist_node fn_hash;
- struct list_head fn_alias;
- __be32 fn_key;
- struct fib_alias fn_embedded_alias;
-};
-
-#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
-
-struct fn_zone {
- struct fn_zone __rcu *fz_next; /* Next not empty zone */
- struct hlist_head __rcu *fz_hash; /* Hash table pointer */
- seqlock_t fz_lock;
- u32 fz_hashmask; /* (fz_divisor - 1) */
-
- u8 fz_order; /* Zone order (0..32) */
- u8 fz_revorder; /* 32 - fz_order */
- __be32 fz_mask; /* inet_make_mask(order) */
-#define FZ_MASK(fz) ((fz)->fz_mask)
-
- struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
-
- int fz_nent; /* Number of entries */
- int fz_divisor; /* Hash size (mask+1) */
-};
-
-struct fn_hash {
- struct fn_zone *fn_zones[33];
- struct fn_zone __rcu *fn_zone_list;
-};
-
-static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
-{
- u32 h = ntohl(key) >> fz->fz_revorder;
- h ^= (h>>20);
- h ^= (h>>10);
- h ^= (h>>5);
- h &= fz->fz_hashmask;
- return h;
-}
-
-static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
-{
- return dst & FZ_MASK(fz);
-}
-
-static unsigned int fib_hash_genid;
-
-#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
-
-static struct hlist_head *fz_hash_alloc(int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE)
- return kzalloc(size, GFP_KERNEL);
-
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
-}
-
-/* The fib hash lock must be held when this is called. */
-static inline void fn_rebuild_zone(struct fn_zone *fz,
- struct hlist_head *old_ht,
- int old_divisor)
-{
- int i;
-
- for (i = 0; i < old_divisor; i++) {
- struct hlist_node *node, *n;
- struct fib_node *f;
-
- hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
- struct hlist_head *new_head;
-
- hlist_del_rcu(&f->fn_hash);
-
- new_head = rcu_dereference_protected(fz->fz_hash, 1) +
- fn_hash(f->fn_key, fz);
- hlist_add_head_rcu(&f->fn_hash, new_head);
- }
- }
-}
-
-static void fz_hash_free(struct hlist_head *hash, int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE)
- kfree(hash);
- else
- free_pages((unsigned long)hash, get_order(size));
-}
-
-static void fn_rehash_zone(struct fn_zone *fz)
-{
- struct hlist_head *ht, *old_ht;
- int old_divisor, new_divisor;
- u32 new_hashmask;
-
- new_divisor = old_divisor = fz->fz_divisor;
-
- switch (old_divisor) {
- case EMBEDDED_HASH_SIZE:
- new_divisor *= EMBEDDED_HASH_SIZE;
- break;
- case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
- new_divisor *= (EMBEDDED_HASH_SIZE/2);
- break;
- default:
- if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
- printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
- return;
- }
- new_divisor = (old_divisor << 1);
- break;
- }
-
- new_hashmask = (new_divisor - 1);
-
-#if RT_CACHE_DEBUG >= 2
- printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
- fz->fz_order, old_divisor);
-#endif
-
- ht = fz_hash_alloc(new_divisor);
-
- if (ht) {
- struct fn_zone nfz;
-
- memcpy(&nfz, fz, sizeof(nfz));
-
- write_seqlock_bh(&fz->fz_lock);
- old_ht = rcu_dereference_protected(fz->fz_hash, 1);
- RCU_INIT_POINTER(nfz.fz_hash, ht);
- nfz.fz_hashmask = new_hashmask;
- nfz.fz_divisor = new_divisor;
- fn_rebuild_zone(&nfz, old_ht, old_divisor);
- fib_hash_genid++;
- rcu_assign_pointer(fz->fz_hash, ht);
- fz->fz_hashmask = new_hashmask;
- fz->fz_divisor = new_divisor;
- write_sequnlock_bh(&fz->fz_lock);
-
- if (old_ht != fz->fz_embedded_hash) {
- synchronize_rcu();
- fz_hash_free(old_ht, old_divisor);
- }
- }
-}
-
-static void fn_free_node_rcu(struct rcu_head *head)
-{
- struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
-
- kmem_cache_free(fn_hash_kmem, f);
-}
-
-static inline void fn_free_node(struct fib_node *f)
-{
- call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
-}
-
-static void fn_free_alias_rcu(struct rcu_head *head)
-{
- struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
-
- kmem_cache_free(fn_alias_kmem, fa);
-}
-
-static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
-{
- fib_release_info(fa->fa_info);
- if (fa == &f->fn_embedded_alias)
- fa->fa_info = NULL;
- else
- call_rcu(&fa->rcu, fn_free_alias_rcu);
-}
-
-static struct fn_zone *
-fn_new_zone(struct fn_hash *table, int z)
-{
- int i;
- struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
- if (!fz)
- return NULL;
-
- seqlock_init(&fz->fz_lock);
- fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
- fz->fz_hashmask = fz->fz_divisor - 1;
- RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
- fz->fz_order = z;
- fz->fz_revorder = 32 - z;
- fz->fz_mask = inet_make_mask(z);
-
- /* Find the first not empty zone with more specific mask */
- for (i = z + 1; i <= 32; i++)
- if (table->fn_zones[i])
- break;
- if (i > 32) {
- /* No more specific masks, we are the first. */
- rcu_assign_pointer(fz->fz_next,
- rtnl_dereference(table->fn_zone_list));
- rcu_assign_pointer(table->fn_zone_list, fz);
- } else {
- rcu_assign_pointer(fz->fz_next,
- rtnl_dereference(table->fn_zones[i]->fz_next));
- rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
- }
- table->fn_zones[z] = fz;
- fib_hash_genid++;
- return fz;
-}
-
-int fib_table_lookup(struct fib_table *tb,
- const struct flowi *flp, struct fib_result *res,
- int fib_flags)
-{
- int err;
- struct fn_zone *fz;
- struct fn_hash *t = (struct fn_hash *)tb->tb_data;
-
- rcu_read_lock();
- for (fz = rcu_dereference(t->fn_zone_list);
- fz != NULL;
- fz = rcu_dereference(fz->fz_next)) {
- struct hlist_head *head;
- struct hlist_node *node;
- struct fib_node *f;
- __be32 k;
- unsigned int seq;
-
- do {
- seq = read_seqbegin(&fz->fz_lock);
- k = fz_key(flp->fl4_dst, fz);
-
- head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- if (f->fn_key != k)
- continue;
-
- err = fib_semantic_match(&f->fn_alias,
- flp, res,
- fz->fz_order, fib_flags);
- if (err <= 0)
- goto out;
- }
- } while (read_seqretry(&fz->fz_lock, seq));
- }
- err = 1;
-out:
- rcu_read_unlock();
- return err;
-}
-
-void fib_table_select_default(struct fib_table *tb,
- const struct flowi *flp, struct fib_result *res)
-{
- int order, last_idx;
- struct hlist_node *node;
- struct fib_node *f;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fn_hash *t = (struct fn_hash *)tb->tb_data;
- struct fn_zone *fz = t->fn_zones[0];
- struct hlist_head *head;
-
- if (fz == NULL)
- return;
-
- last_idx = -1;
- last_resort = NULL;
- order = -1;
-
- rcu_read_lock();
- head = rcu_dereference(fz->fz_hash);
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
-
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
- continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
-
- fib_alias_accessed(fa);
-
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- fi = next_fi;
- order++;
- }
- }
-
- if (order <= 0 || fi == NULL) {
- tb->tb_default = -1;
- goto out;
- }
-
- if (!fib_detect_death(fi, order, &last_resort, &last_idx,
- tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
-
- if (last_idx >= 0)
- fib_result_assign(res, last_resort);
- tb->tb_default = last_idx;
-out:
- rcu_read_unlock();
-}
-
-/* Insert node F to FZ. */
-static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
-
- hlist_add_head_rcu(&f->fn_hash, head);
-}
-
-/* Return the node in FZ matching KEY. */
-static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
- struct hlist_node *node;
- struct fib_node *f;
-
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- if (f->fn_key == key)
- return f;
- }
-
- return NULL;
-}
-
-
-static struct fib_alias *fib_fast_alloc(struct fib_node *f)
-{
- struct fib_alias *fa = &f->fn_embedded_alias;
-
- if (fa->fa_info != NULL)
- fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- return fa;
-}
-
-/* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fib_node *new_f = NULL;
- struct fib_node *f;
- struct fib_alias *fa, *new_fa;
- struct fn_zone *fz;
- struct fib_info *fi;
- u8 tos = cfg->fc_tos;
- __be32 key;
- int err;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- fz = table->fn_zones[cfg->fc_dst_len];
- if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
- return -ENOBUFS;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- fi = fib_create_info(cfg);
- if (IS_ERR(fi))
- return PTR_ERR(fi);
-
- if (fz->fz_nent > (fz->fz_divisor<<1) &&
- fz->fz_divisor < FZ_MAX_DIVISOR &&
- (cfg->fc_dst_len == 32 ||
- (1 << cfg->fc_dst_len) > fz->fz_divisor))
- fn_rehash_zone(fz);
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
-
- /* Now fa, if non-NULL, points to the first fib alias
- * with the same keys [prefix,tos,priority], if such key already
- * exists or to the node before which we will insert new one.
- *
- * If fa is NULL, we will need to allocate a new one and
- * insert to the head of f.
- *
- * If f is NULL, no fib node matched the destination key
- * and we need to allocate a new one of those as well.
- */
-
- if (fa && fa->fa_tos == tos &&
- fa->fa_info->fib_priority == fi->fib_priority) {
- struct fib_alias *fa_first, *fa_match;
-
- err = -EEXIST;
- if (cfg->fc_nlflags & NLM_F_EXCL)
- goto out;
-
- /* We have 2 goals:
- * 1. Find exact match for type, scope, fib_info to avoid
- * duplicate routes
- * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
- */
- fa_match = NULL;
- fa_first = fa;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- if (fa->fa_tos != tos)
- break;
- if (fa->fa_info->fib_priority != fi->fib_priority)
- break;
- if (fa->fa_type == cfg->fc_type &&
- fa->fa_scope == cfg->fc_scope &&
- fa->fa_info == fi) {
- fa_match = fa;
- break;
- }
- }
-
- if (cfg->fc_nlflags & NLM_F_REPLACE) {
- u8 state;
-
- fa = fa_first;
- if (fa_match) {
- if (fa == fa_match)
- err = 0;
- goto out;
- }
- err = -ENOBUFS;
- new_fa = fib_fast_alloc(f);
- if (new_fa == NULL)
- goto out;
-
- new_fa->fa_tos = fa->fa_tos;
- new_fa->fa_info = fi;
- new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
- state = fa->fa_state;
- new_fa->fa_state = state & ~FA_S_ACCESSED;
- fib_hash_genid++;
- list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
-
- fn_free_alias(fa, f);
- if (state & FA_S_ACCESSED)
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
- rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
- tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
- return 0;
- }
-
- /* Error if we find a perfect match which
- * uses the same scope, type, and nexthop
- * information.
- */
- if (fa_match)
- goto out;
-
- if (!(cfg->fc_nlflags & NLM_F_APPEND))
- fa = fa_first;
- }
-
- err = -ENOENT;
- if (!(cfg->fc_nlflags & NLM_F_CREATE))
- goto out;
-
- err = -ENOBUFS;
-
- if (!f) {
- new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
- if (new_f == NULL)
- goto out;
-
- INIT_HLIST_NODE(&new_f->fn_hash);
- INIT_LIST_HEAD(&new_f->fn_alias);
- new_f->fn_key = key;
- f = new_f;
- }
-
- new_fa = fib_fast_alloc(f);
- if (new_fa == NULL)
- goto out;
-
- new_fa->fa_info = fi;
- new_fa->fa_tos = tos;
- new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
- new_fa->fa_state = 0;
-
- /*
- * Insert new entry to the list.
- */
-
- if (new_f)
- fib_insert_node(fz, new_f);
- list_add_tail_rcu(&new_fa->fa_list,
- (fa ? &fa->fa_list : &f->fn_alias));
- fib_hash_genid++;
-
- if (new_f)
- fz->fz_nent++;
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-
- rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
- &cfg->fc_nlinfo, 0);
- return 0;
-
-out:
- if (new_f)
- kmem_cache_free(fn_hash_kmem, new_f);
- fib_release_info(fi);
- return err;
-}
-
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash *)tb->tb_data;
- struct fib_node *f;
- struct fib_alias *fa, *fa_to_delete;
- struct fn_zone *fz;
- __be32 key;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
- return -ESRCH;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
- if (!fa)
- return -ESRCH;
-
- fa_to_delete = NULL;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fa->fa_tos != cfg->fc_tos)
- break;
-
- if ((!cfg->fc_type ||
- fa->fa_type == cfg->fc_type) &&
- (cfg->fc_scope == RT_SCOPE_NOWHERE ||
- fa->fa_scope == cfg->fc_scope) &&
- (!cfg->fc_protocol ||
- fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi) == 0) {
- fa_to_delete = fa;
- break;
- }
- }
-
- if (fa_to_delete) {
- int kill_fn;
-
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
- tb->tb_id, &cfg->fc_nlinfo, 0);
-
- kill_fn = 0;
- list_del_rcu(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del_rcu(&f->fn_hash);
- kill_fn = 1;
- }
- fib_hash_genid++;
-
- if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
- fn_free_alias(fa, f);
- if (kill_fn) {
- fn_free_node(f);
- fz->fz_nent--;
- }
-
- return 0;
- }
- return -ESRCH;
-}
-
-static int fn_flush_list(struct fn_zone *fz, int idx)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
- struct hlist_node *node, *n;
- struct fib_node *f;
- int found = 0;
-
- hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
- struct fib_alias *fa, *fa_node;
- int kill_f;
-
- kill_f = 0;
- list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
- list_del_rcu(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del_rcu(&f->fn_hash);
- kill_f = 1;
- }
- fib_hash_genid++;
-
- fn_free_alias(fa, f);
- found++;
- }
- }
- if (kill_f) {
- fn_free_node(f);
- fz->fz_nent--;
- }
- }
- return found;
-}
-
-/* caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fn_zone *fz;
- int found = 0;
-
- for (fz = rtnl_dereference(table->fn_zone_list);
- fz != NULL;
- fz = rtnl_dereference(fz->fz_next)) {
- int i;
-
- for (i = fz->fz_divisor - 1; i >= 0; i--)
- found += fn_flush_list(fz, i);
- }
- return found;
-}
-
-void fib_free_table(struct fib_table *tb)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fn_zone *fz, *next;
-
- next = table->fn_zone_list;
- while (next != NULL) {
- fz = next;
- next = fz->fz_next;
-
- if (fz->fz_hash != fz->fz_embedded_hash)
- fz_hash_free(fz->fz_hash, fz->fz_divisor);
-
- kfree(fz);
- }
-
- kfree(tb);
-}
-
-static inline int
-fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz,
- struct hlist_head *head)
-{
- struct hlist_node *node;
- struct fib_node *f;
- int i, s_i;
-
- s_i = cb->args[4];
- i = 0;
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
- if (i < s_i)
- goto next;
-
- if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->tb_id,
- fa->fa_type,
- fa->fa_scope,
- f->fn_key,
- fz->fz_order,
- fa->fa_tos,
- fa->fa_info,
- NLM_F_MULTI) < 0) {
- cb->args[4] = i;
- return -1;
- }
-next:
- i++;
- }
- }
- cb->args[4] = i;
- return skb->len;
-}
-
-static inline int
-fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz)
-{
- int h, s_h;
- struct hlist_head *head = rcu_dereference(fz->fz_hash);
-
- if (head == NULL)
- return skb->len;
- s_h = cb->args[3];
- for (h = s_h; h < fz->fz_divisor; h++) {
- if (hlist_empty(head + h))
- continue;
- if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
- cb->args[3] = h;
- return -1;
- }
- memset(&cb->args[4], 0,
- sizeof(cb->args) - 4*sizeof(cb->args[0]));
- }
- cb->args[3] = h;
- return skb->len;
-}
-
-int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
-{
- int m = 0, s_m;
- struct fn_zone *fz;
- struct fn_hash *table = (struct fn_hash *)tb->tb_data;
-
- s_m = cb->args[2];
- rcu_read_lock();
- for (fz = rcu_dereference(table->fn_zone_list);
- fz != NULL;
- fz = rcu_dereference(fz->fz_next), m++) {
- if (m < s_m)
- continue;
- if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
- cb->args[2] = m;
- rcu_read_unlock();
- return -1;
- }
- memset(&cb->args[3], 0,
- sizeof(cb->args) - 3*sizeof(cb->args[0]));
- }
- rcu_read_unlock();
- cb->args[2] = m;
- return skb->len;
-}
-
-void __init fib_hash_init(void)
-{
- fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
- 0, SLAB_PANIC, NULL);
-
- fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
- 0, SLAB_PANIC, NULL);
-
-}
-
-struct fib_table *fib_hash_table(u32 id)
-{
- struct fib_table *tb;
-
- tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
- GFP_KERNEL);
- if (tb == NULL)
- return NULL;
-
- tb->tb_id = id;
- tb->tb_default = -1;
-
- memset(tb->tb_data, 0, sizeof(struct fn_hash));
- return tb;
-}
-
-/* ------------------------------------------------------------------------ */
-#ifdef CONFIG_PROC_FS
-
-struct fib_iter_state {
- struct seq_net_private p;
- struct fn_zone *zone;
- int bucket;
- struct hlist_head *hash_head;
- struct fib_node *fn;
- struct fib_alias *fa;
- loff_t pos;
- unsigned int genid;
- int valid;
-};
-
-static struct fib_alias *fib_get_first(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_table *main_table;
- struct fn_hash *table;
-
- main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
- table = (struct fn_hash *)main_table->tb_data;
-
- iter->bucket = 0;
- iter->hash_head = NULL;
- iter->fn = NULL;
- iter->fa = NULL;
- iter->pos = 0;
- iter->genid = fib_hash_genid;
- iter->valid = 1;
-
- for (iter->zone = rcu_dereference(table->fn_zone_list);
- iter->zone != NULL;
- iter->zone = rcu_dereference(iter->zone->fz_next)) {
- int maxslot;
-
- if (!iter->zone->fz_nent)
- continue;
-
- iter->hash_head = rcu_dereference(iter->zone->fz_hash);
- maxslot = iter->zone->fz_divisor;
-
- for (iter->bucket = 0; iter->bucket < maxslot;
- ++iter->bucket, ++iter->hash_head) {
- struct hlist_node *node;
- struct fib_node *fn;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
- }
-out:
- return iter->fa;
-}
-
-static struct fib_alias *fib_get_next(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_node *fn;
- struct fib_alias *fa;
-
- /* Advance FA, if any. */
- fn = iter->fn;
- fa = iter->fa;
- if (fa) {
- BUG_ON(!fn);
- list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
-
- fa = iter->fa = NULL;
-
- /* Advance FN. */
- if (fn) {
- struct hlist_node *node = &fn->fn_hash;
- hlist_for_each_entry_continue(fn, node, fn_hash) {
- iter->fn = fn;
-
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- fn = iter->fn = NULL;
-
- /* Advance hash chain. */
- if (!iter->zone)
- goto out;
-
- for (;;) {
- struct hlist_node *node;
- int maxslot;
-
- maxslot = iter->zone->fz_divisor;
-
- while (++iter->bucket < maxslot) {
- iter->hash_head++;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- iter->zone = rcu_dereference(iter->zone->fz_next);
-
- if (!iter->zone)
- goto out;
-
- iter->bucket = 0;
- iter->hash_head = rcu_dereference(iter->zone->fz_hash);
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-out:
- iter->pos++;
- return fa;
-}
-
-static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_alias *fa;
-
- if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
- fa = iter->fa;
- pos -= iter->pos;
- } else
- fa = fib_get_first(seq);
-
- if (fa)
- while (pos && (fa = fib_get_next(seq)))
- --pos;
- return pos ? NULL : fa;
-}
-
-static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
-{
- void *v = NULL;
-
- rcu_read_lock();
- if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
- v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
- return v;
-}
-
-static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- ++*pos;
- return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
-}
-
-static void fib_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
-{
- rcu_read_unlock();
-}
-
-static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
-{
- static const unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT,
- [8] = RTF_REJECT,
- };
- unsigned flags = type2flags[type];
-
- if (fi && fi->fib_nh->nh_gw)
- flags |= RTF_GATEWAY;
- if (mask == htonl(0xFFFFFFFF))
- flags |= RTF_HOST;
- flags |= RTF_UP;
- return flags;
-}
-
-/*
- * This outputs /proc/net/route.
- *
- * It always works in backward compatibility mode.
- * The format of the file is not supposed to be changed.
- */
-static int fib_seq_show(struct seq_file *seq, void *v)
-{
- struct fib_iter_state *iter;
- int len;
- __be32 prefix, mask;
- unsigned flags;
- struct fib_node *f;
- struct fib_alias *fa;
- struct fib_info *fi;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
- "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
- "\tWindow\tIRTT");
- goto out;
- }
-
- iter = seq->private;
- f = iter->fn;
- fa = iter->fa;
- fi = fa->fa_info;
- prefix = f->fn_key;
- mask = FZ_MASK(iter->zone);
- flags = fib_flag_trans(fa->fa_type, mask, fi);
- if (fi)
- seq_printf(seq,
- "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
- fi->fib_dev ? fi->fib_dev->name : "*", prefix,
- fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
- mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
- fi->fib_window,
- fi->fib_rtt >> 3, &len);
- else
- seq_printf(seq,
- "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
- prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
-
- seq_printf(seq, "%*s\n", 127 - len, "");
-out:
- return 0;
-}
-
-static const struct seq_operations fib_seq_ops = {
- .start = fib_seq_start,
- .next = fib_seq_next,
- .stop = fib_seq_stop,
- .show = fib_seq_show,
-};
-
-static int fib_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &fib_seq_ops,
- sizeof(struct fib_iter_state));
-}
-
-static const struct file_operations fib_seq_fops = {
- .owner = THIS_MODULE,
- .open = fib_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
-
-int __net_init fib_proc_init(struct net *net)
-{
- if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
- return -ENOMEM;
- return 0;
-}
-
-void __net_exit fib_proc_exit(struct net *net)
-{
- proc_net_remove(net, "route");
-}
-#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c079cc0ec65..4ec323875a0 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -25,9 +25,6 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
}
/* Exported by fib_semantics.c */
-extern int fib_semantic_match(struct list_head *head,
- const struct flowi *flp,
- struct fib_result *res, int prefixlen, int fib_flags);
extern void fib_release_info(struct fib_info *);
extern struct fib_info *fib_create_info(struct fib_config *cfg);
extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
@@ -51,4 +48,11 @@ static inline void fib_result_assign(struct fib_result *res,
res->fi = fi;
}
+struct fib_prop {
+ int error;
+ u8 scope;
+};
+
+extern const struct fib_prop fib_props[RTN_MAX + 1];
+
#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 7981a24f5c7..a53bb1b5b11 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -41,19 +41,19 @@ struct fib4_rule {
__be32 srcmask;
__be32 dst;
__be32 dstmask;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
u32 tclassid;
#endif
};
-#ifdef CONFIG_NET_CLS_ROUTE
-u32 fib_rules_tclass(struct fib_result *res)
+#ifdef CONFIG_IP_ROUTE_CLASSID
+u32 fib_rules_tclass(const struct fib_result *res)
{
return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
}
#endif
-int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
+int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
{
struct fib_lookup_arg arg = {
.result = res,
@@ -61,7 +61,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
};
int err;
- err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg);
+ err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
res->r = arg.rule;
return err;
@@ -95,7 +95,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
if (!tbl)
goto errout;
- err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags);
+ err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
if (err > 0)
err = -EAGAIN;
errout:
@@ -106,14 +106,15 @@ errout:
static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
struct fib4_rule *r = (struct fib4_rule *) rule;
- __be32 daddr = fl->fl4_dst;
- __be32 saddr = fl->fl4_src;
+ struct flowi4 *fl4 = &fl->u.ip4;
+ __be32 daddr = fl4->daddr;
+ __be32 saddr = fl4->saddr;
if (((saddr ^ r->src) & r->srcmask) ||
((daddr ^ r->dst) & r->dstmask))
return 0;
- if (r->tos && (r->tos != fl->fl4_tos))
+ if (r->tos && (r->tos != fl4->flowi4_tos))
return 0;
return 1;
@@ -165,7 +166,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (frh->dst_len)
rule4->dst = nla_get_be32(tb[FRA_DST]);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW])
rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
#endif
@@ -195,7 +196,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->tos && (rule4->tos != frh->tos))
return 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
return 0;
#endif
@@ -224,7 +225,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
if (rule4->src_len)
NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (rule4->tclassid)
NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
#endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 12d3dc3df1b..622ac4c9502 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -49,7 +49,7 @@
static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
-static unsigned int fib_hash_size;
+static unsigned int fib_info_hash_size;
static unsigned int fib_info_cnt;
#define DEVINDEX_HASHBITS 8
@@ -90,11 +90,7 @@ static DEFINE_SPINLOCK(fib_multipath_lock);
#define endfor_nexthops(fi) }
-static const struct
-{
- int error;
- u8 scope;
-} fib_props[RTN_MAX + 1] = {
+const struct fib_prop fib_props[RTN_MAX + 1] = {
[RTN_UNSPEC] = {
.error = 0,
.scope = RT_SCOPE_NOWHERE,
@@ -152,6 +148,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
+ if (fi->fib_metrics != (u32 *) dst_default_metrics)
+ kfree(fi->fib_metrics);
kfree(fi);
}
@@ -200,7 +198,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
nh->nh_weight != onh->nh_weight ||
#endif
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
@@ -221,7 +219,7 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
- unsigned int mask = (fib_hash_size - 1);
+ unsigned int mask = (fib_info_hash_size - 1);
unsigned int val = fi->fib_nhs;
val ^= fi->fib_protocol;
@@ -422,7 +420,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
@@ -476,7 +474,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla && nla_get_be32(nla) != nh->nh_gw)
return 1;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
if (nla && nla_get_u32(nla) != nh->nh_tclassid)
return 1;
@@ -562,16 +560,16 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
}
rcu_read_lock();
{
- struct flowi fl = {
- .fl4_dst = nh->nh_gw,
- .fl4_scope = cfg->fc_scope + 1,
- .oif = nh->nh_oif,
+ struct flowi4 fl4 = {
+ .daddr = nh->nh_gw,
+ .flowi4_scope = cfg->fc_scope + 1,
+ .flowi4_oif = nh->nh_oif,
};
/* It is not necessary, but requires a bit of thinking */
- if (fl.fl4_scope < RT_SCOPE_LINK)
- fl.fl4_scope = RT_SCOPE_LINK;
- err = fib_lookup(net, &fl, &res);
+ if (fl4.flowi4_scope < RT_SCOPE_LINK)
+ fl4.flowi4_scope = RT_SCOPE_LINK;
+ err = fib_lookup(net, &fl4, &res);
if (err) {
rcu_read_unlock();
return err;
@@ -613,14 +611,14 @@ out:
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
- unsigned int mask = (fib_hash_size - 1);
+ unsigned int mask = (fib_info_hash_size - 1);
return ((__force u32)val ^
((__force u32)val >> 7) ^
((__force u32)val >> 14)) & mask;
}
-static struct hlist_head *fib_hash_alloc(int bytes)
+static struct hlist_head *fib_info_hash_alloc(int bytes)
{
if (bytes <= PAGE_SIZE)
return kzalloc(bytes, GFP_KERNEL);
@@ -630,7 +628,7 @@ static struct hlist_head *fib_hash_alloc(int bytes)
get_order(bytes));
}
-static void fib_hash_free(struct hlist_head *hash, int bytes)
+static void fib_info_hash_free(struct hlist_head *hash, int bytes)
{
if (!hash)
return;
@@ -641,18 +639,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
free_pages((unsigned long) hash, get_order(bytes));
}
-static void fib_hash_move(struct hlist_head *new_info_hash,
- struct hlist_head *new_laddrhash,
- unsigned int new_size)
+static void fib_info_hash_move(struct hlist_head *new_info_hash,
+ struct hlist_head *new_laddrhash,
+ unsigned int new_size)
{
struct hlist_head *old_info_hash, *old_laddrhash;
- unsigned int old_size = fib_hash_size;
+ unsigned int old_size = fib_info_hash_size;
unsigned int i, bytes;
spin_lock_bh(&fib_info_lock);
old_info_hash = fib_info_hash;
old_laddrhash = fib_info_laddrhash;
- fib_hash_size = new_size;
+ fib_info_hash_size = new_size;
for (i = 0; i < old_size; i++) {
struct hlist_head *head = &fib_info_hash[i];
@@ -693,8 +691,8 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
spin_unlock_bh(&fib_info_lock);
bytes = old_size * sizeof(struct hlist_head *);
- fib_hash_free(old_info_hash, bytes);
- fib_hash_free(old_laddrhash, bytes);
+ fib_info_hash_free(old_info_hash, bytes);
+ fib_info_hash_free(old_laddrhash, bytes);
}
struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -705,6 +703,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
int nhs = 1;
struct net *net = cfg->fc_nlinfo.nl_net;
+ if (cfg->fc_type > RTN_MAX)
+ goto err_inval;
+
/* Fast check to catch the most weird cases */
if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
goto err_inval;
@@ -718,8 +719,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
#endif
err = -ENOBUFS;
- if (fib_info_cnt >= fib_hash_size) {
- unsigned int new_size = fib_hash_size << 1;
+ if (fib_info_cnt >= fib_info_hash_size) {
+ unsigned int new_size = fib_info_hash_size << 1;
struct hlist_head *new_info_hash;
struct hlist_head *new_laddrhash;
unsigned int bytes;
@@ -727,21 +728,27 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (!new_size)
new_size = 1;
bytes = new_size * sizeof(struct hlist_head *);
- new_info_hash = fib_hash_alloc(bytes);
- new_laddrhash = fib_hash_alloc(bytes);
+ new_info_hash = fib_info_hash_alloc(bytes);
+ new_laddrhash = fib_info_hash_alloc(bytes);
if (!new_info_hash || !new_laddrhash) {
- fib_hash_free(new_info_hash, bytes);
- fib_hash_free(new_laddrhash, bytes);
+ fib_info_hash_free(new_info_hash, bytes);
+ fib_info_hash_free(new_laddrhash, bytes);
} else
- fib_hash_move(new_info_hash, new_laddrhash, new_size);
+ fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
- if (!fib_hash_size)
+ if (!fib_info_hash_size)
goto failure;
}
fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
if (fi == NULL)
goto failure;
+ if (cfg->fc_mx) {
+ fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ if (!fi->fib_metrics)
+ goto failure;
+ } else
+ fi->fib_metrics = (u32 *) dst_default_metrics;
fib_info_cnt++;
fi->fib_net = hold_net(net);
@@ -779,7 +786,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto err_inval;
if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
goto err_inval;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
goto err_inval;
#endif
@@ -792,7 +799,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
nh->nh_oif = cfg->fc_oif;
nh->nh_gw = cfg->fc_gw;
nh->nh_flags = cfg->fc_flags;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -804,6 +811,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
goto err_inval;
goto link_it;
+ } else {
+ switch (cfg->fc_type) {
+ case RTN_UNICAST:
+ case RTN_LOCAL:
+ case RTN_BROADCAST:
+ case RTN_ANYCAST:
+ case RTN_MULTICAST:
+ break;
+ default:
+ goto err_inval;
+ }
}
if (cfg->fc_scope > RT_SCOPE_HOST)
@@ -835,6 +853,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto err_inval;
}
+ change_nexthops(fi) {
+ nexthop_nh->nh_cfg_scope = cfg->fc_scope;
+ nexthop_nh->nh_saddr = inet_select_addr(nexthop_nh->nh_dev,
+ nexthop_nh->nh_gw,
+ nexthop_nh->nh_cfg_scope);
+ } endfor_nexthops(fi)
+
link_it:
ofi = fib_find_info(fi);
if (ofi) {
@@ -880,84 +905,6 @@ failure:
return ERR_PTR(err);
}
-/* Note! fib_semantic_match intentionally uses RCU list functions. */
-int fib_semantic_match(struct list_head *head, const struct flowi *flp,
- struct fib_result *res, int prefixlen, int fib_flags)
-{
- struct fib_alias *fa;
- int nh_sel = 0;
-
- list_for_each_entry_rcu(fa, head, fa_list) {
- int err;
-
- if (fa->fa_tos &&
- fa->fa_tos != flp->fl4_tos)
- continue;
-
- if (fa->fa_scope < flp->fl4_scope)
- continue;
-
- fib_alias_accessed(fa);
-
- err = fib_props[fa->fa_type].error;
- if (err == 0) {
- struct fib_info *fi = fa->fa_info;
-
- if (fi->fib_flags & RTNH_F_DEAD)
- continue;
-
- switch (fa->fa_type) {
- case RTN_UNICAST:
- case RTN_LOCAL:
- case RTN_BROADCAST:
- case RTN_ANYCAST:
- case RTN_MULTICAST:
- for_nexthops(fi) {
- if (nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (!flp->oif || flp->oif == nh->nh_oif)
- break;
- }
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (nhsel < fi->fib_nhs) {
- nh_sel = nhsel;
- goto out_fill_res;
- }
-#else
- if (nhsel < 1)
- goto out_fill_res;
-#endif
- endfor_nexthops(fi);
- continue;
-
- default:
- pr_warning("fib_semantic_match bad type %#x\n",
- fa->fa_type);
- return -EINVAL;
- }
- }
- return err;
- }
- return 1;
-
-out_fill_res:
- res->prefixlen = prefixlen;
- res->nh_sel = nh_sel;
- res->type = fa->fa_type;
- res->scope = fa->fa_scope;
- res->fi = fa->fa_info;
- if (!(fib_flags & FIB_LOOKUP_NOREF))
- atomic_inc(&res->fi->fib_clntref);
- return 0;
-}
-
-/* Find appropriate source address to this destination */
-
-__be32 __fib_res_prefsrc(struct fib_result *res)
-{
- return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
-}
-
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
struct fib_info *fi, unsigned int flags)
@@ -1002,7 +949,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
if (fi->fib_nh->nh_oif)
NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (fi->fib_nh[0].nh_tclassid)
NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
@@ -1027,7 +974,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
if (nh->nh_gw)
NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (nh->nh_tclassid)
NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
@@ -1125,6 +1072,80 @@ int fib_sync_down_dev(struct net_device *dev, int force)
return ret;
}
+/* Must be invoked inside of an RCU protected region. */
+void fib_select_default(struct fib_result *res)
+{
+ struct fib_info *fi = NULL, *last_resort = NULL;
+ struct list_head *fa_head = res->fa_head;
+ struct fib_table *tb = res->table;
+ int order = -1, last_idx = -1;
+ struct fib_alias *fa;
+
+ list_for_each_entry_rcu(fa, fa_head, fa_list) {
+ struct fib_info *next_fi = fa->fa_info;
+
+ if (fa->fa_scope != res->scope ||
+ fa->fa_type != RTN_UNICAST)
+ continue;
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break;
+ if (!next_fi->fib_nh[0].nh_gw ||
+ next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue;
+
+ fib_alias_accessed(fa);
+
+ if (fi == NULL) {
+ if (next_fi != res->fi)
+ break;
+ } else if (!fib_detect_death(fi, order, &last_resort,
+ &last_idx, tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+ fi = next_fi;
+ order++;
+ }
+
+ if (order <= 0 || fi == NULL) {
+ tb->tb_default = -1;
+ goto out;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+ tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+
+ if (last_idx >= 0)
+ fib_result_assign(res, last_resort);
+ tb->tb_default = last_idx;
+out:
+ return;
+}
+
+void fib_update_nh_saddrs(struct net_device *dev)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct fib_nh *nh;
+ unsigned int hash;
+
+ hash = fib_devindex_hashfn(dev->ifindex);
+ head = &fib_info_devhash[hash];
+ hlist_for_each_entry(nh, node, head, nh_hash) {
+ if (nh->nh_dev != dev)
+ continue;
+ nh->nh_saddr = inet_select_addr(nh->nh_dev,
+ nh->nh_gw,
+ nh->nh_cfg_scope);
+ }
+}
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
@@ -1189,7 +1210,7 @@ int fib_sync_up(struct net_device *dev)
* The algorithm is suboptimal, but it provides really
* fair weighted route distribution.
*/
-void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
+void fib_select_multipath(struct fib_result *res)
{
struct fib_info *fi = res->fi;
int w;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0f280348e0f..3d28a35c2e1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -95,7 +95,7 @@ typedef unsigned int t_key;
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
-struct node {
+struct rt_trie_node {
unsigned long parent;
t_key key;
};
@@ -126,7 +126,7 @@ struct tnode {
struct work_struct work;
struct tnode *tnode_free;
};
- struct node *child[0];
+ struct rt_trie_node *child[0];
};
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -151,16 +151,16 @@ struct trie_stat {
};
struct trie {
- struct node *trie;
+ struct rt_trie_node *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats stats;
#endif
};
-static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull);
-static struct node *resize(struct trie *t, struct tnode *tn);
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
@@ -177,12 +177,12 @@ static const int sync_pages = 128;
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
-static inline struct tnode *node_parent(struct node *node)
+static inline struct tnode *node_parent(struct rt_trie_node *node)
{
return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
}
-static inline struct tnode *node_parent_rcu(struct node *node)
+static inline struct tnode *node_parent_rcu(struct rt_trie_node *node)
{
struct tnode *ret = node_parent(node);
@@ -192,22 +192,22 @@ static inline struct tnode *node_parent_rcu(struct node *node)
/* Same as rcu_assign_pointer
* but that macro() assumes that value is a pointer.
*/
-static inline void node_set_parent(struct node *node, struct tnode *ptr)
+static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
smp_wmb();
node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
-static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
+static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i)
{
BUG_ON(i >= 1U << tn->bits);
return tn->child[i];
}
-static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
+static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
{
- struct node *ret = tnode_get_child(tn, i);
+ struct rt_trie_node *ret = tnode_get_child(tn, i);
return rcu_dereference_rtnl(ret);
}
@@ -217,12 +217,12 @@ static inline int tnode_child_length(const struct tnode *tn)
return 1 << tn->bits;
}
-static inline t_key mask_pfx(t_key k, unsigned short l)
+static inline t_key mask_pfx(t_key k, unsigned int l)
{
return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}
-static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
+static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
if (offset < KEYLENGTH)
return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
@@ -378,7 +378,7 @@ static void __tnode_free_rcu(struct rcu_head *head)
{
struct tnode *tn = container_of(head, struct tnode, rcu);
size_t size = sizeof(struct tnode) +
- (sizeof(struct node *) << tn->bits);
+ (sizeof(struct rt_trie_node *) << tn->bits);
if (size <= PAGE_SIZE)
kfree(tn);
@@ -402,7 +402,7 @@ static void tnode_free_safe(struct tnode *tn)
tn->tnode_free = tnode_free_head;
tnode_free_head = tn;
tnode_free_size += sizeof(struct tnode) +
- (sizeof(struct node *) << tn->bits);
+ (sizeof(struct rt_trie_node *) << tn->bits);
}
static void tnode_free_flush(void)
@@ -443,7 +443,7 @@ static struct leaf_info *leaf_info_new(int plen)
static struct tnode *tnode_new(t_key key, int pos, int bits)
{
- size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
+ size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
struct tnode *tn = tnode_alloc(sz);
if (tn) {
@@ -456,7 +456,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
}
pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
- sizeof(struct node) << bits);
+ sizeof(struct rt_trie_node) << bits);
return tn;
}
@@ -465,7 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(const struct tnode *tn, const struct node *n)
+static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
{
if (n == NULL || IS_LEAF(n))
return 0;
@@ -474,7 +474,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
}
static inline void put_child(struct trie *t, struct tnode *tn, int i,
- struct node *n)
+ struct rt_trie_node *n)
{
tnode_put_child_reorg(tn, i, n, -1);
}
@@ -484,10 +484,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i,
* Update the value of full_children and empty_children.
*/
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull)
{
- struct node *chi = tn->child[i];
+ struct rt_trie_node *chi = tn->child[i];
int isfull;
BUG_ON(i >= 1<<tn->bits);
@@ -515,7 +515,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
}
#define MAX_WORK 10
-static struct node *resize(struct trie *t, struct tnode *tn)
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
int i;
struct tnode *old_tn;
@@ -605,7 +605,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Keep root node larger */
- if (!node_parent((struct node *)tn)) {
+ if (!node_parent((struct rt_trie_node *)tn)) {
inflate_threshold_use = inflate_threshold_root;
halve_threshold_use = halve_threshold_root;
} else {
@@ -635,7 +635,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
- return (struct node *) tn;
+ return (struct rt_trie_node *) tn;
/*
* Halve as long as the number of empty children in this
@@ -663,7 +663,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
for (i = 0; i < tnode_child_length(tn); i++) {
- struct node *n;
+ struct rt_trie_node *n;
n = tn->child[i];
if (!n)
@@ -676,7 +676,7 @@ one_child:
return n;
}
}
- return (struct node *) tn;
+ return (struct rt_trie_node *) tn;
}
static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -723,14 +723,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
goto nomem;
}
- put_child(t, tn, 2*i, (struct node *) left);
- put_child(t, tn, 2*i+1, (struct node *) right);
+ put_child(t, tn, 2*i, (struct rt_trie_node *) left);
+ put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
}
}
for (i = 0; i < olen; i++) {
struct tnode *inode;
- struct node *node = tnode_get_child(oldtnode, i);
+ struct rt_trie_node *node = tnode_get_child(oldtnode, i);
struct tnode *left, *right;
int size, j;
@@ -825,7 +825,7 @@ nomem:
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
struct tnode *oldtnode = tn;
- struct node *left, *right;
+ struct rt_trie_node *left, *right;
int i;
int olen = tnode_child_length(tn);
@@ -856,7 +856,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (!newn)
goto nomem;
- put_child(t, tn, i/2, (struct node *)newn);
+ put_child(t, tn, i/2, (struct rt_trie_node *)newn);
}
}
@@ -958,7 +958,7 @@ fib_find_node(struct trie *t, u32 key)
{
int pos;
struct tnode *tn;
- struct node *n;
+ struct rt_trie_node *n;
pos = 0;
n = rcu_dereference_rtnl(t->trie);
@@ -993,17 +993,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
key = tn->key;
- while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
+ while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
tn = (struct tnode *) resize(t, (struct tnode *)tn);
tnode_put_child_reorg((struct tnode *)tp, cindex,
- (struct node *)tn, wasfull);
+ (struct rt_trie_node *)tn, wasfull);
- tp = node_parent((struct node *) tn);
+ tp = node_parent((struct rt_trie_node *) tn);
if (!tp)
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
if (!tp)
@@ -1015,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
if (IS_TNODE(tn))
tn = (struct tnode *)resize(t, (struct tnode *)tn);
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
}
@@ -1025,7 +1025,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
int pos, newpos;
struct tnode *tp = NULL, *tn = NULL;
- struct node *n;
+ struct rt_trie_node *n;
struct leaf *l;
int missbit;
struct list_head *fa_head = NULL;
@@ -1111,10 +1111,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
if (t->trie && n == NULL) {
/* Case 2: n is NULL, and will just insert a new leaf */
- node_set_parent((struct node *)l, tp);
+ node_set_parent((struct rt_trie_node *)l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
+ put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
@@ -1141,18 +1141,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
return NULL;
}
- node_set_parent((struct node *)tn, tp);
+ node_set_parent((struct rt_trie_node *)tn, tp);
missbit = tkey_extract_bits(key, newpos, 1);
- put_child(t, tn, missbit, (struct node *)l);
+ put_child(t, tn, missbit, (struct rt_trie_node *)l);
put_child(t, tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(t, (struct tnode *)tp, cindex,
- (struct node *)tn);
+ (struct rt_trie_node *)tn);
} else {
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tp = tn;
}
}
@@ -1340,8 +1340,8 @@ err:
}
/* should be called with rcu_read_lock */
-static int check_leaf(struct trie *t, struct leaf *l,
- t_key key, const struct flowi *flp,
+static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
+ t_key key, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
struct leaf_info *li;
@@ -1349,40 +1349,75 @@ static int check_leaf(struct trie *t, struct leaf *l,
struct hlist_node *node;
hlist_for_each_entry_rcu(li, node, hhead, hlist) {
- int err;
+ struct fib_alias *fa;
int plen = li->plen;
__be32 mask = inet_make_mask(plen);
if (l->key != (key & ntohl(mask)))
continue;
- err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ int nhsel, err;
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fa->fa_scope < flp->flowi4_scope)
+ continue;
+ fib_alias_accessed(fa);
+ err = fib_props[fa->fa_type].error;
+ if (err) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- if (err <= 0)
- t->stats.semantic_match_passed++;
- else
- t->stats.semantic_match_miss++;
+ t->stats.semantic_match_miss++;
+#endif
+ return 1;
+ }
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+ const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+ if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+ continue;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.semantic_match_passed++;
+#endif
+ res->prefixlen = plen;
+ res->nh_sel = nhsel;
+ res->type = fa->fa_type;
+ res->scope = fa->fa_scope;
+ res->fi = fi;
+ res->table = tb;
+ res->fa_head = &li->falh;
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&res->fi->fib_clntref);
+ return 0;
+ }
+ }
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.semantic_match_miss++;
#endif
- if (err <= 0)
- return err;
}
return 1;
}
-int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
+int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
struct trie *t = (struct trie *) tb->tb_data;
int ret;
- struct node *n;
+ struct rt_trie_node *n;
struct tnode *pn;
- int pos, bits;
- t_key key = ntohl(flp->fl4_dst);
- int chopped_off;
+ unsigned int pos, bits;
+ t_key key = ntohl(flp->daddr);
+ unsigned int chopped_off;
t_key cindex = 0;
- int current_prefix_length = KEYLENGTH;
+ unsigned int current_prefix_length = KEYLENGTH;
struct tnode *cn;
t_key pref_mismatch;
@@ -1398,7 +1433,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
/* Just a leaf? */
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
goto found;
}
@@ -1423,7 +1458,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
}
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
if (ret > 0)
goto backtrace;
goto found;
@@ -1541,7 +1576,7 @@ backtrace:
if (chopped_off <= pn->bits) {
cindex &= ~(1 << (chopped_off-1));
} else {
- struct tnode *parent = node_parent_rcu((struct node *) pn);
+ struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
if (!parent)
goto failed;
@@ -1568,7 +1603,7 @@ found:
*/
static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
- struct tnode *tp = node_parent((struct node *) l);
+ struct tnode *tp = node_parent((struct rt_trie_node *) l);
pr_debug("entering trie_leaf_remove(%p)\n", l);
@@ -1706,7 +1741,7 @@ static int trie_flush_leaf(struct leaf *l)
* Scan for the next right leaf starting at node p->child[idx]
* Since we have back pointer, no recursion necessary.
*/
-static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
+static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
{
do {
t_key idx;
@@ -1732,7 +1767,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
}
/* Node empty, walk back up to parent */
- c = (struct node *) p;
+ c = (struct rt_trie_node *) p;
} while ((p = node_parent_rcu(c)) != NULL);
return NULL; /* Root of trie */
@@ -1753,7 +1788,7 @@ static struct leaf *trie_firstleaf(struct trie *t)
static struct leaf *trie_nextleaf(struct leaf *l)
{
- struct node *c = (struct node *) l;
+ struct rt_trie_node *c = (struct rt_trie_node *) l;
struct tnode *p = node_parent_rcu(c);
if (!p)
@@ -1802,80 +1837,6 @@ void fib_free_table(struct fib_table *tb)
kfree(tb);
}
-void fib_table_select_default(struct fib_table *tb,
- const struct flowi *flp,
- struct fib_result *res)
-{
- struct trie *t = (struct trie *) tb->tb_data;
- int order, last_idx;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fib_alias *fa = NULL;
- struct list_head *fa_head;
- struct leaf *l;
-
- last_idx = -1;
- last_resort = NULL;
- order = -1;
-
- rcu_read_lock();
-
- l = fib_find_node(t, 0);
- if (!l)
- goto out;
-
- fa_head = get_fa_head(l, 0);
- if (!fa_head)
- goto out;
-
- if (list_empty(fa_head))
- goto out;
-
- list_for_each_entry_rcu(fa, fa_head, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
-
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
- continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
-
- fib_alias_accessed(fa);
-
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- fi = next_fi;
- order++;
- }
- if (order <= 0 || fi == NULL) {
- tb->tb_default = -1;
- goto out;
- }
-
- if (!fib_detect_death(fi, order, &last_resort, &last_idx,
- tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- if (last_idx >= 0)
- fib_result_assign(res, last_resort);
- tb->tb_default = last_idx;
-out:
- rcu_read_unlock();
-}
-
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
@@ -1990,7 +1951,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
return skb->len;
}
-void __init fib_hash_init(void)
+void __init fib_trie_init(void)
{
fn_alias_kmem = kmem_cache_create("ip_fib_alias",
sizeof(struct fib_alias),
@@ -2003,8 +1964,7 @@ void __init fib_hash_init(void)
}
-/* Fix more generic FIB names for init later */
-struct fib_table *fib_hash_table(u32 id)
+struct fib_table *fib_trie_table(u32 id)
{
struct fib_table *tb;
struct trie *t;
@@ -2036,7 +1996,7 @@ struct fib_trie_iter {
unsigned int depth;
};
-static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
{
struct tnode *tn = iter->tnode;
unsigned int cindex = iter->index;
@@ -2050,7 +2010,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
iter->tnode, iter->index, iter->depth);
rescan:
while (cindex < (1<<tn->bits)) {
- struct node *n = tnode_get_child_rcu(tn, cindex);
+ struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
if (n) {
if (IS_LEAF(n)) {
@@ -2069,7 +2029,7 @@ rescan:
}
/* Current node exhausted, pop back up */
- p = node_parent_rcu((struct node *)tn);
+ p = node_parent_rcu((struct rt_trie_node *)tn);
if (p) {
cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
tn = p;
@@ -2081,10 +2041,10 @@ rescan:
return NULL;
}
-static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
+static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
struct trie *t)
{
- struct node *n;
+ struct rt_trie_node *n;
if (!t)
return NULL;
@@ -2108,7 +2068,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- struct node *n;
+ struct rt_trie_node *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
@@ -2181,7 +2141,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_putc(seq, '\n');
seq_printf(seq, "\tPointers: %u\n", pointers);
- bytes += sizeof(struct node *) * pointers;
+ bytes += sizeof(struct rt_trie_node *) * pointers;
seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}
@@ -2262,7 +2222,7 @@ static const struct file_operations fib_triestat_fops = {
.release = single_release_net,
};
-static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
struct fib_trie_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -2275,7 +2235,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
- struct node *n;
+ struct rt_trie_node *n;
for (n = fib_trie_get_first(iter,
(struct trie *) tb->tb_data);
@@ -2304,7 +2264,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct fib_table *tb = iter->tb;
struct hlist_node *tb_node;
unsigned int h;
- struct node *n;
+ struct rt_trie_node *n;
++*pos;
/* next node in same table */
@@ -2390,7 +2350,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
- struct node *n = v;
+ struct rt_trie_node *n = v;
if (!node_parent_rcu(n))
fib_table_print(seq, iter->tb);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4aa1b7f01ea..a91dc161108 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk)
* Send an ICMP frame.
*/
-/*
- * Check transmit rate limitation for given message.
- * The rate information is held in the destination cache now.
- * This function is generic and could be used for other purposes
- * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
- *
- * Note that the same dst_entry fields are modified by functions in
- * route.c too, but these work for packet destinations while xrlim_allow
- * works for icmp destinations. This means the rate limiting information
- * for one "ip object" is shared - and these ICMPs are twice limited:
- * by source and by destination.
- *
- * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
- * SHOULD allow setting of rate limits
- *
- * Shared between ICMPv4 and ICMPv6.
- */
-#define XRLIM_BURST_FACTOR 6
-int xrlim_allow(struct dst_entry *dst, int timeout)
-{
- unsigned long now, token = dst->rate_tokens;
- int rc = 0;
-
- now = jiffies;
- token += now - dst->rate_last;
- dst->rate_last = now;
- if (token > XRLIM_BURST_FACTOR * timeout)
- token = XRLIM_BURST_FACTOR * timeout;
- if (token >= timeout) {
- token -= timeout;
- rc = 1;
- }
- dst->rate_tokens = token;
- return rc;
-}
-EXPORT_SYMBOL(xrlim_allow);
-
-static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
int type, int code)
{
struct dst_entry *dst = &rt->dst;
- int rc = 1;
+ bool rc = true;
if (type > NR_ICMP_TYPES)
goto out;
@@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
goto out;
/* Limit if icmp type is enabled in ratemask. */
- if ((1 << type) & net->ipv4.sysctl_icmp_ratemask)
- rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit);
+ if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ rc = inet_peer_xrlim_allow(rt->peer,
+ net->ipv4.sysctl_icmp_ratelimit);
+ }
out:
return rc;
}
@@ -386,12 +353,15 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
daddr = icmp_param->replyopts.faddr;
}
{
- struct flowi fl = { .fl4_dst= daddr,
- .fl4_src = rt->rt_spec_dst,
- .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
- .proto = IPPROTO_ICMP };
- security_skb_classify_flow(skb, &fl);
- if (ip_route_output_key(net, &rt, &fl))
+ struct flowi4 fl4 = {
+ .daddr = daddr,
+ .saddr = rt->rt_spec_dst,
+ .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
+ .flowi4_proto = IPPROTO_ICMP,
+ };
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
goto out_unlock;
}
if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
@@ -402,6 +372,97 @@ out_unlock:
icmp_xmit_unlock(sk);
}
+static struct rtable *icmp_route_lookup(struct net *net, struct sk_buff *skb_in,
+ struct iphdr *iph,
+ __be32 saddr, u8 tos,
+ int type, int code,
+ struct icmp_bxm *param)
+{
+ struct flowi4 fl4 = {
+ .daddr = (param->replyopts.srr ?
+ param->replyopts.faddr : iph->saddr),
+ .saddr = saddr,
+ .flowi4_tos = RT_TOS(tos),
+ .flowi4_proto = IPPROTO_ICMP,
+ .fl4_icmp_type = type,
+ .fl4_icmp_code = code,
+ };
+ struct rtable *rt, *rt2;
+ int err;
+
+ security_skb_classify_flow(skb_in, flowi4_to_flowi(&fl4));
+ rt = __ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return rt;
+
+ /* No need to clone since we're just using its address. */
+ rt2 = rt;
+
+ if (!fl4.saddr)
+ fl4.saddr = rt->rt_src;
+
+ rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
+ flowi4_to_flowi(&fl4), NULL, 0);
+ if (!IS_ERR(rt)) {
+ if (rt != rt2)
+ return rt;
+ } else if (PTR_ERR(rt) == -EPERM) {
+ rt = NULL;
+ } else
+ return rt;
+
+ err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4), AF_INET);
+ if (err)
+ goto relookup_failed;
+
+ if (inet_addr_type(net, fl4.saddr) == RTN_LOCAL) {
+ rt2 = __ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt2))
+ err = PTR_ERR(rt2);
+ } else {
+ struct flowi4 fl4_2 = {};
+ unsigned long orefdst;
+
+ fl4_2.daddr = fl4.saddr;
+ rt2 = ip_route_output_key(net, &fl4_2);
+ if (IS_ERR(rt2)) {
+ err = PTR_ERR(rt2);
+ goto relookup_failed;
+ }
+ /* Ugh! */
+ orefdst = skb_in->_skb_refdst; /* save old refdst */
+ err = ip_route_input(skb_in, fl4.daddr, fl4.saddr,
+ RT_TOS(tos), rt2->dst.dev);
+
+ dst_release(&rt2->dst);
+ rt2 = skb_rtable(skb_in);
+ skb_in->_skb_refdst = orefdst; /* restore old refdst */
+ }
+
+ if (err)
+ goto relookup_failed;
+
+ rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
+ flowi4_to_flowi(&fl4), NULL,
+ XFRM_LOOKUP_ICMP);
+ if (!IS_ERR(rt2)) {
+ dst_release(&rt->dst);
+ rt = rt2;
+ } else if (PTR_ERR(rt2) == -EPERM) {
+ if (rt)
+ dst_release(&rt->dst);
+ return rt2;
+ } else {
+ err = PTR_ERR(rt2);
+ goto relookup_failed;
+ }
+ return rt;
+
+relookup_failed:
+ if (rt)
+ return rt;
+ return ERR_PTR(err);
+}
/*
* Send an ICMP message in response to a situation
@@ -507,7 +568,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
rcu_read_lock();
if (rt_is_input_route(rt) &&
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
- dev = dev_get_by_index_rcu(net, rt->fl.iif);
+ dev = dev_get_by_index_rcu(net, rt->rt_iif);
if (dev)
saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -539,86 +600,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
ipc.opt = &icmp_param.replyopts;
ipc.tx_flags = 0;
- {
- struct flowi fl = {
- .fl4_dst = icmp_param.replyopts.srr ?
- icmp_param.replyopts.faddr : iph->saddr,
- .fl4_src = saddr,
- .fl4_tos = RT_TOS(tos),
- .proto = IPPROTO_ICMP,
- .fl_icmp_type = type,
- .fl_icmp_code = code,
- };
- int err;
- struct rtable *rt2;
-
- security_skb_classify_flow(skb_in, &fl);
- if (__ip_route_output_key(net, &rt, &fl))
- goto out_unlock;
-
- /* No need to clone since we're just using its address. */
- rt2 = rt;
-
- if (!fl.nl_u.ip4_u.saddr)
- fl.nl_u.ip4_u.saddr = rt->rt_src;
-
- err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
- switch (err) {
- case 0:
- if (rt != rt2)
- goto route_done;
- break;
- case -EPERM:
- rt = NULL;
- break;
- default:
- goto out_unlock;
- }
-
- if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET))
- goto relookup_failed;
-
- if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL)
- err = __ip_route_output_key(net, &rt2, &fl);
- else {
- struct flowi fl2 = {};
- unsigned long orefdst;
-
- fl2.fl4_dst = fl.fl4_src;
- if (ip_route_output_key(net, &rt2, &fl2))
- goto relookup_failed;
-
- /* Ugh! */
- orefdst = skb_in->_skb_refdst; /* save old refdst */
- err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
- RT_TOS(tos), rt2->dst.dev);
-
- dst_release(&rt2->dst);
- rt2 = skb_rtable(skb_in);
- skb_in->_skb_refdst = orefdst; /* restore old refdst */
- }
-
- if (err)
- goto relookup_failed;
-
- err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
- XFRM_LOOKUP_ICMP);
- switch (err) {
- case 0:
- dst_release(&rt->dst);
- rt = rt2;
- break;
- case -EPERM:
- goto ende;
- default:
-relookup_failed:
- if (!rt)
- goto out_unlock;
- break;
- }
- }
+ rt = icmp_route_lookup(net, skb_in, iph, saddr, tos,
+ type, code, &icmp_param);
+ if (IS_ERR(rt))
+ goto out_unlock;
-route_done:
if (!icmpv4_xrlim_allow(net, rt, type, code))
goto ende;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index e0e77e297de..1fd3d9ce839 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -321,14 +321,12 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
}
igmp_skb_size(skb) = size;
- {
- struct flowi fl = { .oif = dev->ifindex,
- .fl4_dst = IGMPV3_ALL_MCR,
- .proto = IPPROTO_IGMP };
- if (ip_route_output_key(net, &rt, &fl)) {
- kfree_skb(skb);
- return NULL;
- }
+ rt = ip_route_output_ports(net, NULL, IGMPV3_ALL_MCR, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt)) {
+ kfree_skb(skb);
+ return NULL;
}
if (rt->rt_src == 0) {
kfree_skb(skb);
@@ -666,13 +664,12 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
else
dst = group;
- {
- struct flowi fl = { .oif = dev->ifindex,
- .fl4_dst = dst,
- .proto = IPPROTO_IGMP };
- if (ip_route_output_key(net, &rt, &fl))
- return -1;
- }
+ rt = ip_route_output_ports(net, NULL, dst, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt))
+ return -1;
+
if (rt->rt_src == 0) {
ip_rt_put(rt);
return -1;
@@ -1439,8 +1436,6 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
/* RTNL is locked */
static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
{
- struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr };
- struct rtable *rt;
struct net_device *dev = NULL;
struct in_device *idev = NULL;
@@ -1454,9 +1449,14 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
return NULL;
}
- if (!dev && !ip_route_output_key(net, &rt, &fl)) {
- dev = rt->dst.dev;
- ip_rt_put(rt);
+ if (!dev) {
+ struct rtable *rt = ip_route_output(net,
+ imr->imr_multiaddr.s_addr,
+ 0, 0, 0);
+ if (!IS_ERR(rt)) {
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
}
if (dev) {
imr->imr_ifindex = dev->ifindex;
@@ -2329,13 +2329,13 @@ void ip_mc_drop_socket(struct sock *sk)
rtnl_unlock();
}
-int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+/* called with rcu_read_lock() */
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
{
struct ip_mc_list *im;
struct ip_sf_list *psf;
int rv = 0;
- rcu_read_lock();
for_each_pmc_rcu(in_dev, im) {
if (im->multiaddr == mc_addr)
break;
@@ -2357,7 +2357,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
} else
rv = 1; /* unspecified source; tentatively allow */
}
- rcu_read_unlock();
return rv;
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 97e5fb76526..6c0b7f4a3d7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -356,20 +356,23 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
struct rtable *rt;
const struct inet_request_sock *ireq = inet_rsk(req);
struct ip_options *opt = inet_rsk(req)->opt;
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .mark = sk->sk_mark,
- .fl4_dst = ((opt && opt->srr) ?
- opt->faddr : ireq->rmt_addr),
- .fl4_src = ireq->loc_addr,
- .fl4_tos = RT_CONN_FLAGS(sk),
- .proto = sk->sk_protocol,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = inet_sk(sk)->inet_sport,
- .fl_ip_dport = ireq->rmt_port };
+ struct flowi4 fl4 = {
+ .flowi4_oif = sk->sk_bound_dev_if,
+ .flowi4_mark = sk->sk_mark,
+ .daddr = ((opt && opt->srr) ?
+ opt->faddr : ireq->rmt_addr),
+ .saddr = ireq->loc_addr,
+ .flowi4_tos = RT_CONN_FLAGS(sk),
+ .flowi4_proto = sk->sk_protocol,
+ .flowi4_flags = inet_sk_flowi_flags(sk),
+ .fl4_sport = inet_sk(sk)->inet_sport,
+ .fl4_dport = ireq->rmt_port,
+ };
struct net *net = sock_net(sk);
- security_req_classify_flow(req, &fl);
- if (ip_route_output_flow(net, &rt, &fl, sk, 0))
+ security_req_classify_flow(req, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt))
goto no_route;
if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
goto route_err;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 2746c1fa641..2ada17129fc 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -858,7 +858,7 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
nlmsg_len(nlh) < hdrlen)
return -EINVAL;
- if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(nlh, hdrlen)) {
struct nlattr *attr;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c5af909cf70..3c8dfa16614 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -505,7 +505,9 @@ restart:
}
rcu_read_unlock();
+ local_bh_disable();
inet_twsk_deschedule(tw, twdr);
+ local_bh_enable();
inet_twsk_put(tw);
goto restart_rcu;
}
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index d9bc85751c7..dd1b20eca1a 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -81,19 +81,19 @@ static const struct inet_peer peer_fake_node = {
struct inet_peer_base {
struct inet_peer __rcu *root;
- spinlock_t lock;
+ seqlock_t lock;
int total;
};
static struct inet_peer_base v4_peers = {
.root = peer_avl_empty_rcu,
- .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock),
+ .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
.total = 0,
};
static struct inet_peer_base v6_peers = {
.root = peer_avl_empty_rcu,
- .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock),
+ .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
.total = 0,
};
@@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a,
int i, n = (a->family == AF_INET ? 1 : 4);
for (i = 0; i < n; i++) {
- if (a->a6[i] == b->a6[i])
+ if (a->addr.a6[i] == b->addr.a6[i])
continue;
- if (a->a6[i] < b->a6[i])
+ if (a->addr.a6[i] < b->addr.a6[i])
return -1;
return 1;
}
@@ -177,6 +177,9 @@ static int addr_compare(const struct inetpeer_addr *a,
return 0;
}
+#define rcu_deref_locked(X, BASE) \
+ rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
+
/*
* Called with local BH disabled and the pool lock held.
*/
@@ -187,8 +190,7 @@ static int addr_compare(const struct inetpeer_addr *a,
\
stackptr = _stack; \
*stackptr++ = &_base->root; \
- for (u = rcu_dereference_protected(_base->root, \
- lockdep_is_held(&_base->lock)); \
+ for (u = rcu_deref_locked(_base->root, _base); \
u != peer_avl_empty; ) { \
int cmp = addr_compare(_daddr, &u->daddr); \
if (cmp == 0) \
@@ -198,23 +200,22 @@ static int addr_compare(const struct inetpeer_addr *a,
else \
v = &u->avl_right; \
*stackptr++ = v; \
- u = rcu_dereference_protected(*v, \
- lockdep_is_held(&_base->lock)); \
+ u = rcu_deref_locked(*v, _base); \
} \
u; \
})
/*
- * Called with rcu_read_lock_bh()
+ * Called with rcu_read_lock()
* Because we hold no lock against a writer, its quite possible we fall
* in an endless loop.
* But every pointer we follow is guaranteed to be valid thanks to RCU.
* We exit from this function if number of links exceeds PEER_MAXDEPTH
*/
-static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
- struct inet_peer_base *base)
+static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
+ struct inet_peer_base *base)
{
- struct inet_peer *u = rcu_dereference_bh(base->root);
+ struct inet_peer *u = rcu_dereference(base->root);
int count = 0;
while (u != peer_avl_empty) {
@@ -230,9 +231,9 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
return u;
}
if (cmp == -1)
- u = rcu_dereference_bh(u->avl_left);
+ u = rcu_dereference(u->avl_left);
else
- u = rcu_dereference_bh(u->avl_right);
+ u = rcu_dereference(u->avl_right);
if (unlikely(++count == PEER_MAXDEPTH))
break;
}
@@ -246,13 +247,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
struct inet_peer __rcu **v; \
*stackptr++ = &start->avl_left; \
v = &start->avl_left; \
- for (u = rcu_dereference_protected(*v, \
- lockdep_is_held(&base->lock)); \
+ for (u = rcu_deref_locked(*v, base); \
u->avl_right != peer_avl_empty_rcu; ) { \
v = &u->avl_right; \
*stackptr++ = v; \
- u = rcu_dereference_protected(*v, \
- lockdep_is_held(&base->lock)); \
+ u = rcu_deref_locked(*v, base); \
} \
u; \
})
@@ -271,21 +270,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
while (stackend > stack) {
nodep = *--stackend;
- node = rcu_dereference_protected(*nodep,
- lockdep_is_held(&base->lock));
- l = rcu_dereference_protected(node->avl_left,
- lockdep_is_held(&base->lock));
- r = rcu_dereference_protected(node->avl_right,
- lockdep_is_held(&base->lock));
+ node = rcu_deref_locked(*nodep, base);
+ l = rcu_deref_locked(node->avl_left, base);
+ r = rcu_deref_locked(node->avl_right, base);
lh = node_height(l);
rh = node_height(r);
if (lh > rh + 1) { /* l: RH+2 */
struct inet_peer *ll, *lr, *lrl, *lrr;
int lrh;
- ll = rcu_dereference_protected(l->avl_left,
- lockdep_is_held(&base->lock));
- lr = rcu_dereference_protected(l->avl_right,
- lockdep_is_held(&base->lock));
+ ll = rcu_deref_locked(l->avl_left, base);
+ lr = rcu_deref_locked(l->avl_right, base);
lrh = node_height(lr);
if (lrh <= node_height(ll)) { /* ll: RH+1 */
RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
@@ -296,10 +290,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
l->avl_height = node->avl_height + 1;
RCU_INIT_POINTER(*nodep, l);
} else { /* ll: RH, lr: RH+1 */
- lrl = rcu_dereference_protected(lr->avl_left,
- lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */
- lrr = rcu_dereference_protected(lr->avl_right,
- lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */
+ lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
+ lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
node->avl_height = rh + 1; /* node: RH+1 */
@@ -314,10 +306,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
} else if (rh > lh + 1) { /* r: LH+2 */
struct inet_peer *rr, *rl, *rlr, *rll;
int rlh;
- rr = rcu_dereference_protected(r->avl_right,
- lockdep_is_held(&base->lock));
- rl = rcu_dereference_protected(r->avl_left,
- lockdep_is_held(&base->lock));
+ rr = rcu_deref_locked(r->avl_right, base);
+ rl = rcu_deref_locked(r->avl_left, base);
rlh = node_height(rl);
if (rlh <= node_height(rr)) { /* rr: LH+1 */
RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
@@ -328,10 +318,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
r->avl_height = node->avl_height + 1;
RCU_INIT_POINTER(*nodep, r);
} else { /* rr: RH, rl: RH+1 */
- rlr = rcu_dereference_protected(rl->avl_right,
- lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */
- rll = rcu_dereference_protected(rl->avl_left,
- lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */
+ rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
+ rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
node->avl_height = lh + 1; /* node: LH+1 */
@@ -372,7 +360,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
do_free = 0;
- spin_lock_bh(&base->lock);
+ write_seqlock_bh(&base->lock);
/* Check the reference counter. It was artificially incremented by 1
* in cleanup() function to prevent sudden disappearing. If we can
* atomically (because of lockless readers) take this last reference,
@@ -392,8 +380,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
/* look for a node to insert instead of p */
struct inet_peer *t;
t = lookup_rightempty(p, base);
- BUG_ON(rcu_dereference_protected(*stackptr[-1],
- lockdep_is_held(&base->lock)) != t);
+ BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
**--stackptr = t->avl_left;
/* t is removed, t->daddr > x->daddr for any
* x in p->avl_left subtree.
@@ -409,10 +396,10 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
base->total--;
do_free = 1;
}
- spin_unlock_bh(&base->lock);
+ write_sequnlock_bh(&base->lock);
if (do_free)
- call_rcu_bh(&p->rcu, inetpeer_free_rcu);
+ call_rcu(&p->rcu, inetpeer_free_rcu);
else
/* The node is used again. Decrease the reference counter
* back. The loop "cleanup -> unlink_from_unused
@@ -475,15 +462,19 @@ static int cleanup_once(unsigned long ttl)
struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
{
struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
- struct inet_peer_base *base = family_to_base(AF_INET);
+ struct inet_peer_base *base = family_to_base(daddr->family);
struct inet_peer *p;
+ unsigned int sequence;
+ int invalidated;
/* Look up for the address quickly, lockless.
* Because of a concurrent writer, we might not find an existing entry.
*/
- rcu_read_lock_bh();
- p = lookup_rcu_bh(daddr, base);
- rcu_read_unlock_bh();
+ rcu_read_lock();
+ sequence = read_seqbegin(&base->lock);
+ p = lookup_rcu(daddr, base);
+ invalidated = read_seqretry(&base->lock, sequence);
+ rcu_read_unlock();
if (p) {
/* The existing node has been found.
@@ -493,14 +484,18 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
return p;
}
+ /* If no writer did a change during our lookup, we can return early. */
+ if (!create && !invalidated)
+ return NULL;
+
/* retry an exact lookup, taking the lock before.
* At least, nodes should be hot in our cache.
*/
- spin_lock_bh(&base->lock);
+ write_seqlock_bh(&base->lock);
p = lookup(daddr, stack, base);
if (p != peer_avl_empty) {
atomic_inc(&p->refcnt);
- spin_unlock_bh(&base->lock);
+ write_sequnlock_bh(&base->lock);
/* Remove the entry from unused list if it was there. */
unlink_from_unused(p);
return p;
@@ -510,8 +505,14 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
p->daddr = *daddr;
atomic_set(&p->refcnt, 1);
atomic_set(&p->rid, 0);
- atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4));
+ atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
p->tcp_ts_stamp = 0;
+ p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+ p->rate_tokens = 0;
+ p->rate_last = 0;
+ p->pmtu_expires = 0;
+ p->pmtu_orig = 0;
+ memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
INIT_LIST_HEAD(&p->unused);
@@ -519,7 +520,7 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
link_to_pool(p, base);
base->total++;
}
- spin_unlock_bh(&base->lock);
+ write_sequnlock_bh(&base->lock);
if (base->total >= inet_peer_threshold)
/* Remove one less-recently-used entry. */
@@ -579,3 +580,44 @@ void inet_putpeer(struct inet_peer *p)
local_bh_enable();
}
EXPORT_SYMBOL_GPL(inet_putpeer);
+
+/*
+ * Check transmit rate limitation for given message.
+ * The rate information is held in the inet_peer entries now.
+ * This function is generic and could be used for other purposes
+ * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ * Note that the same inet_peer fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
+ * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ * SHOULD allow setting of rate limits
+ *
+ * Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
+{
+ unsigned long now, token;
+ bool rc = false;
+
+ if (!peer)
+ return true;
+
+ token = peer->rate_tokens;
+ now = jiffies;
+ token += now - peer->rate_last;
+ peer->rate_last = now;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ if (token >= timeout) {
+ token -= timeout;
+ rc = true;
+ }
+ peer->rate_tokens = token;
+ return rc;
+}
+EXPORT_SYMBOL(inet_peer_xrlim_allow);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index eb68a0e34e4..da5941f18c3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -769,18 +769,12 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
}
- {
- struct flowi fl = {
- .oif = tunnel->parms.link,
- .fl4_dst = dst,
- .fl4_src = tiph->saddr,
- .fl4_tos = RT_TOS(tos),
- .fl_gre_key = tunnel->parms.o_key
- };
- if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error;
- }
+ rt = ip_route_output_gre(dev_net(dev), dst, tiph->saddr,
+ tunnel->parms.o_key, RT_TOS(tos),
+ tunnel->parms.link);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
}
tdev = rt->dst.dev;
@@ -944,17 +938,13 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
/* Guess output device to choose reasonable mtu and needed_headroom */
if (iph->daddr) {
- struct flowi fl = {
- .oif = tunnel->parms.link,
- .fl4_dst = iph->daddr,
- .fl4_src = iph->saddr,
- .fl4_tos = RT_TOS(iph->tos),
- .proto = IPPROTO_GRE,
- .fl_gre_key = tunnel->parms.o_key
- };
- struct rtable *rt;
-
- if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
+ struct rtable *rt = ip_route_output_gre(dev_net(dev),
+ iph->daddr, iph->saddr,
+ tunnel->parms.o_key,
+ RT_TOS(iph->tos),
+ tunnel->parms.link);
+
+ if (!IS_ERR(rt)) {
tdev = rt->dst.dev;
ip_rt_put(rt);
}
@@ -1206,17 +1196,14 @@ static int ipgre_open(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
if (ipv4_is_multicast(t->parms.iph.daddr)) {
- struct flowi fl = {
- .oif = t->parms.link,
- .fl4_dst = t->parms.iph.daddr,
- .fl4_src = t->parms.iph.saddr,
- .fl4_tos = RT_TOS(t->parms.iph.tos),
- .proto = IPPROTO_GRE,
- .fl_gre_key = t->parms.o_key
- };
- struct rtable *rt;
-
- if (ip_route_output_key(dev_net(dev), &rt, &fl))
+ struct rtable *rt = ip_route_output_gre(dev_net(dev),
+ t->parms.iph.daddr,
+ t->parms.iph.saddr,
+ t->parms.o_key,
+ RT_TOS(t->parms.iph.tos),
+ t->parms.link);
+
+ if (IS_ERR(rt))
return -EADDRNOTAVAIL;
dev = rt->dst.dev;
ip_rt_put(rt);
@@ -1764,4 +1751,4 @@ module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
-MODULE_ALIAS("gre0");
+MODULE_ALIAS_NETDEV("gre0");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d859bcc26cb..d7b2b0987a3 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
}
}
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 04c7b3ba6b3..67f241b9764 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -339,25 +339,19 @@ int ip_queue_xmit(struct sk_buff *skb)
if(opt && opt->srr)
daddr = opt->faddr;
- {
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .mark = sk->sk_mark,
- .fl4_dst = daddr,
- .fl4_src = inet->inet_saddr,
- .fl4_tos = RT_CONN_FLAGS(sk),
- .proto = sk->sk_protocol,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = inet->inet_sport,
- .fl_ip_dport = inet->inet_dport };
-
- /* If this fails, retransmit mechanism of transport layer will
- * keep trying until route appears or the connection times
- * itself out.
- */
- security_sk_classify_flow(sk, &fl);
- if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
- goto no_route;
- }
+ /* If this fails, retransmit mechanism of transport layer will
+ * keep trying until route appears or the connection times
+ * itself out.
+ */
+ rt = ip_route_output_ports(sock_net(sk), sk,
+ daddr, inet->inet_saddr,
+ inet->inet_dport,
+ inet->inet_sport,
+ sk->sk_protocol,
+ RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (IS_ERR(rt))
+ goto no_route;
sk_setup_caps(sk, &rt->dst);
}
skb_dst_set_noref(skb, &rt->dst);
@@ -733,6 +727,7 @@ csum_page(struct page *page, int offset, int copy)
}
static inline int ip_ufo_append_data(struct sock *sk,
+ struct sk_buff_head *queue,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen,
@@ -745,7 +740,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
* device, so create one single skb packet containing complete
* udp datagram
*/
- if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
+ if ((skb = skb_peek_tail(queue)) == NULL) {
skb = sock_alloc_send_skb(sk,
hh_len + fragheaderlen + transhdrlen + 20,
(flags & MSG_DONTWAIT), &err);
@@ -767,40 +762,28 @@ static inline int ip_ufo_append_data(struct sock *sk,
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
- sk->sk_sndmsg_off = 0;
/* specify the length of each IP datagram fragment */
skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
- __skb_queue_tail(&sk->sk_write_queue, skb);
+ __skb_queue_tail(queue, skb);
}
return skb_append_datato_frags(sk, skb, getfrag, from,
(length - transhdrlen));
}
-/*
- * ip_append_data() and ip_append_page() can make one large IP datagram
- * from many pieces of data. Each pieces will be holded on the socket
- * until ip_push_pending_frames() is called. Each piece can be a page
- * or non-page data.
- *
- * Not only UDP, other transport protocols - e.g. raw sockets - can use
- * this interface potentially.
- *
- * LATER: length must be adjusted by pad at tail, when it is required.
- */
-int ip_append_data(struct sock *sk,
- int getfrag(void *from, char *to, int offset, int len,
- int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen,
- struct ipcm_cookie *ipc, struct rtable **rtp,
- unsigned int flags)
+static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
+ struct inet_cork *cork,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb;
- struct ip_options *opt = NULL;
+ struct ip_options *opt = cork->opt;
int hh_len;
int exthdrlen;
int mtu;
@@ -809,58 +792,19 @@ int ip_append_data(struct sock *sk,
int offset = 0;
unsigned int maxfraglen, fragheaderlen;
int csummode = CHECKSUM_NONE;
- struct rtable *rt;
+ struct rtable *rt = (struct rtable *)cork->dst;
- if (flags&MSG_PROBE)
- return 0;
-
- if (skb_queue_empty(&sk->sk_write_queue)) {
- /*
- * setup for corking.
- */
- opt = ipc->opt;
- if (opt) {
- if (inet->cork.opt == NULL) {
- inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
- if (unlikely(inet->cork.opt == NULL))
- return -ENOBUFS;
- }
- memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
- inet->cork.flags |= IPCORK_OPT;
- inet->cork.addr = ipc->addr;
- }
- rt = *rtp;
- if (unlikely(!rt))
- return -EFAULT;
- /*
- * We steal reference to this route, caller should not release it
- */
- *rtp = NULL;
- inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
- rt->dst.dev->mtu :
- dst_mtu(rt->dst.path);
- inet->cork.dst = &rt->dst;
- inet->cork.length = 0;
- sk->sk_sndmsg_page = NULL;
- sk->sk_sndmsg_off = 0;
- exthdrlen = rt->dst.header_len;
- length += exthdrlen;
- transhdrlen += exthdrlen;
- } else {
- rt = (struct rtable *)inet->cork.dst;
- if (inet->cork.flags & IPCORK_OPT)
- opt = inet->cork.opt;
+ exthdrlen = transhdrlen ? rt->dst.header_len : 0;
+ length += exthdrlen;
+ transhdrlen += exthdrlen;
+ mtu = cork->fragsize;
- transhdrlen = 0;
- exthdrlen = 0;
- mtu = inet->cork.fragsize;
- }
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
- if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
+ if (cork->length + length > 0xFFFF - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
mtu-exthdrlen);
return -EMSGSIZE;
@@ -876,15 +820,15 @@ int ip_append_data(struct sock *sk,
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
- skb = skb_peek_tail(&sk->sk_write_queue);
+ skb = skb_peek_tail(queue);
- inet->cork.length += length;
+ cork->length += length;
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO)) {
- err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
- fragheaderlen, transhdrlen, mtu,
- flags);
+ err = ip_ufo_append_data(sk, queue, getfrag, from, length,
+ hh_len, fragheaderlen, transhdrlen,
+ mtu, flags);
if (err)
goto error;
return 0;
@@ -961,7 +905,7 @@ alloc_new_skb:
else
/* only the initial fragment is
time stamped */
- ipc->tx_flags = 0;
+ cork->tx_flags = 0;
}
if (skb == NULL)
goto error;
@@ -972,7 +916,7 @@ alloc_new_skb:
skb->ip_summed = csummode;
skb->csum = 0;
skb_reserve(skb, hh_len);
- skb_shinfo(skb)->tx_flags = ipc->tx_flags;
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
/*
* Find where to start putting bytes.
@@ -1009,7 +953,7 @@ alloc_new_skb:
/*
* Put the packet on the pending queue.
*/
- __skb_queue_tail(&sk->sk_write_queue, skb);
+ __skb_queue_tail(queue, skb);
continue;
}
@@ -1029,8 +973,8 @@ alloc_new_skb:
} else {
int i = skb_shinfo(skb)->nr_frags;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
- struct page *page = sk->sk_sndmsg_page;
- int off = sk->sk_sndmsg_off;
+ struct page *page = cork->page;
+ int off = cork->off;
unsigned int left;
if (page && (left = PAGE_SIZE - off) > 0) {
@@ -1042,7 +986,7 @@ alloc_new_skb:
goto error;
}
get_page(page);
- skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
+ skb_fill_page_desc(skb, i, page, off, 0);
frag = &skb_shinfo(skb)->frags[i];
}
} else if (i < MAX_SKB_FRAGS) {
@@ -1053,8 +997,8 @@ alloc_new_skb:
err = -ENOMEM;
goto error;
}
- sk->sk_sndmsg_page = page;
- sk->sk_sndmsg_off = 0;
+ cork->page = page;
+ cork->off = 0;
skb_fill_page_desc(skb, i, page, 0, 0);
frag = &skb_shinfo(skb)->frags[i];
@@ -1066,7 +1010,7 @@ alloc_new_skb:
err = -EFAULT;
goto error;
}
- sk->sk_sndmsg_off += copy;
+ cork->off += copy;
frag->size += copy;
skb->len += copy;
skb->data_len += copy;
@@ -1080,11 +1024,87 @@ alloc_new_skb:
return 0;
error:
- inet->cork.length -= length;
+ cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
return err;
}
+static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
+ struct ipcm_cookie *ipc, struct rtable **rtp)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_options *opt;
+ struct rtable *rt;
+
+ /*
+ * setup for corking.
+ */
+ opt = ipc->opt;
+ if (opt) {
+ if (cork->opt == NULL) {
+ cork->opt = kmalloc(sizeof(struct ip_options) + 40,
+ sk->sk_allocation);
+ if (unlikely(cork->opt == NULL))
+ return -ENOBUFS;
+ }
+ memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
+ cork->flags |= IPCORK_OPT;
+ cork->addr = ipc->addr;
+ }
+ rt = *rtp;
+ if (unlikely(!rt))
+ return -EFAULT;
+ /*
+ * We steal reference to this route, caller should not release it
+ */
+ *rtp = NULL;
+ cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
+ rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+ cork->dst = &rt->dst;
+ cork->length = 0;
+ cork->tx_flags = ipc->tx_flags;
+ cork->page = NULL;
+ cork->off = 0;
+
+ return 0;
+}
+
+/*
+ * ip_append_data() and ip_append_page() can make one large IP datagram
+ * from many pieces of data. Each pieces will be holded on the socket
+ * until ip_push_pending_frames() is called. Each piece can be a page
+ * or non-page data.
+ *
+ * Not only UDP, other transport protocols - e.g. raw sockets - can use
+ * this interface potentially.
+ *
+ * LATER: length must be adjusted by pad at tail, when it is required.
+ */
+int ip_append_data(struct sock *sk,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable **rtp,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int err;
+
+ if (flags&MSG_PROBE)
+ return 0;
+
+ if (skb_queue_empty(&sk->sk_write_queue)) {
+ err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
+ if (err)
+ return err;
+ } else {
+ transhdrlen = 0;
+ }
+
+ return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
+ from, length, transhdrlen, flags);
+}
+
ssize_t ip_append_page(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
@@ -1228,40 +1248,41 @@ error:
return err;
}
-static void ip_cork_release(struct inet_sock *inet)
+static void ip_cork_release(struct inet_cork *cork)
{
- inet->cork.flags &= ~IPCORK_OPT;
- kfree(inet->cork.opt);
- inet->cork.opt = NULL;
- dst_release(inet->cork.dst);
- inet->cork.dst = NULL;
+ cork->flags &= ~IPCORK_OPT;
+ kfree(cork->opt);
+ cork->opt = NULL;
+ dst_release(cork->dst);
+ cork->dst = NULL;
}
/*
* Combined all pending IP fragments on the socket as one IP datagram
* and push them out.
*/
-int ip_push_pending_frames(struct sock *sk)
+struct sk_buff *__ip_make_skb(struct sock *sk,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork)
{
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
struct ip_options *opt = NULL;
- struct rtable *rt = (struct rtable *)inet->cork.dst;
+ struct rtable *rt = (struct rtable *)cork->dst;
struct iphdr *iph;
__be16 df = 0;
__u8 ttl;
- int err = 0;
- if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
+ if ((skb = __skb_dequeue(queue)) == NULL)
goto out;
tail_skb = &(skb_shinfo(skb)->frag_list);
/* move skb->data to ip header from ext header */
if (skb->data < skb_network_header(skb))
__skb_pull(skb, skb_network_offset(skb));
- while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+ while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
__skb_pull(tmp_skb, skb_network_header_len(skb));
*tail_skb = tmp_skb;
tail_skb = &(tmp_skb->next);
@@ -1287,8 +1308,8 @@ int ip_push_pending_frames(struct sock *sk)
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
- if (inet->cork.flags & IPCORK_OPT)
- opt = inet->cork.opt;
+ if (cork->flags & IPCORK_OPT)
+ opt = cork->opt;
if (rt->rt_type == RTN_MULTICAST)
ttl = inet->mc_ttl;
@@ -1300,7 +1321,7 @@ int ip_push_pending_frames(struct sock *sk)
iph->ihl = 5;
if (opt) {
iph->ihl += opt->optlen>>2;
- ip_options_build(skb, opt, inet->cork.addr, rt, 0);
+ ip_options_build(skb, opt, cork->addr, rt, 0);
}
iph->tos = inet->tos;
iph->frag_off = df;
@@ -1316,44 +1337,95 @@ int ip_push_pending_frames(struct sock *sk)
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
* on dst refcount
*/
- inet->cork.dst = NULL;
+ cork->dst = NULL;
skb_dst_set(skb, &rt->dst);
if (iph->protocol == IPPROTO_ICMP)
icmp_out_count(net, ((struct icmphdr *)
skb_transport_header(skb))->type);
- /* Netfilter gets whole the not fragmented skb. */
+ ip_cork_release(cork);
+out:
+ return skb;
+}
+
+int ip_send_skb(struct sk_buff *skb)
+{
+ struct net *net = sock_net(skb->sk);
+ int err;
+
err = ip_local_out(skb);
if (err) {
if (err > 0)
err = net_xmit_errno(err);
if (err)
- goto error;
+ IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
}
-out:
- ip_cork_release(inet);
return err;
+}
-error:
- IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
- goto out;
+int ip_push_pending_frames(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ skb = ip_finish_skb(sk);
+ if (!skb)
+ return 0;
+
+ /* Netfilter gets whole the not fragmented skb. */
+ return ip_send_skb(skb);
}
/*
* Throw away all pending data on the socket.
*/
-void ip_flush_pending_frames(struct sock *sk)
+static void __ip_flush_pending_frames(struct sock *sk,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork)
{
struct sk_buff *skb;
- while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+ while ((skb = __skb_dequeue_tail(queue)) != NULL)
kfree_skb(skb);
- ip_cork_release(inet_sk(sk));
+ ip_cork_release(cork);
+}
+
+void ip_flush_pending_frames(struct sock *sk)
+{
+ __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
}
+struct sk_buff *ip_make_skb(struct sock *sk,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable **rtp,
+ unsigned int flags)
+{
+ struct inet_cork cork = {};
+ struct sk_buff_head queue;
+ int err;
+
+ if (flags & MSG_PROBE)
+ return NULL;
+
+ __skb_queue_head_init(&queue);
+
+ err = ip_setup_cork(sk, &cork, ipc, rtp);
+ if (err)
+ return ERR_PTR(err);
+
+ err = __ip_append_data(sk, &queue, &cork, getfrag,
+ from, length, transhdrlen, flags);
+ if (err) {
+ __ip_flush_pending_frames(sk, &queue, &cork);
+ return ERR_PTR(err);
+ }
+
+ return __ip_make_skb(sk, &queue, &cork);
+}
/*
* Fetch data from kernel space and fill in checksum if needed.
@@ -1402,16 +1474,19 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
}
{
- struct flowi fl = { .oif = arg->bound_dev_if,
- .fl4_dst = daddr,
- .fl4_src = rt->rt_spec_dst,
- .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
- .fl_ip_sport = tcp_hdr(skb)->dest,
- .fl_ip_dport = tcp_hdr(skb)->source,
- .proto = sk->sk_protocol,
- .flags = ip_reply_arg_flowi_flags(arg) };
- security_skb_classify_flow(skb, &fl);
- if (ip_route_output_key(sock_net(sk), &rt, &fl))
+ struct flowi4 fl4 = {
+ .flowi4_oif = arg->bound_dev_if,
+ .daddr = daddr,
+ .saddr = rt->rt_spec_dst,
+ .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
+ .fl4_sport = tcp_hdr(skb)->dest,
+ .fl4_dport = tcp_hdr(skb)->source,
+ .flowi4_proto = sk->sk_protocol,
+ .flowi4_flags = ip_reply_arg_flowi_flags(arg),
+ };
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(sock_net(sk), &fl4);
+ if (IS_ERR(rt))
return;
}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 988f52fba54..bfc17c5914e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -460,19 +460,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
goto tx_error_icmp;
}
- {
- struct flowi fl = {
- .oif = tunnel->parms.link,
- .fl4_dst = dst,
- .fl4_src= tiph->saddr,
- .fl4_tos = RT_TOS(tos),
- .proto = IPPROTO_IPIP
- };
-
- if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error_icmp;
- }
+ rt = ip_route_output_ports(dev_net(dev), NULL,
+ dst, tiph->saddr,
+ 0, 0,
+ IPPROTO_IPIP, RT_TOS(tos),
+ tunnel->parms.link);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
}
tdev = rt->dst.dev;
@@ -583,16 +578,14 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
iph = &tunnel->parms.iph;
if (iph->daddr) {
- struct flowi fl = {
- .oif = tunnel->parms.link,
- .fl4_dst = iph->daddr,
- .fl4_src = iph->saddr,
- .fl4_tos = RT_TOS(iph->tos),
- .proto = IPPROTO_IPIP
- };
- struct rtable *rt;
-
- if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
+ struct rtable *rt = ip_route_output_ports(dev_net(dev), NULL,
+ iph->daddr, iph->saddr,
+ 0, 0,
+ IPPROTO_IPIP,
+ RT_TOS(iph->tos),
+ tunnel->parms.link);
+
+ if (!IS_ERR(rt)) {
tdev = rt->dst.dev;
ip_rt_put(rt);
}
@@ -913,4 +906,4 @@ static void __exit ipip_fini(void)
module_init(ipip_init);
module_exit(ipip_fini);
MODULE_LICENSE("GPL");
-MODULE_ALIAS("tunl0");
+MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3f3a9afd73e..1f62eaeb6de 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -60,6 +60,7 @@
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>
@@ -147,14 +148,15 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
return NULL;
}
-static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
struct mr_table **mrt)
{
struct ipmr_result res;
struct fib_lookup_arg arg = { .result = &res, };
int err;
- err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
+ err = fib_rules_lookup(net->ipv4.mr_rules_ops,
+ flowi4_to_flowi(flp4), 0, &arg);
if (err < 0)
return err;
*mrt = res.mrt;
@@ -282,7 +284,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
return net->ipv4.mrt;
}
-static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
struct mr_table **mrt)
{
*mrt = net->ipv4.mrt;
@@ -434,14 +436,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct net *net = dev_net(dev);
struct mr_table *mrt;
- struct flowi fl = {
- .oif = dev->ifindex,
- .iif = skb->skb_iif,
- .mark = skb->mark,
+ struct flowi4 fl4 = {
+ .flowi4_oif = dev->ifindex,
+ .flowi4_iif = skb->skb_iif,
+ .flowi4_mark = skb->mark,
};
int err;
- err = ipmr_fib_lookup(net, &fl, &mrt);
+ err = ipmr_fib_lookup(net, &fl4, &mrt);
if (err < 0) {
kfree_skb(skb);
return err;
@@ -1434,6 +1436,81 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
}
}
+#ifdef CONFIG_COMPAT
+struct compat_sioc_sg_req {
+ struct in_addr src;
+ struct in_addr grp;
+ compat_ulong_t pktcnt;
+ compat_ulong_t bytecnt;
+ compat_ulong_t wrong_if;
+};
+
+struct compat_sioc_vif_req {
+ vifi_t vifi; /* Which iface */
+ compat_ulong_t icount;
+ compat_ulong_t ocount;
+ compat_ulong_t ibytes;
+ compat_ulong_t obytes;
+};
+
+int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ struct compat_sioc_sg_req sr;
+ struct compat_sioc_vif_req vr;
+ struct vif_device *vif;
+ struct mfc_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (mrt == NULL)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETVIFCNT:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.vifi >= mrt->maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif = &mrt->vif_table[vr.vifi];
+ if (VIF_EXISTS(mrt, vr.vifi)) {
+ vr.icount = vif->pkt_in;
+ vr.ocount = vif->pkt_out;
+ vr.ibytes = vif->bytes_in;
+ vr.obytes = vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
+
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
@@ -1535,26 +1612,20 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
#endif
if (vif->flags & VIFF_TUNNEL) {
- struct flowi fl = {
- .oif = vif->link,
- .fl4_dst = vif->remote,
- .fl4_src = vif->local,
- .fl4_tos = RT_TOS(iph->tos),
- .proto = IPPROTO_IPIP
- };
-
- if (ip_route_output_key(net, &rt, &fl))
+ rt = ip_route_output_ports(net, NULL,
+ vif->remote, vif->local,
+ 0, 0,
+ IPPROTO_IPIP,
+ RT_TOS(iph->tos), vif->link);
+ if (IS_ERR(rt))
goto out_free;
encap = sizeof(struct iphdr);
} else {
- struct flowi fl = {
- .oif = vif->link,
- .fl4_dst = iph->daddr,
- .fl4_tos = RT_TOS(iph->tos),
- .proto = IPPROTO_IPIP
- };
-
- if (ip_route_output_key(net, &rt, &fl))
+ rt = ip_route_output_ports(net, NULL, iph->daddr, 0,
+ 0, 0,
+ IPPROTO_IPIP,
+ RT_TOS(iph->tos), vif->link);
+ if (IS_ERR(rt))
goto out_free;
}
@@ -1717,6 +1788,24 @@ dont_forward:
return 0;
}
+static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct rtable *rt)
+{
+ struct flowi4 fl4 = {
+ .daddr = rt->rt_key_dst,
+ .saddr = rt->rt_key_src,
+ .flowi4_tos = rt->rt_tos,
+ .flowi4_oif = rt->rt_oif,
+ .flowi4_iif = rt->rt_iif,
+ .flowi4_mark = rt->rt_mark,
+ };
+ struct mr_table *mrt;
+ int err;
+
+ err = ipmr_fib_lookup(net, &fl4, &mrt);
+ if (err)
+ return ERR_PTR(err);
+ return mrt;
+}
/*
* Multicast packets for forwarding arrive here
@@ -1729,7 +1818,6 @@ int ip_mr_input(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
struct mr_table *mrt;
- int err;
/* Packet is looped back after forward, it should not be
* forwarded second time, but still can be delivered locally.
@@ -1737,12 +1825,11 @@ int ip_mr_input(struct sk_buff *skb)
if (IPCB(skb)->flags & IPSKB_FORWARDED)
goto dont_forward;
- err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
- if (err < 0) {
+ mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
+ if (IS_ERR(mrt)) {
kfree_skb(skb);
- return err;
+ return PTR_ERR(mrt);
}
-
if (!local) {
if (IPCB(skb)->opt.router_alert) {
if (ip_call_ra_chain(skb))
@@ -1870,9 +1957,9 @@ int pim_rcv_v1(struct sk_buff *skb)
pim = igmp_hdr(skb);
- if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
+ mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
+ if (IS_ERR(mrt))
goto drop;
-
if (!mrt->mroute_do_pim ||
pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
goto drop;
@@ -1902,9 +1989,9 @@ static int pim_rcv(struct sk_buff *skb)
csum_fold(skb_checksum(skb, 0, skb->len, 0))))
goto drop;
- if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
+ mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
+ if (IS_ERR(mrt))
goto drop;
-
if (__pim_rcv(mrt, skb, sizeof(*pim))) {
drop:
kfree_skb(skb);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 994a1f29ebb..f3c0b549b8e 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -16,7 +16,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
struct net *net = dev_net(skb_dst(skb)->dev);
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
- struct flowi fl = {};
+ struct flowi4 fl4 = {};
unsigned long orefdst;
unsigned int hh_len;
unsigned int type;
@@ -31,14 +31,15 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
* packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
*/
if (addr_type == RTN_LOCAL) {
- fl.fl4_dst = iph->daddr;
+ fl4.daddr = iph->daddr;
if (type == RTN_LOCAL)
- fl.fl4_src = iph->saddr;
- fl.fl4_tos = RT_TOS(iph->tos);
- fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
- fl.mark = skb->mark;
- fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
- if (ip_route_output_key(net, &rt, &fl) != 0)
+ fl4.saddr = iph->saddr;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
return -1;
/* Drop old route. */
@@ -47,8 +48,9 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
} else {
/* non-local src, find valid iif to satisfy
* rp-filter when calling ip_route_input. */
- fl.fl4_dst = iph->saddr;
- if (ip_route_output_key(net, &rt, &fl) != 0)
+ fl4.daddr = iph->saddr;
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
return -1;
orefdst = skb->_skb_refdst;
@@ -66,10 +68,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
#ifdef CONFIG_XFRM
if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
- xfrm_decode_session(skb, &fl, AF_INET) == 0) {
+ xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
struct dst_entry *dst = skb_dst(skb);
skb_dst_set(skb, NULL);
- if (xfrm_lookup(net, &dst, &fl, skb->sk, 0))
+ dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
+ if (IS_ERR(dst))
return -1;
skb_dst_set(skb, dst);
}
@@ -102,7 +105,8 @@ int ip_xfrm_me_harder(struct sk_buff *skb)
dst = ((struct xfrm_dst *)dst)->route;
dst_hold(dst);
- if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0)
+ dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
+ if (IS_ERR(dst))
return -1;
skb_dst_drop(skb);
@@ -219,7 +223,11 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
static int nf_ip_route(struct dst_entry **dst, struct flowi *fl)
{
- return ip_route_output_key(&init_net, (struct rtable **)dst, fl);
+ struct rtable *rt = ip_route_output_key(&init_net, &fl->u.ip4);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ *dst = &rt->dst;
+ return 0;
}
static const struct nf_afinfo nf_ip_afinfo = {
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index babd1a2bae5..f926a310075 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -206,8 +206,9 @@ config IP_NF_TARGET_REDIRECT
config NF_NAT_SNMP_BASIC
tristate "Basic SNMP-ALG support"
- depends on NF_NAT
+ depends on NF_CONNTRACK_SNMP && NF_NAT
depends on NETFILTER_ADVANCED
+ default NF_NAT && NF_CONNTRACK_SNMP
---help---
This module implements an Application Layer Gateway (ALG) for
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e855fffaed9..e95054c690c 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(NFPROTO_ARP, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
if (ret != 0)
@@ -1333,6 +1334,7 @@ static int translate_compat_table(const char *name,
duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(NFPROTO_ARP);
+ xt_compat_init_offsets(NFPROTO_ARP, number);
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter0, entry0, total_size) {
ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index b8ddcc480ed..a5e52a9f0a1 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -60,12 +60,12 @@ static int checkentry(const struct xt_tgchk_param *par)
if (mangle->flags & ~ARPT_MANGLE_MASK ||
!(mangle->flags & ARPT_MANGLE_MASK))
- return false;
+ return -EINVAL;
if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
mangle->target != XT_CONTINUE)
- return false;
- return true;
+ return -EINVAL;
+ return 0;
}
static struct xt_target arpt_mangle_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 652efea013d..ef7d7b9680e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(AF_INET, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
if (ret != 0)
@@ -1664,6 +1665,7 @@ translate_compat_table(struct net *net,
duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(AF_INET);
+ xt_compat_init_offsets(AF_INET, number);
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter0, entry0, total_size) {
ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1e26a489765..403ca57f601 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
* that the ->target() function isn't called after ->destroy() */
ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL) {
- pr_info("no conntrack!\n");
- /* FIXME: need to drop invalid ones, since replies
- * to outgoing connections of other nodes will be
- * marked as INVALID */
+ if (ct == NULL)
return NF_DROP;
- }
/* special case: ICMP error handling. conntrack distinguishes between
* error messages (RELATED) and information requests (see below) */
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 72ffc8fda2e..d76d6c9ed94 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf,
}
#endif
- /* MAC logging for input path only. */
- if (in && !out)
+ if (in != NULL)
dump_mac_header(m, loginfo, skb);
dump_packet(m, loginfo, skb, 0);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 294a2a32f29..aef5d1fbe77 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
dev_net(out)->ipv4.iptable_mangle);
/* Reroute for ANY change. */
- if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
+ if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
if (iph->saddr != saddr ||
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 63f60fc5d26..5585980fce2 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -20,6 +20,7 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_acct.h>
+#include <linux/rculist_nulls.h>
struct ct_iter_state {
struct seq_net_private p;
@@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
for (st->bucket = 0;
st->bucket < net->ct.htable_size;
st->bucket++) {
- n = rcu_dereference(net->ct.hash[st->bucket].first);
+ n = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= net->ct.htable_size)
return NULL;
}
- head = rcu_dereference(net->ct.hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
}
return head;
}
@@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ n = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
if (n)
return n;
}
@@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
- head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
}
return head;
}
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 0f23b3f06df..703f366fd23 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb,
/* Try to get same port: if not, try to change it. */
for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
+ int res;
exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
+ res = nf_ct_expect_related(exp);
+ if (res == 0)
break;
- else if (ret != -EBUSY) {
+ else if (res != -EBUSY) {
port = 0;
break;
}
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c04787ce1a7..21bcf471b25 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
manips not an issue. */
if (maniptype == IP_NAT_MANIP_SRC &&
!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
- if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
+ /* try the original tuple first */
+ if (in_range(orig_tuple, range)) {
+ if (!nf_nat_used_tuple(orig_tuple, ct)) {
+ *tuple = *orig_tuple;
+ return;
+ }
+ } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
+ range)) {
pr_debug("get_unique_tuple: Found current src map\n");
if (!nf_nat_used_tuple(tuple, ct))
return;
@@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct,
struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple;
struct nf_conn_nat *nat;
- int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
/* nat helper or nfctnetlink also setup binding */
nat = nfct_nat(ct);
@@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct,
ct->status |= IPS_DST_NAT;
}
- /* Place in source hash if this is the first time. */
- if (have_to_hash) {
+ if (maniptype == IP_NAT_MANIP_SRC) {
unsigned int srchash;
srchash = hash_by_src(net, nf_ct_zone(ct),
@@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct,
/* It's done. */
if (maniptype == IP_NAT_MANIP_DST)
- set_bit(IPS_DST_NAT_DONE_BIT, &ct->status);
+ ct->status |= IPS_DST_NAT_DONE;
else
- set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
+ ct->status |= IPS_SRC_NAT_DONE;
return NF_ACCEPT;
}
@@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
int ret = 0;
spin_lock_bh(&nf_nat_lock);
- if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) {
+ if (rcu_dereference_protected(
+ nf_nat_protos[proto->protonum],
+ lockdep_is_held(&nf_nat_lock)
+ ) != &nf_nat_unknown_protocol) {
ret = -EBUSY;
goto out;
}
@@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
if (nat == NULL || nat->ct == NULL)
return;
- NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK);
+ NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
spin_lock_bh(&nf_nat_lock);
hlist_del_rcu(&nat->bysource);
@@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old)
struct nf_conn_nat *old_nat = old;
struct nf_conn *ct = old_nat->ct;
- if (!ct || !(ct->status & IPS_NAT_DONE_MASK))
+ if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
return;
spin_lock_bh(&nf_nat_lock);
- new_nat->ct = ct;
hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
spin_unlock_bh(&nf_nat_lock);
}
@@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net)
{
/* Leave them the same for the moment. */
net->ipv4.nat_htable_size = net->ct.htable_size;
- net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size,
- &net->ipv4.nat_vmalloced, 0);
+ net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
if (!net->ipv4.nat_bysource)
return -ENOMEM;
return 0;
@@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
{
nf_ct_iterate_cleanup(net, &clean_nat, NULL);
synchronize_rcu();
- nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
- net->ipv4.nat_htable_size);
+ nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
}
static struct pernet_operations nf_nat_net_ops = {
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419d0a5..8812a02078a 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -54,6 +54,7 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void)
{
int ret = 0;
- ret = nf_conntrack_helper_register(&snmp_helper);
- if (ret < 0)
- return ret;
+ BUG_ON(nf_nat_snmp_hook != NULL);
+ rcu_assign_pointer(nf_nat_snmp_hook, help);
+
ret = nf_conntrack_helper_register(&snmp_trap_helper);
if (ret < 0) {
nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void)
static void __exit nf_nat_snmp_basic_fini(void)
{
- nf_conntrack_helper_unregister(&snmp_helper);
+ rcu_assign_pointer(nf_nat_snmp_hook, NULL);
nf_conntrack_helper_unregister(&snmp_trap_helper);
}
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 95481fee8bd..7317bdf1d45 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -31,6 +31,7 @@
#ifdef CONFIG_XFRM
static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
+ struct flowi4 *fl4 = &fl->u.ip4;
const struct nf_conn *ct;
const struct nf_conntrack_tuple *t;
enum ip_conntrack_info ctinfo;
@@ -49,25 +50,25 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
statusbit = IPS_SRC_NAT;
if (ct->status & statusbit) {
- fl->fl4_dst = t->dst.u3.ip;
+ fl4->daddr = t->dst.u3.ip;
if (t->dst.protonum == IPPROTO_TCP ||
t->dst.protonum == IPPROTO_UDP ||
t->dst.protonum == IPPROTO_UDPLITE ||
t->dst.protonum == IPPROTO_DCCP ||
t->dst.protonum == IPPROTO_SCTP)
- fl->fl_ip_dport = t->dst.u.tcp.port;
+ fl4->fl4_dport = t->dst.u.tcp.port;
}
statusbit ^= IPS_NAT_MASK;
if (ct->status & statusbit) {
- fl->fl4_src = t->src.u3.ip;
+ fl4->saddr = t->src.u3.ip;
if (t->dst.protonum == IPPROTO_TCP ||
t->dst.protonum == IPPROTO_UDP ||
t->dst.protonum == IPPROTO_UDPLITE ||
t->dst.protonum == IPPROTO_DCCP ||
t->dst.protonum == IPPROTO_SCTP)
- fl->fl_ip_sport = t->src.u.tcp.port;
+ fl4->fl4_sport = t->src.u.tcp.port;
}
}
#endif
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a3d5ab786e8..e837ffd3edc 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -76,6 +76,7 @@
#include <linux/seq_file.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
static struct raw_hashinfo raw_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
@@ -401,7 +402,7 @@ error:
return err;
}
-static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
+static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
{
struct iovec *iov;
u8 __user *type = NULL;
@@ -417,7 +418,7 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
if (!iov)
continue;
- switch (fl->proto) {
+ switch (fl4->flowi4_proto) {
case IPPROTO_ICMP:
/* check if one-byte field is readable or not. */
if (iov->iov_base && iov->iov_len < 1)
@@ -432,8 +433,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
code = iov->iov_base;
if (type && code) {
- if (get_user(fl->fl_icmp_type, type) ||
- get_user(fl->fl_icmp_code, code))
+ if (get_user(fl4->fl4_icmp_type, type) ||
+ get_user(fl4->fl4_icmp_code, code))
return -EFAULT;
probed = 1;
}
@@ -547,25 +548,30 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
}
{
- struct flowi fl = { .oif = ipc.oif,
- .mark = sk->sk_mark,
- .fl4_dst = daddr,
- .fl4_src = saddr,
- .fl4_tos = tos,
- .proto = inet->hdrincl ? IPPROTO_RAW :
- sk->sk_protocol,
- };
+ struct flowi4 fl4 = {
+ .flowi4_oif = ipc.oif,
+ .flowi4_mark = sk->sk_mark,
+ .daddr = daddr,
+ .saddr = saddr,
+ .flowi4_tos = tos,
+ .flowi4_proto = (inet->hdrincl ?
+ IPPROTO_RAW :
+ sk->sk_protocol),
+ .flowi4_flags = FLOWI_FLAG_CAN_SLEEP,
+ };
if (!inet->hdrincl) {
- err = raw_probe_proto_opt(&fl, msg);
+ err = raw_probe_proto_opt(&fl4, msg);
if (err)
goto done;
}
- security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1);
+ security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto done;
+ }
}
- if (err)
- goto done;
err = -EACCES;
if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
@@ -838,6 +844,23 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
}
+#ifdef CONFIG_COMPAT
+static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ case SIOCINQ:
+ return -ENOIOCTLCMD;
+ default:
+#ifdef CONFIG_IP_MROUTE
+ return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+#endif
+
struct proto raw_prot = {
.name = "RAW",
.owner = THIS_MODULE,
@@ -860,6 +883,7 @@ struct proto raw_prot = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_raw_setsockopt,
.compat_getsockopt = compat_raw_getsockopt,
+ .compat_ioctl = compat_raw_ioctl,
#endif
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 351dc4e8524..209989cf7d1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,8 +109,8 @@
#include <linux/sysctl.h>
#endif
-#define RT_FL_TOS(oldflp) \
- ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
+#define RT_FL_TOS(oldflp4) \
+ ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
#define IP_MAX_MTU 0xFFF0
@@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
static int rt_chain_length_max __read_mostly = 20;
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
-
/*
* Interface to generic destination cache.
*/
@@ -152,6 +149,41 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
{
}
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ struct inet_peer *peer;
+ u32 *p = NULL;
+
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+
+ peer = rt->peer;
+ if (peer) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ p = peer->metrics;
+ if (inet_metrics_new(peer))
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ } else {
+ if (rt->fi) {
+ fib_info_put(rt->fi);
+ rt->fi = NULL;
+ }
+ }
+ }
+ return p;
+}
+
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
@@ -159,6 +191,7 @@ static struct dst_ops ipv4_dst_ops = {
.check = ipv4_dst_check,
.default_advmss = ipv4_default_advmss,
.default_mtu = ipv4_default_mtu,
+ .cow_metrics = ipv4_cow_metrics,
.destroy = ipv4_dst_destroy,
.ifdown = ipv4_dst_ifdown,
.negative_advice = ipv4_negative_advice,
@@ -391,7 +424,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
dst_metric(&r->dst, RTAX_WINDOW),
(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
dst_metric(&r->dst, RTAX_RTTVAR)),
- r->fl.fl4_tos,
+ r->rt_tos,
r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
r->dst.hh ? (r->dst.hh->hh_output ==
dev_queue_xmit) : 0,
@@ -514,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = {
.release = seq_release,
};
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
struct ip_rt_acct *dst, *src;
@@ -567,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
if (!pde)
goto err2;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
if (!pde)
goto err3;
#endif
return 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
@@ -588,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
remove_proc_entry("rt_cache", net->proc_net_stat);
remove_proc_entry("rt_cache", net->proc_net);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
remove_proc_entry("rt_acct", net->proc_net);
#endif
}
@@ -632,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth)
static inline int rt_valuable(struct rtable *rth)
{
return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- rth->dst.expires;
+ (rth->peer && rth->peer->pmtu_expires);
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -643,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
if (atomic_read(&rth->dst.__refcnt))
goto out;
- ret = 1;
- if (rth->dst.expires &&
- time_after_eq(jiffies, rth->dst.expires))
- goto out;
-
age = jiffies - rth->dst.lastuse;
- ret = 0;
if ((age <= tmo1 && !rt_fast_clean(rth)) ||
(age <= tmo2 && rt_valuable(rth)))
goto out;
@@ -684,22 +711,22 @@ static inline bool rt_caching(const struct net *net)
net->ipv4.sysctl_rt_cache_rebuild_count;
}
-static inline bool compare_hash_inputs(const struct flowi *fl1,
- const struct flowi *fl2)
+static inline bool compare_hash_inputs(const struct rtable *rt1,
+ const struct rtable *rt2)
{
- return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
- ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
- (fl1->iif ^ fl2->iif)) == 0);
+ return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
+ ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
+ (rt1->rt_iif ^ rt2->rt_iif)) == 0);
}
-static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
+static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
- return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
- ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
- (fl1->mark ^ fl2->mark) |
- (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
- (fl1->oif ^ fl2->oif) |
- (fl1->iif ^ fl2->iif)) == 0;
+ return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
+ ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
+ (rt1->rt_mark ^ rt2->rt_mark) |
+ (rt1->rt_tos ^ rt2->rt_tos) |
+ (rt1->rt_oif ^ rt2->rt_oif) |
+ (rt1->rt_iif ^ rt2->rt_iif)) == 0;
}
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
@@ -786,104 +813,13 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
const struct rtable *aux = head;
while (aux != rth) {
- if (compare_hash_inputs(&aux->fl, &rth->fl))
+ if (compare_hash_inputs(aux, rth))
return 0;
aux = rcu_dereference_protected(aux->dst.rt_next, 1);
}
return ONE;
}
-static void rt_check_expire(void)
-{
- static unsigned int rover;
- unsigned int i = rover, goal;
- struct rtable *rth;
- struct rtable __rcu **rthp;
- unsigned long samples = 0;
- unsigned long sum = 0, sum2 = 0;
- unsigned long delta;
- u64 mult;
-
- delta = jiffies - expires_ljiffies;
- expires_ljiffies = jiffies;
- mult = ((u64)delta) << rt_hash_log;
- if (ip_rt_gc_timeout > 1)
- do_div(mult, ip_rt_gc_timeout);
- goal = (unsigned int)mult;
- if (goal > rt_hash_mask)
- goal = rt_hash_mask + 1;
- for (; goal > 0; goal--) {
- unsigned long tmo = ip_rt_gc_timeout;
- unsigned long length;
-
- i = (i + 1) & rt_hash_mask;
- rthp = &rt_hash_table[i].chain;
-
- if (need_resched())
- cond_resched();
-
- samples++;
-
- if (rcu_dereference_raw(*rthp) == NULL)
- continue;
- length = 0;
- spin_lock_bh(rt_hash_lock_addr(i));
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
- prefetch(rth->dst.rt_next);
- if (rt_is_expired(rth)) {
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- continue;
- }
- if (rth->dst.expires) {
- /* Entry is expired even if it is in use */
- if (time_before_eq(jiffies, rth->dst.expires)) {
-nofree:
- tmo >>= 1;
- rthp = &rth->dst.rt_next;
- /*
- * We only count entries on
- * a chain with equal hash inputs once
- * so that entries for different QOS
- * levels, and other non-hash input
- * attributes don't unfairly skew
- * the length computation
- */
- length += has_noalias(rt_hash_table[i].chain, rth);
- continue;
- }
- } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
- goto nofree;
-
- /* Cleanup aged off entries. */
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- }
- spin_unlock_bh(rt_hash_lock_addr(i));
- sum += length;
- sum2 += length*length;
- }
- if (samples) {
- unsigned long avg = sum / samples;
- unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
- rt_chain_length_max = max_t(unsigned long,
- ip_rt_gc_elasticity,
- (avg + 4*sd) >> FRACT_BITS);
- }
- rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
- rt_check_expire();
- schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
/*
* Pertubation of rt_genid by a small quantity [1..256]
* Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -1078,8 +1014,8 @@ static int slow_chain_length(const struct rtable *head)
return length >> FRACT_BITS;
}
-static int rt_intern_hash(unsigned hash, struct rtable *rt,
- struct rtable **rp, struct sk_buff *skb, int ifindex)
+static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
+ struct sk_buff *skb, int ifindex)
{
struct rtable *rth, *cand;
struct rtable __rcu **rthp, **candp;
@@ -1120,7 +1056,7 @@ restart:
printk(KERN_WARNING
"Neighbour table failure & not caching routes.\n");
ip_rt_put(rt);
- return err;
+ return ERR_PTR(err);
}
}
@@ -1137,7 +1073,7 @@ restart:
rt_free(rth);
continue;
}
- if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
+ if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
/* Put it first */
*rthp = rth->dst.rt_next;
/*
@@ -1157,11 +1093,9 @@ restart:
spin_unlock_bh(rt_hash_lock_addr(hash));
rt_drop(rt);
- if (rp)
- *rp = rth;
- else
+ if (skb)
skb_dst_set(skb, &rth->dst);
- return 0;
+ return rth;
}
if (!atomic_read(&rth->dst.__refcnt)) {
@@ -1202,7 +1136,7 @@ restart:
rt_emergency_hash_rebuild(net);
spin_unlock_bh(rt_hash_lock_addr(hash));
- hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
+ hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
ifindex, rt_genid(net));
goto restart;
}
@@ -1218,7 +1152,7 @@ restart:
if (err != -ENOBUFS) {
rt_drop(rt);
- return err;
+ return ERR_PTR(err);
}
/* Neighbour tables are full and nothing
@@ -1239,7 +1173,7 @@ restart:
if (net_ratelimit())
printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
rt_drop(rt);
- return -ENOBUFS;
+ return ERR_PTR(-ENOBUFS);
}
}
@@ -1265,11 +1199,16 @@ restart:
spin_unlock_bh(rt_hash_lock_addr(hash));
skip_hashing:
- if (rp)
- *rp = rt;
- else
+ if (skb)
skb_dst_set(skb, &rt->dst);
- return 0;
+ return rt;
+}
+
+static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
+
+static u32 rt_peer_genid(void)
+{
+ return atomic_read(&__rt_peer_genid);
}
void rt_bind_peer(struct rtable *rt, int create)
@@ -1280,6 +1219,8 @@ void rt_bind_peer(struct rtable *rt, int create)
if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
inet_putpeer(peer);
+ else
+ rt->rt_peer_genid = rt_peer_genid();
}
/*
@@ -1349,13 +1290,8 @@ static void rt_del(unsigned hash, struct rtable *rt)
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
__be32 saddr, struct net_device *dev)
{
- int i, k;
struct in_device *in_dev = __in_dev_get_rcu(dev);
- struct rtable *rth;
- struct rtable __rcu **rthp;
- __be32 skeys[2] = { saddr, 0 };
- int ikeys[2] = { dev->ifindex, 0 };
- struct netevent_redirect netevent;
+ struct inet_peer *peer;
struct net *net;
if (!in_dev)
@@ -1367,9 +1303,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
ipv4_is_zeronet(new_gw))
goto reject_redirect;
- if (!rt_caching(net))
- goto reject_redirect;
-
if (!IN_DEV_SHARED_MEDIA(in_dev)) {
if (!inet_addr_onlink(in_dev, new_gw, old_gw))
goto reject_redirect;
@@ -1380,91 +1313,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
goto reject_redirect;
}
- for (i = 0; i < 2; i++) {
- for (k = 0; k < 2; k++) {
- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
- rt_genid(net));
-
- rthp = &rt_hash_table[hash].chain;
-
- while ((rth = rcu_dereference(*rthp)) != NULL) {
- struct rtable *rt;
-
- if (rth->fl.fl4_dst != daddr ||
- rth->fl.fl4_src != skeys[i] ||
- rth->fl.oif != ikeys[k] ||
- rt_is_input_route(rth) ||
- rt_is_expired(rth) ||
- !net_eq(dev_net(rth->dst.dev), net)) {
- rthp = &rth->dst.rt_next;
- continue;
- }
-
- if (rth->rt_dst != daddr ||
- rth->rt_src != saddr ||
- rth->dst.error ||
- rth->rt_gateway != old_gw ||
- rth->dst.dev != dev)
- break;
-
- dst_hold(&rth->dst);
-
- rt = dst_alloc(&ipv4_dst_ops);
- if (rt == NULL) {
- ip_rt_put(rth);
- return;
- }
-
- /* Copy all the information. */
- *rt = *rth;
- rt->dst.__use = 1;
- atomic_set(&rt->dst.__refcnt, 1);
- rt->dst.child = NULL;
- if (rt->dst.dev)
- dev_hold(rt->dst.dev);
- rt->dst.obsolete = -1;
- rt->dst.lastuse = jiffies;
- rt->dst.path = &rt->dst;
- rt->dst.neighbour = NULL;
- rt->dst.hh = NULL;
-#ifdef CONFIG_XFRM
- rt->dst.xfrm = NULL;
-#endif
- rt->rt_genid = rt_genid(net);
- rt->rt_flags |= RTCF_REDIRECTED;
-
- /* Gateway is different ... */
- rt->rt_gateway = new_gw;
-
- /* Redirect received -> path was valid */
- dst_confirm(&rth->dst);
-
- if (rt->peer)
- atomic_inc(&rt->peer->refcnt);
-
- if (arp_bind_neighbour(&rt->dst) ||
- !(rt->dst.neighbour->nud_state &
- NUD_VALID)) {
- if (rt->dst.neighbour)
- neigh_event_send(rt->dst.neighbour, NULL);
- ip_rt_put(rth);
- rt_drop(rt);
- goto do_next;
- }
+ peer = inet_getpeer_v4(daddr, 1);
+ if (peer) {
+ peer->redirect_learned.a4 = new_gw;
- netevent.old = &rth->dst;
- netevent.new = &rt->dst;
- call_netevent_notifiers(NETEVENT_REDIRECT,
- &netevent);
+ inet_putpeer(peer);
- rt_del(hash, rth);
- if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
- ip_rt_put(rt);
- goto do_next;
- }
- do_next:
- ;
- }
+ atomic_inc(&__rt_peer_genid);
}
return;
@@ -1488,18 +1343,24 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
if (dst->obsolete > 0) {
ip_rt_put(rt);
ret = NULL;
- } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
- (rt->dst.expires &&
- time_after_eq(jiffies, rt->dst.expires))) {
- unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
- rt->fl.oif,
+ } else if (rt->rt_flags & RTCF_REDIRECTED) {
+ unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
+ rt->rt_oif,
rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
- &rt->rt_dst, rt->fl.fl4_tos);
+ &rt->rt_dst, rt->rt_tos);
#endif
rt_del(hash, rt);
ret = NULL;
+ } else if (rt->peer &&
+ rt->peer->pmtu_expires &&
+ time_after_eq(jiffies, rt->peer->pmtu_expires)) {
+ unsigned long orig = rt->peer->pmtu_expires;
+
+ if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+ dst_metric_set(dst, RTAX_MTU,
+ rt->peer->pmtu_orig);
}
}
return ret;
@@ -1525,6 +1386,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
struct in_device *in_dev;
+ struct inet_peer *peer;
int log_martians;
rcu_read_lock();
@@ -1536,33 +1398,41 @@ void ip_rt_send_redirect(struct sk_buff *skb)
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
rcu_read_unlock();
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ peer = rt->peer;
+ if (!peer) {
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+ return;
+ }
+
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
*/
- if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
- rt->dst.rate_tokens = 0;
+ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
+ peer->rate_tokens = 0;
/* Too many ignored redirects; do not send anything
* set dst.rate_last to the last seen redirected packet.
*/
- if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
- rt->dst.rate_last = jiffies;
+ if (peer->rate_tokens >= ip_rt_redirect_number) {
+ peer->rate_last = jiffies;
return;
}
/* Check for load limit; set rate_last to the latest sent
* redirect.
*/
- if (rt->dst.rate_tokens == 0 ||
+ if (peer->rate_tokens == 0 ||
time_after(jiffies,
- (rt->dst.rate_last +
- (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
+ (peer->rate_last +
+ (ip_rt_redirect_load << peer->rate_tokens)))) {
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
- rt->dst.rate_last = jiffies;
- ++rt->dst.rate_tokens;
+ peer->rate_last = jiffies;
+ ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (log_martians &&
- rt->dst.rate_tokens == ip_rt_redirect_number &&
+ peer->rate_tokens == ip_rt_redirect_number &&
net_ratelimit())
printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
&rt->rt_src, rt->rt_iif,
@@ -1574,7 +1444,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
static int ip_error(struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
+ struct inet_peer *peer;
unsigned long now;
+ bool send;
int code;
switch (rt->dst.error) {
@@ -1594,15 +1466,24 @@ static int ip_error(struct sk_buff *skb)
break;
}
- now = jiffies;
- rt->dst.rate_tokens += now - rt->dst.rate_last;
- if (rt->dst.rate_tokens > ip_rt_error_burst)
- rt->dst.rate_tokens = ip_rt_error_burst;
- rt->dst.rate_last = now;
- if (rt->dst.rate_tokens >= ip_rt_error_cost) {
- rt->dst.rate_tokens -= ip_rt_error_cost;
- icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ peer = rt->peer;
+
+ send = true;
+ if (peer) {
+ now = jiffies;
+ peer->rate_tokens += now - peer->rate_last;
+ if (peer->rate_tokens > ip_rt_error_burst)
+ peer->rate_tokens = ip_rt_error_burst;
+ peer->rate_last = now;
+ if (peer->rate_tokens >= ip_rt_error_cost)
+ peer->rate_tokens -= ip_rt_error_cost;
+ else
+ send = false;
}
+ if (send)
+ icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
out: kfree_skb(skb);
return 0;
@@ -1630,88 +1511,142 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
unsigned short new_mtu,
struct net_device *dev)
{
- int i, k;
unsigned short old_mtu = ntohs(iph->tot_len);
- struct rtable *rth;
- int ikeys[2] = { dev->ifindex, 0 };
- __be32 skeys[2] = { iph->saddr, 0, };
- __be32 daddr = iph->daddr;
unsigned short est_mtu = 0;
+ struct inet_peer *peer;
- for (k = 0; k < 2; k++) {
- for (i = 0; i < 2; i++) {
- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
- rt_genid(net));
-
- rcu_read_lock();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->dst.rt_next)) {
- unsigned short mtu = new_mtu;
-
- if (rth->fl.fl4_dst != daddr ||
- rth->fl.fl4_src != skeys[i] ||
- rth->rt_dst != daddr ||
- rth->rt_src != iph->saddr ||
- rth->fl.oif != ikeys[k] ||
- rt_is_input_route(rth) ||
- dst_metric_locked(&rth->dst, RTAX_MTU) ||
- !net_eq(dev_net(rth->dst.dev), net) ||
- rt_is_expired(rth))
- continue;
+ peer = inet_getpeer_v4(iph->daddr, 1);
+ if (peer) {
+ unsigned short mtu = new_mtu;
- if (new_mtu < 68 || new_mtu >= old_mtu) {
+ if (new_mtu < 68 || new_mtu >= old_mtu) {
+ /* BSD 4.2 derived systems incorrectly adjust
+ * tot_len by the IP header length, and report
+ * a zero MTU in the ICMP message.
+ */
+ if (mtu == 0 &&
+ old_mtu >= 68 + (iph->ihl << 2))
+ old_mtu -= iph->ihl << 2;
+ mtu = guess_mtu(old_mtu);
+ }
- /* BSD 4.2 compatibility hack :-( */
- if (mtu == 0 &&
- old_mtu >= dst_mtu(&rth->dst) &&
- old_mtu >= 68 + (iph->ihl << 2))
- old_mtu -= iph->ihl << 2;
+ if (mtu < ip_rt_min_pmtu)
+ mtu = ip_rt_min_pmtu;
+ if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+ unsigned long pmtu_expires;
- mtu = guess_mtu(old_mtu);
- }
- if (mtu <= dst_mtu(&rth->dst)) {
- if (mtu < dst_mtu(&rth->dst)) {
- dst_confirm(&rth->dst);
- if (mtu < ip_rt_min_pmtu) {
- u32 lock = dst_metric(&rth->dst,
- RTAX_LOCK);
- mtu = ip_rt_min_pmtu;
- lock |= (1 << RTAX_MTU);
- dst_metric_set(&rth->dst, RTAX_LOCK,
- lock);
- }
- dst_metric_set(&rth->dst, RTAX_MTU, mtu);
- dst_set_expires(&rth->dst,
- ip_rt_mtu_expires);
- }
- est_mtu = mtu;
- }
- }
- rcu_read_unlock();
+ pmtu_expires = jiffies + ip_rt_mtu_expires;
+ if (!pmtu_expires)
+ pmtu_expires = 1UL;
+
+ est_mtu = mtu;
+ peer->pmtu_learned = mtu;
+ peer->pmtu_expires = pmtu_expires;
}
+
+ inet_putpeer(peer);
+
+ atomic_inc(&__rt_peer_genid);
}
return est_mtu ? : new_mtu;
}
+static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
+{
+ unsigned long expires = peer->pmtu_expires;
+
+ if (time_before(jiffies, expires)) {
+ u32 orig_dst_mtu = dst_mtu(dst);
+ if (peer->pmtu_learned < orig_dst_mtu) {
+ if (!peer->pmtu_orig)
+ peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
+ dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
+ }
+ } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
+ dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
+}
+
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
- if (dst_mtu(dst) > mtu && mtu >= 68 &&
- !(dst_metric_locked(dst, RTAX_MTU))) {
- if (mtu < ip_rt_min_pmtu) {
- u32 lock = dst_metric(dst, RTAX_LOCK);
+ struct rtable *rt = (struct rtable *) dst;
+ struct inet_peer *peer;
+
+ dst_confirm(dst);
+
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ peer = rt->peer;
+ if (peer) {
+ if (mtu < ip_rt_min_pmtu)
mtu = ip_rt_min_pmtu;
- dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
+ if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+ unsigned long pmtu_expires;
+
+ pmtu_expires = jiffies + ip_rt_mtu_expires;
+ if (!pmtu_expires)
+ pmtu_expires = 1UL;
+
+ peer->pmtu_learned = mtu;
+ peer->pmtu_expires = pmtu_expires;
+
+ atomic_inc(&__rt_peer_genid);
+ rt->rt_peer_genid = rt_peer_genid();
}
- dst_metric_set(dst, RTAX_MTU, mtu);
- dst_set_expires(dst, ip_rt_mtu_expires);
- call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
+ check_peer_pmtu(dst, peer);
+
+ inet_putpeer(peer);
+ }
+}
+
+static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ __be32 orig_gw = rt->rt_gateway;
+
+ dst_confirm(&rt->dst);
+
+ neigh_release(rt->dst.neighbour);
+ rt->dst.neighbour = NULL;
+
+ rt->rt_gateway = peer->redirect_learned.a4;
+ if (arp_bind_neighbour(&rt->dst) ||
+ !(rt->dst.neighbour->nud_state & NUD_VALID)) {
+ if (rt->dst.neighbour)
+ neigh_event_send(rt->dst.neighbour, NULL);
+ rt->rt_gateway = orig_gw;
+ return -EAGAIN;
+ } else {
+ rt->rt_flags |= RTCF_REDIRECTED;
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
+ rt->dst.neighbour);
}
+ return 0;
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
- if (rt_is_expired((struct rtable *)dst))
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (rt_is_expired(rt))
return NULL;
+ if (rt->rt_peer_genid != rt_peer_genid()) {
+ struct inet_peer *peer;
+
+ if (!rt->peer)
+ rt_bind_peer(rt, 0);
+
+ peer = rt->peer;
+ if (peer && peer->pmtu_expires)
+ check_peer_pmtu(dst, peer);
+
+ if (peer && peer->redirect_learned.a4 &&
+ peer->redirect_learned.a4 != rt->rt_gateway) {
+ if (check_peer_redir(dst, peer))
+ return NULL;
+ }
+
+ rt->rt_peer_genid = rt_peer_genid();
+ }
return dst;
}
@@ -1720,6 +1655,10 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
struct rtable *rt = (struct rtable *) dst;
struct inet_peer *peer = rt->peer;
+ if (rt->fi) {
+ fib_info_put(rt->fi);
+ rt->fi = NULL;
+ }
if (peer) {
rt->peer = NULL;
inet_putpeer(peer);
@@ -1734,8 +1673,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
rt = skb_rtable(skb);
- if (rt)
- dst_set_expires(&rt->dst, 0);
+ if (rt &&
+ rt->peer &&
+ rt->peer->pmtu_expires) {
+ unsigned long orig = rt->peer->pmtu_expires;
+
+ if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+ dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
+ }
}
static int ip_rt_bug(struct sk_buff *skb)
@@ -1764,8 +1709,17 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
if (rt_is_output_route(rt))
src = rt->rt_src;
else {
+ struct flowi4 fl4 = {
+ .daddr = rt->rt_key_dst,
+ .saddr = rt->rt_key_src,
+ .flowi4_tos = rt->rt_tos,
+ .flowi4_oif = rt->rt_oif,
+ .flowi4_iif = rt->rt_iif,
+ .flowi4_mark = rt->rt_mark,
+ };
+
rcu_read_lock();
- if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
+ if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
src = FIB_RES_PREFSRC(res);
else
src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
@@ -1775,7 +1729,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
memcpy(addr, &src, 4);
}
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
if (!(rt->dst.tclassid & 0xFFFF))
@@ -1815,17 +1769,54 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
return mtu;
}
-static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
+static void rt_init_metrics(struct rtable *rt, const struct flowi4 *oldflp4,
+ struct fib_info *fi)
+{
+ struct inet_peer *peer;
+ int create = 0;
+
+ /* If a peer entry exists for this destination, we must hook
+ * it up in order to get at cached metrics.
+ */
+ if (oldflp4 && (oldflp4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
+ create = 1;
+
+ rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
+ if (peer) {
+ rt->rt_peer_genid = rt_peer_genid();
+ if (inet_metrics_new(peer))
+ memcpy(peer->metrics, fi->fib_metrics,
+ sizeof(u32) * RTAX_MAX);
+ dst_init_metrics(&rt->dst, peer->metrics, false);
+
+ if (peer->pmtu_expires)
+ check_peer_pmtu(&rt->dst, peer);
+ if (peer->redirect_learned.a4 &&
+ peer->redirect_learned.a4 != rt->rt_gateway) {
+ rt->rt_gateway = peer->redirect_learned.a4;
+ rt->rt_flags |= RTCF_REDIRECTED;
+ }
+ } else {
+ if (fi->fib_metrics != (u32 *) dst_default_metrics) {
+ rt->fi = fi;
+ atomic_inc(&fi->fib_clntref);
+ }
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+ }
+}
+
+static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *oldflp4,
+ const struct fib_result *res,
+ struct fib_info *fi, u16 type, u32 itag)
{
struct dst_entry *dst = &rt->dst;
- struct fib_info *fi = res->fi;
if (fi) {
if (FIB_RES_GW(*res) &&
FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = FIB_RES_GW(*res);
- dst_import_metrics(dst, fi->fib_metrics);
-#ifdef CONFIG_NET_CLS_ROUTE
+ rt_init_metrics(rt, oldflp4, fi);
+#ifdef CONFIG_IP_ROUTE_CLASSID
dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
}
@@ -1835,13 +1826,26 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
set_class_tag(rt, fib_rules_tclass(res));
#endif
set_class_tag(rt, itag);
#endif
- rt->rt_type = res->type;
+ rt->rt_type = type;
+}
+
+static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm)
+{
+ struct rtable *rt = dst_alloc(&ipv4_dst_ops, 1);
+ if (rt) {
+ rt->dst.obsolete = -1;
+
+ rt->dst.flags = DST_HOST |
+ (nopolicy ? DST_NOPOLICY : 0) |
+ (noxfrm ? DST_NOXFRM : 0);
+ }
+ return rt;
}
/* called in rcu_read_lock() section */
@@ -1874,31 +1878,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (err < 0)
goto e_err;
}
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
rth->dst.output = ip_rt_bug;
- rth->dst.obsolete = -1;
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
- rth->fl.fl4_dst = daddr;
+ rth->rt_key_dst = daddr;
rth->rt_dst = daddr;
- rth->fl.fl4_tos = tos;
- rth->fl.mark = skb->mark;
- rth->fl.fl4_src = saddr;
+ rth->rt_tos = tos;
+ rth->rt_mark = skb->mark;
+ rth->rt_key_src = saddr;
rth->rt_src = saddr;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
- rth->rt_iif =
- rth->fl.iif = dev->ifindex;
+ rth->rt_iif = dev->ifindex;
rth->dst.dev = init_net.loopback_dev;
dev_hold(rth->dst.dev);
- rth->fl.oif = 0;
+ rth->rt_oif = 0;
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
rth->rt_genid = rt_genid(dev_net(dev));
@@ -1916,7 +1914,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
RT_CACHE_STAT_INC(in_slow_mc);
hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
- return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
+ rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
+ err = 0;
+ if (IS_ERR(rth))
+ err = PTR_ERR(rth);
e_nobufs:
return -ENOBUFS;
@@ -1959,7 +1960,7 @@ static void ip_handle_martian_source(struct net_device *dev,
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
- struct fib_result *res,
+ const struct fib_result *res,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos,
struct rtable **result)
@@ -2013,39 +2014,31 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
-
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(out_dev, NOXFRM));
if (!rth) {
err = -ENOBUFS;
goto cleanup;
}
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
- if (IN_DEV_CONF_GET(out_dev, NOXFRM))
- rth->dst.flags |= DST_NOXFRM;
- rth->fl.fl4_dst = daddr;
+ rth->rt_key_dst = daddr;
rth->rt_dst = daddr;
- rth->fl.fl4_tos = tos;
- rth->fl.mark = skb->mark;
- rth->fl.fl4_src = saddr;
+ rth->rt_tos = tos;
+ rth->rt_mark = skb->mark;
+ rth->rt_key_src = saddr;
rth->rt_src = saddr;
rth->rt_gateway = daddr;
- rth->rt_iif =
- rth->fl.iif = in_dev->dev->ifindex;
+ rth->rt_iif = in_dev->dev->ifindex;
rth->dst.dev = (out_dev)->dev;
dev_hold(rth->dst.dev);
- rth->fl.oif = 0;
+ rth->rt_oif = 0;
rth->rt_spec_dst= spec_dst;
- rth->dst.obsolete = -1;
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
- rt_set_nexthop(rth, res, itag);
+ rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
rth->rt_flags = flags;
@@ -2057,7 +2050,7 @@ static int __mkroute_input(struct sk_buff *skb,
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
- const struct flowi *fl,
+ const struct flowi4 *fl4,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
@@ -2066,8 +2059,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
unsigned hash;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
- fib_select_multipath(fl, res);
+ if (res->fi && res->fi->fib_nhs > 1)
+ fib_select_multipath(res);
#endif
/* create a routing cache entry */
@@ -2076,9 +2069,12 @@ static int ip_mkroute_input(struct sk_buff *skb,
return err;
/* put it into the cache */
- hash = rt_hash(daddr, saddr, fl->iif,
+ hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
rt_genid(dev_net(rth->dst.dev)));
- return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
+ rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
+ if (IS_ERR(rth))
+ return PTR_ERR(rth);
+ return 0;
}
/*
@@ -2097,12 +2093,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
- struct flowi fl = { .fl4_dst = daddr,
- .fl4_src = saddr,
- .fl4_tos = tos,
- .fl4_scope = RT_SCOPE_UNIVERSE,
- .mark = skb->mark,
- .iif = dev->ifindex };
+ struct flowi4 fl4;
unsigned flags = 0;
u32 itag = 0;
struct rtable * rth;
@@ -2139,7 +2130,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
/*
* Now we are ready to route packet.
*/
- err = fib_lookup(net, &fl, &res);
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = dev->ifindex;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_tos = tos;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+ err = fib_lookup(net, &fl4, &res);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
goto e_hostunreach;
@@ -2168,7 +2166,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res.type != RTN_UNICAST)
goto martian_destination;
- err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
+ err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out: return err;
brd_input:
@@ -2190,29 +2188,23 @@ brd_input:
RT_CACHE_STAT_INC(in_brd);
local_input:
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
rth->dst.output= ip_rt_bug;
- rth->dst.obsolete = -1;
rth->rt_genid = rt_genid(net);
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
- rth->fl.fl4_dst = daddr;
+ rth->rt_key_dst = daddr;
rth->rt_dst = daddr;
- rth->fl.fl4_tos = tos;
- rth->fl.mark = skb->mark;
- rth->fl.fl4_src = saddr;
+ rth->rt_tos = tos;
+ rth->rt_mark = skb->mark;
+ rth->rt_key_src = saddr;
rth->rt_src = saddr;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
- rth->rt_iif =
- rth->fl.iif = dev->ifindex;
+ rth->rt_iif = dev->ifindex;
rth->dst.dev = net->loopback_dev;
dev_hold(rth->dst.dev);
rth->rt_gateway = daddr;
@@ -2225,8 +2217,11 @@ local_input:
rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
- hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
- err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
+ hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
+ rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
+ err = 0;
+ if (IS_ERR(rth))
+ err = PTR_ERR(rth);
goto out;
no_route:
@@ -2288,12 +2283,12 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->dst.rt_next)) {
- if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
- ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
- (rth->fl.iif ^ iif) |
- rth->fl.oif |
- (rth->fl.fl4_tos ^ tos)) == 0 &&
- rth->fl.mark == skb->mark &&
+ if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
+ ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
+ (rth->rt_iif ^ iif) |
+ rth->rt_oif |
+ (rth->rt_tos ^ tos)) == 0 &&
+ rth->rt_mark == skb->mark &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
if (noref) {
@@ -2326,8 +2321,8 @@ skip_cache:
struct in_device *in_dev = __in_dev_get_rcu(dev);
if (in_dev) {
- int our = ip_check_mc(in_dev, daddr, saddr,
- ip_hdr(skb)->protocol);
+ int our = ip_check_mc_rcu(in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
if (our
#ifdef CONFIG_IP_MROUTE
||
@@ -2351,98 +2346,91 @@ skip_cache:
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
-static int __mkroute_output(struct rtable **result,
- struct fib_result *res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
+static struct rtable *__mkroute_output(const struct fib_result *res,
+ const struct flowi4 *fl4,
+ const struct flowi4 *oldflp4,
+ struct net_device *dev_out,
+ unsigned int flags)
{
- struct rtable *rth;
+ struct fib_info *fi = res->fi;
+ u32 tos = RT_FL_TOS(oldflp4);
struct in_device *in_dev;
- u32 tos = RT_FL_TOS(oldflp);
+ u16 type = res->type;
+ struct rtable *rth;
- if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
- return -EINVAL;
+ if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+ return ERR_PTR(-EINVAL);
- if (ipv4_is_lbcast(fl->fl4_dst))
- res->type = RTN_BROADCAST;
- else if (ipv4_is_multicast(fl->fl4_dst))
- res->type = RTN_MULTICAST;
- else if (ipv4_is_zeronet(fl->fl4_dst))
- return -EINVAL;
+ if (ipv4_is_lbcast(fl4->daddr))
+ type = RTN_BROADCAST;
+ else if (ipv4_is_multicast(fl4->daddr))
+ type = RTN_MULTICAST;
+ else if (ipv4_is_zeronet(fl4->daddr))
+ return ERR_PTR(-EINVAL);
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
in_dev = __in_dev_get_rcu(dev_out);
if (!in_dev)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- if (res->type == RTN_BROADCAST) {
+ if (type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
- res->fi = NULL;
- } else if (res->type == RTN_MULTICAST) {
+ fi = NULL;
+ } else if (type == RTN_MULTICAST) {
flags |= RTCF_MULTICAST | RTCF_LOCAL;
- if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
- oldflp->proto))
+ if (!ip_check_mc_rcu(in_dev, oldflp4->daddr, oldflp4->saddr,
+ oldflp4->flowi4_proto))
flags &= ~RTCF_LOCAL;
/* If multicast route do not exist use
* default one, but do not gateway in this case.
* Yes, it is hack.
*/
- if (res->fi && res->prefixlen < 4)
- res->fi = NULL;
+ if (fi && res->prefixlen < 4)
+ fi = NULL;
}
-
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(in_dev, NOXFRM));
if (!rth)
- return -ENOBUFS;
-
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOXFRM))
- rth->dst.flags |= DST_NOXFRM;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
-
- rth->fl.fl4_dst = oldflp->fl4_dst;
- rth->fl.fl4_tos = tos;
- rth->fl.fl4_src = oldflp->fl4_src;
- rth->fl.oif = oldflp->oif;
- rth->fl.mark = oldflp->mark;
- rth->rt_dst = fl->fl4_dst;
- rth->rt_src = fl->fl4_src;
- rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
+ return ERR_PTR(-ENOBUFS);
+
+ rth->rt_key_dst = oldflp4->daddr;
+ rth->rt_tos = tos;
+ rth->rt_key_src = oldflp4->saddr;
+ rth->rt_oif = oldflp4->flowi4_oif;
+ rth->rt_mark = oldflp4->flowi4_mark;
+ rth->rt_dst = fl4->daddr;
+ rth->rt_src = fl4->saddr;
+ rth->rt_iif = 0;
/* get references to the devices that are to be hold by the routing
cache entry */
rth->dst.dev = dev_out;
dev_hold(dev_out);
- rth->rt_gateway = fl->fl4_dst;
- rth->rt_spec_dst= fl->fl4_src;
+ rth->rt_gateway = fl4->daddr;
+ rth->rt_spec_dst= fl4->saddr;
rth->dst.output=ip_output;
- rth->dst.obsolete = -1;
rth->rt_genid = rt_genid(dev_net(dev_out));
RT_CACHE_STAT_INC(out_slow_tot);
if (flags & RTCF_LOCAL) {
rth->dst.input = ip_local_deliver;
- rth->rt_spec_dst = fl->fl4_dst;
+ rth->rt_spec_dst = fl4->daddr;
}
if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
- rth->rt_spec_dst = fl->fl4_src;
+ rth->rt_spec_dst = fl4->saddr;
if (flags & RTCF_LOCAL &&
!(dev_out->flags & IFF_LOOPBACK)) {
rth->dst.output = ip_mc_output;
RT_CACHE_STAT_INC(out_slow_mc);
}
#ifdef CONFIG_IP_MROUTE
- if (res->type == RTN_MULTICAST) {
+ if (type == RTN_MULTICAST) {
if (IN_DEV_MFORWARD(in_dev) &&
- !ipv4_is_local_multicast(oldflp->fl4_dst)) {
+ !ipv4_is_local_multicast(oldflp4->daddr)) {
rth->dst.input = ip_mr_input;
rth->dst.output = ip_mc_output;
}
@@ -2450,31 +2438,10 @@ static int __mkroute_output(struct rtable **result,
#endif
}
- rt_set_nexthop(rth, res, 0);
+ rt_set_nexthop(rth, oldflp4, res, fi, type, 0);
rth->rt_flags = flags;
- *result = rth;
- return 0;
-}
-
-/* called with rcu_read_lock() */
-static int ip_mkroute_output(struct rtable **rp,
- struct fib_result *res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
-{
- struct rtable *rth = NULL;
- int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
- unsigned hash;
- if (err == 0) {
- hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
- rt_genid(dev_net(dev_out)));
- err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
- }
-
- return err;
+ return rth;
}
/*
@@ -2482,34 +2449,36 @@ static int ip_mkroute_output(struct rtable **rp,
* called with rcu_read_lock();
*/
-static int ip_route_output_slow(struct net *net, struct rtable **rp,
- const struct flowi *oldflp)
-{
- u32 tos = RT_FL_TOS(oldflp);
- struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
- .fl4_src = oldflp->fl4_src,
- .fl4_tos = tos & IPTOS_RT_MASK,
- .fl4_scope = ((tos & RTO_ONLINK) ?
- RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
- .mark = oldflp->mark,
- .iif = net->loopback_dev->ifindex,
- .oif = oldflp->oif };
+static struct rtable *ip_route_output_slow(struct net *net,
+ const struct flowi4 *oldflp4)
+{
+ u32 tos = RT_FL_TOS(oldflp4);
+ struct flowi4 fl4;
struct fib_result res;
unsigned int flags = 0;
struct net_device *dev_out = NULL;
- int err;
-
+ struct rtable *rth;
res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
res.r = NULL;
#endif
- if (oldflp->fl4_src) {
- err = -EINVAL;
- if (ipv4_is_multicast(oldflp->fl4_src) ||
- ipv4_is_lbcast(oldflp->fl4_src) ||
- ipv4_is_zeronet(oldflp->fl4_src))
+ fl4.flowi4_oif = oldflp4->flowi4_oif;
+ fl4.flowi4_iif = net->loopback_dev->ifindex;
+ fl4.flowi4_mark = oldflp4->flowi4_mark;
+ fl4.daddr = oldflp4->daddr;
+ fl4.saddr = oldflp4->saddr;
+ fl4.flowi4_tos = tos & IPTOS_RT_MASK;
+ fl4.flowi4_scope = ((tos & RTO_ONLINK) ?
+ RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
+
+ rcu_read_lock();
+ if (oldflp4->saddr) {
+ rth = ERR_PTR(-EINVAL);
+ if (ipv4_is_multicast(oldflp4->saddr) ||
+ ipv4_is_lbcast(oldflp4->saddr) ||
+ ipv4_is_zeronet(oldflp4->saddr))
goto out;
/* I removed check for oif == dev_out->oif here.
@@ -2520,11 +2489,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
of another iface. --ANK
*/
- if (oldflp->oif == 0 &&
- (ipv4_is_multicast(oldflp->fl4_dst) ||
- ipv4_is_lbcast(oldflp->fl4_dst))) {
+ if (oldflp4->flowi4_oif == 0 &&
+ (ipv4_is_multicast(oldflp4->daddr) ||
+ ipv4_is_lbcast(oldflp4->daddr))) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
+ dev_out = __ip_dev_find(net, oldflp4->saddr, false);
if (dev_out == NULL)
goto out;
@@ -2543,60 +2512,60 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
Luckily, this hack is good workaround.
*/
- fl.oif = dev_out->ifindex;
+ fl4.flowi4_oif = dev_out->ifindex;
goto make_route;
}
- if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
+ if (!(oldflp4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- if (!__ip_dev_find(net, oldflp->fl4_src, false))
+ if (!__ip_dev_find(net, oldflp4->saddr, false))
goto out;
}
}
- if (oldflp->oif) {
- dev_out = dev_get_by_index_rcu(net, oldflp->oif);
- err = -ENODEV;
+ if (oldflp4->flowi4_oif) {
+ dev_out = dev_get_by_index_rcu(net, oldflp4->flowi4_oif);
+ rth = ERR_PTR(-ENODEV);
if (dev_out == NULL)
goto out;
/* RACE: Check return value of inet_select_addr instead. */
if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
- err = -ENETUNREACH;
+ rth = ERR_PTR(-ENETUNREACH);
goto out;
}
- if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
- ipv4_is_lbcast(oldflp->fl4_dst)) {
- if (!fl.fl4_src)
- fl.fl4_src = inet_select_addr(dev_out, 0,
- RT_SCOPE_LINK);
+ if (ipv4_is_local_multicast(oldflp4->daddr) ||
+ ipv4_is_lbcast(oldflp4->daddr)) {
+ if (!fl4.saddr)
+ fl4.saddr = inet_select_addr(dev_out, 0,
+ RT_SCOPE_LINK);
goto make_route;
}
- if (!fl.fl4_src) {
- if (ipv4_is_multicast(oldflp->fl4_dst))
- fl.fl4_src = inet_select_addr(dev_out, 0,
- fl.fl4_scope);
- else if (!oldflp->fl4_dst)
- fl.fl4_src = inet_select_addr(dev_out, 0,
- RT_SCOPE_HOST);
+ if (!fl4.saddr) {
+ if (ipv4_is_multicast(oldflp4->daddr))
+ fl4.saddr = inet_select_addr(dev_out, 0,
+ fl4.flowi4_scope);
+ else if (!oldflp4->daddr)
+ fl4.saddr = inet_select_addr(dev_out, 0,
+ RT_SCOPE_HOST);
}
}
- if (!fl.fl4_dst) {
- fl.fl4_dst = fl.fl4_src;
- if (!fl.fl4_dst)
- fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
+ if (!fl4.daddr) {
+ fl4.daddr = fl4.saddr;
+ if (!fl4.daddr)
+ fl4.daddr = fl4.saddr = htonl(INADDR_LOOPBACK);
dev_out = net->loopback_dev;
- fl.oif = net->loopback_dev->ifindex;
+ fl4.flowi4_oif = net->loopback_dev->ifindex;
res.type = RTN_LOCAL;
flags |= RTCF_LOCAL;
goto make_route;
}
- if (fib_lookup(net, &fl, &res)) {
+ if (fib_lookup(net, &fl4, &res)) {
res.fi = NULL;
- if (oldflp->oif) {
+ if (oldflp4->flowi4_oif) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2615,90 +2584,93 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
likely IPv6, but we do not.
*/
- if (fl.fl4_src == 0)
- fl.fl4_src = inet_select_addr(dev_out, 0,
- RT_SCOPE_LINK);
+ if (fl4.saddr == 0)
+ fl4.saddr = inet_select_addr(dev_out, 0,
+ RT_SCOPE_LINK);
res.type = RTN_UNICAST;
goto make_route;
}
- err = -ENETUNREACH;
+ rth = ERR_PTR(-ENETUNREACH);
goto out;
}
if (res.type == RTN_LOCAL) {
- if (!fl.fl4_src) {
+ if (!fl4.saddr) {
if (res.fi->fib_prefsrc)
- fl.fl4_src = res.fi->fib_prefsrc;
+ fl4.saddr = res.fi->fib_prefsrc;
else
- fl.fl4_src = fl.fl4_dst;
+ fl4.saddr = fl4.daddr;
}
dev_out = net->loopback_dev;
- fl.oif = dev_out->ifindex;
+ fl4.flowi4_oif = dev_out->ifindex;
res.fi = NULL;
flags |= RTCF_LOCAL;
goto make_route;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res.fi->fib_nhs > 1 && fl.oif == 0)
- fib_select_multipath(&fl, &res);
+ if (res.fi->fib_nhs > 1 && fl4.flowi4_oif == 0)
+ fib_select_multipath(&res);
else
#endif
- if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
- fib_select_default(net, &fl, &res);
+ if (!res.prefixlen && res.type == RTN_UNICAST && !fl4.flowi4_oif)
+ fib_select_default(&res);
- if (!fl.fl4_src)
- fl.fl4_src = FIB_RES_PREFSRC(res);
+ if (!fl4.saddr)
+ fl4.saddr = FIB_RES_PREFSRC(res);
dev_out = FIB_RES_DEV(res);
- fl.oif = dev_out->ifindex;
+ fl4.flowi4_oif = dev_out->ifindex;
make_route:
- err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
+ rth = __mkroute_output(&res, &fl4, oldflp4, dev_out, flags);
+ if (!IS_ERR(rth)) {
+ unsigned int hash;
-out: return err;
+ hash = rt_hash(oldflp4->daddr, oldflp4->saddr, oldflp4->flowi4_oif,
+ rt_genid(dev_net(dev_out)));
+ rth = rt_intern_hash(hash, rth, NULL, oldflp4->flowi4_oif);
+ }
+
+out:
+ rcu_read_unlock();
+ return rth;
}
-int __ip_route_output_key(struct net *net, struct rtable **rp,
- const struct flowi *flp)
+struct rtable *__ip_route_output_key(struct net *net, const struct flowi4 *flp4)
{
- unsigned int hash;
- int res;
struct rtable *rth;
+ unsigned int hash;
if (!rt_caching(net))
goto slow_output;
- hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
+ hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
rcu_read_lock_bh();
for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
rth = rcu_dereference_bh(rth->dst.rt_next)) {
- if (rth->fl.fl4_dst == flp->fl4_dst &&
- rth->fl.fl4_src == flp->fl4_src &&
+ if (rth->rt_key_dst == flp4->daddr &&
+ rth->rt_key_src == flp4->saddr &&
rt_is_output_route(rth) &&
- rth->fl.oif == flp->oif &&
- rth->fl.mark == flp->mark &&
- !((rth->fl.fl4_tos ^ flp->fl4_tos) &
+ rth->rt_oif == flp4->flowi4_oif &&
+ rth->rt_mark == flp4->flowi4_mark &&
+ !((rth->rt_tos ^ flp4->flowi4_tos) &
(IPTOS_RT_MASK | RTO_ONLINK)) &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
dst_use(&rth->dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
- *rp = rth;
- return 0;
+ return rth;
}
RT_CACHE_STAT_INC(out_hlist_search);
}
rcu_read_unlock_bh();
slow_output:
- rcu_read_lock();
- res = ip_route_output_slow(net, rp, flp);
- rcu_read_unlock();
- return res;
+ return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
@@ -2707,6 +2679,11 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo
return NULL;
}
+static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
+{
+ return 0;
+}
+
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
@@ -2716,20 +2693,19 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
.protocol = cpu_to_be16(ETH_P_IP),
.destroy = ipv4_dst_destroy,
.check = ipv4_blackhole_dst_check,
+ .default_mtu = ipv4_blackhole_default_mtu,
+ .default_advmss = ipv4_default_advmss,
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
};
-
-static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
+struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
- struct rtable *ort = *rp;
- struct rtable *rt = (struct rtable *)
- dst_alloc(&ipv4_dst_blackhole_ops);
+ struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, 1);
+ struct rtable *ort = (struct rtable *) dst_orig;
if (rt) {
struct dst_entry *new = &rt->dst;
- atomic_set(&new->__refcnt, 1);
new->__use = 1;
new->input = dst_discard;
new->output = dst_discard;
@@ -2739,7 +2715,12 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
if (new->dev)
dev_hold(new->dev);
- rt->fl = ort->fl;
+ rt->rt_key_dst = ort->rt_key_dst;
+ rt->rt_key_src = ort->rt_key_src;
+ rt->rt_tos = ort->rt_tos;
+ rt->rt_iif = ort->rt_iif;
+ rt->rt_oif = ort->rt_oif;
+ rt->rt_mark = ort->rt_mark;
rt->rt_genid = rt_genid(net);
rt->rt_flags = ort->rt_flags;
@@ -2752,46 +2733,40 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
rt->peer = ort->peer;
if (rt->peer)
atomic_inc(&rt->peer->refcnt);
+ rt->fi = ort->fi;
+ if (rt->fi)
+ atomic_inc(&rt->fi->fib_clntref);
dst_free(new);
}
- dst_release(&(*rp)->dst);
- *rp = rt;
- return rt ? 0 : -ENOMEM;
+ dst_release(dst_orig);
+
+ return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
-int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
- struct sock *sk, int flags)
+struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
+ struct sock *sk)
{
- int err;
+ struct rtable *rt = __ip_route_output_key(net, flp4);
- if ((err = __ip_route_output_key(net, rp, flp)) != 0)
- return err;
+ if (IS_ERR(rt))
+ return rt;
- if (flp->proto) {
- if (!flp->fl4_src)
- flp->fl4_src = (*rp)->rt_src;
- if (!flp->fl4_dst)
- flp->fl4_dst = (*rp)->rt_dst;
- err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
- flags ? XFRM_LOOKUP_WAIT : 0);
- if (err == -EREMOTE)
- err = ipv4_dst_blackhole(net, rp, flp);
-
- return err;
+ if (flp4->flowi4_proto) {
+ if (!flp4->saddr)
+ flp4->saddr = rt->rt_src;
+ if (!flp4->daddr)
+ flp4->daddr = rt->rt_dst;
+ rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
+ flowi4_to_flowi(flp4),
+ sk, 0);
}
- return 0;
+ return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
-int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
-{
- return ip_route_output_flow(net, rp, flp, NULL, 0);
-}
-EXPORT_SYMBOL(ip_route_output_key);
-
static int rt_fill_info(struct net *net,
struct sk_buff *skb, u32 pid, u32 seq, int event,
int nowait, unsigned int flags)
@@ -2810,7 +2785,7 @@ static int rt_fill_info(struct net *net,
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
- r->rtm_tos = rt->fl.fl4_tos;
+ r->rtm_tos = rt->rt_tos;
r->rtm_table = RT_TABLE_MAIN;
NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
r->rtm_type = rt->rt_type;
@@ -2822,19 +2797,19 @@ static int rt_fill_info(struct net *net,
NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
- if (rt->fl.fl4_src) {
+ if (rt->rt_key_src) {
r->rtm_src_len = 32;
- NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
+ NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
}
if (rt->dst.dev)
NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (rt->dst.tclassid)
NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
if (rt_is_input_route(rt))
NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
- else if (rt->rt_src != rt->fl.fl4_src)
+ else if (rt->rt_src != rt->rt_key_src)
NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
if (rt->rt_dst != rt->rt_gateway)
@@ -2843,11 +2818,12 @@ static int rt_fill_info(struct net *net,
if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
goto nla_put_failure;
- if (rt->fl.mark)
- NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
+ if (rt->rt_mark)
+ NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
error = rt->dst.error;
- expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
+ expires = (rt->peer && rt->peer->pmtu_expires) ?
+ rt->peer->pmtu_expires - jiffies : 0;
if (rt->peer) {
inet_peer_refcheck(rt->peer);
id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
@@ -2877,7 +2853,7 @@ static int rt_fill_info(struct net *net,
}
} else
#endif
- NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
+ NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
}
if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
@@ -2951,14 +2927,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
if (err == 0 && rt->dst.error)
err = -rt->dst.error;
} else {
- struct flowi fl = {
- .fl4_dst = dst,
- .fl4_src = src,
- .fl4_tos = rtm->rtm_tos,
- .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
- .mark = mark,
+ struct flowi4 fl4 = {
+ .daddr = dst,
+ .saddr = src,
+ .flowi4_tos = rtm->rtm_tos,
+ .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
+ .flowi4_mark = mark,
};
- err = ip_route_output_key(net, &rt, &fl);
+ rt = ip_route_output_key(net, &fl4);
+
+ err = 0;
+ if (IS_ERR(rt))
+ err = PTR_ERR(rt);
}
if (err)
@@ -3249,9 +3229,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
};
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
-#endif /* CONFIG_NET_CLS_ROUTE */
+#endif /* CONFIG_IP_ROUTE_CLASSID */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
@@ -3267,7 +3247,7 @@ int __init ip_rt_init(void)
{
int rc = 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
if (!ip_rt_acct)
panic("IP: failed to allocate ip_rt_acct\n");
@@ -3304,14 +3284,6 @@ int __init ip_rt_init(void)
devinet_init();
ip_fib_init();
- /* All the timers, started at system startup tend
- to synchronize. Perturb it a bit.
- */
- INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
- expires_ljiffies = jiffies;
- schedule_delayed_work(&expires_work,
- net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
if (ip_rt_proc_init())
printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 47519205a01..8b44c6d2a79 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -345,17 +345,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
* no easy way to do this.
*/
{
- struct flowi fl = { .mark = sk->sk_mark,
- .fl4_dst = ((opt && opt->srr) ?
- opt->faddr : ireq->rmt_addr),
- .fl4_src = ireq->loc_addr,
- .fl4_tos = RT_CONN_FLAGS(sk),
- .proto = IPPROTO_TCP,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = th->dest,
- .fl_ip_dport = th->source };
- security_req_classify_flow(req, &fl);
- if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
+ struct flowi4 fl4 = {
+ .flowi4_mark = sk->sk_mark,
+ .daddr = ((opt && opt->srr) ?
+ opt->faddr : ireq->rmt_addr),
+ .saddr = ireq->loc_addr,
+ .flowi4_tos = RT_CONN_FLAGS(sk),
+ .flowi4_proto = IPPROTO_TCP,
+ .flowi4_flags = inet_sk_flowi_flags(sk),
+ .fl4_sport = th->dest,
+ .fl4_dport = th->source,
+ };
+ security_req_classify_flow(req, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(sock_net(sk), &fl4);
+ if (IS_ERR(rt)) {
reqsk_free(req);
goto out;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6c11eece262..b22d4501054 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -505,6 +505,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
else
answ = tp->write_seq - tp->snd_una;
break;
+ case SIOCOUTQNSD:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ answ = 0;
+ else
+ answ = tp->write_seq - tp->snd_nxt;
+ break;
default:
return -ENOIOCTLCMD;
}
@@ -873,9 +882,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
flags);
lock_sock(sk);
- TCP_CHECK_TIMER(sk);
res = do_tcp_sendpages(sk, &page, offset, size, flags);
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return res;
}
@@ -916,7 +923,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
long timeo;
lock_sock(sk);
- TCP_CHECK_TIMER(sk);
flags = msg->msg_flags;
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -1104,7 +1110,6 @@ wait_for_memory:
out:
if (copied)
tcp_push(sk, flags, mss_now, tp->nonagle);
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return copied;
@@ -1123,7 +1128,6 @@ do_error:
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return err;
}
@@ -1415,8 +1419,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
lock_sock(sk);
- TCP_CHECK_TIMER(sk);
-
err = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
goto out;
@@ -1767,12 +1769,10 @@ skip_copy:
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return copied;
out:
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return err;
@@ -2653,7 +2653,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
-struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
+struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct tcphdr *th;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 3b53fd1af23..6187eb4d1dc 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -209,7 +209,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
}
-static struct tcp_congestion_ops bictcp = {
+static struct tcp_congestion_ops bictcp __read_mostly = {
.init = bictcp_init,
.ssthresh = bictcp_recalc_ssthresh,
.cong_avoid = bictcp_cong_avoid,
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 71d5f2f29fa..62f775cb086 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -405,7 +405,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
hystart_update(sk, delay);
}
-static struct tcp_congestion_ops cubictcp = {
+static struct tcp_congestion_ops cubictcp __read_mostly = {
.init = bictcp_init,
.ssthresh = bictcp_recalc_ssthresh,
.cong_avoid = bictcp_cong_avoid,
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8b6caaf75bb..30f27f6b365 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -158,7 +158,7 @@ static u32 hstcp_ssthresh(struct sock *sk)
}
-static struct tcp_congestion_ops tcp_highspeed = {
+static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
.init = hstcp_init,
.ssthresh = hstcp_ssthresh,
.cong_avoid = hstcp_cong_avoid,
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 7c94a495541..c1a8175361e 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state)
}
}
-static struct tcp_congestion_ops htcp = {
+static struct tcp_congestion_ops htcp __read_mostly = {
.init = htcp_init,
.ssthresh = htcp_recalc_ssthresh,
.cong_avoid = htcp_cong_avoid,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 377bc934937..fe3ecf484b4 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -162,7 +162,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
}
-static struct tcp_congestion_ops tcp_hybla = {
+static struct tcp_congestion_ops tcp_hybla __read_mostly = {
.init = hybla_init,
.ssthresh = tcp_reno_ssthresh,
.min_cwnd = tcp_reno_min_cwnd,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 00ca688d896..813b43a76fe 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -322,7 +322,7 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
}
}
-static struct tcp_congestion_ops tcp_illinois = {
+static struct tcp_congestion_ops tcp_illinois __read_mostly = {
.flags = TCP_CONG_RTT_STAMP,
.init = tcp_illinois_init,
.ssthresh = tcp_illinois_ssthresh,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2549b29b062..08ea735b9d7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
if (!cwnd)
- cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
+ cwnd = TCP_INIT_CWND;
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
@@ -1222,7 +1222,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
}
/* D-SACK for already forgotten data... Do dumb counting. */
- if (dup_sack &&
+ if (dup_sack && tp->undo_marker && tp->undo_retrans &&
!after(end_seq_0, prior_snd_una) &&
after(end_seq_0, tp->undo_marker))
tp->undo_retrans--;
@@ -1299,7 +1299,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
/* Account D-SACK for retransmitted packet. */
if (dup_sack && (sacked & TCPCB_RETRANS)) {
- if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
+ if (tp->undo_marker && tp->undo_retrans &&
+ after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
tp->undo_retrans--;
if (sacked & TCPCB_SACKED_ACKED)
state->reord = min(fack_count, state->reord);
@@ -4399,7 +4400,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
- eaten = (chunk == skb->len && !th->fin);
+ eaten = (chunk == skb->len);
tcp_rcv_space_adjust(sk);
}
local_bh_disable();
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 856f68466d4..f7e6c2c2d2b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -149,9 +149,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ __be16 orig_sport, orig_dport;
struct rtable *rt;
__be32 daddr, nexthop;
- int tmp;
int err;
if (addr_len < sizeof(struct sockaddr_in))
@@ -167,14 +167,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
nexthop = inet->opt->faddr;
}
- tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
- IPPROTO_TCP,
- inet->inet_sport, usin->sin_port, sk, 1);
- if (tmp < 0) {
- if (tmp == -ENETUNREACH)
+ orig_sport = inet->inet_sport;
+ orig_dport = usin->sin_port;
+ rt = ip_route_connect(nexthop, inet->inet_saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_TCP,
+ orig_sport, orig_dport, sk, true);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ if (err == -ENETUNREACH)
IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
- return tmp;
+ return err;
}
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
@@ -233,11 +236,14 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (err)
goto failure;
- err = ip_route_newports(&rt, IPPROTO_TCP,
- inet->inet_sport, inet->inet_dport, sk);
- if (err)
+ rt = ip_route_newports(rt, IPPROTO_TCP,
+ orig_sport, orig_dport,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
goto failure;
-
+ }
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
@@ -1341,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_death_row.sysctl_tw_recycle &&
(dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
- peer->daddr.a4 == saddr) {
+ peer->daddr.addr.a4 == saddr) {
inet_peer_refcheck(peer);
if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
(s32)(peer->tcp_ts - req->ts_recent) >
@@ -1556,12 +1562,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
sock_rps_save_rxhash(sk, skb->rxhash);
- TCP_CHECK_TIMER(sk);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
- TCP_CHECK_TIMER(sk);
return 0;
}
@@ -1583,13 +1587,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
} else
sock_rps_save_rxhash(sk, skb->rxhash);
-
- TCP_CHECK_TIMER(sk);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
- TCP_CHECK_TIMER(sk);
return 0;
reset:
@@ -1994,7 +1995,6 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
}
req = req->dl_next;
}
- st->offset = 0;
if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
break;
get_req:
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index de870377fbb..656d431c99a 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -313,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
lp->last_drop = tcp_time_stamp;
}
-static struct tcp_congestion_ops tcp_lp = {
+static struct tcp_congestion_ops tcp_lp __read_mostly = {
.flags = TCP_CONG_RTT_STAMP,
.init = tcp_lp_init,
.ssthresh = tcp_reno_ssthresh,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 406f320336e..dfa5beb0c1c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2162,7 +2162,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (!tp->retrans_stamp)
tp->retrans_stamp = TCP_SKB_CB(skb)->when;
- tp->undo_retrans++;
+ tp->undo_retrans += tcp_skb_pcount(skb);
/* snd_nxt is stored to detect loss of retransmitted segment,
* see tcp_input.c tcp_sacktag_write_queue().
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index a76513779e2..8ce55b8aaec 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)
}
-static struct tcp_congestion_ops tcp_scalable = {
+static struct tcp_congestion_ops tcp_scalable __read_mostly = {
.ssthresh = tcp_scalable_ssthresh,
.cong_avoid = tcp_scalable_cong_avoid,
.min_cwnd = tcp_reno_min_cwnd,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 74a6aa00365..ecd44b0c45f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -259,7 +259,6 @@ static void tcp_delack_timer(unsigned long data)
tcp_send_ack(sk);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
- TCP_CHECK_TIMER(sk);
out:
if (tcp_memory_pressure)
@@ -481,7 +480,6 @@ static void tcp_write_timer(unsigned long data)
tcp_probe_timer(sk);
break;
}
- TCP_CHECK_TIMER(sk);
out:
sk_mem_reclaim(sk);
@@ -589,7 +587,6 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = keepalive_time_when(tp) - elapsed;
}
- TCP_CHECK_TIMER(sk);
sk_mem_reclaim(sk);
resched:
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index c6743eec9b7..80fa2bfd7ed 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -304,7 +304,7 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
-static struct tcp_congestion_ops tcp_vegas = {
+static struct tcp_congestion_ops tcp_vegas __read_mostly = {
.flags = TCP_CONG_RTT_STAMP,
.init = tcp_vegas_init,
.ssthresh = tcp_reno_ssthresh,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 38bc0b52d74..ac43cd747bc 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -201,7 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
return max(tp->snd_cwnd >> 1U, 2U);
}
-static struct tcp_congestion_ops tcp_veno = {
+static struct tcp_congestion_ops tcp_veno __read_mostly = {
.flags = TCP_CONG_RTT_STAMP,
.init = tcp_veno_init,
.ssthresh = tcp_veno_ssthresh,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index a534dda5456..1b91bf48e27 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -272,7 +272,7 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
}
-static struct tcp_congestion_ops tcp_westwood = {
+static struct tcp_congestion_ops tcp_westwood __read_mostly = {
.init = tcp_westwood_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index a0f24035889..dc7f43179c9 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -225,7 +225,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
return tp->snd_cwnd - reduction;
}
-static struct tcp_congestion_ops tcp_yeah = {
+static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.flags = TCP_CONG_RTT_STAMP,
.init = tcp_yeah_init,
.ssthresh = tcp_yeah_ssthresh,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8157b17959e..588f47af5fa 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -663,75 +663,72 @@ void udp_flush_pending_frames(struct sock *sk)
EXPORT_SYMBOL(udp_flush_pending_frames);
/**
- * udp4_hwcsum_outgoing - handle outgoing HW checksumming
- * @sk: socket we are sending on
+ * udp4_hwcsum - handle outgoing HW checksumming
* @skb: sk_buff containing the filled-in UDP header
* (checksum field must be zeroed out)
+ * @src: source IP address
+ * @dst: destination IP address
*/
-static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
- __be32 src, __be32 dst, int len)
+static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
{
- unsigned int offset;
struct udphdr *uh = udp_hdr(skb);
+ struct sk_buff *frags = skb_shinfo(skb)->frag_list;
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
+ int hlen = len;
__wsum csum = 0;
- if (skb_queue_len(&sk->sk_write_queue) == 1) {
+ if (!frags) {
/*
* Only one fragment on the socket.
*/
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct udphdr, check);
- uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0);
+ uh->check = ~csum_tcpudp_magic(src, dst, len,
+ IPPROTO_UDP, 0);
} else {
/*
* HW-checksum won't work as there are two or more
* fragments on the socket so that all csums of sk_buffs
* should be together
*/
- offset = skb_transport_offset(skb);
- skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
+ do {
+ csum = csum_add(csum, frags->csum);
+ hlen -= frags->len;
+ } while ((frags = frags->next));
+ csum = skb_checksum(skb, offset, hlen, csum);
skb->ip_summed = CHECKSUM_NONE;
- skb_queue_walk(&sk->sk_write_queue, skb) {
- csum = csum_add(csum, skb->csum);
- }
-
uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
}
}
-/*
- * Push out all pending data as one UDP datagram. Socket is locked.
- */
-static int udp_push_pending_frames(struct sock *sk)
+static int udp_send_skb(struct sk_buff *skb, __be32 daddr, __be32 dport)
{
- struct udp_sock *up = udp_sk(sk);
+ struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
- struct flowi *fl = &inet->cork.fl;
- struct sk_buff *skb;
struct udphdr *uh;
+ struct rtable *rt = (struct rtable *)skb_dst(skb);
int err = 0;
int is_udplite = IS_UDPLITE(sk);
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
__wsum csum = 0;
- /* Grab the skbuff where UDP header space exists. */
- if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
- goto out;
-
/*
* Create a UDP header
*/
uh = udp_hdr(skb);
- uh->source = fl->fl_ip_sport;
- uh->dest = fl->fl_ip_dport;
- uh->len = htons(up->len);
+ uh->source = inet->inet_sport;
+ uh->dest = dport;
+ uh->len = htons(len);
uh->check = 0;
if (is_udplite) /* UDP-Lite */
- csum = udplite_csum_outgoing(sk, skb);
+ csum = udplite_csum(skb);
else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
@@ -740,20 +737,20 @@ static int udp_push_pending_frames(struct sock *sk)
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
- udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len);
+ udp4_hwcsum(skb, rt->rt_src, daddr);
goto send;
- } else /* `normal' UDP */
- csum = udp_csum_outgoing(sk, skb);
+ } else
+ csum = udp_csum(skb);
/* add protocol-dependent pseudo-header */
- uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len,
+ uh->check = csum_tcpudp_magic(rt->rt_src, daddr, len,
sk->sk_protocol, csum);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
send:
- err = ip_push_pending_frames(sk);
+ err = ip_send_skb(skb);
if (err) {
if (err == -ENOBUFS && !inet->recverr) {
UDP_INC_STATS_USER(sock_net(sk),
@@ -763,6 +760,26 @@ send:
} else
UDP_INC_STATS_USER(sock_net(sk),
UDP_MIB_OUTDATAGRAMS, is_udplite);
+ return err;
+}
+
+/*
+ * Push out all pending data as one UDP datagram. Socket is locked.
+ */
+static int udp_push_pending_frames(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
+ struct sk_buff *skb;
+ int err = 0;
+
+ skb = ip_finish_skb(sk);
+ if (!skb)
+ goto out;
+
+ err = udp_send_skb(skb, fl4->daddr, fl4->fl4_dport);
+
out:
up->len = 0;
up->pending = 0;
@@ -774,6 +791,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
{
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
+ struct flowi4 *fl4;
int ulen = len;
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
@@ -785,6 +803,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int err, is_udplite = IS_UDPLITE(sk);
int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+ struct sk_buff *skb;
if (len > 0xFFFF)
return -EMSGSIZE;
@@ -799,6 +818,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.opt = NULL;
ipc.tx_flags = 0;
+ getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
+
if (up->pending) {
/*
* There are pending frames.
@@ -888,20 +909,25 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
rt = (struct rtable *)sk_dst_check(sk, 0);
if (rt == NULL) {
- struct flowi fl = { .oif = ipc.oif,
- .mark = sk->sk_mark,
- .fl4_dst = faddr,
- .fl4_src = saddr,
- .fl4_tos = tos,
- .proto = sk->sk_protocol,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = inet->inet_sport,
- .fl_ip_dport = dport };
+ struct flowi4 fl4 = {
+ .flowi4_oif = ipc.oif,
+ .flowi4_mark = sk->sk_mark,
+ .daddr = faddr,
+ .saddr = saddr,
+ .flowi4_tos = tos,
+ .flowi4_proto = sk->sk_protocol,
+ .flowi4_flags = (inet_sk_flowi_flags(sk) |
+ FLOWI_FLAG_CAN_SLEEP),
+ .fl4_sport = inet->inet_sport,
+ .fl4_dport = dport,
+ };
struct net *net = sock_net(sk);
- security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(net, &rt, &fl, sk, 1);
- if (err) {
+ security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
if (err == -ENETUNREACH)
IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
@@ -923,6 +949,17 @@ back_from_confirm:
if (!ipc.addr)
daddr = ipc.addr = rt->rt_dst;
+ /* Lockless fast path for the non-corking case. */
+ if (!corkreq) {
+ skb = ip_make_skb(sk, getfrag, msg->msg_iov, ulen,
+ sizeof(struct udphdr), &ipc, &rt,
+ msg->msg_flags);
+ err = PTR_ERR(skb);
+ if (skb && !IS_ERR(skb))
+ err = udp_send_skb(skb, daddr, dport);
+ goto out;
+ }
+
lock_sock(sk);
if (unlikely(up->pending)) {
/* The socket is already corked while preparing it. */
@@ -936,15 +973,15 @@ back_from_confirm:
/*
* Now cork the socket to pend data.
*/
- inet->cork.fl.fl4_dst = daddr;
- inet->cork.fl.fl_ip_dport = dport;
- inet->cork.fl.fl4_src = saddr;
- inet->cork.fl.fl_ip_sport = inet->inet_sport;
+ fl4 = &inet->cork.fl.u.ip4;
+ fl4->daddr = daddr;
+ fl4->saddr = saddr;
+ fl4->fl4_dport = dport;
+ fl4->fl4_sport = inet->inet_sport;
up->pending = AF_INET;
do_append_data:
up->len += ulen;
- getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
sizeof(struct udphdr), &ipc, &rt,
corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
@@ -2199,7 +2236,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
return 0;
}
-struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
+struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b057d40adde..13e0e7f659f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -19,25 +19,23 @@
static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
- xfrm_address_t *saddr,
- xfrm_address_t *daddr)
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
{
- struct flowi fl = {
- .fl4_dst = daddr->a4,
- .fl4_tos = tos,
+ struct flowi4 fl4 = {
+ .daddr = daddr->a4,
+ .flowi4_tos = tos,
};
- struct dst_entry *dst;
struct rtable *rt;
- int err;
if (saddr)
- fl.fl4_src = saddr->a4;
+ fl4.saddr = saddr->a4;
+
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt))
+ return &rt->dst;
- err = __ip_route_output_key(net, &rt, &fl);
- dst = &rt->dst;
- if (err)
- dst = ERR_PTR(err);
- return dst;
+ return ERR_CAST(rt);
}
static int xfrm4_get_saddr(struct net *net,
@@ -56,9 +54,9 @@ static int xfrm4_get_saddr(struct net *net,
return 0;
}
-static int xfrm4_get_tos(struct flowi *fl)
+static int xfrm4_get_tos(const struct flowi *fl)
{
- return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
+ return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
}
static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -68,11 +66,17 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
}
static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- struct flowi *fl)
+ const struct flowi *fl)
{
struct rtable *rt = (struct rtable *)xdst->route;
+ const struct flowi4 *fl4 = &fl->u.ip4;
- xdst->u.rt.fl = *fl;
+ rt->rt_key_dst = fl4->daddr;
+ rt->rt_key_src = fl4->saddr;
+ rt->rt_tos = fl4->flowi4_tos;
+ rt->rt_iif = fl4->flowi4_iif;
+ rt->rt_oif = fl4->flowi4_oif;
+ rt->rt_mark = fl4->flowi4_mark;
xdst->u.dst.dev = dev;
dev_hold(dev);
@@ -99,9 +103,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
{
struct iphdr *iph = ip_hdr(skb);
u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
+ struct flowi4 *fl4 = &fl->u.ip4;
- memset(fl, 0, sizeof(struct flowi));
- fl->mark = skb->mark;
+ memset(fl4, 0, sizeof(struct flowi4));
+ fl4->flowi4_mark = skb->mark;
if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
switch (iph->protocol) {
@@ -114,8 +119,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
pskb_may_pull(skb, xprth + 4 - skb->data)) {
__be16 *ports = (__be16 *)xprth;
- fl->fl_ip_sport = ports[!!reverse];
- fl->fl_ip_dport = ports[!reverse];
+ fl4->fl4_sport = ports[!!reverse];
+ fl4->fl4_dport = ports[!reverse];
}
break;
@@ -123,8 +128,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
u8 *icmp = xprth;
- fl->fl_icmp_type = icmp[0];
- fl->fl_icmp_code = icmp[1];
+ fl4->fl4_icmp_type = icmp[0];
+ fl4->fl4_icmp_code = icmp[1];
}
break;
@@ -132,7 +137,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
__be32 *ehdr = (__be32 *)xprth;
- fl->fl_ipsec_spi = ehdr[0];
+ fl4->fl4_ipsec_spi = ehdr[0];
}
break;
@@ -140,7 +145,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
__be32 *ah_hdr = (__be32*)xprth;
- fl->fl_ipsec_spi = ah_hdr[1];
+ fl4->fl4_ipsec_spi = ah_hdr[1];
}
break;
@@ -148,7 +153,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
__be16 *ipcomp_hdr = (__be16 *)xprth;
- fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
+ fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
}
break;
@@ -160,20 +165,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
if (greflags[0] & GRE_KEY) {
if (greflags[0] & GRE_CSUM)
gre_hdr++;
- fl->fl_gre_key = gre_hdr[1];
+ fl4->fl4_gre_key = gre_hdr[1];
}
}
break;
default:
- fl->fl_ipsec_spi = 0;
+ fl4->fl4_ipsec_spi = 0;
break;
}
}
- fl->proto = iph->protocol;
- fl->fl4_dst = reverse ? iph->saddr : iph->daddr;
- fl->fl4_src = reverse ? iph->daddr : iph->saddr;
- fl->fl4_tos = iph->tos;
+ fl4->flowi4_proto = iph->protocol;
+ fl4->daddr = reverse ? iph->saddr : iph->daddr;
+ fl4->saddr = reverse ? iph->daddr : iph->saddr;
+ fl4->flowi4_tos = iph->tos;
}
static inline int xfrm4_garbage_collect(struct dst_ops *ops)
@@ -196,8 +201,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ dst_destroy_metrics_generic(dst);
+
if (likely(xdst->u.rt.peer))
inet_putpeer(xdst->u.rt.peer);
+
xfrm_dst_destroy(xdst);
}
@@ -215,6 +223,7 @@ static struct dst_ops xfrm4_dst_ops = {
.protocol = cpu_to_be16(ETH_P_IP),
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
+ .cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
@@ -230,6 +239,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.get_tos = xfrm4_get_tos,
.init_path = xfrm4_init_path,
.fill_dst = xfrm4_fill_dst,
+ .blackhole_route = ipv4_blackhole_route,
};
#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 47947624ecc..1717c64628d 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,24 +21,26 @@ static int xfrm4_init_flags(struct xfrm_state *x)
}
static void
-__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
+__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
{
- sel->daddr.a4 = fl->fl4_dst;
- sel->saddr.a4 = fl->fl4_src;
- sel->dport = xfrm_flowi_dport(fl);
+ const struct flowi4 *fl4 = &fl->u.ip4;
+
+ sel->daddr.a4 = fl4->daddr;
+ sel->saddr.a4 = fl4->saddr;
+ sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
sel->dport_mask = htons(0xffff);
- sel->sport = xfrm_flowi_sport(fl);
+ sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
sel->sport_mask = htons(0xffff);
sel->family = AF_INET;
sel->prefixlen_d = 32;
sel->prefixlen_s = 32;
- sel->proto = fl->proto;
- sel->ifindex = fl->oif;
+ sel->proto = fl4->flowi4_proto;
+ sel->ifindex = fl4->flowi4_oif;
}
static void
-xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
- xfrm_address_t *daddr, xfrm_address_t *saddr)
+xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
+ const xfrm_address_t *daddr, const xfrm_address_t *saddr)
{
x->id = tmpl->id;
if (x->id.daddr.a4 == 0)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 5b189c97c2f..3daaf3c7703 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -420,9 +420,6 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
dev->type == ARPHRD_TUNNEL6 ||
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_NONE) {
- printk(KERN_INFO
- "%s: Disabled Privacy Extensions\n",
- dev->name);
ndev->cnf.use_tempaddr = -1;
} else {
in6_dev_hold(ndev);
@@ -721,12 +718,9 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
struct inet6_ifaddr *ifa, *ifn;
struct inet6_dev *idev = ifp->idev;
int state;
- int hash;
int deleted = 0, onlink = 0;
unsigned long expires = jiffies;
- hash = ipv6_addr_hash(&ifp->addr);
-
spin_lock_bh(&ifp->state_lock);
state = ifp->state;
ifp->state = INET6_IFADDR_STATE_DEAD;
@@ -2664,14 +2658,12 @@ static int addrconf_ifdown(struct net_device *dev, int how)
struct net *net = dev_net(dev);
struct inet6_dev *idev;
struct inet6_ifaddr *ifa;
- LIST_HEAD(keep_list);
- int state;
+ int state, i;
ASSERT_RTNL();
- /* Flush routes if device is being removed or it is not loopback */
- if (how || !(dev->flags & IFF_LOOPBACK))
- rt6_ifdown(net, dev);
+ rt6_ifdown(net, dev);
+ neigh_ifdown(&nd_tbl, dev);
idev = __in6_dev_get(dev);
if (idev == NULL)
@@ -2692,6 +2684,23 @@ static int addrconf_ifdown(struct net_device *dev, int how)
}
+ /* Step 2: clear hash table */
+ for (i = 0; i < IN6_ADDR_HSIZE; i++) {
+ struct hlist_head *h = &inet6_addr_lst[i];
+ struct hlist_node *n;
+
+ spin_lock_bh(&addrconf_hash_lock);
+ restart:
+ hlist_for_each_entry_rcu(ifa, n, h, addr_lst) {
+ if (ifa->idev == idev) {
+ hlist_del_init_rcu(&ifa->addr_lst);
+ addrconf_del_timer(ifa);
+ goto restart;
+ }
+ }
+ spin_unlock_bh(&addrconf_hash_lock);
+ }
+
write_lock_bh(&idev->lock);
/* Step 2: clear flags for stateless addrconf */
@@ -2725,52 +2734,23 @@ static int addrconf_ifdown(struct net_device *dev, int how)
struct inet6_ifaddr, if_list);
addrconf_del_timer(ifa);
- /* If just doing link down, and address is permanent
- and not link-local, then retain it. */
- if (!how &&
- (ifa->flags&IFA_F_PERMANENT) &&
- !(ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)) {
- list_move_tail(&ifa->if_list, &keep_list);
-
- /* If not doing DAD on this address, just keep it. */
- if ((dev->flags&(IFF_NOARP|IFF_LOOPBACK)) ||
- idev->cnf.accept_dad <= 0 ||
- (ifa->flags & IFA_F_NODAD))
- continue;
+ list_del(&ifa->if_list);
- /* If it was tentative already, no need to notify */
- if (ifa->flags & IFA_F_TENTATIVE)
- continue;
+ write_unlock_bh(&idev->lock);
- /* Flag it for later restoration when link comes up */
- ifa->flags |= IFA_F_TENTATIVE;
- ifa->state = INET6_IFADDR_STATE_DAD;
- } else {
- list_del(&ifa->if_list);
-
- /* clear hash table */
- spin_lock_bh(&addrconf_hash_lock);
- hlist_del_init_rcu(&ifa->addr_lst);
- spin_unlock_bh(&addrconf_hash_lock);
-
- write_unlock_bh(&idev->lock);
- spin_lock_bh(&ifa->state_lock);
- state = ifa->state;
- ifa->state = INET6_IFADDR_STATE_DEAD;
- spin_unlock_bh(&ifa->state_lock);
-
- if (state != INET6_IFADDR_STATE_DEAD) {
- __ipv6_ifa_notify(RTM_DELADDR, ifa);
- atomic_notifier_call_chain(&inet6addr_chain,
- NETDEV_DOWN, ifa);
- }
+ spin_lock_bh(&ifa->state_lock);
+ state = ifa->state;
+ ifa->state = INET6_IFADDR_STATE_DEAD;
+ spin_unlock_bh(&ifa->state_lock);
- in6_ifa_put(ifa);
- write_lock_bh(&idev->lock);
+ if (state != INET6_IFADDR_STATE_DEAD) {
+ __ipv6_ifa_notify(RTM_DELADDR, ifa);
+ atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa);
}
- }
+ in6_ifa_put(ifa);
- list_splice(&keep_list, &idev->addr_list);
+ write_lock_bh(&idev->lock);
+ }
write_unlock_bh(&idev->lock);
@@ -4159,8 +4139,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
addrconf_leave_solict(ifp->idev, &ifp->addr);
dst_hold(&ifp->rt->dst);
- if (ifp->state == INET6_IFADDR_STATE_DEAD &&
- ip6_del_rt(ifp->rt))
+ if (ip6_del_rt(ifp->rt))
dst_free(&ifp->rt->dst);
break;
}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 978e80e2c4a..4b13d5d8890 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -644,41 +644,34 @@ EXPORT_SYMBOL(inet6_unregister_protosw);
int inet6_sk_rebuild_header(struct sock *sk)
{
- int err;
- struct dst_entry *dst;
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct dst_entry *dst;
dst = __sk_dst_check(sk, np->dst_cookie);
if (dst == NULL) {
struct inet_sock *inet = inet_sk(sk);
struct in6_addr *final_p, final;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = sk->sk_protocol;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.fl6_flowlabel = np->flow_label;
- fl.oif = sk->sk_bound_dev_if;
- fl.mark = sk->sk_mark;
- fl.fl_ip_dport = inet->inet_dport;
- fl.fl_ip_sport = inet->inet_sport;
- security_sk_classify_flow(sk, &fl);
-
- final_p = fl6_update_dst(&fl, np->opt, &final);
-
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err) {
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = sk->sk_protocol;
+ ipv6_addr_copy(&fl6.daddr, &np->daddr);
+ ipv6_addr_copy(&fl6.saddr, &np->saddr);
+ fl6.flowlabel = np->flow_label;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = inet->inet_dport;
+ fl6.fl6_sport = inet->inet_sport;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+ if (IS_ERR(dst)) {
sk->sk_route_caps = 0;
- return err;
- }
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) {
- sk->sk_err_soft = -err;
- return err;
+ sk->sk_err_soft = -PTR_ERR(dst);
+ return PTR_ERR(dst);
}
__ip6_dst_store(sk, dst, NULL, NULL);
@@ -772,7 +765,7 @@ out:
return err;
}
-static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct ipv6hdr *ipv6h;
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 1aba54ae53c..2195ae65192 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -409,7 +409,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
ah->reserved = 0;
ah->spi = x->id.spi;
- ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+ ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg, 0, skb->len);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 320bdb877ee..16560336eb7 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -40,7 +40,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *daddr, *final_p, final;
struct dst_entry *dst;
- struct flowi fl;
+ struct flowi6 fl6;
struct ip6_flowlabel *flowlabel = NULL;
struct ipv6_txoptions *opt;
int addr_type;
@@ -59,11 +59,11 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (usin->sin6_family != AF_INET6)
return -EAFNOSUPPORT;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
if (np->sndflow) {
- fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) {
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+ fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
if (flowlabel == NULL)
return -EINVAL;
ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
@@ -137,7 +137,7 @@ ipv4_connected:
}
ipv6_addr_copy(&np->daddr, daddr);
- np->flow_label = fl.fl6_flowlabel;
+ np->flow_label = fl6.flowlabel;
inet->inet_dport = usin->sin6_port;
@@ -146,53 +146,46 @@ ipv4_connected:
* destination cache for it.
*/
- fl.proto = sk->sk_protocol;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.mark = sk->sk_mark;
- fl.fl_ip_dport = inet->inet_dport;
- fl.fl_ip_sport = inet->inet_sport;
+ fl6.flowi6_proto = sk->sk_protocol;
+ ipv6_addr_copy(&fl6.daddr, &np->daddr);
+ ipv6_addr_copy(&fl6.saddr, &np->saddr);
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = inet->inet_dport;
+ fl6.fl6_sport = inet->inet_sport;
- if (!fl.oif && (addr_type&IPV6_ADDR_MULTICAST))
- fl.oif = np->mcast_oif;
+ if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST))
+ fl6.flowi6_oif = np->mcast_oif;
- security_sk_classify_flow(sk, &fl);
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
opt = flowlabel ? flowlabel->opt : np->opt;
- final_p = fl6_update_dst(&fl, opt, &final);
+ final_p = fl6_update_dst(&fl6, opt, &final);
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+ err = 0;
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
goto out;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT);
- if (err < 0) {
- if (err == -EREMOTE)
- err = ip6_dst_blackhole(sk, &dst, &fl);
- if (err < 0)
- goto out;
}
/* source address lookup done in ip6_dst_lookup */
if (ipv6_addr_any(&np->saddr))
- ipv6_addr_copy(&np->saddr, &fl.fl6_src);
+ ipv6_addr_copy(&np->saddr, &fl6.saddr);
if (ipv6_addr_any(&np->rcv_saddr)) {
- ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src);
+ ipv6_addr_copy(&np->rcv_saddr, &fl6.saddr);
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
ip6_dst_store(sk, dst,
- ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ?
+ ipv6_addr_equal(&fl6.daddr, &np->daddr) ?
&np->daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
- ipv6_addr_equal(&fl.fl6_src, &np->saddr) ?
+ ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
&np->saddr :
#endif
NULL);
@@ -238,7 +231,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
kfree_skb(skb);
}
-void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info)
+void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct sock_exterr_skb *serr;
@@ -257,7 +250,7 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info)
skb_put(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
iph = ipv6_hdr(skb);
- ipv6_addr_copy(&iph->daddr, &fl->fl6_dst);
+ ipv6_addr_copy(&iph->daddr, &fl6->daddr);
serr = SKB_EXT_ERR(skb);
serr->ee.ee_errno = err;
@@ -268,7 +261,7 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info)
serr->ee.ee_info = info;
serr->ee.ee_data = 0;
serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
- serr->port = fl->fl_ip_dport;
+ serr->port = fl6->fl6_dport;
__skb_pull(skb, skb_tail_pointer(skb) - skb->data);
skb_reset_transport_header(skb);
@@ -277,7 +270,7 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info)
kfree_skb(skb);
}
-void ipv6_local_rxpmtu(struct sock *sk, struct flowi *fl, u32 mtu)
+void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct ipv6hdr *iph;
@@ -294,7 +287,7 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi *fl, u32 mtu)
skb_put(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
iph = ipv6_hdr(skb);
- ipv6_addr_copy(&iph->daddr, &fl->fl6_dst);
+ ipv6_addr_copy(&iph->daddr, &fl6->daddr);
mtu_info = IP6CBMTU(skb);
if (!mtu_info) {
@@ -306,7 +299,7 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi *fl, u32 mtu)
mtu_info->ip6m_addr.sin6_family = AF_INET6;
mtu_info->ip6m_addr.sin6_port = 0;
mtu_info->ip6m_addr.sin6_flowinfo = 0;
- mtu_info->ip6m_addr.sin6_scope_id = fl->oif;
+ mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif;
ipv6_addr_copy(&mtu_info->ip6m_addr.sin6_addr, &ipv6_hdr(skb)->daddr);
__skb_pull(skb, skb_tail_pointer(skb) - skb->data);
@@ -600,7 +593,7 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
}
int datagram_send_ctl(struct net *net,
- struct msghdr *msg, struct flowi *fl,
+ struct msghdr *msg, struct flowi6 *fl6,
struct ipv6_txoptions *opt,
int *hlimit, int *tclass, int *dontfrag)
{
@@ -636,16 +629,17 @@ int datagram_send_ctl(struct net *net,
src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
if (src_info->ipi6_ifindex) {
- if (fl->oif && src_info->ipi6_ifindex != fl->oif)
+ if (fl6->flowi6_oif &&
+ src_info->ipi6_ifindex != fl6->flowi6_oif)
return -EINVAL;
- fl->oif = src_info->ipi6_ifindex;
+ fl6->flowi6_oif = src_info->ipi6_ifindex;
}
addr_type = __ipv6_addr_type(&src_info->ipi6_addr);
rcu_read_lock();
- if (fl->oif) {
- dev = dev_get_by_index_rcu(net, fl->oif);
+ if (fl6->flowi6_oif) {
+ dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
if (!dev) {
rcu_read_unlock();
return -ENODEV;
@@ -661,7 +655,7 @@ int datagram_send_ctl(struct net *net,
strict ? dev : NULL, 0))
err = -EINVAL;
else
- ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr);
+ ipv6_addr_copy(&fl6->saddr, &src_info->ipi6_addr);
}
rcu_read_unlock();
@@ -678,13 +672,13 @@ int datagram_send_ctl(struct net *net,
goto exit_f;
}
- if (fl->fl6_flowlabel&IPV6_FLOWINFO_MASK) {
- if ((fl->fl6_flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) {
+ if (fl6->flowlabel&IPV6_FLOWINFO_MASK) {
+ if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) {
err = -EINVAL;
goto exit_f;
}
}
- fl->fl6_flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg);
+ fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg);
break;
case IPV6_2292HOPOPTS:
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 1b5c9825743..5aa8ec88f19 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -54,16 +54,20 @@ static u32 esp6_get_mtu(struct xfrm_state *x, int mtu);
/*
* Allocate an AEAD request structure with extra space for SG and IV.
*
- * For alignment considerations the IV is placed at the front, followed
- * by the request and finally the SG list.
+ * For alignment considerations the upper 32 bits of the sequence number are
+ * placed at the front, if present, followed by the IV, the request and
+ * finally the SG list.
*
* TODO: Use spare space in skb for this where possible.
*/
-static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
{
unsigned int len;
- len = crypto_aead_ivsize(aead);
+ len = seqhilen;
+
+ len += crypto_aead_ivsize(aead);
+
if (len) {
len += crypto_aead_alignmask(aead) &
~(crypto_tfm_ctx_alignment() - 1);
@@ -78,10 +82,16 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
return kmalloc(len, GFP_ATOMIC);
}
-static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp)
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{
+ return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+}
+
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
{
return crypto_aead_ivsize(aead) ?
- PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp;
+ PTR_ALIGN((u8 *)tmp + seqhilen,
+ crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
}
static inline struct aead_givcrypt_request *esp_tmp_givreq(
@@ -145,8 +155,12 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
int plen;
int tfclen;
int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
u8 *iv;
u8 *tail;
+ __be32 *seqhi;
struct esp_data *esp = x->data;
/* skb is pure payload to encrypt */
@@ -175,14 +189,25 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
goto error;
nfrags = err;
- tmp = esp_alloc_tmp(aead, nfrags + 1);
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
if (!tmp)
goto error;
- iv = esp_tmp_iv(aead, tmp);
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_givreq(aead, iv);
asg = esp_givreq_sg(aead, req);
- sg = asg + 1;
+ sg = asg + sglists;
/* Fill padding... */
tail = skb_tail_pointer(trailer);
@@ -204,19 +229,27 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
*skb_mac_header(skb) = IPPROTO_ESP;
esph->spi = x->id.spi;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
clen + alen);
- sg_init_one(asg, esph, sizeof(*esph));
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
- aead_givcrypt_set_assoc(req, asg, sizeof(*esph));
+ aead_givcrypt_set_assoc(req, asg, assoclen);
aead_givcrypt_set_giv(req, esph->enc_data,
- XFRM_SKB_CB(skb)->seq.output);
+ XFRM_SKB_CB(skb)->seq.output.low);
ESP_SKB_CB(skb)->tmp = tmp;
err = crypto_aead_givencrypt(req);
@@ -292,8 +325,12 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
struct sk_buff *trailer;
int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
int ret = 0;
void *tmp;
+ __be32 *seqhi;
u8 *iv;
struct scatterlist *sg;
struct scatterlist *asg;
@@ -314,12 +351,24 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
}
ret = -ENOMEM;
- tmp = esp_alloc_tmp(aead, nfrags + 1);
+
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
if (!tmp)
goto out;
ESP_SKB_CB(skb)->tmp = tmp;
- iv = esp_tmp_iv(aead, tmp);
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_req(aead, iv);
asg = esp_req_sg(aead, req);
- sg = asg + 1;
+ sg = asg + sglists;
@@ -333,11 +382,19 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
- sg_init_one(asg, esph, sizeof(*esph));
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
aead_request_set_callback(req, 0, esp_input_done, skb);
aead_request_set_crypt(req, sg, sg, elen, iv);
- aead_request_set_assoc(req, asg, sizeof(*esph));
+ aead_request_set_assoc(req, asg, assoclen);
ret = crypto_aead_decrypt(req);
if (ret == -EINPROGRESS)
@@ -443,10 +500,20 @@ static int esp_init_authenc(struct xfrm_state *x)
goto error;
err = -ENAMETOOLONG;
- if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)",
- x->aalg ? x->aalg->alg_name : "digest_null",
- x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
- goto error;
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authencesn(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ } else {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authenc(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ }
aead = crypto_alloc_aead(authenc_name, 0, 0);
err = PTR_ERR(aead);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 262f105d23b..79a485e8a70 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -876,22 +876,22 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
* fl6_update_dst - update flowi destination address with info given
* by srcrt option, if any.
*
- * @fl: flowi for which fl6_dst is to be updated
+ * @fl6: flowi6 for which daddr is to be updated
* @opt: struct ipv6_txoptions in which to look for srcrt opt
- * @orig: copy of original fl6_dst address if modified
+ * @orig: copy of original daddr if modified
*
* Returns NULL if no txoptions or no srcrt, otherwise returns orig
- * and initial value of fl->fl6_dst set in orig
+ * and initial value of fl6->daddr set in orig
*/
-struct in6_addr *fl6_update_dst(struct flowi *fl,
+struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
const struct ipv6_txoptions *opt,
struct in6_addr *orig)
{
if (!opt || !opt->srcrt)
return NULL;
- ipv6_addr_copy(orig, &fl->fl6_dst);
- ipv6_addr_copy(&fl->fl6_dst, ((struct rt0_hdr *)opt->srcrt)->addr);
+ ipv6_addr_copy(orig, &fl6->daddr);
+ ipv6_addr_copy(&fl6->daddr, ((struct rt0_hdr *)opt->srcrt)->addr);
return orig;
}
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index d829874d894..34d244df907 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -29,7 +29,7 @@ struct fib6_rule
u8 tclass;
};
-struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl,
+struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
int flags, pol_lookup_t lookup)
{
struct fib_lookup_arg arg = {
@@ -37,7 +37,8 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl,
.flags = FIB_LOOKUP_NOREF,
};
- fib_rules_lookup(net->ipv6.fib6_rules_ops, fl, flags, &arg);
+ fib_rules_lookup(net->ipv6.fib6_rules_ops,
+ flowi6_to_flowi(fl6), flags, &arg);
if (arg.result)
return arg.result;
@@ -49,6 +50,7 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl,
static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
{
+ struct flowi6 *flp6 = &flp->u.ip6;
struct rt6_info *rt = NULL;
struct fib6_table *table;
struct net *net = rule->fr_net;
@@ -71,7 +73,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
table = fib6_get_table(net, rule->table);
if (table)
- rt = lookup(net, table, flp, flags);
+ rt = lookup(net, table, flp6, flags);
if (rt != net->ipv6.ip6_null_entry) {
struct fib6_rule *r = (struct fib6_rule *)rule;
@@ -86,14 +88,14 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
if (ipv6_dev_get_saddr(net,
ip6_dst_idev(&rt->dst)->dev,
- &flp->fl6_dst,
+ &flp6->daddr,
rt6_flags2srcprefs(flags),
&saddr))
goto again;
if (!ipv6_prefix_equal(&saddr, &r->src.addr,
r->src.plen))
goto again;
- ipv6_addr_copy(&flp->fl6_src, &saddr);
+ ipv6_addr_copy(&flp6->saddr, &saddr);
}
goto out;
}
@@ -113,9 +115,10 @@ out:
static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
struct fib6_rule *r = (struct fib6_rule *) rule;
+ struct flowi6 *fl6 = &fl->u.ip6;
if (r->dst.plen &&
- !ipv6_prefix_equal(&fl->fl6_dst, &r->dst.addr, r->dst.plen))
+ !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen))
return 0;
/*
@@ -125,14 +128,14 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
*/
if (r->src.plen) {
if (flags & RT6_LOOKUP_F_HAS_SADDR) {
- if (!ipv6_prefix_equal(&fl->fl6_src, &r->src.addr,
+ if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr,
r->src.plen))
return 0;
} else if (!(r->common.flags & FIB_RULE_FIND_SADDR))
return 0;
}
- if (r->tclass && r->tclass != ((ntohl(fl->fl6_flowlabel) >> 20) & 0xff))
+ if (r->tclass && r->tclass != ((ntohl(fl6->flowlabel) >> 20) & 0xff))
return 0;
return 1;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 03e62f94ff8..83cb4f9add8 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -157,32 +157,32 @@ static int is_ineligible(struct sk_buff *skb)
/*
* Check the ICMP output rate limit
*/
-static inline int icmpv6_xrlim_allow(struct sock *sk, u8 type,
- struct flowi *fl)
+static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
+ struct flowi6 *fl6)
{
struct dst_entry *dst;
struct net *net = sock_net(sk);
- int res = 0;
+ bool res = false;
/* Informational messages are not limited. */
if (type & ICMPV6_INFOMSG_MASK)
- return 1;
+ return true;
/* Do not limit pmtu discovery, it would break it. */
if (type == ICMPV6_PKT_TOOBIG)
- return 1;
+ return true;
/*
* Look up the output route.
* XXX: perhaps the expire for routing entries cloned by
* this lookup should be more aggressive (not longer than timeout).
*/
- dst = ip6_route_output(net, sk, fl);
+ dst = ip6_route_output(net, sk, fl6);
if (dst->error) {
IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_OUTNOROUTES);
} else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
- res = 1;
+ res = true;
} else {
struct rt6_info *rt = (struct rt6_info *)dst;
int tmo = net->ipv6.sysctl.icmpv6_time;
@@ -191,7 +191,9 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, u8 type,
if (rt->rt6i_dst.plen < 128)
tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
- res = xrlim_allow(dst, tmo);
+ if (!rt->rt6i_peer)
+ rt6_bind_peer(rt, 1);
+ res = inet_peer_xrlim_allow(rt->rt6i_peer, tmo);
}
dst_release(dst);
return res;
@@ -215,7 +217,7 @@ static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset)
return (*op & 0xC0) == 0x80;
}
-static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct icmp6hdr *thdr, int len)
+static int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len)
{
struct sk_buff *skb;
struct icmp6hdr *icmp6h;
@@ -231,9 +233,9 @@ static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct
if (skb_queue_len(&sk->sk_write_queue) == 1) {
skb->csum = csum_partial(icmp6h,
sizeof(struct icmp6hdr), skb->csum);
- icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src,
- &fl->fl6_dst,
- len, fl->proto,
+ icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr,
+ &fl6->daddr,
+ len, fl6->flowi6_proto,
skb->csum);
} else {
__wsum tmp_csum = 0;
@@ -244,9 +246,9 @@ static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct
tmp_csum = csum_partial(icmp6h,
sizeof(struct icmp6hdr), tmp_csum);
- icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src,
- &fl->fl6_dst,
- len, fl->proto,
+ icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr,
+ &fl6->daddr,
+ len, fl6->flowi6_proto,
tmp_csum);
}
ip6_push_pending_frames(sk);
@@ -298,6 +300,68 @@ static void mip6_addr_swap(struct sk_buff *skb)
static inline void mip6_addr_swap(struct sk_buff *skb) {}
#endif
+static struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *skb,
+ struct sock *sk, struct flowi6 *fl6)
+{
+ struct dst_entry *dst, *dst2;
+ struct flowi6 fl2;
+ int err;
+
+ err = ip6_dst_lookup(sk, &dst, fl6);
+ if (err)
+ return ERR_PTR(err);
+
+ /*
+ * We won't send icmp if the destination is known
+ * anycast.
+ */
+ if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) {
+ LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n");
+ dst_release(dst);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* No need to clone since we're just using its address. */
+ dst2 = dst;
+
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0);
+ if (!IS_ERR(dst)) {
+ if (dst != dst2)
+ return dst;
+ } else {
+ if (PTR_ERR(dst) == -EPERM)
+ dst = NULL;
+ else
+ return dst;
+ }
+
+ err = xfrm_decode_session_reverse(skb, flowi6_to_flowi(&fl2), AF_INET6);
+ if (err)
+ goto relookup_failed;
+
+ err = ip6_dst_lookup(sk, &dst2, &fl2);
+ if (err)
+ goto relookup_failed;
+
+ dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP);
+ if (!IS_ERR(dst2)) {
+ dst_release(dst);
+ dst = dst2;
+ } else {
+ err = PTR_ERR(dst2);
+ if (err == -EPERM) {
+ dst_release(dst);
+ return dst2;
+ } else
+ goto relookup_failed;
+ }
+
+relookup_failed:
+ if (dst)
+ return dst;
+ return ERR_PTR(err);
+}
+
/*
* Send an ICMP message in response to a packet in error
*/
@@ -310,10 +374,8 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
struct ipv6_pinfo *np;
struct in6_addr *saddr = NULL;
struct dst_entry *dst;
- struct dst_entry *dst2;
struct icmp6hdr tmp_hdr;
- struct flowi fl;
- struct flowi fl2;
+ struct flowi6 fl6;
struct icmpv6_msg msg;
int iif = 0;
int addr_type = 0;
@@ -380,22 +442,22 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
mip6_addr_swap(skb);
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_ICMPV6;
- ipv6_addr_copy(&fl.fl6_dst, &hdr->saddr);
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_ICMPV6;
+ ipv6_addr_copy(&fl6.daddr, &hdr->saddr);
if (saddr)
- ipv6_addr_copy(&fl.fl6_src, saddr);
- fl.oif = iif;
- fl.fl_icmp_type = type;
- fl.fl_icmp_code = code;
- security_skb_classify_flow(skb, &fl);
+ ipv6_addr_copy(&fl6.saddr, saddr);
+ fl6.flowi6_oif = iif;
+ fl6.fl6_icmp_type = type;
+ fl6.fl6_icmp_code = code;
+ security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
if (sk == NULL)
return;
np = inet6_sk(sk);
- if (!icmpv6_xrlim_allow(sk, type, &fl))
+ if (!icmpv6_xrlim_allow(sk, type, &fl6))
goto out;
tmp_hdr.icmp6_type = type;
@@ -403,61 +465,14 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
tmp_hdr.icmp6_cksum = 0;
tmp_hdr.icmp6_pointer = htonl(info);
- if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
- fl.oif = np->mcast_oif;
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = np->mcast_oif;
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
+ dst = icmpv6_route_lookup(net, skb, sk, &fl6);
+ if (IS_ERR(dst))
goto out;
- /*
- * We won't send icmp if the destination is known
- * anycast.
- */
- if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) {
- LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n");
- goto out_dst_release;
- }
-
- /* No need to clone since we're just using its address. */
- dst2 = dst;
-
- err = xfrm_lookup(net, &dst, &fl, sk, 0);
- switch (err) {
- case 0:
- if (dst != dst2)
- goto route_done;
- break;
- case -EPERM:
- dst = NULL;
- break;
- default:
- goto out;
- }
-
- if (xfrm_decode_session_reverse(skb, &fl2, AF_INET6))
- goto relookup_failed;
-
- if (ip6_dst_lookup(sk, &dst2, &fl2))
- goto relookup_failed;
-
- err = xfrm_lookup(net, &dst2, &fl2, sk, XFRM_LOOKUP_ICMP);
- switch (err) {
- case 0:
- dst_release(dst);
- dst = dst2;
- break;
- case -EPERM:
- goto out_dst_release;
- default:
-relookup_failed:
- if (!dst)
- goto out;
- break;
- }
-
-route_done:
- if (ipv6_addr_is_multicast(&fl.fl6_dst))
+ if (ipv6_addr_is_multicast(&fl6.daddr))
hlimit = np->mcast_hops;
else
hlimit = np->hop_limit;
@@ -480,14 +495,14 @@ route_done:
err = ip6_append_data(sk, icmpv6_getfrag, &msg,
len + sizeof(struct icmp6hdr),
sizeof(struct icmp6hdr), hlimit,
- np->tclass, NULL, &fl, (struct rt6_info*)dst,
+ np->tclass, NULL, &fl6, (struct rt6_info*)dst,
MSG_DONTWAIT, np->dontfrag);
if (err) {
ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS);
ip6_flush_pending_frames(sk);
goto out_put;
}
- err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, len + sizeof(struct icmp6hdr));
+ err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, len + sizeof(struct icmp6hdr));
out_put:
if (likely(idev != NULL))
@@ -509,7 +524,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
struct in6_addr *saddr = NULL;
struct icmp6hdr *icmph = icmp6_hdr(skb);
struct icmp6hdr tmp_hdr;
- struct flowi fl;
+ struct flowi6 fl6;
struct icmpv6_msg msg;
struct dst_entry *dst;
int err = 0;
@@ -523,30 +538,31 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));
tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY;
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_ICMPV6;
- ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr);
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_ICMPV6;
+ ipv6_addr_copy(&fl6.daddr, &ipv6_hdr(skb)->saddr);
if (saddr)
- ipv6_addr_copy(&fl.fl6_src, saddr);
- fl.oif = skb->dev->ifindex;
- fl.fl_icmp_type = ICMPV6_ECHO_REPLY;
- security_skb_classify_flow(skb, &fl);
+ ipv6_addr_copy(&fl6.saddr, saddr);
+ fl6.flowi6_oif = skb->dev->ifindex;
+ fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
+ security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
if (sk == NULL)
return;
np = inet6_sk(sk);
- if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
- fl.oif = np->mcast_oif;
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = np->mcast_oif;
- err = ip6_dst_lookup(sk, &dst, &fl);
+ err = ip6_dst_lookup(sk, &dst, &fl6);
if (err)
goto out;
- if ((err = xfrm_lookup(net, &dst, &fl, sk, 0)) < 0)
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
+ if (IS_ERR(dst))
goto out;
- if (ipv6_addr_is_multicast(&fl.fl6_dst))
+ if (ipv6_addr_is_multicast(&fl6.daddr))
hlimit = np->mcast_hops;
else
hlimit = np->hop_limit;
@@ -560,7 +576,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
msg.type = ICMPV6_ECHO_REPLY;
err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr),
- sizeof(struct icmp6hdr), hlimit, np->tclass, NULL, &fl,
+ sizeof(struct icmp6hdr), hlimit, np->tclass, NULL, &fl6,
(struct rt6_info*)dst, MSG_DONTWAIT,
np->dontfrag);
@@ -569,7 +585,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
ip6_flush_pending_frames(sk);
goto out_put;
}
- err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, skb->len + sizeof(struct icmp6hdr));
+ err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr));
out_put:
if (likely(idev != NULL))
@@ -768,20 +784,20 @@ drop_no_count:
return 0;
}
-void icmpv6_flow_init(struct sock *sk, struct flowi *fl,
+void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
u8 type,
const struct in6_addr *saddr,
const struct in6_addr *daddr,
int oif)
{
- memset(fl, 0, sizeof(*fl));
- ipv6_addr_copy(&fl->fl6_src, saddr);
- ipv6_addr_copy(&fl->fl6_dst, daddr);
- fl->proto = IPPROTO_ICMPV6;
- fl->fl_icmp_type = type;
- fl->fl_icmp_code = 0;
- fl->oif = oif;
- security_sk_classify_flow(sk, fl);
+ memset(fl6, 0, sizeof(*fl6));
+ ipv6_addr_copy(&fl6->saddr, saddr);
+ ipv6_addr_copy(&fl6->daddr, daddr);
+ fl6->flowi6_proto = IPPROTO_ICMPV6;
+ fl6->fl6_icmp_type = type;
+ fl6->fl6_icmp_code = 0;
+ fl6->flowi6_oif = oif;
+ security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
}
/*
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index d144e629d2b..16605465046 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -61,26 +61,21 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *final_p, final;
struct dst_entry *dst;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
- final_p = fl6_update_dst(&fl, np->opt, &final);
- ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
- fl.oif = sk->sk_bound_dev_if;
- fl.mark = sk->sk_mark;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_rsk(req)->loc_port;
- security_req_classify_flow(req, &fl);
-
- if (ip6_dst_lookup(sk, &dst, &fl))
- return NULL;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0)
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr);
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+ ipv6_addr_copy(&fl6.saddr, &treq->loc_addr);
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = inet_rsk(req)->rmt_port;
+ fl6.fl6_sport = inet_rsk(req)->loc_port;
+ security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+ if (IS_ERR(dst))
return NULL;
return dst;
@@ -213,42 +208,34 @@ int inet6_csk_xmit(struct sk_buff *skb)
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
- struct flowi fl;
+ struct flowi6 fl6;
struct dst_entry *dst;
struct in6_addr *final_p, final;
- memset(&fl, 0, sizeof(fl));
- fl.proto = sk->sk_protocol;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.fl6_flowlabel = np->flow_label;
- IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
- fl.oif = sk->sk_bound_dev_if;
- fl.mark = sk->sk_mark;
- fl.fl_ip_sport = inet->inet_sport;
- fl.fl_ip_dport = inet->inet_dport;
- security_sk_classify_flow(sk, &fl);
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = sk->sk_protocol;
+ ipv6_addr_copy(&fl6.daddr, &np->daddr);
+ ipv6_addr_copy(&fl6.saddr, &np->saddr);
+ fl6.flowlabel = np->flow_label;
+ IP6_ECN_flow_xmit(sk, fl6.flowlabel);
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_sport = inet->inet_sport;
+ fl6.fl6_dport = inet->inet_dport;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- final_p = fl6_update_dst(&fl, np->opt, &final);
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
dst = __inet6_csk_dst_check(sk, np->dst_cookie);
if (dst == NULL) {
- int err = ip6_dst_lookup(sk, &dst, &fl);
-
- if (err) {
- sk->sk_err_soft = -err;
- kfree_skb(skb);
- return err;
- }
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
- if ((err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) {
+ if (IS_ERR(dst)) {
+ sk->sk_err_soft = -PTR_ERR(dst);
sk->sk_route_caps = 0;
kfree_skb(skb);
- return err;
+ return PTR_ERR(dst);
}
__inet6_csk_dst_store(sk, dst, NULL, NULL);
@@ -257,9 +244,9 @@ int inet6_csk_xmit(struct sk_buff *skb)
skb_dst_set(skb, dst_clone(dst));
/* Restore final destination back after routing done */
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+ ipv6_addr_copy(&fl6.daddr, &np->daddr);
- return ip6_xmit(sk, skb, &fl, np->opt);
+ return ip6_xmit(sk, skb, &fl6, np->opt);
}
EXPORT_SYMBOL_GPL(inet6_csk_xmit);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index de382114609..7548905e79e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -260,10 +260,10 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id)
return net->ipv6.fib6_main_tbl;
}
-struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl,
+struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
int flags, pol_lookup_t lookup)
{
- return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl, flags);
+ return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
}
static void __net_init fib6_tables_init(struct net *net)
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 13654686aea..f3caf1b8d57 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -342,7 +342,7 @@ fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval,
if (olen > 0) {
struct msghdr msg;
- struct flowi flowi;
+ struct flowi6 flowi6;
int junk;
err = -ENOMEM;
@@ -358,9 +358,9 @@ fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval,
msg.msg_controllen = olen;
msg.msg_control = (void*)(fl->opt+1);
- flowi.oif = 0;
+ memset(&flowi6, 0, sizeof(flowi6));
- err = datagram_send_ctl(net, &msg, &flowi, fl->opt, &junk,
+ err = datagram_send_ctl(net, &msg, &flowi6, fl->opt, &junk,
&junk, &junk);
if (err)
goto done;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5f8d242be3f..18208876aa8 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -174,15 +174,15 @@ int ip6_output(struct sk_buff *skb)
* xmit an sk_buff (used by TCP, SCTP and DCCP)
*/
-int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
struct ipv6_txoptions *opt)
{
struct net *net = sock_net(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
- struct in6_addr *first_hop = &fl->fl6_dst;
+ struct in6_addr *first_hop = &fl6->daddr;
struct dst_entry *dst = skb_dst(skb);
struct ipv6hdr *hdr;
- u8 proto = fl->proto;
+ u8 proto = fl6->flowi6_proto;
int seg_len = skb->len;
int hlimit = -1;
int tclass = 0;
@@ -230,13 +230,13 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
if (hlimit < 0)
hlimit = ip6_dst_hoplimit(dst);
- *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
+ *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
hdr->payload_len = htons(seg_len);
hdr->nexthdr = proto;
hdr->hop_limit = hlimit;
- ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
+ ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
ipv6_addr_copy(&hdr->daddr, first_hop);
skb->priority = sk->sk_priority;
@@ -274,13 +274,10 @@ int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct ipv6hdr *hdr;
- int totlen;
skb->protocol = htons(ETH_P_IPV6);
skb->dev = dev;
- totlen = len + sizeof(struct ipv6hdr);
-
skb_reset_network_header(skb);
skb_put(skb, sizeof(struct ipv6hdr));
hdr = ipv6_hdr(skb);
@@ -479,10 +476,13 @@ int ip6_forward(struct sk_buff *skb)
else
target = &hdr->daddr;
+ if (!rt->rt6i_peer)
+ rt6_bind_peer(rt, 1);
+
/* Limit redirects both by destination (here)
and by source (inside ndisc_send_redirect)
*/
- if (xrlim_allow(dst, 1*HZ))
+ if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
ndisc_send_redirect(skb, n, target);
} else {
int addrtype = ipv6_addr_type(&hdr->saddr);
@@ -879,7 +879,7 @@ static inline int ip6_rt_check(struct rt6key *rt_key,
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
struct dst_entry *dst,
- struct flowi *fl)
+ struct flowi6 *fl6)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct rt6_info *rt = (struct rt6_info *)dst;
@@ -904,11 +904,11 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
* sockets.
* 2. oif also should be the same.
*/
- if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
+ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
- ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
+ ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
- (fl->oif && fl->oif != dst->dev->ifindex)) {
+ (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
dst_release(dst);
dst = NULL;
}
@@ -918,22 +918,22 @@ out:
}
static int ip6_dst_lookup_tail(struct sock *sk,
- struct dst_entry **dst, struct flowi *fl)
+ struct dst_entry **dst, struct flowi6 *fl6)
{
int err;
struct net *net = sock_net(sk);
if (*dst == NULL)
- *dst = ip6_route_output(net, sk, fl);
+ *dst = ip6_route_output(net, sk, fl6);
if ((err = (*dst)->error))
goto out_err_release;
- if (ipv6_addr_any(&fl->fl6_src)) {
+ if (ipv6_addr_any(&fl6->saddr)) {
err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
- &fl->fl6_dst,
+ &fl6->daddr,
sk ? inet6_sk(sk)->srcprefs : 0,
- &fl->fl6_src);
+ &fl6->saddr);
if (err)
goto out_err_release;
}
@@ -949,10 +949,10 @@ static int ip6_dst_lookup_tail(struct sock *sk,
*/
if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
struct inet6_ifaddr *ifp;
- struct flowi fl_gw;
+ struct flowi6 fl_gw6;
int redirect;
- ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
+ ifp = ipv6_get_ifaddr(net, &fl6->saddr,
(*dst)->dev, 1);
redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
@@ -965,9 +965,9 @@ static int ip6_dst_lookup_tail(struct sock *sk,
* default router instead
*/
dst_release(*dst);
- memcpy(&fl_gw, fl, sizeof(struct flowi));
- memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
- *dst = ip6_route_output(net, sk, &fl_gw);
+ memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
+ memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
+ *dst = ip6_route_output(net, sk, &fl_gw6);
if ((err = (*dst)->error))
goto out_err_release;
}
@@ -988,43 +988,85 @@ out_err_release:
* ip6_dst_lookup - perform route lookup on flow
* @sk: socket which provides route info
* @dst: pointer to dst_entry * for result
- * @fl: flow to lookup
+ * @fl6: flow to lookup
*
* This function performs a route lookup on the given flow.
*
* It returns zero on success, or a standard errno code on error.
*/
-int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
+int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
*dst = NULL;
- return ip6_dst_lookup_tail(sk, dst, fl);
+ return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/**
- * ip6_sk_dst_lookup - perform socket cached route lookup on flow
+ * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
+ * @sk: socket which provides route info
+ * @fl6: flow to lookup
+ * @final_dst: final destination address for ipsec lookup
+ * @can_sleep: we are in a sleepable context
+ *
+ * This function performs a route lookup on the given flow.
+ *
+ * It returns a valid dst pointer on success, or a pointer encoded
+ * error code.
+ */
+struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
+ const struct in6_addr *final_dst,
+ bool can_sleep)
+{
+ struct dst_entry *dst = NULL;
+ int err;
+
+ err = ip6_dst_lookup_tail(sk, &dst, fl6);
+ if (err)
+ return ERR_PTR(err);
+ if (final_dst)
+ ipv6_addr_copy(&fl6->daddr, final_dst);
+ if (can_sleep)
+ fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
+
+ return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
+
+/**
+ * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
* @sk: socket which provides the dst cache and route info
- * @dst: pointer to dst_entry * for result
- * @fl: flow to lookup
+ * @fl6: flow to lookup
+ * @final_dst: final destination address for ipsec lookup
+ * @can_sleep: we are in a sleepable context
*
* This function performs a route lookup on the given flow with the
* possibility of using the cached route in the socket if it is valid.
* It will take the socket dst lock when operating on the dst cache.
* As a result, this function can only be used in process context.
*
- * It returns zero on success, or a standard errno code on error.
+ * It returns a valid dst pointer on success, or a pointer encoded
+ * error code.
*/
-int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
+struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
+ const struct in6_addr *final_dst,
+ bool can_sleep)
{
- *dst = NULL;
- if (sk) {
- *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
- *dst = ip6_sk_dst_check(sk, *dst, fl);
- }
+ struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
+ int err;
+
+ dst = ip6_sk_dst_check(sk, dst, fl6);
+
+ err = ip6_dst_lookup_tail(sk, &dst, fl6);
+ if (err)
+ return ERR_PTR(err);
+ if (final_dst)
+ ipv6_addr_copy(&fl6->daddr, final_dst);
+ if (can_sleep)
+ fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
- return ip6_dst_lookup_tail(sk, dst, fl);
+ return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
-EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
+EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
static inline int ip6_ufo_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
@@ -1061,7 +1103,6 @@ static inline int ip6_ufo_append_data(struct sock *sk,
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
- sk->sk_sndmsg_off = 0;
}
err = skb_append_datato_frags(sk,skb, getfrag, from,
@@ -1104,7 +1145,7 @@ static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
int offset, int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
- int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
+ int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags, int dontfrag)
{
struct inet_sock *inet = inet_sk(sk);
@@ -1118,6 +1159,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
int err;
int offset = 0;
int csummode = CHECKSUM_NONE;
+ __u8 tx_flags = 0;
if (flags&MSG_PROBE)
return 0;
@@ -1161,7 +1203,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
}
dst_hold(&rt->dst);
inet->cork.dst = &rt->dst;
- inet->cork.fl = *fl;
+ inet->cork.fl.u.ip6 = *fl6;
np->cork.hop_limit = hlimit;
np->cork.tclass = tclass;
mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
@@ -1182,7 +1224,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
transhdrlen += exthdrlen;
} else {
rt = (struct rt6_info *)inet->cork.dst;
- fl = &inet->cork.fl;
+ fl6 = &inet->cork.fl.u.ip6;
opt = np->cork.opt;
transhdrlen = 0;
exthdrlen = 0;
@@ -1197,11 +1239,18 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
- ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
+ ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
return -EMSGSIZE;
}
}
+ /* For UDP, check if TX timestamp is enabled */
+ if (sk->sk_type == SOCK_DGRAM) {
+ err = sock_tx_timestamp(sk, &tx_flags);
+ if (err)
+ goto error;
+ }
+
/*
* Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU.
@@ -1222,7 +1271,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
if (length > mtu) {
int proto = sk->sk_protocol;
if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
- ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
+ ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
return -EMSGSIZE;
}
@@ -1306,6 +1355,12 @@ alloc_new_skb:
sk->sk_allocation);
if (unlikely(skb == NULL))
err = -ENOBUFS;
+ else {
+ /* Only the initial fragment
+ * is time stamped.
+ */
+ tx_flags = 0;
+ }
}
if (skb == NULL)
goto error;
@@ -1317,6 +1372,9 @@ alloc_new_skb:
/* reserve for fragmentation */
skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
+ if (sk->sk_type == SOCK_DGRAM)
+ skb_shinfo(skb)->tx_flags = tx_flags;
+
/*
* Find where to start putting bytes
*/
@@ -1458,8 +1516,8 @@ int ip6_push_pending_frames(struct sock *sk)
struct ipv6hdr *hdr;
struct ipv6_txoptions *opt = np->cork.opt;
struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
- struct flowi *fl = &inet->cork.fl;
- unsigned char proto = fl->proto;
+ struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
+ unsigned char proto = fl6->flowi6_proto;
int err = 0;
if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
@@ -1484,7 +1542,7 @@ int ip6_push_pending_frames(struct sock *sk)
if (np->pmtudisc < IPV6_PMTUDISC_DO)
skb->local_df = 1;
- ipv6_addr_copy(final_dst, &fl->fl6_dst);
+ ipv6_addr_copy(final_dst, &fl6->daddr);
__skb_pull(skb, skb_network_header_len(skb));
if (opt && opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
@@ -1495,12 +1553,12 @@ int ip6_push_pending_frames(struct sock *sk)
skb_reset_network_header(skb);
hdr = ipv6_hdr(skb);
- *(__be32*)hdr = fl->fl6_flowlabel |
+ *(__be32*)hdr = fl6->flowlabel |
htonl(0x60000000 | ((int)np->cork.tclass << 20));
hdr->hop_limit = np->cork.hop_limit;
hdr->nexthdr = proto;
- ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
+ ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
ipv6_addr_copy(&hdr->daddr, final_dst);
skb->priority = sk->sk_priority;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 4f4483e697b..c1b1bd312df 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -57,6 +57,7 @@
MODULE_AUTHOR("Ville Nuorvala");
MODULE_DESCRIPTION("IPv6 tunneling device");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETDEV("ip6tnl0");
#ifdef IP6_TNL_DEBUG
#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __func__)
@@ -535,7 +536,6 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
int err;
struct sk_buff *skb2;
struct iphdr *eiph;
- struct flowi fl;
struct rtable *rt;
err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code,
@@ -577,11 +577,11 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
eiph = ip_hdr(skb2);
/* Try to guess incoming interface */
- memset(&fl, 0, sizeof(fl));
- fl.fl4_dst = eiph->saddr;
- fl.fl4_tos = RT_TOS(eiph->tos);
- fl.proto = IPPROTO_IPIP;
- if (ip_route_output_key(dev_net(skb->dev), &rt, &fl))
+ rt = ip_route_output_ports(dev_net(skb->dev), NULL,
+ eiph->saddr, 0,
+ 0, 0,
+ IPPROTO_IPIP, RT_TOS(eiph->tos), 0);
+ if (IS_ERR(rt))
goto out;
skb2->dev = rt->dst.dev;
@@ -590,15 +590,18 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (rt->rt_flags & RTCF_LOCAL) {
ip_rt_put(rt);
rt = NULL;
- fl.fl4_dst = eiph->daddr;
- fl.fl4_src = eiph->saddr;
- fl.fl4_tos = eiph->tos;
- if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) ||
+ rt = ip_route_output_ports(dev_net(skb->dev), NULL,
+ eiph->daddr, eiph->saddr,
+ 0, 0,
+ IPPROTO_IPIP,
+ RT_TOS(eiph->tos), 0);
+ if (IS_ERR(rt) ||
rt->dst.dev->type != ARPHRD_TUNNEL) {
- ip_rt_put(rt);
+ if (!IS_ERR(rt))
+ ip_rt_put(rt);
goto out;
}
- skb_dst_set(skb2, (struct dst_entry *)rt);
+ skb_dst_set(skb2, &rt->dst);
} else {
ip_rt_put(rt);
if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos,
@@ -881,7 +884,7 @@ static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t)
static int ip6_tnl_xmit2(struct sk_buff *skb,
struct net_device *dev,
__u8 dsfield,
- struct flowi *fl,
+ struct flowi6 *fl6,
int encap_limit,
__u32 *pmtu)
{
@@ -901,10 +904,16 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
if ((dst = ip6_tnl_dst_check(t)) != NULL)
dst_hold(dst);
else {
- dst = ip6_route_output(net, NULL, fl);
+ dst = ip6_route_output(net, NULL, fl6);
- if (dst->error || xfrm_lookup(net, &dst, fl, NULL, 0) < 0)
+ if (dst->error)
goto tx_err_link_failure;
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
}
tdev = dst->dev;
@@ -954,7 +963,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
skb->transport_header = skb->network_header;
- proto = fl->proto;
+ proto = fl6->flowi6_proto;
if (encap_limit >= 0) {
init_tel_txopt(&opt, encap_limit);
ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
@@ -962,13 +971,13 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
ipv6h = ipv6_hdr(skb);
- *(__be32*)ipv6h = fl->fl6_flowlabel | htonl(0x60000000);
+ *(__be32*)ipv6h = fl6->flowlabel | htonl(0x60000000);
dsfield = INET_ECN_encapsulate(0, dsfield);
ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield);
ipv6h->hop_limit = t->parms.hop_limit;
ipv6h->nexthdr = proto;
- ipv6_addr_copy(&ipv6h->saddr, &fl->fl6_src);
- ipv6_addr_copy(&ipv6h->daddr, &fl->fl6_dst);
+ ipv6_addr_copy(&ipv6h->saddr, &fl6->saddr);
+ ipv6_addr_copy(&ipv6h->daddr, &fl6->daddr);
nf_reset(skb);
pkt_len = skb->len;
err = ip6_local_out(skb);
@@ -998,7 +1007,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
struct ip6_tnl *t = netdev_priv(dev);
struct iphdr *iph = ip_hdr(skb);
int encap_limit = -1;
- struct flowi fl;
+ struct flowi6 fl6;
__u8 dsfield;
__u32 mtu;
int err;
@@ -1010,16 +1019,16 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
- memcpy(&fl, &t->fl, sizeof (fl));
- fl.proto = IPPROTO_IPIP;
+ memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6));
+ fl6.flowi6_proto = IPPROTO_IPIP;
dsfield = ipv4_get_dsfield(iph);
if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS))
- fl.fl6_flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
+ fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
& IPV6_TCLASS_MASK;
- err = ip6_tnl_xmit2(skb, dev, dsfield, &fl, encap_limit, &mtu);
+ err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
if (err != 0) {
/* XXX: send ICMP error even if DF is not set. */
if (err == -EMSGSIZE)
@@ -1038,7 +1047,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
int encap_limit = -1;
__u16 offset;
- struct flowi fl;
+ struct flowi6 fl6;
__u8 dsfield;
__u32 mtu;
int err;
@@ -1060,16 +1069,16 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
- memcpy(&fl, &t->fl, sizeof (fl));
- fl.proto = IPPROTO_IPV6;
+ memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6));
+ fl6.flowi6_proto = IPPROTO_IPV6;
dsfield = ipv6_get_dsfield(ipv6h);
if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS))
- fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
+ fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL))
- fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK);
+ fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK);
- err = ip6_tnl_xmit2(skb, dev, dsfield, &fl, encap_limit, &mtu);
+ err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
if (err != 0) {
if (err == -EMSGSIZE)
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
@@ -1132,21 +1141,21 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
{
struct net_device *dev = t->dev;
struct ip6_tnl_parm *p = &t->parms;
- struct flowi *fl = &t->fl;
+ struct flowi6 *fl6 = &t->fl.u.ip6;
memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
/* Set up flowi template */
- ipv6_addr_copy(&fl->fl6_src, &p->laddr);
- ipv6_addr_copy(&fl->fl6_dst, &p->raddr);
- fl->oif = p->link;
- fl->fl6_flowlabel = 0;
+ ipv6_addr_copy(&fl6->saddr, &p->laddr);
+ ipv6_addr_copy(&fl6->daddr, &p->raddr);
+ fl6->flowi6_oif = p->link;
+ fl6->flowlabel = 0;
if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
- fl->fl6_flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
+ fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL))
- fl->fl6_flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
+ fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
ip6_tnl_set_cap(t);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 9fab274019c..7ff0343e05c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -34,6 +34,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/compat.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
@@ -134,14 +135,15 @@ static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
return NULL;
}
-static int ip6mr_fib_lookup(struct net *net, struct flowi *flp,
+static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
struct mr6_table **mrt)
{
struct ip6mr_result res;
struct fib_lookup_arg arg = { .result = &res, };
int err;
- err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flp, 0, &arg);
+ err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
+ flowi6_to_flowi(flp6), 0, &arg);
if (err < 0)
return err;
*mrt = res.mrt;
@@ -269,7 +271,7 @@ static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
return net->ipv6.mrt6;
}
-static int ip6mr_fib_lookup(struct net *net, struct flowi *flp,
+static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
struct mr6_table **mrt)
{
*mrt = net->ipv6.mrt6;
@@ -616,9 +618,9 @@ static int pim6_rcv(struct sk_buff *skb)
struct net_device *reg_dev = NULL;
struct net *net = dev_net(skb->dev);
struct mr6_table *mrt;
- struct flowi fl = {
- .iif = skb->dev->ifindex,
- .mark = skb->mark,
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->dev->ifindex,
+ .flowi6_mark = skb->mark,
};
int reg_vif_num;
@@ -643,7 +645,7 @@ static int pim6_rcv(struct sk_buff *skb)
ntohs(encap->payload_len) + sizeof(*pim) > skb->len)
goto drop;
- if (ip6mr_fib_lookup(net, &fl, &mrt) < 0)
+ if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
goto drop;
reg_vif_num = mrt->mroute_reg_vif_num;
@@ -686,14 +688,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
{
struct net *net = dev_net(dev);
struct mr6_table *mrt;
- struct flowi fl = {
- .oif = dev->ifindex,
- .iif = skb->skb_iif,
- .mark = skb->mark,
+ struct flowi6 fl6 = {
+ .flowi6_oif = dev->ifindex,
+ .flowi6_iif = skb->skb_iif,
+ .flowi6_mark = skb->mark,
};
int err;
- err = ip6mr_fib_lookup(net, &fl, &mrt);
+ err = ip6mr_fib_lookup(net, &fl6, &mrt);
if (err < 0)
return err;
@@ -1038,7 +1040,6 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt,
while((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
if (ipv6_hdr(skb)->version == 0) {
- int err;
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr));
if (__ip6mr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
@@ -1049,7 +1050,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt,
skb_trim(skb, nlh->nlmsg_len);
((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -EMSGSIZE;
}
- err = rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
+ rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
} else
ip6_mr_forward(net, mrt, skb, c);
}
@@ -1547,13 +1548,13 @@ int ip6mr_sk_done(struct sock *sk)
struct sock *mroute6_socket(struct net *net, struct sk_buff *skb)
{
struct mr6_table *mrt;
- struct flowi fl = {
- .iif = skb->skb_iif,
- .oif = skb->dev->ifindex,
- .mark = skb->mark,
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->skb_iif,
+ .flowi6_oif = skb->dev->ifindex,
+ .flowi6_mark = skb->mark,
};
- if (ip6mr_fib_lookup(net, &fl, &mrt) < 0)
+ if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
return NULL;
return mrt->mroute6_sk;
@@ -1804,6 +1805,80 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
}
}
+#ifdef CONFIG_COMPAT
+struct compat_sioc_sg_req6 {
+ struct sockaddr_in6 src;
+ struct sockaddr_in6 grp;
+ compat_ulong_t pktcnt;
+ compat_ulong_t bytecnt;
+ compat_ulong_t wrong_if;
+};
+
+struct compat_sioc_mif_req6 {
+ mifi_t mifi;
+ compat_ulong_t icount;
+ compat_ulong_t ocount;
+ compat_ulong_t ibytes;
+ compat_ulong_t obytes;
+};
+
+int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ struct compat_sioc_sg_req6 sr;
+ struct compat_sioc_mif_req6 vr;
+ struct mif_device *vif;
+ struct mfc6_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr6_table *mrt;
+
+ mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+ if (mrt == NULL)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETMIFCNT_IN6:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.mifi >= mrt->maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif = &mrt->vif6_table[vr.mifi];
+ if (MIF_EXISTS(mrt, vr.mifi)) {
+ vr.icount = vif->pkt_in;
+ vr.ocount = vif->pkt_out;
+ vr.ibytes = vif->bytes_in;
+ vr.obytes = vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT_IN6:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ read_lock(&mrt_lock);
+ c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
static inline int ip6mr_forward2_finish(struct sk_buff *skb)
{
@@ -1823,7 +1898,7 @@ static int ip6mr_forward2(struct net *net, struct mr6_table *mrt,
struct mif_device *vif = &mrt->vif6_table[vifi];
struct net_device *dev;
struct dst_entry *dst;
- struct flowi fl;
+ struct flowi6 fl6;
if (vif->dev == NULL)
goto out_free;
@@ -1841,12 +1916,12 @@ static int ip6mr_forward2(struct net *net, struct mr6_table *mrt,
ipv6h = ipv6_hdr(skb);
- fl = (struct flowi) {
- .oif = vif->link,
- .fl6_dst = ipv6h->daddr,
+ fl6 = (struct flowi6) {
+ .flowi6_oif = vif->link,
+ .daddr = ipv6h->daddr,
};
- dst = ip6_route_output(net, NULL, &fl);
+ dst = ip6_route_output(net, NULL, &fl6);
if (!dst)
goto out_free;
@@ -1969,13 +2044,13 @@ int ip6_mr_input(struct sk_buff *skb)
struct mfc6_cache *cache;
struct net *net = dev_net(skb->dev);
struct mr6_table *mrt;
- struct flowi fl = {
- .iif = skb->dev->ifindex,
- .mark = skb->mark,
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->dev->ifindex,
+ .flowi6_mark = skb->mark,
};
int err;
- err = ip6mr_fib_lookup(net, &fl, &mrt);
+ err = ip6mr_fib_lookup(net, &fl6, &mrt);
if (err < 0)
return err;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index d1770e061c0..9cb191ecaba 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -444,12 +444,12 @@ sticky_done:
{
struct ipv6_txoptions *opt = NULL;
struct msghdr msg;
- struct flowi fl;
+ struct flowi6 fl6;
int junk;
- fl.fl6_flowlabel = 0;
- fl.oif = sk->sk_bound_dev_if;
- fl.mark = sk->sk_mark;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
if (optlen == 0)
goto update;
@@ -475,7 +475,7 @@ sticky_done:
msg.msg_controllen = optlen;
msg.msg_control = (void*)(opt+1);
- retv = datagram_send_ctl(net, &msg, &fl, opt, &junk, &junk,
+ retv = datagram_send_ctl(net, &msg, &fl6, opt, &junk, &junk,
&junk);
if (retv)
goto done;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 49f986d626a..76b893771e6 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -319,7 +319,6 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
{
struct in6_addr *source, *group;
struct ipv6_mc_socklist *pmc;
- struct net_device *dev;
struct inet6_dev *idev;
struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *psl;
@@ -341,7 +340,6 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
rcu_read_unlock();
return -ENODEV;
}
- dev = idev->dev;
err = -EADDRNOTAVAIL;
@@ -455,7 +453,6 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
{
struct in6_addr *group;
struct ipv6_mc_socklist *pmc;
- struct net_device *dev;
struct inet6_dev *idev;
struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *newpsl, *psl;
@@ -478,7 +475,6 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
rcu_read_unlock();
return -ENODEV;
}
- dev = idev->dev;
err = 0;
@@ -549,7 +545,6 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
struct in6_addr *group;
struct ipv6_mc_socklist *pmc;
struct inet6_dev *idev;
- struct net_device *dev;
struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *psl;
struct net *net = sock_net(sk);
@@ -566,7 +561,6 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
rcu_read_unlock();
return -ENODEV;
}
- dev = idev->dev;
err = -EADDRNOTAVAIL;
/*
@@ -1402,7 +1396,7 @@ static void mld_sendpack(struct sk_buff *skb)
struct inet6_dev *idev;
struct net *net = dev_net(skb->dev);
int err;
- struct flowi fl;
+ struct flowi6 fl6;
struct dst_entry *dst;
rcu_read_lock();
@@ -1425,11 +1419,16 @@ static void mld_sendpack(struct sk_buff *skb)
goto err_out;
}
- icmpv6_flow_init(net->ipv6.igmp_sk, &fl, ICMPV6_MLD2_REPORT,
+ icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT,
&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
skb->dev->ifindex);
- err = xfrm_lookup(net, &dst, &fl, NULL, 0);
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ err = 0;
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ }
skb_dst_set(skb, dst);
if (err)
goto err_out;
@@ -1732,7 +1731,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
u8 ra[8] = { IPPROTO_ICMPV6, 0,
IPV6_TLV_ROUTERALERT, 2, 0, 0,
IPV6_TLV_PADN, 0 };
- struct flowi fl;
+ struct flowi6 fl6;
struct dst_entry *dst;
if (type == ICMPV6_MGM_REDUCTION)
@@ -1792,13 +1791,15 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
goto err_out;
}
- icmpv6_flow_init(sk, &fl, type,
+ icmpv6_flow_init(sk, &fl6, type,
&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
skb->dev->ifindex);
- err = xfrm_lookup(net, &dst, &fl, NULL, 0);
- if (err)
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
goto err_out;
+ }
skb_dst_set(skb, dst);
err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev,
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index d6e9599d070..9b210482fb0 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -203,18 +203,20 @@ static inline int mip6_report_rl_allow(struct timeval *stamp,
return allow;
}
-static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, struct flowi *fl)
+static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb,
+ const struct flowi *fl)
{
struct net *net = xs_net(x);
struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+ const struct flowi6 *fl6 = &fl->u.ip6;
struct ipv6_destopt_hao *hao = NULL;
struct xfrm_selector sel;
int offset;
struct timeval stamp;
int err = 0;
- if (unlikely(fl->proto == IPPROTO_MH &&
- fl->fl_mh_type <= IP6_MH_TYPE_MAX))
+ if (unlikely(fl6->flowi6_proto == IPPROTO_MH &&
+ fl6->fl6_mh_type <= IP6_MH_TYPE_MAX))
goto out;
if (likely(opt->dsthao)) {
@@ -239,14 +241,14 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, struct
sizeof(sel.saddr));
sel.prefixlen_s = 128;
sel.family = AF_INET6;
- sel.proto = fl->proto;
- sel.dport = xfrm_flowi_dport(fl);
+ sel.proto = fl6->flowi6_proto;
+ sel.dport = xfrm_flowi_dport(fl, &fl6->uli);
if (sel.dport)
sel.dport_mask = htons(~0);
- sel.sport = xfrm_flowi_sport(fl);
+ sel.sport = xfrm_flowi_sport(fl, &fl6->uli);
if (sel.sport)
sel.sport_mask = htons(~0);
- sel.ifindex = fl->oif;
+ sel.ifindex = fl6->flowi6_oif;
err = km_report(net, IPPROTO_DSTOPTS, &sel,
(hao ? (xfrm_address_t *)&hao->addr : NULL));
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 2342545a5ee..0e49c9db3c9 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -511,7 +511,7 @@ void ndisc_send_skb(struct sk_buff *skb,
const struct in6_addr *saddr,
struct icmp6hdr *icmp6h)
{
- struct flowi fl;
+ struct flowi6 fl6;
struct dst_entry *dst;
struct net *net = dev_net(dev);
struct sock *sk = net->ipv6.ndisc_sk;
@@ -521,7 +521,7 @@ void ndisc_send_skb(struct sk_buff *skb,
type = icmp6h->icmp6_type;
- icmpv6_flow_init(sk, &fl, type, saddr, daddr, dev->ifindex);
+ icmpv6_flow_init(sk, &fl6, type, saddr, daddr, dev->ifindex);
dst = icmp6_dst_alloc(dev, neigh, daddr);
if (!dst) {
@@ -529,8 +529,8 @@ void ndisc_send_skb(struct sk_buff *skb,
return;
}
- err = xfrm_lookup(net, &dst, &fl, NULL, 0);
- if (err < 0) {
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst)) {
kfree_skb(skb);
return;
}
@@ -1515,7 +1515,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
struct rt6_info *rt;
struct dst_entry *dst;
struct inet6_dev *idev;
- struct flowi fl;
+ struct flowi6 fl6;
u8 *opt;
int rd_len;
int err;
@@ -1535,15 +1535,15 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
return;
}
- icmpv6_flow_init(sk, &fl, NDISC_REDIRECT,
+ icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
&saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
- dst = ip6_route_output(net, NULL, &fl);
+ dst = ip6_route_output(net, NULL, &fl6);
if (dst == NULL)
return;
- err = xfrm_lookup(net, &dst, &fl, NULL, 0);
- if (err)
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst))
return;
rt = (struct rt6_info *) dst;
@@ -1553,7 +1553,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
"ICMPv6 Redirect: destination is not a neighbour.\n");
goto release;
}
- if (!xrlim_allow(dst, 1*HZ))
+ if (!rt->rt6i_peer)
+ rt6_bind_peer(rt, 1);
+ if (!inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ)) /* limiter denied: skip this redirect, as the removed !xrlim_allow() test did */
goto release;
if (dev->addr_len) {
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 35915e8617f..39aaca2b4fd 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -15,14 +15,14 @@ int ip6_route_me_harder(struct sk_buff *skb)
struct net *net = dev_net(skb_dst(skb)->dev);
struct ipv6hdr *iph = ipv6_hdr(skb);
struct dst_entry *dst;
- struct flowi fl = {
- .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
- .mark = skb->mark,
- .fl6_dst = iph->daddr,
- .fl6_src = iph->saddr,
+ struct flowi6 fl6 = {
+ .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
+ .flowi6_mark = skb->mark,
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
};
- dst = ip6_route_output(net, skb->sk, &fl);
+ dst = ip6_route_output(net, skb->sk, &fl6);
if (dst->error) {
IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
@@ -37,9 +37,10 @@ int ip6_route_me_harder(struct sk_buff *skb)
#ifdef CONFIG_XFRM
if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
- xfrm_decode_session(skb, &fl, AF_INET6) == 0) {
+ xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) {
skb_dst_set(skb, NULL);
- if (xfrm_lookup(net, &dst, &fl, skb->sk, 0))
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), skb->sk, 0);
+ if (IS_ERR(dst))
return -1;
skb_dst_set(skb, dst);
}
@@ -91,7 +92,7 @@ static int nf_ip6_reroute(struct sk_buff *skb,
static int nf_ip6_route(struct dst_entry **dst, struct flowi *fl)
{
- *dst = ip6_route_output(&init_net, NULL, fl);
+ *dst = ip6_route_output(&init_net, NULL, &fl->u.ip6);
return (*dst)->error;
}
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7d227c644f7..47b7b8df7fa 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1076,6 +1076,7 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(AF_INET6, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
if (ret != 0)
@@ -1679,6 +1680,7 @@ translate_compat_table(struct net *net,
duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(AF_INET6);
+ xt_compat_init_offsets(AF_INET6, number);
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter0, entry0, total_size) {
ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 09c88891a75..e6af8d72f26 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -410,7 +410,7 @@ fallback:
if (p != NULL) {
sb_add(m, "%02x", *p++);
for (i = 1; i < len; i++)
- sb_add(m, ":%02x", p[i]);
+ sb_add(m, ":%02x", *p++);
}
sb_add(m, " ");
@@ -452,8 +452,7 @@ ip6t_log_packet(u_int8_t pf,
in ? in->name : "",
out ? out->name : "");
- /* MAC logging for input path only. */
- if (in && !out)
+ if (in != NULL)
dump_mac_header(m, loginfo, skb);
dump_packet(m, loginfo, skb, skb_network_offset(skb), 1);
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index bf998feac14..28e74488a32 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -47,7 +47,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb)
struct ipv6hdr *ip6h;
struct dst_entry *dst = NULL;
u8 proto;
- struct flowi fl;
+ struct flowi6 fl6;
if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
(!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
@@ -89,19 +89,20 @@ static void send_reset(struct net *net, struct sk_buff *oldskb)
return;
}
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr);
- ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr);
- fl.fl_ip_sport = otcph.dest;
- fl.fl_ip_dport = otcph.source;
- security_skb_classify_flow(oldskb, &fl);
- dst = ip6_route_output(net, NULL, &fl);
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ ipv6_addr_copy(&fl6.saddr, &oip6h->daddr);
+ ipv6_addr_copy(&fl6.daddr, &oip6h->saddr);
+ fl6.fl6_sport = otcph.dest;
+ fl6.fl6_dport = otcph.source;
+ security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
+ dst = ip6_route_output(net, NULL, &fl6);
if (dst == NULL || dst->error) {
dst_release(dst);
return;
}
- if (xfrm_lookup(net, &dst, &fl, NULL, 0))
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst))
return;
hh_len = (dst->dev->hard_header_len + 15)&~15;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 79d43aa8fa8..08572726381 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -45,6 +45,7 @@
#include <linux/netfilter_ipv6.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
struct nf_ct_frag6_skb_cb
@@ -73,7 +74,7 @@ static struct inet_frags nf_frags;
static struct netns_frags nf_init_frags;
#ifdef CONFIG_SYSCTL
-struct ctl_table nf_ct_frag6_sysctl_table[] = {
+static struct ctl_table nf_ct_frag6_sysctl_table[] = {
{
.procname = "nf_conntrack_frag6_timeout",
.data = &nf_init_frags.timeout,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 86c39526ba5..4a1c3b46c56 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -31,6 +31,7 @@
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <linux/skbuff.h>
+#include <linux/compat.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -123,18 +124,18 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
}
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
-static int (*mh_filter)(struct sock *sock, struct sk_buff *skb);
+typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb);
-int rawv6_mh_filter_register(int (*filter)(struct sock *sock,
- struct sk_buff *skb))
+static mh_filter_t __rcu *mh_filter __read_mostly;
+
+int rawv6_mh_filter_register(mh_filter_t filter)
{
rcu_assign_pointer(mh_filter, filter);
return 0;
}
EXPORT_SYMBOL(rawv6_mh_filter_register);
-int rawv6_mh_filter_unregister(int (*filter)(struct sock *sock,
- struct sk_buff *skb))
+int rawv6_mh_filter_unregister(mh_filter_t filter)
{
rcu_assign_pointer(mh_filter, NULL);
synchronize_rcu();
@@ -192,10 +193,10 @@ static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
* policy is placed in rawv6_rcv() because it is
* required for each socket.
*/
- int (*filter)(struct sock *sock, struct sk_buff *skb);
+ mh_filter_t *filter;
filter = rcu_dereference(mh_filter);
- filtered = filter ? filter(sk, skb) : 0;
+ filtered = filter ? (*filter)(sk, skb) : 0;
break;
}
#endif
@@ -523,7 +524,7 @@ csum_copy_err:
goto out;
}
-static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
+static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
struct raw6_sock *rp)
{
struct sk_buff *skb;
@@ -585,11 +586,10 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
if (unlikely(csum))
tmp_csum = csum_sub(tmp_csum, csum_unfold(csum));
- csum = csum_ipv6_magic(&fl->fl6_src,
- &fl->fl6_dst,
- total_len, fl->proto, tmp_csum);
+ csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
+ total_len, fl6->flowi6_proto, tmp_csum);
- if (csum == 0 && fl->proto == IPPROTO_UDP)
+ if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP)
csum = CSUM_MANGLED_0;
if (skb_store_bits(skb, offset, &csum, 2))
@@ -602,7 +602,7 @@ out:
}
static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
- struct flowi *fl, struct dst_entry **dstp,
+ struct flowi6 *fl6, struct dst_entry **dstp,
unsigned int flags)
{
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -612,7 +612,7 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
struct rt6_info *rt = (struct rt6_info *)*dstp;
if (length > rt->dst.dev->mtu) {
- ipv6_local_error(sk, EMSGSIZE, fl, rt->dst.dev->mtu);
+ ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu);
return -EMSGSIZE;
}
if (flags&MSG_PROBE)
@@ -661,7 +661,7 @@ error:
return err;
}
-static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
+static int rawv6_probe_proto_opt(struct flowi6 *fl6, struct msghdr *msg)
{
struct iovec *iov;
u8 __user *type = NULL;
@@ -678,7 +678,7 @@ static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
if (!iov)
continue;
- switch (fl->proto) {
+ switch (fl6->flowi6_proto) {
case IPPROTO_ICMPV6:
/* check if one-byte field is readable or not. */
if (iov->iov_base && iov->iov_len < 1)
@@ -693,8 +693,8 @@ static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
code = iov->iov_base;
if (type && code) {
- if (get_user(fl->fl_icmp_type, type) ||
- get_user(fl->fl_icmp_code, code))
+ if (get_user(fl6->fl6_icmp_type, type) ||
+ get_user(fl6->fl6_icmp_code, code))
return -EFAULT;
probed = 1;
}
@@ -705,7 +705,7 @@ static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
/* check if type field is readable or not. */
if (iov->iov_len > 2 - len) {
u8 __user *p = iov->iov_base;
- if (get_user(fl->fl_mh_type, &p[2 - len]))
+ if (get_user(fl6->fl6_mh_type, &p[2 - len]))
return -EFAULT;
probed = 1;
} else
@@ -734,7 +734,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
struct ipv6_txoptions *opt = NULL;
struct ip6_flowlabel *flowlabel = NULL;
struct dst_entry *dst = NULL;
- struct flowi fl;
+ struct flowi6 fl6;
int addr_len = msg->msg_namelen;
int hlimit = -1;
int tclass = -1;
@@ -755,9 +755,9 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
/*
* Get and verify the address.
*/
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
- fl.mark = sk->sk_mark;
+ fl6.flowi6_mark = sk->sk_mark;
if (sin6) {
if (addr_len < SIN6_LEN_RFC2133)
@@ -779,9 +779,9 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
daddr = &sin6->sin6_addr;
if (np->sndflow) {
- fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) {
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+ fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
if (flowlabel == NULL)
return -EINVAL;
daddr = &flowlabel->dst;
@@ -799,32 +799,32 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
if (addr_len >= sizeof(struct sockaddr_in6) &&
sin6->sin6_scope_id &&
ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL)
- fl.oif = sin6->sin6_scope_id;
+ fl6.flowi6_oif = sin6->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
proto = inet->inet_num;
daddr = &np->daddr;
- fl.fl6_flowlabel = np->flow_label;
+ fl6.flowlabel = np->flow_label;
}
- if (fl.oif == 0)
- fl.oif = sk->sk_bound_dev_if;
+ if (fl6.flowi6_oif == 0)
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
opt = &opt_space;
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(struct ipv6_txoptions);
- err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit,
+ err = datagram_send_ctl(sock_net(sk), msg, &fl6, opt, &hlimit,
&tclass, &dontfrag);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
}
- if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+ if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
if (flowlabel == NULL)
return -EINVAL;
}
@@ -837,40 +837,31 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
- fl.proto = proto;
- err = rawv6_probe_proto_opt(&fl, msg);
+ fl6.flowi6_proto = proto;
+ err = rawv6_probe_proto_opt(&fl6, msg);
if (err)
goto out;
if (!ipv6_addr_any(daddr))
- ipv6_addr_copy(&fl.fl6_dst, daddr);
+ ipv6_addr_copy(&fl6.daddr, daddr);
else
- fl.fl6_dst.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
- if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr))
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+ fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
+ ipv6_addr_copy(&fl6.saddr, &np->saddr);
- final_p = fl6_update_dst(&fl, opt, &final);
+ final_p = fl6_update_dst(&fl6, opt, &final);
- if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
- fl.oif = np->mcast_oif;
- security_sk_classify_flow(sk, &fl);
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = np->mcast_oif;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
goto out;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT);
- if (err < 0) {
- if (err == -EREMOTE)
- err = ip6_dst_blackhole(sk, &dst, &fl);
- if (err < 0)
- goto out;
}
-
if (hlimit < 0) {
- if (ipv6_addr_is_multicast(&fl.fl6_dst))
+ if (ipv6_addr_is_multicast(&fl6.daddr))
hlimit = np->mcast_hops;
else
hlimit = np->hop_limit;
@@ -889,17 +880,17 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
back_from_confirm:
if (inet->hdrincl)
- err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl, &dst, msg->msg_flags);
+ err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl6, &dst, msg->msg_flags);
else {
lock_sock(sk);
err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov,
- len, 0, hlimit, tclass, opt, &fl, (struct rt6_info*)dst,
+ len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info*)dst,
msg->msg_flags, dontfrag);
if (err)
ip6_flush_pending_frames(sk);
else if (!(msg->msg_flags & MSG_MORE))
- err = rawv6_push_pending_frames(sk, &fl, rp);
+ err = rawv6_push_pending_frames(sk, &fl6, rp);
release_sock(sk);
}
done:
@@ -1157,6 +1148,23 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
}
+#ifdef CONFIG_COMPAT
+static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ case SIOCINQ:
+ return -ENOIOCTLCMD;
+ default:
+#ifdef CONFIG_IPV6_MROUTE
+ return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg));
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+#endif
+
static void rawv6_close(struct sock *sk, long timeout)
{
if (inet_sk(sk)->inet_num == IPPROTO_RAW)
@@ -1215,6 +1223,7 @@ struct proto rawv6_prot = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_rawv6_setsockopt,
.compat_getsockopt = compat_rawv6_getsockopt,
+ .compat_ioctl = compat_rawv6_ioctl,
#endif
};
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 373bd0416f6..6814c8722fa 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -72,8 +72,6 @@
#define RT6_TRACE(x...) do { ; } while (0)
#endif
-#define CLONE_OFFLINK_ROUTE 0
-
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
@@ -99,6 +97,36 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
struct in6_addr *gwaddr, int ifindex);
#endif
+static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ struct rt6_info *rt = (struct rt6_info *) dst;
+ struct inet_peer *peer;
+ u32 *p = NULL;
+
+ if (!rt->rt6i_peer)
+ rt6_bind_peer(rt, 1);
+
+ peer = rt->rt6i_peer;
+ if (peer) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ p = peer->metrics;
+ if (inet_metrics_new(peer))
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ }
+ }
+ return p;
+}
+
static struct dst_ops ip6_dst_ops_template = {
.family = AF_INET6,
.protocol = cpu_to_be16(ETH_P_IPV6),
@@ -107,6 +135,7 @@ static struct dst_ops ip6_dst_ops_template = {
.check = ip6_dst_check,
.default_advmss = ip6_default_advmss,
.default_mtu = ip6_default_mtu,
+ .cow_metrics = ipv6_cow_metrics,
.destroy = ip6_dst_destroy,
.ifdown = ip6_dst_ifdown,
.negative_advice = ip6_negative_advice,
@@ -115,6 +144,11 @@ static struct dst_ops ip6_dst_ops_template = {
.local_out = __ip6_local_out,
};
+static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
+{
+ return 0;
+}
+
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
@@ -124,9 +158,15 @@ static struct dst_ops ip6_dst_blackhole_ops = {
.protocol = cpu_to_be16(ETH_P_IPV6),
.destroy = ip6_dst_destroy,
.check = ip6_dst_check,
+ .default_mtu = ip6_blackhole_default_mtu,
+ .default_advmss = ip6_default_advmss,
.update_pmtu = ip6_rt_blackhole_update_pmtu,
};
+static const u32 ip6_template_metrics[RTAX_MAX] = {
+ [RTAX_HOPLIMIT - 1] = 255,
+};
+
static struct rt6_info ip6_null_entry_template = {
.dst = {
.__refcnt = ATOMIC_INIT(1),
@@ -182,7 +222,7 @@ static struct rt6_info ip6_blk_hole_entry_template = {
/* allocate dst with ip6_dst_ops */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
- return (struct rt6_info *)dst_alloc(ops);
+ return (struct rt6_info *)dst_alloc(ops, 0);
}
static void ip6_dst_destroy(struct dst_entry *dst)
@@ -196,22 +236,27 @@ static void ip6_dst_destroy(struct dst_entry *dst)
in6_dev_put(idev);
}
if (peer) {
- BUG_ON(!(rt->rt6i_flags & RTF_CACHE));
rt->rt6i_peer = NULL;
inet_putpeer(peer);
}
}
+static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
+
+static u32 rt6_peer_genid(void)
+{
+ return atomic_read(&__rt6_peer_genid);
+}
+
void rt6_bind_peer(struct rt6_info *rt, int create)
{
struct inet_peer *peer;
- if (WARN_ON(!(rt->rt6i_flags & RTF_CACHE)))
- return;
-
peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
inet_putpeer(peer);
+ else
+ rt->rt6i_peer_genid = rt6_peer_genid();
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -554,17 +599,17 @@ do { \
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
struct fib6_table *table,
- struct flowi *fl, int flags)
+ struct flowi6 *fl6, int flags)
{
struct fib6_node *fn;
struct rt6_info *rt;
read_lock_bh(&table->tb6_lock);
- fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
+ fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
rt = fn->leaf;
- rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
- BACKTRACK(net, &fl->fl6_src);
+ rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
+ BACKTRACK(net, &fl6->saddr);
out:
dst_use(&rt->dst, jiffies);
read_unlock_bh(&table->tb6_lock);
@@ -575,19 +620,19 @@ out:
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
const struct in6_addr *saddr, int oif, int strict)
{
- struct flowi fl = {
- .oif = oif,
- .fl6_dst = *daddr,
+ struct flowi6 fl6 = {
+ .flowi6_oif = oif,
+ .daddr = *daddr,
};
struct dst_entry *dst;
int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
if (saddr) {
- memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
+ memcpy(&fl6.saddr, saddr, sizeof(*saddr));
flags |= RT6_LOOKUP_F_HAS_SADDR;
}
- dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
+ dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
if (dst->error == 0)
return (struct rt6_info *) dst;
@@ -708,7 +753,7 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *d
}
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
- struct flowi *fl, int flags)
+ struct flowi6 *fl6, int flags)
{
struct fib6_node *fn;
struct rt6_info *rt, *nrt;
@@ -723,12 +768,12 @@ relookup:
read_lock_bh(&table->tb6_lock);
restart_2:
- fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
+ fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
rt = rt6_select(fn, oif, strict | reachable);
- BACKTRACK(net, &fl->fl6_src);
+ BACKTRACK(net, &fl6->saddr);
if (rt == net->ipv6.ip6_null_entry ||
rt->rt6i_flags & RTF_CACHE)
goto out;
@@ -737,14 +782,11 @@ restart:
read_unlock_bh(&table->tb6_lock);
if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
- nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
- else {
-#if CLONE_OFFLINK_ROUTE
- nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
-#else
+ nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
+ else if (!(rt->dst.flags & DST_HOST))
+ nrt = rt6_alloc_clone(rt, &fl6->daddr);
+ else
goto out2;
-#endif
- }
dst_release(&rt->dst);
rt = nrt ? : net->ipv6.ip6_null_entry;
@@ -781,9 +823,9 @@ out2:
}
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
- struct flowi *fl, int flags)
+ struct flowi6 *fl6, int flags)
{
- return ip6_pol_route(net, table, fl->iif, fl, flags);
+ return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
void ip6_route_input(struct sk_buff *skb)
@@ -791,56 +833,54 @@ void ip6_route_input(struct sk_buff *skb)
struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
int flags = RT6_LOOKUP_F_HAS_SADDR;
- struct flowi fl = {
- .iif = skb->dev->ifindex,
- .fl6_dst = iph->daddr,
- .fl6_src = iph->saddr,
- .fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
- .mark = skb->mark,
- .proto = iph->nexthdr,
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->dev->ifindex,
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
+ .flowi6_mark = skb->mark,
+ .flowi6_proto = iph->nexthdr,
};
if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
flags |= RT6_LOOKUP_F_IFACE;
- skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
+ skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
}
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
- struct flowi *fl, int flags)
+ struct flowi6 *fl6, int flags)
{
- return ip6_pol_route(net, table, fl->oif, fl, flags);
+ return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
- struct flowi *fl)
+ struct flowi6 *fl6)
{
int flags = 0;
- if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
+ if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
flags |= RT6_LOOKUP_F_IFACE;
- if (!ipv6_addr_any(&fl->fl6_src))
+ if (!ipv6_addr_any(&fl6->saddr))
flags |= RT6_LOOKUP_F_HAS_SADDR;
else if (sk)
flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
- return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
+ return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL(ip6_route_output);
-int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
+struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
- struct rt6_info *ort = (struct rt6_info *) *dstp;
- struct rt6_info *rt = (struct rt6_info *)
- dst_alloc(&ip6_dst_blackhole_ops);
+ struct rt6_info *rt = dst_alloc(&ip6_dst_blackhole_ops, 1);
+ struct rt6_info *ort = (struct rt6_info *) dst_orig;
struct dst_entry *new = NULL;
if (rt) {
new = &rt->dst;
- atomic_set(&new->__refcnt, 1);
new->__use = 1;
new->input = dst_discard;
new->output = dst_discard;
@@ -866,11 +906,9 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl
dst_free(new);
}
- dst_release(*dstp);
- *dstp = new;
- return new ? 0 : -ENOMEM;
+ dst_release(dst_orig);
+ return new ? new : ERR_PTR(-ENOMEM);
}
-EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
/*
* Destination cache support functions
@@ -882,9 +920,14 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
rt = (struct rt6_info *) dst;
- if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
+ if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
+ if (rt->rt6i_peer_genid != rt6_peer_genid()) {
+ if (!rt->rt6i_peer)
+ rt6_bind_peer(rt, 0);
+ rt->rt6i_peer_genid = rt6_peer_genid();
+ }
return dst;
-
+ }
return NULL;
}
@@ -935,7 +978,6 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
dst_metric_set(dst, RTAX_FEATURES, features);
}
dst_metric_set(dst, RTAX_MTU, mtu);
- call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
}
}
@@ -1032,11 +1074,9 @@ out:
int icmp6_dst_gc(void)
{
- struct dst_entry *dst, *next, **pprev;
+ struct dst_entry *dst, **pprev;
int more = 0;
- next = NULL;
-
spin_lock_bh(&icmp6_dst_lock);
pprev = &icmp6_dst_gc_list;
@@ -1404,16 +1444,16 @@ static int ip6_route_del(struct fib6_config *cfg)
* Handle redirects
*/
struct ip6rd_flowi {
- struct flowi fl;
+ struct flowi6 fl6;
struct in6_addr gateway;
};
static struct rt6_info *__ip6_route_redirect(struct net *net,
struct fib6_table *table,
- struct flowi *fl,
+ struct flowi6 *fl6,
int flags)
{
- struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
+ struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
struct rt6_info *rt;
struct fib6_node *fn;
@@ -1429,7 +1469,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
*/
read_lock_bh(&table->tb6_lock);
- fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
+ fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
/*
@@ -1444,7 +1484,7 @@ restart:
continue;
if (!(rt->rt6i_flags & RTF_GATEWAY))
continue;
- if (fl->oif != rt->rt6i_dev->ifindex)
+ if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
continue;
if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
continue;
@@ -1453,7 +1493,7 @@ restart:
if (!rt)
rt = net->ipv6.ip6_null_entry;
- BACKTRACK(net, &fl->fl6_src);
+ BACKTRACK(net, &fl6->saddr);
out:
dst_hold(&rt->dst);
@@ -1470,10 +1510,10 @@ static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
int flags = RT6_LOOKUP_F_HAS_SADDR;
struct net *net = dev_net(dev);
struct ip6rd_flowi rdfl = {
- .fl = {
- .oif = dev->ifindex,
- .fl6_dst = *dest,
- .fl6_src = *src,
+ .fl6 = {
+ .flowi6_oif = dev->ifindex,
+ .daddr = *dest,
+ .saddr = *src,
},
};
@@ -1482,7 +1522,7 @@ static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
if (rt6_need_strict(dest))
flags |= RT6_LOOKUP_F_IFACE;
- return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
+ return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
flags, __ip6_route_redirect);
}
@@ -1984,12 +2024,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
if (IS_ERR(neigh)) {
dst_free(&rt->dst);
- /* We are casting this because that is the return
- * value type. But an errno encoded pointer is the
- * same regardless of the underlying pointer type,
- * and that's what we are returning. So this is OK.
- */
- return (struct rt6_info *) neigh;
+ return ERR_CAST(neigh);
}
rt->rt6i_nexthop = neigh;
@@ -2350,7 +2385,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
struct rt6_info *rt;
struct sk_buff *skb;
struct rtmsg *rtm;
- struct flowi fl;
+ struct flowi6 fl6;
int err, iif = 0;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
@@ -2358,27 +2393,27 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
goto errout;
err = -EINVAL;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
if (tb[RTA_SRC]) {
if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
goto errout;
- ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
+ ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
}
if (tb[RTA_DST]) {
if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
goto errout;
- ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
+ ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
}
if (tb[RTA_IIF])
iif = nla_get_u32(tb[RTA_IIF]);
if (tb[RTA_OIF])
- fl.oif = nla_get_u32(tb[RTA_OIF]);
+ fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
if (iif) {
struct net_device *dev;
@@ -2401,10 +2436,10 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
skb_reset_mac_header(skb);
skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
- rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
+ rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
skb_dst_set(skb, &rt->dst);
- err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
+ err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
nlh->nlmsg_seq, 0, 0, 0);
if (err < 0) {
@@ -2561,14 +2596,16 @@ static
int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- struct net *net = current->nsproxy->net_ns;
- int delay = net->ipv6.sysctl.flush_delay;
- if (write) {
- proc_dointvec(ctl, write, buffer, lenp, ppos);
- fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
- return 0;
- } else
+ struct net *net;
+ int delay;
+ if (!write)
return -EINVAL;
+
+ net = (struct net *)ctl->extra1;
+ delay = net->ipv6.sysctl.flush_delay;
+ proc_dointvec(ctl, write, buffer, lenp, ppos);
+ fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
+ return 0;
}
ctl_table ipv6_route_table_template[] = {
@@ -2655,6 +2692,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
if (table) {
table[0].data = &net->ipv6.sysctl.flush_delay;
+ table[0].extra1 = net;
table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
@@ -2688,7 +2726,8 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_null_entry->dst.path =
(struct dst_entry *)net->ipv6.ip6_null_entry;
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
+ dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
+ ip6_template_metrics, true);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
@@ -2699,7 +2738,8 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_prohibit_entry->dst.path =
(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
+ dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
+ ip6_template_metrics, true);
net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -2709,7 +2749,8 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_blk_hole_entry->dst.path =
(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
- dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
+ dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
+ ip6_template_metrics, true);
#endif
net->ipv6.sysctl.flush_delay = 0;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 8ce38f10a54..43b33373adb 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -412,7 +412,7 @@ static void prl_list_destroy_rcu(struct rcu_head *head)
p = container_of(head, struct ip_tunnel_prl_entry, rcu_head);
do {
- n = p->next;
+ n = rcu_dereference_protected(p->next, 1);
kfree(p);
p = n;
} while (p);
@@ -421,15 +421,17 @@ static void prl_list_destroy_rcu(struct rcu_head *head)
static int
ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
{
- struct ip_tunnel_prl_entry *x, **p;
+ struct ip_tunnel_prl_entry *x;
+ struct ip_tunnel_prl_entry __rcu **p;
int err = 0;
ASSERT_RTNL();
if (a && a->addr != htonl(INADDR_ANY)) {
- for (p = &t->prl; *p; p = &(*p)->next) {
- if ((*p)->addr == a->addr) {
- x = *p;
+ for (p = &t->prl;
+ (x = rtnl_dereference(*p)) != NULL;
+ p = &x->next) {
+ if (x->addr == a->addr) {
*p = x->next;
call_rcu(&x->rcu_head, prl_entry_destroy_rcu);
t->prl_count--;
@@ -438,9 +440,9 @@ ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
}
err = -ENXIO;
} else {
- if (t->prl) {
+ x = rtnl_dereference(t->prl);
+ if (x) {
t->prl_count = 0;
- x = t->prl;
call_rcu(&x->rcu_head, prl_list_destroy_rcu);
t->prl = NULL;
}
@@ -730,16 +732,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
dst = addr6->s6_addr32[3];
}
- {
- struct flowi fl = { .fl4_dst = dst,
- .fl4_src = tiph->saddr,
- .fl4_tos = RT_TOS(tos),
- .oif = tunnel->parms.link,
- .proto = IPPROTO_IPV6 };
- if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error_icmp;
- }
+ rt = ip_route_output_ports(dev_net(dev), NULL,
+ dst, tiph->saddr,
+ 0, 0,
+ IPPROTO_IPV6, RT_TOS(tos),
+ tunnel->parms.link);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
}
if (rt->rt_type != RTN_UNICAST) {
ip_rt_put(rt);
@@ -855,13 +855,14 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev)
iph = &tunnel->parms.iph;
if (iph->daddr) {
- struct flowi fl = { .fl4_dst = iph->daddr,
- .fl4_src = iph->saddr,
- .fl4_tos = RT_TOS(iph->tos),
- .oif = tunnel->parms.link,
- .proto = IPPROTO_IPV6 };
- struct rtable *rt;
- if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
+ struct rtable *rt = ip_route_output_ports(dev_net(dev), NULL,
+ iph->daddr, iph->saddr,
+ 0, 0,
+ IPPROTO_IPV6,
+ RT_TOS(iph->tos),
+ tunnel->parms.link);
+
+ if (!IS_ERR(rt)) {
tdev = rt->dst.dev;
ip_rt_put(rt);
}
@@ -1179,7 +1180,7 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
if (!dev->tstats)
return -ENOMEM;
dev_hold(dev);
- sitn->tunnels_wc[0] = tunnel;
+ rcu_assign_pointer(sitn->tunnels_wc[0], tunnel);
return 0;
}
@@ -1196,11 +1197,12 @@ static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_hea
for (prio = 1; prio < 4; prio++) {
int h;
for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t = sitn->tunnels[prio][h];
+ struct ip_tunnel *t;
+ t = rtnl_dereference(sitn->tunnels[prio][h]);
while (t != NULL) {
unregister_netdevice_queue(t->dev, head);
- t = t->next;
+ t = rtnl_dereference(t->next);
}
}
}
@@ -1290,4 +1292,4 @@ static int __init sit_init(void)
module_init(sit_init);
module_exit(sit_cleanup);
MODULE_LICENSE("GPL");
-MODULE_ALIAS("sit0");
+MODULE_ALIAS_NETDEV("sit0");
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 09fd34f0dbf..352c26081f5 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -232,23 +232,20 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
*/
{
struct in6_addr *final_p, final;
- struct flowi fl;
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- final_p = fl6_update_dst(&fl, np->opt, &final);
- ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
- fl.oif = sk->sk_bound_dev_if;
- fl.mark = sk->sk_mark;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_sk(sk)->inet_sport;
- security_req_classify_flow(req, &fl);
- if (ip6_dst_lookup(sk, &dst, &fl))
- goto out_free;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
- if ((xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0)
+ struct flowi6 fl6;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr);
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+ ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr);
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = inet_rsk(req)->rmt_port;
+ fl6.fl6_sport = inet_sk(sk)->inet_sport;
+ security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+ if (IS_ERR(dst))
goto out_free;
}
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index fa1d8f4e005..7cb65ef79f9 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -15,6 +15,8 @@
#include <net/addrconf.h>
#include <net/inet_frag.h>
+static struct ctl_table empty[1];
+
static ctl_table ipv6_table_template[] = {
{
.procname = "route",
@@ -35,6 +37,12 @@ static ctl_table ipv6_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "neigh",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = empty,
+ },
{ }
};
@@ -152,7 +160,6 @@ static struct ctl_table_header *ip6_base;
int ipv6_static_sysctl_register(void)
{
- static struct ctl_table empty[1];
ip6_base = register_sysctl_paths(net_ipv6_ctl_path, empty);
if (ip6_base == NULL)
return -ENOMEM;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 20aa95e3735..2b0c186862c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -131,7 +131,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct tcp_sock *tp = tcp_sk(sk);
struct in6_addr *saddr = NULL, *final_p, final;
struct rt6_info *rt;
- struct flowi fl;
+ struct flowi6 fl6;
struct dst_entry *dst;
int addr_type;
int err;
@@ -142,14 +142,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (usin->sin6_family != AF_INET6)
return -EAFNOSUPPORT;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
if (np->sndflow) {
- fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- IP6_ECN_flow_init(fl.fl6_flowlabel);
- if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) {
+ fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl6.flowlabel);
+ if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
struct ip6_flowlabel *flowlabel;
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
if (flowlabel == NULL)
return -EINVAL;
ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
@@ -195,7 +195,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
}
ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
- np->flow_label = fl.fl6_flowlabel;
+ np->flow_label = fl6.flowlabel;
/*
* TCP over IPv4
@@ -242,35 +242,27 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (!ipv6_addr_any(&np->rcv_saddr))
saddr = &np->rcv_saddr;
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src,
+ fl6.flowi6_proto = IPPROTO_TCP;
+ ipv6_addr_copy(&fl6.daddr, &np->daddr);
+ ipv6_addr_copy(&fl6.saddr,
(saddr ? saddr : &np->saddr));
- fl.oif = sk->sk_bound_dev_if;
- fl.mark = sk->sk_mark;
- fl.fl_ip_dport = usin->sin6_port;
- fl.fl_ip_sport = inet->inet_sport;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = usin->sin6_port;
+ fl6.fl6_sport = inet->inet_sport;
- final_p = fl6_update_dst(&fl, np->opt, &final);
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
- security_sk_classify_flow(sk, &fl);
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
goto failure;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT);
- if (err < 0) {
- if (err == -EREMOTE)
- err = ip6_dst_blackhole(sk, &dst, &fl);
- if (err < 0)
- goto failure;
}
if (saddr == NULL) {
- saddr = &fl.fl6_src;
+ saddr = &fl6.saddr;
ipv6_addr_copy(&np->rcv_saddr, saddr);
}
@@ -385,7 +377,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
np = inet6_sk(sk);
if (type == ICMPV6_PKT_TOOBIG) {
- struct dst_entry *dst = NULL;
+ struct dst_entry *dst;
if (sock_owned_by_user(sk))
goto out;
@@ -397,29 +389,25 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (dst == NULL) {
struct inet_sock *inet = inet_sk(sk);
- struct flowi fl;
+ struct flowi6 fl6;
/* BUGGG_FUTURE: Again, it is not clear how
to handle rthdr case. Ignore this complexity
for now.
*/
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.mark = sk->sk_mark;
- fl.fl_ip_dport = inet->inet_dport;
- fl.fl_ip_sport = inet->inet_sport;
- security_skb_classify_flow(skb, &fl);
-
- if ((err = ip6_dst_lookup(sk, &dst, &fl))) {
- sk->sk_err_soft = -err;
- goto out;
- }
-
- if ((err = xfrm_lookup(net, &dst, &fl, sk, 0)) < 0) {
- sk->sk_err_soft = -err;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ ipv6_addr_copy(&fl6.daddr, &np->daddr);
+ ipv6_addr_copy(&fl6.saddr, &np->saddr);
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = inet->inet_dport;
+ fl6.fl6_sport = inet->inet_sport;
+ security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
+ if (IS_ERR(dst)) {
+ sk->sk_err_soft = -PTR_ERR(dst);
goto out;
}
@@ -494,38 +482,36 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
struct sk_buff * skb;
struct ipv6_txoptions *opt = NULL;
struct in6_addr * final_p, final;
- struct flowi fl;
+ struct flowi6 fl6;
struct dst_entry *dst;
- int err = -1;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
- ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
- fl.fl6_flowlabel = 0;
- fl.oif = treq->iif;
- fl.mark = sk->sk_mark;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_rsk(req)->loc_port;
- security_req_classify_flow(req, &fl);
+ int err;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr);
+ ipv6_addr_copy(&fl6.saddr, &treq->loc_addr);
+ fl6.flowlabel = 0;
+ fl6.flowi6_oif = treq->iif;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = inet_rsk(req)->rmt_port;
+ fl6.fl6_sport = inet_rsk(req)->loc_port;
+ security_req_classify_flow(req, flowi6_to_flowi(&fl6));
opt = np->opt;
- final_p = fl6_update_dst(&fl, opt, &final);
+ final_p = fl6_update_dst(&fl6, opt, &final);
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto done;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
- if ((err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0)
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
goto done;
-
+ }
skb = tcp_make_synack(sk, dst, req, rvp);
+ err = -ENOMEM;
if (skb) {
__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
- ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
- err = ip6_xmit(sk, skb, &fl, opt);
+ ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr);
+ err = ip6_xmit(sk, skb, &fl6, opt);
err = net_xmit_eval(err);
}
@@ -1006,7 +992,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
{
struct tcphdr *th = tcp_hdr(skb), *t1;
struct sk_buff *buff;
- struct flowi fl;
+ struct flowi6 fl6;
struct net *net = dev_net(skb_dst(skb)->dev);
struct sock *ctl_sk = net->ipv6.tcp_sk;
unsigned int tot_len = sizeof(struct tcphdr);
@@ -1060,34 +1046,33 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
}
#endif
- memset(&fl, 0, sizeof(fl));
- ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr);
- ipv6_addr_copy(&fl.fl6_src, &ipv6_hdr(skb)->daddr);
+ memset(&fl6, 0, sizeof(fl6));
+ ipv6_addr_copy(&fl6.daddr, &ipv6_hdr(skb)->saddr);
+ ipv6_addr_copy(&fl6.saddr, &ipv6_hdr(skb)->daddr);
buff->ip_summed = CHECKSUM_PARTIAL;
buff->csum = 0;
- __tcp_v6_send_check(buff, &fl.fl6_src, &fl.fl6_dst);
+ __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr);
- fl.proto = IPPROTO_TCP;
- fl.oif = inet6_iif(skb);
- fl.fl_ip_dport = t1->dest;
- fl.fl_ip_sport = t1->source;
- security_skb_classify_flow(skb, &fl);
+ fl6.flowi6_proto = IPPROTO_TCP;
+ fl6.flowi6_oif = inet6_iif(skb);
+ fl6.fl6_dport = t1->dest;
+ fl6.fl6_sport = t1->source;
+ security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
/* Pass a socket to ip6_dst_lookup either it is for RST
* Underlying function will use this to retrieve the network
* namespace
*/
- if (!ip6_dst_lookup(ctl_sk, &dst, &fl)) {
- if (xfrm_lookup(net, &dst, &fl, NULL, 0) >= 0) {
- skb_dst_set(buff, dst);
- ip6_xmit(ctl_sk, buff, &fl, NULL);
- TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
- if (rst)
- TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
- return;
- }
+ dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
+ if (!IS_ERR(dst)) {
+ skb_dst_set(buff, dst);
+ ip6_xmit(ctl_sk, buff, &fl6, NULL);
+ TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+ if (rst)
+ TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+ return;
}
kfree_skb(buff);
@@ -1323,7 +1308,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_death_row.sysctl_tw_recycle &&
(dst = inet6_csk_route_req(sk, req)) != NULL &&
(peer = rt6_get_peer((struct rt6_info *)dst)) != NULL &&
- ipv6_addr_equal((struct in6_addr *)peer->daddr.a6,
+ ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6,
&treq->rmt_addr)) {
inet_peer_refcheck(peer);
if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
@@ -1636,10 +1621,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
opt_skb = skb_clone(skb, GFP_ATOMIC);
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
- TCP_CHECK_TIMER(sk);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len))
goto reset;
- TCP_CHECK_TIMER(sk);
if (opt_skb)
goto ipv6_pktoptions;
return 0;
@@ -1667,10 +1650,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
}
}
- TCP_CHECK_TIMER(sk);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len))
goto reset;
- TCP_CHECK_TIMER(sk);
if (opt_skb)
goto ipv6_pktoptions;
return 0;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9a009c66c8a..d7037c006e1 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -886,7 +886,7 @@ static int udp_v6_push_pending_frames(struct sock *sk)
struct udphdr *uh;
struct udp_sock *up = udp_sk(sk);
struct inet_sock *inet = inet_sk(sk);
- struct flowi *fl = &inet->cork.fl;
+ struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
int err = 0;
int is_udplite = IS_UDPLITE(sk);
__wsum csum = 0;
@@ -899,23 +899,23 @@ static int udp_v6_push_pending_frames(struct sock *sk)
* Create a UDP header
*/
uh = udp_hdr(skb);
- uh->source = fl->fl_ip_sport;
- uh->dest = fl->fl_ip_dport;
+ uh->source = fl6->fl6_sport;
+ uh->dest = fl6->fl6_dport;
uh->len = htons(up->len);
uh->check = 0;
if (is_udplite)
csum = udplite_csum_outgoing(sk, skb);
else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
- udp6_hwcsum_outgoing(sk, skb, &fl->fl6_src, &fl->fl6_dst,
+ udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr,
up->len);
goto send;
} else
csum = udp_csum_outgoing(sk, skb);
/* add protocol-dependent pseudo-header */
- uh->check = csum_ipv6_magic(&fl->fl6_src, &fl->fl6_dst,
- up->len, fl->proto, csum );
+ uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
+ up->len, fl6->flowi6_proto, csum);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
@@ -947,7 +947,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
struct in6_addr *daddr, *final_p, final;
struct ipv6_txoptions *opt = NULL;
struct ip6_flowlabel *flowlabel = NULL;
- struct flowi fl;
+ struct flowi6 fl6;
struct dst_entry *dst;
int addr_len = msg->msg_namelen;
int ulen = len;
@@ -1030,19 +1030,19 @@ do_udp_sendmsg:
}
ulen += sizeof(struct udphdr);
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
if (sin6) {
if (sin6->sin6_port == 0)
return -EINVAL;
- fl.fl_ip_dport = sin6->sin6_port;
+ fl6.fl6_dport = sin6->sin6_port;
daddr = &sin6->sin6_addr;
if (np->sndflow) {
- fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) {
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+ fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
if (flowlabel == NULL)
return -EINVAL;
daddr = &flowlabel->dst;
@@ -1060,38 +1060,38 @@ do_udp_sendmsg:
if (addr_len >= sizeof(struct sockaddr_in6) &&
sin6->sin6_scope_id &&
ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL)
- fl.oif = sin6->sin6_scope_id;
+ fl6.flowi6_oif = sin6->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
- fl.fl_ip_dport = inet->inet_dport;
+ fl6.fl6_dport = inet->inet_dport;
daddr = &np->daddr;
- fl.fl6_flowlabel = np->flow_label;
+ fl6.flowlabel = np->flow_label;
connected = 1;
}
- if (!fl.oif)
- fl.oif = sk->sk_bound_dev_if;
+ if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
- if (!fl.oif)
- fl.oif = np->sticky_pktinfo.ipi6_ifindex;
+ if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
- fl.mark = sk->sk_mark;
+ fl6.flowi6_mark = sk->sk_mark;
if (msg->msg_controllen) {
opt = &opt_space;
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(*opt);
- err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit,
+ err = datagram_send_ctl(sock_net(sk), msg, &fl6, opt, &hlimit,
&tclass, &dontfrag);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
}
- if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+ if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
if (flowlabel == NULL)
return -EINVAL;
}
@@ -1105,42 +1105,35 @@ do_udp_sendmsg:
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
- fl.proto = sk->sk_protocol;
+ fl6.flowi6_proto = sk->sk_protocol;
if (!ipv6_addr_any(daddr))
- ipv6_addr_copy(&fl.fl6_dst, daddr);
+ ipv6_addr_copy(&fl6.daddr, daddr);
else
- fl.fl6_dst.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
- if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr))
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.fl_ip_sport = inet->inet_sport;
+ fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
+ ipv6_addr_copy(&fl6.saddr, &np->saddr);
+ fl6.fl6_sport = inet->inet_sport;
- final_p = fl6_update_dst(&fl, opt, &final);
+ final_p = fl6_update_dst(&fl6, opt, &final);
if (final_p)
connected = 0;
- if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) {
- fl.oif = np->mcast_oif;
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) {
+ fl6.flowi6_oif = np->mcast_oif;
connected = 0;
}
- security_sk_classify_flow(sk, &fl);
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- err = ip6_sk_dst_lookup(sk, &dst, &fl);
- if (err)
+ dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, true);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
goto out;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT);
- if (err < 0) {
- if (err == -EREMOTE)
- err = ip6_dst_blackhole(sk, &dst, &fl);
- if (err < 0)
- goto out;
}
if (hlimit < 0) {
- if (ipv6_addr_is_multicast(&fl.fl6_dst))
+ if (ipv6_addr_is_multicast(&fl6.daddr))
hlimit = np->mcast_hops;
else
hlimit = np->hop_limit;
@@ -1175,7 +1168,7 @@ do_append_data:
up->len += ulen;
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen,
- sizeof(struct udphdr), hlimit, tclass, opt, &fl,
+ sizeof(struct udphdr), hlimit, tclass, opt, &fl6,
(struct rt6_info*)dst,
corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag);
if (err)
@@ -1188,10 +1181,10 @@ do_append_data:
if (dst) {
if (connected) {
ip6_dst_store(sk, dst,
- ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ?
+ ipv6_addr_equal(&fl6.daddr, &np->daddr) ?
&np->daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
- ipv6_addr_equal(&fl.fl6_src, &np->saddr) ?
+ ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
&np->saddr :
#endif
NULL);
@@ -1299,7 +1292,7 @@ static int udp6_ufo_send_check(struct sk_buff *skb)
return 0;
}
-static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, int features)
+static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 7e74023ea6e..05e34c8ec91 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -27,18 +27,19 @@
static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos,
- xfrm_address_t *saddr,
- xfrm_address_t *daddr)
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
{
- struct flowi fl = {};
+ struct flowi6 fl6;
struct dst_entry *dst;
int err;
- memcpy(&fl.fl6_dst, daddr, sizeof(fl.fl6_dst));
+ memset(&fl6, 0, sizeof(fl6));
+ memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
if (saddr)
- memcpy(&fl.fl6_src, saddr, sizeof(fl.fl6_src));
+ memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr));
- dst = ip6_route_output(net, NULL, &fl);
+ dst = ip6_route_output(net, NULL, &fl6);
err = dst->error;
if (dst->error) {
@@ -67,7 +68,7 @@ static int xfrm6_get_saddr(struct net *net,
return 0;
}
-static int xfrm6_get_tos(struct flowi *fl)
+static int xfrm6_get_tos(const struct flowi *fl)
{
return 0;
}
@@ -87,7 +88,7 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst,
}
static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- struct flowi *fl)
+ const struct flowi *fl)
{
struct rt6_info *rt = (struct rt6_info*)xdst->route;
@@ -98,6 +99,10 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
if (!xdst->u.rt6.rt6i_idev)
return -ENODEV;
+ xdst->u.rt6.rt6i_peer = rt->rt6i_peer;
+ if (rt->rt6i_peer)
+ atomic_inc(&rt->rt6i_peer->refcnt);
+
/* Sheit... I remember I did this right. Apparently,
* it was magically lost, so this code needs audit */
xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST |
@@ -116,6 +121,7 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
static inline void
_decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
{
+ struct flowi6 *fl6 = &fl->u.ip6;
int onlyproto = 0;
u16 offset = skb_network_header_len(skb);
struct ipv6hdr *hdr = ipv6_hdr(skb);
@@ -123,11 +129,11 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
const unsigned char *nh = skb_network_header(skb);
u8 nexthdr = nh[IP6CB(skb)->nhoff];
- memset(fl, 0, sizeof(struct flowi));
- fl->mark = skb->mark;
+ memset(fl6, 0, sizeof(struct flowi6));
+ fl6->flowi6_mark = skb->mark;
- ipv6_addr_copy(&fl->fl6_dst, reverse ? &hdr->saddr : &hdr->daddr);
- ipv6_addr_copy(&fl->fl6_src, reverse ? &hdr->daddr : &hdr->saddr);
+ ipv6_addr_copy(&fl6->daddr, reverse ? &hdr->saddr : &hdr->daddr);
+ ipv6_addr_copy(&fl6->saddr, reverse ? &hdr->daddr : &hdr->saddr);
while (nh + offset + 1 < skb->data ||
pskb_may_pull(skb, nh + offset + 1 - skb->data)) {
@@ -154,20 +160,20 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
__be16 *ports = (__be16 *)exthdr;
- fl->fl_ip_sport = ports[!!reverse];
- fl->fl_ip_dport = ports[!reverse];
+ fl6->fl6_sport = ports[!!reverse];
+ fl6->fl6_dport = ports[!reverse];
}
- fl->proto = nexthdr;
+ fl6->flowi6_proto = nexthdr;
return;
case IPPROTO_ICMPV6:
if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) {
u8 *icmp = (u8 *)exthdr;
- fl->fl_icmp_type = icmp[0];
- fl->fl_icmp_code = icmp[1];
+ fl6->fl6_icmp_type = icmp[0];
+ fl6->fl6_icmp_code = icmp[1];
}
- fl->proto = nexthdr;
+ fl6->flowi6_proto = nexthdr;
return;
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
@@ -176,9 +182,9 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
struct ip6_mh *mh;
mh = (struct ip6_mh *)exthdr;
- fl->fl_mh_type = mh->ip6mh_type;
+ fl6->fl6_mh_type = mh->ip6mh_type;
}
- fl->proto = nexthdr;
+ fl6->flowi6_proto = nexthdr;
return;
#endif
@@ -187,8 +193,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
case IPPROTO_ESP:
case IPPROTO_COMP:
default:
- fl->fl_ipsec_spi = 0;
- fl->proto = nexthdr;
+ fl6->fl6_ipsec_spi = 0;
+ fl6->flowi6_proto = nexthdr;
return;
}
}
@@ -216,6 +222,9 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
if (likely(xdst->u.rt6.rt6i_idev))
in6_dev_put(xdst->u.rt6.rt6i_idev);
+ dst_destroy_metrics_generic(dst);
+ if (likely(xdst->u.rt6.rt6i_peer))
+ inet_putpeer(xdst->u.rt6.rt6i_peer);
xfrm_dst_destroy(xdst);
}
@@ -251,6 +260,7 @@ static struct dst_ops xfrm6_dst_ops = {
.protocol = cpu_to_be16(ETH_P_IPV6),
.gc = xfrm6_garbage_collect,
.update_pmtu = xfrm6_update_pmtu,
+ .cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm6_dst_destroy,
.ifdown = xfrm6_dst_ifdown,
.local_out = __ip6_local_out,
@@ -266,6 +276,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
.get_tos = xfrm6_get_tos,
.init_path = xfrm6_init_path,
.fill_dst = xfrm6_fill_dst,
+ .blackhole_route = ip6_blackhole_route,
};
static int __init xfrm6_policy_init(void)
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index a67575d472a..afe941e9415 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -20,26 +20,28 @@
#include <net/addrconf.h>
static void
-__xfrm6_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
+__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
{
+ const struct flowi6 *fl6 = &fl->u.ip6;
+
/* Initialize temporary selector matching only
* to current session. */
- ipv6_addr_copy((struct in6_addr *)&sel->daddr, &fl->fl6_dst);
- ipv6_addr_copy((struct in6_addr *)&sel->saddr, &fl->fl6_src);
- sel->dport = xfrm_flowi_dport(fl);
+ ipv6_addr_copy((struct in6_addr *)&sel->daddr, &fl6->daddr);
+ ipv6_addr_copy((struct in6_addr *)&sel->saddr, &fl6->saddr);
+ sel->dport = xfrm_flowi_dport(fl, &fl6->uli);
sel->dport_mask = htons(0xffff);
- sel->sport = xfrm_flowi_sport(fl);
+ sel->sport = xfrm_flowi_sport(fl, &fl6->uli);
sel->sport_mask = htons(0xffff);
sel->family = AF_INET6;
sel->prefixlen_d = 128;
sel->prefixlen_s = 128;
- sel->proto = fl->proto;
- sel->ifindex = fl->oif;
+ sel->proto = fl6->flowi6_proto;
+ sel->ifindex = fl6->flowi6_oif;
}
static void
-xfrm6_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
- xfrm_address_t *daddr, xfrm_address_t *saddr)
+xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
+ const xfrm_address_t *daddr, const xfrm_address_t *saddr)
{
x->id = tmpl->id;
if (ipv6_addr_any((struct in6_addr*)&x->id.daddr))
diff --git a/net/key/af_key.c b/net/key/af_key.c
index d87c22df6f1..7db86ffcf07 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -70,7 +70,7 @@ static inline struct pfkey_sock *pfkey_sk(struct sock *sk)
return (struct pfkey_sock *)sk;
}
-static int pfkey_can_dump(struct sock *sk)
+static int pfkey_can_dump(const struct sock *sk)
{
if (3 * atomic_read(&sk->sk_rmem_alloc) <= 2 * sk->sk_rcvbuf)
return 1;
@@ -303,12 +303,13 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
return rc;
}
-static inline void pfkey_hdr_dup(struct sadb_msg *new, struct sadb_msg *orig)
+static inline void pfkey_hdr_dup(struct sadb_msg *new,
+ const struct sadb_msg *orig)
{
*new = *orig;
}
-static int pfkey_error(struct sadb_msg *orig, int err, struct sock *sk)
+static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk)
{
struct sk_buff *skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
struct sadb_msg *hdr;
@@ -369,13 +370,13 @@ static u8 sadb_ext_min_len[] = {
};
/* Verify sadb_address_{len,prefixlen} against sa_family. */
-static int verify_address_len(void *p)
+static int verify_address_len(const void *p)
{
- struct sadb_address *sp = p;
- struct sockaddr *addr = (struct sockaddr *)(sp + 1);
- struct sockaddr_in *sin;
+ const struct sadb_address *sp = p;
+ const struct sockaddr *addr = (const struct sockaddr *)(sp + 1);
+ const struct sockaddr_in *sin;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- struct sockaddr_in6 *sin6;
+ const struct sockaddr_in6 *sin6;
#endif
int len;
@@ -411,16 +412,16 @@ static int verify_address_len(void *p)
return 0;
}
-static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx)
+static inline int pfkey_sec_ctx_len(const struct sadb_x_sec_ctx *sec_ctx)
{
return DIV_ROUND_UP(sizeof(struct sadb_x_sec_ctx) +
sec_ctx->sadb_x_ctx_len,
sizeof(uint64_t));
}
-static inline int verify_sec_ctx_len(void *p)
+static inline int verify_sec_ctx_len(const void *p)
{
- struct sadb_x_sec_ctx *sec_ctx = (struct sadb_x_sec_ctx *)p;
+ const struct sadb_x_sec_ctx *sec_ctx = p;
int len = sec_ctx->sadb_x_ctx_len;
if (len > PAGE_SIZE)
@@ -434,7 +435,7 @@ static inline int verify_sec_ctx_len(void *p)
return 0;
}
-static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb_x_sec_ctx *sec_ctx)
+static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(const struct sadb_x_sec_ctx *sec_ctx)
{
struct xfrm_user_sec_ctx *uctx = NULL;
int ctx_size = sec_ctx->sadb_x_ctx_len;
@@ -455,16 +456,16 @@ static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb
return uctx;
}
-static int present_and_same_family(struct sadb_address *src,
- struct sadb_address *dst)
+static int present_and_same_family(const struct sadb_address *src,
+ const struct sadb_address *dst)
{
- struct sockaddr *s_addr, *d_addr;
+ const struct sockaddr *s_addr, *d_addr;
if (!src || !dst)
return 0;
- s_addr = (struct sockaddr *)(src + 1);
- d_addr = (struct sockaddr *)(dst + 1);
+ s_addr = (const struct sockaddr *)(src + 1);
+ d_addr = (const struct sockaddr *)(dst + 1);
if (s_addr->sa_family != d_addr->sa_family)
return 0;
if (s_addr->sa_family != AF_INET
@@ -477,15 +478,15 @@ static int present_and_same_family(struct sadb_address *src,
return 1;
}
-static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void **ext_hdrs)
{
- char *p = (char *) hdr;
+ const char *p = (char *) hdr;
int len = skb->len;
len -= sizeof(*hdr);
p += sizeof(*hdr);
while (len > 0) {
- struct sadb_ext *ehdr = (struct sadb_ext *) p;
+ const struct sadb_ext *ehdr = (const struct sadb_ext *) p;
uint16_t ext_type;
int ext_len;
@@ -514,7 +515,7 @@ static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_h
if (verify_sec_ctx_len(p))
return -EINVAL;
}
- ext_hdrs[ext_type-1] = p;
+ ext_hdrs[ext_type-1] = (void *) p;
}
p += ext_len;
len -= ext_len;
@@ -606,21 +607,21 @@ int pfkey_sockaddr_extract(const struct sockaddr *sa, xfrm_address_t *xaddr)
}
static
-int pfkey_sadb_addr2xfrm_addr(struct sadb_address *addr, xfrm_address_t *xaddr)
+int pfkey_sadb_addr2xfrm_addr(const struct sadb_address *addr, xfrm_address_t *xaddr)
{
return pfkey_sockaddr_extract((struct sockaddr *)(addr + 1),
xaddr);
}
-static struct xfrm_state *pfkey_xfrm_state_lookup(struct net *net, struct sadb_msg *hdr, void **ext_hdrs)
+static struct xfrm_state *pfkey_xfrm_state_lookup(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
- struct sadb_sa *sa;
- struct sadb_address *addr;
+ const struct sadb_sa *sa;
+ const struct sadb_address *addr;
uint16_t proto;
unsigned short family;
xfrm_address_t *xaddr;
- sa = (struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1];
+ sa = (const struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1];
if (sa == NULL)
return NULL;
@@ -629,18 +630,18 @@ static struct xfrm_state *pfkey_xfrm_state_lookup(struct net *net, struct sadb_
return NULL;
/* sadb_address_len should be checked by caller */
- addr = (struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1];
+ addr = (const struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1];
if (addr == NULL)
return NULL;
- family = ((struct sockaddr *)(addr + 1))->sa_family;
+ family = ((const struct sockaddr *)(addr + 1))->sa_family;
switch (family) {
case AF_INET:
- xaddr = (xfrm_address_t *)&((struct sockaddr_in *)(addr + 1))->sin_addr;
+ xaddr = (xfrm_address_t *)&((const struct sockaddr_in *)(addr + 1))->sin_addr;
break;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
case AF_INET6:
- xaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(addr + 1))->sin6_addr;
+ xaddr = (xfrm_address_t *)&((const struct sockaddr_in6 *)(addr + 1))->sin6_addr;
break;
#endif
default:
@@ -690,9 +691,9 @@ static inline int pfkey_mode_to_xfrm(int mode)
}
}
-static unsigned int pfkey_sockaddr_fill(xfrm_address_t *xaddr, __be16 port,
- struct sockaddr *sa,
- unsigned short family)
+static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port,
+ struct sockaddr *sa,
+ unsigned short family)
{
switch (family) {
case AF_INET:
@@ -720,7 +721,7 @@ static unsigned int pfkey_sockaddr_fill(xfrm_address_t *xaddr, __be16 port,
return 0;
}
-static struct sk_buff *__pfkey_xfrm_state2msg(struct xfrm_state *x,
+static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x,
int add_keys, int hsc)
{
struct sk_buff *skb;
@@ -1010,7 +1011,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(struct xfrm_state *x,
}
-static inline struct sk_buff *pfkey_xfrm_state2msg(struct xfrm_state *x)
+static inline struct sk_buff *pfkey_xfrm_state2msg(const struct xfrm_state *x)
{
struct sk_buff *skb;
@@ -1019,26 +1020,26 @@ static inline struct sk_buff *pfkey_xfrm_state2msg(struct xfrm_state *x)
return skb;
}
-static inline struct sk_buff *pfkey_xfrm_state2msg_expire(struct xfrm_state *x,
+static inline struct sk_buff *pfkey_xfrm_state2msg_expire(const struct xfrm_state *x,
int hsc)
{
return __pfkey_xfrm_state2msg(x, 0, hsc);
}
static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
- struct sadb_msg *hdr,
- void **ext_hdrs)
+ const struct sadb_msg *hdr,
+ void * const *ext_hdrs)
{
struct xfrm_state *x;
- struct sadb_lifetime *lifetime;
- struct sadb_sa *sa;
- struct sadb_key *key;
- struct sadb_x_sec_ctx *sec_ctx;
+ const struct sadb_lifetime *lifetime;
+ const struct sadb_sa *sa;
+ const struct sadb_key *key;
+ const struct sadb_x_sec_ctx *sec_ctx;
uint16_t proto;
int err;
- sa = (struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1];
+ sa = (const struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1];
if (!sa ||
!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
@@ -1077,7 +1078,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
sa->sadb_sa_encrypt > SADB_X_CALG_MAX) ||
sa->sadb_sa_encrypt > SADB_EALG_MAX)
return ERR_PTR(-EINVAL);
- key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
+ key = (const struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
if (key != NULL &&
sa->sadb_sa_auth != SADB_X_AALG_NULL &&
((key->sadb_key_bits+7) / 8 == 0 ||
@@ -1104,14 +1105,14 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC)
x->props.flags |= XFRM_STATE_NOPMTUDISC;
- lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1];
+ lifetime = (const struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1];
if (lifetime != NULL) {
x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime;
x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime;
}
- lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_SOFT-1];
+ lifetime = (const struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_SOFT-1];
if (lifetime != NULL) {
x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
@@ -1119,7 +1120,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
}
- sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+ sec_ctx = (const struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
if (sec_ctx != NULL) {
struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
@@ -1133,7 +1134,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
goto out;
}
- key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
+ key = (const struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
if (sa->sadb_sa_auth) {
int keysize = 0;
struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth);
@@ -1202,7 +1203,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
&x->id.daddr);
if (ext_hdrs[SADB_X_EXT_SA2-1]) {
- struct sadb_x_sa2 *sa2 = (void*)ext_hdrs[SADB_X_EXT_SA2-1];
+ const struct sadb_x_sa2 *sa2 = ext_hdrs[SADB_X_EXT_SA2-1];
int mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode);
if (mode < 0) {
err = -EINVAL;
@@ -1213,7 +1214,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
}
if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) {
- struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1];
+ const struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1];
/* Nobody uses this, but we try. */
x->sel.family = pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr);
@@ -1224,7 +1225,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
x->sel.family = x->props.family;
if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) {
- struct sadb_x_nat_t_type* n_type;
+ const struct sadb_x_nat_t_type* n_type;
struct xfrm_encap_tmpl *natt;
x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL);
@@ -1236,12 +1237,12 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
natt->encap_type = n_type->sadb_x_nat_t_type_type;
if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) {
- struct sadb_x_nat_t_port* n_port =
+ const struct sadb_x_nat_t_port *n_port =
ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1];
natt->encap_sport = n_port->sadb_x_nat_t_port_port;
}
if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) {
- struct sadb_x_nat_t_port* n_port =
+ const struct sadb_x_nat_t_port *n_port =
ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1];
natt->encap_dport = n_port->sadb_x_nat_t_port_port;
}
@@ -1261,12 +1262,12 @@ out:
return ERR_PTR(err);
}
-static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
return -EOPNOTSUPP;
}
-static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct net *net = sock_net(sk);
struct sk_buff *resp_skb;
@@ -1365,7 +1366,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
return 0;
}
-static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct net *net = sock_net(sk);
struct xfrm_state *x;
@@ -1429,7 +1430,7 @@ static inline int event2keytype(int event)
}
/* ADD/UPD/DEL */
-static int key_notify_sa(struct xfrm_state *x, struct km_event *c)
+static int key_notify_sa(struct xfrm_state *x, const struct km_event *c)
{
struct sk_buff *skb;
struct sadb_msg *hdr;
@@ -1453,7 +1454,7 @@ static int key_notify_sa(struct xfrm_state *x, struct km_event *c)
return 0;
}
-static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_add(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct net *net = sock_net(sk);
struct xfrm_state *x;
@@ -1492,7 +1493,7 @@ out:
return err;
}
-static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_delete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct net *net = sock_net(sk);
struct xfrm_state *x;
@@ -1534,7 +1535,7 @@ out:
return err;
}
-static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct net *net = sock_net(sk);
__u8 proto;
@@ -1570,7 +1571,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
return 0;
}
-static struct sk_buff *compose_sadb_supported(struct sadb_msg *orig,
+static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig,
gfp_t allocation)
{
struct sk_buff *skb;
@@ -1642,7 +1643,7 @@ out_put_algs:
return skb;
}
-static int pfkey_register(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct pfkey_sock *pfk = pfkey_sk(sk);
struct sk_buff *supp_skb;
@@ -1671,7 +1672,7 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, struct sadb_msg
return 0;
}
-static int unicast_flush_resp(struct sock *sk, struct sadb_msg *ihdr)
+static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr)
{
struct sk_buff *skb;
struct sadb_msg *hdr;
@@ -1688,7 +1689,7 @@ static int unicast_flush_resp(struct sock *sk, struct sadb_msg *ihdr)
return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk));
}
-static int key_notify_sa_flush(struct km_event *c)
+static int key_notify_sa_flush(const struct km_event *c)
{
struct sk_buff *skb;
struct sadb_msg *hdr;
@@ -1710,7 +1711,7 @@ static int key_notify_sa_flush(struct km_event *c)
return 0;
}
-static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct net *net = sock_net(sk);
unsigned proto;
@@ -1784,7 +1785,7 @@ static void pfkey_dump_sa_done(struct pfkey_sock *pfk)
xfrm_state_walk_done(&pfk->dump.u.state);
}
-static int pfkey_dump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
u8 proto;
struct pfkey_sock *pfk = pfkey_sk(sk);
@@ -1805,19 +1806,29 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr
return pfkey_do_dump(pfk);
}
-static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct pfkey_sock *pfk = pfkey_sk(sk);
int satype = hdr->sadb_msg_satype;
+ bool reset_errno = false;
if (hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t))) {
- /* XXX we mangle packet... */
- hdr->sadb_msg_errno = 0;
+ reset_errno = true;
if (satype != 0 && satype != 1)
return -EINVAL;
pfk->promisc = satype;
}
- pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk));
+ if (reset_errno && skb_cloned(skb))
+ skb = skb_copy(skb, GFP_KERNEL);
+ else
+ skb = skb_clone(skb, GFP_KERNEL);
+
+ if (reset_errno && skb) {
+ struct sadb_msg *new_hdr = (struct sadb_msg *) skb->data;
+ new_hdr->sadb_msg_errno = 0;
+ }
+
+ pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk));
return 0;
}
@@ -1921,7 +1932,7 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
return 0;
}
-static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp)
+static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp)
{
struct xfrm_sec_ctx *xfrm_ctx = xp->security;
@@ -1933,9 +1944,9 @@ static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp)
return 0;
}
-static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
+static int pfkey_xfrm_policy2msg_size(const struct xfrm_policy *xp)
{
- struct xfrm_tmpl *t;
+ const struct xfrm_tmpl *t;
int sockaddr_size = pfkey_sockaddr_size(xp->family);
int socklen = 0;
int i;
@@ -1955,7 +1966,7 @@ static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
pfkey_xfrm_policy2sec_ctx_size(xp);
}
-static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp)
+static struct sk_buff * pfkey_xfrm_policy2msg_prep(const struct xfrm_policy *xp)
{
struct sk_buff *skb;
int size;
@@ -1969,7 +1980,7 @@ static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp)
return skb;
}
-static int pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, int dir)
+static int pfkey_xfrm_policy2msg(struct sk_buff *skb, const struct xfrm_policy *xp, int dir)
{
struct sadb_msg *hdr;
struct sadb_address *addr;
@@ -2065,8 +2076,8 @@ static int pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, in
pol->sadb_x_policy_priority = xp->priority;
for (i=0; i<xp->xfrm_nr; i++) {
+ const struct xfrm_tmpl *t = xp->xfrm_vec + i;
struct sadb_x_ipsecrequest *rq;
- struct xfrm_tmpl *t = xp->xfrm_vec + i;
int req_size;
int mode;
@@ -2123,7 +2134,7 @@ static int pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, in
return 0;
}
-static int key_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
+static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
struct sk_buff *out_skb;
struct sadb_msg *out_hdr;
@@ -2152,7 +2163,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c
}
-static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct net *net = sock_net(sk);
int err = 0;
@@ -2273,7 +2284,7 @@ out:
return err;
}
-static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct net *net = sock_net(sk);
int err;
@@ -2350,7 +2361,7 @@ out:
return err;
}
-static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, struct sadb_msg *hdr, int dir)
+static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struct sadb_msg *hdr, int dir)
{
int err;
struct sk_buff *out_skb;
@@ -2458,7 +2469,7 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
}
static int pfkey_migrate(struct sock *sk, struct sk_buff *skb,
- struct sadb_msg *hdr, void **ext_hdrs)
+ const struct sadb_msg *hdr, void * const *ext_hdrs)
{
int i, len, ret, err = -EINVAL;
u8 dir;
@@ -2549,14 +2560,14 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb,
}
#else
static int pfkey_migrate(struct sock *sk, struct sk_buff *skb,
- struct sadb_msg *hdr, void **ext_hdrs)
+ const struct sadb_msg *hdr, void * const *ext_hdrs)
{
return -ENOPROTOOPT;
}
#endif
-static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct net *net = sock_net(sk);
unsigned int dir;
@@ -2644,7 +2655,7 @@ static void pfkey_dump_sp_done(struct pfkey_sock *pfk)
xfrm_policy_walk_done(&pfk->dump.u.policy);
}
-static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct pfkey_sock *pfk = pfkey_sk(sk);
@@ -2660,7 +2671,7 @@ static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *
return pfkey_do_dump(pfk);
}
-static int key_notify_policy_flush(struct km_event *c)
+static int key_notify_policy_flush(const struct km_event *c)
{
struct sk_buff *skb_out;
struct sadb_msg *hdr;
@@ -2680,7 +2691,7 @@ static int key_notify_policy_flush(struct km_event *c)
}
-static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
{
struct net *net = sock_net(sk);
struct km_event c;
@@ -2709,7 +2720,7 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg
}
typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb,
- struct sadb_msg *hdr, void **ext_hdrs);
+ const struct sadb_msg *hdr, void * const *ext_hdrs);
static pfkey_handler pfkey_funcs[SADB_MAX + 1] = {
[SADB_RESERVED] = pfkey_reserved,
[SADB_GETSPI] = pfkey_getspi,
@@ -2736,7 +2747,7 @@ static pfkey_handler pfkey_funcs[SADB_MAX + 1] = {
[SADB_X_MIGRATE] = pfkey_migrate,
};
-static int pfkey_process(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr)
+static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr)
{
void *ext_hdrs[SADB_EXT_MAX];
int err;
@@ -2781,7 +2792,8 @@ static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp)
return hdr;
}
-static inline int aalg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d)
+static inline int aalg_tmpl_set(const struct xfrm_tmpl *t,
+ const struct xfrm_algo_desc *d)
{
unsigned int id = d->desc.sadb_alg_id;
@@ -2791,7 +2803,8 @@ static inline int aalg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d)
return (t->aalgos >> id) & 1;
}
-static inline int ealg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d)
+static inline int ealg_tmpl_set(const struct xfrm_tmpl *t,
+ const struct xfrm_algo_desc *d)
{
unsigned int id = d->desc.sadb_alg_id;
@@ -2801,12 +2814,12 @@ static inline int ealg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d)
return (t->ealgos >> id) & 1;
}
-static int count_ah_combs(struct xfrm_tmpl *t)
+static int count_ah_combs(const struct xfrm_tmpl *t)
{
int i, sz = 0;
for (i = 0; ; i++) {
- struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
+ const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
if (!aalg)
break;
if (aalg_tmpl_set(t, aalg) && aalg->available)
@@ -2815,12 +2828,12 @@ static int count_ah_combs(struct xfrm_tmpl *t)
return sz + sizeof(struct sadb_prop);
}
-static int count_esp_combs(struct xfrm_tmpl *t)
+static int count_esp_combs(const struct xfrm_tmpl *t)
{
int i, k, sz = 0;
for (i = 0; ; i++) {
- struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
+ const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
if (!ealg)
break;
@@ -2828,7 +2841,7 @@ static int count_esp_combs(struct xfrm_tmpl *t)
continue;
for (k = 1; ; k++) {
- struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k);
+ const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k);
if (!aalg)
break;
@@ -2839,7 +2852,7 @@ static int count_esp_combs(struct xfrm_tmpl *t)
return sz + sizeof(struct sadb_prop);
}
-static void dump_ah_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
+static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
{
struct sadb_prop *p;
int i;
@@ -2851,7 +2864,7 @@ static void dump_ah_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved));
for (i = 0; ; i++) {
- struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
+ const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
if (!aalg)
break;
@@ -2871,7 +2884,7 @@ static void dump_ah_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
}
}
-static void dump_esp_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
+static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
{
struct sadb_prop *p;
int i, k;
@@ -2883,7 +2896,7 @@ static void dump_esp_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved));
for (i=0; ; i++) {
- struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
+ const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
if (!ealg)
break;
@@ -2892,7 +2905,7 @@ static void dump_esp_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
for (k = 1; ; k++) {
struct sadb_comb *c;
- struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k);
+ const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k);
if (!aalg)
break;
if (!(aalg_tmpl_set(t, aalg) && aalg->available))
@@ -2914,12 +2927,12 @@ static void dump_esp_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
}
}
-static int key_notify_policy_expire(struct xfrm_policy *xp, struct km_event *c)
+static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c)
{
return 0;
}
-static int key_notify_sa_expire(struct xfrm_state *x, struct km_event *c)
+static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c)
{
struct sk_buff *out_skb;
struct sadb_msg *out_hdr;
@@ -2949,7 +2962,7 @@ static int key_notify_sa_expire(struct xfrm_state *x, struct km_event *c)
return 0;
}
-static int pfkey_send_notify(struct xfrm_state *x, struct km_event *c)
+static int pfkey_send_notify(struct xfrm_state *x, const struct km_event *c)
{
struct net *net = x ? xs_net(x) : c->net;
struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
@@ -2976,7 +2989,7 @@ static int pfkey_send_notify(struct xfrm_state *x, struct km_event *c)
return 0;
}
-static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
+static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
if (xp && xp->type != XFRM_POLICY_TYPE_MAIN)
return 0;
@@ -3318,7 +3331,7 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
#ifdef CONFIG_NET_KEY_MIGRATE
static int set_sadb_address(struct sk_buff *skb, int sasize, int type,
- struct xfrm_selector *sel)
+ const struct xfrm_selector *sel)
{
struct sadb_address *addr;
addr = (struct sadb_address *)skb_put(skb, sizeof(struct sadb_address) + sasize);
@@ -3348,7 +3361,7 @@ static int set_sadb_address(struct sk_buff *skb, int sasize, int type,
}
-static int set_sadb_kmaddress(struct sk_buff *skb, struct xfrm_kmaddress *k)
+static int set_sadb_kmaddress(struct sk_buff *skb, const struct xfrm_kmaddress *k)
{
struct sadb_x_kmaddress *kma;
u8 *sa;
@@ -3376,7 +3389,7 @@ static int set_sadb_kmaddress(struct sk_buff *skb, struct xfrm_kmaddress *k)
static int set_ipsecrequest(struct sk_buff *skb,
uint8_t proto, uint8_t mode, int level,
uint32_t reqid, uint8_t family,
- xfrm_address_t *src, xfrm_address_t *dst)
+ const xfrm_address_t *src, const xfrm_address_t *dst)
{
struct sadb_x_ipsecrequest *rq;
u8 *sa;
@@ -3404,9 +3417,9 @@ static int set_ipsecrequest(struct sk_buff *skb,
#endif
#ifdef CONFIG_NET_KEY_MIGRATE
-static int pfkey_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
- struct xfrm_migrate *m, int num_bundles,
- struct xfrm_kmaddress *k)
+static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+ const struct xfrm_migrate *m, int num_bundles,
+ const struct xfrm_kmaddress *k)
{
int i;
int sasize_sel;
@@ -3415,7 +3428,7 @@ static int pfkey_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
struct sk_buff *skb;
struct sadb_msg *hdr;
struct sadb_x_policy *pol;
- struct xfrm_migrate *mp;
+ const struct xfrm_migrate *mp;
if (type != XFRM_POLICY_TYPE_MAIN)
return 0;
@@ -3513,9 +3526,9 @@ err:
return -EINVAL;
}
#else
-static int pfkey_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
- struct xfrm_migrate *m, int num_bundles,
- struct xfrm_kmaddress *k)
+static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+ const struct xfrm_migrate *m, int num_bundles,
+ const struct xfrm_kmaddress *k)
{
return -ENOPROTOOPT;
}
@@ -3655,6 +3668,7 @@ static int pfkey_seq_show(struct seq_file *f, void *v)
}
static void *pfkey_seq_start(struct seq_file *f, loff_t *ppos)
+ __acquires(rcu)
{
struct net *net = seq_file_net(f);
struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
@@ -3672,6 +3686,7 @@ static void *pfkey_seq_next(struct seq_file *f, void *v, loff_t *ppos)
}
static void pfkey_seq_stop(struct seq_file *f, void *v)
+ __releases(rcu)
{
rcu_read_unlock();
}
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 110efb704c9..fce9bd3bd3f 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -320,11 +320,12 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
if (ipv4_is_multicast(lsa->l2tp_addr.s_addr))
goto out;
- rc = ip_route_connect(&rt, lsa->l2tp_addr.s_addr, saddr,
+ rt = ip_route_connect(lsa->l2tp_addr.s_addr, saddr,
RT_CONN_FLAGS(sk), oif,
IPPROTO_L2TP,
- 0, 0, sk, 1);
- if (rc) {
+ 0, 0, sk, true);
+ if (IS_ERR(rt)) {
+ rc = PTR_ERR(rt);
if (rc == -ENETUNREACH)
IP_INC_STATS_BH(&init_net, IPSTATS_MIB_OUTNOROUTES);
goto out;
@@ -474,24 +475,17 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
if (opt && opt->srr)
daddr = opt->faddr;
- {
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .fl4_dst = daddr,
- .fl4_src = inet->inet_saddr,
- .fl4_tos = RT_CONN_FLAGS(sk),
- .proto = sk->sk_protocol,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = inet->inet_sport,
- .fl_ip_dport = inet->inet_dport };
-
- /* If this fails, retransmit mechanism of transport layer will
- * keep trying until route appears or the connection times
- * itself out.
- */
- security_sk_classify_flow(sk, &fl);
- if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
- goto no_route;
- }
+ /* If this fails, retransmit mechanism of transport layer will
+ * keep trying until route appears or the connection times
+ * itself out.
+ */
+ rt = ip_route_output_ports(sock_net(sk), sk,
+ daddr, inet->inet_saddr,
+ inet->inet_dport, inet->inet_sport,
+ sk->sk_protocol, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (IS_ERR(rt))
+ goto no_route;
sk_setup_caps(sk, &rt->dst);
}
skb_dst_set(skb, dst_clone(&rt->dst));
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
index f9968743913..058f1e9a912 100644
--- a/net/llc/llc_input.c
+++ b/net/llc/llc_input.c
@@ -181,25 +181,26 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
* LLC functionality
*/
rcv = rcu_dereference(sap->rcv_func);
- if (rcv) {
- struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC);
- if (cskb)
- rcv(cskb, dev, pt, orig_dev);
- }
dest = llc_pdu_type(skb);
- if (unlikely(!dest || !llc_type_handlers[dest - 1]))
- goto drop_put;
- llc_type_handlers[dest - 1](sap, skb);
-out_put:
+ if (unlikely(!dest || !llc_type_handlers[dest - 1])) {
+ if (rcv)
+ rcv(skb, dev, pt, orig_dev);
+ else
+ kfree_skb(skb);
+ } else {
+ if (rcv) {
+ struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC);
+ if (cskb)
+ rcv(cskb, dev, pt, orig_dev);
+ }
+ llc_type_handlers[dest - 1](sap, skb);
+ }
llc_sap_put(sap);
out:
return 0;
drop:
kfree_skb(skb);
goto out;
-drop_put:
- kfree_skb(skb);
- goto out_put;
handle_station:
if (!llc_station_handler)
goto drop;
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 841dd1e2909..513f85cc2ae 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -20,7 +20,7 @@ config MAC80211_HAS_RC
bool
config MAC80211_RC_PID
- bool "PID controller based rate control algorithm" if EMBEDDED
+ bool "PID controller based rate control algorithm" if EXPERT
select MAC80211_HAS_RC
---help---
This option enables a TX rate control algorithm for
@@ -28,14 +28,14 @@ config MAC80211_RC_PID
rate.
config MAC80211_RC_MINSTREL
- bool "Minstrel" if EMBEDDED
+ bool "Minstrel" if EXPERT
select MAC80211_HAS_RC
default y
---help---
This option enables the 'minstrel' TX rate control algorithm
config MAC80211_RC_MINSTREL_HT
- bool "Minstrel 802.11n support" if EMBEDDED
+ bool "Minstrel 802.11n support" if EXPERT
depends on MAC80211_RC_MINSTREL
default y
---help---
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 5a4e19b8803..4054399be90 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1236,6 +1236,7 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local)
}
mutex_unlock(&local->iflist_mtx);
unregister_netdevice_many(&unreg_list);
+ list_del(&unreg_list);
}
static u32 ieee80211_idle_off(struct ieee80211_local *local,
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 1534f2b44ca..82a6e0d80f0 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -85,6 +85,17 @@ config NF_CONNTRACK_EVENTS
If unsure, say `N'.
+config NF_CONNTRACK_TIMESTAMP
+ bool 'Connection tracking timestamping'
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection tracking timestamping.
+ This allows you to store the flow start-time and to obtain
+ the flow-stop time (once it has been destroyed) via Connection
+ tracking events.
+
+ If unsure, say `N'.
+
config NF_CT_PROTO_DCCP
tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)'
depends on EXPERIMENTAL
@@ -185,9 +196,13 @@ config NF_CONNTRACK_IRC
To compile it as a module, choose M here. If unsure, say N.
+config NF_CONNTRACK_BROADCAST
+ tristate
+
config NF_CONNTRACK_NETBIOS_NS
tristate "NetBIOS name service protocol support"
depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_BROADCAST
help
NetBIOS name service requests are sent as broadcast messages from an
unprivileged port and responded to with unicast messages to the
@@ -204,6 +219,21 @@ config NF_CONNTRACK_NETBIOS_NS
To compile it as a module, choose M here. If unsure, say N.
+config NF_CONNTRACK_SNMP
+ tristate "SNMP service protocol support"
+ depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_BROADCAST
+ help
+ SNMP service requests are sent as broadcast messages from an
+ unprivileged port and responded to with unicast messages to the
+ same port. This make them hard to firewall properly because connection
+ tracking doesn't deal with broadcasts. This helper tracks locally
+ originating SNMP service requests and the corresponding
+ responses. It relies on correct IP address configuration, specifically
+ netmask and broadcast address.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NF_CONNTRACK_PPTP
tristate "PPtP protocol support"
depends on NETFILTER_ADVANCED
@@ -322,10 +352,32 @@ config NETFILTER_XT_CONNMARK
ctmark), similarly to the packet mark (nfmark). Using this
target and match, you can set and match on this mark.
+config NETFILTER_XT_SET
+ tristate 'set target and match support'
+ depends on IP_SET
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds the "SET" target and "set" match.
+
+ Using this target and match, you can add/delete and match
+ elements in the sets created by ipset(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# alphabetically ordered list of targets
comment "Xtables targets"
+config NETFILTER_XT_TARGET_AUDIT
+ tristate "AUDIT target support"
+ depends on AUDIT
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds an 'AUDIT' target, which can be used to create
+ audit records for packets dropped/accepted.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_CHECKSUM
tristate "CHECKSUM target support"
depends on IP_NF_MANGLE || IP6_NF_MANGLE
@@ -477,6 +529,7 @@ config NETFILTER_XT_TARGET_NFLOG
config NETFILTER_XT_TARGET_NFQUEUE
tristate '"NFQUEUE" target Support'
depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_QUEUE
help
This target replaced the old obsolete QUEUE target.
@@ -685,6 +738,15 @@ config NETFILTER_XT_MATCH_DCCP
If you want to compile it as a module, say M here and read
<file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+config NETFILTER_XT_MATCH_DEVGROUP
+ tristate '"devgroup" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `devgroup' match, which allows you to match on the
+ device group a network device is assigned to.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_MATCH_DSCP
tristate '"dscp" and "tos" match support'
depends on NETFILTER_ADVANCED
@@ -886,7 +948,7 @@ config NETFILTER_XT_MATCH_RATEEST
config NETFILTER_XT_MATCH_REALM
tristate '"realm" match support'
depends on NETFILTER_ADVANCED
- select NET_CLS_ROUTE
+ select IP_ROUTE_CLASSID
help
This option adds a `realm' match, which allows you to use the realm
key from the routing subsystem inside iptables.
@@ -1011,4 +1073,6 @@ endif # NETFILTER_XTABLES
endmenu
+source "net/netfilter/ipset/Kconfig"
+
source "net/netfilter/ipvs/Kconfig"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 441050f3111..d57a890eaee 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,6 +1,7 @@
netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
obj-$(CONFIG_NETFILTER) = netfilter.o
@@ -28,7 +29,9 @@ obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o
obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o
obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o
+obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o
obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o
+obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o
obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o
obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o
obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o
@@ -43,8 +46,10 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
# combos
obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
+obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o
# targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
@@ -72,6 +77,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o
obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DEVGROUP) += xt_devgroup.o
obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o
@@ -101,5 +107,8 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
+# ipset
+obj-$(CONFIG_IP_SET) += ipset/
+
# IPVS
obj-$(CONFIG_IP_VS) += ipvs/
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 32fcbe290c0..899b71c0ff5 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -133,6 +133,7 @@ unsigned int nf_iterate(struct list_head *head,
/* Optimization: we don't need to hold module
reference here, since function can't sleep. --RR */
+repeat:
verdict = elem->hook(hook, skb, indev, outdev, okfn);
if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
@@ -145,7 +146,7 @@ unsigned int nf_iterate(struct list_head *head,
#endif
if (verdict != NF_REPEAT)
return verdict;
- *i = (*i)->prev;
+ goto repeat;
}
}
return NF_ACCEPT;
@@ -175,13 +176,21 @@ next_hook:
ret = 1;
} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
kfree_skb(skb);
- ret = -(verdict >> NF_VERDICT_BITS);
+ ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
- if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
- verdict >> NF_VERDICT_BITS))
- goto next_hook;
+ ret = nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
+ verdict >> NF_VERDICT_QBITS);
+ if (ret < 0) {
+ if (ret == -ECANCELED)
+ goto next_hook;
+ if (ret == -ESRCH &&
+ (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+ goto next_hook;
+ kfree_skb(skb);
+ }
+ ret = 0;
}
rcu_read_unlock();
return ret;
@@ -214,7 +223,7 @@ EXPORT_SYMBOL(skb_make_writable);
/* This does not belong here, but locally generated errors need it if connection
tracking in use: without this, connection may not be in hash table, and hence
manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
+void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly;
EXPORT_SYMBOL(ip_ct_attach);
void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
@@ -231,7 +240,7 @@ void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
}
EXPORT_SYMBOL(nf_ct_attach);
-void (*nf_ct_destroy)(struct nf_conntrack *);
+void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
EXPORT_SYMBOL(nf_ct_destroy);
void nf_conntrack_destroy(struct nf_conntrack *nfct)
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
new file mode 100644
index 00000000000..2c5b348eb3a
--- /dev/null
+++ b/net/netfilter/ipset/Kconfig
@@ -0,0 +1,122 @@
+menuconfig IP_SET
+ tristate "IP set support"
+ depends on INET && NETFILTER
+ depends on NETFILTER_NETLINK
+ help
+ This option adds IP set support to the kernel.
+ In order to define and use the sets, you need the userspace utility
+ ipset(8). You can use the sets in netfilter via the "set" match
+ and "SET" target.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP_SET
+
+config IP_SET_MAX
+ int "Maximum number of IP sets"
+ default 256
+ range 2 65534
+ depends on IP_SET
+ help
+ You can define here the default value of the maximum number
+ of IP sets for the kernel.
+
+ The value can be overridden by the 'max_sets' module
+ parameter of the 'ip_set' module.
+
+config IP_SET_BITMAP_IP
+ tristate "bitmap:ip set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:ip set type support, by which one
+ can store IPv4 addresses (or network addresses) from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_BITMAP_IPMAC
+ tristate "bitmap:ip,mac set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:ip,mac set type support, by which one
+ can store IPv4 address and (source) MAC address pairs from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_BITMAP_PORT
+ tristate "bitmap:port set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:port set type support, by which one
+ can store TCP/UDP port numbers from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IP
+ tristate "hash:ip set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip set type support, by which one
+ can store arbitrary IPv4 or IPv6 addresses (or network addresses)
+ in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORT
+ tristate "hash:ip,port set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port set type support, by which one
+ can store IPv4/IPv6 address and protocol/port pairs.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORTIP
+ tristate "hash:ip,port,ip set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port,ip set type support, by which
+ one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+ address triples in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORTNET
+ tristate "hash:ip,port,net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port,net set type support, by which
+ one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+ network address/prefix triples in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NET
+ tristate "hash:net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net set type support, by which
+ one can store IPv4/IPv6 network address/prefix elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETPORT
+ tristate "hash:net,port set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,port set type support, by which
+ one can store IPv4/IPv6 network address/prefix and
+ protocol/port pairs as elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_LIST_SET
+ tristate "list:set set support"
+ depends on IP_SET
+ help
+ This option adds the list:set set type support. In this
+ kind of set one can store the names of other sets, and it forms
+ an ordered union of the member sets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+endif # IP_SET
diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile
new file mode 100644
index 00000000000..5adbdab67bd
--- /dev/null
+++ b/net/netfilter/ipset/Makefile
@@ -0,0 +1,24 @@
+#
+# Makefile for the ipset modules
+#
+
+ip_set-y := ip_set_core.o ip_set_getport.o pfxlen.o
+
+# ipset core
+obj-$(CONFIG_IP_SET) += ip_set.o
+
+# bitmap types
+obj-$(CONFIG_IP_SET_BITMAP_IP) += ip_set_bitmap_ip.o
+obj-$(CONFIG_IP_SET_BITMAP_IPMAC) += ip_set_bitmap_ipmac.o
+obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o
+
+# hash types
+obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o
+obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o
+obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o
+obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o
+
+# list types
+obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
new file mode 100644
index 00000000000..bca96990218
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -0,0 +1,587 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+#define IP_SET_BITMAP_TIMEOUT
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("bitmap:ip type of IP sets");
+MODULE_ALIAS("ip_set_bitmap:ip");
+
+/* Type structure: private data of a bitmap:ip set (stored in set->data) */
+struct bitmap_ip {
+ void *members; /* the set members: a bit array, or unsigned long timeouts in the timeout variant */
+ u32 first_ip; /* host byte order, included in range */
+ u32 last_ip; /* host byte order, included in range */
+ u32 elements; /* maximum number of elements in the set */
+ u32 hosts; /* number of hosts in a subnet (1 when netmask is 32) */
+ size_t memsize; /* size of the members area in bytes */
+ u8 netmask; /* subnet netmask as a prefix length (32 by default) */
+ u32 timeout; /* timeout parameter, IPSET_NO_TIMEOUT when not used */
+ struct timer_list gc; /* garbage collection timer (timeout variant only) */
+};
+
+/* Base variant */
+
+static inline u32 /* map a host-order IP to its element index in the set */
+ip_to_id(const struct bitmap_ip *m, u32 ip)
+{
+ return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; /* u32 math: wraps if the masked ip is below first_ip */
+}
+
+static int /* IPSET_TEST for the plain bitmap: 1 if the id's bit is set, else 0 */
+bitmap_ip_test(struct ip_set *set, void *value, u32 timeout)
+{
+ const struct bitmap_ip *map = set->data;
+ u32 id = *(u32 *)value; /* kadt/uadt pass a pointer to u32; a u16 load reads the wrong half on big-endian */
+
+ return !!test_bit(id, map->members);
+}
+
+static int /* IPSET_ADD for the plain bitmap: set the id's bit */
+bitmap_ip_add(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ip *map = set->data;
+ u32 id = *(u32 *)value; /* kadt/uadt pass a pointer to u32; a u16 load reads the wrong half on big-endian */
+
+ if (test_and_set_bit(id, map->members)) /* bit was already set */
+ return -IPSET_ERR_EXIST;
+
+ return 0;
+}
+
+static int /* IPSET_DEL for the plain bitmap: clear the id's bit */
+bitmap_ip_del(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ip *map = set->data;
+ u32 id = *(u32 *)value; /* kadt/uadt pass a pointer to u32; a u16 load reads the wrong half on big-endian */
+
+ if (!test_and_clear_bit(id, map->members)) /* bit was not set */
+ return -IPSET_ERR_EXIST;
+
+ return 0;
+}
+
+static int
+bitmap_ip_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_ip *map = set->data;
+ struct nlattr *atd, *nested;
+ u32 id, first = cb->args[2];
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[2] < map->elements; cb->args[2]++) {
+ id = cb->args[2];
+ if (!test_bit(id, map->members))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id * map->hosts));
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, atd);
+ if (unlikely(id == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+/* Timeout variant */
+
+static int /* IPSET_TEST for the timeout variant: ip_set_timeout_test() on the id's slot */
+bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout)
+{
+ const struct bitmap_ip *map = set->data;
+ const unsigned long *members = map->members; /* one unsigned long timeout slot per element */
+ u32 id = *(u32 *)value; /* kadt/uadt pass a pointer to u32; a u16 load reads the wrong half on big-endian */
+
+ return ip_set_timeout_test(members[id]);
+}
+
+static int /* IPSET_ADD for the timeout variant: store the timeout in the id's slot */
+bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ip *map = set->data;
+ unsigned long *members = map->members; /* one unsigned long timeout slot per element */
+ u32 id = *(u32 *)value; /* kadt/uadt pass a pointer to u32; a u16 load reads the wrong half on big-endian */
+
+ if (ip_set_timeout_test(members[id])) /* slot still tests as set */
+ return -IPSET_ERR_EXIST;
+
+ members[id] = ip_set_timeout_set(timeout);
+
+ return 0;
+}
+
+static int /* IPSET_DEL for the timeout variant: unset the id's slot */
+bitmap_ip_tdel(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ip *map = set->data;
+ unsigned long *members = map->members; /* one unsigned long timeout slot per element */
+ u32 id = *(u32 *)value; /* kadt/uadt pass a pointer to u32; a u16 load reads the wrong half on big-endian */
+ int ret = -IPSET_ERR_EXIST;
+
+ if (ip_set_timeout_test(members[id]))
+ ret = 0;
+
+ members[id] = IPSET_ELEM_UNSET; /* the slot is unset whether or not it tested as set */
+ return ret;
+}
+
+static int
+bitmap_ip_tlist(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_ip *map = set->data;
+ struct nlattr *adt, *nested;
+ u32 id, first = cb->args[2];
+ const unsigned long *members = map->members;
+
+ adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!adt)
+ return -EMSGSIZE;
+ for (; cb->args[2] < map->elements; cb->args[2]++) {
+ id = cb->args[2];
+ if (!ip_set_timeout_test(members[id]))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, adt);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id * map->hosts));
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(members[id])));
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, adt);
+
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, adt);
+ if (unlikely(id == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+static int
+bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ struct bitmap_ip *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ u32 ip;
+
+ ip = ntohl(ip4addr(skb, flags & IPSET_DIM_ONE_SRC));
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ ip = ip_to_id(map, ip);
+
+ return adtfn(set, &ip, map->timeout);
+}
+
+static int
+bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ struct bitmap_ip *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ u32 timeout = map->timeout;
+ u32 ip, ip_to, id;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(map->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ if (adt == IPSET_TEST) {
+ id = ip_to_id(map, ip);
+ return adtfn(set, &id, timeout);
+ }
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to) {
+ swap(ip, ip_to);
+ if (ip < map->first_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+ }
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip &= ip_set_hostmask(cidr);
+ ip_to = ip | ~ip_set_hostmask(cidr);
+ } else
+ ip_to = ip;
+
+ if (ip_to > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ for (; !before(ip_to, ip); ip += map->hosts) {
+ id = ip_to_id(map, ip);
+ ret = adtfn(set, &id, timeout);;
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static void
+bitmap_ip_destroy(struct ip_set *set)
+{
+ struct bitmap_ip *map = set->data;
+
+ if (with_timeout(map->timeout))
+ del_timer_sync(&map->gc);
+
+ ip_set_free(map->members);
+ kfree(map);
+
+ set->data = NULL;
+}
+
+static void
+bitmap_ip_flush(struct ip_set *set)
+{
+ struct bitmap_ip *map = set->data;
+
+ memset(map->members, 0, map->memsize);
+}
+
+static int
+bitmap_ip_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct bitmap_ip *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip));
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip));
+ if (map->netmask != 32)
+ NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, map->netmask);
+ NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
+ htonl(atomic_read(&set->ref) - 1));
+ NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map) + map->memsize));
+ if (with_timeout(map->timeout))
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static bool
+bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct bitmap_ip *x = a->data;
+ const struct bitmap_ip *y = b->data;
+
+ return x->first_ip == y->first_ip &&
+ x->last_ip == y->last_ip &&
+ x->netmask == y->netmask &&
+ x->timeout == y->timeout;
+}
+
+static const struct ip_set_type_variant bitmap_ip = {
+ .kadt = bitmap_ip_kadt,
+ .uadt = bitmap_ip_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_ip_add,
+ [IPSET_DEL] = bitmap_ip_del,
+ [IPSET_TEST] = bitmap_ip_test,
+ },
+ .destroy = bitmap_ip_destroy,
+ .flush = bitmap_ip_flush,
+ .head = bitmap_ip_head,
+ .list = bitmap_ip_list,
+ .same_set = bitmap_ip_same_set,
+};
+
+static const struct ip_set_type_variant bitmap_tip = {
+ .kadt = bitmap_ip_kadt,
+ .uadt = bitmap_ip_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_ip_tadd,
+ [IPSET_DEL] = bitmap_ip_tdel,
+ [IPSET_TEST] = bitmap_ip_ttest,
+ },
+ .destroy = bitmap_ip_destroy,
+ .flush = bitmap_ip_flush,
+ .head = bitmap_ip_head,
+ .list = bitmap_ip_tlist,
+ .same_set = bitmap_ip_same_set,
+};
+
+static void
+bitmap_ip_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct bitmap_ip *map = set->data;
+ unsigned long *table = map->members;
+ u32 id;
+
+ /* We run parallel with other readers (test element)
+ * but adding/deleting new entries is locked out */
+ read_lock_bh(&set->lock);
+ for (id = 0; id < map->elements; id++)
+ if (ip_set_timeout_expired(table[id]))
+ table[id] = IPSET_ELEM_UNSET;
+ read_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static void
+bitmap_ip_gc_init(struct ip_set *set)
+{
+ struct bitmap_ip *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = bitmap_ip_gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+/* Create bitmap:ip type of sets */
+
+static bool
+init_map_ip(struct ip_set *set, struct bitmap_ip *map,
+ u32 first_ip, u32 last_ip,
+ u32 elements, u32 hosts, u8 netmask)
+{
+ map->members = ip_set_alloc(map->memsize);
+ if (!map->members)
+ return false;
+ map->first_ip = first_ip;
+ map->last_ip = last_ip;
+ map->elements = elements;
+ map->hosts = hosts;
+ map->netmask = netmask;
+ map->timeout = IPSET_NO_TIMEOUT;
+
+ set->data = map;
+ set->family = AF_INET;
+
+ return true;
+}
+
+static int
+bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ struct bitmap_ip *map;
+ u32 first_ip, last_ip, hosts, elements;
+ u8 netmask = 32;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+ if (ret)
+ return ret;
+ if (first_ip > last_ip) {
+ u32 tmp = first_ip;
+
+ first_ip = last_ip;
+ last_ip = tmp;
+ }
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr >= 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ last_ip = first_ip | ~ip_set_hostmask(cidr);
+ } else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_NETMASK]) {
+ netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+ if (netmask > 32)
+ return -IPSET_ERR_INVALID_NETMASK;
+
+ first_ip &= ip_set_hostmask(netmask);
+ last_ip |= ~ip_set_hostmask(netmask);
+ }
+
+ if (netmask == 32) {
+ hosts = 1;
+ elements = last_ip - first_ip + 1;
+ } else {
+ u8 mask_bits;
+ u32 mask;
+
+ mask = range_to_mask(first_ip, last_ip, &mask_bits);
+
+ if ((!mask && (first_ip || last_ip != 0xFFFFFFFF)) ||
+ netmask <= mask_bits)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask);
+ hosts = 2 << (32 - netmask - 1);
+ elements = 2 << (netmask - mask_bits - 1);
+ }
+ if (elements > IPSET_BITMAP_MAX_RANGE + 1)
+ return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+ pr_debug("hosts %u, elements %u\n", hosts, elements);
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ map->memsize = elements * sizeof(unsigned long);
+
+ if (!init_map_ip(set, map, first_ip, last_ip,
+ elements, hosts, netmask)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+
+ map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ set->variant = &bitmap_tip;
+
+ bitmap_ip_gc_init(set);
+ } else {
+ map->memsize = bitmap_bytes(0, elements - 1);
+
+ if (!init_map_ip(set, map, first_ip, last_ip,
+ elements, hosts, netmask)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+
+ set->variant = &bitmap_ip;
+ }
+ return 0;
+}
+
+static struct ip_set_type bitmap_ip_type __read_mostly = {
+ .name = "bitmap:ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP,
+ .dimension = IPSET_DIM_ONE,
+ .family = AF_INET,
+ .revision = 0,
+ .create = bitmap_ip_create,
+ .create_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+bitmap_ip_init(void)
+{
+ return ip_set_type_register(&bitmap_ip_type);
+}
+
+static void __exit
+bitmap_ip_fini(void)
+{
+ ip_set_type_unregister(&bitmap_ip_type);
+}
+
+module_init(bitmap_ip_init);
+module_exit(bitmap_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
new file mode 100644
index 00000000000..5e790172def
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -0,0 +1,652 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Martin Josefsson <gandalf@wlug.westbo.se>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip,mac type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("bitmap:ip,mac type of IP sets");
+MODULE_ALIAS("ip_set_bitmap:ip,mac");
+
+enum {
+ MAC_EMPTY, /* element is not set */
+ MAC_FILLED, /* element is set with MAC */
+ MAC_UNSET, /* element is set, without MAC */
+};
+
+/* Type structure: private data of a bitmap:ip,mac set (stored in set->data) */
+struct bitmap_ipmac {
+ void *members; /* the set members: array of elements, dsize bytes each */
+ u32 first_ip; /* host byte order, included in range */
+ u32 last_ip; /* host byte order, included in range */
+ u32 timeout; /* timeout value */
+ struct timer_list gc; /* garbage collector timer */
+ size_t dsize; /* size of one element in bytes (stride into members) */
+};
+
+/* ADT structure for generic function args */
+struct ipmac {
+ u32 id; /* id in array */
+ unsigned char *ether; /* ethernet address */
+};
+
+/* Member element without and with timeout */
+
+struct ipmac_elem {
+ unsigned char ether[ETH_ALEN];
+ unsigned char match;
+} __attribute__ ((aligned));
+
+struct ipmac_telem {
+ unsigned char ether[ETH_ALEN];
+ unsigned char match;
+ unsigned long timeout;
+} __attribute__ ((aligned));
+
+static inline void * /* address of element id inside map->members */
+bitmap_ipmac_elem(const struct bitmap_ipmac *map, u32 id)
+{
+ return (void *)((char *)map->members + id * map->dsize); /* byte arithmetic: elements are dsize bytes apart */
+}
+
+static inline bool
+bitmap_timeout(const struct bitmap_ipmac *map, u32 id)
+{
+ const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id);
+
+ return ip_set_timeout_test(elem->timeout);
+}
+
+static inline bool
+bitmap_expired(const struct bitmap_ipmac *map, u32 id)
+{
+ const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id);
+
+ return ip_set_timeout_expired(elem->timeout);
+}
+
+static inline int
+bitmap_ipmac_exist(const struct ipmac_telem *elem)
+{
+ return elem->match == MAC_UNSET ||
+ (elem->match == MAC_FILLED &&
+ !ip_set_timeout_expired(elem->timeout));
+}
+
+/* Base variant */
+
+static int
+bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout)
+{
+ const struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+ switch (elem->match) {
+ case MAC_UNSET:
+ /* Trigger kernel to fill out the ethernet address */
+ return -EAGAIN;
+ case MAC_FILLED:
+ return data->ether == NULL ||
+ compare_ether_addr(data->ether, elem->ether) == 0;
+ }
+ return 0;
+}
+
+static int
+bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+ switch (elem->match) {
+ case MAC_UNSET:
+ if (!data->ether)
+ /* Already added without ethernet address */
+ return -IPSET_ERR_EXIST;
+ /* Fill the MAC address */
+ memcpy(elem->ether, data->ether, ETH_ALEN);
+ elem->match = MAC_FILLED;
+ break;
+ case MAC_FILLED:
+ return -IPSET_ERR_EXIST;
+ case MAC_EMPTY:
+ if (data->ether) {
+ memcpy(elem->ether, data->ether, ETH_ALEN);
+ elem->match = MAC_FILLED;
+ } else
+ elem->match = MAC_UNSET;
+ }
+
+ return 0;
+}
+
+static int
+bitmap_ipmac_del(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+ if (elem->match == MAC_EMPTY)
+ return -IPSET_ERR_EXIST;
+
+ elem->match = MAC_EMPTY;
+
+ return 0;
+}
+
+static int
+bitmap_ipmac_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_ipmac *map = set->data;
+ const struct ipmac_elem *elem;
+ struct nlattr *atd, *nested;
+ u32 id, first = cb->args[2];
+ u32 last = map->last_ip - map->first_ip;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[2] <= last; cb->args[2]++) {
+ id = cb->args[2];
+ elem = bitmap_ipmac_elem(map, id);
+ if (elem->match == MAC_EMPTY)
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id));
+ if (elem->match == MAC_FILLED)
+ NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN,
+ elem->ether);
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, atd);
+ if (unlikely(id == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+/* Timeout variant */
+
+static int
+bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout)
+{
+ const struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+ switch (elem->match) {
+ case MAC_UNSET:
+ /* Trigger kernel to fill out the ethernet address */
+ return -EAGAIN;
+ case MAC_FILLED:
+ return (data->ether == NULL ||
+ compare_ether_addr(data->ether, elem->ether) == 0) &&
+ !bitmap_expired(map, data->id);
+ }
+ return 0;
+}
+
+static int
+bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id);
+
+ switch (elem->match) {
+ case MAC_UNSET:
+ if (!data->ether)
+ /* Already added without ethernet address */
+ return -IPSET_ERR_EXIST;
+ /* Fill the MAC address and activate the timer */
+ memcpy(elem->ether, data->ether, ETH_ALEN);
+ elem->match = MAC_FILLED;
+ if (timeout == map->timeout)
+ /* Timeout was not specified, get stored one */
+ timeout = elem->timeout;
+ elem->timeout = ip_set_timeout_set(timeout);
+ break;
+ case MAC_FILLED:
+ if (!bitmap_expired(map, data->id))
+ return -IPSET_ERR_EXIST;
+ /* Fall through */
+ case MAC_EMPTY:
+ if (data->ether) {
+ memcpy(elem->ether, data->ether, ETH_ALEN);
+ elem->match = MAC_FILLED;
+ } else
+ elem->match = MAC_UNSET;
+ /* If MAC is unset yet, we store plain timeout value
+ * because the timer is not activated yet
+ * and we can reuse it later when MAC is filled out,
+ * possibly by the kernel */
+ elem->timeout = data->ether ? ip_set_timeout_set(timeout)
+ : timeout;
+ break;
+ }
+
+ return 0;
+}
+
+static int /* IPSET_DEL for the timeout variant: mark the element MAC_EMPTY */
+bitmap_ipmac_tdel(struct ip_set *set, void *value, u32 timeout)
+{
+ struct bitmap_ipmac *map = set->data;
+ const struct ipmac *data = value;
+ struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id);
+
+ if (elem->match == MAC_EMPTY || bitmap_expired(map, data->id)) /* NOTE(review): tadd stores a plain timeout value for MAC_UNSET elements; confirm bitmap_expired() is meaningful for them */
+ return -IPSET_ERR_EXIST;
+
+ elem->match = MAC_EMPTY; /* only the state is reset; ether and timeout are left as they were */
+
+ return 0;
+}
+
+static int /* List the existing elements (IP, optional MAC, timeout) into skb, resuming from cb->args[2] */
+bitmap_ipmac_tlist(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_ipmac *map = set->data;
+ const struct ipmac_telem *elem;
+ struct nlattr *atd, *nested;
+ u32 id, first = cb->args[2]; /* first: the id this call started from */
+ u32 timeout, last = map->last_ip - map->first_ip; /* last: highest valid element id */
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[2] <= last; cb->args[2]++) {
+ id = cb->args[2];
+ elem = bitmap_ipmac_elem(map, id);
+ if (!bitmap_ipmac_exist(elem))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) { /* not even the first element fits */
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id));
+ if (elem->match == MAC_FILLED)
+ NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN,
+ elem->ether);
+ timeout = elem->match == MAC_UNSET ? elem->timeout
+ : ip_set_timeout_get(elem->timeout); /* MAC_UNSET elements hold the plain timeout value */
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(timeout));
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, atd);
+ return -EMSGSIZE; /* NOTE(review): bitmap_ipmac_list returns 0 here unless id == first; confirm the difference is intended */
+}
+
+static int
+bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ struct bitmap_ipmac *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct ipmac data;
+
+ data.id = ntohl(ip4addr(skb, flags & IPSET_DIM_ONE_SRC));
+ if (data.id < map->first_ip || data.id > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ /* Backward compatibility: we don't check the second flag */
+ if (skb_mac_header(skb) < skb->head ||
+ (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+ return -EINVAL;
+
+ data.id -= map->first_ip;
+ data.ether = eth_hdr(skb)->h_source;
+
+ return adtfn(set, &data, map->timeout);
+}
+
+static int
+bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct bitmap_ipmac *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct ipmac data;
+ u32 timeout = map->timeout;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &data.id);
+ if (ret)
+ return ret;
+
+ if (data.id < map->first_ip || data.id > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ if (tb[IPSET_ATTR_ETHER])
+ data.ether = nla_data(tb[IPSET_ATTR_ETHER]);
+ else
+ data.ether = NULL;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(map->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ data.id -= map->first_ip;
+
+ ret = adtfn(set, &data, timeout);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+/* Free the private data of a bitmap:ip,mac set.
+ * The gc timer is stopped first when with_timeout() holds for the set.
+ */
+static void
+bitmap_ipmac_destroy(struct ip_set *set)
+{
+	struct bitmap_ipmac *priv = set->data;
+
+	if (with_timeout(priv->timeout))
+		del_timer_sync(&priv->gc);
+
+	ip_set_free(priv->members);
+	kfree(priv);
+	set->data = NULL;
+}
+
+/* Remove every element by zeroing the whole member array. */
+static void
+bitmap_ipmac_flush(struct ip_set *set)
+{
+	struct bitmap_ipmac *priv = set->data;
+	size_t bytes = (priv->last_ip - priv->first_ip + 1) * priv->dsize;
+
+	memset(priv->members, 0, bytes);
+}
+
+/* Dump the set header (range, reference count, memory size and,
+ * for timeout sets, the default timeout) into @skb as a nested
+ * IPSET_ATTR_DATA attribute.
+ *
+ * Returns 0 on success, -EMSGSIZE when the nest cannot be opened or
+ * control reaches nla_put_failure (presumably via the NLA_PUT_*
+ * macros when the skb is full - confirm against their definition).
+ */
+static int
+bitmap_ipmac_head(struct ip_set *set, struct sk_buff *skb)
+{
+	const struct bitmap_ipmac *map = set->data;
+	struct nlattr *data_nest;
+	size_t memsize;
+
+	data_nest = ipset_nest_start(skb, IPSET_ATTR_DATA);
+	if (!data_nest)
+		return -EMSGSIZE;
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip));
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip));
+	/* One reference is subtracted before reporting */
+	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
+		      htonl(atomic_read(&set->ref) - 1));
+	memsize = sizeof(*map)
+		  + (map->last_ip - map->first_ip + 1) * map->dsize;
+	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize));
+	if (with_timeout(map->timeout))
+		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+	ipset_nest_end(skb, data_nest);
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+/* Two bitmap:ip,mac sets are "the same" when they cover the same
+ * IP range and use the same timeout value.
+ */
+static bool
+bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct bitmap_ipmac *lhs = a->data;
+	const struct bitmap_ipmac *rhs = b->data;
+
+	if (lhs->first_ip != rhs->first_ip)
+		return false;
+	if (lhs->last_ip != rhs->last_ip)
+		return false;
+	return lhs->timeout == rhs->timeout;
+}
+
+/* Method table for sets created without a timeout:
+ * bitmap_ipmac_create() installs it when IPSET_ATTR_TIMEOUT is absent.
+ */
+static const struct ip_set_type_variant bitmap_ipmac = {
+ .kadt = bitmap_ipmac_kadt,
+ .uadt = bitmap_ipmac_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_ipmac_add,
+ [IPSET_DEL] = bitmap_ipmac_del,
+ [IPSET_TEST] = bitmap_ipmac_test,
+ },
+ .destroy = bitmap_ipmac_destroy,
+ .flush = bitmap_ipmac_flush,
+ .head = bitmap_ipmac_head,
+ .list = bitmap_ipmac_list,
+ .same_set = bitmap_ipmac_same_set,
+};
+
+/* Method table for sets created with a timeout: it differs from the
+ * plain table only in the add/del/test and list handlers.
+ * bitmap_ipmac_create() installs it when IPSET_ATTR_TIMEOUT is present.
+ */
+static const struct ip_set_type_variant bitmap_tipmac = {
+ .kadt = bitmap_ipmac_kadt,
+ .uadt = bitmap_ipmac_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_ipmac_tadd,
+ [IPSET_DEL] = bitmap_ipmac_tdel,
+ [IPSET_TEST] = bitmap_ipmac_ttest,
+ },
+ .destroy = bitmap_ipmac_destroy,
+ .flush = bitmap_ipmac_flush,
+ .head = bitmap_ipmac_head,
+ .list = bitmap_ipmac_tlist,
+ .same_set = bitmap_ipmac_same_set,
+};
+
+/* Timer callback: mark expired MAC_FILLED elements as MAC_EMPTY,
+ * then re-arm the timer for the next garbage collection period.
+ * @ul_set is the struct ip_set pointer stored in the timer's data.
+ */
+static void
+bitmap_ipmac_gc(unsigned long ul_set)
+{
+	struct ip_set *set = (struct ip_set *) ul_set;
+	struct bitmap_ipmac *map = set->data;
+	u32 last = map->last_ip - map->first_ip;
+	struct ipmac_telem *slot;
+	u32 idx;
+
+	/* We run parallel with other readers (test element)
+	 * but adding/deleting new entries is locked out */
+	read_lock_bh(&set->lock);
+	for (idx = 0; idx <= last; idx++) {
+		slot = bitmap_ipmac_elem(map, idx);
+		if (slot->match != MAC_FILLED)
+			continue;
+		if (ip_set_timeout_expired(slot->timeout))
+			slot->match = MAC_EMPTY;
+	}
+	read_unlock_bh(&set->lock);
+
+	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(&map->gc);
+}
+
+/* Set up and start the garbage collector timer of a timeout set.
+ * map->timeout must already hold its final value: it determines
+ * the first expiry.
+ */
+static void
+bitmap_ipmac_gc_init(struct ip_set *set)
+{
+	struct bitmap_ipmac *map = set->data;
+	struct timer_list *gc = &map->gc;
+
+	init_timer(gc);
+	gc->function = bitmap_ipmac_gc;
+	gc->data = (unsigned long) set;
+	gc->expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(gc);
+}
+
+/* Create bitmap:ip,mac type of sets */
+
+/* Allocate the member array (map->dsize must be set by the caller)
+ * and attach @map to @set with the timeout disabled.
+ * Returns false when the allocation fails; @map is not freed here.
+ */
+static bool
+init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
+	       u32 first_ip, u32 last_ip)
+{
+	void *members = ip_set_alloc((last_ip - first_ip + 1) * map->dsize);
+
+	map->members = members;
+	if (!members)
+		return false;
+
+	map->timeout = IPSET_NO_TIMEOUT;
+	map->first_ip = first_ip;
+	map->last_ip = last_ip;
+
+	set->family = AF_INET;
+	set->data = map;
+
+	return true;
+}
+
+/* Create a bitmap:ip,mac set from the netlink attributes in @tb.
+ *
+ * The range is given by IPSET_ATTR_IP together with either
+ * IPSET_ATTR_IP_TO (the two are swapped if reversed) or
+ * IPSET_ATTR_CIDR (must be below 32).  When IPSET_ATTR_TIMEOUT is
+ * present the timeout variant is installed and the gc timer started.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL on missing or malformed
+ * attributes, -IPSET_ERR_INVALID_CIDR, -IPSET_ERR_BITMAP_RANGE_SIZE
+ * when the range holds more than IPSET_BITMAP_MAX_RANGE + 1
+ * addresses, -ENOMEM on allocation failure, or the error of
+ * ip_set_get_hostipaddr4().
+ */
+static int
+bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
+		    u32 flags)
+{
+	u32 first_ip, last_ip;
+	u64 elements;
+	struct bitmap_ipmac *map;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+		if (ret)
+			return ret;
+		if (first_ip > last_ip) {
+			u32 tmp = first_ip;
+
+			first_ip = last_ip;
+			last_ip = tmp;
+		}
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr >= 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		last_ip = first_ip | ~ip_set_hostmask(cidr);
+	} else
+		return -IPSET_ERR_PROTOCOL;
+
+	/* Count in 64 bits: for 0.0.0.0-255.255.255.255 the 32-bit sum
+	 * last_ip - first_ip + 1 wraps to 0, which would pass the limit
+	 * below and lead to a zero-sized member array. */
+	elements = (u64)last_ip - first_ip + 1;
+
+	if (elements > IPSET_BITMAP_MAX_RANGE + 1)
+		return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	/* The per-element size must be known before the array is allocated */
+	if (tb[IPSET_ATTR_TIMEOUT])
+		map->dsize = sizeof(struct ipmac_telem);
+	else
+		map->dsize = sizeof(struct ipmac_elem);
+
+	if (!init_map_ipmac(set, map, first_ip, last_ip)) {
+		kfree(map);
+		return -ENOMEM;
+	}
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		/* init_map_ipmac() reset the timeout: set it before
+		 * the gc timer derives its period from it */
+		map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+		set->variant = &bitmap_tipmac;
+		bitmap_ipmac_gc_init(set);
+	} else {
+		set->variant = &bitmap_ipmac;
+	}
+	return 0;
+}
+
+/* Type descriptor registered with the ipset core by bitmap_ipmac_init().
+ * create_policy and adt_policy list the netlink attribute types this
+ * set type declares for set creation and for element add/del/test.
+ */
+static struct ip_set_type bitmap_ipmac_type = {
+ .name = "bitmap:ip,mac",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_MAC,
+ .dimension = IPSET_DIM_TWO,
+ .family = AF_INET,
+ .revision = 0,
+ .create = bitmap_ipmac_create,
+ .create_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ /* A MAC address is at most ETH_ALEN bytes of binary data */
+ [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, .len = ETH_ALEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module load: register the bitmap:ip,mac type with the ipset core. */
+static int __init
+bitmap_ipmac_init(void)
+{
+	int err = ip_set_type_register(&bitmap_ipmac_type);
+
+	return err;
+}
+
+/* Module unload: remove the bitmap:ip,mac type from the ipset core. */
+static void __exit
+bitmap_ipmac_fini(void)
+{
+	struct ip_set_type *type = &bitmap_ipmac_type;
+
+	ip_set_type_unregister(type);
+}
+
+/* Module entry and exit points */
+module_init(bitmap_ipmac_init);
+module_exit(bitmap_ipmac_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
new file mode 100644
index 00000000000..165f09b1a9c
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -0,0 +1,515 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:port type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#define IP_SET_BITMAP_TIMEOUT
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+
+/* Module metadata; the alias matches the "ip_set_%s" module name the
+ * ipset core requests for a set type called "bitmap:port".
+ */
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("bitmap:port type of IP sets");
+MODULE_ALIAS("ip_set_bitmap:port");
+
+/* Type structure: private data of one bitmap:port set.
+ * members is used as a bit array by the base variant and as an array
+ * of unsigned long timeout values (one per port) by the timeout variant.
+ */
+struct bitmap_port {
+ void *members; /* the set members */
+ u16 first_port; /* host byte order, included in range */
+ u16 last_port; /* host byte order, included in range */
+ size_t memsize; /* members size */
+ u32 timeout; /* timeout parameter */
+ struct timer_list gc; /* garbage collection */
+};
+
+/* Base variant */
+
+/* Test a port: @value points to the u16 offset of the port from
+ * first_port.  Returns 1 if its bit is set, 0 otherwise.
+ */
+static int
+bitmap_port_test(struct ip_set *set, void *value, u32 timeout)
+{
+	const struct bitmap_port *map = set->data;
+	const u16 *offset = value;
+
+	return test_bit(*offset, map->members) ? 1 : 0;
+}
+
+/* Add a port: @value points to the u16 offset of the port from
+ * first_port.  Returns -IPSET_ERR_EXIST if the bit was already set.
+ */
+static int
+bitmap_port_add(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_port *map = set->data;
+	const u16 *offset = value;
+
+	return test_and_set_bit(*offset, map->members) ?
+		-IPSET_ERR_EXIST : 0;
+}
+
+/* Delete a port: @value points to the u16 offset of the port from
+ * first_port.  Returns -IPSET_ERR_EXIST if the bit was not set.
+ */
+static int
+bitmap_port_del(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_port *map = set->data;
+	const u16 *offset = value;
+
+	return test_and_clear_bit(*offset, map->members) ?
+		0 : -IPSET_ERR_EXIST;
+}
+
+/* Dump the members of a base-variant set into @skb.
+ *
+ * cb->args[2] holds the offset at which the dump resumes; it is
+ * advanced while elements are written and reset to 0 once the whole
+ * range has been listed.  Returns 0 when listing finished or when at
+ * least one element was written before the skb filled up, -EMSGSIZE
+ * when not even the first element of this call fitted.
+ */
+static int
+bitmap_port_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_port *map = set->data;
+ struct nlattr *atd, *nested;
+ /* first: offset this call started from */
+ u16 id, first = cb->args[2];
+ u16 last = map->last_port - map->first_port;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[2] <= last; cb->args[2]++) {
+ id = cb->args[2];
+ if (!test_bit(id, map->members))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ /* Nothing written in this call: drop the empty ADT nest */
+ if (id == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ /* NOTE(review): NLA_PUT_NET16 presumably jumps to
+ * nla_put_failure when the attribute does not fit - confirm */
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT,
+ htons(map->first_port + id));
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ /* Discard the partial element but keep the complete ones */
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, atd);
+ if (unlikely(id == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+/* Timeout variant */
+
+/* Timeout variant of test: @value points to the u16 offset of the
+ * port; the result is ip_set_timeout_test() of its stored timeout.
+ */
+static int
+bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout)
+{
+	const struct bitmap_port *map = set->data;
+	const unsigned long *table = map->members;
+	const u16 *offset = value;
+
+	return ip_set_timeout_test(table[*offset]);
+}
+
+/* Timeout variant of add: store the timeout for the port unless
+ * ip_set_timeout_test() reports the slot as already present, in
+ * which case -IPSET_ERR_EXIST is returned.
+ */
+static int
+bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_port *map = set->data;
+	unsigned long *slot = (unsigned long *)map->members + *(u16 *)value;
+
+	if (ip_set_timeout_test(*slot))
+		return -IPSET_ERR_EXIST;
+
+	*slot = ip_set_timeout_set(timeout);
+	return 0;
+}
+
+/* Timeout variant of delete: the slot is always reset to
+ * IPSET_ELEM_UNSET; -IPSET_ERR_EXIST is returned when
+ * ip_set_timeout_test() did not report it as present beforehand.
+ */
+static int
+bitmap_port_tdel(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_port *map = set->data;
+	unsigned long *slot = (unsigned long *)map->members + *(u16 *)value;
+	int ret;
+
+	ret = ip_set_timeout_test(*slot) ? 0 : -IPSET_ERR_EXIST;
+	*slot = IPSET_ELEM_UNSET;
+
+	return ret;
+}
+
+/* Dump the members of a timeout-variant set into @skb, each with its
+ * port and the value of ip_set_timeout_get() for its slot.
+ *
+ * cb->args[2] holds the offset at which the dump resumes; it is
+ * advanced while elements are written and reset to 0 once the whole
+ * range has been listed.  Returns 0 when listing finished or when at
+ * least one element was written before the skb filled up, -EMSGSIZE
+ * when not even the first element of this call fitted.
+ */
+static int
+bitmap_port_tlist(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct bitmap_port *map = set->data;
+ struct nlattr *adt, *nested;
+ /* first: offset this call started from */
+ u16 id, first = cb->args[2];
+ u16 last = map->last_port - map->first_port;
+ const unsigned long *members = map->members;
+
+ adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!adt)
+ return -EMSGSIZE;
+ for (; cb->args[2] <= last; cb->args[2]++) {
+ id = cb->args[2];
+ if (!ip_set_timeout_test(members[id]))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ /* Nothing written in this call: drop the empty ADT nest */
+ if (id == first) {
+ nla_nest_cancel(skb, adt);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ /* NOTE(review): the NLA_PUT_* macros presumably jump to
+ * nla_put_failure when the attribute does not fit - confirm */
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT,
+ htons(map->first_port + id));
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(members[id])));
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, adt);
+
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ /* Discard the partial element but keep the complete ones */
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, adt);
+ if (unlikely(id == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+/* Kernel-side add/del/test: take the port from a packet.
+ *
+ * Returns -EINVAL when ip_set_get_ip_port() cannot extract a port,
+ * -IPSET_ERR_BITMAP_RANGE when the port is outside the set, otherwise
+ * the result of the variant's handler for @adt.
+ */
+static int
+bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
+		 enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	struct bitmap_port *map = set->data;
+	ipset_adtfn handler = set->variant->adt[adt];
+	__be16 wire_port;
+	u16 offset = 0;
+
+	if (!ip_set_get_ip_port(skb, pf, flags & IPSET_DIM_ONE_SRC,
+				&wire_port))
+		return -EINVAL;
+
+	offset = ntohs(wire_port);
+	if (offset < map->first_port || offset > map->last_port)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	/* Handlers expect the offset from the first port of the set */
+	offset -= map->first_port;
+
+	return handler(set, &offset, map->timeout);
+}
+
+/* Userspace add/del/test of a port or, for add/del, a port range.
+ *
+ * IPSET_ATTR_PORT is mandatory; IPSET_ATTR_PORT_TO and
+ * IPSET_ATTR_TIMEOUT are optional.  A test only looks at the first
+ * port.  For add/del every port of the (possibly swapped) range is
+ * passed to the handler; an error accepted by ip_set_eexist() for
+ * @flags is ignored, any other error aborts the loop.
+ */
+static int
+bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
+		 enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	struct bitmap_port *map = set->data;
+	ipset_adtfn handler = set->variant->adt[adt];
+	u32 tmo = map->timeout;
+	u32 port;	/* wraparound */
+	u16 id, port_to;
+	int err = 0;
+
+	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+	if (port < map->first_port || port > map->last_port)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(map->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		tmo = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	if (adt == IPSET_TEST) {
+		id = port - map->first_port;
+		return handler(set, &id, tmo);
+	}
+
+	port_to = port;
+	if (tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port > port_to) {
+			swap(port, port_to);
+			/* The new lower bound has not been checked yet */
+			if (port < map->first_port)
+				return -IPSET_ERR_BITMAP_RANGE;
+		}
+	}
+
+	if (port_to > map->last_port)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	while (port <= port_to) {
+		id = port - map->first_port;
+		err = handler(set, &id, tmo);
+		if (err && !ip_set_eexist(err, flags))
+			return err;
+		port++;
+	}
+	return 0;
+}
+
+/* Free the private data of a bitmap:port set.
+ * The gc timer is stopped first when with_timeout() holds for the set.
+ */
+static void
+bitmap_port_destroy(struct ip_set *set)
+{
+	struct bitmap_port *priv = set->data;
+
+	if (with_timeout(priv->timeout))
+		del_timer_sync(&priv->gc);
+
+	ip_set_free(priv->members);
+	kfree(priv);
+	set->data = NULL;
+}
+
+/* Remove every element by zeroing the whole member array. */
+static void
+bitmap_port_flush(struct ip_set *set)
+{
+	struct bitmap_port *priv = set->data;
+	size_t bytes = priv->memsize;
+
+	memset(priv->members, 0, bytes);
+}
+
+/* Dump the set header (port range, reference count, memory size and,
+ * for timeout sets, the default timeout) into @skb as a nested
+ * IPSET_ATTR_DATA attribute.
+ *
+ * Returns 0 on success, -EMSGSIZE when the nest cannot be opened or
+ * control reaches nla_put_failure (presumably via the NLA_PUT_*
+ * macros when the skb is full - confirm against their definition).
+ */
+static int
+bitmap_port_head(struct ip_set *set, struct sk_buff *skb)
+{
+	const struct bitmap_port *map = set->data;
+	struct nlattr *data_nest;
+
+	data_nest = ipset_nest_start(skb, IPSET_ATTR_DATA);
+	if (!data_nest)
+		return -EMSGSIZE;
+
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, htons(map->first_port));
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port));
+	/* One reference is subtracted before reporting */
+	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
+		      htonl(atomic_read(&set->ref) - 1));
+	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
+		      htonl(sizeof(*map) + map->memsize));
+	if (with_timeout(map->timeout))
+		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+	ipset_nest_end(skb, data_nest);
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+/* Two bitmap:port sets are "the same" when they cover the same
+ * port range and use the same timeout value.
+ */
+static bool
+bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct bitmap_port *lhs = a->data;
+	const struct bitmap_port *rhs = b->data;
+
+	if (lhs->first_port != rhs->first_port)
+		return false;
+	if (lhs->last_port != rhs->last_port)
+		return false;
+	return lhs->timeout == rhs->timeout;
+}
+
+/* Method table for sets created without a timeout (bit-array storage):
+ * bitmap_port_create() installs it when IPSET_ATTR_TIMEOUT is absent.
+ */
+static const struct ip_set_type_variant bitmap_port = {
+ .kadt = bitmap_port_kadt,
+ .uadt = bitmap_port_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_port_add,
+ [IPSET_DEL] = bitmap_port_del,
+ [IPSET_TEST] = bitmap_port_test,
+ },
+ .destroy = bitmap_port_destroy,
+ .flush = bitmap_port_flush,
+ .head = bitmap_port_head,
+ .list = bitmap_port_list,
+ .same_set = bitmap_port_same_set,
+};
+
+/* Method table for sets created with a timeout: it differs from the
+ * plain table only in the add/del/test and list handlers.
+ * bitmap_port_create() installs it when IPSET_ATTR_TIMEOUT is present.
+ */
+static const struct ip_set_type_variant bitmap_tport = {
+ .kadt = bitmap_port_kadt,
+ .uadt = bitmap_port_uadt,
+ .adt = {
+ [IPSET_ADD] = bitmap_port_tadd,
+ [IPSET_DEL] = bitmap_port_tdel,
+ [IPSET_TEST] = bitmap_port_ttest,
+ },
+ .destroy = bitmap_port_destroy,
+ .flush = bitmap_port_flush,
+ .head = bitmap_port_head,
+ .list = bitmap_port_tlist,
+ .same_set = bitmap_port_same_set,
+};
+
+/* Timer callback: reset every expired slot to IPSET_ELEM_UNSET,
+ * then re-arm the timer for the next garbage collection period.
+ * @ul_set is the struct ip_set pointer stored in the timer's data.
+ */
+static void
+bitmap_port_gc(unsigned long ul_set)
+{
+	struct ip_set *set = (struct ip_set *) ul_set;
+	struct bitmap_port *map = set->data;
+	unsigned long *slots = map->members;
+	u16 last = map->last_port - map->first_port;
+	u32 idx;	/* wraparound */
+
+	/* We run parallel with other readers (test element)
+	 * but adding/deleting new entries is locked out */
+	read_lock_bh(&set->lock);
+	for (idx = 0; idx <= last; idx++) {
+		if (!ip_set_timeout_expired(slots[idx]))
+			continue;
+		slots[idx] = IPSET_ELEM_UNSET;
+	}
+	read_unlock_bh(&set->lock);
+
+	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(&map->gc);
+}
+
+/* Set up and start the garbage collector timer of a timeout set.
+ * map->timeout must already hold its final value: it determines
+ * the first expiry.
+ */
+static void
+bitmap_port_gc_init(struct ip_set *set)
+{
+	struct bitmap_port *map = set->data;
+	struct timer_list *gc = &map->gc;
+
+	init_timer(gc);
+	gc->function = bitmap_port_gc;
+	gc->data = (unsigned long) set;
+	gc->expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(gc);
+}
+
+/* Create bitmap:port type of sets */
+
+/* Allocate the member array (map->memsize must be set by the caller)
+ * and attach @map to @set with the timeout disabled.
+ * Returns false when the allocation fails; @map is not freed here.
+ */
+static bool
+init_map_port(struct ip_set *set, struct bitmap_port *map,
+	      u16 first_port, u16 last_port)
+{
+	void *members = ip_set_alloc(map->memsize);
+
+	map->members = members;
+	if (!members)
+		return false;
+
+	map->timeout = IPSET_NO_TIMEOUT;
+	map->first_port = first_port;
+	map->last_port = last_port;
+
+	set->family = AF_UNSPEC;
+	set->data = map;
+
+	return true;
+}
+
+/* Create a bitmap:port set from the netlink attributes in @tb.
+ *
+ * IPSET_ATTR_PORT and IPSET_ATTR_PORT_TO are mandatory and swapped
+ * if reversed.  With IPSET_ATTR_TIMEOUT the members are stored as one
+ * unsigned long per port and the gc timer is started; without it the
+ * size comes from bitmap_bytes().
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL on missing or malformed
+ * attributes, -ENOMEM on allocation failure.
+ */
+static int
+bitmap_port_create(struct ip_set *set, struct nlattr *tb[],
+		   u32 flags)
+{
+	struct bitmap_port *map;
+	u16 first_port, last_port;
+
+	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+	last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (first_port > last_port)
+		swap(first_port, last_port);
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	/* The array size must be known before init_map_port() allocates */
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		map->memsize = (last_port - first_port + 1)
+			       * sizeof(unsigned long);
+	} else {
+		map->memsize = bitmap_bytes(0, last_port - first_port);
+		pr_debug("memsize: %zu\n", map->memsize);
+	}
+
+	if (!init_map_port(set, map, first_port, last_port)) {
+		kfree(map);
+		return -ENOMEM;
+	}
+
+	if (!tb[IPSET_ATTR_TIMEOUT]) {
+		set->variant = &bitmap_port;
+		return 0;
+	}
+
+	/* init_map_port() reset the timeout: set it before the gc
+	 * timer derives its period from it */
+	map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	set->variant = &bitmap_tport;
+	bitmap_port_gc_init(set);
+
+	return 0;
+}
+
+/* Type descriptor registered with the ipset core by bitmap_port_init().
+ * create_policy and adt_policy list the netlink attribute types this
+ * set type declares for set creation and for element add/del/test.
+ */
+static struct ip_set_type bitmap_port_type = {
+ .name = "bitmap:port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_ONE,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = bitmap_port_create,
+ .create_policy = {
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module load: register the bitmap:port type with the ipset core. */
+static int __init
+bitmap_port_init(void)
+{
+	int err = ip_set_type_register(&bitmap_port_type);
+
+	return err;
+}
+
+/* Module unload: remove the bitmap:port type from the ipset core. */
+static void __exit
+bitmap_port_fini(void)
+{
+	struct ip_set_type *type = &bitmap_port_type;
+
+	ip_set_type_unregister(type);
+}
+
+/* Module entry and exit points */
+module_init(bitmap_port_init);
+module_exit(bitmap_port_fini);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
new file mode 100644
index 00000000000..8b1a54c1e40
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -0,0 +1,1671 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module for IP set management */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/version.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/ipset/ip_set.h>
+
+static LIST_HEAD(ip_set_type_list); /* all registered set types */
+static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */
+
+static struct ip_set **ip_set_list; /* all individual sets */
+static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */
+
+/* Set and type names are compared over at most IPSET_MAXNAMELEN bytes */
+#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0)
+
+/* Module parameter.
+ * NOTE(review): declared unsigned int but registered with parameter
+ * type "int" below - confirm this mismatch is intended. */
+static unsigned int max_sets;
+
+module_param(max_sets, int, 0600);
+MODULE_PARM_DESC(max_sets, "maximal number of sets");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("core IP set support");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
+
+/*
+ * The set types are implemented in modules and registered set types
+ * can be found in ip_set_type_list. Adding/deleting types is
+ * serialized by ip_set_type_mutex.
+ */
+
+/* Take the mutex that serializes changes to ip_set_type_list. */
+static inline void
+ip_set_type_lock(void)
+{
+ mutex_lock(&ip_set_type_mutex);
+}
+
+/* Release the mutex taken by ip_set_type_lock(). */
+static inline void
+ip_set_type_unlock(void)
+{
+ mutex_unlock(&ip_set_type_mutex);
+}
+
+/* Register and deregister settype */
+
+/* Look up a registered set type by name, family and revision.
+ * A type registered with AF_UNSPEC matches any @family.
+ * Returns the type or NULL; no reference is taken.  The list is
+ * walked with the RCU iterator, so the caller provides the protection.
+ */
+static struct ip_set_type *
+find_set_type(const char *name, u8 family, u8 revision)
+{
+	struct ip_set_type *candidate;
+
+	list_for_each_entry_rcu(candidate, &ip_set_type_list, list) {
+		if (!STREQ(candidate->name, name))
+			continue;
+		if (candidate->family != family &&
+		    candidate->family != AF_UNSPEC)
+			continue;
+		if (candidate->revision == revision)
+			return candidate;
+	}
+	return NULL;
+}
+
+/* Drop the nfnl mutex, try to load the module "ip_set_<name>" and
+ * take the mutex again.
+ * Returns -EAGAIN when request_module() did not fail (the caller is
+ * expected to retry the lookup), -IPSET_ERR_FIND_TYPE otherwise.
+ */
+static int
+try_to_load_type(const char *name)
+{
+	int ret = -EAGAIN;
+
+	nfnl_unlock();
+	pr_debug("try to load ip_set_%s\n", name);
+	if (request_module("ip_set_%s", name) < 0) {
+		pr_warning("Can't find ip_set type %s\n", name);
+		ret = -IPSET_ERR_FIND_TYPE;
+	}
+	nfnl_lock();
+
+	return ret;
+}
+
+/* Find a set type and take a reference on its module.
+ * On a match *found is set and 0 is returned, or -EFAULT when the
+ * module reference cannot be obtained.  Without a match *found is
+ * NULL and the result of try_to_load_type() is returned.
+ */
+static int
+find_set_type_get(const char *name, u8 family, u8 revision,
+		  struct ip_set_type **found)
+{
+	struct ip_set_type *type;
+	bool got_ref;
+
+	rcu_read_lock();
+	type = find_set_type(name, family, revision);
+	*found = type;
+	if (!type) {
+		rcu_read_unlock();
+		return try_to_load_type(name);
+	}
+	got_ref = try_module_get(type->me);
+	rcu_read_unlock();
+
+	return got_ref ? 0 : -EFAULT;
+}
+
+/* Find a given set type by name and family.
+ * If we succeeded, *min and *max are filled out with the lowest and
+ * highest registered revision of the type and 0 is returned.
+ * Otherwise both are 0 and the result of try_to_load_type() is
+ * returned.  A type registered with AF_UNSPEC matches any @family.
+ */
+static int
+find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max)
+{
+	struct ip_set_type *type;
+	bool found = false;
+
+	*min = *max = 0;
+	rcu_read_lock();
+	list_for_each_entry_rcu(type, &ip_set_type_list, list) {
+		if (!STREQ(type->name, name) ||
+		    !(type->family == family || type->family == AF_UNSPEC))
+			continue;
+		if (!found) {
+			/* Seed both bounds from the first match: starting
+			 * the minimum at 0 would make it impossible to
+			 * report a lowest revision above 0. */
+			*min = *max = type->revision;
+			found = true;
+			continue;
+		}
+		if (type->revision < *min)
+			*min = type->revision;
+		if (type->revision > *max)
+			*max = type->revision;
+	}
+	rcu_read_unlock();
+	if (found)
+		return 0;
+
+	return try_to_load_type(name);
+}
+
+/* Printable family name for log messages; anything that is neither
+ * AF_INET nor AF_INET6 is shown as "any". */
+#define family_name(f) ((f) == AF_INET ? "inet" : \
+ (f) == AF_INET6 ? "inet6" : "any")
+
+/* Register a set type structure.  A type is identified by the
+ * unique triple of name, family and revision.
+ * Returns 0 on success, -EINVAL when the type speaks a different
+ * protocol version or the triple is already registered.
+ */
+int
+ip_set_type_register(struct ip_set_type *type)
+{
+	int ret = 0;
+
+	if (type->protocol != IPSET_PROTOCOL) {
+		pr_warning("ip_set type %s, family %s, revision %u uses "
+			   "wrong protocol version %u (want %u)\n",
+			   type->name, family_name(type->family),
+			   type->revision, type->protocol, IPSET_PROTOCOL);
+		return -EINVAL;
+	}
+
+	ip_set_type_lock();
+	if (!find_set_type(type->name, type->family, type->revision)) {
+		list_add_rcu(&type->list, &ip_set_type_list);
+		pr_debug("type %s, family %s, revision %u registered.\n",
+			 type->name, family_name(type->family),
+			 type->revision);
+	} else {
+		/* Duplicate! */
+		pr_warning("ip_set type %s, family %s, revision %u "
+			   "already registered!\n", type->name,
+			   family_name(type->family), type->revision);
+		ret = -EINVAL;
+	}
+	ip_set_type_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_type_register);
+
+/* Unregister a set type and wait for an RCU grace period, whether or
+ * not the type was found.  An unknown type only triggers a warning.
+ * There's a small race with ip_set_create.
+ */
+void
+ip_set_type_unregister(struct ip_set_type *type)
+{
+	ip_set_type_lock();
+	if (find_set_type(type->name, type->family, type->revision)) {
+		list_del_rcu(&type->list);
+		pr_debug("type %s, family %s, revision %u unregistered.\n",
+			 type->name, family_name(type->family),
+			 type->revision);
+	} else {
+		pr_warning("ip_set type %s, family %s, revision %u "
+			   "not registered\n", type->name,
+			   family_name(type->family), type->revision);
+	}
+	ip_set_type_unlock();
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ip_set_type_unregister);
+
+/* Utility functions */
+/* Allocate @size zeroed bytes: kzalloc() is tried for sizes below
+ * KMALLOC_MAX_SIZE, vzalloc() is the fallback.  Returns NULL when
+ * both fail.  Free the result with ip_set_free().
+ */
+void *
+ip_set_alloc(size_t size)
+{
+	void *mem;
+
+	if (size < KMALLOC_MAX_SIZE) {
+		mem = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+		if (mem) {
+			pr_debug("%p: allocated with kmalloc\n", mem);
+			return mem;
+		}
+	}
+
+	mem = vzalloc(size);
+	if (mem)
+		pr_debug("%p: allocated with vmalloc\n", mem);
+
+	return mem;
+}
+EXPORT_SYMBOL_GPL(ip_set_alloc);
+
+/* Free memory obtained from ip_set_alloc(), picking vfree() or
+ * kfree() according to is_vmalloc_addr().
+ */
+void
+ip_set_free(void *members)
+{
+	bool from_vmalloc = is_vmalloc_addr(members);
+
+	pr_debug("%p: free with %s\n", members,
+		 from_vmalloc ? "vfree" : "kfree");
+	if (from_vmalloc) {
+		vfree(members);
+		return;
+	}
+	kfree(members);
+}
+EXPORT_SYMBOL_GPL(ip_set_free);
+
+/* True when the attribute carries the NLA_F_NESTED flag. */
+static inline bool
+flag_nested(const struct nlattr *nla)
+{
+	return (nla->nla_type & NLA_F_NESTED) != 0;
+}
+
+/* Policy for the attributes nested inside an IP address attribute,
+ * used by ip_set_get_ipaddr4() and ip_set_get_ipaddr6(). */
+static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
+ [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 },
+ [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) },
+};
+
+/* Extract an IPv4 address from the nested attribute @nla.
+ * The address is stored in *ipaddr as read by nla_get_be32().
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL when @nla is not flagged
+ * as nested, does not parse, or fails the netorder check for
+ * IPSET_ATTR_IPADDR_IPV4.
+ */
+int
+ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr)
+{
+	struct nlattr *attrs[IPSET_ATTR_IPADDR_MAX+1];
+
+	if (unlikely(!flag_nested(nla)) ||
+	    nla_parse_nested(attrs, IPSET_ATTR_IPADDR_MAX, nla,
+			     ipaddr_policy) ||
+	    unlikely(!ip_set_attr_netorder(attrs, IPSET_ATTR_IPADDR_IPV4)))
+		return -IPSET_ERR_PROTOCOL;
+
+	*ipaddr = nla_get_be32(attrs[IPSET_ATTR_IPADDR_IPV4]);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4);
+
+/* Extract an IPv6 address from the nested attribute @nla.
+ * sizeof(struct in6_addr) bytes are copied into *ipaddr.
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL when @nla is not flagged
+ * as nested, does not parse, or fails the netorder check for
+ * IPSET_ATTR_IPADDR_IPV6.
+ */
+int
+ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
+{
+	struct nlattr *attrs[IPSET_ATTR_IPADDR_MAX+1];
+	const void *raw;
+
+	if (unlikely(!flag_nested(nla)) ||
+	    nla_parse_nested(attrs, IPSET_ATTR_IPADDR_MAX, nla,
+			     ipaddr_policy) ||
+	    unlikely(!ip_set_attr_netorder(attrs, IPSET_ATTR_IPADDR_IPV6)))
+		return -IPSET_ERR_PROTOCOL;
+
+	raw = nla_data(attrs[IPSET_ATTR_IPADDR_IPV6]);
+	memcpy(ipaddr, raw, sizeof(struct in6_addr));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
+
+/*
+ * Creating/destroying/renaming/swapping affect the existence and
+ * the properties of a set. All of these can be executed from userspace
+ * only and serialized by the nfnl mutex indirectly from nfnetlink.
+ *
+ * Sets are identified by their index in ip_set_list and the index
+ * is used by the external references (set/SET netfilter modules).
+ *
+ * The set behind an index may change by swapping only, from userspace.
+ */
+
+/* Take a reference on the set at @index; the slot must not be NULL
+ * because it is dereferenced without a check. */
+static inline void
+__ip_set_get(ip_set_id_t index)
+{
+ atomic_inc(&ip_set_list[index]->ref);
+}
+
+/* Drop a reference on the set at @index; the slot must not be NULL
+ * because it is dereferenced without a check. */
+static inline void
+__ip_set_put(ip_set_id_t index)
+{
+ atomic_dec(&ip_set_list[index]->ref);
+}
+
+/*
+ * Add, del and test set entries from kernel.
+ *
+ * The set behind the index must exist and must be referenced
+ * so it can't be destroyed (or changed) under our foot.
+ */
+
+/* Test a packet against the set at @index.
+ *
+ * Returns 0 (no match) when @dim is smaller than the type's
+ * dimension, when the family does not fit, or when the type's kadt
+ * handler reports an error.  If the handler answers -EAGAIN the
+ * element is added under the write lock and 1 is returned.
+ */
+int
+ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
+	    u8 family, u8 dim, u8 flags)
+{
+	struct ip_set *set = ip_set_list[index];
+	int verdict = 0;
+
+	BUG_ON(set == NULL || atomic_read(&set->ref) == 0);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (dim < set->type->dimension)
+		return 0;
+	if (family != set->family && set->family != AF_UNSPEC)
+		return 0;
+
+	read_lock_bh(&set->lock);
+	verdict = set->variant->kadt(set, skb, IPSET_TEST, family, dim, flags);
+	read_unlock_bh(&set->lock);
+
+	if (verdict == -EAGAIN) {
+		/* Type requests element to be completed */
+		pr_debug("element must be competed, ADD is triggered\n");
+		write_lock_bh(&set->lock);
+		set->variant->kadt(set, skb, IPSET_ADD, family, dim, flags);
+		write_unlock_bh(&set->lock);
+		verdict = 1;
+	}
+
+	/* Convert error codes to nomatch */
+	if (verdict < 0)
+		return 0;
+	return verdict;
+}
+EXPORT_SYMBOL_GPL(ip_set_test);
+
+/* Add an element built from a packet to the set at @index.
+ * Returns 0 without touching the set when @dim is smaller than the
+ * type's dimension or the family does not fit, otherwise the result
+ * of the type's kadt handler, called under the write lock.
+ */
+int
+ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
+	   u8 family, u8 dim, u8 flags)
+{
+	struct ip_set *set = ip_set_list[index];
+	int err;
+
+	BUG_ON(set == NULL || atomic_read(&set->ref) == 0);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (dim < set->type->dimension)
+		return 0;
+	if (family != set->family && set->family != AF_UNSPEC)
+		return 0;
+
+	write_lock_bh(&set->lock);
+	err = set->variant->kadt(set, skb, IPSET_ADD, family, dim, flags);
+	write_unlock_bh(&set->lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(ip_set_add);
+
+/* Delete the element built from a packet from the set at @index.
+ * Returns 0 without touching the set when @dim is smaller than the
+ * type's dimension or the family does not fit, otherwise the result
+ * of the type's kadt handler, called under the write lock.
+ */
+int
+ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
+	   u8 family, u8 dim, u8 flags)
+{
+	struct ip_set *set = ip_set_list[index];
+	int err = 0;
+
+	BUG_ON(set == NULL || atomic_read(&set->ref) == 0);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (dim < set->type->dimension)
+		return 0;
+	if (family != set->family && set->family != AF_UNSPEC)
+		return 0;
+
+	write_lock_bh(&set->lock);
+	err = set->variant->kadt(set, skb, IPSET_DEL, family, dim, flags);
+	write_unlock_bh(&set->lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(ip_set_del);
+
+/*
+ * Find set by name, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ * Returns the index of the set and stores the set in *set, or returns
+ * IPSET_INVALID_ID (leaving *set untouched) when no set has that name.
+ *
+ * The nfnl mutex must already be activated.
+ */
+ip_set_id_t
+ip_set_get_byname(const char *name, struct ip_set **set)
+{
+	ip_set_id_t i, index = IPSET_INVALID_ID;
+	struct ip_set *s;
+
+	for (i = 0; i < ip_set_max; i++) {
+		s = ip_set_list[i];
+		if (s != NULL && STREQ(s->name, name)) {
+			__ip_set_get(i);
+			index = i;
+			*set = s;
+			/* Stop at the first match, like find_set_id():
+			 * exactly one reference is taken, for the set
+			 * whose index is returned. */
+			break;
+		}
+	}
+
+	return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_byname);
+
+/*
+ * Drop one reference of the set behind @index, if the slot is in
+ * use; an empty slot is silently ignored. The caller shall not
+ * assume the index to be valid after calling this function.
+ *
+ * The nfnl mutex must already be activated.
+ */
+void
+ip_set_put_byindex(ip_set_id_t index)
+{
+	if (ip_set_list[index] == NULL)
+		return;
+
+	BUG_ON(atomic_read(&ip_set_list[index]->ref) == 0);
+	__ip_set_put(index);
+}
+EXPORT_SYMBOL_GPL(ip_set_put_byindex);
+
+/*
+ * Return the name of the set behind @index.
+ * The set is assumed to be referenced (an empty slot or a zero
+ * reference count is a BUG), so it exists, cannot be destroyed and,
+ * due to the referencing, cannot be renamed either.
+ *
+ * The nfnl mutex must already be activated.
+ */
+const char *
+ip_set_name_byindex(ip_set_id_t index)
+{
+	const struct ip_set *entry = ip_set_list[index];
+
+	BUG_ON(entry == NULL);
+	BUG_ON(atomic_read(&entry->ref) == 0);
+
+	/* Referenced, so it's safe */
+	return entry->name;
+}
+EXPORT_SYMBOL_GPL(ip_set_name_byindex);
+
+/*
+ * Routines to call by external subsystems, which do not
+ * call nfnl_lock for us.
+ */
+
+/*
+ * Find set by name and reference it once, so that it does not go
+ * away under our feet. Returns the set index, or IPSET_INVALID_ID
+ * when there is no such set.
+ *
+ * The nfnl mutex is taken and released inside the function.
+ */
+ip_set_id_t
+ip_set_nfnl_get(const char *name)
+{
+	struct ip_set *unused;
+	ip_set_id_t id;
+
+	nfnl_lock();
+	id = ip_set_get_byname(name, &unused);
+	nfnl_unlock();
+
+	return id;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get);
+
+/*
+ * Find set by index, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ * Returns @index on success, IPSET_INVALID_ID when the index is out
+ * of range or the slot is empty.
+ *
+ * The nfnl mutex is used in the function.
+ */
+ip_set_id_t
+ip_set_nfnl_get_byindex(ip_set_id_t index)
+{
+	/* Valid slots are 0 .. ip_set_max - 1, as in every loop over
+	 * ip_set_list in this file; ip_set_max itself is out of range. */
+	if (index >= ip_set_max)
+		return IPSET_INVALID_ID;
+
+	nfnl_lock();
+	if (ip_set_list[index])
+		__ip_set_get(index);
+	else
+		index = IPSET_INVALID_ID;
+	nfnl_unlock();
+
+	return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
+
+/*
+ * Drop one reference of the set behind @index, if the slot is in
+ * use; an empty slot is silently ignored. The caller shall not
+ * assume the index to be valid after calling this function.
+ *
+ * The nfnl mutex is taken and released inside the function.
+ */
+void
+ip_set_nfnl_put(ip_set_id_t index)
+{
+	nfnl_lock();
+	if (ip_set_list[index] == NULL)
+		goto out;
+
+	BUG_ON(atomic_read(&ip_set_list[index]->ref) == 0);
+	__ip_set_put(index);
+out:
+	nfnl_unlock();
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
+
+/*
+ * Communication protocol with userspace over netlink.
+ *
+ * We are already locked by nfnl_lock.
+ */
+
+static inline bool
+protocol_failed(const struct nlattr * const tb[])
+{
+	const struct nlattr *proto = tb[IPSET_ATTR_PROTOCOL];
+
+	/* A missing protocol attribute is a failure as well */
+	if (proto == NULL)
+		return true;
+
+	return nla_get_u8(proto) != IPSET_PROTOCOL;
+}
+
+static inline u32
+flag_exist(const struct nlmsghdr *nlh)
+{
+	/* NLM_F_EXCL in the request clears IPSET_FLAG_EXIST */
+	if (nlh->nlmsg_flags & NLM_F_EXCL)
+		return 0;
+
+	return IPSET_FLAG_EXIST;
+}
+
+static struct nlmsghdr *
+start_msg(struct sk_buff *skb, u32 pid, u32 seq, unsigned int flags,
+	  enum ipset_cmd cmd)
+{
+	struct nfgenmsg *hdr;
+	struct nlmsghdr *nlh = nlmsg_put(skb, pid, seq,
+					 cmd | (NFNL_SUBSYS_IPSET << 8),
+					 sizeof(*hdr), flags);
+
+	if (nlh != NULL) {
+		/* Fill in the fixed nfnetlink header of the message */
+		hdr = nlmsg_data(nlh);
+		hdr->nfgen_family = AF_INET;
+		hdr->version = NFNETLINK_V0;
+		hdr->res_id = 0;
+	}
+
+	/* NULL when the header could not be put into the skb */
+	return nlh;
+}
+
+/* Create a set */
+
+/*
+ * Attribute policy of the CREATE command: protocol version, set name,
+ * type name, type revision, family and the nested, type specific data.
+ * Both names are NUL terminated strings of at most
+ * IPSET_MAXNAMELEN - 1 characters.
+ */
+static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1},
+ [IPSET_ATTR_REVISION] = { .type = NLA_U8 },
+ [IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
+ [IPSET_ATTR_DATA] = { .type = NLA_NESTED },
+};
+
+static ip_set_id_t
+find_set_id(const char *name)
+{
+	ip_set_id_t id;
+
+	/* Linear scan: the first slot holding a set of that name wins */
+	for (id = 0; id < ip_set_max; id++) {
+		const struct ip_set *cur = ip_set_list[id];
+
+		if (cur != NULL && STREQ(cur->name, name))
+			return id;
+	}
+
+	/* No set of that name */
+	return IPSET_INVALID_ID;
+}
+
+static inline struct ip_set *
+find_set(const char *name)
+{
+	ip_set_id_t id = find_set_id(name);
+
+	/* Map "not found" to a NULL pointer */
+	if (id == IPSET_INVALID_ID)
+		return NULL;
+
+	return ip_set_list[id];
+}
+
+static int
+find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set)
+{
+	ip_set_id_t i;
+
+	*index = IPSET_INVALID_ID;
+	for (i = 0; i < ip_set_max; i++) {
+		struct ip_set *cur = ip_set_list[i];
+
+		if (cur != NULL) {
+			if (!STREQ(name, cur->name))
+				continue;
+			/* Name clash: hand the existing set back */
+			*set = cur;
+			return -EEXIST;
+		}
+		/* Remember the first free slot only */
+		if (*index == IPSET_INVALID_ID)
+			*index = i;
+	}
+
+	/* Still invalid: no free slot remained */
+	return *index == IPSET_INVALID_ID ? -IPSET_ERR_MAX_SETS : 0;
+}
+
+/*
+ * Netlink handler of the CREATE command.
+ *
+ * Allocates the generic set structure, takes a reference on the
+ * requested set type, lets the type build its private part and finally
+ * publishes the set in the first free slot of ip_set_list.
+ *
+ * Returns 0 on success or a negative error code. When a set of the same
+ * name already exists, the error is suppressed only if IPSET_FLAG_EXIST
+ * is set and the existing set has the same type name, family, revision
+ * and passes the same_set() check; the newly built set is thrown away
+ * in that case too.
+ */
+static int
+ip_set_create(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set *set, *clash;
+ ip_set_id_t index = IPSET_INVALID_ID;
+ struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
+ const char *name, *typename;
+ u8 family, revision;
+ u32 flags = flag_exist(nlh);
+ int ret = 0;
+
+ /* All mandatory attributes must be present; DATA, if any, is nested */
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_TYPENAME] == NULL ||
+ attr[IPSET_ATTR_REVISION] == NULL ||
+ attr[IPSET_ATTR_FAMILY] == NULL ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA]))))
+ return -IPSET_ERR_PROTOCOL;
+
+ name = nla_data(attr[IPSET_ATTR_SETNAME]);
+ typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+ family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+ revision = nla_get_u8(attr[IPSET_ATTR_REVISION]);
+ pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n",
+ name, typename, family_name(family), revision);
+
+ /*
+ * First, and without any locks, allocate and initialize
+ * a normal base set structure.
+ */
+ set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
+ if (!set)
+ return -ENOMEM;
+ rwlock_init(&set->lock);
+ strlcpy(set->name, name, IPSET_MAXNAMELEN);
+ atomic_set(&set->ref, 0);
+ set->family = family;
+
+ /*
+ * Next, check that we know the type, and take
+ * a reference on the type, to make sure it stays available
+ * while constructing our new set.
+ *
+ * After referencing the type, we try to create the type
+ * specific part of the set without holding any locks.
+ */
+ ret = find_set_type_get(typename, family, revision, &(set->type));
+ if (ret)
+ goto out;
+
+ /*
+ * Without holding any locks, create private part.
+ */
+ if (attr[IPSET_ATTR_DATA] &&
+ nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
+ set->type->create_policy)) {
+ ret = -IPSET_ERR_PROTOCOL;
+ goto put_out;
+ }
+
+ /* Without DATA the type gets the zero initialized tb array */
+ ret = set->type->create(set, tb, flags);
+ if (ret != 0)
+ goto put_out;
+
+ /* BTW, ret==0 here. */
+
+ /*
+ * Here, we have a valid, constructed set and we are protected
+ * by nfnl_lock. Find the first free index in ip_set_list and
+ * check clashing.
+ */
+ if ((ret = find_free_id(set->name, &index, &clash)) != 0) {
+ /* If this is the same set and requested, ignore error */
+ if (ret == -EEXIST &&
+ (flags & IPSET_FLAG_EXIST) &&
+ STREQ(set->type->name, clash->type->name) &&
+ set->type->family == clash->type->family &&
+ set->type->revision == clash->type->revision &&
+ set->variant->same_set(set, clash))
+ ret = 0;
+ /* Either way the freshly built set is not added */
+ goto cleanup;
+ }
+
+ /*
+ * Finally! Add our shiny new set to the list, and be done.
+ */
+ pr_debug("create: '%s' created with index %u!\n", set->name, index);
+ ip_set_list[index] = set;
+
+ return ret;
+
+ /* Unwind in the reverse order of construction */
+cleanup:
+ set->variant->destroy(set);
+put_out:
+ module_put(set->type->me);
+out:
+ kfree(set);
+ return ret;
+}
+
+/* Destroy sets */
+
+/*
+ * Attribute policy of the commands taking a protocol version and
+ * a set name only.
+ */
+static const struct nla_policy
+ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+};
+
+static void
+ip_set_destroy_set(ip_set_id_t index)
+{
+	struct ip_set *victim = ip_set_list[index];
+
+	pr_debug("set: %s\n", victim->name);
+
+	/* Empty the slot before the set itself is torn down */
+	ip_set_list[index] = NULL;
+
+	/* Must call it without holding any lock */
+	victim->variant->destroy(victim);
+	/* Drop the reference held on the set type module */
+	module_put(victim->type->me);
+	kfree(victim);
+}
+
+static int
+ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
+	       const struct nlmsghdr *nlh,
+	       const struct nlattr * const attr[])
+{
+	ip_set_id_t id;
+
+	if (unlikely(protocol_failed(attr)))
+		return -IPSET_ERR_PROTOCOL;
+
+	/* References are protected by the nfnl mutex */
+	if (attr[IPSET_ATTR_SETNAME]) {
+		/* A single, named set is to be destroyed */
+		id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+		if (id == IPSET_INVALID_ID)
+			return -ENOENT;
+		if (atomic_read(&ip_set_list[id]->ref))
+			return -IPSET_ERR_BUSY;
+
+		ip_set_destroy_set(id);
+		return 0;
+	}
+
+	/* Destroy all: refuse if any of the sets is still referenced */
+	for (id = 0; id < ip_set_max; id++) {
+		if (ip_set_list[id] == NULL)
+			continue;
+		if (atomic_read(&ip_set_list[id]->ref))
+			return -IPSET_ERR_BUSY;
+	}
+	for (id = 0; id < ip_set_max; id++) {
+		if (ip_set_list[id] != NULL)
+			ip_set_destroy_set(id);
+	}
+
+	return 0;
+}
+
+/* Flush sets */
+
+/*
+ * Flush a single set: the type specific flush method is called
+ * with the write lock of the set held.
+ */
+static void
+ip_set_flush_set(struct ip_set *set)
+{
+ pr_debug("set: %s\n", set->name);
+
+ write_lock_bh(&set->lock);
+ set->variant->flush(set);
+ write_unlock_bh(&set->lock);
+}
+
+/*
+ * Netlink handler of the FLUSH command.
+ *
+ * Without a set name every existing set is flushed, otherwise only
+ * the named one. Returns 0 on success, -IPSET_ERR_PROTOCOL on protocol
+ * version mismatch and -ENOENT if the named set does not exist.
+ */
+static int
+ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh,
+	     const struct nlattr * const attr[])
+{
+	ip_set_id_t i;
+
+	/* Report protocol failures the same way as the other handlers */
+	if (unlikely(protocol_failed(attr)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (!attr[IPSET_ATTR_SETNAME]) {
+		/* No name given: flush all existing sets */
+		for (i = 0; i < ip_set_max; i++)
+			if (ip_set_list[i] != NULL)
+				ip_set_flush_set(ip_set_list[i]);
+	} else {
+		i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+		if (i == IPSET_INVALID_ID)
+			return -ENOENT;
+
+		ip_set_flush_set(ip_set_list[i]);
+	}
+
+	return 0;
+}
+
+/* Rename a set */
+
+/*
+ * Attribute policy of the commands taking two set names
+ * (registered below for RENAME and SWAP).
+ */
+static const struct nla_policy
+ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_SETNAME2] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+};
+
+static int
+ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
+	      const struct nlmsghdr *nlh,
+	      const struct nlattr * const attr[])
+{
+	struct ip_set *set;
+	const char *name2;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     attr[IPSET_ATTR_SETNAME2] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+	/* A referenced set cannot be renamed */
+	if (atomic_read(&set->ref) != 0)
+		return -IPSET_ERR_REFERENCED;
+
+	/* The new name must not be used by any existing set */
+	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
+	if (find_set_id(name2) != IPSET_INVALID_ID)
+		return -IPSET_ERR_EXIST_SETNAME2;
+
+	strncpy(set->name, name2, IPSET_MAXNAMELEN);
+
+	return 0;
+}
+
+/* Swap two sets so that name/index points to the other.
+ * References and set names are also swapped.
+ *
+ * We are protected by the nfnl mutex and references are
+ * manipulated only by holding the mutex. The kernel interfaces
+ * do not hold the mutex but the pointer settings are atomic
+ * so the ip_set_list always contains valid pointers to the sets.
+ */
+
+static int
+ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct ip_set *from, *to;
+	ip_set_id_t from_id, to_id;
+	char tmp_name[IPSET_MAXNAMELEN];
+	u32 tmp_ref;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     attr[IPSET_ATTR_SETNAME2] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	/* Both sets must exist */
+	from_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (from_id == IPSET_INVALID_ID)
+		return -ENOENT;
+
+	to_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME2]));
+	if (to_id == IPSET_INVALID_ID)
+		return -IPSET_ERR_EXIST_SETNAME2;
+
+	from = ip_set_list[from_id];
+	to = ip_set_list[to_id];
+
+	/* Features must not change.
+	 * Not an artificial restriction anymore, as we must prevent
+	 * possible loops created by swapping in setlist type of sets. */
+	if (from->type->features != to->type->features ||
+	    from->type->family != to->type->family)
+		return -IPSET_ERR_TYPE_MISMATCH;
+
+	/* Exchange the names of the two sets */
+	strncpy(tmp_name, from->name, IPSET_MAXNAMELEN);
+	strncpy(from->name, to->name, IPSET_MAXNAMELEN);
+	strncpy(to->name, tmp_name, IPSET_MAXNAMELEN);
+
+	/* No magic here: ref munging protected by the nfnl_lock */
+	tmp_ref = atomic_read(&from->ref);
+	atomic_set(&from->ref, atomic_read(&to->ref));
+	atomic_set(&to->ref, tmp_ref);
+
+	/* Finally exchange the pointers in the slots */
+	ip_set_list[from_id] = to;
+	ip_set_list[to_id] = from;
+
+	return 0;
+}
+
+/* List/save set data */
+
+/*
+ * Dump modes kept in cb->args[0]:
+ * DUMP_INIT: nothing is set up yet, dump_init() is to be called
+ * DUMP_ALL: all sets are dumped (no set name was given)
+ * DUMP_ONE: the single set of index cb->args[1] is dumped
+ * DUMP_LAST: entered after the DUMP_ALL pass, for the sets having
+ * the IPSET_DUMP_LAST feature
+ */
+#define DUMP_INIT 0L
+#define DUMP_ALL 1L
+#define DUMP_ONE 2L
+#define DUMP_LAST 3L
+
+static int
+ip_set_dump_done(struct netlink_callback *cb)
+{
+	/* Nothing to release unless cb->args[2] is set */
+	if (!cb->args[2])
+		return 0;
+
+	/* Drop the reference on the set of index cb->args[1] */
+	pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name);
+	__ip_set_put((ip_set_id_t) cb->args[1]);
+
+	return 0;
+}
+
+static inline void
+dump_attrs(struct nlmsghdr *nlh)
+{
+	const struct nlattr *nla;
+	int left;
+
+	/* Debug aid: print type and length of every attribute */
+	pr_debug("dump nlmsg\n");
+	nlmsg_for_each_attr(nla, nlh, sizeof(struct nfgenmsg), left) {
+		pr_debug("type: %u, len %u\n", nla_type(nla), nla->nla_len);
+	}
+}
+
+static int
+dump_init(struct netlink_callback *cb)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
+	int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+	struct nlattr *first = (void *)nlh + min_len;
+	struct nlattr *tb[IPSET_ATTR_CMD_MAX+1];
+	ip_set_id_t id;
+
+	/* Second pass, so parser can't fail */
+	nla_parse(tb, IPSET_ATTR_CMD_MAX,
+		  first, nlh->nlmsg_len - min_len, ip_set_setname_policy);
+
+	/* cb->args[0] : dump single set/all sets
+	 *         [1] : set index
+	 *         [..]: type specific
+	 */
+
+	if (tb[IPSET_ATTR_SETNAME]) {
+		/* A single set was requested: it must exist */
+		id = find_set_id(nla_data(tb[IPSET_ATTR_SETNAME]));
+		if (id == IPSET_INVALID_ID)
+			return -ENOENT;
+
+		cb->args[0] = DUMP_ONE;
+		cb->args[1] = id;
+	} else {
+		cb->args[0] = DUMP_ALL;
+	}
+
+	return 0;
+}
+
+/*
+ * Netlink dump callback of the LIST/SAVE commands.
+ *
+ * The state of the dump is kept in cb->args:
+ * args[0]: dump mode (DUMP_INIT, DUMP_ALL, DUMP_ONE or DUMP_LAST)
+ * args[1]: index of the set being dumped
+ * args[2]: non-zero while the listing of the current set is unfinished
+ * (presumably maintained by the type specific list method)
+ *
+ * One message is started per call. Returns a negative error code on
+ * failure, otherwise skb->len.
+ */
+static int
+ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ ip_set_id_t index = IPSET_INVALID_ID, max;
+ struct ip_set *set = NULL;
+ struct nlmsghdr *nlh = NULL;
+ unsigned int flags = NETLINK_CB(cb->skb).pid ? NLM_F_MULTI : 0;
+ int ret = 0;
+
+ /* First call: work out from the request what is to be dumped */
+ if (cb->args[0] == DUMP_INIT) {
+ ret = dump_init(cb);
+ if (ret < 0) {
+ nlh = nlmsg_hdr(cb->skb);
+ /* We have to create and send the error message
+ * manually :-( */
+ if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(cb->skb, nlh, ret);
+ return ret;
+ }
+ }
+
+ /* Index beyond the last slot: there is nothing left to dump */
+ if (cb->args[1] >= ip_set_max)
+ goto out;
+
+ pr_debug("args[0]: %ld args[1]: %ld\n", cb->args[0], cb->args[1]);
+ /* A single set dump stops right after the requested index */
+ max = cb->args[0] == DUMP_ONE ? cb->args[1] + 1 : ip_set_max;
+ for (; cb->args[1] < max; cb->args[1]++) {
+ index = (ip_set_id_t) cb->args[1];
+ set = ip_set_list[index];
+ if (set == NULL) {
+ if (cb->args[0] == DUMP_ONE) {
+ ret = -ENOENT;
+ goto out;
+ }
+ continue;
+ }
+ /* When dumping all sets, we must dump "sorted"
+ * so that lists (unions of sets) are dumped last.
+ */
+ if (cb->args[0] != DUMP_ONE &&
+ !((cb->args[0] == DUMP_ALL) ^
+ (set->type->features & IPSET_DUMP_LAST)))
+ continue;
+ pr_debug("List set: %s\n", set->name);
+ if (!cb->args[2]) {
+ /* Start listing: make sure set won't be destroyed */
+ pr_debug("reference set\n");
+ __ip_set_get(index);
+ }
+ nlh = start_msg(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, flags,
+ IPSET_CMD_LIST);
+ if (!nlh) {
+ ret = -EMSGSIZE;
+ goto release_refcount;
+ }
+ /* The NLA_PUT* macros jump to nla_put_failure on error */
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+ NLA_PUT_STRING(skb, IPSET_ATTR_SETNAME, set->name);
+ switch (cb->args[2]) {
+ case 0:
+ /* Core header data */
+ NLA_PUT_STRING(skb, IPSET_ATTR_TYPENAME,
+ set->type->name);
+ NLA_PUT_U8(skb, IPSET_ATTR_FAMILY,
+ set->family);
+ NLA_PUT_U8(skb, IPSET_ATTR_REVISION,
+ set->type->revision);
+ ret = set->variant->head(set, skb);
+ if (ret < 0)
+ goto release_refcount;
+ /* Fall through and add elements */
+ default:
+ /* The elements are listed with the read lock held */
+ read_lock_bh(&set->lock);
+ ret = set->variant->list(set, skb, cb);
+ read_unlock_bh(&set->lock);
+ if (!cb->args[2]) {
+ /* Set is done, proceed with next one */
+ if (cb->args[0] == DUMP_ONE)
+ cb->args[1] = IPSET_INVALID_ID;
+ else
+ cb->args[1]++;
+ }
+ goto release_refcount;
+ }
+ }
+ goto out;
+
+nla_put_failure:
+ ret = -EFAULT;
+release_refcount:
+ /* If there was an error or set is done, release set */
+ if (ret || !cb->args[2]) {
+ pr_debug("release set %s\n", ip_set_list[index]->name);
+ __ip_set_put(index);
+ }
+
+ /* If we dump all sets, continue with dumping last ones */
+ if (cb->args[0] == DUMP_ALL && cb->args[1] >= max && !cb->args[2])
+ cb->args[0] = DUMP_LAST;
+
+out:
+ /* Close the message, if one was started in this call */
+ if (nlh) {
+ nlmsg_end(skb, nlh);
+ pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len);
+ dump_attrs(nlh);
+ }
+
+ return ret < 0 ? ret : skb->len;
+}
+
+static int
+ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	/* With a valid protocol version the netlink dump is started */
+	if (likely(!protocol_failed(attr)))
+		return netlink_dump_start(ctnl, skb, nlh,
+					  ip_set_dump_start,
+					  ip_set_dump_done);
+
+	return -IPSET_ERR_PROTOCOL;
+}
+
+/* Add, del and test */
+
+/*
+ * Attribute policy of the ADD, DEL and TEST commands: the element comes
+ * either in a single nested DATA attribute or, together with LINENO,
+ * in a nested ADT attribute.
+ */
+static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_DATA] = { .type = NLA_NESTED },
+ [IPSET_ATTR_ADT] = { .type = NLA_NESTED },
+};
+
+/*
+ * Call the type specific add/del method for one element.
+ *
+ * The method is called with the write lock of the set held. As long as
+ * it returns -EAGAIN, the set has got a resize method and the resize
+ * succeeds, the operation is retried.
+ *
+ * Returns 0 on success, also when the method reports -IPSET_ERR_EXIST
+ * and IPSET_FLAG_EXIST is set in flags. If a line number was reported
+ * and use_lineno is true, the error message is built and sent here and
+ * -EINTR is returned. Otherwise the error code is returned.
+ */
+static int
+call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
+ struct nlattr *tb[], enum ipset_adt adt,
+ u32 flags, bool use_lineno)
+{
+ int ret, retried = 0;
+ u32 lineno = 0;
+ bool eexist = flags & IPSET_FLAG_EXIST;
+
+ do {
+ write_lock_bh(&set->lock);
+ ret = set->variant->uadt(set, tb, adt, &lineno, flags);
+ write_unlock_bh(&set->lock);
+ } while (ret == -EAGAIN &&
+ set->variant->resize &&
+ (ret = set->variant->resize(set, retried++)) == 0);
+
+ if (!ret || (ret == -IPSET_ERR_EXIST && eexist))
+ return 0;
+ if (lineno && use_lineno) {
+ /* Error in restore/batch mode: send back lineno */
+ struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb);
+ struct sk_buff *skb2;
+ struct nlmsgerr *errmsg;
+ size_t payload = sizeof(*errmsg) + nlmsg_len(nlh);
+ int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *cmdattr;
+ u32 *errline;
+
+ skb2 = nlmsg_new(payload, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+ rep = __nlmsg_put(skb2, NETLINK_CB(skb).pid,
+ nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
+ errmsg = nlmsg_data(rep);
+ errmsg->error = ret;
+ /* The error message carries a copy of the original request */
+ memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
+ cmdattr = (void *)&errmsg->msg + min_len;
+
+ /* Parse the attributes of the copy, not of the original */
+ nla_parse(cda, IPSET_ATTR_CMD_MAX,
+ cmdattr, nlh->nlmsg_len - min_len,
+ ip_set_adt_policy);
+
+ errline = nla_data(cda[IPSET_ATTR_LINENO]);
+
+ /* Overwrite LINENO in the copy with the failing line number */
+ *errline = lineno;
+
+ netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ /* Signal netlink not to send its ACK/errmsg. */
+ return -EINTR;
+ }
+
+ return ret;
+}
+
+/*
+ * Netlink handler of the ADD command.
+ *
+ * The request carries either a single nested DATA attribute (one
+ * element) or a nested ADT attribute, which holds DATA attributes and
+ * must come together with LINENO. The elements are added one by one,
+ * the first failure stops the processing.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set *set;
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ const struct nlattr *nla;
+ u32 flags = flag_exist(nlh);
+ bool use_lineno;
+ int ret = 0;
+
+ /* Exactly one of DATA and ADT must be present, as nested attribute */
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ !((attr[IPSET_ATTR_DATA] != NULL) ^
+ (attr[IPSET_ATTR_ADT] != NULL)) ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA])) ||
+ (attr[IPSET_ATTR_ADT] != NULL &&
+ (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+ attr[IPSET_ATTR_LINENO] == NULL))))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ /* Line numbers are reported back only if the request carries one */
+ use_lineno = !!attr[IPSET_ATTR_LINENO];
+ if (attr[IPSET_ATTR_DATA]) {
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+ attr[IPSET_ATTR_DATA],
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
+ use_lineno);
+ } else {
+ int nla_rem;
+
+ nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+ /* Start every element with a clean attribute table */
+ memset(tb, 0, sizeof(tb));
+ if (nla_type(nla) != IPSET_ATTR_DATA ||
+ !flag_nested(nla) ||
+ nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
+ flags, use_lineno);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Netlink handler of the DEL command.
+ *
+ * The request carries either a single nested DATA attribute (one
+ * element) or a nested ADT attribute, which holds DATA attributes and
+ * must come together with LINENO. The elements are deleted one by one,
+ * the first failure stops the processing.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct ip_set *set;
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	const struct nlattr *nla;
+	u32 flags = flag_exist(nlh);
+	bool use_lineno;
+	int ret = 0;
+
+	/* Exactly one of DATA and ADT must be present, as nested attribute */
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     !((attr[IPSET_ATTR_DATA] != NULL) ^
+		       (attr[IPSET_ATTR_ADT] != NULL)) ||
+		     (attr[IPSET_ATTR_DATA] != NULL &&
+		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
+		     (attr[IPSET_ATTR_ADT] != NULL &&
+		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+		       attr[IPSET_ATTR_LINENO] == NULL))))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	/* Line numbers are reported back only if the request carries one */
+	use_lineno = !!attr[IPSET_ATTR_LINENO];
+	if (attr[IPSET_ATTR_DATA]) {
+		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+				     attr[IPSET_ATTR_DATA],
+				     set->type->adt_policy))
+			return -IPSET_ERR_PROTOCOL;
+		ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
+			      use_lineno);
+	} else {
+		int nla_rem;
+
+		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+			/*
+			 * Clear the whole attribute table, not just its
+			 * first entry: sizeof(*tb) is the size of a single
+			 * pointer only.
+			 */
+			memset(tb, 0, sizeof(tb));
+			if (nla_type(nla) != IPSET_ATTR_DATA ||
+			    !flag_nested(nla) ||
+			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+					     set->type->adt_policy))
+				return -IPSET_ERR_PROTOCOL;
+			ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
+				      flags, use_lineno);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return ret;
+}
+
+static int
+ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh,
+	     const struct nlattr * const attr[])
+{
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	struct ip_set *set;
+	int res;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     attr[IPSET_ATTR_DATA] == NULL ||
+		     !flag_nested(attr[IPSET_ATTR_DATA])))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
+			     set->type->adt_policy))
+		return -IPSET_ERR_PROTOCOL;
+
+	/* Testing needs the read lock of the set only */
+	read_lock_bh(&set->lock);
+	res = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0);
+	read_unlock_bh(&set->lock);
+
+	/* Userspace can't trigger element to be re-added */
+	if (res == -EAGAIN)
+		return 0;
+	if (res < 0)
+		return res;
+
+	/* Positive result: match; zero: the element is not in the set */
+	return res > 0 ? 0 : -IPSET_ERR_EXIST;
+}
+
+/* Get header data of a set */
+
+static int
+ip_set_header(struct sock *ctnl, struct sk_buff *skb,
+	      const struct nlmsghdr *nlh,
+	      const struct nlattr * const attr[])
+{
+	const struct ip_set *set;
+	struct sk_buff *reply;
+	struct nlmsghdr *rnlh;
+	int ret;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	rnlh = start_msg(reply, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0,
+			 IPSET_CMD_HEADER);
+	if (!rnlh)
+		goto nlmsg_failure;
+	/* The NLA_PUT* macros jump to nla_put_failure on error */
+	NLA_PUT_U8(reply, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+	NLA_PUT_STRING(reply, IPSET_ATTR_SETNAME, set->name);
+	NLA_PUT_STRING(reply, IPSET_ATTR_TYPENAME, set->type->name);
+	NLA_PUT_U8(reply, IPSET_ATTR_FAMILY, set->family);
+	NLA_PUT_U8(reply, IPSET_ATTR_REVISION, set->type->revision);
+	nlmsg_end(reply, rnlh);
+
+	/* Only a negative result of the unicast is passed on */
+	ret = netlink_unicast(ctnl, reply, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	return ret < 0 ? ret : 0;
+
+nla_put_failure:
+	nlmsg_cancel(reply, rnlh);
+nlmsg_failure:
+	kfree_skb(reply);
+	return -EMSGSIZE;
+}
+
+/* Get type data */
+
+/*
+ * Attribute policy of the TYPE command: protocol version,
+ * type name and family.
+ */
+static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
+};
+
+static int
+ip_set_type(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct sk_buff *reply;
+	struct nlmsghdr *rnlh;
+	const char *typename;
+	u8 family, min, max;
+	int ret;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_TYPENAME] == NULL ||
+		     attr[IPSET_ATTR_FAMILY] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	/* Look up the revision range of the type */
+	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+	typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+	ret = find_set_type_minmax(typename, family, &min, &max);
+	if (ret)
+		return ret;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	rnlh = start_msg(reply, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0,
+			 IPSET_CMD_TYPE);
+	if (!rnlh)
+		goto nlmsg_failure;
+	/* The NLA_PUT* macros jump to nla_put_failure on error */
+	NLA_PUT_U8(reply, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+	NLA_PUT_STRING(reply, IPSET_ATTR_TYPENAME, typename);
+	NLA_PUT_U8(reply, IPSET_ATTR_FAMILY, family);
+	NLA_PUT_U8(reply, IPSET_ATTR_REVISION, max);
+	NLA_PUT_U8(reply, IPSET_ATTR_REVISION_MIN, min);
+	nlmsg_end(reply, rnlh);
+
+	pr_debug("Send TYPE, nlmsg_len: %u\n", rnlh->nlmsg_len);
+	/* Only a negative result of the unicast is passed on */
+	ret = netlink_unicast(ctnl, reply, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	return ret < 0 ? ret : 0;
+
+nla_put_failure:
+	nlmsg_cancel(reply, rnlh);
+nlmsg_failure:
+	kfree_skb(reply);
+	return -EMSGSIZE;
+}
+
+/* Get protocol version */
+
+/* Attribute policy of the PROTOCOL command: the protocol version only */
+static const struct nla_policy
+ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+};
+
+static int
+ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
+		const struct nlmsghdr *nlh,
+		const struct nlattr * const attr[])
+{
+	struct sk_buff *reply;
+	struct nlmsghdr *rnlh;
+	int ret;
+
+	/* The attribute must be present, its value is not checked here */
+	if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	rnlh = start_msg(reply, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0,
+			 IPSET_CMD_PROTOCOL);
+	if (!rnlh)
+		goto nlmsg_failure;
+	/* NLA_PUT_U8 jumps to nla_put_failure on error */
+	NLA_PUT_U8(reply, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+	nlmsg_end(reply, rnlh);
+
+	/* Only a negative result of the unicast is passed on */
+	ret = netlink_unicast(ctnl, reply, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	return ret < 0 ? ret : 0;
+
+nla_put_failure:
+	nlmsg_cancel(reply, rnlh);
+nlmsg_failure:
+	kfree_skb(reply);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dispatch table of the ipset netlink commands: the handler and the
+ * attribute policy of every command. LIST and SAVE share the dump
+ * handler.
+ */
+static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
+ [IPSET_CMD_CREATE] = {
+ .call = ip_set_create,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_create_policy,
+ },
+ [IPSET_CMD_DESTROY] = {
+ .call = ip_set_destroy,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_FLUSH] = {
+ .call = ip_set_flush,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_RENAME] = {
+ .call = ip_set_rename,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname2_policy,
+ },
+ [IPSET_CMD_SWAP] = {
+ .call = ip_set_swap,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname2_policy,
+ },
+ [IPSET_CMD_LIST] = {
+ .call = ip_set_dump,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_SAVE] = {
+ .call = ip_set_dump,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_ADD] = {
+ .call = ip_set_uadd,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_DEL] = {
+ .call = ip_set_udel,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_TEST] = {
+ .call = ip_set_utest,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_HEADER] = {
+ .call = ip_set_header,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_TYPE] = {
+ .call = ip_set_type,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_type_policy,
+ },
+ [IPSET_CMD_PROTOCOL] = {
+ .call = ip_set_protocol,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_protocol_policy,
+ },
+};
+
+/*
+ * The nfnetlink subsystem descriptor of ipset; registered in
+ * ip_set_init() and unregistered in ip_set_fini().
+ */
+static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = {
+ .name = "ip_set",
+ .subsys_id = NFNL_SUBSYS_IPSET,
+ .cb_count = IPSET_MSG_MAX,
+ .cb = ip_set_netlink_subsys_cb,
+};
+
+/* Interface to iptables/ip6tables */
+
+/*
+ * getsockopt() handler of SO_IP_SET.
+ *
+ * The user buffer starts with an operation code:
+ * IP_SET_OP_VERSION: report the protocol version
+ * IP_SET_OP_GET_BYNAME: look up the index of a set by its name
+ * IP_SET_OP_GET_BYINDEX: look up the name of a set by its index
+ *
+ * Returns 0 on success or a negative error code; -EFAULT is returned
+ * also when the result cannot be copied back to userspace.
+ */
+static int
+ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
+{
+	unsigned *op;
+	void *data;
+	int copylen = *len, ret = 0;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+	if (optval != SO_IP_SET)
+		return -EBADF;
+	/*
+	 * Reject negative lengths explicitly: compared with a size_t
+	 * they are converted to a huge unsigned value and would pass.
+	 */
+	if (*len < 0 || (size_t)*len < sizeof(unsigned))
+		return -EINVAL;
+
+	data = vmalloc(*len);
+	if (!data)
+		return -ENOMEM;
+	if (copy_from_user(data, user, *len) != 0) {
+		ret = -EFAULT;
+		goto done;
+	}
+	op = (unsigned *) data;
+
+	if (*op < IP_SET_OP_VERSION) {
+		/* Check the version at the beginning of operations */
+		struct ip_set_req_version *req_version = data;
+
+		/* The version field must lie inside the copied buffer */
+		if ((size_t)*len < sizeof(struct ip_set_req_version)) {
+			ret = -EINVAL;
+			goto done;
+		}
+		if (req_version->version != IPSET_PROTOCOL) {
+			ret = -EPROTO;
+			goto done;
+		}
+	}
+
+	switch (*op) {
+	case IP_SET_OP_VERSION: {
+		struct ip_set_req_version *req_version = data;
+
+		if (*len != sizeof(struct ip_set_req_version)) {
+			ret = -EINVAL;
+			goto done;
+		}
+
+		req_version->version = IPSET_PROTOCOL;
+		/* copy_to_user returns the number of bytes NOT copied */
+		if (copy_to_user(user, req_version,
+				 sizeof(struct ip_set_req_version)))
+			ret = -EFAULT;
+		goto done;
+	}
+	case IP_SET_OP_GET_BYNAME: {
+		struct ip_set_req_get_set *req_get = data;
+
+		if (*len != sizeof(struct ip_set_req_get_set)) {
+			ret = -EINVAL;
+			goto done;
+		}
+		/* Do not trust userspace to terminate the name */
+		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
+		nfnl_lock();
+		req_get->set.index = find_set_id(req_get->set.name);
+		nfnl_unlock();
+		goto copy;
+	}
+	case IP_SET_OP_GET_BYINDEX: {
+		struct ip_set_req_get_set *req_get = data;
+
+		if (*len != sizeof(struct ip_set_req_get_set) ||
+		    req_get->set.index >= ip_set_max) {
+			ret = -EINVAL;
+			goto done;
+		}
+		/* An empty slot is reported with an empty name */
+		nfnl_lock();
+		strncpy(req_get->set.name,
+			ip_set_list[req_get->set.index]
+				? ip_set_list[req_get->set.index]->name : "",
+			IPSET_MAXNAMELEN);
+		nfnl_unlock();
+		goto copy;
+	}
+	default:
+		ret = -EBADMSG;
+		goto done;
+	}	/* end of switch(op) */
+
+copy:
+	/* A partial copy is a failure, not a success */
+	if (copy_to_user(user, data, copylen))
+		ret = -EFAULT;
+
+done:
+	vfree(data);
+	return ret;
+}
+
+/*
+ * Socket option registration: only a "get" handler is provided,
+ * for the option range starting at SO_IP_SET on PF_INET sockets.
+ */
+static struct nf_sockopt_ops so_set __read_mostly = {
+ .pf = PF_INET,
+ .get_optmin = SO_IP_SET,
+ .get_optmax = SO_IP_SET + 1,
+ .get = &ip_set_sockfn_get,
+ .owner = THIS_MODULE,
+};
+
+static int __init
+ip_set_init(void)
+{
+	int ret;
+
+	/* max_sets overrides the default, when it is set */
+	if (max_sets)
+		ip_set_max = max_sets;
+	/* IPSET_INVALID_ID itself can never be a valid index */
+	if (ip_set_max >= IPSET_INVALID_ID)
+		ip_set_max = IPSET_INVALID_ID - 1;
+
+	ip_set_list = kzalloc(sizeof(struct ip_set *) * ip_set_max,
+			      GFP_KERNEL);
+	if (!ip_set_list) {
+		pr_err("ip_set: Unable to create ip_set_list\n");
+		return -ENOMEM;
+	}
+
+	ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+	if (ret != 0) {
+		pr_err("ip_set: cannot register with nfnetlink.\n");
+		goto err_free_list;
+	}
+
+	ret = nf_register_sockopt(&so_set);
+	if (ret != 0) {
+		pr_err("SO_SET registry failed: %d\n", ret);
+		goto err_unregister;
+	}
+
+	pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL);
+	return 0;
+
+	/* Undo the completed steps in reverse order */
+err_unregister:
+	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+err_free_list:
+	kfree(ip_set_list);
+	return ret;
+}
+
+/*
+ * Module exit: undo ip_set_init() in reverse order - unregister the
+ * socket option and the netlink subsystem, then free the set list.
+ */
+static void __exit
+ip_set_fini(void)
+{
+ /* There can't be any existing set */
+ nf_unregister_sockopt(&so_set);
+ nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+ kfree(ip_set_list);
+ pr_debug("these are the famous last words\n");
+}
+
+module_init(ip_set_init);
+module_exit(ip_set_fini);
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
new file mode 100644
index 00000000000..8d522721268
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -0,0 +1,141 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Get Layer-4 data from the packets */
+
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/ipset/ip_set_getport.h>
+
+/* We must handle non-linear skbs */
+static bool
+get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
+	 bool src, __be16 *port, u8 *proto)
+{
+	switch (protocol) {
+	case IPPROTO_TCP: {
+		struct tcphdr buf;
+		const struct tcphdr *tcp =
+			skb_header_pointer(skb, protooff, sizeof(buf), &buf);
+
+		/* No choice either */
+		if (tcp == NULL)
+			return false;
+
+		if (src)
+			*port = tcp->source;
+		else
+			*port = tcp->dest;
+		break;
+	}
+	case IPPROTO_UDP: {
+		struct udphdr buf;
+		const struct udphdr *udp =
+			skb_header_pointer(skb, protooff, sizeof(buf), &buf);
+
+		/* No choice either */
+		if (udp == NULL)
+			return false;
+
+		if (src)
+			*port = udp->source;
+		else
+			*port = udp->dest;
+		break;
+	}
+	case IPPROTO_ICMP: {
+		struct icmphdr buf;
+		const struct icmphdr *icmp =
+			skb_header_pointer(skb, protooff, sizeof(buf), &buf);
+
+		if (icmp == NULL)
+			return false;
+
+		/* Type and code are packed into the port value */
+		*port = (__force __be16)htons((icmp->type << 8) | icmp->code);
+		break;
+	}
+	case IPPROTO_ICMPV6: {
+		struct icmp6hdr buf;
+		const struct icmp6hdr *icmp6 =
+			skb_header_pointer(skb, protooff, sizeof(buf), &buf);
+
+		if (icmp6 == NULL)
+			return false;
+
+		/* Type and code are packed into the port value */
+		*port = (__force __be16)
+			htons((icmp6->icmp6_type << 8) | icmp6->icmp6_code);
+		break;
+	}
+	default:
+		/* Other protocols: *port is left untouched */
+		break;
+	}
+
+	*proto = protocol;
+	return true;
+}
+
+bool
+ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
+		    __be16 *port, u8 *proto)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	int protocol = iph->protocol;
+
+	/* See comments at tcp_match in ip_tables.c */
+	if (protocol <= 0)
+		return false;
+	/* Packets with a non-zero fragment offset are rejected */
+	if (ntohs(iph->frag_off) & IP_OFFSET)
+		return false;
+
+	return get_port(skb, protocol, ip_hdrlen(skb), src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip4_port);
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+bool
+ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
+		    __be16 *port, u8 *proto)
+{
+	u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+	/* Skip the extension headers to reach the upper layer header */
+	int off = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr);
+
+	if (off < 0)
+		return false;
+
+	return get_port(skb, nexthdr, off, src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip6_port);
+#endif
+
+bool
+ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
+{
+	bool found;
+	u8 proto;
+
+	/* Pick the extractor of the protocol family */
+	if (pf == AF_INET)
+		found = ip_set_get_ip4_port(skb, src, port, &proto);
+	else if (pf == AF_INET6)
+		found = ip_set_get_ip6_port(skb, src, port, &proto);
+	else
+		return false;
+
+	if (!found)
+		return false;
+
+	/* Only TCP and UDP are accepted here */
+	return proto == IPPROTO_TCP || proto == IPPROTO_UDP;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip_port);
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
new file mode 100644
index 00000000000..43bcce20012
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -0,0 +1,464 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip");
+
+/* Type specific function prefix */
+#define TYPE hash_ip
+
+static bool
+hash_ip_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ip4_same_set hash_ip_same_set
+#define hash_ip6_same_set hash_ip_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_ip4_elem {
+ __be32 ip;
+};
+
+/* Member elements with timeout support */
+struct hash_ip4_telem {
+ __be32 ip;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_ip4_data_equal(const struct hash_ip4_elem *ip1,
+ const struct hash_ip4_elem *ip2)
+{
+ return ip1->ip == ip2->ip;
+}
+
+static inline bool
+hash_ip4_data_isnull(const struct hash_ip4_elem *elem)
+{
+ return elem->ip == 0;
+}
+
+static inline void
+hash_ip4_data_copy(struct hash_ip4_elem *dst, const struct hash_ip4_elem *src)
+{
+ dst->ip = src->ip;
+}
+
+/* Zero valued IP addresses cannot be stored */
+static inline void
+hash_ip4_data_zero_out(struct hash_ip4_elem *elem)
+{
+ elem->ip = 0;
+}
+
+/* Put the IPv4 address of an element into a netlink message.
+ * Returns false on success and true on failure; the failure path is only
+ * reached through the nla_put_failure label (presumably jumped to by the
+ * NLA_PUT_* helper when the attribute cannot be added).
+ */
+static inline bool
+hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *data)
+{
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+/* Put the IPv4 address and the timeout value of a timeout-aware element
+ * into a netlink message. The element is passed as the plain element type
+ * and viewed as struct hash_ip4_telem here.
+ * Returns false on success, true when the nla_put_failure label is reached.
+ */
+static bool
+hash_ip4_data_tlist(struct sk_buff *skb, const struct hash_ip4_elem *data)
+{
+	const struct hash_ip4_telem *te = (const struct hash_ip4_telem *)data;
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, te->ip);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(te->timeout)));
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+#define IP_SET_HASH_WITH_NETMASK
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Kernel side add/delete/test: take the source or destination address of
+ * the packet (selected by IPSET_DIM_ONE_SRC in flags), apply the netmask
+ * of the set and run the requested operation with the set's timeout.
+ * Returns -EINVAL for a zero valued (masked) address.
+ */
+static int
+hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+	      enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	__be32 addr;
+
+	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &addr);
+	addr &= ip_set_netmask(map->netmask);
+	if (!addr)
+		return -EINVAL;
+
+	return set->variant->adt[adt](set, &addr, map->timeout);
+}
+
+/* Add, delete or test IPv4 elements on behalf of userspace.
+ *
+ * IPSET_ATTR_IP is mandatory. For add and delete a range may be requested
+ * with IPSET_ATTR_IP_TO or IPSET_ATTR_CIDR and the range is walked in steps
+ * matching the netmask of the set; a test checks the single masked address
+ * only. Zero valued addresses are rejected with -IPSET_ERR_HASH_ELEM.
+ * Returns 0 on success and a negative error code otherwise.
+ */
+static int
+hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
+	      enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	u32 ip, ip_to, hosts, timeout = h->timeout;
+	__be32 nip;
+	int ret = 0;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+	if (ret)
+		return ret;
+
+	ip &= ip_set_hostmask(h->netmask);
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		/* A timeout attribute is refused on sets without timeout */
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	if (adt == IPSET_TEST) {
+		nip = htonl(ip);
+		if (nip == 0)
+			return -IPSET_ERR_HASH_ELEM;
+		return adtfn(set, &nip, timeout);
+	}
+
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip &= ip_set_hostmask(cidr);
+		ip_to = ip | ~ip_set_hostmask(cidr);
+	} else
+		ip_to = ip;
+
+	/* Number of addresses covered by one stored element. The constant
+	 * is unsigned so that the shift stays well defined for netmask == 1,
+	 * where 2 << 30 would overflow a signed int.
+	 */
+	hosts = h->netmask == 32 ? 1 : 2U << (32 - h->netmask - 1);
+
+	for (; !before(ip_to, ip); ip += hosts) {
+		nip = htonl(ip);
+		if (nip == 0)
+			return -IPSET_ERR_HASH_ELEM;
+		ret = adtfn(set, &nip, timeout);
+
+		/* ip_set_eexist() decides whether this error is ignored */
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+/* Two hash:ip sets are considered the same when maxelem, timeout and
+ * netmask all agree.
+ */
+static bool
+hash_ip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *x = a->data;
+	const struct ip_set_hash *y = b->data;
+
+	/* Resizing changes htable_bits, so we ignore it */
+	if (x->maxelem != y->maxelem)
+		return false;
+	if (x->timeout != y->timeout)
+		return false;
+
+	return x->netmask == y->netmask;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_ip6_elem {
+ union nf_inet_addr ip;
+};
+
+struct hash_ip6_telem {
+ union nf_inet_addr ip;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
+ const struct hash_ip6_elem *ip2)
+{
+ return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0;
+}
+
+static inline bool
+hash_ip6_data_isnull(const struct hash_ip6_elem *elem)
+{
+ return ipv6_addr_any(&elem->ip.in6);
+}
+
+static inline void
+hash_ip6_data_copy(struct hash_ip6_elem *dst, const struct hash_ip6_elem *src)
+{
+ ipv6_addr_copy(&dst->ip.in6, &src->ip.in6);
+}
+
+static inline void
+hash_ip6_data_zero_out(struct hash_ip6_elem *elem)
+{
+ ipv6_addr_set(&elem->ip.in6, 0, 0, 0, 0);
+}
+
+/* Mask an IPv6 address in place with the netmask belonging to prefix,
+ * one 32 bit word at a time.
+ */
+static inline void
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+	unsigned int i;
+
+	for (i = 0; i < 4; i++)
+		ip->ip6[i] &= ip_set_netmask6(prefix)[i];
+}
+
+/* Put the IPv6 address of an element into a netlink message.
+ * Returns false on success, true when the nla_put_failure label is reached.
+ */
+static bool
+hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *data)
+{
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+/* Put the IPv6 address and the timeout value of a timeout-aware element
+ * into a netlink message; the element is viewed as struct hash_ip6_telem.
+ * Returns false on success, true when the nla_put_failure label is reached.
+ */
+static bool
+hash_ip6_data_tlist(struct sk_buff *skb, const struct hash_ip6_elem *data)
+{
+	const struct hash_ip6_telem *te = (const struct hash_ip6_telem *)data;
+
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &te->ip);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(te->timeout)));
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Kernel side add/delete/test for IPv6: take the source or destination
+ * address of the packet (selected by IPSET_DIM_ONE_SRC in flags), apply
+ * the netmask of the set and run the requested operation.
+ * Returns -EINVAL when the masked address is the unspecified address.
+ */
+static int
+hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+	      enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	union nf_inet_addr addr;
+
+	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &addr.in6);
+	ip6_netmask(&addr, map->netmask);
+	if (ipv6_addr_any(&addr.in6))
+		return -EINVAL;
+
+	return set->variant->adt[adt](set, &addr, map->timeout);
+}
+
+static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+};
+
+/* Add, delete or test a single IPv6 element on behalf of userspace.
+ *
+ * IPSET_ATTR_IP is mandatory; IPSET_ATTR_IP_TO and IPSET_ATTR_CIDR are
+ * refused, so no ranges can be given. The address is masked with the
+ * netmask of the set and the unspecified address is rejected.
+ * Returns 0 on success and a negative error code otherwise.
+ */
+static int
+hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
+	      enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	u32 timeout = map->timeout;
+	union nf_inet_addr addr;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     tb[IPSET_ATTR_IP_TO] ||
+		     tb[IPSET_ATTR_CIDR]))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &addr);
+	if (ret)
+		return ret;
+
+	ip6_netmask(&addr, map->netmask);
+	if (ipv6_addr_any(&addr.in6))
+		return -IPSET_ERR_HASH_ELEM;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		/* A timeout attribute is refused on sets without timeout */
+		if (!with_timeout(map->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	ret = adtfn(set, &addr, timeout);
+	if (ip_set_eexist(ret, flags))
+		return 0;
+
+	return ret;
+}
+
+/* Create hash:ip type of sets */
+
+/* Create a hash:ip type of set.
+ *
+ * Optional attributes: IPSET_ATTR_HASHSIZE (raised to the minimal hash
+ * size when smaller), IPSET_ATTR_MAXELEM, IPSET_ATTR_NETMASK and
+ * IPSET_ATTR_TIMEOUT. With a timeout attribute the timeout-aware variant
+ * is selected and its garbage collector initialized.
+ * Returns 0 on success and a negative error code otherwise.
+ */
+static int
+hash_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+	u8 netmask, hbits;
+	struct ip_set_hash *h;
+
+	if (!(set->family == AF_INET || set->family == AF_INET6))
+		return -IPSET_ERR_INVALID_FAMILY;
+	netmask = set->family == AF_INET ? 32 : 128;
+	pr_debug("Create set %s with family %s\n",
+		 set->name, set->family == AF_INET ? "inet" : "inet6");
+
+	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_HASHSIZE]) {
+		hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+		if (hashsize < IPSET_MIMINAL_HASHSIZE)
+			hashsize = IPSET_MIMINAL_HASHSIZE;
+	}
+
+	if (tb[IPSET_ATTR_MAXELEM])
+		maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+	if (tb[IPSET_ATTR_NETMASK]) {
+		netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+		if ((set->family == AF_INET && netmask > 32) ||
+		    (set->family == AF_INET6 && netmask > 128) ||
+		    netmask == 0)
+			return -IPSET_ERR_INVALID_NETMASK;
+	}
+
+	hbits = htable_bits(hashsize);
+	/* The number of buckets must fit into the u32 used by jhash and the
+	 * table size in bytes must fit into size_t, otherwise the size
+	 * computed for the allocation below would overflow.
+	 */
+	if (hbits > 31 ||
+	    jhash_size(hbits) >
+	    (((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket))
+		return -ENOMEM;
+
+	h = kzalloc(sizeof(*h), GFP_KERNEL);
+	if (!h)
+		return -ENOMEM;
+
+	h->maxelem = maxelem;
+	h->netmask = netmask;
+	get_random_bytes(&h->initval, sizeof(h->initval));
+	h->timeout = IPSET_NO_TIMEOUT;
+
+	h->table = ip_set_alloc(
+			sizeof(struct htable)
+			+ jhash_size(hbits) * sizeof(struct hbucket));
+	if (!h->table) {
+		kfree(h);
+		return -ENOMEM;
+	}
+	h->table->htable_bits = hbits;
+
+	set->data = h;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+		set->variant = set->family == AF_INET
+			? &hash_ip4_tvariant : &hash_ip6_tvariant;
+
+		if (set->family == AF_INET)
+			hash_ip4_gc_init(set);
+		else
+			hash_ip6_gc_init(set);
+	} else {
+		set->variant = set->family == AF_INET
+			? &hash_ip4_variant : &hash_ip6_variant;
+	}
+
+	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+		 set->name, jhash_size(h->table->htable_bits),
+		 h->table->htable_bits, h->maxelem, set->data, h->table);
+
+	return 0;
+}
+
+static struct ip_set_type hash_ip_type __read_mostly = {
+ .name = "hash:ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP,
+ .dimension = IPSET_DIM_ONE,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = hash_ip_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ip_init(void)
+{
+ return ip_set_type_register(&hash_ip_type);
+}
+
+static void __exit
+hash_ip_fini(void)
+{
+ ip_set_type_unregister(&hash_ip_type);
+}
+
+module_init(hash_ip_init);
+module_exit(hash_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
new file mode 100644
index 00000000000..adbe787ea5d
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -0,0 +1,544 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip,port type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip,port");
+
+/* Type specific function prefix */
+#define TYPE hash_ipport
+
+static bool
+hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ipport4_same_set hash_ipport_same_set
+#define hash_ipport6_same_set hash_ipport_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_ipport4_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+/* Member elements with timeout support */
+struct hash_ipport4_telem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout;
+};
+
+/* Elements are equal when address, port and protocol all match */
+static inline bool
+hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
+			const struct hash_ipport4_elem *ip2)
+{
+	if (ip1->ip != ip2->ip)
+		return false;
+	if (ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+static inline bool
+hash_ipport4_data_isnull(const struct hash_ipport4_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_ipport4_data_copy(struct hash_ipport4_elem *dst,
+ const struct hash_ipport4_elem *src)
+{
+ dst->ip = src->ip;
+ dst->port = src->port;
+ dst->proto = src->proto;
+}
+
+static inline void
+hash_ipport4_data_zero_out(struct hash_ipport4_elem *elem)
+{
+ elem->proto = 0;
+}
+
+/* Put address, port and protocol of an element into a netlink message.
+ * Returns false on success, true when the nla_put_failure label is reached.
+ */
+static bool
+hash_ipport4_data_list(struct sk_buff *skb,
+		       const struct hash_ipport4_elem *data)
+{
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+/* Put address, port, protocol and timeout value of a timeout-aware element
+ * into a netlink message; the element is viewed as
+ * struct hash_ipport4_telem, which starts with the same members as the
+ * plain element.
+ * Returns false on success, true when the nla_put_failure label is reached.
+ */
+static bool
+hash_ipport4_data_tlist(struct sk_buff *skb,
+			const struct hash_ipport4_elem *data)
+{
+	const struct hash_ipport4_telem *te =
+		(const struct hash_ipport4_telem *)data;
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, te->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, te->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, te->proto);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(te->timeout)));
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Kernel side add/delete/test: build the element from the port/protocol
+ * (direction selected by IPSET_DIM_TWO_SRC) and the address (direction
+ * selected by IPSET_DIM_ONE_SRC) of the packet.
+ * Returns -EINVAL when the port lookup fails.
+ */
+static int
+hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	struct hash_ipport4_elem elem = { };
+
+	if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &elem.port, &elem.proto))
+		return -EINVAL;
+
+	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &elem.ip);
+
+	return set->variant->adt[adt](set, &elem, map->timeout);
+}
+
+/* Add, delete or test IPv4 address,port elements on behalf of userspace.
+ *
+ * IPSET_ATTR_IP, IPSET_ATTR_PORT and IPSET_ATTR_PROTO are mandatory. The
+ * port is forced to zero for protocols other than TCP, UDP and ICMP.
+ * Address and port ranges are expanded for add and delete of TCP or UDP
+ * elements only; everything else is handled as a single element.
+ * Returns 0 on success and a negative error code otherwise.
+ */
+static int
+hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipport4_elem data = { };
+	u32 timeout = map->timeout;
+	u32 ip, ip_to, p, port, port_to;
+	bool tcpudp, ranged;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	if (!tb[IPSET_ATTR_PORT])
+		return -IPSET_ERR_PROTOCOL;
+	data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+
+	if (!tb[IPSET_ATTR_PROTO])
+		return -IPSET_ERR_MISSING_PROTO;
+	data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+	if (data.proto == 0)
+		return -IPSET_ERR_INVALID_PROTO;
+
+	/* Only TCP, UDP and ICMP elements keep the port value */
+	if (data.proto != IPPROTO_UDP &&
+	    data.proto != IPPROTO_TCP &&
+	    data.proto != IPPROTO_ICMP)
+		data.port = 0;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(map->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	tcpudp = data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP;
+	ranged = tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+		 tb[IPSET_ATTR_PORT_TO];
+
+	/* Single element: test, non TCP/UDP protocol or no range at all */
+	if (adt == IPSET_TEST || !tcpudp || !ranged) {
+		ret = adtfn(set, &data, timeout);
+		if (ip_set_eexist(ret, flags))
+			return 0;
+		return ret;
+	}
+
+	ip = ntohl(data.ip);
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip &= ip_set_hostmask(cidr);
+		ip_to = ip | ~ip_set_hostmask(cidr);
+	} else {
+		ip_to = ip;
+	}
+
+	port = ntohs(data.port);
+	port_to = port;
+	if (tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port > port_to)
+			swap(port, port_to);
+	}
+
+	for (; !before(ip_to, ip); ip++) {
+		for (p = port; p <= port_to; p++) {
+			data.ip = htonl(ip);
+			data.port = htons(p);
+			ret = adtfn(set, &data, timeout);
+
+			/* ip_set_eexist() decides whether to ignore it */
+			if (ret && !ip_set_eexist(ret, flags))
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/* Two hash:ip,port sets are considered the same when maxelem and timeout
+ * agree.
+ */
+static bool
+hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *x = a->data;
+	const struct ip_set_hash *y = b->data;
+
+	/* Resizing changes htable_bits, so we ignore it */
+	if (x->maxelem != y->maxelem)
+		return false;
+
+	return x->timeout == y->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_ipport6_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+struct hash_ipport6_telem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout;
+};
+
+/* Elements are equal when address, port and protocol all match */
+static inline bool
+hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
+			const struct hash_ipport6_elem *ip2)
+{
+	if (ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) != 0)
+		return false;
+	if (ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+static inline bool
+hash_ipport6_data_isnull(const struct hash_ipport6_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_ipport6_data_copy(struct hash_ipport6_elem *dst,
+ const struct hash_ipport6_elem *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+static inline void
+hash_ipport6_data_zero_out(struct hash_ipport6_elem *elem)
+{
+ elem->proto = 0;
+}
+
+/* Put address, port and protocol of an element into a netlink message.
+ * Returns false on success, true when the nla_put_failure label is reached.
+ */
+static bool
+hash_ipport6_data_list(struct sk_buff *skb,
+		       const struct hash_ipport6_elem *data)
+{
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+/* Put address, port, protocol and timeout value of a timeout-aware element
+ * into a netlink message; the element is viewed as
+ * struct hash_ipport6_telem, which starts with the same members as the
+ * plain element.
+ * Returns false on success, true when the nla_put_failure label is reached.
+ */
+static bool
+hash_ipport6_data_tlist(struct sk_buff *skb,
+			const struct hash_ipport6_elem *data)
+{
+	const struct hash_ipport6_telem *te =
+		(const struct hash_ipport6_telem *)data;
+
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &te->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, te->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, te->proto);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(te->timeout)));
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Kernel side add/delete/test for IPv6: build the element from the
+ * port/protocol (direction selected by IPSET_DIM_TWO_SRC) and the address
+ * (direction selected by IPSET_DIM_ONE_SRC) of the packet.
+ * Returns -EINVAL when the port lookup fails.
+ */
+static int
+hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	struct hash_ipport6_elem elem = { };
+
+	if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &elem.port, &elem.proto))
+		return -EINVAL;
+
+	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &elem.ip.in6);
+
+	return set->variant->adt[adt](set, &elem, map->timeout);
+}
+
+/* Add, delete or test IPv6 address,port elements on behalf of userspace.
+ *
+ * IPSET_ATTR_IP, IPSET_ATTR_PORT and IPSET_ATTR_PROTO are mandatory, while
+ * IPSET_ATTR_IP_TO and IPSET_ATTR_CIDR are refused. The port is forced to
+ * zero for protocols other than TCP, UDP and ICMPv6. A port range is
+ * expanded for add and delete of TCP or UDP elements only.
+ * Returns 0 on success and a negative error code otherwise.
+ */
+static int
+hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipport6_elem data = { };
+	u32 timeout = map->timeout;
+	u32 port, port_to;
+	bool tcpudp;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     tb[IPSET_ATTR_IP_TO] ||
+		     tb[IPSET_ATTR_CIDR]))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	if (!tb[IPSET_ATTR_PORT])
+		return -IPSET_ERR_PROTOCOL;
+	data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+
+	if (!tb[IPSET_ATTR_PROTO])
+		return -IPSET_ERR_MISSING_PROTO;
+	data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+	if (data.proto == 0)
+		return -IPSET_ERR_INVALID_PROTO;
+
+	/* Only TCP, UDP and ICMPv6 elements keep the port value */
+	if (data.proto != IPPROTO_UDP &&
+	    data.proto != IPPROTO_TCP &&
+	    data.proto != IPPROTO_ICMPV6)
+		data.port = 0;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(map->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	tcpudp = data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP;
+
+	/* Single element: test, non TCP/UDP protocol or no port range */
+	if (adt == IPSET_TEST || !tcpudp || !tb[IPSET_ATTR_PORT_TO]) {
+		ret = adtfn(set, &data, timeout);
+		if (ip_set_eexist(ret, flags))
+			return 0;
+		return ret;
+	}
+
+	port = ntohs(data.port);
+	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to)
+		swap(port, port_to);
+
+	while (port <= port_to) {
+		data.port = htons(port);
+		ret = adtfn(set, &data, timeout);
+
+		/* ip_set_eexist() decides whether this error is ignored */
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		port++;
+	}
+
+	return 0;
+}
+
+/* Create hash:ip,port type of sets */
+
+/* Create a hash:ip,port type of set.
+ *
+ * Optional attributes: IPSET_ATTR_HASHSIZE (raised to the minimal hash
+ * size when smaller), IPSET_ATTR_MAXELEM and IPSET_ATTR_TIMEOUT. With a
+ * timeout attribute the timeout-aware variant is selected and its garbage
+ * collector initialized.
+ * Returns 0 on success and a negative error code otherwise.
+ */
+static int
+hash_ipport_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+	struct ip_set_hash *h;
+	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+	u8 hbits;
+
+	if (!(set->family == AF_INET || set->family == AF_INET6))
+		return -IPSET_ERR_INVALID_FAMILY;
+
+	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_HASHSIZE]) {
+		hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+		if (hashsize < IPSET_MIMINAL_HASHSIZE)
+			hashsize = IPSET_MIMINAL_HASHSIZE;
+	}
+
+	if (tb[IPSET_ATTR_MAXELEM])
+		maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+	hbits = htable_bits(hashsize);
+	/* The number of buckets must fit into the u32 used by jhash and the
+	 * table size in bytes must fit into size_t, otherwise the size
+	 * computed for the allocation below would overflow.
+	 */
+	if (hbits > 31 ||
+	    jhash_size(hbits) >
+	    (((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket))
+		return -ENOMEM;
+
+	h = kzalloc(sizeof(*h), GFP_KERNEL);
+	if (!h)
+		return -ENOMEM;
+
+	h->maxelem = maxelem;
+	get_random_bytes(&h->initval, sizeof(h->initval));
+	h->timeout = IPSET_NO_TIMEOUT;
+
+	h->table = ip_set_alloc(
+			sizeof(struct htable)
+			+ jhash_size(hbits) * sizeof(struct hbucket));
+	if (!h->table) {
+		kfree(h);
+		return -ENOMEM;
+	}
+	h->table->htable_bits = hbits;
+
+	set->data = h;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+		set->variant = set->family == AF_INET
+			? &hash_ipport4_tvariant : &hash_ipport6_tvariant;
+
+		if (set->family == AF_INET)
+			hash_ipport4_gc_init(set);
+		else
+			hash_ipport6_gc_init(set);
+	} else {
+		set->variant = set->family == AF_INET
+			? &hash_ipport4_variant : &hash_ipport6_variant;
+	}
+
+	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+		 set->name, jhash_size(h->table->htable_bits),
+		 h->table->htable_bits, h->maxelem, set->data, h->table);
+
+	return 0;
+}
+
+static struct ip_set_type hash_ipport_type __read_mostly = {
+ .name = "hash:ip,port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_TWO,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = hash_ipport_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipport_init(void)
+{
+ return ip_set_type_register(&hash_ipport_type);
+}
+
+static void __exit
+hash_ipport_fini(void)
+{
+ ip_set_type_unregister(&hash_ipport_type);
+}
+
+module_init(hash_ipport_init);
+module_exit(hash_ipport_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
new file mode 100644
index 00000000000..22e23abb86c
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -0,0 +1,562 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip,port,ip type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip,port,ip");
+
+/* Type specific function prefix */
+#define TYPE hash_ipportip
+
+static bool
+hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ipportip4_same_set hash_ipportip_same_set
+#define hash_ipportip6_same_set hash_ipportip_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_ipportip4_elem {
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+/* Member elements with timeout support */
+struct hash_ipportip4_telem {
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout;
+};
+
+/* Elements are equal when both addresses, the port and the protocol
+ * all match.
+ */
+static inline bool
+hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
+			  const struct hash_ipportip4_elem *ip2)
+{
+	if (ip1->ip != ip2->ip)
+		return false;
+	if (ip1->ip2 != ip2->ip2)
+		return false;
+	if (ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+static inline bool
+hash_ipportip4_data_isnull(const struct hash_ipportip4_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_ipportip4_data_copy(struct hash_ipportip4_elem *dst,
+ const struct hash_ipportip4_elem *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+static inline void
+hash_ipportip4_data_zero_out(struct hash_ipportip4_elem *elem)
+{
+ elem->proto = 0;
+}
+
+/* Put both addresses, the port and the protocol of an element into a
+ * netlink message.
+ * Returns false on success, true when the nla_put_failure label is reached.
+ */
+static bool
+hash_ipportip4_data_list(struct sk_buff *skb,
+			 const struct hash_ipportip4_elem *data)
+{
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+/* Put both addresses, the port, the protocol and the timeout value of a
+ * timeout-aware element into a netlink message; the element is viewed as
+ * struct hash_ipportip4_telem, which starts with the same members as the
+ * plain element.
+ * Returns false on success, true when the nla_put_failure label is reached.
+ */
+static bool
+hash_ipportip4_data_tlist(struct sk_buff *skb,
+			  const struct hash_ipportip4_elem *data)
+{
+	const struct hash_ipportip4_telem *te =
+		(const struct hash_ipportip4_telem *)data;
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, te->ip);
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, te->ip2);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, te->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, te->proto);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(te->timeout)));
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Kernel side add/delete/test: build the element from the port/protocol
+ * (direction selected by IPSET_DIM_TWO_SRC) and two addresses of the
+ * packet (directions selected by IPSET_DIM_ONE_SRC and
+ * IPSET_DIM_THREE_SRC).
+ * Returns -EINVAL when the port lookup fails.
+ */
+static int
+hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		    enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	struct hash_ipportip4_elem elem = { };
+
+	if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &elem.port, &elem.proto))
+		return -EINVAL;
+
+	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &elem.ip);
+	ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &elem.ip2);
+
+	return set->variant->adt[adt](set, &elem, map->timeout);
+}
+
+/* Add, delete or test IPv4 address,port,address elements on behalf of
+ * userspace.
+ *
+ * IPSET_ATTR_IP, IPSET_ATTR_IP2, IPSET_ATTR_PORT and IPSET_ATTR_PROTO are
+ * mandatory. The port is forced to zero for protocols other than TCP, UDP
+ * and ICMP. Ranges of the first address and of the port are expanded for
+ * add and delete of TCP or UDP elements only; everything else is handled
+ * as a single element.
+ * Returns 0 on success and a negative error code otherwise.
+ */
+static int
+hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
+		    enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipportip4_elem data = { };
+	u32 timeout = map->timeout;
+	u32 ip, ip_to, p, port, port_to;
+	bool tcpudp, ranged;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2);
+	if (ret)
+		return ret;
+
+	if (!tb[IPSET_ATTR_PORT])
+		return -IPSET_ERR_PROTOCOL;
+	data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+
+	if (!tb[IPSET_ATTR_PROTO])
+		return -IPSET_ERR_MISSING_PROTO;
+	data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+	if (data.proto == 0)
+		return -IPSET_ERR_INVALID_PROTO;
+
+	/* Only TCP, UDP and ICMP elements keep the port value */
+	if (data.proto != IPPROTO_UDP &&
+	    data.proto != IPPROTO_TCP &&
+	    data.proto != IPPROTO_ICMP)
+		data.port = 0;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(map->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	tcpudp = data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP;
+	ranged = tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+		 tb[IPSET_ATTR_PORT_TO];
+
+	/* Single element: test, non TCP/UDP protocol or no range at all */
+	if (adt == IPSET_TEST || !tcpudp || !ranged) {
+		ret = adtfn(set, &data, timeout);
+		if (ip_set_eexist(ret, flags))
+			return 0;
+		return ret;
+	}
+
+	ip = ntohl(data.ip);
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip &= ip_set_hostmask(cidr);
+		ip_to = ip | ~ip_set_hostmask(cidr);
+	} else {
+		ip_to = ip;
+	}
+
+	port = ntohs(data.port);
+	port_to = port;
+	if (tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port > port_to)
+			swap(port, port_to);
+	}
+
+	for (; !before(ip_to, ip); ip++) {
+		for (p = port; p <= port_to; p++) {
+			data.ip = htonl(ip);
+			data.port = htons(p);
+			ret = adtfn(set, &data, timeout);
+
+			/* ip_set_eexist() decides whether to ignore it */
+			if (ret && !ip_set_eexist(ret, flags))
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/* Two hash:ip,port,ip sets are considered the same when maxelem and
+ * timeout agree.
+ */
+static bool
+hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *x = a->data;
+	const struct ip_set_hash *y = b->data;
+
+	/* Resizing changes htable_bits, so we ignore it */
+	if (x->maxelem != y->maxelem)
+		return false;
+
+	return x->timeout == y->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+/* Member elements without timeout: two IPv6 addresses, a port and a
+ * protocol number.
+ */
+struct hash_ipportip6_elem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 proto; /* 0 is what hash_ipportip6_data_isnull() treats as "null" */
+ u8 padding;
+};
+
+/* Member elements with timeout support.  hash_ipportip6_data_tlist() casts
+ * a hash_ipportip6_elem pointer to this type, so the leading fields mirror
+ * that structure.
+ */
+struct hash_ipportip6_telem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout;
+};
+
+/* Two elements match when both addresses, the port and the protocol agree */
+static inline bool
+hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1,
+			  const struct hash_ipportip6_elem *ip2)
+{
+	if (ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) != 0)
+		return false;
+	if (ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) != 0)
+		return false;
+	if (ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+/* An element whose protocol field is zero counts as null */
+static inline bool
+hash_ipportip6_data_isnull(const struct hash_ipportip6_elem *elem)
+{
+	return !elem->proto;
+}
+
+/* Byte-wise copy of one non-timeout element into another */
+static inline void
+hash_ipportip6_data_copy(struct hash_ipportip6_elem *dst,
+			 const struct hash_ipportip6_elem *src)
+{
+	memcpy(dst, src, sizeof(struct hash_ipportip6_elem));
+}
+
+/* Mark an element as null by clearing the protocol field, which is the
+ * field hash_ipportip6_data_isnull() inspects.
+ */
+static inline void
+hash_ipportip6_data_zero_out(struct hash_ipportip6_elem *elem)
+{
+ elem->proto = 0;
+}
+
+/* Emit one element (without timeout) as netlink attributes into skb.
+ * Returns 0 when every attribute was written and 1 via the
+ * nla_put_failure label otherwise (the NLA_PUT_* macros are expected to
+ * jump there on failure).
+ */
+static bool
+hash_ipportip6_data_list(struct sk_buff *skb,
+ const struct hash_ipportip6_elem *data)
+{
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+/* Emit one element with its timeout as netlink attributes into skb.
+ * Returns 0 on success and 1 via the nla_put_failure label otherwise.
+ */
+static bool
+hash_ipportip6_data_tlist(struct sk_buff *skb,
+ const struct hash_ipportip6_elem *data)
+{
+ /* View the element as the timeout variant to reach ->timeout */
+ const struct hash_ipportip6_telem *e =
+ (const struct hash_ipportip6_telem *)data;
+
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ /* The value from ip_set_timeout_get() is sent in network byte order */
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(e->timeout)));
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Build an element from a packet and hand it to the add/del/test routine
+ * selected by adt, using the set's default timeout.  Returns -EINVAL when
+ * ip_set_get_ip6_port() cannot supply port and protocol.
+ */
+static int
+hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		    enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	struct hash_ipportip6_elem elem = { };
+
+	if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &elem.port, &elem.proto))
+		return -EINVAL;
+
+	/* The first and third dimension flags pick the two addresses */
+	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &elem.ip.in6);
+	ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &elem.ip2.in6);
+
+	return set->variant->adt[adt](set, &elem, map->timeout);
+}
+
+/* Add, delete or test element(s) described by netlink attributes (IPv6).
+ *
+ * IPSET_ATTR_IP, IPSET_ATTR_IP2, IPSET_ATTR_PORT and IPSET_ATTR_PROTO are
+ * required; IPSET_ATTR_IP_TO and IPSET_ATTR_CIDR are rejected.  For TCP/UDP
+ * add/delete requests carrying IPSET_ATTR_PORT_TO, adtfn is called once for
+ * every port of the range.
+ *
+ * Returns the adtfn result (with errors accepted by ip_set_eexist() mapped
+ * to 0) or a negative IPSET_ERR_* code / helper error on invalid input.
+ */
+static int
+hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip6_elem data = { };
+ u32 port, port_to;
+ u32 timeout = h->timeout;
+ int ret;
+
+ /* Mandatory attributes present, address ranges not allowed here */
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+
+ /* proto 0 is what data_isnull() treats as a null element */
+ if (data.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ /* The port is kept only for UDP, TCP and ICMPv6; otherwise zeroed */
+ switch (data.proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_ICMPV6:
+ break;
+ default:
+ data.port = 0;
+ break;
+ }
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ /* Single-element path: tests, non-TCP/UDP protocols, no port range */
+ if (adt == IPSET_TEST ||
+ !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+ !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &data, timeout);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ /* Port range: normalise so that port <= port_to, then walk it */
+ port = ntohs(data.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ for (; port <= port_to; port++) {
+ data.port = htons(port);
+ ret = adtfn(set, &data, timeout);
+
+ /* Stop on the first error that ip_set_eexist() does not accept */
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+/* Create hash:ip,port,ip type of sets */
+
+/* Allocate and initialise the private data of a new hash:ip,port,ip set.
+ *
+ * Optional attributes: IPSET_ATTR_HASHSIZE (raised to
+ * IPSET_MIMINAL_HASHSIZE if smaller), IPSET_ATTR_MAXELEM and
+ * IPSET_ATTR_TIMEOUT.  The presence of a timeout selects the timeout
+ * variant and starts its gc via the *_gc_init() helper.
+ *
+ * Returns 0 on success, -IPSET_ERR_INVALID_FAMILY for a family other than
+ * AF_INET/AF_INET6, -IPSET_ERR_PROTOCOL when an optional attribute fails
+ * ip_set_optattr_netorder(), or -ENOMEM when an allocation fails.
+ */
+static int
+hash_ipportip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ struct ip_set_hash *h;
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+ u8 hbits;
+
+ if (!(set->family == AF_INET || set->family == AF_INET6))
+ return -IPSET_ERR_INVALID_FAMILY;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+ h = kzalloc(sizeof(*h), GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->timeout = IPSET_NO_TIMEOUT;
+
+ /* Table header followed by jhash_size(hbits) buckets */
+ hbits = htable_bits(hashsize);
+ h->table = ip_set_alloc(
+ sizeof(struct htable)
+ + jhash_size(hbits) * sizeof(struct hbucket));
+ if (!h->table) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table->htable_bits = hbits;
+
+ set->data = h;
+
+ /* Pick the variant by address family and by presence of a timeout */
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+ set->variant = set->family == AF_INET
+ ? &hash_ipportip4_tvariant : &hash_ipportip6_tvariant;
+
+ if (set->family == AF_INET)
+ hash_ipportip4_gc_init(set);
+ else
+ hash_ipportip6_gc_init(set);
+ } else {
+ set->variant = set->family == AF_INET
+ ? &hash_ipportip4_variant : &hash_ipportip6_variant;
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(h->table->htable_bits),
+ h->table->htable_bits, h->maxelem, set->data, h->table);
+
+ return 0;
+}
+
+/* Type descriptor registered with the ipset core by hash_ipportip_init():
+ * name, dimension, create callback and the netlink attribute policies for
+ * set creation (create_policy) and element add/del/test (adt_policy).
+ */
+static struct ip_set_type hash_ipportip_type __read_mostly = {
+ .name = "hash:ip,port,ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
+ .dimension = IPSET_DIM_THREE,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = hash_ipportip_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module entry point: register the hash:ip,port,ip set type */
+static int __init
+hash_ipportip_init(void)
+{
+	int err;
+
+	err = ip_set_type_register(&hash_ipportip_type);
+	return err;
+}
+
+/* Module exit point: unregister the hash:ip,port,ip set type */
+static void __exit
+hash_ipportip_fini(void)
+{
+ ip_set_type_unregister(&hash_ipportip_type);
+}
+
+module_init(hash_ipportip_init);
+module_exit(hash_ipportip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
new file mode 100644
index 00000000000..6033e8b54bb
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -0,0 +1,628 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip,port,net type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip,port,net");
+
+/* Type specific function prefix */
+#define TYPE hash_ipportnet
+
+static bool
+hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ipportnet4_same_set hash_ipportnet_same_set
+#define hash_ipportnet6_same_set hash_ipportnet_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout: IPv4 address, port, protocol and a
+ * second IPv4 address with its prefix length (cidr).
+ */
+struct hash_ipportnet4_elem {
+ __be32 ip;
+ __be32 ip2; /* masked with ip_set_netmask(cidr) by the adt paths */
+ __be16 port;
+ u8 cidr; /* prefix length belonging to ip2 */
+ u8 proto; /* 0 is what hash_ipportnet4_data_isnull() treats as "null" */
+};
+
+/* Member elements with timeout support.  hash_ipportnet4_data_tlist()
+ * casts a hash_ipportnet4_elem pointer to this type, so the leading fields
+ * mirror that structure.
+ */
+struct hash_ipportnet4_telem {
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 cidr;
+ u8 proto;
+ unsigned long timeout;
+};
+
+/* Elements match when address, network, prefix length, port and protocol
+ * are all identical.
+ */
+static inline bool
+hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
+			   const struct hash_ipportnet4_elem *ip2)
+{
+	if (ip1->ip != ip2->ip || ip1->ip2 != ip2->ip2)
+		return false;
+	if (ip1->cidr != ip2->cidr || ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+/* An element whose protocol field is zero counts as null */
+static inline bool
+hash_ipportnet4_data_isnull(const struct hash_ipportnet4_elem *elem)
+{
+	return !elem->proto;
+}
+
+/* Byte-wise copy of one non-timeout element into another */
+static inline void
+hash_ipportnet4_data_copy(struct hash_ipportnet4_elem *dst,
+			  const struct hash_ipportnet4_elem *src)
+{
+	memcpy(dst, src, sizeof(struct hash_ipportnet4_elem));
+}
+
+/* Record the prefix length and mask the network address accordingly */
+static inline void
+hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr)
+{
+	elem->cidr = cidr;
+	elem->ip2 = elem->ip2 & ip_set_netmask(cidr);
+}
+
+/* Mark an element as null by clearing the protocol field, which is the
+ * field hash_ipportnet4_data_isnull() inspects.
+ */
+static inline void
+hash_ipportnet4_data_zero_out(struct hash_ipportnet4_elem *elem)
+{
+ elem->proto = 0;
+}
+
+/* Emit one element (without timeout) as netlink attributes into skb.
+ * Returns 0 when every attribute was written and 1 via the
+ * nla_put_failure label otherwise (the NLA_PUT_* macros are expected to
+ * jump there on failure).
+ */
+static bool
+hash_ipportnet4_data_list(struct sk_buff *skb,
+ const struct hash_ipportnet4_elem *data)
+{
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ /* The prefix length of ip2 is reported as CIDR2 */
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+/* Emit one element with its timeout as netlink attributes into skb.
+ * Returns 0 on success and 1 via the nla_put_failure label otherwise.
+ */
+static bool
+hash_ipportnet4_data_tlist(struct sk_buff *skb,
+ const struct hash_ipportnet4_elem *data)
+{
+ /* View the element as the timeout variant to reach ->timeout */
+ const struct hash_ipportnet4_telem *tdata =
+ (const struct hash_ipportnet4_telem *)data;
+
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, tdata->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ /* The value from ip_set_timeout_get() is sent in network byte order */
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(tdata->timeout)));
+
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Build an element from a packet and hand it to the add/del/test routine
+ * selected by adt, using the set's default timeout.
+ *
+ * The prefix length defaults to h->nets[0].cidr when that is non-zero and
+ * to HOST_MASK otherwise; tests always use HOST_MASK.  Returns -EINVAL when
+ * ip_set_get_ip4_port() cannot supply port and protocol, otherwise the
+ * adtfn result.
+ */
+static int
+hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		     enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	/* A conditional, not "||": logical OR would always evaluate to 1
+	 * and silently turn every kernel-side add into a /1 network.
+	 */
+	struct hash_ipportnet4_elem data = {
+		.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK
+	};
+
+	if (data.cidr == 0)
+		return -EINVAL;
+	if (adt == IPSET_TEST)
+		data.cidr = HOST_MASK;
+
+	if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &data.port, &data.proto))
+		return -EINVAL;
+
+	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+	ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2);
+	data.ip2 &= ip_set_netmask(data.cidr);
+
+	return adtfn(set, &data, h->timeout);
+}
+
+/* Add, delete or test element(s) described by netlink attributes (IPv4).
+ *
+ * IPSET_ATTR_IP, IPSET_ATTR_IP2, IPSET_ATTR_PORT and IPSET_ATTR_PROTO are
+ * required; IPSET_ATTR_CIDR2 (default HOST_MASK) is the prefix length
+ * applied to IP2.  For TCP/UDP add/delete requests, IPSET_ATTR_IP_TO or
+ * IPSET_ATTR_CIDR expands the first address into a range and
+ * IPSET_ATTR_PORT_TO expands the port into a range; adtfn is then called
+ * once per (ip, port) pair.
+ *
+ * Returns the adtfn result (with errors accepted by ip_set_eexist() mapped
+ * to 0) or a negative IPSET_ERR_* code / helper error on invalid input.
+ */
+static int
+hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+		     enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipportnet4_elem data = { .cidr = HOST_MASK };
+	u32 ip, ip_to, p, port, port_to;
+	u32 timeout = h->timeout;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR2])
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+	/* The attribute is a raw u8: reject zero and anything longer than
+	 * an IPv4 prefix before it is handed to ip_set_netmask().
+	 */
+	if (!data.cidr || data.cidr > HOST_MASK)
+		return -IPSET_ERR_INVALID_CIDR;
+
+	data.ip2 &= ip_set_netmask(data.cidr);
+
+	if (tb[IPSET_ATTR_PORT])
+		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+
+		/* proto 0 is what data_isnull() treats as a null element */
+		if (data.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	/* The port is kept only for UDP, TCP and ICMP; otherwise zeroed */
+	switch (data.proto) {
+	case IPPROTO_UDP:
+	case IPPROTO_TCP:
+	case IPPROTO_ICMP:
+		break;
+	default:
+		data.port = 0;
+		break;
+	}
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	/* Single-element path: tests, non-TCP/UDP protocols, no ranges */
+	if (adt == IPSET_TEST ||
+	    !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+	      tb[IPSET_ATTR_PORT_TO])) {
+		ret = adtfn(set, &data, timeout);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	/* Work out the [ip, ip_to] range in host byte order */
+	ip = ntohl(data.ip);
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip &= ip_set_hostmask(cidr);
+		ip_to = ip | ~ip_set_hostmask(cidr);
+	} else
+		ip_to = ip;
+
+	/* Work out the [port, port_to] range in host byte order */
+	port = ntohs(data.port);
+	if (tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port > port_to)
+			swap(port, port_to);
+	} else
+		port_to = port;
+
+	for (; !before(ip_to, ip); ip++)
+		for (p = port; p <= port_to; p++) {
+			data.ip = htonl(ip);
+			data.port = htons(p);
+			ret = adtfn(set, &data, timeout);
+
+			/* Stop on the first error ip_set_eexist() rejects */
+			if (ret && !ip_set_eexist(ret, flags))
+				return ret;
+			else
+				ret = 0;
+		}
+	return ret;
+}
+
+/* Tell whether two hash:ip,port,net sets were created with the same
+ * parameters.  Only maxelem and timeout take part in the comparison.
+ */
+static bool
+hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *lhs = a->data;
+	const struct ip_set_hash *rhs = b->data;
+
+	/* htable_bits is deliberately skipped: a resize changes it */
+	if (lhs->maxelem != rhs->maxelem)
+		return false;
+
+	return lhs->timeout == rhs->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+/* Member elements without timeout: IPv6 address, port, protocol and a
+ * second IPv6 address with its prefix length (cidr).
+ */
+struct hash_ipportnet6_elem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2; /* masked with ip6_netmask() by the adt paths */
+ __be16 port;
+ u8 cidr; /* prefix length belonging to ip2 */
+ u8 proto; /* 0 is what hash_ipportnet6_data_isnull() treats as "null" */
+};
+
+/* Member elements with timeout support.  hash_ipportnet6_data_tlist()
+ * casts a hash_ipportnet6_elem pointer to this type, so the leading fields
+ * mirror that structure.
+ */
+struct hash_ipportnet6_telem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 cidr;
+ u8 proto;
+ unsigned long timeout;
+};
+
+/* Elements match when address, network, prefix length, port and protocol
+ * are all identical.
+ */
+static inline bool
+hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
+			   const struct hash_ipportnet6_elem *ip2)
+{
+	if (ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) != 0)
+		return false;
+	if (ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) != 0)
+		return false;
+	if (ip1->cidr != ip2->cidr || ip1->port != ip2->port)
+		return false;
+
+	return ip1->proto == ip2->proto;
+}
+
+/* An element whose protocol field is zero counts as null */
+static inline bool
+hash_ipportnet6_data_isnull(const struct hash_ipportnet6_elem *elem)
+{
+	return !elem->proto;
+}
+
+/* Byte-wise copy of one non-timeout element into another */
+static inline void
+hash_ipportnet6_data_copy(struct hash_ipportnet6_elem *dst,
+			  const struct hash_ipportnet6_elem *src)
+{
+	memcpy(dst, src, sizeof(struct hash_ipportnet6_elem));
+}
+
+/* Mark an element as null by clearing the protocol field, which is the
+ * field hash_ipportnet6_data_isnull() inspects.
+ */
+static inline void
+hash_ipportnet6_data_zero_out(struct hash_ipportnet6_elem *elem)
+{
+ elem->proto = 0;
+}
+
+/* Apply the netmask for the given prefix length to all four 32-bit words
+ * of an IPv6 address, in place.
+ */
+static inline void
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+	unsigned int word;
+
+	for (word = 0; word < 4; word++)
+		ip->ip6[word] &= ip_set_netmask6(prefix)[word];
+}
+
+/* Record the prefix length and mask the network address accordingly */
+static inline void
+hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr)
+{
+	elem->cidr = cidr;
+	ip6_netmask(&elem->ip2, cidr);
+}
+
+/* Emit one element (without timeout) as netlink attributes into skb.
+ * Returns 0 when every attribute was written and 1 via the
+ * nla_put_failure label otherwise (the NLA_PUT_* macros are expected to
+ * jump there on failure).
+ */
+static bool
+hash_ipportnet6_data_list(struct sk_buff *skb,
+ const struct hash_ipportnet6_elem *data)
+{
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ /* The prefix length of ip2 is reported as CIDR2 */
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+/* Emit one element with its timeout as netlink attributes into skb.
+ * Returns 0 on success and 1 via the nla_put_failure label otherwise.
+ */
+static bool
+hash_ipportnet6_data_tlist(struct sk_buff *skb,
+ const struct hash_ipportnet6_elem *data)
+{
+ /* View the element as the timeout variant to reach ->timeout */
+ const struct hash_ipportnet6_telem *e =
+ (const struct hash_ipportnet6_telem *)data;
+
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ /* The value from ip_set_timeout_get() is sent in network byte order */
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(e->timeout)));
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Build an element from a packet and hand it to the add/del/test routine
+ * selected by adt, using the set's default timeout.
+ *
+ * The prefix length defaults to h->nets[0].cidr when that is non-zero and
+ * to HOST_MASK otherwise; tests always use HOST_MASK.  Returns -EINVAL when
+ * ip_set_get_ip6_port() cannot supply port and protocol, otherwise the
+ * adtfn result.
+ */
+static int
+hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		     enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	/* A conditional, not "||": logical OR would always evaluate to 1
+	 * and silently turn every kernel-side add into a /1 network.
+	 */
+	struct hash_ipportnet6_elem data = {
+		.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK
+	};
+
+	if (data.cidr == 0)
+		return -EINVAL;
+	if (adt == IPSET_TEST)
+		data.cidr = HOST_MASK;
+
+	if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &data.port, &data.proto))
+		return -EINVAL;
+
+	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+	ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
+	ip6_netmask(&data.ip2, data.cidr);
+
+	return adtfn(set, &data, h->timeout);
+}
+
+/* Add, delete or test element(s) described by netlink attributes (IPv6).
+ *
+ * IPSET_ATTR_IP, IPSET_ATTR_IP2, IPSET_ATTR_PORT and IPSET_ATTR_PROTO are
+ * required; IPSET_ATTR_IP_TO and IPSET_ATTR_CIDR are rejected.
+ * IPSET_ATTR_CIDR2 (default HOST_MASK) is the prefix length applied to
+ * IP2.  For TCP/UDP add/delete requests carrying IPSET_ATTR_PORT_TO, adtfn
+ * is called once for every port of the range.
+ *
+ * Returns the adtfn result (with errors accepted by ip_set_eexist() mapped
+ * to 0) or a negative IPSET_ERR_* code / helper error on invalid input.
+ */
+static int
+hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+		     enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipportnet6_elem data = { .cidr = HOST_MASK };
+	u32 port, port_to;
+	u32 timeout = h->timeout;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     tb[IPSET_ATTR_IP_TO] ||
+		     tb[IPSET_ATTR_CIDR]))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR2])
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+	/* The attribute is a raw u8: reject zero and anything longer than
+	 * an IPv6 prefix before it is handed to ip6_netmask().
+	 */
+	if (!data.cidr || data.cidr > HOST_MASK)
+		return -IPSET_ERR_INVALID_CIDR;
+
+	ip6_netmask(&data.ip2, data.cidr);
+
+	if (tb[IPSET_ATTR_PORT])
+		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+
+		/* proto 0 is what data_isnull() treats as a null element */
+		if (data.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	/* The port is kept only for UDP, TCP and ICMPv6; otherwise zeroed */
+	switch (data.proto) {
+	case IPPROTO_UDP:
+	case IPPROTO_TCP:
+	case IPPROTO_ICMPV6:
+		break;
+	default:
+		data.port = 0;
+		break;
+	}
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	/* Single-element path: tests, non-TCP/UDP protocols, no port range */
+	if (adt == IPSET_TEST ||
+	    !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+	    !tb[IPSET_ATTR_PORT_TO]) {
+		ret = adtfn(set, &data, timeout);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	/* Port range: normalise so that port <= port_to, then walk it */
+	port = ntohs(data.port);
+	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to)
+		swap(port, port_to);
+
+	for (; port <= port_to; port++) {
+		data.port = htons(port);
+		ret = adtfn(set, &data, timeout);
+
+		/* Stop on the first error ip_set_eexist() does not accept */
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+/* Create hash:ip,port,net type of sets */
+
+/* Allocate and initialise the private data of a new hash:ip,port,net set.
+ *
+ * Optional attributes: IPSET_ATTR_HASHSIZE (raised to
+ * IPSET_MIMINAL_HASHSIZE if smaller), IPSET_ATTR_MAXELEM and
+ * IPSET_ATTR_TIMEOUT.  The presence of a timeout selects the timeout
+ * variant and starts its gc via the *_gc_init() helper.
+ *
+ * Returns 0 on success, -IPSET_ERR_INVALID_FAMILY for a family other than
+ * AF_INET/AF_INET6, -IPSET_ERR_PROTOCOL when an optional attribute fails
+ * ip_set_optattr_netorder(), or -ENOMEM when an allocation fails.
+ */
+static int
+hash_ipportnet_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ struct ip_set_hash *h;
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+ u8 hbits;
+
+ if (!(set->family == AF_INET || set->family == AF_INET6))
+ return -IPSET_ERR_INVALID_FAMILY;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+ /* Extra room after *h: one struct ip_set_hash_nets per possible
+ * prefix length of the family (32 for IPv4, 128 for IPv6).
+ */
+ h = kzalloc(sizeof(*h)
+ + sizeof(struct ip_set_hash_nets)
+ * (set->family == AF_INET ? 32 : 128), GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->timeout = IPSET_NO_TIMEOUT;
+
+ /* Table header followed by jhash_size(hbits) buckets */
+ hbits = htable_bits(hashsize);
+ h->table = ip_set_alloc(
+ sizeof(struct htable)
+ + jhash_size(hbits) * sizeof(struct hbucket));
+ if (!h->table) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table->htable_bits = hbits;
+
+ set->data = h;
+
+ /* Pick the variant by address family and by presence of a timeout */
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+ set->variant = set->family == AF_INET
+ ? &hash_ipportnet4_tvariant
+ : &hash_ipportnet6_tvariant;
+
+ if (set->family == AF_INET)
+ hash_ipportnet4_gc_init(set);
+ else
+ hash_ipportnet6_gc_init(set);
+ } else {
+ set->variant = set->family == AF_INET
+ ? &hash_ipportnet4_variant : &hash_ipportnet6_variant;
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(h->table->htable_bits),
+ h->table->htable_bits, h->maxelem, set->data, h->table);
+
+ return 0;
+}
+
+/* Type descriptor registered with the ipset core by hash_ipportnet_init():
+ * name, dimension, create callback and the netlink attribute policies for
+ * set creation (create_policy) and element add/del/test (adt_policy).
+ */
+static struct ip_set_type hash_ipportnet_type __read_mostly = {
+ .name = "hash:ip,port,net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
+ .dimension = IPSET_DIM_THREE,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = hash_ipportnet_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+/* Module entry point: register the hash:ip,port,net set type */
+static int __init
+hash_ipportnet_init(void)
+{
+	int err;
+
+	err = ip_set_type_register(&hash_ipportnet_type);
+	return err;
+}
+
+/* Module exit point: unregister the hash:ip,port,net set type */
+static void __exit
+hash_ipportnet_fini(void)
+{
+ ip_set_type_unregister(&hash_ipportnet_type);
+}
+
+module_init(hash_ipportnet_init);
+module_exit(hash_ipportnet_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
new file mode 100644
index 00000000000..c4db202b7da
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -0,0 +1,458 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:net type of IP sets");
+MODULE_ALIAS("ip_set_hash:net");
+
+/* Type specific function prefix */
+#define TYPE hash_net
+
+static bool
+hash_net_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_net4_same_set hash_net_same_set
+#define hash_net6_same_set hash_net_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout: an IPv4 network address and its prefix
+ * length, with explicit padding fields in between.
+ */
+struct hash_net4_elem {
+ __be32 ip;
+ u16 padding0;
+ u8 padding1;
+ u8 cidr; /* 0 is what hash_net4_data_isnull() treats as "null" */
+};
+
+/* Member elements with timeout support.  hash_net4_data_tlist() casts a
+ * hash_net4_elem pointer to this type, so the leading fields mirror that
+ * structure.
+ */
+struct hash_net4_telem {
+ __be32 ip;
+ u16 padding0;
+ u8 padding1;
+ u8 cidr;
+ unsigned long timeout;
+};
+
+/* Elements match when network address and prefix length are identical */
+static inline bool
+hash_net4_data_equal(const struct hash_net4_elem *ip1,
+		     const struct hash_net4_elem *ip2)
+{
+	if (ip1->ip != ip2->ip)
+		return false;
+
+	return ip1->cidr == ip2->cidr;
+}
+
+/* An element whose prefix length is zero counts as null */
+static inline bool
+hash_net4_data_isnull(const struct hash_net4_elem *elem)
+{
+	return !elem->cidr;
+}
+
+/* Copy the meaningful fields (address and prefix length) of an element;
+ * the padding fields of dst are left untouched.
+ */
+static inline void
+hash_net4_data_copy(struct hash_net4_elem *dst,
+		    const struct hash_net4_elem *src)
+{
+	dst->cidr = src->cidr;
+	dst->ip = src->ip;
+}
+
+/* Record the prefix length and mask the address accordingly */
+static inline void
+hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr)
+{
+	elem->cidr = cidr;
+	elem->ip = elem->ip & ip_set_netmask(cidr);
+}
+
+/* Zero CIDR values cannot be stored, so a zero prefix length is used to
+ * mark an element as null (see hash_net4_data_isnull()).
+ */
+static inline void
+hash_net4_data_zero_out(struct hash_net4_elem *elem)
+{
+ elem->cidr = 0;
+}
+
+/* Emit one element (without timeout) as netlink attributes into skb.
+ * Returns 0 when every attribute was written and 1 via the
+ * nla_put_failure label otherwise (the NLA_PUT_* macros are expected to
+ * jump there on failure).
+ */
+static bool
+hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data)
+{
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+/* Emit one element with its timeout as netlink attributes into skb.
+ * Returns 0 on success and 1 via the nla_put_failure label otherwise.
+ */
+static bool
+hash_net4_data_tlist(struct sk_buff *skb, const struct hash_net4_elem *data)
+{
+ /* View the element as the timeout variant to reach ->timeout */
+ const struct hash_net4_telem *tdata =
+ (const struct hash_net4_telem *)data;
+
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, tdata->cidr);
+ /* The value from ip_set_timeout_get() is sent in network byte order */
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(tdata->timeout)));
+
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#define IP_SET_HASH_WITH_NETS
+
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Build an element from a packet and hand it to the add/del/test routine
+ * selected by adt, using the set's default timeout.
+ *
+ * The prefix length defaults to h->nets[0].cidr when that is non-zero and
+ * to HOST_MASK otherwise; tests always use HOST_MASK.  Returns the adtfn
+ * result.
+ */
+static int
+hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
+	       enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	/* A conditional, not "||": logical OR would always evaluate to 1
+	 * and silently turn every kernel-side add into a /1 network.
+	 */
+	struct hash_net4_elem data = {
+		.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK
+	};
+
+	if (data.cidr == 0)
+		return -EINVAL;
+	if (adt == IPSET_TEST)
+		data.cidr = HOST_MASK;
+
+	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+	data.ip &= ip_set_netmask(data.cidr);
+
+	return adtfn(set, &data, h->timeout);
+}
+
+/* Add, delete or test one network described by netlink attributes (IPv4).
+ *
+ * IPSET_ATTR_IP is required; IPSET_ATTR_CIDR (default HOST_MASK) is the
+ * prefix length and IPSET_ATTR_TIMEOUT an optional per-element timeout.
+ *
+ * Returns the adtfn result (with errors accepted by ip_set_eexist() mapped
+ * to 0) or a negative IPSET_ERR_* code / helper error on invalid input.
+ */
+static int
+hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
+	       enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_net4_elem data = { .cidr = HOST_MASK };
+	u32 timeout = h->timeout;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR])
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+	/* The attribute is a raw u8: reject zero and anything longer than
+	 * an IPv4 prefix before it is handed to ip_set_netmask().
+	 */
+	if (!data.cidr || data.cidr > HOST_MASK)
+		return -IPSET_ERR_INVALID_CIDR;
+
+	data.ip &= ip_set_netmask(data.cidr);
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	ret = adtfn(set, &data, timeout);
+
+	return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+/* Tell whether two hash:net sets were created with the same parameters.
+ * Only maxelem and timeout take part in the comparison.
+ */
+static bool
+hash_net_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *lhs = a->data;
+	const struct ip_set_hash *rhs = b->data;
+
+	/* htable_bits is deliberately skipped: a resize changes it */
+	if (lhs->maxelem != rhs->maxelem)
+		return false;
+
+	return lhs->timeout == rhs->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+/* Member elements without timeout: an IPv6 network address and its prefix
+ * length, with explicit padding fields in between.
+ */
+struct hash_net6_elem {
+ union nf_inet_addr ip;
+ u16 padding0;
+ u8 padding1;
+ u8 cidr; /* 0 is what hash_net6_data_isnull() treats as "null" */
+};
+
+/* Member elements with timeout support.  hash_net6_data_tlist() casts a
+ * hash_net6_elem pointer to this type, so the leading fields mirror that
+ * structure.
+ */
+struct hash_net6_telem {
+ union nf_inet_addr ip;
+ u16 padding0;
+ u8 padding1;
+ u8 cidr;
+ unsigned long timeout;
+};
+
+/* Elements match when network address and prefix length are identical */
+static inline bool
+hash_net6_data_equal(const struct hash_net6_elem *ip1,
+		     const struct hash_net6_elem *ip2)
+{
+	if (ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) != 0)
+		return false;
+
+	return ip1->cidr == ip2->cidr;
+}
+
+/* An element whose prefix length is zero counts as null */
+static inline bool
+hash_net6_data_isnull(const struct hash_net6_elem *elem)
+{
+	return !elem->cidr;
+}
+
+/* Copy the meaningful fields (address and prefix length) of an element;
+ * the padding fields of dst are left untouched.
+ */
+static inline void
+hash_net6_data_copy(struct hash_net6_elem *dst,
+		    const struct hash_net6_elem *src)
+{
+	dst->cidr = src->cidr;
+	ipv6_addr_copy(&dst->ip.in6, &src->ip.in6);
+}
+
+/* Mark an element as null by clearing the prefix length, which is the
+ * field hash_net6_data_isnull() inspects.
+ */
+static inline void
+hash_net6_data_zero_out(struct hash_net6_elem *elem)
+{
+ elem->cidr = 0;
+}
+
+/* Apply the netmask for the given prefix length to all four 32-bit words
+ * of an IPv6 address, in place.
+ */
+static inline void
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+	unsigned int word;
+
+	for (word = 0; word < 4; word++)
+		ip->ip6[word] &= ip_set_netmask6(prefix)[word];
+}
+
+/* Record the prefix length and mask the address accordingly */
+static inline void
+hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr)
+{
+	elem->cidr = cidr;
+	ip6_netmask(&elem->ip, cidr);
+}
+
+/* Emit one element (without timeout) as netlink attributes into skb.
+ * Returns 0 when every attribute was written and 1 via the
+ * nla_put_failure label otherwise (the NLA_PUT_* macros are expected to
+ * jump there on failure).
+ */
+static bool
+hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data)
+{
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+/* Emit one element with its timeout as netlink attributes into skb.
+ * Returns 0 on success and 1 via the nla_put_failure label otherwise.
+ */
+static bool
+hash_net6_data_tlist(struct sk_buff *skb, const struct hash_net6_elem *data)
+{
+ /* View the element as the timeout variant to reach ->timeout */
+ const struct hash_net6_telem *e =
+ (const struct hash_net6_telem *)data;
+
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, e->cidr);
+ /* The value from ip_set_timeout_get() is sent in network byte order */
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(e->timeout)));
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Build an element from a packet and hand it to the add/del/test routine
+ * selected by adt, using the set's default timeout.
+ *
+ * The prefix length defaults to h->nets[0].cidr when that is non-zero and
+ * to HOST_MASK otherwise; tests always use HOST_MASK.  Returns the adtfn
+ * result.
+ */
+static int
+hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
+	       enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	/* A conditional, not "||": logical OR would always evaluate to 1
+	 * and silently turn every kernel-side add into a /1 network.
+	 */
+	struct hash_net6_elem data = {
+		.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK
+	};
+
+	if (data.cidr == 0)
+		return -EINVAL;
+	if (adt == IPSET_TEST)
+		data.cidr = HOST_MASK;
+
+	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+	ip6_netmask(&data.ip, data.cidr);
+
+	return adtfn(set, &data, h->timeout);
+}
+
+/* Add, delete or test one network described by netlink attributes (IPv6).
+ *
+ * IPSET_ATTR_IP is required; IPSET_ATTR_CIDR (default HOST_MASK) is the
+ * prefix length and IPSET_ATTR_TIMEOUT an optional per-element timeout.
+ *
+ * Returns the adtfn result (with errors accepted by ip_set_eexist() mapped
+ * to 0) or a negative IPSET_ERR_* code / helper error on invalid input.
+ */
+static int
+hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
+	       enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_net6_elem data = { .cidr = HOST_MASK };
+	u32 timeout = h->timeout;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR])
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+	/* The attribute is a raw u8: reject zero and anything longer than
+	 * an IPv6 prefix before it is handed to ip6_netmask().
+	 */
+	if (!data.cidr || data.cidr > HOST_MASK)
+		return -IPSET_ERR_INVALID_CIDR;
+
+	ip6_netmask(&data.ip, data.cidr);
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	ret = adtfn(set, &data, timeout);
+
+	return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+/* Create hash:net type of sets */
+
+/* Allocate and initialise the private data of a new hash:net set.
+ *
+ * Optional attributes: IPSET_ATTR_HASHSIZE (raised to
+ * IPSET_MIMINAL_HASHSIZE if smaller), IPSET_ATTR_MAXELEM and
+ * IPSET_ATTR_TIMEOUT.  The presence of a timeout selects the timeout
+ * variant and starts its gc via the *_gc_init() helper.
+ *
+ * Returns 0 on success, -IPSET_ERR_INVALID_FAMILY for a family other than
+ * AF_INET/AF_INET6, -IPSET_ERR_PROTOCOL when an optional attribute fails
+ * ip_set_optattr_netorder(), or -ENOMEM when an allocation fails.
+ */
+static int
+hash_net_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+ struct ip_set_hash *h;
+ u8 hbits;
+
+ if (!(set->family == AF_INET || set->family == AF_INET6))
+ return -IPSET_ERR_INVALID_FAMILY;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+ /* Extra room after *h: one struct ip_set_hash_nets per possible
+ * prefix length of the family (32 for IPv4, 128 for IPv6).
+ */
+ h = kzalloc(sizeof(*h)
+ + sizeof(struct ip_set_hash_nets)
+ * (set->family == AF_INET ? 32 : 128), GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->timeout = IPSET_NO_TIMEOUT;
+
+ /* Table header followed by jhash_size(hbits) buckets */
+ hbits = htable_bits(hashsize);
+ h->table = ip_set_alloc(
+ sizeof(struct htable)
+ + jhash_size(hbits) * sizeof(struct hbucket));
+ if (!h->table) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table->htable_bits = hbits;
+
+ set->data = h;
+
+ /* Pick the variant by address family and by presence of a timeout */
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+ set->variant = set->family == AF_INET
+ ? &hash_net4_tvariant : &hash_net6_tvariant;
+
+ if (set->family == AF_INET)
+ hash_net4_gc_init(set);
+ else
+ hash_net6_gc_init(set);
+ } else {
+ set->variant = set->family == AF_INET
+ ? &hash_net4_variant : &hash_net6_variant;
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(h->table->htable_bits),
+ h->table->htable_bits, h->maxelem, set->data, h->table);
+
+ return 0;
+}
+
+/* Type descriptor registered with the ipset core by hash_net_init():
+ * name, dimension, create callback and the netlink attribute policies for
+ * set creation (create_policy) and element add/del/test (adt_policy).
+ */
+static struct ip_set_type hash_net_type __read_mostly = {
+	.name		= "hash:net",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_IP,
+	.dimension	= IPSET_DIM_ONE,
+	.family		= AF_UNSPEC,
+	.revision	= 0,
+	.create		= hash_net_create,
+	.create_policy	= {
+		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
+		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 },
+		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 },
+		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		/* The uadt functions read LINENO with nla_get_u32(), so it
+		 * needs a policy entry like the other hash types have.
+		 */
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+/* Module entry point: register the hash:net set type */
+static int __init
+hash_net_init(void)
+{
+	int err;
+
+	err = ip_set_type_register(&hash_net_type);
+	return err;
+}
+
+/* Module exit point: unregister the hash:net set type */
+static void __exit
+hash_net_fini(void)
+{
+ ip_set_type_unregister(&hash_net_type);
+}
+
+module_init(hash_net_init);
+module_exit(hash_net_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
new file mode 100644
index 00000000000..34a165626ee
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -0,0 +1,578 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:net,port type of IP sets");
+MODULE_ALIAS("ip_set_hash:net,port");
+
+/* Type specific function prefix */
+#define TYPE hash_netport
+
+static bool
+hash_netport_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_netport4_same_set hash_netport_same_set
+#define hash_netport6_same_set hash_netport_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_netport4_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr;
+};
+
+/* Member elements with timeout support */
+struct hash_netport4_telem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
+ const struct hash_netport4_elem *ip2)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline bool
+hash_netport4_data_isnull(const struct hash_netport4_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_netport4_data_copy(struct hash_netport4_elem *dst,
+ const struct hash_netport4_elem *src)
+{
+ dst->ip = src->ip;
+ dst->port = src->port;
+ dst->proto = src->proto;
+ dst->cidr = src->cidr;
+}
+
+static inline void
+hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr)
+{
+ elem->ip &= ip_set_netmask(cidr);
+ elem->cidr = cidr;
+}
+
+static inline void
+hash_netport4_data_zero_out(struct hash_netport4_elem *elem)
+{
+ elem->proto = 0;
+}
+
+static bool
+hash_netport4_data_list(struct sk_buff *skb,
+ const struct hash_netport4_elem *data)
+{
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_netport4_data_tlist(struct sk_buff *skb,
+ const struct hash_netport4_elem *data)
+{
+ const struct hash_netport4_telem *tdata =
+ (const struct hash_netport4_telem *)data;
+
+ NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(tdata->timeout)));
+
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+#define PF 4
+#define HOST_MASK 32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Add, delete or test the address, port and protocol found in @skb */
+static int
+hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ /* Take the prefix length from h->nets[0]; HOST_MASK when it is 0 */
+ struct hash_netport4_elem data = {
+ .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK };
+
+ if (data.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ data.cidr = HOST_MASK;
+
+ if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+ &data.port, &data.proto))
+ return -EINVAL;
+ ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+ data.ip &= ip_set_netmask(data.cidr);
+ return adtfn(set, &data, h->timeout);
+}
+
+/* Add, delete or test IPv4 elements described by netlink attributes */
+static int
+hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport4_elem data = { .cidr = HOST_MASK };
+ u32 port, port_to;
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+ /* The prefix length must fit the address width: 1..HOST_MASK */
+ if (tb[IPSET_ATTR_CIDR])
+ data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!data.cidr || data.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ data.ip &= ip_set_netmask(data.cidr);
+
+ if (tb[IPSET_ATTR_PORT])
+ data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ if (data.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+ /* Only TCP, UDP and ICMP keep the port value; others store port 0 */
+ switch (data.proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_ICMP:
+ break;
+ default:
+ data.port = 0;
+ break;
+ }
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+ /* No range: a test, a protocol other than TCP/UDP, or no PORT_TO */
+ if (adt == IPSET_TEST ||
+ !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+ !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &data, timeout);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+ /* Apply the operation to every port of the (ordered) range */
+ port = ntohs(data.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ for (; port <= port_to; port++) {
+ data.port = htons(port);
+ ret = adtfn(set, &data, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static bool
+hash_netport_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct ip_set_hash *x = a->data;
+ const struct ip_set_hash *y = b->data;
+
+ /* Resizing changes htable_bits, so we ignore it */
+ return x->maxelem == y->maxelem &&
+ x->timeout == y->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_netport6_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr;
+};
+
+struct hash_netport6_telem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr;
+ unsigned long timeout;
+};
+
+static inline bool
+hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
+ const struct hash_netport6_elem *ip2)
+{
+ return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline bool
+hash_netport6_data_isnull(const struct hash_netport6_elem *elem)
+{
+ return elem->proto == 0;
+}
+
+static inline void
+hash_netport6_data_copy(struct hash_netport6_elem *dst,
+ const struct hash_netport6_elem *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+static inline void
+hash_netport6_data_zero_out(struct hash_netport6_elem *elem)
+{
+ elem->proto = 0;
+}
+
+static inline void
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+ ip->ip6[0] &= ip_set_netmask6(prefix)[0];
+ ip->ip6[1] &= ip_set_netmask6(prefix)[1];
+ ip->ip6[2] &= ip_set_netmask6(prefix)[2];
+ ip->ip6[3] &= ip_set_netmask6(prefix)[3];
+}
+
+static inline void
+hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip, cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_netport6_data_list(struct sk_buff *skb,
+ const struct hash_netport6_elem *data)
+{
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static bool
+hash_netport6_data_tlist(struct sk_buff *skb,
+ const struct hash_netport6_elem *data)
+{
+ const struct hash_netport6_telem *e =
+ (const struct hash_netport6_telem *)data;
+
+ NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+ NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+ NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+ NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(e->timeout)));
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF 6
+#define HOST_MASK 128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Add, delete or test the address, port and protocol found in @skb */
+static int
+hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ /* Take the prefix length from h->nets[0]; HOST_MASK when it is 0 */
+ struct hash_netport6_elem data = {
+ .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK };
+
+ if (data.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ data.cidr = HOST_MASK;
+
+ if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+ &data.port, &data.proto))
+ return -EINVAL;
+ ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+ ip6_netmask(&data.ip, data.cidr);
+ return adtfn(set, &data, h->timeout);
+}
+
+/* Add, delete or test IPv6 elements described by netlink attributes */
+static int
+hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ const struct ip_set_hash *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport6_elem data = { .cidr = HOST_MASK };
+ u32 port, port_to;
+ u32 timeout = h->timeout;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ if (ret)
+ return ret;
+ /* The prefix length must fit the address width: 1..HOST_MASK */
+ if (tb[IPSET_ATTR_CIDR])
+ data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!data.cidr || data.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip6_netmask(&data.ip, data.cidr);
+
+ if (tb[IPSET_ATTR_PORT])
+ data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ if (data.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+ /* Only TCP, UDP and ICMPv6 keep the port value; others store port 0 */
+ switch (data.proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_ICMPV6:
+ break;
+ default:
+ data.port = 0;
+ break;
+ }
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout(h->timeout))
+ return -IPSET_ERR_TIMEOUT;
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+ /* No range: a test, a protocol other than TCP/UDP, or no PORT_TO */
+ if (adt == IPSET_TEST ||
+ !(data.proto == IPPROTO_TCP || data.proto == IPPROTO_UDP) ||
+ !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &data, timeout);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+ /* Apply the operation to every port of the (ordered) range */
+ port = ntohs(data.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ for (; port <= port_to; port++) {
+ data.port = htons(port);
+ ret = adtfn(set, &data, timeout);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+/* Create hash:net,port type of sets */
+
+static int
+hash_netport_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ struct ip_set_hash *h;
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+ u8 hbits;
+
+ if (!(set->family == AF_INET || set->family == AF_INET6))
+ return -IPSET_ERR_INVALID_FAMILY;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+ h = kzalloc(sizeof(*h)
+ + sizeof(struct ip_set_hash_nets)
+ * (set->family == AF_INET ? 32 : 128), GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->timeout = IPSET_NO_TIMEOUT;
+
+ hbits = htable_bits(hashsize);
+ h->table = ip_set_alloc(
+ sizeof(struct htable)
+ + jhash_size(hbits) * sizeof(struct hbucket));
+ if (!h->table) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table->htable_bits = hbits;
+
+ set->data = h;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+ set->variant = set->family == AF_INET
+ ? &hash_netport4_tvariant : &hash_netport6_tvariant;
+
+ if (set->family == AF_INET)
+ hash_netport4_gc_init(set);
+ else
+ hash_netport6_gc_init(set);
+ } else {
+ set->variant = set->family == AF_INET
+ ? &hash_netport4_variant : &hash_netport6_variant;
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(h->table->htable_bits),
+ h->table->htable_bits, h->maxelem, set->data, h->table);
+
+ return 0;
+}
+
+static struct ip_set_type hash_netport_type __read_mostly = {
+ .name = "hash:net,port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_TWO,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = hash_netport_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_netport_init(void)
+{
+ return ip_set_type_register(&hash_netport_type);
+}
+
+static void __exit
+hash_netport_fini(void)
+{
+ ip_set_type_unregister(&hash_netport_type);
+}
+
+module_init(hash_netport_init);
+module_exit(hash_netport_fini);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
new file mode 100644
index 00000000000..a47c32982f0
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -0,0 +1,584 @@
+/* Copyright (C) 2008-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the list:set type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_list.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("list:set type of IP sets");
+MODULE_ALIAS("ip_set_list:set");
+
+/* Member elements without and with timeout */
+struct set_elem {
+ ip_set_id_t id;
+};
+
+struct set_telem {
+ ip_set_id_t id;
+ unsigned long timeout;
+};
+
+/* Type structure */
+struct list_set {
+ size_t dsize; /* element size */
+ u32 size; /* size of set list array */
+ u32 timeout; /* timeout value */
+ struct timer_list gc; /* garbage collection */
+ struct set_elem members[0]; /* the set members */
+};
+
+static inline struct set_elem *
+list_set_elem(const struct list_set *map, u32 id)
+{
+ return (struct set_elem *)((char *)map->members + id * map->dsize);
+}
+
+static inline bool
+list_set_timeout(const struct list_set *map, u32 id)
+{
+ const struct set_telem *elem =
+ (const struct set_telem *) list_set_elem(map, id);
+
+ return ip_set_timeout_test(elem->timeout);
+}
+
+static inline bool
+list_set_expired(const struct list_set *map, u32 id)
+{
+ const struct set_telem *elem =
+ (const struct set_telem *) list_set_elem(map, id);
+
+ return ip_set_timeout_expired(elem->timeout);
+}
+
+static inline int
+list_set_exist(const struct set_telem *elem)
+{
+ return elem->id != IPSET_INVALID_ID &&
+ !ip_set_timeout_expired(elem->timeout);
+}
+
+/* Set list without and with timeout */
+
+static int
+list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+ struct list_set *map = set->data;
+ struct set_elem *elem;
+ u32 i;
+ int ret;
+
+ for (i = 0; i < map->size; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id == IPSET_INVALID_ID)
+ return 0;
+ if (with_timeout(map->timeout) && list_set_expired(map, i))
+ continue;
+ switch (adt) {
+ case IPSET_TEST:
+ ret = ip_set_test(elem->id, skb, pf, dim, flags);
+ if (ret > 0)
+ return ret;
+ break;
+ case IPSET_ADD:
+ ret = ip_set_add(elem->id, skb, pf, dim, flags);
+ if (ret == 0)
+ return ret;
+ break;
+ case IPSET_DEL:
+ ret = ip_set_del(elem->id, skb, pf, dim, flags);
+ if (ret == 0)
+ return ret;
+ break;
+ default:
+ break;
+ }
+ }
+ return -EINVAL;
+}
+
+static bool
+next_id_eq(const struct list_set *map, u32 i, ip_set_id_t id)
+{
+ const struct set_elem *elem;
+
+ if (i + 1 < map->size) {
+ elem = list_set_elem(map, i + 1);
+ return !!(elem->id == id &&
+ !(with_timeout(map->timeout) &&
+ list_set_expired(map, i + 1)));
+ }
+
+ return 0;
+}
+
+static void
+list_elem_add(struct list_set *map, u32 i, ip_set_id_t id)
+{
+ struct set_elem *e;
+
+ for (; i < map->size; i++) {
+ e = list_set_elem(map, i);
+ swap(e->id, id);
+ if (e->id == IPSET_INVALID_ID)
+ break;
+ }
+}
+
+static void
+list_elem_tadd(struct list_set *map, u32 i, ip_set_id_t id,
+ unsigned long timeout)
+{
+ struct set_telem *e;
+
+ for (; i < map->size; i++) {
+ e = (struct set_telem *)list_set_elem(map, i);
+ swap(e->id, id);
+ if (e->id == IPSET_INVALID_ID)
+ break;
+ swap(e->timeout, timeout);
+ }
+}
+
+static int
+list_set_add(struct list_set *map, u32 i, ip_set_id_t id,
+ unsigned long timeout)
+{
+ const struct set_elem *e = list_set_elem(map, i);
+ const struct set_elem *last = list_set_elem(map, map->size - 1);
+
+ /* Used slot in a full list: the shift pushes the last member off */
+ if (e->id != IPSET_INVALID_ID && last->id != IPSET_INVALID_ID)
+ ip_set_put_byindex(last->id);
+ if (with_timeout(map->timeout))
+ list_elem_tadd(map, i, id, ip_set_timeout_set(timeout));
+ else
+ list_elem_add(map, i, id);
+ return 0;
+}
+
+static int
+list_set_del(struct list_set *map, ip_set_id_t id, u32 i)
+{
+ struct set_elem *a = list_set_elem(map, i), *b;
+
+ ip_set_put_byindex(id);
+
+ for (; i < map->size - 1; i++) {
+ b = list_set_elem(map, i + 1);
+ a->id = b->id;
+ if (with_timeout(map->timeout))
+ ((struct set_telem *)a)->timeout =
+ ((struct set_telem *)b)->timeout;
+ a = b;
+ if (a->id == IPSET_INVALID_ID)
+ break;
+ }
+ /* Last element */
+ a->id = IPSET_INVALID_ID;
+ return 0;
+}
+
+static int
+list_set_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+ struct list_set *map = set->data;
+ bool with_timeout = with_timeout(map->timeout);
+ int before = 0;
+ u32 timeout = map->timeout;
+ ip_set_id_t id, refid = IPSET_INVALID_ID;
+ const struct set_elem *elem;
+ struct ip_set *s;
+ u32 i;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_NAME] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s);
+ if (id == IPSET_INVALID_ID)
+ return -IPSET_ERR_NAME;
+ /* "Loop detection" */
+ if (s->type->features & IPSET_TYPE_NAME) {
+ ret = -IPSET_ERR_LOOP;
+ goto finish;
+ }
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ before = f & IPSET_FLAG_BEFORE;
+ }
+
+ if (before && !tb[IPSET_ATTR_NAMEREF]) {
+ ret = -IPSET_ERR_BEFORE;
+ goto finish;
+ }
+
+ if (tb[IPSET_ATTR_NAMEREF]) {
+ refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]),
+ &s);
+ if (refid == IPSET_INVALID_ID) {
+ ret = -IPSET_ERR_NAMEREF;
+ goto finish;
+ }
+ if (!before)
+ before = -1;
+ }
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!with_timeout) {
+ ret = -IPSET_ERR_TIMEOUT;
+ goto finish;
+ }
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+
+ switch (adt) {
+ case IPSET_TEST:
+ for (i = 0; i < map->size && !ret; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id == IPSET_INVALID_ID ||
+ (before != 0 && i + 1 >= map->size))
+ break;
+ else if (with_timeout && list_set_expired(map, i))
+ continue;
+ else if (before > 0 && elem->id == id)
+ ret = next_id_eq(map, i, refid);
+ else if (before < 0 && elem->id == refid)
+ ret = next_id_eq(map, i, id);
+ else if (before == 0 && elem->id == id)
+ ret = 1;
+ }
+ break;
+ case IPSET_ADD:
+ for (i = 0; i < map->size && !ret; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id == id &&
+ !(with_timeout && list_set_expired(map, i)))
+ ret = -IPSET_ERR_EXIST;
+ }
+ if (ret == -IPSET_ERR_EXIST)
+ break;
+ ret = -IPSET_ERR_LIST_FULL;
+ for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id == IPSET_INVALID_ID)
+ ret = before != 0 ? -IPSET_ERR_REF_EXIST
+ : list_set_add(map, i, id, timeout);
+ else if (elem->id != refid)
+ continue;
+ else if (with_timeout && list_set_expired(map, i))
+ ret = -IPSET_ERR_REF_EXIST;
+ else if (before)
+ ret = list_set_add(map, i, id, timeout);
+ else if (i + 1 < map->size)
+ ret = list_set_add(map, i + 1, id, timeout);
+ }
+ break;
+ case IPSET_DEL:
+ ret = -IPSET_ERR_EXIST;
+ for (i = 0; i < map->size && ret == -IPSET_ERR_EXIST; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id == IPSET_INVALID_ID) {
+ ret = before != 0 ? -IPSET_ERR_REF_EXIST
+ : -IPSET_ERR_EXIST;
+ break;
+ } else if (with_timeout && list_set_expired(map, i))
+ continue;
+ else if (elem->id == id &&
+ (before == 0 ||
+ (before > 0 &&
+ next_id_eq(map, i, refid))))
+ ret = list_set_del(map, id, i);
+ else if (before < 0 &&
+ elem->id == refid &&
+ next_id_eq(map, i, id))
+ ret = list_set_del(map, id, i + 1);
+ }
+ break;
+ default:
+ break;
+ }
+
+finish:
+ if (refid != IPSET_INVALID_ID)
+ ip_set_put_byindex(refid);
+ if (adt != IPSET_ADD || ret)
+ ip_set_put_byindex(id);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static void
+list_set_flush(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+ struct set_elem *elem;
+ u32 i;
+
+ for (i = 0; i < map->size; i++) {
+ elem = list_set_elem(map, i);
+ if (elem->id != IPSET_INVALID_ID) {
+ ip_set_put_byindex(elem->id);
+ elem->id = IPSET_INVALID_ID;
+ }
+ }
+}
+
+static void
+list_set_destroy(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ if (with_timeout(map->timeout))
+ del_timer_sync(&map->gc);
+ list_set_flush(set);
+ kfree(map);
+
+ set->data = NULL;
+}
+
+static int
+list_set_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ NLA_PUT_NET32(skb, IPSET_ATTR_SIZE, htonl(map->size));
+ if (with_timeout(map->timeout))
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+ NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
+ htonl(atomic_read(&set->ref) - 1));
+ NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map) + map->size * map->dsize));
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int
+list_set_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *atd, *nested;
+ u32 i, first = cb->args[2];
+ const struct set_elem *e;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[2] < map->size; cb->args[2]++) {
+ i = cb->args[2];
+ e = list_set_elem(map, i);
+ if (e->id == IPSET_INVALID_ID)
+ goto finish;
+ if (with_timeout(map->timeout) && list_set_expired(map, i))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (i == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ NLA_PUT_STRING(skb, IPSET_ATTR_NAME,
+ ip_set_name_byindex(e->id));
+ if (with_timeout(map->timeout)) {
+ const struct set_telem *te =
+ (const struct set_telem *) e;
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(te->timeout)));
+ }
+ ipset_nest_end(skb, nested);
+ }
+finish:
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, atd);
+ if (unlikely(i == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+static bool
+list_set_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct list_set *x = a->data;
+ const struct list_set *y = b->data;
+
+ return x->size == y->size &&
+ x->timeout == y->timeout;
+}
+
+static const struct ip_set_type_variant list_set = {
+ .kadt = list_set_kadt,
+ .uadt = list_set_uadt,
+ .destroy = list_set_destroy,
+ .flush = list_set_flush,
+ .head = list_set_head,
+ .list = list_set_list,
+ .same_set = list_set_same_set,
+};
+
+static void
+list_set_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct list_set *map = set->data;
+ struct set_telem *e;
+ u32 i;
+
+ /* Expired members are removed in place, so the set lock is taken
+ * for writing. i is unsigned: "i >= 0" could never end the loop. */
+ write_lock_bh(&set->lock);
+ for (i = map->size; i-- > 0; ) {
+ e = (struct set_telem *) list_set_elem(map, i);
+ if (e->id != IPSET_INVALID_ID &&
+ list_set_expired(map, i))
+ list_set_del(map, e->id, i);
+ }
+ write_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static void
+list_set_gc_init(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = list_set_gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+/* Create list:set type of sets */
+
+static bool
+init_list_set(struct ip_set *set, u32 size, size_t dsize,
+ unsigned long timeout)
+{
+ struct list_set *map;
+ struct set_elem *e;
+ u32 i;
+
+ map = kzalloc(sizeof(*map) + size * dsize, GFP_KERNEL);
+ if (!map)
+ return false;
+
+ map->size = size;
+ map->dsize = dsize;
+ map->timeout = timeout;
+ set->data = map;
+
+ for (i = 0; i < size; i++) {
+ e = list_set_elem(map, i);
+ e->id = IPSET_INVALID_ID;
+ }
+
+ return true;
+}
+
+static int
+list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ u32 size = IP_SET_LIST_DEFAULT_SIZE;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_SIZE])
+ size = ip_set_get_h32(tb[IPSET_ATTR_SIZE]);
+ if (size < IP_SET_LIST_MIN_SIZE)
+ size = IP_SET_LIST_MIN_SIZE;
+
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!init_list_set(set, size, sizeof(struct set_telem),
+ ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT])))
+ return -ENOMEM;
+
+ list_set_gc_init(set);
+ } else {
+ if (!init_list_set(set, size, sizeof(struct set_elem),
+ IPSET_NO_TIMEOUT))
+ return -ENOMEM;
+ }
+ set->variant = &list_set;
+ return 0;
+}
+
+static struct ip_set_type list_set_type __read_mostly = {
+ .name = "list:set",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_NAME | IPSET_DUMP_LAST,
+ .dimension = IPSET_DIM_ONE,
+ .family = AF_UNSPEC,
+ .revision = 0,
+ .create = list_set_create,
+ .create_policy = {
+ [IPSET_ATTR_SIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_NAME] = { .type = NLA_STRING,
+ .len = IPSET_MAXNAMELEN },
+ [IPSET_ATTR_NAMEREF] = { .type = NLA_STRING,
+ .len = IPSET_MAXNAMELEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+list_set_init(void)
+{
+ return ip_set_type_register(&list_set_type);
+}
+
+static void __exit
+list_set_fini(void)
+{
+ ip_set_type_unregister(&list_set_type);
+}
+
+module_init(list_set_init);
+module_exit(list_set_fini);
diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c
new file mode 100644
index 00000000000..23f8c816221
--- /dev/null
+++ b/net/netfilter/ipset/pfxlen.c
@@ -0,0 +1,291 @@
+#include <linux/netfilter/ipset/pfxlen.h>
+
+/*
+ * Prefixlen maps for fast conversions, by Jan Engelhardt.
+ */
+
+#define E(a, b, c, d) \
+ {.ip6 = { \
+ __constant_htonl(a), __constant_htonl(b), \
+ __constant_htonl(c), __constant_htonl(d), \
+ } }
+
+/*
+ * This table works for both IPv4 and IPv6;
+ * just use ip_set_netmask_map[prefixlength].ip.
+ */
+const union nf_inet_addr ip_set_netmask_map[] = {
+ E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_netmask_map);
+
+#undef E
+#define E(a, b, c, d) \
+ {.ip6 = { (__force __be32) a, (__force __be32) b, \
+ (__force __be32) c, (__force __be32) d, \
+ } }
+
+/*
+ * Same masks as above, but stored without htonl() conversion; works for both
+ * IPv4 and IPv6: just use ip_set_hostmask_map[prefixlength].ip.
+ */
+const union nf_inet_addr ip_set_hostmask_map[] = {
+ E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_hostmask_map);
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index a475edee091..5c48ffb60c2 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -43,11 +43,6 @@ EXPORT_SYMBOL(register_ip_vs_app);
EXPORT_SYMBOL(unregister_ip_vs_app);
EXPORT_SYMBOL(register_ip_vs_app_inc);
-/* ipvs application list head */
-static LIST_HEAD(ip_vs_app_list);
-static DEFINE_MUTEX(__ip_vs_app_mutex);
-
-
/*
* Get an ip_vs_app object
*/
@@ -67,7 +62,8 @@ static inline void ip_vs_app_put(struct ip_vs_app *app)
* Allocate/initialize app incarnation and register it in proto apps.
*/
static int
-ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
+ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
{
struct ip_vs_protocol *pp;
struct ip_vs_app *inc;
@@ -98,7 +94,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
}
}
- ret = pp->register_app(inc);
+ ret = pp->register_app(net, inc);
if (ret)
goto out;
@@ -119,7 +115,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
* Release app incarnation
*/
static void
-ip_vs_app_inc_release(struct ip_vs_app *inc)
+ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_protocol *pp;
@@ -127,7 +123,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
return;
if (pp->unregister_app)
- pp->unregister_app(inc);
+ pp->unregister_app(net, inc);
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
pp->name, inc->name, ntohs(inc->port));
@@ -168,15 +164,17 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc)
* Register an application incarnation in protocol applications
*/
int
-register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
+register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
int result;
- mutex_lock(&__ip_vs_app_mutex);
+ mutex_lock(&ipvs->app_mutex);
- result = ip_vs_app_inc_new(app, proto, port);
+ result = ip_vs_app_inc_new(net, app, proto, port);
- mutex_unlock(&__ip_vs_app_mutex);
+ mutex_unlock(&ipvs->app_mutex);
return result;
}
@@ -185,16 +183,17 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
/*
* ip_vs_app registration routine
*/
-int register_ip_vs_app(struct ip_vs_app *app)
+int register_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
- mutex_lock(&__ip_vs_app_mutex);
+ mutex_lock(&ipvs->app_mutex);
- list_add(&app->a_list, &ip_vs_app_list);
+ list_add(&app->a_list, &ipvs->app_list);
- mutex_unlock(&__ip_vs_app_mutex);
+ mutex_unlock(&ipvs->app_mutex);
return 0;
}
@@ -204,19 +203,20 @@ int register_ip_vs_app(struct ip_vs_app *app)
* ip_vs_app unregistration routine
* We are sure there are no app incarnations attached to services
*/
-void unregister_ip_vs_app(struct ip_vs_app *app)
+void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_app *inc, *nxt;
- mutex_lock(&__ip_vs_app_mutex);
+ mutex_lock(&ipvs->app_mutex);
list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
- ip_vs_app_inc_release(inc);
+ ip_vs_app_inc_release(net, inc);
}
list_del(&app->a_list);
- mutex_unlock(&__ip_vs_app_mutex);
+ mutex_unlock(&ipvs->app_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -226,7 +226,8 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
/*
* Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
*/
-int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
+int ip_vs_bind_app(struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
{
return pp->app_conn_bind(cp);
}
@@ -481,11 +482,11 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
* /proc/net/ip_vs_app entry function
*/
-static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
+static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos)
{
struct ip_vs_app *app, *inc;
- list_for_each_entry(app, &ip_vs_app_list, a_list) {
+ list_for_each_entry(app, &ipvs->app_list, a_list) {
list_for_each_entry(inc, &app->incs_list, a_list) {
if (pos-- == 0)
return inc;
@@ -497,19 +498,24 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
{
- mutex_lock(&__ip_vs_app_mutex);
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
- return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
+ mutex_lock(&ipvs->app_mutex);
+
+ return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN;
}
static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_app *inc, *app;
struct list_head *e;
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
++*pos;
if (v == SEQ_START_TOKEN)
- return ip_vs_app_idx(0);
+ return ip_vs_app_idx(ipvs, 0);
inc = v;
app = inc->app;
@@ -518,7 +524,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return list_entry(e, struct ip_vs_app, a_list);
/* go on to next application */
- for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
+ for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) {
app = list_entry(e, struct ip_vs_app, a_list);
list_for_each_entry(inc, &app->incs_list, a_list) {
return inc;
@@ -529,7 +535,9 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
{
- mutex_unlock(&__ip_vs_app_mutex);
+ struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq));
+
+ mutex_unlock(&ipvs->app_mutex);
}
static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
@@ -557,7 +565,8 @@ static const struct seq_operations ip_vs_app_seq_ops = {
static int ip_vs_app_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_app_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private));
}
static const struct file_operations ip_vs_app_fops = {
@@ -569,15 +578,36 @@ static const struct file_operations ip_vs_app_fops = {
};
#endif
-int __init ip_vs_app_init(void)
+static int __net_init __ip_vs_app_init(struct net *net)
{
- /* we will replace it with proc_net_ipvs_create() soon */
- proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->app_list);
+ __mutex_init(&ipvs->app_mutex, "ipvs->app_mutex", &ipvs->app_key);
+ proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops);
return 0;
}
+static void __net_exit __ip_vs_app_cleanup(struct net *net)
+{
+ proc_net_remove(net, "ip_vs_app");
+}
+
+static struct pernet_operations ip_vs_app_ops = {
+ .init = __ip_vs_app_init,
+ .exit = __ip_vs_app_cleanup,
+};
+
+int __init ip_vs_app_init(void)
+{
+ int rv;
+
+ rv = register_pernet_subsys(&ip_vs_app_ops);
+ return rv;
+}
+
void ip_vs_app_cleanup(void)
{
- proc_net_remove(&init_net, "ip_vs_app");
+ unregister_pernet_subsys(&ip_vs_app_ops);
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index e9adecdc8ca..9c2a517b69c 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -48,35 +48,32 @@
/*
* Connection hash size. Default is what was selected at compile time.
*/
-int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
/* size and mask values */
-int ip_vs_conn_tab_size;
-int ip_vs_conn_tab_mask;
+int ip_vs_conn_tab_size __read_mostly;
+static int ip_vs_conn_tab_mask __read_mostly;
/*
* Connection hash table: for input and output packets lookups of IPVS
*/
-static struct list_head *ip_vs_conn_tab;
+static struct hlist_head *ip_vs_conn_tab __read_mostly;
/* SLAB cache for IPVS connections */
static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
-/* counter for current IPVS connections */
-static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
-
/* counter for no client port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
/* random value for IPVS connection hash */
-static unsigned int ip_vs_conn_rnd;
+static unsigned int ip_vs_conn_rnd __read_mostly;
/*
* Fine locking granularity for big connection hash table
*/
-#define CT_LOCKARRAY_BITS 4
+#define CT_LOCKARRAY_BITS 5
#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
@@ -133,19 +130,19 @@ static inline void ct_write_unlock_bh(unsigned key)
/*
* Returns hash value for IPVS connection entry
*/
-static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
+static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto,
const union nf_inet_addr *addr,
__be16 port)
{
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
- return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
- (__force u32)port, proto, ip_vs_conn_rnd)
- & ip_vs_conn_tab_mask;
+ return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
+ (__force u32)port, proto, ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
#endif
- return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
- ip_vs_conn_rnd)
- & ip_vs_conn_tab_mask;
+ return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
+ ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
}
static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
@@ -166,18 +163,18 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
port = p->vport;
}
- return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+ return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
}
static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
- NULL, 0, &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
+ &cp->caddr, cp->cport, NULL, 0, &p);
- if (cp->dest && cp->dest->svc->pe) {
- p.pe = cp->dest->svc->pe;
+ if (cp->pe) {
+ p.pe = cp->pe;
p.pe_data = cp->pe_data;
p.pe_data_len = cp->pe_data_len;
}
@@ -186,7 +183,7 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
}
/*
- * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
+ * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
* returns bool success.
*/
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
@@ -204,7 +201,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
spin_lock(&cp->lock);
if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
- list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
+ hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]);
cp->flags |= IP_VS_CONN_F_HASHED;
atomic_inc(&cp->refcnt);
ret = 1;
@@ -237,7 +234,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
spin_lock(&cp->lock);
if (cp->flags & IP_VS_CONN_F_HASHED) {
- list_del(&cp->c_list);
+ hlist_del(&cp->c_list);
cp->flags &= ~IP_VS_CONN_F_HASHED;
atomic_dec(&cp->refcnt);
ret = 1;
@@ -262,18 +259,20 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp;
+ struct hlist_node *n;
hash = ip_vs_conn_hashkey_param(p, false);
ct_read_lock(hash);
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
if (cp->af == p->af &&
+ p->cport == cp->cport && p->vport == cp->vport &&
ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
- p->cport == cp->cport && p->vport == cp->vport &&
((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
- p->protocol == cp->protocol) {
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
/* HIT */
atomic_inc(&cp->refcnt);
ct_read_unlock(hash);
@@ -313,23 +312,23 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
struct ip_vs_conn_param *p)
{
__be16 _ports[2], *pptr;
+ struct net *net = skb_net(skb);
pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
if (pptr == NULL)
return 1;
if (likely(!inverse))
- ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
- &iph->daddr, pptr[1], p);
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
+ pptr[0], &iph->daddr, pptr[1], p);
else
- ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
- &iph->saddr, pptr[0], p);
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
+ pptr[1], &iph->saddr, pptr[0], p);
return 0;
}
struct ip_vs_conn *
ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph,
unsigned int proto_off, int inverse)
{
@@ -347,14 +346,17 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp;
+ struct hlist_node *n;
hash = ip_vs_conn_hashkey_param(p, false);
ct_read_lock(hash);
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
+ if (!ip_vs_conn_net_eq(cp, p->net))
+ continue;
if (p->pe_data && p->pe->ct_match) {
- if (p->pe->ct_match(p, cp))
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp))
goto out;
continue;
}
@@ -394,6 +396,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp, *ret=NULL;
+ struct hlist_node *n;
/*
* Check for "full" addressed entries
@@ -402,12 +405,13 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
ct_read_lock(hash);
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
if (cp->af == p->af &&
+ p->vport == cp->cport && p->cport == cp->dport &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
- p->vport == cp->cport && p->cport == cp->dport &&
- p->protocol == cp->protocol) {
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
/* HIT */
atomic_inc(&cp->refcnt);
ret = cp;
@@ -428,7 +432,6 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
struct ip_vs_conn *
ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph,
unsigned int proto_off, int inverse)
{
@@ -611,9 +614,9 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
struct ip_vs_dest *dest;
if ((cp) && (!cp->dest)) {
- dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
- &cp->vaddr, cp->vport,
- cp->protocol);
+ dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
+ cp->dport, &cp->vaddr, cp->vport,
+ cp->protocol, cp->fwmark);
ip_vs_bind_dest(cp, dest);
return dest;
} else
@@ -686,13 +689,14 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
int ip_vs_check_template(struct ip_vs_conn *ct)
{
struct ip_vs_dest *dest = ct->dest;
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
/*
* Checking the dest server status.
*/
if ((dest == NULL) ||
!(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
- (sysctl_ip_vs_expire_quiescent_template &&
+ (ipvs->sysctl_expire_quiescent_template &&
(atomic_read(&dest->weight) == 0))) {
IP_VS_DBG_BUF(9, "check_template: dest not available for "
"protocol %s s:%s:%d v:%s:%d "
@@ -730,6 +734,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
static void ip_vs_conn_expire(unsigned long data)
{
struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
cp->timeout = 60*HZ;
@@ -765,13 +770,14 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->flags & IP_VS_CONN_F_NFCT)
ip_vs_conn_drop_conntrack(cp);
+ ip_vs_pe_put(cp->pe);
kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
- atomic_dec(&ip_vs_conn_count);
+ atomic_dec(&ipvs->conn_count);
kmem_cache_free(ip_vs_conn_cachep, cp);
return;
@@ -802,10 +808,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p,
const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
- struct ip_vs_dest *dest)
+ struct ip_vs_dest *dest, __u32 fwmark)
{
struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
+ struct netns_ipvs *ipvs = net_ipvs(p->net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+ p->protocol);
cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
@@ -813,8 +821,9 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
return NULL;
}
- INIT_LIST_HEAD(&cp->c_list);
+ INIT_HLIST_NODE(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+ ip_vs_conn_net_set(cp, p->net);
cp->af = p->af;
cp->protocol = p->protocol;
ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
@@ -826,7 +835,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
&cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
- if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+ cp->fwmark = fwmark;
+ if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
+ ip_vs_pe_get(p->pe);
+ cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
}
@@ -842,7 +854,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
- atomic_inc(&ip_vs_conn_count);
+ atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
@@ -861,8 +873,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
#endif
ip_vs_bind_xmit(cp);
- if (unlikely(pp && atomic_read(&pp->appcnt)))
- ip_vs_bind_app(cp, pp);
+ if (unlikely(pd && atomic_read(&pd->appcnt)))
+ ip_vs_bind_app(cp, pd->pp);
/*
* Allow conntrack to be preserved. By default, conntrack
@@ -871,7 +883,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
* IP_VS_CONN_F_ONE_PACKET too.
*/
- if (ip_vs_conntrack_enabled())
+ if (ip_vs_conntrack_enabled(ipvs))
cp->flags |= IP_VS_CONN_F_NFCT;
/* Hash it in the ip_vs_conn_tab finally */
@@ -884,18 +896,24 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
* /proc/net/ip_vs_conn entries
*/
#ifdef CONFIG_PROC_FS
+struct ip_vs_iter_state {
+ struct seq_net_private p;
+ struct hlist_head *l;
+};
static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
{
int idx;
struct ip_vs_conn *cp;
+ struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_node *n;
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
ct_read_lock_bh(idx);
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) {
if (pos-- == 0) {
- seq->private = &ip_vs_conn_tab[idx];
- return cp;
+ iter->l = &ip_vs_conn_tab[idx];
+ return cp;
}
}
ct_read_unlock_bh(idx);
@@ -906,14 +924,18 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
{
- seq->private = NULL;
+ struct ip_vs_iter_state *iter = seq->private;
+
+ iter->l = NULL;
return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
}
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_conn *cp = v;
- struct list_head *e, *l = seq->private;
+ struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_node *e;
+ struct hlist_head *l = iter->l;
int idx;
++*pos;
@@ -921,27 +943,28 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return ip_vs_conn_array(seq, 0);
/* more on same hash chain? */
- if ((e = cp->c_list.next) != l)
- return list_entry(e, struct ip_vs_conn, c_list);
+ if ((e = cp->c_list.next))
+ return hlist_entry(e, struct ip_vs_conn, c_list);
idx = l - ip_vs_conn_tab;
ct_read_unlock_bh(idx);
while (++idx < ip_vs_conn_tab_size) {
ct_read_lock_bh(idx);
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
- seq->private = &ip_vs_conn_tab[idx];
+ hlist_for_each_entry(cp, e, &ip_vs_conn_tab[idx], c_list) {
+ iter->l = &ip_vs_conn_tab[idx];
return cp;
}
ct_read_unlock_bh(idx);
}
- seq->private = NULL;
+ iter->l = NULL;
return NULL;
}
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
{
- struct list_head *l = seq->private;
+ struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_head *l = iter->l;
if (l)
ct_read_unlock_bh(l - ip_vs_conn_tab);
@@ -955,18 +978,19 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
"Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
else {
const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
size_t len = 0;
- if (cp->dest && cp->pe_data &&
- cp->dest->svc->pe->show_pe_data) {
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
+ if (cp->pe_data) {
pe_data[0] = ' ';
- len = strlen(cp->dest->svc->pe->name);
- memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+ len = strlen(cp->pe->name);
+ memcpy(pe_data + 1, cp->pe->name, len);
pe_data[len + 1] = ' ';
len += 2;
- len += cp->dest->svc->pe->show_pe_data(cp,
- pe_data + len);
+ len += cp->pe->show_pe_data(cp, pe_data + len);
}
pe_data[len] = '\0';
@@ -1004,7 +1028,8 @@ static const struct seq_operations ip_vs_conn_seq_ops = {
static int ip_vs_conn_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_conn_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state));
}
static const struct file_operations ip_vs_conn_fops = {
@@ -1031,6 +1056,10 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
"Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
else {
const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
+
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -1067,7 +1096,8 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &ip_vs_conn_sync_seq_ops);
+ return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state));
}
static const struct file_operations ip_vs_conn_sync_fops = {
@@ -1113,7 +1143,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
}
/* Called from keventd and must protect itself from softirqs */
-void ip_vs_random_dropentry(void)
+void ip_vs_random_dropentry(struct net *net)
{
int idx;
struct ip_vs_conn *cp;
@@ -1123,17 +1153,19 @@ void ip_vs_random_dropentry(void)
*/
for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
unsigned hash = net_random() & ip_vs_conn_tab_mask;
+ struct hlist_node *n;
/*
* Lock is actually needed in this loop.
*/
ct_write_lock_bh(hash);
- list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
-
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
case IP_VS_TCP_S_SYN_RECV:
@@ -1168,20 +1200,24 @@ void ip_vs_random_dropentry(void)
/*
* Flush all the connection entries in the ip_vs_conn_tab
*/
-static void ip_vs_conn_flush(void)
+static void ip_vs_conn_flush(struct net *net)
{
int idx;
struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- flush_again:
+flush_again:
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+ struct hlist_node *n;
+
/*
* Lock is actually needed in this loop.
*/
ct_write_lock_bh(idx);
- list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-
+ hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) {
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
if (cp->control) {
@@ -1194,16 +1230,41 @@ static void ip_vs_conn_flush(void)
/* the counter may be not NULL, because maybe some conn entries
are run by slow timer handler or unhashed but still referred */
- if (atomic_read(&ip_vs_conn_count) != 0) {
+ if (atomic_read(&ipvs->conn_count) != 0) {
schedule();
goto flush_again;
}
}
+/*
+ * per netns init and exit
+ */
+int __net_init __ip_vs_conn_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ atomic_set(&ipvs->conn_count, 0);
+
+ proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
+ proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+ return 0;
+}
+static void __net_exit __ip_vs_conn_cleanup(struct net *net)
+{
+ /* flush all the connection entries first */
+ ip_vs_conn_flush(net);
+ proc_net_remove(net, "ip_vs_conn");
+ proc_net_remove(net, "ip_vs_conn_sync");
+}
+static struct pernet_operations ipvs_conn_ops = {
+ .init = __ip_vs_conn_init,
+ .exit = __ip_vs_conn_cleanup,
+};
int __init ip_vs_conn_init(void)
{
int idx;
+ int retc;
/* Compute size and mask */
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
@@ -1212,8 +1273,7 @@ int __init ip_vs_conn_init(void)
/*
* Allocate the connection hash table and initialize its list heads
*/
- ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size *
- sizeof(struct list_head));
+ ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
if (!ip_vs_conn_tab)
return -ENOMEM;
@@ -1233,32 +1293,25 @@ int __init ip_vs_conn_init(void)
IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
sizeof(struct ip_vs_conn));
- for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
- INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
- }
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
+ INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
- proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
- proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+ retc = register_pernet_subsys(&ipvs_conn_ops);
/* calculate the random value for connection hash */
get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
- return 0;
+ return retc;
}
-
void ip_vs_conn_cleanup(void)
{
- /* flush all the connection entries first */
- ip_vs_conn_flush();
-
+ unregister_pernet_subsys(&ipvs_conn_ops);
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
- proc_net_remove(&init_net, "ip_vs_conn");
- proc_net_remove(&init_net, "ip_vs_conn_sync");
vfree(ip_vs_conn_tab);
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b4e51e9c5a0..2d1f932add4 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -41,6 +41,7 @@
#include <net/icmp.h> /* for icmp_send */
#include <net/route.h>
#include <net/ip6_checksum.h>
+#include <net/netns/generic.h> /* net_generic() */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
@@ -68,6 +69,12 @@ EXPORT_SYMBOL(ip_vs_conn_put);
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
+int ip_vs_net_id __read_mostly;
+#ifdef IP_VS_GENERIC_NETNS
+EXPORT_SYMBOL(ip_vs_net_id);
+#endif
+/* netns cnt used for uniqueness */
+static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
/* ID used in ICMP lookups */
#define icmp_id(icmph) (((icmph)->un).echo.id)
@@ -108,21 +115,28 @@ static inline void
ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- spin_lock(&dest->stats.lock);
- dest->stats.ustats.inpkts++;
- dest->stats.ustats.inbytes += skb->len;
- spin_unlock(&dest->stats.lock);
-
- spin_lock(&dest->svc->stats.lock);
- dest->svc->stats.ustats.inpkts++;
- dest->svc->stats.ustats.inbytes += skb->len;
- spin_unlock(&dest->svc->stats.lock);
-
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.inpkts++;
- ip_vs_stats.ustats.inbytes += skb->len;
- spin_unlock(&ip_vs_stats.lock);
+ struct ip_vs_cpu_stats *s;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(dest->svc->stats.cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(ipvs->cpustats);
+ s->ustats.inpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
}
}
@@ -131,21 +145,28 @@ static inline void
ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
- spin_lock(&dest->stats.lock);
- dest->stats.ustats.outpkts++;
- dest->stats.ustats.outbytes += skb->len;
- spin_unlock(&dest->stats.lock);
-
- spin_lock(&dest->svc->stats.lock);
- dest->svc->stats.ustats.outpkts++;
- dest->svc->stats.ustats.outbytes += skb->len;
- spin_unlock(&dest->svc->stats.lock);
-
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.outpkts++;
- ip_vs_stats.ustats.outbytes += skb->len;
- spin_unlock(&ip_vs_stats.lock);
+ struct ip_vs_cpu_stats *s;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(dest->svc->stats.cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(ipvs->cpustats);
+ s->ustats.outpkts++;
+ u64_stats_update_begin(&s->syncp);
+ s->ustats.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
}
}
@@ -153,41 +174,44 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
static inline void
ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
{
- spin_lock(&cp->dest->stats.lock);
- cp->dest->stats.ustats.conns++;
- spin_unlock(&cp->dest->stats.lock);
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_cpu_stats *s;
+
+ s = this_cpu_ptr(cp->dest->stats.cpustats);
+ s->ustats.conns++;
- spin_lock(&svc->stats.lock);
- svc->stats.ustats.conns++;
- spin_unlock(&svc->stats.lock);
+ s = this_cpu_ptr(svc->stats.cpustats);
+ s->ustats.conns++;
- spin_lock(&ip_vs_stats.lock);
- ip_vs_stats.ustats.conns++;
- spin_unlock(&ip_vs_stats.lock);
+ s = this_cpu_ptr(ipvs->cpustats);
+ s->ustats.conns++;
}
static inline int
ip_vs_set_state(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
- if (unlikely(!pp->state_transition))
+ if (unlikely(!pd->pp->state_transition))
return 0;
- return pp->state_transition(cp, direction, skb, pp);
+ return pd->pp->state_transition(cp, direction, skb, pd);
}
-static inline void
+static inline int
ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
struct sk_buff *skb, int protocol,
const union nf_inet_addr *caddr, __be16 cport,
const union nf_inet_addr *vaddr, __be16 vport,
struct ip_vs_conn_param *p)
{
- ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+ ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
+ vport, p);
p->pe = svc->pe;
if (p->pe && p->pe->fill_param)
- p->pe->fill_param(p, skb);
+ return p->pe->fill_param(p, skb);
+
+ return 0;
}
/*
@@ -200,7 +224,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
struct sk_buff *skb,
- __be16 ports[2])
+ __be16 src_port, __be16 dst_port, int *ignored)
{
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
@@ -224,8 +248,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
"mnet %s\n",
- IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
- IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
+ IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
+ IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
IP_VS_DBG_ADDR(svc->af, &snet));
/*
@@ -247,14 +271,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
__be16 vport = 0;
- if (ports[1] == svc->port) {
+ if (dst_port == svc->port) {
/* non-FTP template:
* <protocol, caddr, 0, vaddr, vport, daddr, dport>
* FTP template:
* <protocol, caddr, 0, vaddr, 0, daddr, 0>
*/
if (svc->port != FTPPORT)
- vport = ports[1];
+ vport = dst_port;
} else {
/* Note: persistent fwmark-based services and
* persistent port zero service are handled here.
@@ -268,24 +292,31 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
vaddr = &fwmark;
}
}
- ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
- vaddr, vport, &param);
+ /* return *ignored = -1 so NF_DROP can be used */
+ if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+ vaddr, vport, &param) < 0) {
+ *ignored = -1;
+ return NULL;
+ }
}
/* Check if a template already exists */
ct = ip_vs_ct_in_get(&param);
if (!ct || !ip_vs_check_template(ct)) {
- /* No template found or the dest of the connection
+ /*
+ * No template found or the dest of the connection
* template is not available.
+ * return *ignored=0 i.e. ICMP and NF_DROP
*/
dest = svc->scheduler->schedule(svc, skb);
if (!dest) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
kfree(param.pe_data);
+ *ignored = 0;
return NULL;
}
- if (ports[1] == svc->port && svc->port != FTPPORT)
+ if (dst_port == svc->port && svc->port != FTPPORT)
dport = dest->port;
/* Create a template
@@ -293,9 +324,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* and thus param.pe_data will be destroyed
* when the template expires */
ct = ip_vs_conn_new(&param, &dest->addr, dport,
- IP_VS_CONN_F_TEMPLATE, dest);
+ IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
if (ct == NULL) {
kfree(param.pe_data);
+ *ignored = -1;
return NULL;
}
@@ -306,7 +338,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
kfree(param.pe_data);
}
- dport = ports[1];
+ dport = dst_port;
if (dport == svc->port && dest->port)
dport = dest->port;
@@ -317,11 +349,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/*
* Create a new connection according to the template
*/
- ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
- &iph.daddr, ports[1], &param);
- cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
+ ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr,
+ src_port, &iph.daddr, dst_port, &param);
+
+ cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
if (cp == NULL) {
ip_vs_conn_put(ct);
+ *ignored = -1;
return NULL;
}
@@ -341,11 +375,27 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* It selects a server according to the virtual service, and
* creates a connection entry.
* Protocols supported: TCP, UDP
+ *
+ * Usage of *ignored
+ *
+ * 1 : protocol tried to schedule (eg. on SYN), found svc but the
+ * svc/scheduler decides that this packet should be accepted with
+ * NF_ACCEPT because it must not be scheduled.
+ *
+ * 0 : scheduler can not find destination, so try bypass or
+ * return ICMP and then NF_DROP (ip_vs_leave).
+ *
+ * -1 : scheduler tried to schedule but fatal error occurred, eg.
+ * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
+ * failure such as missing Call-ID, ENOMEM on skb_linearize
+ * or pe_data. In this case we should return NF_DROP without
+ * any attempts to send ICMP with ip_vs_leave.
*/
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
- struct ip_vs_protocol *pp, int *ignored)
+ struct ip_vs_proto_data *pd, int *ignored)
{
+ struct ip_vs_protocol *pp = pd->pp;
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_dest *dest;
@@ -371,12 +421,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
}
/*
- * Do not schedule replies from local real server. It is risky
- * for fwmark services but mostly for persistent services.
+ * Do not schedule replies from local real server.
*/
if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
- (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
- (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+ (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) {
IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
"Not scheduling reply for existing connection");
__ip_vs_conn_put(cp);
@@ -386,10 +434,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
/*
* Persistent service
*/
- if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
- *ignored = 0;
- return ip_vs_sched_persist(svc, skb, pptr);
- }
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
+
+ *ignored = 0;
/*
* Non-persistent service
@@ -402,8 +450,6 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
return NULL;
}
- *ignored = 0;
-
dest = svc->scheduler->schedule(svc, skb);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -419,13 +465,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
*/
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
- pptr[0], &iph.daddr, pptr[1], &p);
+
+ ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
+ &iph.saddr, pptr[0], &iph.daddr, pptr[1],
+ &p);
cp = ip_vs_conn_new(&p, &dest->addr,
dest->port ? dest->port : pptr[1],
- flags, dest);
- if (!cp)
+ flags, dest, skb->mark);
+ if (!cp) {
+ *ignored = -1;
return NULL;
+ }
}
IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
@@ -447,11 +497,14 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
* no destination is available for a new connection.
*/
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
+ struct net *net;
+ struct netns_ipvs *ipvs;
__be16 _ports[2], *pptr;
struct ip_vs_iphdr iph;
int unicast;
+
ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
@@ -459,18 +512,20 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ip_vs_service_put(svc);
return NF_DROP;
}
+ net = skb_net(skb);
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
else
#endif
- unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
+ unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST);
/* if it is fwmark-based service, the cache_bypass sysctl is up
and the destination is a non-local unicast, then create
a cache_bypass connection entry */
- if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
+ ipvs = net_ipvs(net);
+ if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
int ret, cs;
struct ip_vs_conn *cp;
unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
@@ -484,12 +539,12 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->af, iph.protocol,
+ ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
&iph.saddr, pptr[0],
&iph.daddr, pptr[1], &p);
cp = ip_vs_conn_new(&p, &daddr, 0,
IP_VS_CONN_F_BYPASS | flags,
- NULL);
+ NULL, skb->mark);
if (!cp)
return NF_DROP;
}
@@ -498,10 +553,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ip_vs_in_stats(cp, skb);
/* set state */
- cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+ cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
/* transmit the first SYN packet */
- ret = cp->packet_xmit(skb, cp, pp);
+ ret = cp->packet_xmit(skb, cp, pd->pp);
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
@@ -674,7 +729,7 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
#endif
/* Handle relevant response ICMP messages - forward to the right
- * destination host. Used for NAT and local client.
+ * destination host.
*/
static int handle_response_icmp(int af, struct sk_buff *skb,
union nf_inet_addr *snet,
@@ -682,6 +737,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
struct ip_vs_protocol *pp,
unsigned int offset, unsigned int ihl)
{
+ struct netns_ipvs *ipvs;
unsigned int verdict = NF_DROP;
if (IP_VS_FWD_METHOD(cp) != 0) {
@@ -703,6 +759,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
if (!skb_make_writable(skb, offset))
goto out;
+ ipvs = net_ipvs(skb_net(skb));
+
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
ip_vs_nat_icmp_v6(skb, pp, cp, 1);
@@ -712,11 +770,11 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+ if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0)
goto out;
} else
#endif
- if ((sysctl_ip_vs_snat_reroute ||
+ if ((ipvs->sysctl_snat_reroute ||
skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
ip_route_me_harder(skb, RTN_LOCAL) != 0)
goto out;
@@ -808,7 +866,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
ip_vs_fill_iphdr(AF_INET, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
if (!cp)
return NF_ACCEPT;
@@ -885,7 +943,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
if (!cp)
return NF_ACCEPT;
@@ -921,12 +979,14 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
}
/* Handle response packets: rewrite addresses and send away...
- * Used for NAT and local client.
*/
static unsigned int
-handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
struct ip_vs_conn *cp, int ihl)
{
+ struct ip_vs_protocol *pp = pd->pp;
+ struct netns_ipvs *ipvs;
+
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
if (!skb_make_writable(skb, ihl))
@@ -961,13 +1021,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* if it came from this machine itself. So re-compute
* the routing information.
*/
+ ipvs = net_ipvs(skb_net(skb));
+
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+ if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0)
goto drop;
} else
#endif
- if ((sysctl_ip_vs_snat_reroute ||
+ if ((ipvs->sysctl_snat_reroute ||
skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
ip_route_me_harder(skb, RTN_LOCAL) != 0)
goto drop;
@@ -975,7 +1037,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
ip_vs_out_stats(cp, skb);
- ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+ ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
if (!(cp->flags & IP_VS_CONN_F_NFCT))
ip_vs_notrack(skb);
@@ -999,9 +1061,12 @@ drop:
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
+ struct net *net = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs;
EnterFunction(11);
@@ -1022,6 +1087,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
+ net = skb_net(skb);
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
@@ -1045,9 +1111,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
- pp = ip_vs_proto_get(iph.protocol);
- if (unlikely(!pp))
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
return NF_ACCEPT;
+ pp = pd->pp;
/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
@@ -1073,11 +1140,12 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
+ cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
+ ipvs = net_ipvs(net);
if (likely(cp))
- return handle_response(af, skb, pp, cp, iph.len);
- if (sysctl_ip_vs_nat_icmp_send &&
+ return handle_response(af, skb, pd, cp, iph.len);
+ if (ipvs->sysctl_nat_icmp_send &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
pp->protocol == IPPROTO_SCTP)) {
@@ -1087,7 +1155,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
sizeof(_ports), _ports);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
- if (ip_vs_lookup_real_service(af, iph.protocol,
+ if (ip_vs_lookup_real_service(net, af, iph.protocol,
&iph.saddr,
pptr[0])) {
/*
@@ -1202,14 +1270,15 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
static int
ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
{
+ struct net *net = NULL;
struct iphdr *iph;
struct icmphdr _icmph, *ic;
struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
struct ip_vs_iphdr ciph;
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
unsigned int offset, ihl, verdict;
- union nf_inet_addr snet;
*related = 1;
@@ -1249,9 +1318,11 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- pp = ip_vs_proto_get(cih->protocol);
- if (!pp)
+ net = skb_net(skb);
+ pd = ip_vs_proto_data_get(net, cih->protocol);
+ if (!pd)
return NF_ACCEPT;
+ pp = pd->pp;
/* Is the embedded protocol header present? */
if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
@@ -1265,18 +1336,9 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
ip_vs_fill_iphdr(AF_INET, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
- if (!cp) {
- /* The packet could also belong to a local client */
- cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
- if (cp) {
- snet.ip = iph->saddr;
- return handle_response_icmp(AF_INET, skb, &snet,
- cih->protocol, cp, pp,
- offset, ihl);
- }
+ cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1);
+ if (!cp)
return NF_ACCEPT;
- }
verdict = NF_DROP;
@@ -1312,6 +1374,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
static int
ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
{
+ struct net *net = NULL;
struct ipv6hdr *iph;
struct icmp6hdr _icmph, *ic;
struct ipv6hdr _ciph, *cih; /* The ip header contained
@@ -1319,8 +1382,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
struct ip_vs_iphdr ciph;
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
unsigned int offset, verdict;
- union nf_inet_addr snet;
struct rt6_info *rt;
*related = 1;
@@ -1361,9 +1424,11 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- pp = ip_vs_proto_get(cih->nexthdr);
- if (!pp)
+ net = skb_net(skb);
+ pd = ip_vs_proto_data_get(net, cih->nexthdr);
+ if (!pd)
return NF_ACCEPT;
+ pp = pd->pp;
/* Is the embedded protocol header present? */
/* TODO: we don't support fragmentation at the moment anyways */
@@ -1377,19 +1442,9 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
- if (!cp) {
- /* The packet could also belong to a local client */
- cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
- if (cp) {
- ipv6_addr_copy(&snet.in6, &iph->saddr);
- return handle_response_icmp(AF_INET6, skb, &snet,
- cih->nexthdr,
- cp, pp, offset,
- sizeof(struct ipv6hdr));
- }
+ cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1);
+ if (!cp)
return NF_ACCEPT;
- }
verdict = NF_DROP;
@@ -1423,10 +1478,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
+ struct net *net;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, restart, pkts;
+ struct netns_ipvs *ipvs;
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
@@ -1480,20 +1538,21 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
+ net = skb_net(skb);
/* Protocol supported? */
- pp = ip_vs_proto_get(iph.protocol);
- if (unlikely(!pp))
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
return NF_ACCEPT;
-
+ pp = pd->pp;
/*
* Check if the packet belongs to an existing connection entry
*/
- cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
+ cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
if (unlikely(!cp)) {
int v;
- if (!pp->conn_schedule(af, skb, pp, &v, &cp))
+ if (!pp->conn_schedule(af, skb, pd, &v, &cp))
return v;
}
@@ -1505,12 +1564,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
}
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
-
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
- if (sysctl_ip_vs_expire_nodest_conn) {
+ if (ipvs->sysctl_expire_nodest_conn) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
@@ -1521,7 +1581,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
}
ip_vs_in_stats(cp, skb);
- restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+ restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
if (cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp);
/* do not touch skb anymore */
@@ -1535,35 +1595,41 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
*
* Sync connection if it is about to close to
* encourage the standby servers to update the connections timeout
+ *
+ * For ONE_PKT let ip_vs_sync_conn() do the filter work.
*/
- pkts = atomic_add_return(1, &cp->in_pkts);
- if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ pkts = ipvs->sysctl_sync_threshold[0];
+ else
+ pkts = atomic_add_return(1, &cp->in_pkts);
+
+ if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
cp->protocol == IPPROTO_SCTP) {
if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
- (pkts % sysctl_ip_vs_sync_threshold[1]
- == sysctl_ip_vs_sync_threshold[0])) ||
+ (pkts % ipvs->sysctl_sync_threshold[1]
+ == ipvs->sysctl_sync_threshold[0])) ||
(cp->old_state != cp->state &&
((cp->state == IP_VS_SCTP_S_CLOSED) ||
(cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
(cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
- ip_vs_sync_conn(cp);
+ ip_vs_sync_conn(net, cp);
goto out;
}
}
/* Keep this block last: TCP and others with pp->num_states <= 1 */
- else if (af == AF_INET &&
- (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+ else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
(((cp->protocol != IPPROTO_TCP ||
cp->state == IP_VS_TCP_S_ESTABLISHED) &&
- (pkts % sysctl_ip_vs_sync_threshold[1]
- == sysctl_ip_vs_sync_threshold[0])) ||
+ (pkts % ipvs->sysctl_sync_threshold[1]
+ == ipvs->sysctl_sync_threshold[0])) ||
((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
(cp->state == IP_VS_TCP_S_CLOSE) ||
(cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
(cp->state == IP_VS_TCP_S_TIME_WAIT)))))
- ip_vs_sync_conn(cp);
+ ip_vs_sync_conn(net, cp);
out:
cp->old_state = cp->state;
@@ -1782,7 +1848,39 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
},
#endif
};
+/*
+ * Initialize IP Virtual Server netns mem.
+ *
+ * Per-netns init hook (installed as ipvs_core_ops.init).  Looks up this
+ * namespace's struct netns_ipvs with net_generic(), links it to @net in
+ * both directions (ipvs->net and net->ipvs), stamps it with a generation
+ * number taken from ipvs_netns_cnt and logs the structure size and id.
+ *
+ * Returns 0 on success, -ENOMEM if net_generic() yields NULL.
+ */
+static int __net_init __ip_vs_init(struct net *net)
+{
+ struct netns_ipvs *ipvs;
+ ipvs = net_generic(net, ip_vs_net_id);
+ if (ipvs == NULL) {
+ pr_err("%s(): no memory.\n", __func__);
+ return -ENOMEM;
+ }
+ ipvs->net = net;
+ /* Counters used for creating unique names.
+ * NOTE(review): the read and the increment below are two separate
+ * atomic operations, so ids are unique only if calls to this hook
+ * are serialized - confirm against the pernet core.
+ */
+ ipvs->gen = atomic_read(&ipvs_netns_cnt);
+ atomic_inc(&ipvs_netns_cnt);
+ net->ipvs = ipvs;
+ printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
+ sizeof(struct netns_ipvs), ipvs->gen);
+ return 0;
+}
+
+static void __net_exit __ip_vs_cleanup(struct net *net) /* per-netns exit hook (ipvs_core_ops.exit) */
+{
+ IP_VS_DBG(10, "ipvs netns %d released\n", net_ipvs(net)->gen); /* debug trace only; nothing is freed here */
+}
+
+static struct pernet_operations ipvs_core_ops = { /* registered in ip_vs_init(), unregistered in ip_vs_cleanup() */
+ .init = __ip_vs_init,
+ .exit = __ip_vs_cleanup,
+ .id = &ip_vs_net_id, /* same id __ip_vs_init() passes to net_generic() */
+ .size = sizeof(struct netns_ipvs), /* presumably the per-netns area the pernet core allocates - confirm */
+};
/*
* Initialize IP Virtual Server
@@ -1791,8 +1889,11 @@ static int __init ip_vs_init(void)
{
int ret;
- ip_vs_estimator_init();
+ ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
+ if (ret < 0)
+ return ret;
+ ip_vs_estimator_init();
ret = ip_vs_control_init();
if (ret < 0) {
pr_err("can't setup control.\n");
@@ -1813,15 +1914,23 @@ static int __init ip_vs_init(void)
goto cleanup_app;
}
+ ret = ip_vs_sync_init();
+ if (ret < 0) {
+ pr_err("can't setup sync data.\n");
+ goto cleanup_conn;
+ }
+
ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
if (ret < 0) {
pr_err("can't register hooks.\n");
- goto cleanup_conn;
+ goto cleanup_sync;
}
pr_info("ipvs loaded.\n");
return ret;
+cleanup_sync:
+ ip_vs_sync_cleanup();
cleanup_conn:
ip_vs_conn_cleanup();
cleanup_app:
@@ -1831,17 +1940,20 @@ static int __init ip_vs_init(void)
ip_vs_control_cleanup();
cleanup_estimator:
ip_vs_estimator_cleanup();
+ unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
return ret;
}
static void __exit ip_vs_cleanup(void)
{
nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ ip_vs_sync_cleanup();
ip_vs_conn_cleanup();
ip_vs_app_cleanup();
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
ip_vs_estimator_cleanup();
+ unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
pr_info("ipvs unloaded.\n");
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 22f7ad5101a..a60b20fa142 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -38,6 +38,7 @@
#include <linux/mutex.h>
#include <net/net_namespace.h>
+#include <linux/nsproxy.h>
#include <net/ip.h>
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
@@ -57,42 +58,7 @@ static DEFINE_MUTEX(__ip_vs_mutex);
/* lock for service table */
static DEFINE_RWLOCK(__ip_vs_svc_lock);
-/* lock for table with the real services */
-static DEFINE_RWLOCK(__ip_vs_rs_lock);
-
-/* lock for state and timeout tables */
-static DEFINE_SPINLOCK(ip_vs_securetcp_lock);
-
-/* lock for drop entry handling */
-static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
-
-/* lock for drop packet handling */
-static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
-
-/* 1/rate drop and drop-entry variables */
-int ip_vs_drop_rate = 0;
-int ip_vs_drop_counter = 0;
-static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
-
-/* number of virtual services */
-static int ip_vs_num_services = 0;
-
/* sysctl variables */
-static int sysctl_ip_vs_drop_entry = 0;
-static int sysctl_ip_vs_drop_packet = 0;
-static int sysctl_ip_vs_secure_tcp = 0;
-static int sysctl_ip_vs_amemthresh = 1024;
-static int sysctl_ip_vs_am_droprate = 10;
-int sysctl_ip_vs_cache_bypass = 0;
-int sysctl_ip_vs_expire_nodest_conn = 0;
-int sysctl_ip_vs_expire_quiescent_template = 0;
-int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
-int sysctl_ip_vs_nat_icmp_send = 0;
-#ifdef CONFIG_IP_VS_NFCT
-int sysctl_ip_vs_conntrack;
-#endif
-int sysctl_ip_vs_snat_reroute = 1;
-
#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;
@@ -105,18 +71,17 @@ int ip_vs_get_debug_level(void)
#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
-static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
+static int __ip_vs_addr_is_local_v6(struct net *net,
+ const struct in6_addr *addr)
{
struct rt6_info *rt;
- struct flowi fl = {
- .oif = 0,
- .fl6_dst = *addr,
- .fl6_src = { .s6_addr32 = {0, 0, 0, 0} },
+ struct flowi6 fl6 = {
+ .daddr = *addr,
};
- rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+ rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
- return 1;
+ return 1;
return 0;
}
@@ -125,7 +90,7 @@ static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
* update_defense_level is called from keventd and from sysctl,
* so it needs to protect itself from softirqs
*/
-static void update_defense_level(void)
+static void update_defense_level(struct netns_ipvs *ipvs)
{
struct sysinfo i;
static int old_secure_tcp = 0;
@@ -141,73 +106,73 @@ static void update_defense_level(void)
/* si_swapinfo(&i); */
/* availmem = availmem - (i.totalswap - i.freeswap); */
- nomem = (availmem < sysctl_ip_vs_amemthresh);
+ nomem = (availmem < ipvs->sysctl_amemthresh);
local_bh_disable();
/* drop_entry */
- spin_lock(&__ip_vs_dropentry_lock);
- switch (sysctl_ip_vs_drop_entry) {
+ spin_lock(&ipvs->dropentry_lock);
+ switch (ipvs->sysctl_drop_entry) {
case 0:
- atomic_set(&ip_vs_dropentry, 0);
+ atomic_set(&ipvs->dropentry, 0);
break;
case 1:
if (nomem) {
- atomic_set(&ip_vs_dropentry, 1);
- sysctl_ip_vs_drop_entry = 2;
+ atomic_set(&ipvs->dropentry, 1);
+ ipvs->sysctl_drop_entry = 2;
} else {
- atomic_set(&ip_vs_dropentry, 0);
+ atomic_set(&ipvs->dropentry, 0);
}
break;
case 2:
if (nomem) {
- atomic_set(&ip_vs_dropentry, 1);
+ atomic_set(&ipvs->dropentry, 1);
} else {
- atomic_set(&ip_vs_dropentry, 0);
- sysctl_ip_vs_drop_entry = 1;
+ atomic_set(&ipvs->dropentry, 0);
+ ipvs->sysctl_drop_entry = 1;
};
break;
case 3:
- atomic_set(&ip_vs_dropentry, 1);
+ atomic_set(&ipvs->dropentry, 1);
break;
}
- spin_unlock(&__ip_vs_dropentry_lock);
+ spin_unlock(&ipvs->dropentry_lock);
/* drop_packet */
- spin_lock(&__ip_vs_droppacket_lock);
- switch (sysctl_ip_vs_drop_packet) {
+ spin_lock(&ipvs->droppacket_lock);
+ switch (ipvs->sysctl_drop_packet) {
case 0:
- ip_vs_drop_rate = 0;
+ ipvs->drop_rate = 0;
break;
case 1:
if (nomem) {
- ip_vs_drop_rate = ip_vs_drop_counter
- = sysctl_ip_vs_amemthresh /
- (sysctl_ip_vs_amemthresh-availmem);
- sysctl_ip_vs_drop_packet = 2;
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
+ ipvs->sysctl_drop_packet = 2;
} else {
- ip_vs_drop_rate = 0;
+ ipvs->drop_rate = 0;
}
break;
case 2:
if (nomem) {
- ip_vs_drop_rate = ip_vs_drop_counter
- = sysctl_ip_vs_amemthresh /
- (sysctl_ip_vs_amemthresh-availmem);
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
} else {
- ip_vs_drop_rate = 0;
- sysctl_ip_vs_drop_packet = 1;
+ ipvs->drop_rate = 0;
+ ipvs->sysctl_drop_packet = 1;
}
break;
case 3:
- ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
+ ipvs->drop_rate = ipvs->sysctl_am_droprate;
break;
}
- spin_unlock(&__ip_vs_droppacket_lock);
+ spin_unlock(&ipvs->droppacket_lock);
/* secure_tcp */
- spin_lock(&ip_vs_securetcp_lock);
- switch (sysctl_ip_vs_secure_tcp) {
+ spin_lock(&ipvs->securetcp_lock);
+ switch (ipvs->sysctl_secure_tcp) {
case 0:
if (old_secure_tcp >= 2)
to_change = 0;
@@ -216,7 +181,7 @@ static void update_defense_level(void)
if (nomem) {
if (old_secure_tcp < 2)
to_change = 1;
- sysctl_ip_vs_secure_tcp = 2;
+ ipvs->sysctl_secure_tcp = 2;
} else {
if (old_secure_tcp >= 2)
to_change = 0;
@@ -229,7 +194,7 @@ static void update_defense_level(void)
} else {
if (old_secure_tcp >= 2)
to_change = 0;
- sysctl_ip_vs_secure_tcp = 1;
+ ipvs->sysctl_secure_tcp = 1;
}
break;
case 3:
@@ -237,10 +202,11 @@ static void update_defense_level(void)
to_change = 1;
break;
}
- old_secure_tcp = sysctl_ip_vs_secure_tcp;
+ old_secure_tcp = ipvs->sysctl_secure_tcp;
if (to_change >= 0)
- ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
- spin_unlock(&ip_vs_securetcp_lock);
+ ip_vs_protocol_timeout_change(ipvs,
+ ipvs->sysctl_secure_tcp > 1);
+ spin_unlock(&ipvs->securetcp_lock);
local_bh_enable();
}
@@ -250,16 +216,16 @@ static void update_defense_level(void)
* Timer for checking the defense
*/
#define DEFENSE_TIMER_PERIOD 1*HZ
-static void defense_work_handler(struct work_struct *work);
-static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
static void defense_work_handler(struct work_struct *work)
{
- update_defense_level();
- if (atomic_read(&ip_vs_dropentry))
- ip_vs_random_dropentry();
+ struct netns_ipvs *ipvs =
+ container_of(work, struct netns_ipvs, defense_work.work);
- schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+ update_defense_level(ipvs);
+ if (atomic_read(&ipvs->dropentry))
+ ip_vs_random_dropentry(ipvs->net);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
}
int
@@ -287,33 +253,13 @@ static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
-/*
- * Hash table: for real service lookups
- */
-#define IP_VS_RTAB_BITS 4
-#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
-#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
-
-static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
-
-/*
- * Trash for destinations
- */
-static LIST_HEAD(ip_vs_dest_trash);
-
-/*
- * FTP & NULL virtual service counters
- */
-static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
-static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
-
/*
* Returns hash value for virtual service
*/
-static __inline__ unsigned
-ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
- __be16 port)
+static inline unsigned
+ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
+ const union nf_inet_addr *addr, __be16 port)
{
register unsigned porth = ntohs(port);
__be32 addr_fold = addr->ip;
@@ -323,6 +269,7 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
+ addr_fold ^= ((size_t)net>>8);
return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
& IP_VS_SVC_TAB_MASK;
@@ -331,13 +278,13 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
/*
* Returns hash value of fwmark for virtual service lookup
*/
-static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
+static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
{
- return fwmark & IP_VS_SVC_TAB_MASK;
+ return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
}
/*
- * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
+ * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
* or in the ip_vs_svc_fwm_table by fwmark.
* Should be called with locked tables.
*/
@@ -353,16 +300,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
if (svc->fwmark == 0) {
/*
- * Hash it by <protocol,addr,port> in ip_vs_svc_table
+ * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
*/
- hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
- svc->port);
+ hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
+ &svc->addr, svc->port);
list_add(&svc->s_list, &ip_vs_svc_table[hash]);
} else {
/*
- * Hash it by fwmark in ip_vs_svc_fwm_table
+ * Hash it by fwmark in svc_fwm_table
*/
- hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
+ hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
}
@@ -374,7 +321,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
/*
- * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
+ * Unhashes a service from svc_table / svc_fwm_table.
* Should be called with locked tables.
*/
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
@@ -386,10 +333,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
}
if (svc->fwmark == 0) {
- /* Remove it from the ip_vs_svc_table table */
+ /* Remove it from the svc_table table */
list_del(&svc->s_list);
} else {
- /* Remove it from the ip_vs_svc_fwm_table table */
+ /* Remove it from the svc_fwm_table table */
list_del(&svc->f_list);
}
@@ -400,23 +347,24 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
/*
- * Get service by {proto,addr,port} in the service table.
+ * Get service by {netns, proto,addr,port} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
- __be16 vport)
+__ip_vs_service_find(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
{
unsigned hash;
struct ip_vs_service *svc;
/* Check for "full" addressed entries */
- hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
+ hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
if ((svc->af == af)
&& ip_vs_addr_equal(af, &svc->addr, vaddr)
&& (svc->port == vport)
- && (svc->protocol == protocol)) {
+ && (svc->protocol == protocol)
+ && net_eq(svc->net, net)) {
/* HIT */
return svc;
}
@@ -430,16 +378,17 @@ __ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
* Get service by {fwmark} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_svc_fwm_find(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
{
unsigned hash;
struct ip_vs_service *svc;
/* Check for fwmark addressed entries */
- hash = ip_vs_svc_fwm_hashkey(fwmark);
+ hash = ip_vs_svc_fwm_hashkey(net, fwmark);
list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
- if (svc->fwmark == fwmark && svc->af == af) {
+ if (svc->fwmark == fwmark && svc->af == af
+ && net_eq(svc->net, net)) {
/* HIT */
return svc;
}
@@ -449,42 +398,44 @@ __ip_vs_svc_fwm_find(int af, __u32 fwmark)
}
struct ip_vs_service *
-ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
+ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
const union nf_inet_addr *vaddr, __be16 vport)
{
struct ip_vs_service *svc;
+ struct netns_ipvs *ipvs = net_ipvs(net);
read_lock(&__ip_vs_svc_lock);
/*
* Check the table hashed by fwmark first
*/
- if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
+ svc = __ip_vs_svc_fwm_find(net, af, fwmark);
+ if (fwmark && svc)
goto out;
/*
* Check the table hashed by <protocol,addr,port>
* for "full" addressed entries
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, vport);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
if (svc == NULL
&& protocol == IPPROTO_TCP
- && atomic_read(&ip_vs_ftpsvc_counter)
+ && atomic_read(&ipvs->ftpsvc_counter)
&& (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
/*
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
}
if (svc == NULL
- && atomic_read(&ip_vs_nullsvc_counter)) {
+ && atomic_read(&ipvs->nullsvc_counter)) {
/*
* Check if the catch-all port (port zero) exists
*/
- svc = __ip_vs_service_find(af, protocol, vaddr, 0);
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
}
out:
@@ -519,6 +470,7 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port), atomic_read(&svc->usecnt));
+ free_percpu(svc->stats.cpustats);
kfree(svc);
}
}
@@ -545,10 +497,10 @@ static inline unsigned ip_vs_rs_hashkey(int af,
}
/*
- * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
+ * Hashes ip_vs_dest in rs_table by <proto,addr,port>.
* should be called with locked tables.
*/
-static int ip_vs_rs_hash(struct ip_vs_dest *dest)
+static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
unsigned hash;
@@ -562,19 +514,19 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest)
*/
hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
- list_add(&dest->d_list, &ip_vs_rtable[hash]);
+ list_add(&dest->d_list, &ipvs->rs_table[hash]);
return 1;
}
/*
- * UNhashes ip_vs_dest from ip_vs_rtable.
+ * UNhashes ip_vs_dest from rs_table.
* should be called with locked tables.
*/
static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
/*
- * Remove it from the ip_vs_rtable table.
+ * Remove it from the rs_table table.
*/
if (!list_empty(&dest->d_list)) {
list_del(&dest->d_list);
@@ -588,10 +540,11 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
* Lookup real service by <proto,addr,port> in the real service table.
*/
struct ip_vs_dest *
-ip_vs_lookup_real_service(int af, __u16 protocol,
+ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
const union nf_inet_addr *daddr,
__be16 dport)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
unsigned hash;
struct ip_vs_dest *dest;
@@ -601,19 +554,19 @@ ip_vs_lookup_real_service(int af, __u16 protocol,
*/
hash = ip_vs_rs_hashkey(af, daddr, dport);
- read_lock(&__ip_vs_rs_lock);
- list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
+ read_lock(&ipvs->rs_lock);
+ list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
if ((dest->af == af)
&& ip_vs_addr_equal(af, &dest->addr, daddr)
&& (dest->port == dport)
&& ((dest->protocol == protocol) ||
dest->vfwmark)) {
/* HIT */
- read_unlock(&__ip_vs_rs_lock);
+ read_unlock(&ipvs->rs_lock);
return dest;
}
}
- read_unlock(&__ip_vs_rs_lock);
+ read_unlock(&ipvs->rs_lock);
return NULL;
}
@@ -652,15 +605,16 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
* ip_vs_lookup_real_service() looked promising, but
* seems not working as expected.
*/
-struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
+struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
+ const union nf_inet_addr *daddr,
__be16 dport,
const union nf_inet_addr *vaddr,
- __be16 vport, __u16 protocol)
+ __be16 vport, __u16 protocol, __u32 fwmark)
{
struct ip_vs_dest *dest;
struct ip_vs_service *svc;
- svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
+ svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
if (!svc)
return NULL;
dest = ip_vs_lookup_dest(svc, daddr, dport);
@@ -685,11 +639,12 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
__be16 dport)
{
struct ip_vs_dest *dest, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
/*
* Find the destination in trash
*/
- list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
"dest->refcnt=%d\n",
dest->vfwmark,
@@ -720,6 +675,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
list_del(&dest->n_list);
ip_vs_dst_reset(dest);
__ip_vs_unbind_svc(dest);
+ free_percpu(dest->stats.cpustats);
kfree(dest);
}
}
@@ -737,14 +693,16 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
* are expired, and the refcnt of each destination in the trash must
* be 1, so we simply release them here.
*/
-static void ip_vs_trash_cleanup(void)
+static void ip_vs_trash_cleanup(struct net *net)
{
struct ip_vs_dest *dest, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
list_del(&dest->n_list);
ip_vs_dst_reset(dest);
__ip_vs_unbind_svc(dest);
+ free_percpu(dest->stats.cpustats);
kfree(dest);
}
}
@@ -768,6 +726,7 @@ static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
int conn_flags;
/* set the weight and the flags */
@@ -780,12 +739,12 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
- * Put the real service in ip_vs_rtable if not present.
+ * Put the real service in rs_table if not present.
* For now only for NAT!
*/
- write_lock_bh(&__ip_vs_rs_lock);
- ip_vs_rs_hash(dest);
- write_unlock_bh(&__ip_vs_rs_lock);
+ write_lock_bh(&ipvs->rs_lock);
+ ip_vs_rs_hash(ipvs, dest);
+ write_unlock_bh(&ipvs->rs_lock);
}
atomic_set(&dest->conn_flags, conn_flags);
@@ -808,12 +767,12 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
dest->u_threshold = udest->u_threshold;
dest->l_threshold = udest->l_threshold;
- spin_lock(&dest->dst_lock);
+ spin_lock_bh(&dest->dst_lock);
ip_vs_dst_reset(dest);
- spin_unlock(&dest->dst_lock);
+ spin_unlock_bh(&dest->dst_lock);
if (add)
- ip_vs_new_estimator(&dest->stats);
+ ip_vs_new_estimator(svc->net, &dest->stats);
write_lock_bh(&__ip_vs_svc_lock);
@@ -850,12 +809,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atype = ipv6_addr_type(&udest->addr.in6);
if ((!(atype & IPV6_ADDR_UNICAST) ||
atype & IPV6_ADDR_LINKLOCAL) &&
- !__ip_vs_addr_is_local_v6(&udest->addr.in6))
+ !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
return -EINVAL;
} else
#endif
{
- atype = inet_addr_type(&init_net, udest->addr.ip);
+ atype = inet_addr_type(svc->net, udest->addr.ip);
if (atype != RTN_LOCAL && atype != RTN_UNICAST)
return -EINVAL;
}
@@ -865,6 +824,11 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
pr_err("%s(): no memory.\n", __func__);
return -ENOMEM;
}
+ dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!dest->stats.cpustats) {
+ pr_err("%s() alloc_percpu failed\n", __func__);
+ goto err_alloc;
+ }
dest->af = svc->af;
dest->protocol = svc->protocol;
@@ -888,6 +852,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
LeaveFunction(2);
return 0;
+
+err_alloc:
+ kfree(dest);
+ return -ENOMEM;
}
@@ -1006,16 +974,18 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Delete a destination (must be already unlinked from the service)
*/
-static void __ip_vs_del_dest(struct ip_vs_dest *dest)
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
{
- ip_vs_kill_estimator(&dest->stats);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_kill_estimator(net, &dest->stats);
/*
* Remove it from the d-linked list with the real services.
*/
- write_lock_bh(&__ip_vs_rs_lock);
+ write_lock_bh(&ipvs->rs_lock);
ip_vs_rs_unhash(dest);
- write_unlock_bh(&__ip_vs_rs_lock);
+ write_unlock_bh(&ipvs->rs_lock);
/*
* Decrease the refcnt of the dest, and free the dest
@@ -1034,6 +1004,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
and only one user context can update virtual service at a
time, so the operation here is OK */
atomic_dec(&dest->svc->refcnt);
+ free_percpu(dest->stats.cpustats);
kfree(dest);
} else {
IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
@@ -1041,7 +1012,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
- list_add(&dest->n_list, &ip_vs_dest_trash);
+ list_add(&dest->n_list, &ipvs->dest_trash);
atomic_inc(&dest->refcnt);
}
}
@@ -1105,7 +1076,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Delete the destination
*/
- __ip_vs_del_dest(dest);
+ __ip_vs_del_dest(svc->net, dest);
LeaveFunction(2);
@@ -1117,13 +1088,14 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
* Add a service into the service hash table
*/
static int
-ip_vs_add_service(struct ip_vs_service_user_kern *u,
+ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
int ret = 0;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
+ struct netns_ipvs *ipvs = net_ipvs(net);
/* increase the module use count */
ip_vs_use_count_inc();
@@ -1137,7 +1109,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
if (u->pe_name && *u->pe_name) {
- pe = ip_vs_pe_get(u->pe_name);
+ pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
@@ -1159,6 +1131,11 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
ret = -ENOMEM;
goto out_err;
}
+ svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!svc->stats.cpustats) {
+ pr_err("%s() alloc_percpu failed\n", __func__);
+ goto out_err;
+ }
/* I'm the first user of the service */
atomic_set(&svc->usecnt, 0);
@@ -1172,6 +1149,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
+ svc->net = net;
INIT_LIST_HEAD(&svc->destinations);
rwlock_init(&svc->sched_lock);
@@ -1189,15 +1167,15 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
/* Update the virtual service counters */
if (svc->port == FTPPORT)
- atomic_inc(&ip_vs_ftpsvc_counter);
+ atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
- atomic_inc(&ip_vs_nullsvc_counter);
+ atomic_inc(&ipvs->nullsvc_counter);
- ip_vs_new_estimator(&svc->stats);
+ ip_vs_new_estimator(net, &svc->stats);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
- ip_vs_num_services++;
+ ipvs->num_services++;
/* Hash the service into the service table */
write_lock_bh(&__ip_vs_svc_lock);
@@ -1207,6 +1185,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
*svc_p = svc;
return 0;
+
out_err:
if (svc != NULL) {
ip_vs_unbind_scheduler(svc);
@@ -1215,6 +1194,8 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
ip_vs_app_inc_put(svc->inc);
local_bh_enable();
}
+ if (svc->stats.cpustats)
+ free_percpu(svc->stats.cpustats);
kfree(svc);
}
ip_vs_scheduler_put(sched);
@@ -1248,7 +1229,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
old_sched = sched;
if (u->pe_name && *u->pe_name) {
- pe = ip_vs_pe_get(u->pe_name);
+ pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
@@ -1334,14 +1315,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
struct ip_vs_pe *old_pe;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
pr_info("%s: enter\n", __func__);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
- ip_vs_num_services--;
+ ipvs->num_services--;
- ip_vs_kill_estimator(&svc->stats);
+ ip_vs_kill_estimator(svc->net, &svc->stats);
/* Unbind scheduler */
old_sched = svc->scheduler;
@@ -1364,16 +1346,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
*/
list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
__ip_vs_unlink_dest(svc, dest, 0);
- __ip_vs_del_dest(dest);
+ __ip_vs_del_dest(svc->net, dest);
}
/*
* Update the virtual service counters
*/
if (svc->port == FTPPORT)
- atomic_dec(&ip_vs_ftpsvc_counter);
+ atomic_dec(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
- atomic_dec(&ip_vs_nullsvc_counter);
+ atomic_dec(&ipvs->nullsvc_counter);
/*
* Free the service if nobody refers to it
@@ -1383,6 +1365,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port), atomic_read(&svc->usecnt));
+ free_percpu(svc->stats.cpustats);
kfree(svc);
}
@@ -1428,17 +1411,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Flush all the virtual services
*/
-static int ip_vs_flush(void)
+static int ip_vs_flush(struct net *net)
{
int idx;
struct ip_vs_service *svc, *nxt;
/*
- * Flush the service table hashed by <protocol,addr,port>
+ * Flush the service table hashed by <netns,protocol,addr,port>
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
- ip_vs_unlink_service(svc);
+ list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
+ s_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc);
}
}
@@ -1448,7 +1433,8 @@ static int ip_vs_flush(void)
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry_safe(svc, nxt,
&ip_vs_svc_fwm_table[idx], f_list) {
- ip_vs_unlink_service(svc);
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc);
}
}
@@ -1472,24 +1458,26 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
return 0;
}
-static int ip_vs_zero_all(void)
+static int ip_vs_zero_all(struct net *net)
{
int idx;
struct ip_vs_service *svc;
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- ip_vs_zero_service(svc);
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
}
}
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- ip_vs_zero_service(svc);
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
}
}
- ip_vs_zero_stats(&ip_vs_stats);
+ ip_vs_zero_stats(net_ipvs(net)->tot_stats);
return 0;
}
@@ -1498,6 +1486,7 @@ static int
proc_do_defense_mode(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
+ struct net *net = current->nsproxy->net_ns;
int *valp = table->data;
int val = *valp;
int rc;
@@ -1508,7 +1497,7 @@ proc_do_defense_mode(ctl_table *table, int write,
/* Restore the correct value */
*valp = val;
} else {
- update_defense_level();
+ update_defense_level(net_ipvs(net));
}
}
return rc;
@@ -1534,45 +1523,54 @@ proc_do_sync_threshold(ctl_table *table, int write,
return rc;
}
+static int /* sysctl handler wired to the "sync_version" entry of vs_vars[] */
+proc_do_sync_mode(ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp; /* value before proc_dointvec() runs, kept for rollback */
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) { /* act only on a write that changed the value */
+ if ((*valp < 0) || (*valp > 1)) {
+ /* Restore the correct value: only 0 and 1 are accepted */
+ *valp = val;
+ } else {
+ struct net *net = current->nsproxy->net_ns; /* netns of the writing task */
+ ip_vs_sync_switch_mode(net, val); /* NOTE(review): passes the pre-write value, not *valp - confirm this is what ip_vs_sync_switch_mode() expects */
+ }
+ }
+ return rc; /* result of proc_dointvec(), also when the value was rolled back */
+}
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ * Do not change order or insert new entries without
+ * aligning them with the netns init in __ip_vs_control_init()
*/
static struct ctl_table vs_vars[] = {
{
.procname = "amemthresh",
- .data = &sysctl_ip_vs_amemthresh,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#ifdef CONFIG_IP_VS_DEBUG
- {
- .procname = "debug_level",
- .data = &sysctl_ip_vs_debug_level,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
{
.procname = "am_droprate",
- .data = &sysctl_ip_vs_am_droprate,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "drop_entry",
- .data = &sysctl_ip_vs_drop_entry,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "drop_packet",
- .data = &sysctl_ip_vs_drop_packet,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
@@ -1580,7 +1578,6 @@ static struct ctl_table vs_vars[] = {
#ifdef CONFIG_IP_VS_NFCT
{
.procname = "conntrack",
- .data = &sysctl_ip_vs_conntrack,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
@@ -1588,18 +1585,62 @@ static struct ctl_table vs_vars[] = {
#endif
{
.procname = "secure_tcp",
- .data = &sysctl_ip_vs_secure_tcp,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "snat_reroute",
- .data = &sysctl_ip_vs_snat_reroute,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .procname = "sync_version",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_mode,
+ },
+ {
+ .procname = "cache_bypass",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_nodest_conn",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_quiescent_template",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_threshold",
+ .maxlen =
+ sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
+ .mode = 0644,
+ .proc_handler = proc_do_sync_threshold,
+ },
+ {
+ .procname = "nat_icmp_send",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_IP_VS_DEBUG
+ {
+ .procname = "debug_level",
+ .data = &sysctl_ip_vs_debug_level,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
#if 0
{
.procname = "timeout_established",
@@ -1686,41 +1727,6 @@ static struct ctl_table vs_vars[] = {
.proc_handler = proc_dointvec_jiffies,
},
#endif
- {
- .procname = "cache_bypass",
- .data = &sysctl_ip_vs_cache_bypass,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "expire_nodest_conn",
- .data = &sysctl_ip_vs_expire_nodest_conn,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "expire_quiescent_template",
- .data = &sysctl_ip_vs_expire_quiescent_template,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sync_threshold",
- .data = &sysctl_ip_vs_sync_threshold,
- .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
- .mode = 0644,
- .proc_handler = proc_do_sync_threshold,
- },
- {
- .procname = "nat_icmp_send",
- .data = &sysctl_ip_vs_nat_icmp_send,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
{ }
};
@@ -1732,11 +1738,10 @@ const struct ctl_path net_vs_ctl_path[] = {
};
EXPORT_SYMBOL_GPL(net_vs_ctl_path);
-static struct ctl_table_header * sysctl_header;
-
#ifdef CONFIG_PROC_FS
struct ip_vs_iter {
+ struct seq_net_private p; /* Do not move this, netns depends upon it*/
struct list_head *table;
int bucket;
};
@@ -1763,6 +1768,7 @@ static inline const char *ip_vs_fwd_name(unsigned flags)
/* Get the Nth entry in the two lists */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
+ struct net *net = seq_file_net(seq);
struct ip_vs_iter *iter = seq->private;
int idx;
struct ip_vs_service *svc;
@@ -1770,7 +1776,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* look in hash by protocol */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
- if (pos-- == 0){
+ if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_table;
iter->bucket = idx;
return svc;
@@ -1781,7 +1787,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* keep looking in fwmark */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
- if (pos-- == 0) {
+ if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_fwm_table;
iter->bucket = idx;
return svc;
@@ -1935,7 +1941,7 @@ static const struct seq_operations ip_vs_info_seq_ops = {
static int ip_vs_info_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &ip_vs_info_seq_ops,
+ return seq_open_net(inode, file, &ip_vs_info_seq_ops,
sizeof(struct ip_vs_iter));
}
@@ -1949,13 +1955,11 @@ static const struct file_operations ip_vs_info_fops = {
#endif
-struct ip_vs_stats ip_vs_stats = {
- .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
-};
-
#ifdef CONFIG_PROC_FS
static int ip_vs_stats_show(struct seq_file *seq, void *v)
{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
@@ -1963,29 +1967,29 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
seq_printf(seq,
" Conns Packets Packets Bytes Bytes\n");
- spin_lock_bh(&ip_vs_stats.lock);
- seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns,
- ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts,
- (unsigned long long) ip_vs_stats.ustats.inbytes,
- (unsigned long long) ip_vs_stats.ustats.outbytes);
+ spin_lock_bh(&tot_stats->lock);
+ seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", tot_stats->ustats.conns,
+ tot_stats->ustats.inpkts, tot_stats->ustats.outpkts,
+ (unsigned long long) tot_stats->ustats.inbytes,
+ (unsigned long long) tot_stats->ustats.outbytes);
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
" Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
seq_printf(seq,"%8X %8X %8X %16X %16X\n",
- ip_vs_stats.ustats.cps,
- ip_vs_stats.ustats.inpps,
- ip_vs_stats.ustats.outpps,
- ip_vs_stats.ustats.inbps,
- ip_vs_stats.ustats.outbps);
- spin_unlock_bh(&ip_vs_stats.lock);
+ tot_stats->ustats.cps,
+ tot_stats->ustats.inpps,
+ tot_stats->ustats.outpps,
+ tot_stats->ustats.inbps,
+ tot_stats->ustats.outbps);
+ spin_unlock_bh(&tot_stats->lock);
return 0;
}
static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
{
- return single_open(file, ip_vs_stats_show, NULL);
+ return single_open_net(inode, file, ip_vs_stats_show);
}
static const struct file_operations ip_vs_stats_fops = {
@@ -1996,13 +2000,70 @@ static const struct file_operations ip_vs_stats_fops = {
.release = single_release,
};
+static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
+ int i;
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Total Incoming Outgoing Incoming Outgoing\n");
+ seq_printf(seq,
+ "CPU Conns Packets Packets Bytes Bytes\n");
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *u = per_cpu_ptr(net->ipvs->cpustats, i);
+ seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
+ i, u->ustats.conns, u->ustats.inpkts,
+ u->ustats.outpkts, (__u64)u->ustats.inbytes,
+ (__u64)u->ustats.outbytes);
+ }
+
+ spin_lock_bh(&tot_stats->lock);
+ seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n",
+ tot_stats->ustats.conns, tot_stats->ustats.inpkts,
+ tot_stats->ustats.outpkts,
+ (unsigned long long) tot_stats->ustats.inbytes,
+ (unsigned long long) tot_stats->ustats.outbytes);
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
+ seq_printf(seq, " %8X %8X %8X %16X %16X\n",
+ tot_stats->ustats.cps,
+ tot_stats->ustats.inpps,
+ tot_stats->ustats.outpps,
+ tot_stats->ustats.inbps,
+ tot_stats->ustats.outbps);
+ spin_unlock_bh(&tot_stats->lock);
+
+ return 0;
+}
+
+static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, ip_vs_stats_percpu_show);
+}
+
+static const struct file_operations ip_vs_stats_percpu_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_stats_percpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
/*
* Set timeout values for tcp tcpfin udp in the timeout_table.
*/
-static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
+static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
u->tcp_timeout,
u->tcp_fin_timeout,
@@ -2010,19 +2071,22 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
#ifdef CONFIG_IP_VS_PROTO_TCP
if (u->tcp_timeout) {
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
= u->tcp_timeout * HZ;
}
if (u->tcp_fin_timeout) {
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
= u->tcp_fin_timeout * HZ;
}
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
if (u->udp_timeout) {
- ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ pd->timeout_table[IP_VS_UDP_S_NORMAL]
= u->udp_timeout * HZ;
}
#endif
@@ -2087,6 +2151,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
+ struct net *net = sock_net(sk);
int ret;
unsigned char arg[MAX_ARG_LEN];
struct ip_vs_service_user *usvc_compat;
@@ -2121,19 +2186,20 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_FLUSH) {
/* Flush the virtual service */
- ret = ip_vs_flush();
+ ret = ip_vs_flush(net);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
/* Set timeout values for (tcp tcpfin udp) */
- ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
+ ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
+ ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
+ dm->syncid);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = stop_sync_thread(dm->state);
+ ret = stop_sync_thread(net, dm->state);
goto out_unlock;
}
@@ -2148,7 +2214,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_ZERO) {
/* if no service address is set, zero counters in all */
if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
- ret = ip_vs_zero_all();
+ ret = ip_vs_zero_all(net);
goto out_unlock;
}
}
@@ -2165,10 +2231,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* Lookup the exact service by <protocol, addr, port> or fwmark */
if (usvc.fwmark == 0)
- svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+ svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
&usvc.addr, usvc.port);
else
- svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
+ svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2181,7 +2247,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (svc != NULL)
ret = -EEXIST;
else
- ret = ip_vs_add_service(&usvc, &svc);
+ ret = ip_vs_add_service(net, &usvc, &svc);
break;
case IP_VS_SO_SET_EDIT:
ret = ip_vs_edit_service(svc, &usvc);
@@ -2241,7 +2307,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
}
static inline int
-__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
+__ip_vs_get_service_entries(struct net *net,
+ const struct ip_vs_get_services *get,
struct ip_vs_get_services __user *uptr)
{
int idx, count=0;
@@ -2252,7 +2319,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET)
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
if (count >= get->num_services)
@@ -2271,7 +2338,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
/* Only expose IPv4 entries to old interface */
- if (svc->af != AF_INET)
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
if (count >= get->num_services)
@@ -2291,7 +2358,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
}
static inline int
-__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
+__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
struct ip_vs_get_dests __user *uptr)
{
struct ip_vs_service *svc;
@@ -2299,9 +2366,9 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
int ret = 0;
if (get->fwmark)
- svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
else
- svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+ svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
get->port);
if (svc) {
@@ -2336,17 +2403,21 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
}
static inline void
-__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
+__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
#ifdef CONFIG_IP_VS_PROTO_TCP
- u->tcp_timeout =
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
- u->tcp_fin_timeout =
- ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+ u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
u->udp_timeout =
- ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+ pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
}
@@ -2375,7 +2446,10 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
unsigned char arg[128];
int ret = 0;
unsigned int copylen;
+ struct net *net = sock_net(sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ BUG_ON(!net);
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -2418,7 +2492,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
struct ip_vs_getinfo info;
info.version = IP_VS_VERSION_CODE;
info.size = ip_vs_conn_tab_size;
- info.num_services = ip_vs_num_services;
+ info.num_services = ipvs->num_services;
if (copy_to_user(user, &info, sizeof(info)) != 0)
ret = -EFAULT;
}
@@ -2437,7 +2511,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_service_entries(get, user);
+ ret = __ip_vs_get_service_entries(net, get, user);
}
break;
@@ -2450,10 +2524,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
entry = (struct ip_vs_service_entry *)arg;
addr.ip = entry->addr;
if (entry->fwmark)
- svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
else
- svc = __ip_vs_service_find(AF_INET, entry->protocol,
- &addr, entry->port);
+ svc = __ip_vs_service_find(net, AF_INET,
+ entry->protocol, &addr,
+ entry->port);
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
@@ -2476,7 +2551,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EINVAL;
goto out;
}
- ret = __ip_vs_get_dest_entries(get, user);
+ ret = __ip_vs_get_dest_entries(net, get, user);
}
break;
@@ -2484,7 +2559,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
if (copy_to_user(user, &t, sizeof(t)) != 0)
ret = -EFAULT;
}
@@ -2495,15 +2570,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
struct ip_vs_daemon_user d[2];
memset(&d, 0, sizeof(d));
- if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
+ if (ipvs->sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
- d[0].syncid = ip_vs_master_syncid;
+ strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+ sizeof(d[0].mcast_ifn));
+ d[0].syncid = ipvs->master_syncid;
}
- if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
+ if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
- d[1].syncid = ip_vs_backup_syncid;
+ strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+ sizeof(d[1].mcast_ifn));
+ d[1].syncid = ipvs->backup_syncid;
}
if (copy_to_user(user, &d, sizeof(d)) != 0)
ret = -EFAULT;
@@ -2542,6 +2619,7 @@ static struct genl_family ip_vs_genl_family = {
.name = IPVS_GENL_NAME,
.version = IPVS_GENL_VERSION,
.maxattr = IPVS_CMD_MAX,
+ .netnsok = true, /* Make ipvsadm to work on netns */
};
/* Policy used for first-level command attributes */
@@ -2696,11 +2774,12 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
int idx = 0, i;
int start = cb->args[0];
struct ip_vs_service *svc;
+ struct net *net = skb_sknet(skb);
mutex_lock(&__ip_vs_mutex);
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
- if (++idx <= start)
+ if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2711,7 +2790,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
- if (++idx <= start)
+ if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
idx--;
@@ -2727,7 +2806,8 @@ nla_put_failure:
return skb->len;
}
-static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
+static int ip_vs_genl_parse_service(struct net *net,
+ struct ip_vs_service_user_kern *usvc,
struct nlattr *nla, int full_entry,
struct ip_vs_service **ret_svc)
{
@@ -2770,9 +2850,9 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
}
if (usvc->fwmark)
- svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+ svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
else
- svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+ svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
&usvc->addr, usvc->port);
*ret_svc = svc;
@@ -2809,13 +2889,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
return 0;
}
-static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
+static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
+ struct nlattr *nla)
{
struct ip_vs_service_user_kern usvc;
struct ip_vs_service *svc;
int ret;
- ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+ ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
return ret ? ERR_PTR(ret) : svc;
}
@@ -2883,6 +2964,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+ struct net *net = skb_sknet(skb);
mutex_lock(&__ip_vs_mutex);
@@ -2891,7 +2973,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
goto out_err;
- svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
+
+ svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc) || svc == NULL)
goto out_err;
@@ -3005,20 +3088,23 @@ nla_put_failure:
static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
struct netlink_callback *cb)
{
+ struct net *net = skb_net(skb);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
mutex_lock(&__ip_vs_mutex);
- if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+ if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
- ip_vs_master_mcast_ifn,
- ip_vs_master_syncid, cb) < 0)
+ ipvs->master_mcast_ifn,
+ ipvs->master_syncid, cb) < 0)
goto nla_put_failure;
cb->args[0] = 1;
}
- if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+ if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
- ip_vs_backup_mcast_ifn,
- ip_vs_backup_syncid, cb) < 0)
+ ipvs->backup_mcast_ifn,
+ ipvs->backup_syncid, cb) < 0)
goto nla_put_failure;
cb->args[1] = 1;
@@ -3030,31 +3116,33 @@ nla_put_failure:
return skb->len;
}
-static int ip_vs_genl_new_daemon(struct nlattr **attrs)
+static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
{
if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
- return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
+ return start_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
}
-static int ip_vs_genl_del_daemon(struct nlattr **attrs)
+static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
{
if (!attrs[IPVS_DAEMON_ATTR_STATE])
return -EINVAL;
- return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+ return stop_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
}
-static int ip_vs_genl_set_config(struct nlattr **attrs)
+static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
@@ -3066,7 +3154,7 @@ static int ip_vs_genl_set_config(struct nlattr **attrs)
if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
- return ip_vs_set_timeout(&t);
+ return ip_vs_set_timeout(net, &t);
}
static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
@@ -3076,16 +3164,20 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
struct ip_vs_dest_user_kern udest;
int ret = 0, cmd;
int need_full_svc = 0, need_full_dest = 0;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ net = skb_sknet(skb);
+ ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
mutex_lock(&__ip_vs_mutex);
if (cmd == IPVS_CMD_FLUSH) {
- ret = ip_vs_flush();
+ ret = ip_vs_flush(net);
goto out;
} else if (cmd == IPVS_CMD_SET_CONFIG) {
- ret = ip_vs_genl_set_config(info->attrs);
+ ret = ip_vs_genl_set_config(net, info->attrs);
goto out;
} else if (cmd == IPVS_CMD_NEW_DAEMON ||
cmd == IPVS_CMD_DEL_DAEMON) {
@@ -3101,13 +3193,13 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
}
if (cmd == IPVS_CMD_NEW_DAEMON)
- ret = ip_vs_genl_new_daemon(daemon_attrs);
+ ret = ip_vs_genl_new_daemon(net, daemon_attrs);
else
- ret = ip_vs_genl_del_daemon(daemon_attrs);
+ ret = ip_vs_genl_del_daemon(net, daemon_attrs);
goto out;
} else if (cmd == IPVS_CMD_ZERO &&
!info->attrs[IPVS_CMD_ATTR_SERVICE]) {
- ret = ip_vs_zero_all();
+ ret = ip_vs_zero_all(net);
goto out;
}
@@ -3117,7 +3209,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
need_full_svc = 1;
- ret = ip_vs_genl_parse_service(&usvc,
+ ret = ip_vs_genl_parse_service(net, &usvc,
info->attrs[IPVS_CMD_ATTR_SERVICE],
need_full_svc, &svc);
if (ret)
@@ -3147,7 +3239,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
switch (cmd) {
case IPVS_CMD_NEW_SERVICE:
if (svc == NULL)
- ret = ip_vs_add_service(&usvc, &svc);
+ ret = ip_vs_add_service(net, &usvc, &svc);
else
ret = -EEXIST;
break;
@@ -3185,7 +3277,11 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *msg;
void *reply;
int ret, cmd, reply_cmd;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ net = skb_sknet(skb);
+ ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_GET_SERVICE)
@@ -3214,7 +3310,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_service *svc;
- svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
+ svc = ip_vs_genl_find_service(net,
+ info->attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR(svc)) {
ret = PTR_ERR(svc);
goto out_err;
@@ -3234,7 +3331,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
struct ip_vs_timeout_user t;
- __ip_vs_get_timeouts(&t);
+ __ip_vs_get_timeouts(net, &t);
#ifdef CONFIG_IP_VS_PROTO_TCP
NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
@@ -3380,62 +3477,173 @@ static void ip_vs_genl_unregister(void)
/* End of Generic Netlink interface definitions */
+/*
+ * per netns init/exit func.
+ */
+int __net_init __ip_vs_control_init(struct net *net)
+{
+ int idx;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ctl_table *tbl;
+
+ atomic_set(&ipvs->dropentry, 0);
+ spin_lock_init(&ipvs->dropentry_lock);
+ spin_lock_init(&ipvs->droppacket_lock);
+ spin_lock_init(&ipvs->securetcp_lock);
+ ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
+
+ /* Initialize rs_table */
+ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
+ INIT_LIST_HEAD(&ipvs->rs_table[idx]);
+
+ INIT_LIST_HEAD(&ipvs->dest_trash);
+ atomic_set(&ipvs->ftpsvc_counter, 0);
+ atomic_set(&ipvs->nullsvc_counter, 0);
+
+ /* procfs stats */
+ ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL);
+ if (ipvs->tot_stats == NULL) {
+ pr_err("%s(): no memory.\n", __func__);
+ return -ENOMEM;
+ }
+ ipvs->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!ipvs->cpustats) {
+ pr_err("%s() alloc_percpu failed\n", __func__);
+ goto err_alloc;
+ }
+ spin_lock_init(&ipvs->tot_stats->lock);
+
+ proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
+ proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
+ proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
+ &ip_vs_stats_percpu_fops);
+
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
+ if (tbl == NULL)
+ goto err_dup;
+ } else
+ tbl = vs_vars;
+ /* Initialize sysctl defaults */
+ idx = 0;
+ ipvs->sysctl_amemthresh = 1024;
+ tbl[idx++].data = &ipvs->sysctl_amemthresh;
+ ipvs->sysctl_am_droprate = 10;
+ tbl[idx++].data = &ipvs->sysctl_am_droprate;
+ tbl[idx++].data = &ipvs->sysctl_drop_entry;
+ tbl[idx++].data = &ipvs->sysctl_drop_packet;
+#ifdef CONFIG_IP_VS_NFCT
+ tbl[idx++].data = &ipvs->sysctl_conntrack;
+#endif
+ tbl[idx++].data = &ipvs->sysctl_secure_tcp;
+ ipvs->sysctl_snat_reroute = 1;
+ tbl[idx++].data = &ipvs->sysctl_snat_reroute;
+ ipvs->sysctl_sync_ver = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ver;
+ tbl[idx++].data = &ipvs->sysctl_cache_bypass;
+ tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+ tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
+ ipvs->sysctl_sync_threshold[0] = 3;
+ ipvs->sysctl_sync_threshold[1] = 50;
+ tbl[idx].data = &ipvs->sysctl_sync_threshold;
+ tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+ tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
+
+
+#ifdef CONFIG_SYSCTL
+ ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
+ tbl);
+ if (ipvs->sysctl_hdr == NULL) {
+ if (!net_eq(net, &init_net))
+ kfree(tbl);
+ goto err_dup;
+ }
+#endif
+ ip_vs_new_estimator(net, ipvs->tot_stats);
+ ipvs->sysctl_tbl = tbl;
+ /* Schedule defense work */
+ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+ return 0;
+
+err_dup:
+ free_percpu(ipvs->cpustats);
+err_alloc:
+ kfree(ipvs->tot_stats);
+ return -ENOMEM;
+}
+
+static void __net_exit __ip_vs_control_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_trash_cleanup(net);
+ ip_vs_kill_estimator(net, ipvs->tot_stats);
+ cancel_delayed_work_sync(&ipvs->defense_work);
+ cancel_work_sync(&ipvs->defense_work.work);
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(ipvs->sysctl_hdr);
+#endif
+ proc_net_remove(net, "ip_vs_stats_percpu");
+ proc_net_remove(net, "ip_vs_stats");
+ proc_net_remove(net, "ip_vs");
+ free_percpu(ipvs->cpustats);
+ kfree(ipvs->tot_stats);
+}
+
+static struct pernet_operations ipvs_control_ops = {
+ .init = __ip_vs_control_init,
+ .exit = __ip_vs_control_cleanup,
+};
int __init ip_vs_control_init(void)
{
- int ret;
int idx;
+ int ret;
EnterFunction(2);
- /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+ /* Initialize ip_vs_svc_table and ip_vs_svc_fwm_table */
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
}
- for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+
+ ret = register_pernet_subsys(&ipvs_control_ops);
+ if (ret) {
+ pr_err("cannot register namespace.\n");
+ goto err;
}
- smp_wmb();
+
+ smp_wmb(); /* Do we really need it now ? */
ret = nf_register_sockopt(&ip_vs_sockopts);
if (ret) {
pr_err("cannot register sockopt.\n");
- return ret;
+ goto err_net;
}
ret = ip_vs_genl_register();
if (ret) {
pr_err("cannot register Generic Netlink interface.\n");
nf_unregister_sockopt(&ip_vs_sockopts);
- return ret;
+ goto err_net;
}
- proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
- proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
-
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
-
- ip_vs_new_estimator(&ip_vs_stats);
-
- /* Hook the defense timer */
- schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
-
LeaveFunction(2);
return 0;
+
+err_net:
+ unregister_pernet_subsys(&ipvs_control_ops);
+err:
+ return ret;
}
void ip_vs_control_cleanup(void)
{
EnterFunction(2);
- ip_vs_trash_cleanup();
- cancel_delayed_work_sync(&defense_work);
- cancel_work_sync(&defense_work.work);
- ip_vs_kill_estimator(&ip_vs_stats);
- unregister_sysctl_table(sysctl_header);
- proc_net_remove(&init_net, "ip_vs_stats");
- proc_net_remove(&init_net, "ip_vs");
+ unregister_pernet_subsys(&ipvs_control_ops);
ip_vs_genl_unregister();
nf_unregister_sockopt(&ip_vs_sockopts);
LeaveFunction(2);
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index ff28801962e..f560a05c965 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -8,8 +8,12 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
- *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
+ * Global data moved to netns, i.e. struct netns_ipvs
+ * Affected data: est_list and est_lock.
+ * estimation_timer() runs with timer per netns.
+ * get_stats() does the per cpu summing.
*/
#define KMSG_COMPONENT "IPVS"
@@ -48,11 +52,42 @@
*/
-static void estimation_timer(unsigned long arg);
+/*
+ * Make a summary from each cpu.
+ *
+ * Overwrites the conns, inpkts, outpkts, inbytes and outbytes fields of
+ * @sum with the totals taken over every possible cpu's slot of the
+ * per-cpu area @stats.  The slot of cpu 0 is assigned and all other
+ * slots are added, so @sum does not have to be zeroed by the caller.
+ *
+ * The two byte counters are sampled inside a
+ * u64_stats_fetch_begin_bh()/u64_stats_fetch_retry_bh() loop on that
+ * cpu's syncp; conns, inpkts and outpkts are read outside that loop.
+ */
+static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
+ struct ip_vs_cpu_stats *stats)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
+ unsigned int start;
+ __u64 inbytes, outbytes;
+ if (i) { /* any cpu but 0: add to the running totals */
+ sum->conns += s->ustats.conns;
+ sum->inpkts += s->ustats.inpkts;
+ sum->outpkts += s->ustats.outpkts;
+ do { /* sample into locals; only the sample that passes the retry check is added */
+ start = u64_stats_fetch_begin_bh(&s->syncp);
+ inbytes = s->ustats.inbytes;
+ outbytes = s->ustats.outbytes;
+ } while (u64_stats_fetch_retry_bh(&s->syncp, start));
+ sum->inbytes += inbytes;
+ sum->outbytes += outbytes;
+ } else { /* cpu 0: plain assignment starts the totals afresh */
+ sum->conns = s->ustats.conns;
+ sum->inpkts = s->ustats.inpkts;
+ sum->outpkts = s->ustats.outpkts;
+ do { /* assignment can be repeated, so it is written straight into @sum */
+ start = u64_stats_fetch_begin_bh(&s->syncp);
+ sum->inbytes = s->ustats.inbytes;
+ sum->outbytes = s->ustats.outbytes;
+ } while (u64_stats_fetch_retry_bh(&s->syncp, start));
+ }
+ }
+}
-static LIST_HEAD(est_list);
-static DEFINE_SPINLOCK(est_lock);
-static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
static void estimation_timer(unsigned long arg)
{
@@ -62,11 +97,16 @@ static void estimation_timer(unsigned long arg)
u32 n_inpkts, n_outpkts;
u64 n_inbytes, n_outbytes;
u32 rate;
+ struct net *net = (struct net *)arg;
+ struct netns_ipvs *ipvs;
- spin_lock(&est_lock);
- list_for_each_entry(e, &est_list, list) {
+ ipvs = net_ipvs(net);
+ ip_vs_read_cpu_stats(&ipvs->tot_stats->ustats, ipvs->cpustats);
+ spin_lock(&ipvs->est_lock);
+ list_for_each_entry(e, &ipvs->est_list, list) {
s = container_of(e, struct ip_vs_stats, est);
+ ip_vs_read_cpu_stats(&s->ustats, s->cpustats);
spin_lock(&s->lock);
n_conns = s->ustats.conns;
n_inpkts = s->ustats.inpkts;
@@ -75,38 +115,39 @@ static void estimation_timer(unsigned long arg)
n_outbytes = s->ustats.outbytes;
/* scaled by 2^10, but divided 2 seconds */
- rate = (n_conns - e->last_conns)<<9;
+ rate = (n_conns - e->last_conns) << 9;
e->last_conns = n_conns;
- e->cps += ((long)rate - (long)e->cps)>>2;
- s->ustats.cps = (e->cps+0x1FF)>>10;
+ e->cps += ((long)rate - (long)e->cps) >> 2;
+ s->ustats.cps = (e->cps + 0x1FF) >> 10;
- rate = (n_inpkts - e->last_inpkts)<<9;
+ rate = (n_inpkts - e->last_inpkts) << 9;
e->last_inpkts = n_inpkts;
- e->inpps += ((long)rate - (long)e->inpps)>>2;
- s->ustats.inpps = (e->inpps+0x1FF)>>10;
+ e->inpps += ((long)rate - (long)e->inpps) >> 2;
+ s->ustats.inpps = (e->inpps + 0x1FF) >> 10;
- rate = (n_outpkts - e->last_outpkts)<<9;
+ rate = (n_outpkts - e->last_outpkts) << 9;
e->last_outpkts = n_outpkts;
- e->outpps += ((long)rate - (long)e->outpps)>>2;
- s->ustats.outpps = (e->outpps+0x1FF)>>10;
+ e->outpps += ((long)rate - (long)e->outpps) >> 2;
+ s->ustats.outpps = (e->outpps + 0x1FF) >> 10;
- rate = (n_inbytes - e->last_inbytes)<<4;
+ rate = (n_inbytes - e->last_inbytes) << 4;
e->last_inbytes = n_inbytes;
- e->inbps += ((long)rate - (long)e->inbps)>>2;
- s->ustats.inbps = (e->inbps+0xF)>>5;
+ e->inbps += ((long)rate - (long)e->inbps) >> 2;
+ s->ustats.inbps = (e->inbps + 0xF) >> 5;
- rate = (n_outbytes - e->last_outbytes)<<4;
+ rate = (n_outbytes - e->last_outbytes) << 4;
e->last_outbytes = n_outbytes;
- e->outbps += ((long)rate - (long)e->outbps)>>2;
- s->ustats.outbps = (e->outbps+0xF)>>5;
+ e->outbps += ((long)rate - (long)e->outbps) >> 2;
+ s->ustats.outbps = (e->outbps + 0xF) >> 5;
spin_unlock(&s->lock);
}
- spin_unlock(&est_lock);
- mod_timer(&est_timer, jiffies + 2*HZ);
+ spin_unlock(&ipvs->est_lock);
+ mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
}
-void ip_vs_new_estimator(struct ip_vs_stats *stats)
+void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_estimator *est = &stats->est;
INIT_LIST_HEAD(&est->list);
@@ -126,18 +167,19 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats)
est->last_outbytes = stats->ustats.outbytes;
est->outbps = stats->ustats.outbps<<5;
- spin_lock_bh(&est_lock);
- list_add(&est->list, &est_list);
- spin_unlock_bh(&est_lock);
+ spin_lock_bh(&ipvs->est_lock);
+ list_add(&est->list, &ipvs->est_list);
+ spin_unlock_bh(&ipvs->est_lock);
}
-void ip_vs_kill_estimator(struct ip_vs_stats *stats)
+void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats) /* unlinks stats->est from the estimator list of namespace @net */
{
+ struct netns_ipvs *ipvs = net_ipvs(net); /* IPVS state of @net */
struct ip_vs_estimator *est = &stats->est;
- spin_lock_bh(&est_lock);
+ spin_lock_bh(&ipvs->est_lock); /* same per-netns lock that estimation_timer() holds while walking est_list */
list_del(&est->list);
- spin_unlock_bh(&est_lock);
+ spin_unlock_bh(&ipvs->est_lock);
}
void ip_vs_zero_estimator(struct ip_vs_stats *stats)
@@ -157,13 +199,35 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats)
est->outbps = 0;
}
-int __init ip_vs_estimator_init(void)
+static int __net_init __ip_vs_estimator_init(struct net *net) /* per-netns init: empty estimator list, lock and timer for @net; always returns 0 */
{
- mod_timer(&est_timer, jiffies + 2 * HZ);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->est_list);
+ spin_lock_init(&ipvs->est_lock);
+ setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); /* the handler casts its argument back to struct net * */
+ mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); /* first run in 2 s; estimation_timer() re-arms itself */
return 0;
}
+/*
+ * Per-netns exit: delete the estimation timer of namespace @net,
+ * synchronously.
+ */
+static void __net_exit __ip_vs_estimator_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ del_timer_sync(&ipvs->est_timer);
+}
+static struct pernet_operations ip_vs_app_ops = { /* NOTE(review): these are the estimator's hooks despite the "app" in the name */
+ .init = __ip_vs_estimator_init, /* sets up and arms the per-netns timer */
+ .exit = __ip_vs_estimator_exit, /* deletes that timer */
+};
+
+/*
+ * Register the estimator's per-netns init/exit hooks.
+ * Returns the result of register_pernet_subsys() unchanged.
+ */
+int __init ip_vs_estimator_init(void)
+{
+ return register_pernet_subsys(&ip_vs_app_ops);
+}
+
void ip_vs_estimator_cleanup(void)
{
- del_timer_sync(&est_timer);
+ unregister_pernet_subsys(&ip_vs_app_ops); /* the timers are now deleted by the ops' .exit hook, __ip_vs_estimator_exit() */
}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 75455000ad1..6b5dd6ddaae 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -157,6 +157,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
int ret = 0;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
+ struct net *net;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
@@ -197,18 +198,20 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
*/
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, iph->protocol,
- &from, port, &cp->caddr, 0, &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &from, port,
+ &cp->caddr, 0, &p);
n_cp = ip_vs_conn_out_get(&p);
}
if (!n_cp) {
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+ AF_INET, IPPROTO_TCP, &cp->caddr,
0, &cp->vaddr, port, &p);
n_cp = ip_vs_conn_new(&p, &from, port,
IP_VS_CONN_F_NO_CPORT |
IP_VS_CONN_F_NFCT,
- cp->dest);
+ cp->dest, skb->mark);
if (!n_cp)
return 0;
@@ -257,8 +260,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* would be adjusted twice.
*/
+ net = skb_net(skb);
cp->app_data = NULL;
- ip_vs_tcp_conn_listen(n_cp);
+ ip_vs_tcp_conn_listen(net, n_cp);
ip_vs_conn_put(n_cp);
return ret;
}
@@ -287,6 +291,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
union nf_inet_addr to;
__be16 port;
struct ip_vs_conn *n_cp;
+ struct net *net;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
@@ -358,14 +363,15 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
- &cp->vaddr, htons(ntohs(cp->vport)-1),
- &p);
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &to, port, &cp->vaddr,
+ htons(ntohs(cp->vport)-1), &p);
n_cp = ip_vs_conn_in_get(&p);
if (!n_cp) {
n_cp = ip_vs_conn_new(&p, &cp->daddr,
htons(ntohs(cp->dport)-1),
- IP_VS_CONN_F_NFCT, cp->dest);
+ IP_VS_CONN_F_NFCT, cp->dest,
+ skb->mark);
if (!n_cp)
return 0;
@@ -377,7 +383,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
/*
* Move tunnel to listen state
*/
- ip_vs_tcp_conn_listen(n_cp);
+ net = skb_net(skb);
+ ip_vs_tcp_conn_listen(net, n_cp);
ip_vs_conn_put(n_cp);
return 1;
@@ -398,23 +405,22 @@ static struct ip_vs_app ip_vs_ftp = {
.pkt_in = ip_vs_ftp_in,
};
-
/*
- * ip_vs_ftp initialization
+ * per netns ip_vs_ftp initialization
*/
-static int __init ip_vs_ftp_init(void)
+static int __net_init __ip_vs_ftp_init(struct net *net)
{
int i, ret;
struct ip_vs_app *app = &ip_vs_ftp;
- ret = register_ip_vs_app(app);
+ ret = register_ip_vs_app(net, app);
if (ret)
return ret;
for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
if (!ports[i])
continue;
- ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
+ ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
if (ret)
break;
pr_info("%s: loaded support on port[%d] = %d\n",
@@ -422,18 +428,39 @@ static int __init ip_vs_ftp_init(void)
}
if (ret)
- unregister_ip_vs_app(app);
+ unregister_ip_vs_app(net, app);
return ret;
}
+/*
+ * per netns ip_vs_ftp exit: unregister the ip_vs_ftp application
+ * from namespace @net.
+ */
+static void __ip_vs_ftp_exit(struct net *net)
+{
+ unregister_ip_vs_app(net, &ip_vs_ftp);
+}
+
+static struct pernet_operations ip_vs_ftp_ops = { /* per-netns hooks of the FTP helper */
+ .init = __ip_vs_ftp_init, /* registers ip_vs_ftp and its ports in the namespace */
+ .exit = __ip_vs_ftp_exit, /* unregisters ip_vs_ftp from the namespace */
+};
+/*
+ * ip_vs_ftp initialization: register the per-netns init/exit hooks.
+ *
+ * Kept static, like it was before the per-netns split and like
+ * ip_vs_ftp_exit(), so the name does not leak into the global namespace.
+ *
+ * Returns the result of register_pernet_subsys() unchanged.
+ */
+static int __init ip_vs_ftp_init(void)
+{
+ return register_pernet_subsys(&ip_vs_ftp_ops);
+}
/*
* ip_vs_ftp finish.
*/
static void __exit ip_vs_ftp_exit(void)
{
- unregister_ip_vs_app(&ip_vs_ftp);
+ unregister_pernet_subsys(&ip_vs_ftp_ops); /* the app itself is unregistered per namespace by __ip_vs_ftp_exit() */
}
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 9323f894419..6bf7a807649 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -70,7 +70,6 @@
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
-static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
/*
@@ -117,7 +116,7 @@ struct ip_vs_lblc_table {
static ctl_table vs_vars_table[] = {
{
.procname = "lblc_expiration",
- .data = &sysctl_ip_vs_lblc_expiration,
+ .data = NULL,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -125,8 +124,6 @@ static ctl_table vs_vars_table[] = {
{ }
};
-static struct ctl_table_header * sysctl_header;
-
static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
{
list_del(&en->list);
@@ -248,6 +245,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
struct ip_vs_lblc_entry *en, *nxt;
unsigned long now = jiffies;
int i, j;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
@@ -255,7 +253,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
if (time_before(now,
- en->lastuse + sysctl_ip_vs_lblc_expiration))
+ en->lastuse +
+ ipvs->sysctl_lblc_expiration))
continue;
ip_vs_lblc_free(en);
@@ -390,12 +389,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
int loh, doh;
/*
- * We think the overhead of processing active connections is fifty
- * times higher than that of inactive connections in average. (This
- * fifty times might not be accurate, we will change it later.) We
- * use the following formula to estimate the overhead:
- * dest->activeconns*50 + dest->inactconns
- * and the load:
+ * We use the following formula to estimate the load:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
@@ -411,8 +405,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
continue;
if (atomic_read(&dest->weight) > 0) {
least = dest;
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
@@ -426,8 +419,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
+ doh = ip_vs_dest_conn_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
@@ -511,7 +503,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* No cache entry or it is invalid, time to schedule */
dest = __ip_vs_lblc_schedule(svc);
if (!dest) {
- IP_VS_ERR_RL("LBLC: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
@@ -543,23 +535,73 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler =
.schedule = ip_vs_lblc_schedule,
};
+/*
+ * per netns init.
+ *
+ * Gives namespace @net its own lblc_expiration setting.  init_net uses
+ * vs_vars_table directly; every other namespace gets a kmemdup() copy.
+ * Entry 0's .data is pointed at ipvs->sysctl_lblc_expiration, which is
+ * initialised to 24*60*60*HZ.  With CONFIG_SYSCTL the table is then
+ * registered under net_vs_ctl_path.
+ *
+ * Returns 0, or -ENOMEM if the copy or the registration fails; a copy
+ * made here is freed again on the registration failure path.
+ */
+static int __net_init __ip_vs_lblc_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblc_ctl_table == NULL)
+ return -ENOMEM;
+ } else
+ ipvs->lblc_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblc_expiration = 24*60*60*HZ;
+ ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
+
+#ifdef CONFIG_SYSCTL
+ ipvs->lblc_ctl_header =
+ register_net_sysctl_table(net, net_vs_ctl_path,
+ ipvs->lblc_ctl_table);
+ if (!ipvs->lblc_ctl_header) {
+ if (!net_eq(net, &init_net)) /* only the kmemdup() copy is ours to free */
+ kfree(ipvs->lblc_ctl_table);
+ return -ENOMEM;
+ }
+#endif
+
+ return 0;
+}
+
+/*
+ * per netns exit: undo __ip_vs_lblc_init() for namespace @net.
+ */
+static void __net_exit __ip_vs_lblc_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(ipvs->lblc_ctl_header);
+#endif
+
+ /* init_net points at the static vs_vars_table: nothing to free */
+ if (net_eq(net, &init_net))
+ return;
+
+ kfree(ipvs->lblc_ctl_table);
+}
+
+static struct pernet_operations ip_vs_lblc_ops = { /* per-netns hooks of the LBLC scheduler */
+ .init = __ip_vs_lblc_init, /* sets up the namespace's lblc_expiration sysctl */
+ .exit = __ip_vs_lblc_exit, /* unregisters it and frees the table copy */
+};
static int __init ip_vs_lblc_init(void)
{
int ret;
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+ ret = register_pernet_subsys(&ip_vs_lblc_ops); /* per-netns sysctl setup replaces the single global table */
+ if (ret)
+ return ret;
+
ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
if (ret)
- unregister_sysctl_table(sysctl_header);
+ unregister_pernet_subsys(&ip_vs_lblc_ops); /* roll back when the scheduler cannot be registered */
return ret;
}
-
static void __exit ip_vs_lblc_cleanup(void)
{
- unregister_sysctl_table(sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblc_ops); /* reverse of the registration order in ip_vs_lblc_init() */
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index dbeed8ea421..00631765b92 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -70,8 +70,6 @@
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
-static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
-
/*
* for IPVS lblcr entry hash table
@@ -180,8 +178,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
if ((atomic_read(&least->weight) > 0)
&& (least->flags & IP_VS_DEST_F_AVAILABLE)) {
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
@@ -194,8 +191,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
+ doh = ip_vs_dest_conn_overhead(dest);
if ((loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight))
&& (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -230,8 +226,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
list_for_each_entry(e, &set->list, list) {
most = e->dest;
if (atomic_read(&most->weight) > 0) {
- moh = atomic_read(&most->activeconns) * 50
- + atomic_read(&most->inactconns);
+ moh = ip_vs_dest_conn_overhead(most);
goto nextstage;
}
}
@@ -241,8 +236,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
nextstage:
list_for_each_entry(e, &set->list, list) {
dest = e->dest;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
+ doh = ip_vs_dest_conn_overhead(dest);
/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
if ((moh * atomic_read(&dest->weight) <
doh * atomic_read(&most->weight))
@@ -296,7 +290,7 @@ struct ip_vs_lblcr_table {
static ctl_table vs_vars_table[] = {
{
.procname = "lblcr_expiration",
- .data = &sysctl_ip_vs_lblcr_expiration,
+ .data = NULL,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -304,8 +298,6 @@ static ctl_table vs_vars_table[] = {
{ }
};
-static struct ctl_table_header * sysctl_header;
-
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
list_del(&en->list);
@@ -425,14 +417,15 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
unsigned long now = jiffies;
int i, j;
struct ip_vs_lblcr_entry *en, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
- if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
- now))
+ if (time_after(en->lastuse
+ + ipvs->sysctl_lblcr_expiration, now))
continue;
ip_vs_lblcr_free(en);
@@ -566,12 +559,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
int loh, doh;
/*
- * We think the overhead of processing active connections is fifty
- * times higher than that of inactive connections in average. (This
- * fifty times might not be accurate, we will change it later.) We
- * use the following formula to estimate the overhead:
- * dest->activeconns*50 + dest->inactconns
- * and the load:
+ * We use the following formula to estimate the load:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
@@ -588,8 +576,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
if (atomic_read(&dest->weight) > 0) {
least = dest;
- loh = atomic_read(&least->activeconns) * 50
- + atomic_read(&least->inactconns);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
@@ -603,8 +590,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = atomic_read(&dest->activeconns) * 50
- + atomic_read(&dest->inactconns);
+ doh = ip_vs_dest_conn_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
@@ -664,6 +650,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
read_lock(&svc->sched_lock);
en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
if (en) {
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
/* We only hold a read lock, but this is atomic */
en->lastuse = jiffies;
@@ -675,7 +662,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* More than one destination + enough time passed by, cleanup */
if (atomic_read(&en->set.size) > 1 &&
time_after(jiffies, en->set.lastmod +
- sysctl_ip_vs_lblcr_expiration)) {
+ ipvs->sysctl_lblcr_expiration)) {
struct ip_vs_dest *m;
write_lock(&en->set.lock);
@@ -694,7 +681,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* The cache entry is invalid, time to schedule */
dest = __ip_vs_lblcr_schedule(svc);
if (!dest) {
- IP_VS_ERR_RL("LBLCR: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
read_unlock(&svc->sched_lock);
return NULL;
}
@@ -744,23 +731,73 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
.schedule = ip_vs_lblcr_schedule,
};
+/*
+ * per netns init.
+ *
+ * Gives namespace @net its own lblcr_expiration setting.  init_net uses
+ * vs_vars_table directly; every other namespace gets a kmemdup() copy.
+ * Entry 0's .data is pointed at ipvs->sysctl_lblcr_expiration, which is
+ * initialised to 24*60*60*HZ.  With CONFIG_SYSCTL the table is then
+ * registered under net_vs_ctl_path.
+ *
+ * Returns 0, or -ENOMEM if the copy or the registration fails; a copy
+ * made here is freed again on the registration failure path.
+ */
+static int __net_init __ip_vs_lblcr_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblcr_ctl_table == NULL)
+ return -ENOMEM;
+ } else
+ ipvs->lblcr_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblcr_expiration = 24*60*60*HZ;
+ ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
+
+#ifdef CONFIG_SYSCTL
+ ipvs->lblcr_ctl_header =
+ register_net_sysctl_table(net, net_vs_ctl_path,
+ ipvs->lblcr_ctl_table);
+ if (!ipvs->lblcr_ctl_header) {
+ if (!net_eq(net, &init_net)) /* only the kmemdup() copy is ours to free */
+ kfree(ipvs->lblcr_ctl_table);
+ return -ENOMEM;
+ }
+#endif
+
+ return 0;
+}
+
+/*
+ * per netns exit: undo __ip_vs_lblcr_init() for namespace @net.
+ */
+static void __net_exit __ip_vs_lblcr_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
+#endif
+
+ /* init_net points at the static vs_vars_table: nothing to free */
+ if (net_eq(net, &init_net))
+ return;
+
+ kfree(ipvs->lblcr_ctl_table);
+}
+
+static struct pernet_operations ip_vs_lblcr_ops = { /* per-netns hooks of the LBLCR scheduler */
+ .init = __ip_vs_lblcr_init, /* sets up the namespace's lblcr_expiration sysctl */
+ .exit = __ip_vs_lblcr_exit, /* unregisters it and frees the table copy */
+};
static int __init ip_vs_lblcr_init(void)
{
int ret;
- sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+ ret = register_pernet_subsys(&ip_vs_lblcr_ops); /* per-netns sysctl setup replaces the single global table */
+ if (ret)
+ return ret;
+
ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
if (ret)
- unregister_sysctl_table(sysctl_header);
+ unregister_pernet_subsys(&ip_vs_lblcr_ops); /* roll back when the scheduler cannot be registered */
return ret;
}
-
static void __exit ip_vs_lblcr_cleanup(void)
{
- unregister_sysctl_table(sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblcr_ops); /* reverse of the registration order in ip_vs_lblcr_init() */
}
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index 4f69db1fac5..f391819c0cc 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -22,22 +22,6 @@
#include <net/ip_vs.h>
-
-static inline unsigned int
-ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
-{
- /*
- * We think the overhead of processing active connections is 256
- * times higher than that of inactive connections in average. (This
- * 256 times might not be accurate, we will change it later) We
- * use the following formula to estimate the overhead now:
- * dest->activeconns*256 + dest->inactconns
- */
- return (atomic_read(&dest->activeconns) << 8) +
- atomic_read(&dest->inactconns);
-}
-
-
/*
* Least Connection scheduling
*/
@@ -62,7 +46,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
atomic_read(&dest->weight) == 0)
continue;
- doh = ip_vs_lc_dest_overhead(dest);
+ doh = ip_vs_dest_conn_overhead(dest);
if (!least || doh < loh) {
least = dest;
loh = doh;
@@ -70,7 +54,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
if (!least)
- IP_VS_ERR_RL("LC: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
else
IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d "
"inactconns %d\n",
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 4680647cd45..f454c80df0a 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -141,6 +141,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
struct nf_conntrack_tuple *orig, new_reply;
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = nf_ct_net(ct);
if (exp->tuple.src.l3num != PF_INET)
return;
@@ -155,7 +156,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
/* RS->CLIENT */
orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+ ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
&orig->src.u3, orig->src.u.tcp.port,
&orig->dst.u3, orig->dst.u.tcp.port, &p);
cp = ip_vs_conn_out_get(&p);
@@ -268,7 +269,8 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
" for conn " FMT_CONN "\n",
__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
- h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+ h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
+ &tuple);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
/* Show what happens instead of calling nf_ct_kill() */
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index c413e183082..984d9c137d8 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -99,7 +99,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
if (!least) {
- IP_VS_ERR_RL("NQ: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 3414af70ee1..5cf859ccb31 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -29,12 +29,11 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc)
}
/* Get pe in the pe list by name */
-static struct ip_vs_pe *
-ip_vs_pe_getbyname(const char *pe_name)
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
{
struct ip_vs_pe *pe;
- IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+ IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
pe_name);
spin_lock_bh(&ip_vs_pe_lock);
@@ -60,28 +59,22 @@ ip_vs_pe_getbyname(const char *pe_name)
}
/* Lookup pe and try to load it if it doesn't exist */
-struct ip_vs_pe *ip_vs_pe_get(const char *name)
+struct ip_vs_pe *ip_vs_pe_getbyname(const char *name) /* returns the pe found by __ip_vs_pe_getbyname(), or NULL if both lookups fail */
{
struct ip_vs_pe *pe;
/* Search for the pe by name */
- pe = ip_vs_pe_getbyname(name);
+ pe = __ip_vs_pe_getbyname(name); /* plain list lookup, no module loading */
/* If pe not found, load the module and search again */
if (!pe) {
request_module("ip_vs_pe_%s", name);
- pe = ip_vs_pe_getbyname(name);
+ pe = __ip_vs_pe_getbyname(name); /* request_module()'s result is not checked; this second lookup decides */
}
return pe;
}
-void ip_vs_pe_put(struct ip_vs_pe *pe)
-{
- if (pe && pe->module)
- module_put(pe->module);
-}
-
/* Register a pe in the pe list */
int register_ip_vs_pe(struct ip_vs_pe *pe)
{
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index b8b4e9620f3..0d83bc01fed 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -71,6 +71,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
struct ip_vs_iphdr iph;
unsigned int dataoff, datalen, matchoff, matchlen;
const char *dptr;
+ int retc;
ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
@@ -83,6 +84,8 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
if (dataoff >= skb->len)
return -EINVAL;
+ if ((retc=skb_linearize(skb)) < 0)
+ return retc;
dptr = skb->data + dataoff;
datalen = skb->len - dataoff;
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index c5399839087..17484a4416e 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -60,6 +60,35 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
return 0;
}
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) || \
+ defined(CONFIG_IP_VS_PROTO_SCTP) || defined(CONFIG_IP_VS_PROTO_AH) || \
+ defined(CONFIG_IP_VS_PROTO_ESP)
+/*
+ * Register the per-netns data of an ipvs protocol.
+ *
+ * Allocates a zeroed struct ip_vs_proto_data for @pp, pushes it onto the
+ * front of the hash chain for pp->protocol in @net's proto_data_table
+ * and, if @pp provides an init_netns hook, runs it.
+ *
+ * The only caller is the pernet .init hook __ip_vs_protocol_init(), which
+ * runs where sleeping is allowed, so the allocation uses GFP_KERNEL
+ * instead of dipping into the atomic reserves.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+static int
+register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+ struct ip_vs_proto_data *pd;
+
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd) {
+ pr_err("%s(): no memory.\n", __func__);
+ return -ENOMEM;
+ }
+ pd->pp = pp; /* For speed issues */
+ pd->next = ipvs->proto_data_table[hash];
+ ipvs->proto_data_table[hash] = pd;
+ atomic_set(&pd->appcnt, 0); /* Init app counter */
+
+ if (pp->init_netns != NULL)
+ pp->init_netns(net, pd);
+
+ return 0;
+}
+#endif
/*
* unregister an ipvs protocol
@@ -82,6 +111,29 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
return -ESRCH;
}
+/*
+ * Unregister an ipvs protocol's netns data.
+ *
+ * Walks the hash chain for @pd's protocol in @net's proto_data_table.
+ * When @pd is found it is unlinked, the protocol's exit_netns hook (if
+ * any) is run, and @pd is freed.
+ *
+ * Returns 0 if @pd was found and freed, -ESRCH if it is not on the chain.
+ */
+static int
+unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data **pd_p;
+ unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol);
+
+ pd_p = &ipvs->proto_data_table[hash];
+ for (; *pd_p; pd_p = &(*pd_p)->next) { /* pd_p always points at the link that leads to the current entry */
+ if (*pd_p == pd) {
+ *pd_p = pd->next; /* unlink before the exit hook runs */
+ if (pd->pp->exit_netns != NULL)
+ pd->pp->exit_netns(net, pd);
+ kfree(pd);
+ return 0;
+ }
+ }
+
+ return -ESRCH;
+}
/*
* get ip_vs_protocol object by its proto.
@@ -100,19 +152,44 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
}
EXPORT_SYMBOL(ip_vs_proto_get);
+/*
+ * Look up the per-netns protocol data for protocol number @proto in
+ * @ipvs.  Returns the matching entry, or NULL if there is none.
+ */
+struct ip_vs_proto_data *
+__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
+{
+ unsigned hash = IP_VS_PROTO_HASH(proto);
+ struct ip_vs_proto_data *pd = ipvs->proto_data_table[hash];
+
+ /* stop at the first matching entry, or at the end of the chain */
+ while (pd && pd->pp->protocol != proto)
+ pd = pd->next;
+
+ return pd;
+}
+
+/*
+ * Same lookup as __ipvs_proto_data_get(), starting from a struct net.
+ */
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct net *net, unsigned short proto)
+{
+ return __ipvs_proto_data_get(net_ipvs(net), proto);
+}
+EXPORT_SYMBOL(ip_vs_proto_data_get);
/*
* Propagate event for state change to all protocols
*/
-void ip_vs_protocol_timeout_change(int flags)
+void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) /* now limited to the protocols registered in @ipvs */
{
- struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
int i;
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
- for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
- if (pp->timeout_change)
- pp->timeout_change(pp, flags);
+ for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { /* per-netns chain instead of the global ip_vs_proto_table */
+ if (pd->pp->timeout_change) /* the hook is optional */
+ pd->pp->timeout_change(pd, flags); /* the hook receives the per-netns data, not the protocol */
}
}
}
@@ -236,6 +313,46 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
}
+/*
+ * per network name-space cleanup
+ *
+ * Unregisters, and thereby frees, every ip_vs_proto_data entry in @net's
+ * proto_data_table.  Defined ahead of __ip_vs_protocol_init() because
+ * the init path also uses it to unwind after a failed registration.
+ */
+static void __net_exit __ip_vs_protocol_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd;
+ int i;
+
+ /* unregister all the ipvs proto data for this netns */
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ while ((pd = ipvs->proto_data_table[i]) != NULL)
+ unregister_ip_vs_proto_netns(net, pd);
+ }
+}
+
+/*
+ * per network name-space init
+ *
+ * Registers the per-netns data of every protocol that is configured in.
+ * Once one registration fails the remaining ones are skipped, the
+ * entries registered so far are released again and the error is
+ * returned, so a namespace is never left with only part of its
+ * protocol table.
+ *
+ * Returns 0 on success or the error from register_ip_vs_proto_netns().
+ */
+static int __net_init __ip_vs_protocol_init(struct net *net)
+{
+ int ret = 0;
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ if (!ret)
+ ret = register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ if (!ret)
+ ret = register_ip_vs_proto_netns(net, &ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+ if (!ret)
+ ret = register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ if (!ret)
+ ret = register_ip_vs_proto_netns(net, &ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ if (!ret)
+ ret = register_ip_vs_proto_netns(net, &ip_vs_protocol_esp);
+#endif
+
+ /* a failed .init must undo its own partial work */
+ if (ret)
+ __ip_vs_protocol_cleanup(net);
+
+ return ret;
+}
+
+static struct pernet_operations ipvs_proto_ops = { /* per-netns hooks for the protocol data table */
+ .init = __ip_vs_protocol_init, /* registers data for each configured protocol */
+ .exit = __ip_vs_protocol_cleanup, /* unregisters and frees all of it */
+};
int __init ip_vs_protocol_init(void)
{
@@ -265,6 +382,7 @@ int __init ip_vs_protocol_init(void)
REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
pr_info("Registered protocols (%s)\n", &protocols[2]);
+ return register_pernet_subsys(&ipvs_proto_ops);
return 0;
}
@@ -275,6 +393,7 @@ void ip_vs_protocol_cleanup(void)
struct ip_vs_protocol *pp;
int i;
+ unregister_pernet_subsys(&ipvs_proto_ops);
/* unregister all the ipvs protocols */
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
while ((pp = ip_vs_proto_table[i]) != NULL)
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 3a0461117d3..5b8eb8b12c3 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -41,28 +41,30 @@ struct isakmp_hdr {
#define PORT_ISAKMP 500
static void
-ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
- int inverse, struct ip_vs_conn_param *p)
+ah_esp_conn_fill_param_proto(struct net *net, int af,
+ const struct ip_vs_iphdr *iph, int inverse,
+ struct ip_vs_conn_param *p) /* fills *p with a UDP key using PORT_ISAKMP on both ends, for namespace @net */
{
if (likely(!inverse))
- ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP, /* addresses in packet order */
&iph->saddr, htons(PORT_ISAKMP),
&iph->daddr, htons(PORT_ISAKMP), p);
else
- ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP, /* inverse: source and destination swapped */
&iph->daddr, htons(PORT_ISAKMP),
&iph->saddr, htons(PORT_ISAKMP), p);
}
static struct ip_vs_conn *
-ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+ah_esp_conn_in_get(int af, const struct sk_buff *skb,
const struct ip_vs_iphdr *iph, unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
cp = ip_vs_conn_in_get(&p);
if (!cp) {
/*
@@ -72,7 +74,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
"%s%s %s->%s\n",
inverse ? "ICMP+" : "",
- pp->name,
+ ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
}
@@ -83,21 +85,21 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
static struct ip_vs_conn *
ah_esp_conn_out_get(int af, const struct sk_buff *skb,
- struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph,
unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
- ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
cp = ip_vs_conn_out_get(&p);
if (!cp) {
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
"%s%s %s->%s\n",
inverse ? "ICMP+" : "",
- pp->name,
+ ip_vs_proto_get(iph->protocol)->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
}
@@ -107,7 +109,7 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
static int
-ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
/*
@@ -117,26 +119,14 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
}
-static void ah_esp_init(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
-static void ah_esp_exit(struct ip_vs_protocol *pp)
-{
- /* nothing to do now */
-}
-
-
#ifdef CONFIG_IP_VS_PROTO_AH
struct ip_vs_protocol ip_vs_protocol_ah = {
.name = "AH",
.protocol = IPPROTO_AH,
.num_states = 1,
.dont_defrag = 1,
- .init = ah_esp_init,
- .exit = ah_esp_exit,
+ .init = NULL,
+ .exit = NULL,
.conn_schedule = ah_esp_conn_schedule,
.conn_in_get = ah_esp_conn_in_get,
.conn_out_get = ah_esp_conn_out_get,
@@ -149,7 +139,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
.app_conn_bind = NULL,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL, /* ISAKMP */
- .set_state_timeout = NULL,
};
#endif
@@ -159,8 +148,8 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
.protocol = IPPROTO_ESP,
.num_states = 1,
.dont_defrag = 1,
- .init = ah_esp_init,
- .exit = ah_esp_exit,
+ .init = NULL,
+ .exit = NULL,
.conn_schedule = ah_esp_conn_schedule,
.conn_in_get = ah_esp_conn_in_get,
.conn_out_get = ah_esp_conn_out_get,
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 1ea96bcd342..b027ccc49f4 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -9,9 +9,10 @@
#include <net/ip_vs.h>
static int
-sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
+ struct net *net;
struct ip_vs_service *svc;
sctp_chunkhdr_t _schunkh, *sch;
sctp_sctphdr_t *sh, _sctph;
@@ -27,13 +28,13 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
sizeof(_schunkh), &_schunkh);
if (sch == NULL)
return 0;
-
+ net = skb_net(skb);
if ((sch->type == SCTP_CID_INIT) &&
- (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+ (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
&iph.daddr, sh->dest))) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(net_ipvs(net))) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
@@ -46,14 +47,19 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd);
+ else {
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ }
return 0;
}
ip_vs_service_put(svc);
}
-
+ /* NF_ACCEPT */
return 1;
}
@@ -856,7 +862,7 @@ static struct ipvs_sctp_nextstate
/*
* Timeout table[state]
*/
-static int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
+static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
[IP_VS_SCTP_S_NONE] = 2 * HZ,
[IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ,
[IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ,
@@ -900,20 +906,8 @@ static const char *sctp_state_name(int state)
return "?";
}
-static void sctp_timeout_change(struct ip_vs_protocol *pp, int flags)
-{
-}
-
-static int
-sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
-
-return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_SCTP_S_LAST,
- sctp_state_name_table, sname, to);
-}
-
static inline int
-set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
int direction, const struct sk_buff *skb)
{
sctp_chunkhdr_t _sctpch, *sch;
@@ -971,7 +965,7 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
IP_VS_DBG_BUF(8, "%s %s %s:%d->"
"%s:%d state: %s->%s conn->refcnt:%d\n",
- pp->name,
+ pd->pp->name,
((direction == IP_VS_DIR_OUTPUT) ?
"output " : "input "),
IP_VS_DBG_ADDR(cp->af, &cp->daddr),
@@ -995,75 +989,73 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
}
}
}
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = next_state];
+ else /* What to do ? */
+ cp->timeout = sctp_timeouts[cp->state = next_state];
- cp->timeout = pp->timeout_table[cp->state = next_state];
-
- return 1;
+ return 1;
}
static int
sctp_state_transition(struct ip_vs_conn *cp, int direction,
- const struct sk_buff *skb, struct ip_vs_protocol *pp)
+ const struct sk_buff *skb, struct ip_vs_proto_data *pd)
{
int ret = 0;
spin_lock(&cp->lock);
- ret = set_sctp_state(pp, cp, direction, skb);
+ ret = set_sctp_state(pd, cp, direction, skb);
spin_unlock(&cp->lock);
return ret;
}
-/*
- * Hash table for SCTP application incarnations
- */
-#define SCTP_APP_TAB_BITS 4
-#define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS)
-#define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1)
-
-static struct list_head sctp_apps[SCTP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(sctp_app_lock);
-
static inline __u16 sctp_app_hashkey(__be16 port)
{
return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port)
& SCTP_APP_TAB_MASK;
}
-static int sctp_register_app(struct ip_vs_app *inc)
+static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
hash = sctp_app_hashkey(port);
- spin_lock_bh(&sctp_app_lock);
- list_for_each_entry(i, &sctp_apps[hash], p_list) {
+ spin_lock_bh(&ipvs->sctp_app_lock);
+ list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &sctp_apps[hash]);
- atomic_inc(&ip_vs_protocol_sctp.appcnt);
+ list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&sctp_app_lock);
+ spin_unlock_bh(&ipvs->sctp_app_lock);
return ret;
}
-static void sctp_unregister_app(struct ip_vs_app *inc)
+static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&sctp_app_lock);
- atomic_dec(&ip_vs_protocol_sctp.appcnt);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+
+ spin_lock_bh(&ipvs->sctp_app_lock);
+ atomic_dec(&pd->appcnt);
list_del(&inc->p_list);
- spin_unlock_bh(&sctp_app_lock);
+ spin_unlock_bh(&ipvs->sctp_app_lock);
}
static int sctp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -1074,12 +1066,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = sctp_app_hashkey(cp->vport);
- spin_lock(&sctp_app_lock);
- list_for_each_entry(inc, &sctp_apps[hash], p_list) {
+ spin_lock(&ipvs->sctp_app_lock);
+ list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&sctp_app_lock);
+ spin_unlock(&ipvs->sctp_app_lock);
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -1095,43 +1087,50 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&sctp_app_lock);
+ spin_unlock(&ipvs->sctp_app_lock);
out:
return result;
}
-static void ip_vs_sctp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ * Timeouts are per-netns now.
+ * ---------------------------------------------
+ */
+static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(sctp_apps);
- pp->timeout_table = sctp_timeouts;
-}
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
+ spin_lock_init(&ipvs->sctp_app_lock);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
+ sizeof(sctp_timeouts));
+}
-static void ip_vs_sctp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
-
+ kfree(pd->timeout_table);
}
struct ip_vs_protocol ip_vs_protocol_sctp = {
- .name = "SCTP",
- .protocol = IPPROTO_SCTP,
- .num_states = IP_VS_SCTP_S_LAST,
- .dont_defrag = 0,
- .appcnt = ATOMIC_INIT(0),
- .init = ip_vs_sctp_init,
- .exit = ip_vs_sctp_exit,
- .register_app = sctp_register_app,
+ .name = "SCTP",
+ .protocol = IPPROTO_SCTP,
+ .num_states = IP_VS_SCTP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_sctp_init,
+ .exit_netns = __ip_vs_sctp_exit,
+ .register_app = sctp_register_app,
.unregister_app = sctp_unregister_app,
- .conn_schedule = sctp_conn_schedule,
- .conn_in_get = ip_vs_conn_in_get_proto,
- .conn_out_get = ip_vs_conn_out_get_proto,
- .snat_handler = sctp_snat_handler,
- .dnat_handler = sctp_dnat_handler,
- .csum_check = sctp_csum_check,
- .state_name = sctp_state_name,
+ .conn_schedule = sctp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = sctp_snat_handler,
+ .dnat_handler = sctp_dnat_handler,
+ .csum_check = sctp_csum_check,
+ .state_name = sctp_state_name,
.state_transition = sctp_state_transition,
- .app_conn_bind = sctp_app_conn_bind,
- .debug_packet = ip_vs_tcpudp_debug_packet,
- .timeout_change = sctp_timeout_change,
- .set_state_timeout = sctp_set_state_timeout,
+ .app_conn_bind = sctp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL,
};
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index f6c5200e214..c0cc341b840 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -9,8 +9,12 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
*
+ * Network name space (netns) aware.
+ * Global data moved to netns, i.e. struct netns_ipvs.
+ * The tcp_timeouts table has a copy per netns, in a hash table per
+ * protocol (ip_vs_proto_data), and is handled by netns.
*/
#define KMSG_COMPONENT "IPVS"
@@ -28,9 +32,10 @@
#include <net/ip_vs.h>
static int
-tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
+ struct net *net;
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
struct ip_vs_iphdr iph;
@@ -42,14 +47,14 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
*verdict = NF_DROP;
return 0;
}
-
+ net = skb_net(skb);
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
if (th->syn &&
- (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
- th->dest))) {
+ (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
+ &iph.daddr, th->dest))) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(net_ipvs(net))) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
@@ -63,13 +68,19 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd);
+ else {
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ }
return 0;
}
ip_vs_service_put(svc);
}
+ /* NF_ACCEPT */
return 1;
}
@@ -338,7 +349,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = {
/*
* Timeout table[state]
*/
-static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = 2*HZ,
[IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
[IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
@@ -437,10 +448,7 @@ static struct tcp_states_t tcp_states_dos [] = {
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
-static struct tcp_states_t *tcp_state_table = tcp_states;
-
-
-static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
+static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
{
int on = (flags & 1); /* secure_tcp */
@@ -450,14 +458,7 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
** for most if not for all of the applications. Something
** like "capabilities" (flags) for each object.
*/
- tcp_state_table = (on? tcp_states_dos : tcp_states);
-}
-
-static int
-tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
- return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
- tcp_state_name_table, sname, to);
+ pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
}
static inline int tcp_state_idx(struct tcphdr *th)
@@ -474,7 +475,7 @@ static inline int tcp_state_idx(struct tcphdr *th)
}
static inline void
-set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
int direction, struct tcphdr *th)
{
int state_idx;
@@ -497,7 +498,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
goto tcp_state_out;
}
- new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
+ new_state =
+ pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
tcp_state_out:
if (new_state != cp->state) {
@@ -505,7 +507,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
"%s:%d state: %s->%s conn->refcnt:%d\n",
- pp->name,
+ pd->pp->name,
((state_off == TCP_DIR_OUTPUT) ?
"output " : "input "),
th->syn ? 'S' : '.',
@@ -535,17 +537,19 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
}
}
- cp->timeout = pp->timeout_table[cp->state = new_state];
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = new_state];
+ else /* What to do ? */
+ cp->timeout = tcp_timeouts[cp->state = new_state];
}
-
/*
* Handle state transitions
*/
static int
tcp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
struct tcphdr _tcph, *th;
@@ -560,23 +564,12 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
return 0;
spin_lock(&cp->lock);
- set_tcp_state(pp, cp, direction, th);
+ set_tcp_state(pd, cp, direction, th);
spin_unlock(&cp->lock);
return 1;
}
-
-/*
- * Hash table for TCP application incarnations
- */
-#define TCP_APP_TAB_BITS 4
-#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
-#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
-
-static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(tcp_app_lock);
-
static inline __u16 tcp_app_hashkey(__be16 port)
{
return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
@@ -584,44 +577,50 @@ static inline __u16 tcp_app_hashkey(__be16 port)
}
-static int tcp_register_app(struct ip_vs_app *inc)
+static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
hash = tcp_app_hashkey(port);
- spin_lock_bh(&tcp_app_lock);
- list_for_each_entry(i, &tcp_apps[hash], p_list) {
+ spin_lock_bh(&ipvs->tcp_app_lock);
+ list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &tcp_apps[hash]);
- atomic_inc(&ip_vs_protocol_tcp.appcnt);
+ list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&tcp_app_lock);
+ spin_unlock_bh(&ipvs->tcp_app_lock);
return ret;
}
static void
-tcp_unregister_app(struct ip_vs_app *inc)
+tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&tcp_app_lock);
- atomic_dec(&ip_vs_protocol_tcp.appcnt);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ spin_lock_bh(&ipvs->tcp_app_lock);
+ atomic_dec(&pd->appcnt);
list_del(&inc->p_list);
- spin_unlock_bh(&tcp_app_lock);
+ spin_unlock_bh(&ipvs->tcp_app_lock);
}
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -633,12 +632,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = tcp_app_hashkey(cp->vport);
- spin_lock(&tcp_app_lock);
- list_for_each_entry(inc, &tcp_apps[hash], p_list) {
+ spin_lock(&ipvs->tcp_app_lock);
+ list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&tcp_app_lock);
+ spin_unlock(&ipvs->tcp_app_lock);
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -655,7 +654,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&tcp_app_lock);
+ spin_unlock(&ipvs->tcp_app_lock);
out:
return result;
@@ -665,24 +664,35 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/*
* Set LISTEN timeout. (ip_vs_conn_put will setup timer)
*/
-void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
spin_lock(&cp->lock);
cp->state = IP_VS_TCP_S_LISTEN;
- cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
+ cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
+ : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
spin_unlock(&cp->lock);
}
-
-static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ * Timeouts are per-netns now.
+ * ---------------------------------------------
+ */
+static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(tcp_apps);
- pp->timeout_table = tcp_timeouts;
-}
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
+ spin_lock_init(&ipvs->tcp_app_lock);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
+ sizeof(tcp_timeouts));
+ pd->tcp_state_table = tcp_states;
+}
-static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
+ kfree(pd->timeout_table);
}
@@ -691,9 +701,10 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.protocol = IPPROTO_TCP,
.num_states = IP_VS_TCP_S_LAST,
.dont_defrag = 0,
- .appcnt = ATOMIC_INIT(0),
- .init = ip_vs_tcp_init,
- .exit = ip_vs_tcp_exit,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_tcp_init,
+ .exit_netns = __ip_vs_tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
@@ -707,5 +718,4 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.app_conn_bind = tcp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = tcp_timeout_change,
- .set_state_timeout = tcp_set_state_timeout,
};
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 9d106a06bb0..f1282cbe6fe 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -9,7 +9,8 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Changes:
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
*
*/
@@ -28,9 +29,10 @@
#include <net/ip6_checksum.h>
static int
-udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp)
{
+ struct net *net;
struct ip_vs_service *svc;
struct udphdr _udph, *uh;
struct ip_vs_iphdr iph;
@@ -42,13 +44,13 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
*verdict = NF_DROP;
return 0;
}
-
- svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+ net = skb_net(skb);
+ svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
&iph.daddr, uh->dest);
if (svc) {
int ignored;
- if (ip_vs_todrop()) {
+ if (ip_vs_todrop(net_ipvs(net))) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
@@ -62,13 +64,19 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
- if (!*cpp && !ignored) {
- *verdict = ip_vs_leave(svc, skb, pp);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd);
+ else {
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ }
return 0;
}
ip_vs_service_put(svc);
}
+ /* NF_ACCEPT */
return 1;
}
@@ -338,19 +346,6 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
return 1;
}
-
-/*
- * Note: the caller guarantees that only one of register_app,
- * unregister_app or app_conn_bind is called each time.
- */
-
-#define UDP_APP_TAB_BITS 4
-#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
-#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
-
-static struct list_head udp_apps[UDP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(udp_app_lock);
-
static inline __u16 udp_app_hashkey(__be16 port)
{
return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
@@ -358,44 +353,50 @@ static inline __u16 udp_app_hashkey(__be16 port)
}
-static int udp_register_app(struct ip_vs_app *inc)
+static int udp_register_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
hash = udp_app_hashkey(port);
- spin_lock_bh(&udp_app_lock);
- list_for_each_entry(i, &udp_apps[hash], p_list) {
+ spin_lock_bh(&ipvs->udp_app_lock);
+ list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &udp_apps[hash]);
- atomic_inc(&ip_vs_protocol_udp.appcnt);
+ list_add(&inc->p_list, &ipvs->udp_apps[hash]);
+ atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&udp_app_lock);
+ spin_unlock_bh(&ipvs->udp_app_lock);
return ret;
}
static void
-udp_unregister_app(struct ip_vs_app *inc)
+udp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- spin_lock_bh(&udp_app_lock);
- atomic_dec(&ip_vs_protocol_udp.appcnt);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ spin_lock_bh(&ipvs->udp_app_lock);
+ atomic_dec(&pd->appcnt);
list_del(&inc->p_list);
- spin_unlock_bh(&udp_app_lock);
+ spin_unlock_bh(&ipvs->udp_app_lock);
}
static int udp_app_conn_bind(struct ip_vs_conn *cp)
{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
int hash;
struct ip_vs_app *inc;
int result = 0;
@@ -407,12 +408,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = udp_app_hashkey(cp->vport);
- spin_lock(&udp_app_lock);
- list_for_each_entry(inc, &udp_apps[hash], p_list) {
+ spin_lock(&ipvs->udp_app_lock);
+ list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&udp_app_lock);
+ spin_unlock(&ipvs->udp_app_lock);
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -429,14 +430,14 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&udp_app_lock);
+ spin_unlock(&ipvs->udp_app_lock);
out:
return result;
}
-static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_NORMAL] = 5*60*HZ,
[IP_VS_UDP_S_LAST] = 2*HZ,
};
@@ -446,14 +447,6 @@ static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_LAST] = "BUG!",
};
-
-static int
-udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
- return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
- udp_state_name_table, sname, to);
-}
-
static const char * udp_state_name(int state)
{
if (state >= IP_VS_UDP_S_LAST)
@@ -464,20 +457,30 @@ static const char * udp_state_name(int state)
static int
udp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
- struct ip_vs_protocol *pp)
+ struct ip_vs_proto_data *pd)
{
- cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
+ if (unlikely(!pd)) {
+ pr_err("UDP no ns data\n");
+ return 0;
+ }
+
+ cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
return 1;
}
-static void udp_init(struct ip_vs_protocol *pp)
+static void __udp_init(struct net *net, struct ip_vs_proto_data *pd)
{
- IP_VS_INIT_HASH_TABLE(udp_apps);
- pp->timeout_table = udp_timeouts;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
+ spin_lock_init(&ipvs->udp_app_lock);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
+ sizeof(udp_timeouts));
}
-static void udp_exit(struct ip_vs_protocol *pp)
+static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
{
+ kfree(pd->timeout_table);
}
@@ -486,8 +489,10 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.protocol = IPPROTO_UDP,
.num_states = IP_VS_UDP_S_LAST,
.dont_defrag = 0,
- .init = udp_init,
- .exit = udp_exit,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __udp_init,
+ .exit_netns = __udp_exit,
.conn_schedule = udp_conn_schedule,
.conn_in_get = ip_vs_conn_in_get_proto,
.conn_out_get = ip_vs_conn_out_get_proto,
@@ -501,5 +506,4 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.app_conn_bind = udp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL,
- .set_state_timeout = udp_set_state_timeout,
};
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index e210f37d8ea..c49b388d108 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -72,7 +72,7 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
q = q->next;
} while (q != p);
write_unlock(&svc->sched_lock);
- IP_VS_ERR_RL("RR: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
out:
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 076ebe00435..08dbdd5bc18 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -29,6 +29,7 @@
#include <net/ip_vs.h>
+EXPORT_SYMBOL(ip_vs_scheduler_err);
/*
* IPVS scheduler list
*/
@@ -146,6 +147,30 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
module_put(scheduler->module);
}
+/*
+ * Common error output helper for schedulers
+ */
+
+void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
+{
+ if (svc->fwmark) {
+ IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
+ svc->scheduler->name, svc->fwmark,
+ svc->fwmark, msg);
+#ifdef CONFIG_IP_VS_IPV6
+ } else if (svc->af == AF_INET6) {
+ IP_VS_ERR_RL("%s: %s [%pI6]:%d - %s\n",
+ svc->scheduler->name,
+ ip_vs_proto_name(svc->protocol),
+ &svc->addr.in6, ntohs(svc->port), msg);
+#endif
+ } else {
+ IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
+ svc->scheduler->name,
+ ip_vs_proto_name(svc->protocol),
+ &svc->addr.ip, ntohs(svc->port), msg);
+ }
+}
/*
* Register a scheduler in the scheduler list
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index 1ab75a9dc40..89ead246ed3 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -87,7 +87,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
goto nextstage;
}
}
- IP_VS_ERR_RL("SED: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
/*
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index e6cc174fbc0..b5e2556c581 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -223,7 +223,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
- IP_VS_ERR_RL("SH: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index ab85aedea17..fecf24de4af 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -5,6 +5,18 @@
* high-performance and highly available server based on a
* cluster of servers.
*
+ * Version 1 is capable of handling both version 0 and 1 messages.
+ * Version 0 is the plain old format.
+ * Note: Version 0 receivers will just drop Ver. 1 messages.
+ * Version 1 is capable of handling IPv6, Persistence data,
+ * time-outs, and firewall marks.
+ * In ver. 1 "ip_vs_sync_conn_options" will be sent in network order.
+ * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
+ *
+ * Definitions: Message: a complete datagram.
+ * Sync_conn: a part of a Message.
+ * Param Data: an option to a Sync_conn.
+ *
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* ip_vs_sync: sync connection info from master load balancer to backups
@@ -15,6 +27,8 @@
* Alexandre Cassen : Added SyncID support for incoming sync
* messages filtering.
* Justin Ossevoort : Fix endian problem on sync message size.
+ * Hans Schillstrom : Added Version 1: i.e. IPv6,
+ * Persistence support, fwmark and time-out.
*/
#define KMSG_COMPONENT "IPVS"
@@ -35,6 +49,8 @@
#include <linux/wait.h>
#include <linux/kernel.h>
+#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
+
#include <net/ip.h>
#include <net/sock.h>
@@ -43,11 +59,13 @@
#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT 8848 /* multicast port */
+#define SYNC_PROTO_VER 1 /* Protocol version in header */
/*
* IPVS sync connection entry
+ * Version 0, i.e. original version.
*/
-struct ip_vs_sync_conn {
+struct ip_vs_sync_conn_v0 {
__u8 reserved;
/* Protocol, addresses and port numbers */
@@ -71,41 +89,159 @@ struct ip_vs_sync_conn_options {
struct ip_vs_seq out_seq; /* outgoing seq. struct */
};
+/*
+ Sync Connection format (sync_conn)
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Type | Protocol | Ver. | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Flags |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | State | cport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | vport | dport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | fwmark |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | timeout (in sec.) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | ... |
+ | IP-Addresses (v4 or v6) |
+ | ... |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ Optional Parameters.
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param. Type | Param. Length | Param. data |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+ | ... |
+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | | Param Type | Param. Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param data |
+ | Last Param data should be padded for 32 bit alignment |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+/*
+ * Type 0, IPv4 sync connection format
+ */
+struct ip_vs_sync_v4 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ __be32 caddr; /* client address */
+ __be32 vaddr; /* virtual address */
+ __be32 daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+/*
+ * Type 2 messages IPv6
+ */
+struct ip_vs_sync_v6 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ struct in6_addr caddr; /* client address */
+ struct in6_addr vaddr; /* virtual address */
+ struct in6_addr daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+
+union ip_vs_sync_conn {
+ struct ip_vs_sync_v4 v4;
+ struct ip_vs_sync_v6 v6;
+};
+
+/* Bits in Type field in above */
+#define STYPE_INET6 0
+#define STYPE_F_INET6 (1 << STYPE_INET6)
+
+#define SVER_SHIFT 12 /* Shift to get version */
+#define SVER_MASK 0x0fff /* Mask to strip version */
+
+#define IPVS_OPT_SEQ_DATA 1
+#define IPVS_OPT_PE_DATA 2
+#define IPVS_OPT_PE_NAME 3
+#define IPVS_OPT_PARAM 7
+
+#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
+#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
+#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
+#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
+
struct ip_vs_sync_thread_data {
+ struct net *net;
struct socket *sock;
char *buf;
};
-#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
+/* Version 0 definition of packet sizes */
+#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE \
-(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
+(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
/*
- The master mulitcasts messages to the backup load balancers in the
- following format.
+ The master multicasts messages (Datagrams) to the backup load balancers
+ in the following format.
+
+ Version 1:
+ Note, first byte should be Zero, so ver 0 receivers will drop the packet.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- | Count Conns | SyncID | Size |
+ | 0 | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | Version | Reserved, set to Zero |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (1) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| . |
- | . |
+ ~ . ~
| . |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (n) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Version 0 Header
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | IPVS Sync Connection (1) |
*/
#define SYNC_MESG_HEADER_LEN 4
#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
-struct ip_vs_sync_mesg {
+/* Version 0 header */
+struct ip_vs_sync_mesg_v0 {
__u8 nr_conns;
__u8 syncid;
__u16 size;
@@ -113,9 +249,16 @@ struct ip_vs_sync_mesg {
/* ip_vs_sync_conn entries start here */
};
-/* the maximum length of sync (sending/receiving) message */
-static int sync_send_mesg_maxlen;
-static int sync_recv_mesg_maxlen;
+/* Version 1 header */
+struct ip_vs_sync_mesg {
+ __u8 reserved; /* must be zero */
+ __u8 syncid;
+ __u16 size;
+ __u8 nr_conns;
+ __s8 version; /* SYNC_PROTO_VER */
+ __u16 spare;
+ /* ip_vs_sync_conn entries start here */
+};
struct ip_vs_sync_buff {
struct list_head list;
@@ -127,28 +270,6 @@ struct ip_vs_sync_buff {
unsigned char *end;
};
-
-/* the sync_buff list head and the lock */
-static LIST_HEAD(ip_vs_sync_queue);
-static DEFINE_SPINLOCK(ip_vs_sync_lock);
-
-/* current sync_buff for accepting new conn entries */
-static struct ip_vs_sync_buff *curr_sb = NULL;
-static DEFINE_SPINLOCK(curr_sb_lock);
-
-/* ipvs sync daemon state */
-volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
-volatile int ip_vs_master_syncid = 0;
-volatile int ip_vs_backup_syncid = 0;
-
-/* multicast interface name */
-char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-
-/* sync daemon tasks */
-static struct task_struct *sync_master_thread;
-static struct task_struct *sync_backup_thread;
-
/* multicast addr */
static struct sockaddr_in mcast_addr = {
.sin_family = AF_INET,
@@ -156,41 +277,71 @@ static struct sockaddr_in mcast_addr = {
.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
};
+/*
+ * Copy of struct ip_vs_seq
+ * From unaligned network order to aligned host order
+ */
+static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
+{
+ ho->init_seq = get_unaligned_be32(&no->init_seq);
+ ho->delta = get_unaligned_be32(&no->delta);
+ ho->previous_delta = get_unaligned_be32(&no->previous_delta);
+}
+
+/*
+ * Copy of struct ip_vs_seq
+ * From Aligned host order to unaligned network order
+ */
+static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
+{
+ put_unaligned_be32(ho->init_seq, &no->init_seq);
+ put_unaligned_be32(ho->delta, &no->delta);
+ put_unaligned_be32(ho->previous_delta, &no->previous_delta);
+}
-static inline struct ip_vs_sync_buff *sb_dequeue(void)
+static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
{
struct ip_vs_sync_buff *sb;
- spin_lock_bh(&ip_vs_sync_lock);
- if (list_empty(&ip_vs_sync_queue)) {
+ spin_lock_bh(&ipvs->sync_lock);
+ if (list_empty(&ipvs->sync_queue)) {
sb = NULL;
} else {
- sb = list_entry(ip_vs_sync_queue.next,
+ sb = list_entry(ipvs->sync_queue.next,
struct ip_vs_sync_buff,
list);
list_del(&sb->list);
}
- spin_unlock_bh(&ip_vs_sync_lock);
+ spin_unlock_bh(&ipvs->sync_lock);
return sb;
}
-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+/*
+ * Create a new sync buffer for Version 1 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
{
struct ip_vs_sync_buff *sb;
if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
return NULL;
- if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
kfree(sb);
return NULL;
}
+ sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
+ sb->mesg->version = SYNC_PROTO_VER;
+ sb->mesg->syncid = ipvs->master_syncid;
+ sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
sb->mesg->nr_conns = 0;
- sb->mesg->syncid = ip_vs_master_syncid;
- sb->mesg->size = 4;
- sb->head = (unsigned char *)sb->mesg + 4;
- sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+ sb->mesg->spare = 0;
+ sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
+ sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
+
sb->firstuse = jiffies;
return sb;
}
@@ -201,14 +352,16 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
kfree(sb);
}
-static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+static inline void sb_queue_tail(struct netns_ipvs *ipvs)
{
- spin_lock(&ip_vs_sync_lock);
- if (ip_vs_sync_state & IP_VS_STATE_MASTER)
- list_add_tail(&sb->list, &ip_vs_sync_queue);
+ struct ip_vs_sync_buff *sb = ipvs->sync_buff;
+
+ spin_lock(&ipvs->sync_lock);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ list_add_tail(&sb->list, &ipvs->sync_queue);
else
ip_vs_sync_buff_release(sb);
- spin_unlock(&ip_vs_sync_lock);
+ spin_unlock(&ipvs->sync_lock);
}
/*
@@ -216,36 +369,101 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
* than the specified time or the specified time is zero.
*/
static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(unsigned long time)
+get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
{
struct ip_vs_sync_buff *sb;
- spin_lock_bh(&curr_sb_lock);
- if (curr_sb && (time == 0 ||
- time_before(jiffies - curr_sb->firstuse, time))) {
- sb = curr_sb;
- curr_sb = NULL;
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (ipvs->sync_buff &&
+ time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) {
+ sb = ipvs->sync_buff;
+ ipvs->sync_buff = NULL;
} else
sb = NULL;
- spin_unlock_bh(&curr_sb_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
return sb;
}
+/*
+ * Switch mode from sending version 0 or 1
+ * - must handle sync_buf
+ */
+void ip_vs_sync_switch_mode(struct net *net, int mode)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
+ return;
+ if (mode == ipvs->sysctl_sync_ver || !ipvs->sync_buff)
+ return;
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ /* Buffer empty ? then let buf_create do the job */
+ if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) {
+ kfree(ipvs->sync_buff);
+ ipvs->sync_buff = NULL;
+ } else {
+ spin_lock_bh(&ipvs->sync_lock);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ list_add_tail(&ipvs->sync_buff->list,
+ &ipvs->sync_queue);
+ else
+ ip_vs_sync_buff_release(ipvs->sync_buff);
+ spin_unlock_bh(&ipvs->sync_lock);
+ }
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+}
/*
+ * Create a new sync buffer for Version 0 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_sync_buff *sb;
+ struct ip_vs_sync_mesg_v0 *mesg;
+
+ if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+ return NULL;
+
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
+ kfree(sb);
+ return NULL;
+ }
+ mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
+ mesg->nr_conns = 0;
+ mesg->syncid = ipvs->master_syncid;
+ mesg->size = sizeof(struct ip_vs_sync_mesg_v0);
+ sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
+ sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
+ sb->firstuse = jiffies;
+ return sb;
+}
+
+/*
+ * Version 0, can be switched in via sysctl.
* Add an ip_vs_conn information into the current sync_buff.
- * Called by ip_vs_in.
*/
-void ip_vs_sync_conn(struct ip_vs_conn *cp)
+void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
{
- struct ip_vs_sync_mesg *m;
- struct ip_vs_sync_conn *s;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg_v0 *m;
+ struct ip_vs_sync_conn_v0 *s;
int len;
- spin_lock(&curr_sb_lock);
- if (!curr_sb) {
- if (!(curr_sb=ip_vs_sync_buff_create())) {
- spin_unlock(&curr_sb_lock);
+ if (unlikely(cp->af != AF_INET))
+ return;
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return;
+
+ spin_lock(&ipvs->sync_buff_lock);
+ if (!ipvs->sync_buff) {
+ ipvs->sync_buff =
+ ip_vs_sync_buff_create_v0(ipvs);
+ if (!ipvs->sync_buff) {
+ spin_unlock(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
@@ -253,10 +471,11 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
SIMPLE_CONN_SIZE;
- m = curr_sb->mesg;
- s = (struct ip_vs_sync_conn *)curr_sb->head;
+ m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg;
+ s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head;
/* copy members */
+ s->reserved = 0;
s->protocol = cp->protocol;
s->cport = cp->cport;
s->vport = cp->vport;
@@ -274,83 +493,366 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
m->nr_conns++;
m->size += len;
- curr_sb->head += len;
+ ipvs->sync_buff->head += len;
/* check if there is a space for next one */
- if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
- sb_queue_tail(curr_sb);
- curr_sb = NULL;
+ if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) {
+ sb_queue_tail(ipvs);
+ ipvs->sync_buff = NULL;
}
- spin_unlock(&curr_sb_lock);
+ spin_unlock(&ipvs->sync_buff_lock);
/* synchronize its controller if it has */
if (cp->control)
- ip_vs_sync_conn(cp->control);
+ ip_vs_sync_conn(net, cp->control);
+}
+
+/*
+ * Add an ip_vs_conn information into the current sync_buff.
+ * Called by ip_vs_in.
+ * Sending Version 1 messages
+ */
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m;
+ union ip_vs_sync_conn *s;
+ __u8 *p;
+ unsigned int len, pe_name_len, pad;
+
+ /* Handle old version of the protocol */
+ if (ipvs->sysctl_sync_ver == 0) {
+ ip_vs_sync_conn_v0(net, cp);
+ return;
+ }
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ goto control;
+sloop:
+ /* Sanity checks */
+ pe_name_len = 0;
+ if (cp->pe_data_len) {
+ if (!cp->pe_data || !cp->dest) {
+ IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
+ return;
+ }
+ pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
+ }
+
+ spin_lock(&ipvs->sync_buff_lock);
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ len = sizeof(struct ip_vs_sync_v6);
+ else
+#endif
+ len = sizeof(struct ip_vs_sync_v4);
+
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+ len += sizeof(struct ip_vs_sync_conn_options) + 2;
+
+ if (cp->pe_data_len)
+ len += cp->pe_data_len + 2; /* + Param hdr field */
+ if (pe_name_len)
+ len += pe_name_len + 2;
+
+ /* check if there is a space for this one */
+ pad = 0;
+ if (ipvs->sync_buff) {
+ pad = (4 - (size_t)ipvs->sync_buff->head) & 3;
+ if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) {
+ sb_queue_tail(ipvs);
+ ipvs->sync_buff = NULL;
+ pad = 0;
+ }
+ }
+
+ if (!ipvs->sync_buff) {
+ ipvs->sync_buff = ip_vs_sync_buff_create(ipvs);
+ if (!ipvs->sync_buff) {
+ spin_unlock(&ipvs->sync_buff_lock);
+ pr_err("ip_vs_sync_buff_create failed.\n");
+ return;
+ }
+ }
+
+ m = ipvs->sync_buff->mesg;
+ p = ipvs->sync_buff->head;
+ ipvs->sync_buff->head += pad + len;
+ m->size += pad + len;
+ /* Add ev. padding from prev. sync_conn */
+ while (pad--)
+ *(p++) = 0;
+
+ s = (union ip_vs_sync_conn *)p;
+
+ /* Set message type & copy members */
+ s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
+ s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */
+ s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
+ s->v4.state = htons(cp->state);
+ s->v4.protocol = cp->protocol;
+ s->v4.cport = cp->cport;
+ s->v4.vport = cp->vport;
+ s->v4.dport = cp->dport;
+ s->v4.fwmark = htonl(cp->fwmark);
+ s->v4.timeout = htonl(cp->timeout / HZ);
+ m->nr_conns++;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6) {
+ p += sizeof(struct ip_vs_sync_v6);
+ ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6);
+ ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6);
+ ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6);
+ } else
+#endif
+ {
+ p += sizeof(struct ip_vs_sync_v4); /* options ptr */
+ s->v4.caddr = cp->caddr.ip;
+ s->v4.vaddr = cp->vaddr.ip;
+ s->v4.daddr = cp->daddr.ip;
+ }
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+ *(p++) = IPVS_OPT_SEQ_DATA;
+ *(p++) = sizeof(struct ip_vs_sync_conn_options);
+ hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
+ p += sizeof(struct ip_vs_seq);
+ hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
+ p += sizeof(struct ip_vs_seq);
+ }
+ /* Handle pe data */
+ if (cp->pe_data_len && cp->pe_data) {
+ *(p++) = IPVS_OPT_PE_DATA;
+ *(p++) = cp->pe_data_len;
+ memcpy(p, cp->pe_data, cp->pe_data_len);
+ p += cp->pe_data_len;
+ if (pe_name_len) {
+ /* Add PE_NAME */
+ *(p++) = IPVS_OPT_PE_NAME;
+ *(p++) = pe_name_len;
+ memcpy(p, cp->pe->name, pe_name_len);
+ p += pe_name_len;
+ }
+ }
+
+ spin_unlock(&ipvs->sync_buff_lock);
+
+control:
+ /* synchronize its controller if it has */
+ cp = cp->control;
+ if (!cp)
+ return;
+ /*
+ * Reduce sync rate for templates
+ * i.e only increment in_pkts for Templates.
+ */
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+ int pkts = atomic_add_return(1, &cp->in_pkts);
+
+ if (pkts % ipvs->sysctl_sync_threshold[1] != 1)
+ return;
+ }
+ goto sloop;
}
+/*
+ * fill_param used by version 1
+ */
static inline int
-ip_vs_conn_fill_param_sync(int af, int protocol,
- const union nf_inet_addr *caddr, __be16 cport,
- const union nf_inet_addr *vaddr, __be16 vport,
- struct ip_vs_conn_param *p)
+ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
+ struct ip_vs_conn_param *p,
+ __u8 *pe_data, unsigned int pe_data_len,
+ __u8 *pe_name, unsigned int pe_name_len)
{
- /* XXX: Need to take into account persistence engine */
- ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_conn_fill_param(net, af, sc->v6.protocol,
+ (const union nf_inet_addr *)&sc->v6.caddr,
+ sc->v6.cport,
+ (const union nf_inet_addr *)&sc->v6.vaddr,
+ sc->v6.vport, p);
+ else
+#endif
+ ip_vs_conn_fill_param(net, af, sc->v4.protocol,
+ (const union nf_inet_addr *)&sc->v4.caddr,
+ sc->v4.cport,
+ (const union nf_inet_addr *)&sc->v4.vaddr,
+ sc->v4.vport, p);
+ /* Handle pe data */
+ if (pe_data_len) {
+ if (pe_name_len) {
+ char buff[IP_VS_PENAME_MAXLEN+1];
+
+ memcpy(buff, pe_name, pe_name_len);
+ buff[pe_name_len]=0;
+ p->pe = __ip_vs_pe_getbyname(buff);
+ if (!p->pe) {
+ IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
+ buff);
+ return 1;
+ }
+ } else {
+ IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
+ return 1;
+ }
+
+ p->pe_data = kmalloc(pe_data_len, GFP_ATOMIC);
+ if (!p->pe_data) {
+ if (p->pe->module)
+ module_put(p->pe->module);
+ return -ENOMEM;
+ }
+ memcpy(p->pe_data, pe_data, pe_data_len);
+ p->pe_data_len = pe_data_len;
+ }
return 0;
}
/*
- * Process received multicast message and create the corresponding
- * ip_vs_conn entries.
+ * Connection Add / Update.
+ * Common for version 0 and 1 reception of backup sync_conns.
+ * Param: ...
+ * timeout is in sec.
*/
-static void ip_vs_process_message(const char *buffer, const size_t buflen)
+static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+ unsigned int flags, unsigned int state,
+ unsigned int protocol, unsigned int type,
+ const union nf_inet_addr *daddr, __be16 dport,
+ unsigned long timeout, __u32 fwmark,
+ struct ip_vs_sync_conn_options *opt)
{
- struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
- struct ip_vs_sync_conn *s;
- struct ip_vs_sync_conn_options *opt;
- struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp;
struct ip_vs_dest *dest;
- struct ip_vs_conn_param param;
- char *p;
- int i;
+ struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- if (buflen < sizeof(struct ip_vs_sync_mesg)) {
- IP_VS_ERR_RL("sync message header too short\n");
- return;
- }
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ cp = ip_vs_conn_in_get(param);
+ else
+ cp = ip_vs_ct_in_get(param);
- /* Convert size back to host byte order */
- m->size = ntohs(m->size);
+ if (cp && param->pe_data) /* Free pe_data */
+ kfree(param->pe_data);
+ if (!cp) {
+ /*
+ * Find the appropriate destination for the connection.
+ * If it is not found the connection will remain unbound
+ * but still handled.
+ */
+ dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
+ param->vport, protocol, fwmark);
- if (buflen != m->size) {
- IP_VS_ERR_RL("bogus sync message size\n");
- return;
+ /* Set the appropriate activity flag */
+ if (protocol == IPPROTO_TCP) {
+ if (state != IP_VS_TCP_S_ESTABLISHED)
+ flags |= IP_VS_CONN_F_INACTIVE;
+ else
+ flags &= ~IP_VS_CONN_F_INACTIVE;
+ } else if (protocol == IPPROTO_SCTP) {
+ if (state != IP_VS_SCTP_S_ESTABLISHED)
+ flags |= IP_VS_CONN_F_INACTIVE;
+ else
+ flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
+ if (dest)
+ atomic_dec(&dest->refcnt);
+ if (!cp) {
+ if (param->pe_data)
+ kfree(param->pe_data);
+ IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
+ return;
+ }
+ } else if (!cp->dest) {
+ dest = ip_vs_try_bind_dest(cp);
+ if (dest)
+ atomic_dec(&dest->refcnt);
+ } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
+ (cp->state != state)) {
+ /* update active/inactive flag for the connection */
+ dest = cp->dest;
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (state != IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags |= IP_VS_CONN_F_INACTIVE;
+ } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (state == IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
+ (cp->state != state)) {
+ dest = cp->dest;
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (state != IP_VS_SCTP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
}
- /* SyncID sanity check */
- if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
- IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
- m->syncid);
- return;
+ if (opt)
+ memcpy(&cp->in_seq, opt, sizeof(*opt));
+ atomic_set(&cp->in_pkts, ipvs->sysctl_sync_threshold[0]);
+ cp->state = state;
+ cp->old_state = cp->state;
+ /*
+ * For Ver 0 messages style
+ * - Not possible to recover the right timeout for templates
+ * - can not find the right fwmark
+ * virtual service. If needed, we can do it for
+ * non-fwmark persistent services.
+ * Ver 1 messages style.
+ * - No problem.
+ */
+ if (timeout) {
+ if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+ timeout = MAX_SCHEDULE_TIMEOUT / HZ;
+ cp->timeout = timeout*HZ;
+ } else {
+ struct ip_vs_proto_data *pd;
+
+ pd = ip_vs_proto_data_get(net, protocol);
+ if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
+ cp->timeout = pd->timeout_table[state];
+ else
+ cp->timeout = (3*60*HZ);
}
+ ip_vs_conn_put(cp);
+}
- p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
+/*
+ * Process received multicast message for Version 0
+ */
+static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+ const size_t buflen)
+{
+ struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
+ struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_conn_options *opt;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ char *p;
+ int i;
+
+ p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
for (i=0; i<m->nr_conns; i++) {
unsigned flags, state;
if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
- IP_VS_ERR_RL("bogus conn in sync message\n");
+ IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
return;
}
- s = (struct ip_vs_sync_conn *) p;
+ s = (struct ip_vs_sync_conn_v0 *) p;
flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
flags &= ~IP_VS_CONN_F_HASHED;
if (flags & IP_VS_CONN_F_SEQ_MASK) {
opt = (struct ip_vs_sync_conn_options *)&s[1];
p += FULL_CONN_SIZE;
if (p > buffer+buflen) {
- IP_VS_ERR_RL("bogus conn options in sync message\n");
+ IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
return;
}
} else {
@@ -362,118 +864,286 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
pp = ip_vs_proto_get(s->protocol);
if (!pp) {
- IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
s->protocol);
continue;
}
if (state >= pp->num_states) {
- IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
pp->name, state);
continue;
}
} else {
/* protocol in templates is not used for state/timeout */
- pp = NULL;
if (state > 0) {
- IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
+ IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
state);
state = 0;
}
}
- {
- if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
- (union nf_inet_addr *)&s->caddr,
- s->cport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport, &param)) {
- pr_err("ip_vs_conn_fill_param_sync failed");
- return;
+ ip_vs_conn_fill_param(net, AF_INET, s->protocol,
+ (const union nf_inet_addr *)&s->caddr,
+ s->cport,
+ (const union nf_inet_addr *)&s->vaddr,
+ s->vport, &param);
+
+ /* Send timeout as Zero */
+ ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
+ (union nf_inet_addr *)&s->daddr, s->dport,
+ 0, 0, opt);
+ }
+}
+
+/*
+ * Handle options
+ */
+static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
+ __u32 *opt_flags,
+ struct ip_vs_sync_conn_options *opt)
+{
+ struct ip_vs_sync_conn_options *topt;
+
+ topt = (struct ip_vs_sync_conn_options *)p;
+
+ if (plen != sizeof(struct ip_vs_sync_conn_options)) {
+ IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
+ return -EINVAL;
+ }
+ if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
+ IP_VS_DBG(2, "BACKUP, conn options found twice\n");
+ return -EINVAL;
+ }
+ ntoh_seq(&topt->in_seq, &opt->in_seq);
+ ntoh_seq(&topt->out_seq, &opt->out_seq);
+ *opt_flags |= IPVS_OPT_F_SEQ_DATA;
+ return 0;
+}
+
+static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
+ __u8 **data, unsigned int maxlen,
+ __u32 *opt_flags, __u32 flag)
+{
+ if (plen > maxlen) {
+ IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
+ return -EINVAL;
+ }
+ if (*opt_flags & flag) {
+ IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
+ return -EINVAL;
+ }
+ *data_len = plen;
+ *data = p;
+ *opt_flags |= flag;
+ return 0;
+}
+/*
+ * Process a Version 1 sync. connection
+ */
+static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
+{
+ struct ip_vs_sync_conn_options opt;
+ union ip_vs_sync_conn *s;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ __u32 flags;
+ unsigned int af, state, pe_data_len=0, pe_name_len=0;
+ __u8 *pe_data=NULL, *pe_name=NULL;
+ __u32 opt_flags=0;
+ int retc=0;
+
+ s = (union ip_vs_sync_conn *) p;
+
+ if (s->v6.type & STYPE_F_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ af = AF_INET6;
+ p += sizeof(struct ip_vs_sync_v6);
+#else
+ IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
+ retc = 10;
+ goto out;
+#endif
+ } else if (!s->v4.type) {
+ af = AF_INET;
+ p += sizeof(struct ip_vs_sync_v4);
+ } else {
+ return -10;
+ }
+ if (p > msg_end)
+ return -20;
+
+ /* Process optional params check Type & Len. */
+ while (p < msg_end) {
+ int ptype;
+ int plen;
+
+ if (p+2 > msg_end)
+ return -30;
+ ptype = *(p++);
+ plen = *(p++);
+
+ if (!plen || ((p + plen) > msg_end))
+ return -40;
+ /* Handle seq option p = param data */
+ switch (ptype & ~IPVS_OPT_F_PARAM) {
+ case IPVS_OPT_SEQ_DATA:
+ if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
+ return -50;
+ break;
+
+ case IPVS_OPT_PE_DATA:
+ if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
+ IP_VS_PEDATA_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_DATA))
+ return -60;
+ break;
+
+ case IPVS_OPT_PE_NAME:
+ if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
+ IP_VS_PENAME_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_NAME))
+ return -70;
+ break;
+
+ default:
+ /* Param data mandatory ? */
+ if (!(ptype & IPVS_OPT_F_PARAM)) {
+ IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
+ ptype & ~IPVS_OPT_F_PARAM);
+ retc = 20;
+ goto out;
}
- if (!(flags & IP_VS_CONN_F_TEMPLATE))
- cp = ip_vs_conn_in_get(&param);
- else
- cp = ip_vs_ct_in_get(&param);
}
- if (!cp) {
- /*
- * Find the appropriate destination for the connection.
- * If it is not found the connection will remain unbound
- * but still handled.
- */
- dest = ip_vs_find_dest(AF_INET,
- (union nf_inet_addr *)&s->daddr,
- s->dport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport,
- s->protocol);
- /* Set the approprite ativity flag */
- if (s->protocol == IPPROTO_TCP) {
- if (state != IP_VS_TCP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
- } else if (s->protocol == IPPROTO_SCTP) {
- if (state != IP_VS_SCTP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
+ p += plen; /* Next option */
+ }
+
+ /* Get flags and Mask off unsupported */
+ flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
+ flags |= IP_VS_CONN_F_SYNC;
+ state = ntohs(s->v4.state);
+
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ pp = ip_vs_proto_get(s->v4.protocol);
+ if (!pp) {
+ IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
+ s->v4.protocol);
+ retc = 30;
+ goto out;
+ }
+ if (state >= pp->num_states) {
+ IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
+ pp->name, state);
+ retc = 40;
+ goto out;
+ }
+ } else {
+ /* protocol in templates is not used for state/timeout */
+ if (state > 0) {
+ IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
+ state);
+ state = 0;
+ }
+ }
+ if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+ pe_data_len, pe_name, pe_name_len)) {
+ retc = 50;
+ goto out;
+ }
+ /* If only IPv4, just silently skip IPv6 */
+ if (af == AF_INET)
+ ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
+ (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
+ ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#ifdef CONFIG_IP_VS_IPV6
+ else
+ ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
+ (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
+ ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#endif
+ return 0;
+ /* Error exit */
+out:
+ IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
+ return retc;
+
+}
+/*
+ * Process received multicast message and create the corresponding
+ * ip_vs_conn entries.
+ * Handles Version 0 & 1
+ */
+static void ip_vs_process_message(struct net *net, __u8 *buffer,
+ const size_t buflen)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
+ __u8 *p, *msg_end;
+ int i, nr_conns;
+
+ if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
+ IP_VS_DBG(2, "BACKUP, message header too short\n");
+ return;
+ }
+ /* Convert size back to host byte order */
+ m2->size = ntohs(m2->size);
+
+ if (buflen != m2->size) {
+ IP_VS_DBG(2, "BACKUP, bogus message size\n");
+ return;
+ }
+ /* SyncID sanity check */
+ if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+ IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+ return;
+ }
+ /* Handle version 1 message */
+ if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
+ && (m2->spare == 0)) {
+
+ msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
+ nr_conns = m2->nr_conns;
+
+ for (i=0; i<nr_conns; i++) {
+ union ip_vs_sync_conn *s;
+ unsigned size;
+ int retc;
+
+ p = msg_end;
+ if (p + sizeof(s->v4) > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
+ return;
}
- cp = ip_vs_conn_new(&param,
- (union nf_inet_addr *)&s->daddr,
- s->dport, flags, dest);
- if (dest)
- atomic_dec(&dest->refcnt);
- if (!cp) {
- pr_err("ip_vs_conn_new failed\n");
+ s = (union ip_vs_sync_conn *)p;
+ size = ntohs(s->v4.ver_size) & SVER_MASK;
+ msg_end = p + size;
+ /* Basic sanity checks */
+ if (msg_end > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
return;
}
- } else if (!cp->dest) {
- dest = ip_vs_try_bind_dest(cp);
- if (dest)
- atomic_dec(&dest->refcnt);
- } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
- (cp->state != state)) {
- /* update active/inactive flag for the connection */
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_TCP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags |= IP_VS_CONN_F_INACTIVE;
- } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state == IP_VS_TCP_S_ESTABLISHED)) {
- atomic_inc(&dest->activeconns);
- atomic_dec(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+ ntohs(s->v4.ver_size) >> SVER_SHIFT);
+ return;
}
- } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
- (cp->state != state)) {
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_SCTP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ /* Process a single sync_conn */
+ retc = ip_vs_proc_sync_conn(net, p, msg_end);
+ if (retc < 0) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+ retc);
+ return;
}
+ /* Make sure we have 32 bit alignment */
+ msg_end = p + ((size + 3) & ~3);
}
-
- if (opt)
- memcpy(&cp->in_seq, opt, sizeof(*opt));
- atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
- cp->state = state;
- cp->old_state = cp->state;
- /*
- * We can not recover the right timeout for templates
- * in all cases, we can not find the right fwmark
- * virtual service. If needed, we can do it for
- * non-fwmark persistent services.
- */
- if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
- cp->timeout = pp->timeout_table[state];
- else
- cp->timeout = (3*60*HZ);
- ip_vs_conn_put(cp);
+ } else {
+ /* Old type of message */
+ ip_vs_process_message_v0(net, buffer, buflen);
+ return;
}
}
@@ -511,8 +1181,10 @@ static int set_mcast_if(struct sock *sk, char *ifname)
{
struct net_device *dev;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
@@ -531,30 +1203,33 @@ static int set_mcast_if(struct sock *sk, char *ifname)
* Set the maximum length of sync message according to the
* specified interface's MTU.
*/
-static int set_sync_mesg_maxlen(int sync_state)
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct net_device *dev;
int num;
if (sync_state == IP_VS_STATE_MASTER) {
- if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
+ dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+ if (!dev)
return -ENODEV;
num = (dev->mtu - sizeof(struct iphdr) -
sizeof(struct udphdr) -
SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
- sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+ ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
IP_VS_DBG(7, "setting the maximum length of sync sending "
- "message %d.\n", sync_send_mesg_maxlen);
+ "message %d.\n", ipvs->send_mesg_maxlen);
} else if (sync_state == IP_VS_STATE_BACKUP) {
- if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
+ dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+ if (!dev)
return -ENODEV;
- sync_recv_mesg_maxlen = dev->mtu -
+ ipvs->recv_mesg_maxlen = dev->mtu -
sizeof(struct iphdr) - sizeof(struct udphdr);
IP_VS_DBG(7, "setting the maximum length of sync receiving "
- "message %d.\n", sync_recv_mesg_maxlen);
+ "message %d.\n", ipvs->recv_mesg_maxlen);
}
return 0;
@@ -569,6 +1244,7 @@ static int set_sync_mesg_maxlen(int sync_state)
static int
join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
{
+ struct net *net = sock_net(sk);
struct ip_mreqn mreq;
struct net_device *dev;
int ret;
@@ -576,7 +1252,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
memset(&mreq, 0, sizeof(mreq));
memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -593,11 +1270,13 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
static int bind_mcastif_addr(struct socket *sock, char *ifname)
{
+ struct net *net = sock_net(sock->sk);
struct net_device *dev;
__be32 addr;
struct sockaddr_in sin;
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
return -ENODEV;
addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -619,19 +1298,20 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
/*
* Set up sending multicast socket over UDP
*/
-static struct socket * make_send_sock(void)
+static struct socket *make_send_sock(struct net *net)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct socket *sock;
int result;
/* First create a socket */
- result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
- result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
+ result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
if (result < 0) {
pr_err("Error setting outbound mcast interface\n");
goto error;
@@ -640,7 +1320,7 @@ static struct socket * make_send_sock(void)
set_mcast_loop(sock->sk, 0);
set_mcast_ttl(sock->sk, 1);
- result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
+ result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
if (result < 0) {
pr_err("Error binding address of the mcast interface\n");
goto error;
@@ -664,13 +1344,14 @@ static struct socket * make_send_sock(void)
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket * make_receive_sock(void)
+static struct socket *make_receive_sock(struct net *net)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct socket *sock;
int result;
/* First create a socket */
- result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1);
if (result < 0) {
pr_err("Error during creation of socket; terminating\n");
return ERR_PTR(result);
@@ -689,7 +1370,7 @@ static struct socket * make_receive_sock(void)
/* join the multicast group */
result = join_mcast_group(sock->sk,
(struct in_addr *) &mcast_addr.sin_addr,
- ip_vs_backup_mcast_ifn);
+ ipvs->backup_mcast_ifn);
if (result < 0) {
pr_err("Error joining to the multicast group\n");
goto error;
@@ -760,20 +1441,21 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
static int sync_thread_master(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
struct ip_vs_sync_buff *sb;
pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
"syncid = %d\n",
- ip_vs_master_mcast_ifn, ip_vs_master_syncid);
+ ipvs->master_mcast_ifn, ipvs->master_syncid);
while (!kthread_should_stop()) {
- while ((sb = sb_dequeue())) {
+ while ((sb = sb_dequeue(ipvs))) {
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
ip_vs_sync_buff_release(sb);
}
- /* check if entries stay in curr_sb for 2 seconds */
- sb = get_curr_sync_buff(2 * HZ);
+ /* check if entries stay in ipvs->sync_buff for 2 seconds */
+ sb = get_curr_sync_buff(ipvs, 2 * HZ);
if (sb) {
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
ip_vs_sync_buff_release(sb);
@@ -783,14 +1465,13 @@ static int sync_thread_master(void *data)
}
/* clean up the sync_buff queue */
- while ((sb=sb_dequeue())) {
+ while ((sb = sb_dequeue(ipvs)))
ip_vs_sync_buff_release(sb);
- }
/* clean up the current sync_buff */
- if ((sb = get_curr_sync_buff(0))) {
+ sb = get_curr_sync_buff(ipvs, 0);
+ if (sb)
ip_vs_sync_buff_release(sb);
- }
/* release the sending multicast socket */
sock_release(tinfo->sock);
@@ -803,11 +1484,12 @@ static int sync_thread_master(void *data)
static int sync_thread_backup(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
"syncid = %d\n",
- ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+ ipvs->backup_mcast_ifn, ipvs->backup_syncid);
while (!kthread_should_stop()) {
wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -817,7 +1499,7 @@ static int sync_thread_backup(void *data)
/* do we have data now? */
while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
len = ip_vs_receive(tinfo->sock, tinfo->buf,
- sync_recv_mesg_maxlen);
+ ipvs->recv_mesg_maxlen);
if (len <= 0) {
pr_err("receiving message error\n");
break;
@@ -826,7 +1508,7 @@ static int sync_thread_backup(void *data)
/* disable bottom half, because it accesses the data
shared by softirq while getting/creating conns */
local_bh_disable();
- ip_vs_process_message(tinfo->buf, len);
+ ip_vs_process_message(tinfo->net, tinfo->buf, len);
local_bh_enable();
}
}
@@ -840,41 +1522,42 @@ static int sync_thread_backup(void *data)
}
-int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
{
struct ip_vs_sync_thread_data *tinfo;
struct task_struct **realtask, *task;
struct socket *sock;
+ struct netns_ipvs *ipvs = net_ipvs(net);
char *name, *buf = NULL;
int (*threadfn)(void *data);
int result = -ENOMEM;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
- sizeof(struct ip_vs_sync_conn));
+ sizeof(struct ip_vs_sync_conn_v0));
if (state == IP_VS_STATE_MASTER) {
- if (sync_master_thread)
+ if (ipvs->master_thread)
return -EEXIST;
- strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
- sizeof(ip_vs_master_mcast_ifn));
- ip_vs_master_syncid = syncid;
- realtask = &sync_master_thread;
- name = "ipvs_syncmaster";
+ strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->master_mcast_ifn));
+ ipvs->master_syncid = syncid;
+ realtask = &ipvs->master_thread;
+ name = "ipvs_master:%d";
threadfn = sync_thread_master;
- sock = make_send_sock();
+ sock = make_send_sock(net);
} else if (state == IP_VS_STATE_BACKUP) {
- if (sync_backup_thread)
+ if (ipvs->backup_thread)
return -EEXIST;
- strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
- sizeof(ip_vs_backup_mcast_ifn));
- ip_vs_backup_syncid = syncid;
- realtask = &sync_backup_thread;
- name = "ipvs_syncbackup";
+ strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->backup_mcast_ifn));
+ ipvs->backup_syncid = syncid;
+ realtask = &ipvs->backup_thread;
+ name = "ipvs_backup:%d";
threadfn = sync_thread_backup;
- sock = make_receive_sock();
+ sock = make_receive_sock(net);
} else {
return -EINVAL;
}
@@ -884,9 +1567,9 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
goto out;
}
- set_sync_mesg_maxlen(state);
+ set_sync_mesg_maxlen(net, state);
if (state == IP_VS_STATE_BACKUP) {
- buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
+ buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
if (!buf)
goto outsocket;
}
@@ -895,10 +1578,11 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
if (!tinfo)
goto outbuf;
+ tinfo->net = net;
tinfo->sock = sock;
tinfo->buf = buf;
- task = kthread_run(threadfn, tinfo, name);
+ task = kthread_run(threadfn, tinfo, name, ipvs->gen);
if (IS_ERR(task)) {
result = PTR_ERR(task);
goto outtinfo;
@@ -906,7 +1590,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
/* mark as active */
*realtask = task;
- ip_vs_sync_state |= state;
+ ipvs->sync_state |= state;
/* increase the module use count */
ip_vs_use_count_inc();
@@ -924,16 +1608,18 @@ out:
}
-int stop_sync_thread(int state)
+int stop_sync_thread(struct net *net, int state)
{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
if (state == IP_VS_STATE_MASTER) {
- if (!sync_master_thread)
+ if (!ipvs->master_thread)
return -ESRCH;
pr_info("stopping master sync thread %d ...\n",
- task_pid_nr(sync_master_thread));
+ task_pid_nr(ipvs->master_thread));
/*
* The lock synchronizes with sb_queue_tail(), so that we don't
@@ -941,21 +1627,21 @@ int stop_sync_thread(int state)
* progress of stopping the master sync daemon.
*/
- spin_lock_bh(&ip_vs_sync_lock);
- ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
- spin_unlock_bh(&ip_vs_sync_lock);
- kthread_stop(sync_master_thread);
- sync_master_thread = NULL;
+ spin_lock_bh(&ipvs->sync_lock);
+ ipvs->sync_state &= ~IP_VS_STATE_MASTER;
+ spin_unlock_bh(&ipvs->sync_lock);
+ kthread_stop(ipvs->master_thread);
+ ipvs->master_thread = NULL;
} else if (state == IP_VS_STATE_BACKUP) {
- if (!sync_backup_thread)
+ if (!ipvs->backup_thread)
return -ESRCH;
pr_info("stopping backup sync thread %d ...\n",
- task_pid_nr(sync_backup_thread));
+ task_pid_nr(ipvs->backup_thread));
- ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
- kthread_stop(sync_backup_thread);
- sync_backup_thread = NULL;
+ ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
+ kthread_stop(ipvs->backup_thread);
+ ipvs->backup_thread = NULL;
} else {
return -EINVAL;
}
@@ -965,3 +1651,42 @@ int stop_sync_thread(int state)
return 0;
}
+
+/*
+ * Initialize data struct for each netns
+ */
+static int __net_init __ip_vs_sync_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->sync_queue);
+ spin_lock_init(&ipvs->sync_lock);
+ spin_lock_init(&ipvs->sync_buff_lock);
+
+ ipvs->sync_mcast_addr.sin_family = AF_INET;
+ ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
+ ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
+ return 0;
+}
+
+static void __ip_vs_sync_cleanup(struct net *net)
+{
+ stop_sync_thread(net, IP_VS_STATE_MASTER);
+ stop_sync_thread(net, IP_VS_STATE_BACKUP);
+}
+
+static struct pernet_operations ipvs_sync_ops = {
+ .init = __ip_vs_sync_init,
+ .exit = __ip_vs_sync_cleanup,
+};
+
+
+int __init ip_vs_sync_init(void)
+{
+ return register_pernet_subsys(&ipvs_sync_ops);
+}
+
+void ip_vs_sync_cleanup(void)
+{
+ unregister_pernet_subsys(&ipvs_sync_ops);
+}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index bbddfdb10db..bc1bfc48a17 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -27,22 +27,6 @@
#include <net/ip_vs.h>
-
-static inline unsigned int
-ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
-{
- /*
- * We think the overhead of processing active connections is 256
- * times higher than that of inactive connections in average. (This
- * 256 times might not be accurate, we will change it later) We
- * use the following formula to estimate the overhead now:
- * dest->activeconns*256 + dest->inactconns
- */
- return (atomic_read(&dest->activeconns) << 8) +
- atomic_read(&dest->inactconns);
-}
-
-
/*
* Weighted Least Connection scheduling
*/
@@ -71,11 +55,11 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
- loh = ip_vs_wlc_dest_overhead(least);
+ loh = ip_vs_dest_conn_overhead(least);
goto nextstage;
}
}
- IP_VS_ERR_RL("WLC: no destination available\n");
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
/*
@@ -85,7 +69,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
list_for_each_entry_continue(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = ip_vs_wlc_dest_overhead(dest);
+ doh = ip_vs_dest_conn_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 30db633f88f..1ef41f50723 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -147,8 +147,9 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
if (mark->cl == mark->cl->next) {
/* no dest entry */
- IP_VS_ERR_RL("WRR: no destination available: "
- "no destinations present\n");
+ ip_vs_scheduler_err(svc,
+ "no destination available: "
+ "no destinations present");
dest = NULL;
goto out;
}
@@ -162,8 +163,8 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
*/
if (mark->cw == 0) {
mark->cl = &svc->destinations;
- IP_VS_ERR_RL("WRR: no destination "
- "available\n");
+ ip_vs_scheduler_err(svc,
+ "no destination available");
dest = NULL;
goto out;
}
@@ -185,8 +186,9 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* back to the start, and no dest is found.
It is only possible when all dests are OVERLOADED */
dest = NULL;
- IP_VS_ERR_RL("WRR: no destination available: "
- "all destinations are overloaded\n");
+ ip_vs_scheduler_err(svc,
+ "no destination available: "
+ "all destinations are overloaded");
goto out;
}
}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 5325a3fbe4a..6132b213edd 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -43,6 +43,13 @@
#include <net/ip_vs.h>
+enum {
+ IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */
+ IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */
+ IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to
+ * local
+ */
+};
/*
* Destination cache to speed up outgoing route lookup
@@ -77,11 +84,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
return dst;
}
-/*
- * Get route to destination or remote server
- * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
- * &4=Allow redirect from remote daddr to local
- */
+/* Get route to destination or remote server */
static struct rtable *
__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
__be32 daddr, u32 rtos, int rt_mode)
@@ -95,12 +98,8 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
spin_lock(&dest->dst_lock);
if (!(rt = (struct rtable *)
__ip_vs_dst_check(dest, rtos))) {
- struct flowi fl = {
- .fl4_dst = dest->addr.ip,
- .fl4_tos = rtos,
- };
-
- if (ip_route_output_key(net, &rt, &fl)) {
+ rt = ip_route_output(net, dest->addr.ip, 0, rtos, 0);
+ if (IS_ERR(rt)) {
spin_unlock(&dest->dst_lock);
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
&dest->addr.ip);
@@ -113,12 +112,8 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
}
spin_unlock(&dest->dst_lock);
} else {
- struct flowi fl = {
- .fl4_dst = daddr,
- .fl4_tos = rtos,
- };
-
- if (ip_route_output_key(net, &rt, &fl)) {
+ rt = ip_route_output(net, daddr, 0, rtos, 0);
+ if (IS_ERR(rt)) {
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
&daddr);
return NULL;
@@ -126,15 +121,16 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
}
local = rt->rt_flags & RTCF_LOCAL;
- if (!((local ? 1 : 2) & rt_mode)) {
+ if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
+ rt_mode)) {
IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
(rt->rt_flags & RTCF_LOCAL) ?
"local":"non-local", &rt->rt_dst);
ip_rt_put(rt);
return NULL;
}
- if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
- ort->rt_flags & RTCF_LOCAL)) {
+ if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) {
IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
"requires NAT method, dest: %pI4\n",
&ip_hdr(skb)->daddr, &rt->rt_dst);
@@ -169,15 +165,15 @@ __ip_vs_reroute_locally(struct sk_buff *skb)
return 0;
refdst_drop(orefdst);
} else {
- struct flowi fl = {
- .fl4_dst = iph->daddr,
- .fl4_src = iph->saddr,
- .fl4_tos = RT_TOS(iph->tos),
- .mark = skb->mark,
+ struct flowi4 fl4 = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowi4_tos = RT_TOS(iph->tos),
+ .flowi4_mark = skb->mark,
};
- struct rtable *rt;
- if (ip_route_output_key(net, &rt, &fl))
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
return 0;
if (!(rt->rt_flags & RTCF_LOCAL)) {
ip_rt_put(rt);
@@ -202,22 +198,27 @@ __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
struct in6_addr *ret_saddr, int do_xfrm)
{
struct dst_entry *dst;
- struct flowi fl = {
- .fl6_dst = *daddr,
+ struct flowi6 fl6 = {
+ .daddr = *daddr,
};
- dst = ip6_route_output(net, NULL, &fl);
+ dst = ip6_route_output(net, NULL, &fl6);
if (dst->error)
goto out_err;
if (!ret_saddr)
return dst;
- if (ipv6_addr_any(&fl.fl6_src) &&
+ if (ipv6_addr_any(&fl6.saddr) &&
ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
- &fl.fl6_dst, 0, &fl.fl6_src) < 0)
- goto out_err;
- if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+ &fl6.daddr, 0, &fl6.saddr) < 0)
goto out_err;
- ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+ if (do_xfrm) {
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ dst = NULL;
+ goto out_err;
+ }
+ }
+ ipv6_addr_copy(ret_saddr, &fl6.saddr);
return dst;
out_err:
@@ -384,13 +385,14 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
- RT_TOS(iph->tos), 2)))
+ if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos),
+ IP_VS_RT_MODE_NON_LOCAL)))
goto tx_error_icmp;
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+ if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
+ !skb_is_gso(skb)) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -443,7 +445,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
+ if (skb->len > mtu && !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -512,7 +514,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
}
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos), 1|2|4)))
+ RT_TOS(iph->tos),
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR)))
goto tx_error_icmp;
local = rt->rt_flags & RTCF_LOCAL;
/*
@@ -543,7 +548,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+ if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
+ !skb_is_gso(skb)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
"ip_vs_nat_xmit(): frag needed for");
@@ -658,7 +664,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
+ if (skb->len > mtu && !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -754,7 +760,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(tos), 1|2)))
+ RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL)))
goto tx_error_icmp;
if (rt->rt_flags & RTCF_LOCAL) {
ip_rt_put(rt);
@@ -773,8 +780,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
df |= (old_iph->frag_off & htons(IP_DF));
- if ((old_iph->frag_off & htons(IP_DF))
- && mtu < ntohs(old_iph->tot_len)) {
+ if ((old_iph->frag_off & htons(IP_DF) &&
+ mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error_put;
@@ -886,7 +893,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
- if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+ if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) &&
+ !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -982,7 +990,9 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos), 1|2)))
+ RT_TOS(iph->tos),
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL)))
goto tx_error_icmp;
if (rt->rt_flags & RTCF_LOCAL) {
ip_rt_put(rt);
@@ -991,7 +1001,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
+ if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu &&
+ !skb_is_gso(skb)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
ip_rt_put(rt);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -1125,7 +1136,10 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
+ RT_TOS(ip_hdr(skb)->tos),
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR)))
goto tx_error_icmp;
local = rt->rt_flags & RTCF_LOCAL;
@@ -1158,7 +1172,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
+ if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
+ !skb_is_gso(skb)) {
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error_put;
@@ -1272,7 +1287,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* MTU checking */
mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
+ if (skb->len > mtu && !skb_is_gso(skb)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
new file mode 100644
index 00000000000..4e99cca6161
--- /dev/null
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -0,0 +1,82 @@
+/*
+ * broadcast connection tracking helper
+ *
+ * (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <net/route.h>
+#include <linux/inetdevice.h>
+#include <linux/skbuff.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+int nf_conntrack_broadcast_help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int timeout)
+{
+ struct nf_conntrack_expect *exp;
+ struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt = skb_rtable(skb);
+ struct in_device *in_dev;
+ struct nf_conn_help *help = nfct_help(ct);
+ __be32 mask = 0;
+
+ /* we're only interested in locally generated packets */
+ if (skb->sk == NULL)
+ goto out;
+ if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
+ goto out;
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ goto out;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (in_dev != NULL) {
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_broadcast == iph->daddr) {
+ mask = ifa->ifa_mask;
+ break;
+ }
+ } endfor_ifa(in_dev);
+ }
+ rcu_read_unlock();
+
+ if (mask == 0)
+ goto out;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL)
+ goto out;
+
+ exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
+
+ exp->mask.src.u3.ip = mask;
+ exp->mask.src.u.udp.port = htons(0xFFFF);
+
+ exp->expectfn = NULL;
+ exp->flags = NF_CT_EXPECT_PERMANENT;
+ exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
+ exp->helper = NULL;
+
+ nf_ct_expect_related(exp);
+ nf_ct_expect_put(exp);
+
+ nf_ct_refresh(ct, skb, timeout * HZ);
+out:
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index e61511929c6..2f454efa1a8 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -43,6 +43,7 @@
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
@@ -282,6 +283,11 @@ EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
static void death_by_timeout(unsigned long ul_conntrack)
{
struct nf_conn *ct = (void *)ul_conntrack;
+ struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp && tstamp->stop == 0)
+ tstamp->stop = ktime_to_ns(ktime_get_real());
if (!test_bit(IPS_DYING_BIT, &ct->status) &&
unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
@@ -419,6 +425,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
+ struct nf_conn_tstamp *tstamp;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
@@ -486,8 +493,16 @@ __nf_conntrack_confirm(struct sk_buff *skb)
ct->timeout.expires += jiffies;
add_timer(&ct->timeout);
atomic_inc(&ct->ct_general.use);
- set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ ct->status |= IPS_CONFIRMED;
+
+ /* set conntrack timestamp, if enabled. */
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp) {
+ if (skb->tstamp.tv64 == 0)
+ __net_timestamp((struct sk_buff *)skb);
+ tstamp->start = ktime_to_ns(skb->tstamp);
+ }
/* Since the lookup is lockless, hash insertion must be done after
* starting the timer and setting the CONFIRMED bit. The RCU barriers
* guarantee that no other CPU can find the conntrack before the above
@@ -655,7 +670,8 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
* and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
*/
memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
- sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
+ offsetof(struct nf_conn, proto) -
+ offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
spin_lock_init(&ct->lock);
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
@@ -745,6 +761,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
@@ -942,8 +959,15 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
- if (tmpl)
- nf_ct_put(tmpl);
+ if (tmpl) {
+ /* Special case: we have to repeat this hook, assign the
+ * template again to this packet. We assume that this packet
+ * has no conntrack assigned. This is used by nf_ct_tcp. */
+ if (ret == NF_REPEAT)
+ skb->nfct = (struct nf_conntrack *)tmpl;
+ else
+ nf_ct_put(tmpl);
+ }
return ret;
}
@@ -1185,6 +1209,11 @@ struct __nf_ct_flush_report {
static int kill_report(struct nf_conn *i, void *data)
{
struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
+ struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(i);
+ if (tstamp && tstamp->stop == 0)
+ tstamp->stop = ktime_to_ns(ktime_get_real());
/* If we fail to deliver the event, death_by_timeout() will retry */
if (nf_conntrack_event_report(IPCT_DESTROY, i,
@@ -1201,9 +1230,9 @@ static int kill_all(struct nf_conn *i, void *data)
return 1;
}
-void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)
+void nf_ct_free_hashtable(void *hash, unsigned int size)
{
- if (vmalloced)
+ if (is_vmalloc_addr(hash))
vfree(hash);
else
free_pages((unsigned long)hash,
@@ -1270,8 +1299,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
goto i_see_dead_people;
}
- nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
- net->ct.htable_size);
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
nf_conntrack_ecache_fini(net);
nf_conntrack_acct_fini(net);
nf_conntrack_expect_fini(net);
@@ -1300,21 +1328,18 @@ void nf_conntrack_cleanup(struct net *net)
}
}
-void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls)
+void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
struct hlist_nulls_head *hash;
unsigned int nr_slots, i;
size_t sz;
- *vmalloced = 0;
-
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
sz = nr_slots * sizeof(struct hlist_nulls_head);
hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
get_order(sz));
if (!hash) {
- *vmalloced = 1;
printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL);
@@ -1330,7 +1355,7 @@ EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
{
- int i, bucket, vmalloced, old_vmalloced;
+ int i, bucket;
unsigned int hashsize, old_size;
struct hlist_nulls_head *hash, *old_hash;
struct nf_conntrack_tuple_hash *h;
@@ -1347,7 +1372,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
if (!hashsize)
return -EINVAL;
- hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1);
+ hash = nf_ct_alloc_hashtable(&hashsize, 1);
if (!hash)
return -ENOMEM;
@@ -1369,15 +1394,13 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
}
}
old_size = init_net.ct.htable_size;
- old_vmalloced = init_net.ct.hash_vmalloc;
old_hash = init_net.ct.hash;
init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
- init_net.ct.hash_vmalloc = vmalloced;
init_net.ct.hash = hash;
spin_unlock_bh(&nf_conntrack_lock);
- nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
+ nf_ct_free_hashtable(old_hash, old_size);
return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
@@ -1490,8 +1513,7 @@ static int nf_conntrack_init_net(struct net *net)
}
net->ct.htable_size = nf_conntrack_htable_size;
- net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
- &net->ct.hash_vmalloc, 1);
+ net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
if (!net->ct.hash) {
ret = -ENOMEM;
printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
@@ -1503,6 +1525,9 @@ static int nf_conntrack_init_net(struct net *net)
ret = nf_conntrack_acct_init(net);
if (ret < 0)
goto err_acct;
+ ret = nf_conntrack_tstamp_init(net);
+ if (ret < 0)
+ goto err_tstamp;
ret = nf_conntrack_ecache_init(net);
if (ret < 0)
goto err_ecache;
@@ -1510,12 +1535,13 @@ static int nf_conntrack_init_net(struct net *net)
return 0;
err_ecache:
+ nf_conntrack_tstamp_fini(net);
+err_tstamp:
nf_conntrack_acct_fini(net);
err_acct:
nf_conntrack_expect_fini(net);
err_expect:
- nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
- net->ct.htable_size);
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
err_hash:
kmem_cache_destroy(net->ct.nf_conntrack_cachep);
err_cache:
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 5702de35e2b..63a1b915a7e 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -63,6 +63,9 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct)
* this does not harm and it happens very rarely. */
unsigned long missed = e->missed;
+ if (!((events | missed) & e->ctmask))
+ goto out_unlock;
+
ret = notify->fcn(events | missed, &item);
if (unlikely(ret < 0 || missed)) {
spin_lock_bh(&ct->lock);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index a20fb0bd1ef..cd1e8e0970f 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -319,7 +319,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
const struct nf_conntrack_expect_policy *p;
unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
- atomic_inc(&exp->use);
+ /* two references : one for hash insert, one for the timer */
+ atomic_add(2, &exp->use);
if (master_help) {
hlist_add_head(&exp->lnode, &master_help->expectations);
@@ -333,12 +334,14 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
(unsigned long)exp);
if (master_help) {
- p = &master_help->helper->expect_policy[exp->class];
+ p = &rcu_dereference_protected(
+ master_help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ )->expect_policy[exp->class];
exp->timeout.expires = jiffies + p->timeout * HZ;
}
add_timer(&exp->timeout);
- atomic_inc(&exp->use);
NF_CT_STAT_INC(net, expect_create);
}
@@ -369,7 +372,10 @@ static inline int refresh_timer(struct nf_conntrack_expect *i)
if (!del_timer(&i->timeout))
return 0;
- p = &master_help->helper->expect_policy[i->class];
+ p = &rcu_dereference_protected(
+ master_help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ )->expect_policy[i->class];
i->timeout.expires = jiffies + p->timeout * HZ;
add_timer(&i->timeout);
return 1;
@@ -407,7 +413,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
}
/* Will be over limit? */
if (master_help) {
- p = &master_help->helper->expect_policy[expect->class];
+ p = &rcu_dereference_protected(
+ master_help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ )->expect_policy[expect->class];
if (p->max_expected &&
master_help->expecting[expect->class] >= p->max_expected) {
evict_oldest_expect(master, expect);
@@ -478,7 +487,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
if (n)
return n;
}
@@ -491,11 +500,11 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
- head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
}
return head;
}
@@ -630,8 +639,7 @@ int nf_conntrack_expect_init(struct net *net)
}
net->ct.expect_count = 0;
- net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
- &net->ct.expect_vmalloc, 0);
+ net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
if (net->ct.expect_hash == NULL)
goto err1;
@@ -653,8 +661,7 @@ err3:
if (net_eq(net, &init_net))
kmem_cache_destroy(nf_ct_expect_cachep);
err2:
- nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
- nf_ct_expect_hsize);
+ nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
err1:
return err;
}
@@ -666,6 +673,5 @@ void nf_conntrack_expect_fini(struct net *net)
rcu_barrier(); /* Wait for call_rcu() before destroy */
kmem_cache_destroy(nf_ct_expect_cachep);
}
- nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
- nf_ct_expect_hsize);
+ nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
}
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index bd82450c193..80a23ed62bb 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -140,15 +140,16 @@ static void update_alloc_size(struct nf_ct_ext_type *type)
/* This assumes that extended areas in conntrack for the types
whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */
for (i = min; i <= max; i++) {
- t1 = nf_ct_ext_types[i];
+ t1 = rcu_dereference_protected(nf_ct_ext_types[i],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
if (!t1)
continue;
- t1->alloc_size = sizeof(struct nf_ct_ext)
- + ALIGN(sizeof(struct nf_ct_ext), t1->align)
- + t1->len;
+ t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) +
+ t1->len;
for (j = 0; j < NF_CT_EXT_NUM; j++) {
- t2 = nf_ct_ext_types[j];
+ t2 = rcu_dereference_protected(nf_ct_ext_types[j],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
if (t2 == NULL || t2 == t1 ||
(t2->flags & NF_CT_EXT_F_PREALLOC) == 0)
continue;
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index b969025cf82..533a183e666 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -714,7 +714,6 @@ static int callforward_do_filter(const union nf_inet_addr *src,
u_int8_t family)
{
const struct nf_afinfo *afinfo;
- struct flowi fl1, fl2;
int ret = 0;
/* rcu_read_lock()ed by nf_hook_slow() */
@@ -722,17 +721,20 @@ static int callforward_do_filter(const union nf_inet_addr *src,
if (!afinfo)
return 0;
- memset(&fl1, 0, sizeof(fl1));
- memset(&fl2, 0, sizeof(fl2));
-
switch (family) {
case AF_INET: {
+ struct flowi4 fl1, fl2;
struct rtable *rt1, *rt2;
- fl1.fl4_dst = src->ip;
- fl2.fl4_dst = dst->ip;
- if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) {
- if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) {
+ memset(&fl1, 0, sizeof(fl1));
+ fl1.daddr = src->ip;
+
+ memset(&fl2, 0, sizeof(fl2));
+ fl2.daddr = dst->ip;
+ if (!afinfo->route((struct dst_entry **)&rt1,
+ flowi4_to_flowi(&fl1))) {
+ if (!afinfo->route((struct dst_entry **)&rt2,
+ flowi4_to_flowi(&fl2))) {
if (rt1->rt_gateway == rt2->rt_gateway &&
rt1->dst.dev == rt2->dst.dev)
ret = 1;
@@ -745,12 +747,18 @@ static int callforward_do_filter(const union nf_inet_addr *src,
#if defined(CONFIG_NF_CONNTRACK_IPV6) || \
defined(CONFIG_NF_CONNTRACK_IPV6_MODULE)
case AF_INET6: {
+ struct flowi6 fl1, fl2;
struct rt6_info *rt1, *rt2;
- memcpy(&fl1.fl6_dst, src, sizeof(fl1.fl6_dst));
- memcpy(&fl2.fl6_dst, dst, sizeof(fl2.fl6_dst));
- if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) {
- if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) {
+ memset(&fl1, 0, sizeof(fl1));
+ ipv6_addr_copy(&fl1.daddr, &src->in6);
+
+ memset(&fl2, 0, sizeof(fl2));
+ ipv6_addr_copy(&fl2.daddr, &dst->in6);
+ if (!afinfo->route((struct dst_entry **)&rt1,
+ flowi6_to_flowi(&fl1))) {
+ if (!afinfo->route((struct dst_entry **)&rt2,
+ flowi6_to_flowi(&fl2))) {
if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway,
sizeof(rt1->rt6i_gateway)) &&
rt1->dst.dev == rt2->dst.dev)
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 59e1a4cd4e8..1bdfea35795 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -33,7 +33,6 @@ static DEFINE_MUTEX(nf_ct_helper_mutex);
static struct hlist_head *nf_ct_helper_hash __read_mostly;
static unsigned int nf_ct_helper_hsize __read_mostly;
static unsigned int nf_ct_helper_count __read_mostly;
-static int nf_ct_helper_vmalloc;
/* Stupid hash, but collision free for the default registrations of the
@@ -158,7 +157,10 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i,
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
struct nf_conn_help *help = nfct_help(ct);
- if (help && help->helper == me) {
+ if (help && rcu_dereference_protected(
+ help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ ) == me) {
nf_conntrack_event(IPCT_HELPER, ct);
rcu_assign_pointer(help->helper, NULL);
}
@@ -210,7 +212,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
hlist_for_each_entry_safe(exp, n, next,
&net->ct.expect_hash[i], hnode) {
struct nf_conn_help *help = nfct_help(exp->master);
- if ((help->helper == me || exp->helper == me) &&
+ if ((rcu_dereference_protected(
+ help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ ) == me || exp->helper == me) &&
del_timer(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
@@ -261,8 +266,7 @@ int nf_conntrack_helper_init(void)
int err;
nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
- nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize,
- &nf_ct_helper_vmalloc, 0);
+ nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
if (!nf_ct_helper_hash)
return -ENOMEM;
@@ -273,14 +277,12 @@ int nf_conntrack_helper_init(void)
return 0;
err1:
- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc,
- nf_ct_helper_hsize);
+ nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
return err;
}
void nf_conntrack_helper_fini(void)
{
nf_ct_extend_unregister(&helper_extend);
- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc,
- nf_ct_helper_hsize);
+ nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
}
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index aadde018a07..4c8f30a3d6d 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -18,14 +18,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/if_addr.h>
#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <net/route.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_helper.h>
@@ -40,75 +33,26 @@ MODULE_ALIAS("ip_conntrack_netbios_ns");
MODULE_ALIAS_NFCT_HELPER("netbios_ns");
static unsigned int timeout __read_mostly = 3;
-module_param(timeout, uint, 0400);
+module_param(timeout, uint, S_IRUSR);
MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
-static int help(struct sk_buff *skb, unsigned int protoff,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo)
-{
- struct nf_conntrack_expect *exp;
- struct iphdr *iph = ip_hdr(skb);
- struct rtable *rt = skb_rtable(skb);
- struct in_device *in_dev;
- __be32 mask = 0;
-
- /* we're only interested in locally generated packets */
- if (skb->sk == NULL)
- goto out;
- if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
- goto out;
- if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- goto out;
-
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(rt->dst.dev);
- if (in_dev != NULL) {
- for_primary_ifa(in_dev) {
- if (ifa->ifa_broadcast == iph->daddr) {
- mask = ifa->ifa_mask;
- break;
- }
- } endfor_ifa(in_dev);
- }
- rcu_read_unlock();
-
- if (mask == 0)
- goto out;
-
- exp = nf_ct_expect_alloc(ct);
- if (exp == NULL)
- goto out;
-
- exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- exp->tuple.src.u.udp.port = htons(NMBD_PORT);
-
- exp->mask.src.u3.ip = mask;
- exp->mask.src.u.udp.port = htons(0xFFFF);
-
- exp->expectfn = NULL;
- exp->flags = NF_CT_EXPECT_PERMANENT;
- exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
- exp->helper = NULL;
-
- nf_ct_expect_related(exp);
- nf_ct_expect_put(exp);
-
- nf_ct_refresh(ct, skb, timeout * HZ);
-out:
- return NF_ACCEPT;
-}
-
static struct nf_conntrack_expect_policy exp_policy = {
.max_expected = 1,
};
+static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+}
+
static struct nf_conntrack_helper helper __read_mostly = {
.name = "netbios-ns",
- .tuple.src.l3num = AF_INET,
+ .tuple.src.l3num = NFPROTO_IPV4,
.tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT),
.tuple.dst.protonum = IPPROTO_UDP,
.me = THIS_MODULE,
- .help = help,
+ .help = netbios_ns_help,
.expect_policy = &exp_policy,
};
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2b7eef37875..30bf8a167fc 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -42,6 +42,7 @@
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
#ifdef CONFIG_NF_NAT_NEEDED
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_protocol.h>
@@ -230,6 +231,33 @@ nla_put_failure:
return -1;
}
+static int
+ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nlattr *nest_count;
+ const struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (!tstamp)
+ return 0;
+
+ nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED);
+ if (!nest_count)
+ goto nla_put_failure;
+
+ NLA_PUT_BE64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start));
+ if (tstamp->stop != 0) {
+ NLA_PUT_BE64(skb, CTA_TIMESTAMP_STOP,
+ cpu_to_be64(tstamp->stop));
+ }
+ nla_nest_end(skb, nest_count);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
#ifdef CONFIG_NF_CONNTRACK_MARK
static inline int
ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
@@ -404,6 +432,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
ctnetlink_dump_timeout(skb, ct) < 0 ||
ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0 ||
ctnetlink_dump_protoinfo(skb, ct) < 0 ||
ctnetlink_dump_helpinfo(skb, ct) < 0 ||
ctnetlink_dump_mark(skb, ct) < 0 ||
@@ -471,6 +500,18 @@ ctnetlink_secctx_size(const struct nf_conn *ct)
}
static inline size_t
+ctnetlink_timestamp_size(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
+ return 0;
+ return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t));
+#else
+ return 0;
+#endif
+}
+
+static inline size_t
ctnetlink_nlmsg_size(const struct nf_conn *ct)
{
return NLMSG_ALIGN(sizeof(struct nfgenmsg))
@@ -481,6 +522,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
+ ctnetlink_counters_size(ct)
+ + ctnetlink_timestamp_size(ct)
+ nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ nla_total_size(0) /* CTA_PROTOINFO */
+ nla_total_size(0) /* CTA_HELP */
@@ -571,7 +613,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
if (events & (1 << IPCT_DESTROY)) {
if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
- ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+ ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0)
goto nla_put_failure;
} else {
if (ctnetlink_dump_timeout(skb, ct) < 0)
@@ -667,6 +710,7 @@ restart:
if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
IPCTNL_MSG_CT_NEW, ct) < 0) {
+ nf_conntrack_get(&ct->ct_general);
cb->args[1] = (unsigned long)ct;
goto out;
}
@@ -760,7 +804,7 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
static int
ctnetlink_parse_tuple(const struct nlattr * const cda[],
struct nf_conntrack_tuple *tuple,
- enum ctattr_tuple type, u_int8_t l3num)
+ enum ctattr_type type, u_int8_t l3num)
{
struct nlattr *tb[CTA_TUPLE_MAX+1];
int err;
@@ -924,7 +968,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
u16 zone;
int err;
- if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP)
+ if (nlh->nlmsg_flags & NLM_F_DUMP)
return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table,
ctnetlink_done);
@@ -1357,6 +1401,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
}
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
/* we must add conntrack extensions before confirmation. */
ct->status |= IPS_CONFIRMED;
@@ -1375,6 +1420,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
}
#endif
+ memset(&ct->proto, 0, sizeof(ct->proto));
if (cda[CTA_PROTOINFO]) {
err = ctnetlink_change_protoinfo(ct, cda);
if (err < 0)
@@ -1787,7 +1833,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
u16 zone;
int err;
- if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
return netlink_dump_start(ctnl, skb, nlh,
ctnetlink_exp_dump_table,
ctnetlink_exp_done);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index dc7bb74110d..5701c8dd783 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -166,6 +166,7 @@ static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto
int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
{
int ret = 0;
+ struct nf_conntrack_l3proto *old;
if (proto->l3proto >= AF_MAX)
return -EBUSY;
@@ -174,7 +175,9 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
return -EINVAL;
mutex_lock(&nf_ct_proto_mutex);
- if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) {
+ old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+ lockdep_is_held(&nf_ct_proto_mutex));
+ if (old != &nf_conntrack_l3proto_generic) {
ret = -EBUSY;
goto out_unlock;
}
@@ -201,7 +204,9 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
BUG_ON(proto->l3proto >= AF_MAX);
mutex_lock(&nf_ct_proto_mutex);
- BUG_ON(nf_ct_l3protos[proto->l3proto] != proto);
+ BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != proto);
rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
&nf_conntrack_l3proto_generic);
nf_ct_l3proto_unregister_sysctl(proto);
@@ -279,7 +284,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
mutex_lock(&nf_ct_proto_mutex);
if (!nf_ct_protos[l4proto->l3proto]) {
/* l3proto may be loaded latter. */
- struct nf_conntrack_l4proto **proto_array;
+ struct nf_conntrack_l4proto __rcu **proto_array;
int i;
proto_array = kmalloc(MAX_NF_CT_PROTO *
@@ -291,7 +296,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
}
for (i = 0; i < MAX_NF_CT_PROTO; i++)
- proto_array[i] = &nf_conntrack_l4proto_generic;
+ RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic);
/* Before making proto_array visible to lockless readers,
* we must make sure its content is committed to memory.
@@ -299,8 +304,10 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
smp_wmb();
nf_ct_protos[l4proto->l3proto] = proto_array;
- } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] !=
- &nf_conntrack_l4proto_generic) {
+ } else if (rcu_dereference_protected(
+ nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != &nf_conntrack_l4proto_generic) {
ret = -EBUSY;
goto out_unlock;
}
@@ -331,7 +338,10 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
BUG_ON(l4proto->l3proto >= PF_MAX);
mutex_lock(&nf_ct_proto_mutex);
- BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto);
+ BUG_ON(rcu_dereference_protected(
+ nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != l4proto);
rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
&nf_conntrack_l4proto_generic);
nf_ct_l4proto_unregister_sysctl(l4proto);
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 5292560d6d4..9ae57c57c50 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -452,6 +452,9 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
ct->proto.dccp.state = CT_DCCP_NONE;
+ ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
+ ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
+ ct->proto.dccp.handshake_seq = 0;
return true;
out_invalid:
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index c6049c2d5ea..6f4ee70f460 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -413,6 +413,7 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
test_bit(SCTP_CID_COOKIE_ACK, map))
return false;
+ memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
new_state = SCTP_CONNTRACK_MAX;
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
/* Don't need lock here: this conntrack not in circulation yet */
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 3fb2b73b24d..37bf94394be 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -227,11 +227,11 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
* sCL -> sIV
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
-/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
+/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
/*
* sSS -> sSR Standard open.
* sS2 -> sSR Simultaneous open
- * sSR -> sSR Retransmitted SYN/ACK.
+ * sSR -> sIG Retransmitted SYN/ACK, ignore it.
* sES -> sIG Late retransmitted SYN/ACK?
* sFW -> sIG Might be SYN/ACK answering ignored SYN
* sCW -> sIG
@@ -1066,9 +1066,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
BUG_ON(th == NULL);
/* Don't need lock here: this conntrack not in circulation yet */
- new_state
- = tcp_conntracks[0][get_conntrack_index(th)]
- [TCP_CONNTRACK_NONE];
+ new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
/* Invalid: delete conntrack */
if (new_state >= TCP_CONNTRACK_MAX) {
@@ -1077,6 +1075,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
}
if (new_state == TCP_CONNTRACK_SYN_SENT) {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
/* SYN packet */
ct->proto.tcp.seen[0].td_end =
segment_seq_plus_len(ntohl(th->seq), skb->len,
@@ -1088,11 +1087,11 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
ct->proto.tcp.seen[0].td_end;
tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
- ct->proto.tcp.seen[1].flags = 0;
} else if (nf_ct_tcp_loose == 0) {
/* Don't try to pick up connections. */
return false;
} else {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
/*
* We are in the middle of a connection,
* its history is lost for us.
@@ -1107,7 +1106,6 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
ct->proto.tcp.seen[0].td_maxend =
ct->proto.tcp.seen[0].td_end +
ct->proto.tcp.seen[0].td_maxwin;
- ct->proto.tcp.seen[0].td_scale = 0;
/* We assume SACK and liberal window checking to handle
* window scaling */
@@ -1116,13 +1114,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
IP_CT_TCP_FLAG_BE_LIBERAL;
}
- ct->proto.tcp.seen[1].td_end = 0;
- ct->proto.tcp.seen[1].td_maxend = 0;
- ct->proto.tcp.seen[1].td_maxwin = 0;
- ct->proto.tcp.seen[1].td_scale = 0;
-
/* tcp_packet will set them */
- ct->proto.tcp.state = TCP_CONNTRACK_NONE;
ct->proto.tcp.last_index = TCP_NONE_SET;
pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c
new file mode 100644
index 00000000000..6e545e26289
--- /dev/null
+++ b/net/netfilter/nf_conntrack_snmp.c
@@ -0,0 +1,77 @@
+/*
+ * SNMP service broadcast connection tracking helper
+ *
+ * (c) 2011 Jiri Olsa <jolsa@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/in.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+#define SNMP_PORT 161
+
+MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>");
+MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFCT_HELPER("snmp");
+
+static unsigned int timeout __read_mostly = 30;
+module_param(timeout, uint, S_IRUSR);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+int (*nf_nat_snmp_hook)(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo);
+EXPORT_SYMBOL_GPL(nf_nat_snmp_hook);
+
+static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ typeof(nf_nat_snmp_hook) nf_nat_snmp;
+
+ nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+
+ nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook);
+ if (nf_nat_snmp && ct->status & IPS_NAT_MASK)
+ return nf_nat_snmp(skb, protoff, ct, ctinfo);
+
+ return NF_ACCEPT;
+}
+
+static struct nf_conntrack_expect_policy exp_policy = {
+ .max_expected = 1,
+};
+
+static struct nf_conntrack_helper helper __read_mostly = {
+ .name = "snmp",
+ .tuple.src.l3num = NFPROTO_IPV4,
+ .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ .help = snmp_conntrack_help,
+ .expect_policy = &exp_policy,
+};
+
+static int __init nf_conntrack_snmp_init(void)
+{
+ exp_policy.timeout = timeout;
+ return nf_conntrack_helper_register(&helper);
+}
+
+static void __exit nf_conntrack_snmp_fini(void)
+{
+ nf_conntrack_helper_unregister(&helper);
+}
+
+module_init(nf_conntrack_snmp_init);
+module_exit(nf_conntrack_snmp_fini);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index b4d7f0f24b2..0ae14282588 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -29,6 +29,8 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <linux/rculist_nulls.h>
MODULE_LICENSE("GPL");
@@ -45,6 +47,7 @@ EXPORT_SYMBOL_GPL(print_tuple);
struct ct_iter_state {
struct seq_net_private p;
unsigned int bucket;
+ u_int64_t time_now;
};
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
@@ -56,7 +59,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
for (st->bucket = 0;
st->bucket < net->ct.htable_size;
st->bucket++) {
- n = rcu_dereference(net->ct.hash[st->bucket].first);
+ n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -69,13 +72,15 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= net->ct.htable_size)
return NULL;
}
- head = rcu_dereference(net->ct.hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(
+ &net->ct.hash[st->bucket]));
}
return head;
}
@@ -93,6 +98,9 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
+ struct ct_iter_state *st = seq->private;
+
+ st->time_now = ktime_to_ns(ktime_get_real());
rcu_read_lock();
return ct_get_idx(seq, *pos);
}
@@ -132,6 +140,34 @@ static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
}
#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+ struct ct_iter_state *st = s->private;
+ struct nf_conn_tstamp *tstamp;
+ s64 delta_time;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp) {
+ delta_time = st->time_now - tstamp->start;
+ if (delta_time > 0)
+ delta_time = div_s64(delta_time, NSEC_PER_SEC);
+ else
+ delta_time = 0;
+
+ return seq_printf(s, "delta-time=%llu ",
+ (unsigned long long)delta_time);
+ }
+ return 0;
+}
+#else
+static inline int
+ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+ return 0;
+}
+#endif
+
/* return 0 on success, 1 in case of error */
static int ct_seq_show(struct seq_file *s, void *v)
{
@@ -200,6 +236,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
goto release;
#endif
+ if (ct_show_delta_time(s, ct))
+ goto release;
+
if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
goto release;
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
new file mode 100644
index 00000000000..af7dd31af0a
--- /dev/null
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -0,0 +1,120 @@
+/*
+ * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+
+static int nf_ct_tstamp __read_mostly;
+
+module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
+MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table tstamp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_timestamp",
+ .data = &init_net.ct.sysctl_tstamp,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_ct_ext_type tstamp_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_tstamp),
+ .align = __alignof__(struct nf_conn_tstamp),
+ .id = NF_CT_EXT_TSTAMP,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ table[0].data = &net->ct.sysctl_tstamp;
+
+ net->ct.tstamp_sysctl_header = register_net_sysctl_table(net,
+ nf_net_netfilter_sysctl_path, table);
+ if (!net->ct.tstamp_sysctl_header) {
+ printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n");
+ goto out_register;
+ }
+ return 0;
+
+out_register:
+ kfree(table);
+out:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.tstamp_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.tstamp_sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+int nf_conntrack_tstamp_init(struct net *net)
+{
+ int ret;
+
+ net->ct.sysctl_tstamp = nf_ct_tstamp;
+
+ if (net_eq(net, &init_net)) {
+ ret = nf_ct_extend_register(&tstamp_extend);
+ if (ret < 0) {
+ printk(KERN_ERR "nf_ct_tstamp: Unable to register "
+ "extension\n");
+ goto out_extend_register;
+ }
+ }
+
+ ret = nf_conntrack_tstamp_init_sysctl(net);
+ if (ret < 0)
+ goto out_sysctl;
+
+ return 0;
+
+out_sysctl:
+ if (net_eq(net, &init_net))
+ nf_ct_extend_unregister(&tstamp_extend);
+out_extend_register:
+ return ret;
+}
+
+void nf_conntrack_tstamp_fini(struct net *net)
+{
+ nf_conntrack_tstamp_fini_sysctl(net);
+ if (net_eq(net, &init_net))
+ nf_ct_extend_unregister(&tstamp_extend);
+}
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index b07393eab88..20714edf6cd 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -85,6 +85,8 @@ EXPORT_SYMBOL(nf_log_unregister);
int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger)
{
+ if (pf >= ARRAY_SIZE(nf_loggers))
+ return -EINVAL;
mutex_lock(&nf_log_mutex);
if (__find_logger(pf, logger->name) == NULL) {
mutex_unlock(&nf_log_mutex);
@@ -98,6 +100,8 @@ EXPORT_SYMBOL(nf_log_bind_pf);
void nf_log_unbind_pf(u_int8_t pf)
{
+ if (pf >= ARRAY_SIZE(nf_loggers))
+ return;
mutex_lock(&nf_log_mutex);
rcu_assign_pointer(nf_loggers[pf], NULL);
mutex_unlock(&nf_log_mutex);
@@ -161,7 +165,8 @@ static int seq_show(struct seq_file *s, void *v)
struct nf_logger *t;
int ret;
- logger = nf_loggers[*pos];
+ logger = rcu_dereference_protected(nf_loggers[*pos],
+ lockdep_is_held(&nf_log_mutex));
if (!logger)
ret = seq_printf(s, "%2lld NONE (", *pos);
@@ -249,7 +254,8 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
mutex_unlock(&nf_log_mutex);
} else {
mutex_lock(&nf_log_mutex);
- logger = nf_loggers[tindex];
+ logger = rcu_dereference_protected(nf_loggers[tindex],
+ lockdep_is_held(&nf_log_mutex));
if (!logger)
table->data = "NONE";
else
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 74aebed5bd2..5ab22e2bbd7 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -27,14 +27,17 @@ static DEFINE_MUTEX(queue_handler_mutex);
int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
{
int ret;
+ const struct nf_queue_handler *old;
if (pf >= ARRAY_SIZE(queue_handler))
return -EINVAL;
mutex_lock(&queue_handler_mutex);
- if (queue_handler[pf] == qh)
+ old = rcu_dereference_protected(queue_handler[pf],
+ lockdep_is_held(&queue_handler_mutex));
+ if (old == qh)
ret = -EEXIST;
- else if (queue_handler[pf])
+ else if (old)
ret = -EBUSY;
else {
rcu_assign_pointer(queue_handler[pf], qh);
@@ -49,11 +52,15 @@ EXPORT_SYMBOL(nf_register_queue_handler);
/* The caller must flush their queue before this */
int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
{
+ const struct nf_queue_handler *old;
+
if (pf >= ARRAY_SIZE(queue_handler))
return -EINVAL;
mutex_lock(&queue_handler_mutex);
- if (queue_handler[pf] && queue_handler[pf] != qh) {
+ old = rcu_dereference_protected(queue_handler[pf],
+ lockdep_is_held(&queue_handler_mutex));
+ if (old && old != qh) {
mutex_unlock(&queue_handler_mutex);
return -EINVAL;
}
@@ -73,7 +80,10 @@ void nf_unregister_queue_handlers(const struct nf_queue_handler *qh)
mutex_lock(&queue_handler_mutex);
for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) {
- if (queue_handler[pf] == qh)
+ if (rcu_dereference_protected(
+ queue_handler[pf],
+ lockdep_is_held(&queue_handler_mutex)
+ ) == qh)
rcu_assign_pointer(queue_handler[pf], NULL);
}
mutex_unlock(&queue_handler_mutex);
@@ -115,7 +125,7 @@ static int __nf_queue(struct sk_buff *skb,
int (*okfn)(struct sk_buff *),
unsigned int queuenum)
{
- int status;
+ int status = -ENOENT;
struct nf_queue_entry *entry = NULL;
#ifdef CONFIG_BRIDGE_NETFILTER
struct net_device *physindev;
@@ -128,16 +138,20 @@ static int __nf_queue(struct sk_buff *skb,
rcu_read_lock();
qh = rcu_dereference(queue_handler[pf]);
- if (!qh)
+ if (!qh) {
+ status = -ESRCH;
goto err_unlock;
+ }
afinfo = nf_get_afinfo(pf);
if (!afinfo)
goto err_unlock;
entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC);
- if (!entry)
+ if (!entry) {
+ status = -ENOMEM;
goto err_unlock;
+ }
*entry = (struct nf_queue_entry) {
.skb = skb,
@@ -151,11 +165,9 @@ static int __nf_queue(struct sk_buff *skb,
/* If it's going away, ignore hook. */
if (!try_module_get(entry->elem->owner)) {
- rcu_read_unlock();
- kfree(entry);
- return 0;
+ status = -ECANCELED;
+ goto err_unlock;
}
-
/* Bump dev refs so they don't vanish while packet is out */
if (indev)
dev_hold(indev);
@@ -182,14 +194,13 @@ static int __nf_queue(struct sk_buff *skb,
goto err;
}
- return 1;
+ return 0;
err_unlock:
rcu_read_unlock();
err:
- kfree_skb(skb);
kfree(entry);
- return 1;
+ return status;
}
int nf_queue(struct sk_buff *skb,
@@ -201,6 +212,8 @@ int nf_queue(struct sk_buff *skb,
unsigned int queuenum)
{
struct sk_buff *segs;
+ int err;
+ unsigned int queued;
if (!skb_is_gso(skb))
return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
@@ -216,20 +229,35 @@ int nf_queue(struct sk_buff *skb,
}
segs = skb_gso_segment(skb, 0);
- kfree_skb(skb);
+ /* Does not use PTR_ERR to limit the number of error codes that can be
+ * returned by nf_queue. For instance, callers rely on -ECANCELED to mean
+ * 'ignore this hook'.
+ */
if (IS_ERR(segs))
- return 1;
+ return -EINVAL;
+ queued = 0;
+ err = 0;
do {
struct sk_buff *nskb = segs->next;
segs->next = NULL;
- if (!__nf_queue(segs, elem, pf, hook, indev, outdev, okfn,
- queuenum))
+ if (err == 0)
+ err = __nf_queue(segs, elem, pf, hook, indev,
+ outdev, okfn, queuenum);
+ if (err == 0)
+ queued++;
+ else
kfree_skb(segs);
segs = nskb;
} while (segs);
- return 1;
+
+ /* also free orig skb if only some segments were queued */
+ if (unlikely(err && queued))
+ err = 0;
+ if (err == 0)
+ kfree_skb(skb);
+ return err;
}
void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
@@ -237,6 +265,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
struct sk_buff *skb = entry->skb;
struct list_head *elem = &entry->elem->list;
const struct nf_afinfo *afinfo;
+ int err;
rcu_read_lock();
@@ -270,10 +299,17 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
local_bh_enable();
break;
case NF_QUEUE:
- if (!__nf_queue(skb, elem, entry->pf, entry->hook,
- entry->indev, entry->outdev, entry->okfn,
- verdict >> NF_VERDICT_BITS))
- goto next_hook;
+ err = __nf_queue(skb, elem, entry->pf, entry->hook,
+ entry->indev, entry->outdev, entry->okfn,
+ verdict >> NF_VERDICT_QBITS);
+ if (err < 0) {
+ if (err == -ECANCELED)
+ goto next_hook;
+ if (err == -ESRCH &&
+ (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+ goto next_hook;
+ kfree_skb(skb);
+ }
break;
case NF_STOLEN:
default:
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index 4d87befb04c..474d621cbc2 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -28,26 +28,23 @@ nf_tproxy_destructor(struct sk_buff *skb)
skb->destructor = NULL;
if (sk)
- nf_tproxy_put_sock(sk);
+ sock_put(sk);
}
/* consumes sk */
-int
+void
nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
{
- bool transparent = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_transparent :
- inet_sk(sk)->transparent;
-
- if (transparent) {
- skb_orphan(skb);
- skb->sk = sk;
- skb->destructor = nf_tproxy_destructor;
- return 1;
- } else
- nf_tproxy_put_sock(sk);
-
- return 0;
+ /* assigning tw sockets complicates things; most
+ * skb->sk->X checks would have to test sk->sk_state first */
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ inet_twsk_put(inet_twsk(sk));
+ return;
+ }
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = nf_tproxy_destructor;
}
EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 6a1572b0ab4..985e9b76c91 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -376,7 +376,6 @@ __build_packet_message(struct nfulnl_instance *inst,
unsigned int hooknum,
const struct net_device *indev,
const struct net_device *outdev,
- const struct nf_loginfo *li,
const char *prefix, unsigned int plen)
{
struct nfulnl_msg_packet_hdr pmsg;
@@ -652,7 +651,7 @@ nfulnl_log_packet(u_int8_t pf,
inst->qlen++;
__build_packet_message(inst, skb, data_len, pf,
- hooknum, in, out, li, prefix, plen);
+ hooknum, in, out, prefix, plen);
if (inst->qlen >= qthreshold)
__nfulnl_flush(inst);
@@ -874,19 +873,19 @@ static struct hlist_node *get_first(struct iter_state *st)
for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
if (!hlist_empty(&instance_table[st->bucket]))
- return rcu_dereference_bh(instance_table[st->bucket].first);
+ return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
}
return NULL;
}
static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h)
{
- h = rcu_dereference_bh(h->next);
+ h = rcu_dereference_bh(hlist_next_rcu(h));
while (!h) {
if (++st->bucket >= INSTANCE_BUCKETS)
return NULL;
- h = rcu_dereference_bh(instance_table[st->bucket].first);
+ h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
}
return h;
}
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 68e67d19724..b83123f12b4 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -387,25 +387,31 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
{
struct sk_buff *nskb;
struct nfqnl_instance *queue;
- int err;
+ int err = -ENOBUFS;
/* rcu_read_lock()ed by nf_hook_slow() */
queue = instance_lookup(queuenum);
- if (!queue)
+ if (!queue) {
+ err = -ESRCH;
goto err_out;
+ }
- if (queue->copy_mode == NFQNL_COPY_NONE)
+ if (queue->copy_mode == NFQNL_COPY_NONE) {
+ err = -EINVAL;
goto err_out;
+ }
nskb = nfqnl_build_packet_message(queue, entry);
- if (nskb == NULL)
+ if (nskb == NULL) {
+ err = -ENOMEM;
goto err_out;
-
+ }
spin_lock_bh(&queue->lock);
- if (!queue->peer_pid)
+ if (!queue->peer_pid) {
+ err = -EINVAL;
goto err_out_free_nskb;
-
+ }
if (queue->queue_total >= queue->queue_maxlen) {
queue->queue_dropped++;
if (net_ratelimit())
@@ -432,7 +438,7 @@ err_out_free_nskb:
err_out_unlock:
spin_unlock_bh(&queue->lock);
err_out:
- return -1;
+ return err;
}
static int
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index c9423763107..0a77d2ff215 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -23,6 +23,7 @@
#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/audit.h>
#include <net/net_namespace.h>
#include <linux/netfilter/x_tables.h>
@@ -38,9 +39,8 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
struct compat_delta {
- struct compat_delta *next;
- unsigned int offset;
- int delta;
+ unsigned int offset; /* offset in kernel */
+ int delta; /* delta in 32bit user land */
};
struct xt_af {
@@ -49,7 +49,9 @@ struct xt_af {
struct list_head target;
#ifdef CONFIG_COMPAT
struct mutex compat_mutex;
- struct compat_delta *compat_offsets;
+ struct compat_delta *compat_tab;
+ unsigned int number; /* number of slots in compat_tab[] */
+ unsigned int cur; /* number of used slots in compat_tab[] */
#endif
};
@@ -414,54 +416,67 @@ int xt_check_match(struct xt_mtchk_param *par,
EXPORT_SYMBOL_GPL(xt_check_match);
#ifdef CONFIG_COMPAT
-int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta)
+int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
{
- struct compat_delta *tmp;
+ struct xt_af *xp = &xt[af];
- tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
+ if (!xp->compat_tab) {
+ if (!xp->number)
+ return -EINVAL;
+ xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number);
+ if (!xp->compat_tab)
+ return -ENOMEM;
+ xp->cur = 0;
+ }
- tmp->offset = offset;
- tmp->delta = delta;
+ if (xp->cur >= xp->number)
+ return -EINVAL;
- if (xt[af].compat_offsets) {
- tmp->next = xt[af].compat_offsets->next;
- xt[af].compat_offsets->next = tmp;
- } else {
- xt[af].compat_offsets = tmp;
- tmp->next = NULL;
- }
+ if (xp->cur)
+ delta += xp->compat_tab[xp->cur - 1].delta;
+ xp->compat_tab[xp->cur].offset = offset;
+ xp->compat_tab[xp->cur].delta = delta;
+ xp->cur++;
return 0;
}
EXPORT_SYMBOL_GPL(xt_compat_add_offset);
void xt_compat_flush_offsets(u_int8_t af)
{
- struct compat_delta *tmp, *next;
-
- if (xt[af].compat_offsets) {
- for (tmp = xt[af].compat_offsets; tmp; tmp = next) {
- next = tmp->next;
- kfree(tmp);
- }
- xt[af].compat_offsets = NULL;
+ if (xt[af].compat_tab) {
+ vfree(xt[af].compat_tab);
+ xt[af].compat_tab = NULL;
+ xt[af].number = 0;
}
}
EXPORT_SYMBOL_GPL(xt_compat_flush_offsets);
int xt_compat_calc_jump(u_int8_t af, unsigned int offset)
{
- struct compat_delta *tmp;
- int delta;
-
- for (tmp = xt[af].compat_offsets, delta = 0; tmp; tmp = tmp->next)
- if (tmp->offset < offset)
- delta += tmp->delta;
- return delta;
+ struct compat_delta *tmp = xt[af].compat_tab;
+ int mid, left = 0, right = xt[af].cur - 1;
+
+ while (left <= right) {
+ mid = (left + right) >> 1;
+ if (offset > tmp[mid].offset)
+ left = mid + 1;
+ else if (offset < tmp[mid].offset)
+ right = mid - 1;
+ else
+ return mid ? tmp[mid - 1].delta : 0;
+ }
+ WARN_ON_ONCE(1);
+ return 0;
}
EXPORT_SYMBOL_GPL(xt_compat_calc_jump);
+void xt_compat_init_offsets(u_int8_t af, unsigned int number)
+{
+ xt[af].number = number;
+ xt[af].cur = 0;
+}
+EXPORT_SYMBOL(xt_compat_init_offsets);
+
int xt_compat_match_offset(const struct xt_match *match)
{
u_int16_t csize = match->compatsize ? : match->matchsize;
@@ -820,6 +835,21 @@ xt_replace_table(struct xt_table *table,
*/
local_bh_enable();
+#ifdef CONFIG_AUDIT
+ if (audit_enabled) {
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(current->audit_context, GFP_KERNEL,
+ AUDIT_NETFILTER_CFG);
+ if (ab) {
+ audit_log_format(ab, "table=%s family=%u entries=%u",
+ table->name, table->af,
+ private->number);
+ audit_log_end(ab);
+ }
+ }
+#endif
+
return private;
}
EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -1338,7 +1368,7 @@ static int __init xt_init(void)
mutex_init(&xt[i].mutex);
#ifdef CONFIG_COMPAT
mutex_init(&xt[i].compat_mutex);
- xt[i].compat_offsets = NULL;
+ xt[i].compat_tab = NULL;
#endif
INIT_LIST_HEAD(&xt[i].target);
INIT_LIST_HEAD(&xt[i].match);
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
new file mode 100644
index 00000000000..81802d27346
--- /dev/null
+++ b/net/netfilter/xt_AUDIT.c
@@ -0,0 +1,204 @@
+/*
+ * Creates audit record for dropped/accepted packets
+ *
+ * (C) 2010-2011 Thomas Graf <tgraf@redhat.com>
+ * (C) 2010-2011 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/audit.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_AUDIT.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>");
+MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets");
+MODULE_ALIAS("ipt_AUDIT");
+MODULE_ALIAS("ip6t_AUDIT");
+MODULE_ALIAS("ebt_AUDIT");
+MODULE_ALIAS("arpt_AUDIT");
+
+static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb,
+ unsigned int proto, unsigned int offset)
+{
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE: {
+ const __be16 *pptr;
+ __be16 _ports[2];
+
+ pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports);
+ if (pptr == NULL) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " sport=%hu dport=%hu",
+ ntohs(pptr[0]), ntohs(pptr[1]));
+ }
+ break;
+
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6: {
+ const u8 *iptr;
+ u8 _ih[2];
+
+ iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih);
+ if (iptr == NULL) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu",
+ iptr[0], iptr[1]);
+
+ }
+ break;
+ }
+}
+
+static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
+{
+ struct iphdr _iph;
+ const struct iphdr *ih;
+
+ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (!ih) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu",
+ &ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol);
+
+ if (ntohs(ih->frag_off) & IP_OFFSET) {
+ audit_log_format(ab, " frag=1");
+ return;
+ }
+
+ audit_proto(ab, skb, ih->protocol, ih->ihl * 4);
+}
+
+static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
+{
+ struct ipv6hdr _ip6h;
+ const struct ipv6hdr *ih;
+ u8 nexthdr;
+ int offset;
+
+ ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
+ if (!ih) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ nexthdr = ih->nexthdr;
+ offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h),
+ &nexthdr);
+
+ audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+ &ih->saddr, &ih->daddr, nexthdr);
+
+ if (offset)
+ audit_proto(ab, skb, nexthdr, offset);
+}
+
+static unsigned int
+audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_audit_info *info = par->targinfo;
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
+ if (ab == NULL)
+ goto errout;
+
+ audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s",
+ info->type, par->hooknum, skb->len,
+ par->in ? par->in->name : "?",
+ par->out ? par->out->name : "?");
+
+ if (skb->mark)
+ audit_log_format(ab, " mark=%#x", skb->mark);
+
+ if (skb->dev && skb->dev->type == ARPHRD_ETHER) {
+ audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+
+ if (par->family == NFPROTO_BRIDGE) {
+ switch (eth_hdr(skb)->h_proto) {
+ case __constant_htons(ETH_P_IP):
+ audit_ip4(ab, skb);
+ break;
+
+ case __constant_htons(ETH_P_IPV6):
+ audit_ip6(ab, skb);
+ break;
+ }
+ }
+ }
+
+ switch (par->family) {
+ case NFPROTO_IPV4:
+ audit_ip4(ab, skb);
+ break;
+
+ case NFPROTO_IPV6:
+ audit_ip6(ab, skb);
+ break;
+ }
+
+ audit_log_end(ab);
+
+errout:
+ return XT_CONTINUE;
+}
+
+static int audit_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_audit_info *info = par->targinfo;
+
+ if (info->type > XT_AUDIT_TYPE_MAX) {
+ pr_info("Audit type out of range (valid range: 0..%hhu)\n",
+ XT_AUDIT_TYPE_MAX);
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static struct xt_target audit_tg_reg __read_mostly = {
+ .name = "AUDIT",
+ .family = NFPROTO_UNSPEC,
+ .target = audit_tg,
+ .targetsize = sizeof(struct xt_audit_info),
+ .checkentry = audit_tg_check,
+ .me = THIS_MODULE,
+};
+
+static int __init audit_tg_init(void)
+{
+ return xt_register_target(&audit_tg_reg);
+}
+
+static void __exit audit_tg_exit(void)
+{
+ xt_unregister_target(&audit_tg_reg);
+}
+
+module_init(audit_tg_init);
+module_exit(audit_tg_exit);
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index c2c0e4abeb9..af9c4dadf81 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -19,12 +19,14 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_CLASSIFY.h>
+#include <linux/netfilter_arp.h>
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Xtables: Qdisc classification");
MODULE_ALIAS("ipt_CLASSIFY");
MODULE_ALIAS("ip6t_CLASSIFY");
+MODULE_ALIAS("arpt_CLASSIFY");
static unsigned int
classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
@@ -35,26 +37,36 @@ classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static struct xt_target classify_tg_reg __read_mostly = {
- .name = "CLASSIFY",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .table = "mangle",
- .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
- (1 << NF_INET_POST_ROUTING),
- .target = classify_tg,
- .targetsize = sizeof(struct xt_classify_target_info),
- .me = THIS_MODULE,
+static struct xt_target classify_tg_reg[] __read_mostly = {
+ {
+ .name = "CLASSIFY",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_POST_ROUTING),
+ .target = classify_tg,
+ .targetsize = sizeof(struct xt_classify_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CLASSIFY",
+ .revision = 0,
+ .family = NFPROTO_ARP,
+ .hooks = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD),
+ .target = classify_tg,
+ .targetsize = sizeof(struct xt_classify_target_info),
+ .me = THIS_MODULE,
+ },
};
static int __init classify_tg_init(void)
{
- return xt_register_target(&classify_tg_reg);
+ return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
}
static void __exit classify_tg_exit(void)
{
- xt_unregister_target(&classify_tg_reg);
+ xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
}
module_init(classify_tg_init);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index be1f22e1354..3bdd443aaf1 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -313,3 +313,5 @@ MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>");
MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>");
MODULE_DESCRIPTION("Xtables: idle time monitor");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("ipt_IDLETIMER");
+MODULE_ALIAS("ip6t_IDLETIMER");
diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
index a4140509eea..993de2ba89d 100644
--- a/net/netfilter/xt_LED.c
+++ b/net/netfilter/xt_LED.c
@@ -31,6 +31,8 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>");
MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match");
+MODULE_ALIAS("ipt_LED");
+MODULE_ALIAS("ip6t_LED");
static LIST_HEAD(xt_led_triggers);
static DEFINE_MUTEX(xt_led_mutex);
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 039cce1bde3..d4f4b5d66b2 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -72,18 +72,31 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
if (info->queues_total > 1) {
if (par->family == NFPROTO_IPV4)
- queue = hash_v4(skb) % info->queues_total + queue;
+ queue = (((u64) hash_v4(skb) * info->queues_total) >>
+ 32) + queue;
#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
else if (par->family == NFPROTO_IPV6)
- queue = hash_v6(skb) % info->queues_total + queue;
+ queue = (((u64) hash_v6(skb) * info->queues_total) >>
+ 32) + queue;
#endif
}
return NF_QUEUE_NR(queue);
}
-static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par)
+static unsigned int
+nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
- const struct xt_NFQ_info_v1 *info = par->targinfo;
+ const struct xt_NFQ_info_v2 *info = par->targinfo;
+ unsigned int ret = nfqueue_tg_v1(skb, par);
+
+ if (info->bypass)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+ return ret;
+}
+
+static int nfqueue_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_NFQ_info_v2 *info = par->targinfo;
u32 maxid;
if (unlikely(!rnd_inited)) {
@@ -100,6 +113,8 @@ static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par)
info->queues_total, maxid);
return -ERANGE;
}
+ if (par->target->revision == 2 && info->bypass > 1)
+ return -EINVAL;
return 0;
}
@@ -115,11 +130,20 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = {
.name = "NFQUEUE",
.revision = 1,
.family = NFPROTO_UNSPEC,
- .checkentry = nfqueue_tg_v1_check,
+ .checkentry = nfqueue_tg_check,
.target = nfqueue_tg_v1,
.targetsize = sizeof(struct xt_NFQ_info_v1),
.me = THIS_MODULE,
},
+ {
+ .name = "NFQUEUE",
+ .revision = 2,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfqueue_tg_check,
+ .target = nfqueue_tg_v2,
+ .targetsize = sizeof(struct xt_NFQ_info_v2),
+ .me = THIS_MODULE,
+ },
};
static int __init nfqueue_tg_init(void)
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index eb81c380da1..6e6b46cb1db 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -148,16 +148,21 @@ tcpmss_mangle_packet(struct sk_buff *skb,
static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb,
unsigned int family)
{
- struct flowi fl = {};
+ struct flowi fl;
const struct nf_afinfo *ai;
struct rtable *rt = NULL;
u_int32_t mtu = ~0U;
- if (family == PF_INET)
- fl.fl4_dst = ip_hdr(skb)->saddr;
- else
- fl.fl6_dst = ipv6_hdr(skb)->saddr;
+ if (family == PF_INET) {
+ struct flowi4 *fl4 = &fl.u.ip4;
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = ip_hdr(skb)->saddr;
+ } else {
+ struct flowi6 *fl6 = &fl.u.ip6;
+ memset(fl6, 0, sizeof(*fl6));
+ ipv6_addr_copy(&fl6->daddr, &ipv6_hdr(skb)->saddr);
+ }
rcu_read_lock();
ai = nf_get_afinfo(family);
if (ai != NULL)
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 5128a6c4cb2..5f054a0dbbb 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -62,18 +62,19 @@ tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
const struct iphdr *iph = ip_hdr(skb);
struct net *net = pick_net(skb);
struct rtable *rt;
- struct flowi fl;
+ struct flowi4 fl4;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl4, 0, sizeof(fl4));
if (info->priv) {
if (info->priv->oif == -1)
return false;
- fl.oif = info->priv->oif;
+ fl4.flowi4_oif = info->priv->oif;
}
- fl.fl4_dst = info->gw.ip;
- fl.fl4_tos = RT_TOS(iph->tos);
- fl.fl4_scope = RT_SCOPE_UNIVERSE;
- if (ip_route_output_key(net, &rt, &fl) != 0)
+ fl4.daddr = info->gw.ip;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
return false;
skb_dst_drop(skb);
@@ -142,18 +143,18 @@ tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info)
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = pick_net(skb);
struct dst_entry *dst;
- struct flowi fl;
+ struct flowi6 fl6;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
if (info->priv) {
if (info->priv->oif == -1)
return false;
- fl.oif = info->priv->oif;
+ fl6.flowi6_oif = info->priv->oif;
}
- fl.fl6_dst = info->gw.in6;
- fl.fl6_flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
+ fl6.daddr = info->gw.in6;
+ fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
(iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
- dst = ip6_route_output(net, NULL, &fl);
+ dst = ip6_route_output(net, NULL, &fl6);
if (dst == NULL)
return false;
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 640678f47a2..dcfd57eb9d0 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -33,6 +33,20 @@
#include <net/netfilter/nf_tproxy_core.h>
#include <linux/netfilter/xt_TPROXY.h>
+static bool tproxy_sk_is_transparent(struct sock *sk)
+{
+ if (sk->sk_state != TCP_TIME_WAIT) {
+ if (inet_sk(sk)->transparent)
+ return true;
+ sock_put(sk);
+ } else {
+ if (inet_twsk(sk)->tw_transparent)
+ return true;
+ inet_twsk_put(inet_twsk(sk));
+ }
+ return false;
+}
+
static inline __be32
tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
{
@@ -141,7 +155,7 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
skb->dev, NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
- if (sk && nf_tproxy_assign_sock(skb, sk)) {
+ if (sk && tproxy_sk_is_transparent(sk)) {
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
@@ -149,6 +163,8 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
iph->protocol, &iph->daddr, ntohs(hp->dest),
&laddr, ntohs(lport), skb->mark);
+
+ nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
@@ -306,7 +322,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
par->in, NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
- if (sk && nf_tproxy_assign_sock(skb, sk)) {
+ if (sk && tproxy_sk_is_transparent(sk)) {
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
@@ -314,6 +330,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
tproto, &iph->saddr, ntohs(hp->source),
laddr, ntohs(lport), skb->mark);
+
+ nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 5c5b6b921b8..e029c480740 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -185,18 +185,24 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
int connections;
ct = nf_ct_get(skb, &ctinfo);
- if (ct != NULL)
- tuple_ptr = &ct->tuplehash[0].tuple;
- else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- par->family, &tuple))
+ if (ct != NULL) {
+ if (info->flags & XT_CONNLIMIT_DADDR)
+ tuple_ptr = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ else
+ tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ par->family, &tuple)) {
goto hotdrop;
+ }
if (par->family == NFPROTO_IPV6) {
const struct ipv6hdr *iph = ipv6_hdr(skb);
- memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr));
+ memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
+ &iph->daddr : &iph->saddr, sizeof(addr.ip6));
} else {
const struct iphdr *iph = ip_hdr(skb);
- addr.ip = iph->saddr;
+ addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ?
+ iph->daddr : iph->saddr;
}
spin_lock_bh(&info->data->lock);
@@ -204,13 +210,12 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
&info->mask, par->family);
spin_unlock_bh(&info->data->lock);
- if (connections < 0) {
+ if (connections < 0)
/* kmalloc failed, drop it entirely */
- par->hotdrop = true;
- return false;
- }
+ goto hotdrop;
- return (connections > info->limit) ^ info->inverse;
+ return (connections > info->limit) ^
+ !!(info->flags & XT_CONNLIMIT_INVERT);
hotdrop:
par->hotdrop = true;
@@ -268,25 +273,38 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
kfree(info->data);
}
-static struct xt_match connlimit_mt_reg __read_mostly = {
- .name = "connlimit",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = connlimit_mt_check,
- .match = connlimit_mt,
- .matchsize = sizeof(struct xt_connlimit_info),
- .destroy = connlimit_mt_destroy,
- .me = THIS_MODULE,
+static struct xt_match connlimit_mt_reg[] __read_mostly = {
+ {
+ .name = "connlimit",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connlimit_mt_check,
+ .match = connlimit_mt,
+ .matchsize = sizeof(struct xt_connlimit_info),
+ .destroy = connlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "connlimit",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connlimit_mt_check,
+ .match = connlimit_mt,
+ .matchsize = sizeof(struct xt_connlimit_info),
+ .destroy = connlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
};
static int __init connlimit_mt_init(void)
{
- return xt_register_match(&connlimit_mt_reg);
+ return xt_register_matches(connlimit_mt_reg,
+ ARRAY_SIZE(connlimit_mt_reg));
}
static void __exit connlimit_mt_exit(void)
{
- xt_unregister_match(&connlimit_mt_reg);
+ xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
}
module_init(connlimit_mt_init);
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index e536710ad91..2c0086a4751 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -112,6 +112,54 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info,
return true;
}
+/*
+ * Report whether @port lies inside the inclusive range [@min, @max];
+ * the answer is flipped when @invert is set.
+ */
+static inline bool
+port_match(u16 min, u16 max, u16 port, bool invert)
+{
+ bool in_range = min <= port && port <= max;
+
+ return in_range ^ invert;
+}
+
+/*
+ * Protocol and port checks for revision 3 match data, where every port
+ * criterion is an inclusive [port, port_high] range.  Only criteria
+ * selected in info->match_flags are evaluated; the function returns
+ * false at the first one that fails and true otherwise.
+ */
+static inline bool
+ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info,
+ const struct nf_conn *ct)
+{
+ const struct nf_conntrack_tuple *orig, *reply;
+
+ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ reply = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ if (info->match_flags & XT_CONNTRACK_PROTO) {
+ bool same_proto = nf_ct_protonum(ct) == info->l4proto;
+ bool inverted = !!(info->invert_flags & XT_CONNTRACK_PROTO);
+
+ /* Fails when the comparison outcome equals the invert bit. */
+ if (same_proto == inverted)
+ return false;
+ }
+
+ /* Shortcut to match all recognized protocols by using ->src.all. */
+ if (info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) {
+ if (!port_match(info->origsrc_port, info->origsrc_port_high,
+ ntohs(orig->src.u.all),
+ info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT))
+ return false;
+ }
+
+ if (info->match_flags & XT_CONNTRACK_ORIGDST_PORT) {
+ if (!port_match(info->origdst_port, info->origdst_port_high,
+ ntohs(orig->dst.u.all),
+ info->invert_flags & XT_CONNTRACK_ORIGDST_PORT))
+ return false;
+ }
+
+ if (info->match_flags & XT_CONNTRACK_REPLSRC_PORT) {
+ if (!port_match(info->replsrc_port, info->replsrc_port_high,
+ ntohs(reply->src.u.all),
+ info->invert_flags & XT_CONNTRACK_REPLSRC_PORT))
+ return false;
+ }
+
+ if (info->match_flags & XT_CONNTRACK_REPLDST_PORT) {
+ if (!port_match(info->repldst_port, info->repldst_port_high,
+ ntohs(reply->dst.u.all),
+ info->invert_flags & XT_CONNTRACK_REPLDST_PORT))
+ return false;
+ }
+
+ return true;
+}
+
static bool
conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
u16 state_mask, u16 status_mask)
@@ -170,8 +218,13 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
!(info->invert_flags & XT_CONNTRACK_REPLDST))
return false;
- if (!ct_proto_port_check(info, ct))
- return false;
+ if (par->match->revision != 3) {
+ if (!ct_proto_port_check(info, ct))
+ return false;
+ } else {
+ if (!ct_proto_port_check_v3(par->matchinfo, ct))
+ return false;
+ }
if ((info->match_flags & XT_CONNTRACK_STATUS) &&
(!!(status_mask & ct->status) ^
@@ -207,10 +260,23 @@ conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
return conntrack_mt(skb, par, info->state_mask, info->status_mask);
}
+/*
+ * Match callback for revision 3: reads the state and status masks from
+ * the xt_conntrack_mtinfo3 match data and hands them to conntrack_mt().
+ */
+static bool
+conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_conntrack_mtinfo3 *mtinfo = par->matchinfo;
+ u16 state_mask = mtinfo->state_mask;
+ u16 status_mask = mtinfo->status_mask;
+
+ return conntrack_mt(skb, par, state_mask, status_mask);
+}
+
static int conntrack_mt_check(const struct xt_mtchk_param *par)
{
int ret;
+ if (strcmp(par->table, "raw") == 0) {
+ pr_info("state is undetermined at the time of raw table\n");
+ return -EINVAL;
+ }
+
ret = nf_ct_l3proto_try_module_get(par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
@@ -244,6 +310,16 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {
.destroy = conntrack_mt_destroy,
.me = THIS_MODULE,
},
+ {
+ .name = "conntrack",
+ .revision = 3,
+ .family = NFPROTO_UNSPEC,
+ .matchsize = sizeof(struct xt_conntrack_mtinfo3),
+ .match = conntrack_mt_v3,
+ .checkentry = conntrack_mt_check,
+ .destroy = conntrack_mt_destroy,
+ .me = THIS_MODULE,
+ },
};
static int __init conntrack_mt_init(void)
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
index b39db8a5cba..c7a2e5466bc 100644
--- a/net/netfilter/xt_cpu.c
+++ b/net/netfilter/xt_cpu.c
@@ -22,6 +22,8 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>");
MODULE_DESCRIPTION("Xtables: CPU match");
+MODULE_ALIAS("ipt_cpu");
+MODULE_ALIAS("ip6t_cpu");
static int cpu_mt_check(const struct xt_mtchk_param *par)
{
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
new file mode 100644
index 00000000000..d9202cdd25c
--- /dev/null
+++ b/net/netfilter/xt_devgroup.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+#include <linux/netfilter/xt_devgroup.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: Device group match");
+MODULE_ALIAS("ipt_devgroup");
+MODULE_ALIAS("ip6t_devgroup");
+
+/*
+ * Match callback: compare the group of the input and/or output device
+ * against the configured group under the configured mask.  A direction
+ * is only examined when its MATCH flag is set; its INVERT flag flips
+ * the outcome of that comparison.
+ */
+static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_devgroup_info *info = par->matchinfo;
+
+ if (info->flags & XT_DEVGROUP_MATCH_SRC) {
+ bool differs = ((info->src_group ^ par->in->group) &
+ info->src_mask) != 0;
+ bool invert = (info->flags & XT_DEVGROUP_INVERT_SRC) != 0;
+
+ if (differs != invert)
+ return false;
+ }
+
+ if (info->flags & XT_DEVGROUP_MATCH_DST) {
+ bool differs = ((info->dst_group ^ par->out->group) &
+ info->dst_mask) != 0;
+ bool invert = (info->flags & XT_DEVGROUP_INVERT_DST) != 0;
+
+ if (differs != invert)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Validate a devgroup rule.  Returns -EINVAL for unknown flag bits, for
+ * source matching requested from a hook other than PRE_ROUTING,
+ * LOCAL_IN or FORWARD, and for destination matching requested from a
+ * hook other than FORWARD, LOCAL_OUT or POST_ROUTING; 0 otherwise.
+ */
+static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct xt_devgroup_info *info = par->matchinfo;
+ const unsigned int known_flags = XT_DEVGROUP_MATCH_SRC |
+ XT_DEVGROUP_INVERT_SRC |
+ XT_DEVGROUP_MATCH_DST |
+ XT_DEVGROUP_INVERT_DST;
+ const unsigned int src_hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD);
+ const unsigned int dst_hooks = (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING);
+
+ if (info->flags & ~known_flags)
+ return -EINVAL;
+
+ if ((info->flags & XT_DEVGROUP_MATCH_SRC) &&
+ (par->hook_mask & ~src_hooks))
+ return -EINVAL;
+
+ if ((info->flags & XT_DEVGROUP_MATCH_DST) &&
+ (par->hook_mask & ~dst_hooks))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Registration record for the "devgroup" match.  NFPROTO_UNSPEC is used
+ * as the family, and no .destroy hook is set.
+ */
+static struct xt_match devgroup_mt_reg __read_mostly = {
+ .name = "devgroup",
+ .match = devgroup_mt,
+ .checkentry = devgroup_mt_checkentry,
+ .matchsize = sizeof(struct xt_devgroup_info),
+ .family = NFPROTO_UNSPEC,
+ .me = THIS_MODULE
+};
+
+/* Module init: register the devgroup match and return the registration result. */
+static int __init devgroup_mt_init(void)
+{
+ return xt_register_match(&devgroup_mt_reg);
+}
+
+/* Module exit: unregister the devgroup match registered at init. */
+static void __exit devgroup_mt_exit(void)
+{
+ xt_unregister_match(&devgroup_mt_reg);
+}
+
+module_init(devgroup_mt_init);
+module_exit(devgroup_mt_exit);
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index 88f7c3511c7..b46626cddd9 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -31,7 +31,7 @@ iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)
pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n",
&iph->saddr,
(info->flags & IPRANGE_SRC_INV) ? "(INV) " : "",
- &info->src_max.ip,
+ &info->src_min.ip,
&info->src_max.ip);
return false;
}
@@ -53,15 +53,13 @@ iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)
}
static inline int
-iprange_ipv6_sub(const struct in6_addr *a, const struct in6_addr *b)
+iprange_ipv6_lt(const struct in6_addr *a, const struct in6_addr *b)
{
unsigned int i;
- int r;
for (i = 0; i < 4; ++i) {
- r = ntohl(a->s6_addr32[i]) - ntohl(b->s6_addr32[i]);
- if (r != 0)
- return r;
+ if (a->s6_addr32[i] != b->s6_addr32[i])
+ return ntohl(a->s6_addr32[i]) < ntohl(b->s6_addr32[i]);
}
return 0;
@@ -75,18 +73,30 @@ iprange_mt6(const struct sk_buff *skb, struct xt_action_param *par)
bool m;
if (info->flags & IPRANGE_SRC) {
- m = iprange_ipv6_sub(&iph->saddr, &info->src_min.in6) < 0;
- m |= iprange_ipv6_sub(&iph->saddr, &info->src_max.in6) > 0;
+ m = iprange_ipv6_lt(&iph->saddr, &info->src_min.in6);
+ m |= iprange_ipv6_lt(&info->src_max.in6, &iph->saddr);
m ^= !!(info->flags & IPRANGE_SRC_INV);
- if (m)
+ if (m) {
+ pr_debug("src IP %pI6 NOT in range %s%pI6-%pI6\n",
+ &iph->saddr,
+ (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "",
+ &info->src_min.in6,
+ &info->src_max.in6);
return false;
+ }
}
if (info->flags & IPRANGE_DST) {
- m = iprange_ipv6_sub(&iph->daddr, &info->dst_min.in6) < 0;
- m |= iprange_ipv6_sub(&iph->daddr, &info->dst_max.in6) > 0;
+ m = iprange_ipv6_lt(&iph->daddr, &info->dst_min.in6);
+ m |= iprange_ipv6_lt(&info->dst_max.in6, &iph->daddr);
m ^= !!(info->flags & IPRANGE_DST_INV);
- if (m)
+ if (m) {
+ pr_debug("dst IP %pI6 NOT in range %s%pI6-%pI6\n",
+ &iph->daddr,
+ (info->flags & IPRANGE_DST_INV) ? "(INV) " : "",
+ &info->dst_min.in6,
+ &info->dst_max.in6);
return false;
+ }
}
return true;
}
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 9127a3d8aa3..bb10b0717f1 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -85,7 +85,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */);
+ cp = pp->conn_out_get(family, skb, &iph, iph.len, 1 /* inverse */);
if (unlikely(cp == NULL)) {
match = false;
goto out;
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
new file mode 100644
index 00000000000..061d48cec13
--- /dev/null
+++ b/net/netfilter/xt_set.c
@@ -0,0 +1,359 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Martin Josefsson <gandalf@wlug.westbo.se>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module which implements the set match and SET target
+ * for netfilter/iptables. */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/version.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_set.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("Xtables: IP set match and target module");
+MODULE_ALIAS("xt_SET");
+MODULE_ALIAS("ipt_set");
+MODULE_ALIAS("ip6t_set");
+MODULE_ALIAS("ipt_SET");
+MODULE_ALIAS("ip6t_SET");
+
+/*
+ * Test the packet against the set at @index.  Returns @inv unchanged
+ * when ip_set_test() reports no hit and !@inv when it reports a hit.
+ */
+static inline int
+match_set(ip_set_id_t index, const struct sk_buff *skb,
+ u8 pf, u8 dim, u8 flags, int inv)
+{
+ return ip_set_test(index, skb, pf, dim, flags) ? !inv : inv;
+}
+
+/* Revision 0 interface: backward compatible with netfilter/iptables */
+
+/*
+ * Match callback for revision 0: uses the compat dim/flags stored in
+ * the legacy match data (filled in by compat_flags() at checkentry).
+ */
+static bool
+set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_set_info_match_v0 *info = par->matchinfo;
+ const struct xt_set_info_v0 *set = &info->match_set;
+
+ return match_set(set->index, skb, par->family,
+ set->u.compat.dim, set->u.compat.flags,
+ set->u.compat.flags & IPSET_INV_MATCH);
+}
+
+/*
+ * Derive the compat dim/flags fields of a revision 0 set descriptor
+ * from its legacy u.flags[] array.
+ *
+ * NOTE(review): u.flags[] and u.compat are accessed through the same
+ * "u" member, so they presumably overlay each other; the statement
+ * order below matters - confirm against the xt_set.h layout.
+ * NOTE(review): u.compat.flags is only OR-ed into here, never cleared;
+ * presumably it arrives zeroed from userspace - verify.
+ */
+static void
+compat_flags(struct xt_set_info_v0 *info)
+{
+ u_int8_t i;
+
+ /* Fill out compatibility data according to enum ip_set_kopt */
+ info->u.compat.dim = IPSET_DIM_ZERO;
+ /* The inversion request is carried in the first legacy flags slot. */
+ if (info->u.flags[0] & IPSET_MATCH_INV)
+ info->u.compat.flags |= IPSET_INV_MATCH;
+ /* Walk the legacy slots until the first zero entry (at most
+ * IPSET_DIM_MAX-1 of them); each one adds a dimension. */
+ for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) {
+ info->u.compat.dim++;
+ /* A "source" slot sets the bit numbered by the new dim value. */
+ if (info->u.flags[i] & IPSET_SRC)
+ info->u.compat.flags |= (1<<info->u.compat.dim);
+ }
+}
+
+/*
+ * checkentry for revision 0 of the "set" match.
+ *
+ * Looks the set up with ip_set_nfnl_get_byindex(), rejects descriptors
+ * whose last legacy flags slot is in use, then fills in the compat
+ * dim/flags fields.
+ *
+ * Returns 0 on success, -ENOENT if the set cannot be found and -ERANGE
+ * if the dimension limit is exceeded.  On every error return no set
+ * reference is left held.
+ */
+static int
+set_match_v0_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_set_info_match_v0 *info = par->matchinfo;
+ ip_set_id_t index;
+
+ index = ip_set_nfnl_get_byindex(info->match_set.index);
+
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find set indentified by id %u to match\n",
+ info->match_set.index);
+ return -ENOENT;
+ }
+ if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ pr_warning("Protocol error: set match dimension "
+ "is over the limit!\n");
+ /* The lookup above succeeded; the only other put is in
+ * set_match_v0_destroy(), which presumably is not run
+ * for a rule rejected here, so release it now. */
+ ip_set_nfnl_put(info->match_set.index);
+ return -ERANGE;
+ }
+
+ /* Fill out compatibility data */
+ compat_flags(&info->match_set);
+
+ return 0;
+}
+
+/* destroy for revision 0 of the match: hand the set index back via ip_set_nfnl_put(). */
+static void
+set_match_v0_destroy(const struct xt_mtdtor_param *par)
+{
+ struct xt_set_info_match_v0 *info = par->matchinfo;
+
+ ip_set_nfnl_put(info->match_set.index);
+}
+
+/*
+ * Target callback for revision 0: add the packet to add_set and delete
+ * it from del_set, each only when its index is not IPSET_INVALID_ID.
+ * Always returns XT_CONTINUE.
+ */
+static unsigned int
+set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_set_info_target_v0 *info = par->targinfo;
+ const struct xt_set_info_v0 *add = &info->add_set;
+ const struct xt_set_info_v0 *del = &info->del_set;
+
+ if (add->index != IPSET_INVALID_ID)
+ ip_set_add(add->index, skb, par->family,
+ add->u.compat.dim, add->u.compat.flags);
+
+ if (del->index != IPSET_INVALID_ID)
+ ip_set_del(del->index, skb, par->family,
+ del->u.compat.dim, del->u.compat.flags);
+
+ return XT_CONTINUE;
+}
+
+/*
+ * checkentry for revision 0 of the SET target.
+ *
+ * Looks up add_set and del_set (each only when its index is not
+ * IPSET_INVALID_ID), rejects descriptors whose last legacy flags slot
+ * is in use, then fills in the compat dim/flags fields of both.
+ *
+ * Returns 0 on success, -ENOENT if a set cannot be found and -ERANGE
+ * if the dimension limit is exceeded.  Every reference obtained here is
+ * released again before an error is returned.
+ */
+static int
+set_target_v0_checkentry(const struct xt_tgchk_param *par)
+{
+ struct xt_set_info_target_v0 *info = par->targinfo;
+ ip_set_id_t index;
+
+ if (info->add_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(info->add_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find add_set index %u as target\n",
+ info->add_set.index);
+ return -ENOENT;
+ }
+ }
+
+ if (info->del_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(info->del_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find del_set index %u as target\n",
+ info->del_set.index);
+ /* add_set was already looked up successfully. */
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->add_set.index);
+ return -ENOENT;
+ }
+ }
+ if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 ||
+ info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ pr_warning("Protocol error: SET target dimension "
+ "is over the limit!\n");
+ /* Both lookups succeeded by now; undo them. */
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->del_set.index);
+ return -ERANGE;
+ }
+
+ /* Fill out compatibility data */
+ compat_flags(&info->add_set);
+ compat_flags(&info->del_set);
+
+ return 0;
+}
+
+/*
+ * destroy for revision 0 of the SET target: call ip_set_nfnl_put() for
+ * add_set and del_set, skipping any whose index is IPSET_INVALID_ID.
+ */
+static void
+set_target_v0_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct xt_set_info_target_v0 *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->del_set.index);
+}
+
+/* Revision 1: current interface to netfilter/iptables */
+
+/*
+ * Match callback for revision 1: dim and flags come straight from the
+ * match data; the IPSET_INV_MATCH bit of flags selects inversion.
+ */
+static bool
+set_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_set_info_match *info = par->matchinfo;
+ int invert = info->match_set.flags & IPSET_INV_MATCH;
+
+ return match_set(info->match_set.index, skb, par->family,
+ info->match_set.dim, info->match_set.flags, invert);
+}
+
+/*
+ * checkentry for revision 1 of the "set" match.
+ *
+ * Looks the set up with ip_set_nfnl_get_byindex() and checks that the
+ * requested dimension does not exceed IPSET_DIM_MAX.
+ *
+ * Returns 0 on success, -ENOENT if the set cannot be found and -ERANGE
+ * if the dimension is too large.  On every error return no set
+ * reference is left held.
+ */
+static int
+set_match_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_set_info_match *info = par->matchinfo;
+ ip_set_id_t index;
+
+ index = ip_set_nfnl_get_byindex(info->match_set.index);
+
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find set indentified by id %u to match\n",
+ info->match_set.index);
+ return -ENOENT;
+ }
+ if (info->match_set.dim > IPSET_DIM_MAX) {
+ pr_warning("Protocol error: set match dimension "
+ "is over the limit!\n");
+ /* The lookup above succeeded; the only other put is in
+ * set_match_destroy(), which presumably is not run for
+ * a rule rejected here, so release it now. */
+ ip_set_nfnl_put(info->match_set.index);
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+/* destroy for revision 1 of the match: hand the set index back via ip_set_nfnl_put(). */
+static void
+set_match_destroy(const struct xt_mtdtor_param *par)
+{
+ struct xt_set_info_match *info = par->matchinfo;
+
+ ip_set_nfnl_put(info->match_set.index);
+}
+
+/*
+ * Target callback for revision 1: add the packet to add_set and delete
+ * it from del_set, each only when its index is not IPSET_INVALID_ID.
+ * Each operation uses the dim and flags of its own set descriptor.
+ * Always returns XT_CONTINUE.
+ */
+static unsigned int
+set_target(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_set_info_target *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_add(info->add_set.index,
+ skb, par->family,
+ info->add_set.dim,
+ info->add_set.flags);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_del(info->del_set.index,
+ skb, par->family,
+ info->del_set.dim,
+ info->del_set.flags);
+
+ return XT_CONTINUE;
+}
+
+/*
+ * checkentry for revision 1 of the SET target.
+ *
+ * Looks up add_set and del_set (each only when its index is not
+ * IPSET_INVALID_ID) and checks that neither requested dimension
+ * exceeds IPSET_DIM_MAX.
+ *
+ * Returns 0 on success, -ENOENT if a set cannot be found and -ERANGE
+ * if a dimension is too large.  Every reference obtained here is
+ * released again before an error is returned.
+ */
+static int
+set_target_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct xt_set_info_target *info = par->targinfo;
+ ip_set_id_t index;
+
+ if (info->add_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(info->add_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find add_set index %u as target\n",
+ info->add_set.index);
+ return -ENOENT;
+ }
+ }
+
+ if (info->del_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(info->del_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warning("Cannot find del_set index %u as target\n",
+ info->del_set.index);
+ /* add_set was already looked up successfully. */
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->add_set.index);
+ return -ENOENT;
+ }
+ }
+ /* Compare the dimension of both sets, not del_set's flags. */
+ if (info->add_set.dim > IPSET_DIM_MAX ||
+ info->del_set.dim > IPSET_DIM_MAX) {
+ pr_warning("Protocol error: SET target dimension "
+ "is over the limit!\n");
+ /* Both lookups succeeded by now; undo them. */
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->del_set.index);
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+/*
+ * destroy for revision 1 of the SET target: call ip_set_nfnl_put() for
+ * add_set and del_set, skipping any whose index is IPSET_INVALID_ID.
+ */
+static void
+set_target_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct xt_set_info_target *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(info->del_set.index);
+}
+
+/*
+ * Registration records for the "set" match.  Revision 0 (legacy match
+ * data) is registered for IPv4 only; revision 1 is registered once for
+ * IPv4 and once for IPv6 with identical callbacks.
+ */
+static struct xt_match set_matches[] __read_mostly = {
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 0,
+ .match = set_match_v0,
+ .matchsize = sizeof(struct xt_set_info_match_v0),
+ .checkentry = set_match_v0_checkentry,
+ .destroy = set_match_v0_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 1,
+ .match = set_match,
+ .matchsize = sizeof(struct xt_set_info_match),
+ .checkentry = set_match_checkentry,
+ .destroy = set_match_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .match = set_match,
+ .matchsize = sizeof(struct xt_set_info_match),
+ .checkentry = set_match_checkentry,
+ .destroy = set_match_destroy,
+ .me = THIS_MODULE
+ },
+};
+
+/*
+ * Registration records for the "SET" target.  Revision 0 (legacy
+ * target data) is registered for IPv4 only; revision 1 is registered
+ * once for IPv4 and once for IPv6 with identical callbacks.
+ */
+static struct xt_target set_targets[] __read_mostly = {
+ {
+ .name = "SET",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = set_target_v0,
+ .targetsize = sizeof(struct xt_set_info_target_v0),
+ .checkentry = set_target_v0_checkentry,
+ .destroy = set_target_v0_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "SET",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .target = set_target,
+ .targetsize = sizeof(struct xt_set_info_target),
+ .checkentry = set_target_checkentry,
+ .destroy = set_target_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "SET",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .target = set_target,
+ .targetsize = sizeof(struct xt_set_info_target),
+ .checkentry = set_target_checkentry,
+ .destroy = set_target_destroy,
+ .me = THIS_MODULE
+ },
+};
+
+/*
+ * Module init: register the matches, then the targets.  If the targets
+ * fail to register, the matches are unregistered again so nothing is
+ * left behind.  Returns 0 or the error from the failing registration.
+ */
+static int __init xt_set_init(void)
+{
+ int err;
+
+ err = xt_register_matches(set_matches, ARRAY_SIZE(set_matches));
+ if (err)
+ return err;
+
+ err = xt_register_targets(set_targets, ARRAY_SIZE(set_targets));
+ if (err)
+ xt_unregister_matches(set_matches, ARRAY_SIZE(set_matches));
+
+ return err;
+}
+
+/* Module exit: unregister the "set" matches, then the "SET" targets. */
+static void __exit xt_set_fini(void)
+{
+ xt_unregister_matches(set_matches, ARRAY_SIZE(set_matches));
+ xt_unregister_targets(set_targets, ARRAY_SIZE(set_targets));
+}
+
+module_init(xt_set_init);
+module_exit(xt_set_fini);
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 00d6ae83830..9cc46356b57 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -35,6 +35,15 @@
#include <net/netfilter/nf_conntrack.h>
#endif
+/*
+ * Release a socket reference.  A socket whose state is TCP_TIME_WAIT is
+ * released through inet_twsk_put(); any other socket through sock_put().
+ */
+static void
+xt_socket_put_sk(struct sock *sk)
+{
+ if (sk->sk_state != TCP_TIME_WAIT) {
+ sock_put(sk);
+ return;
+ }
+
+ inet_twsk_put(inet_twsk(sk));
+}
+
static int
extract_icmp4_fields(const struct sk_buff *skb,
u8 *protocol,
@@ -164,7 +173,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
(sk->sk_state == TCP_TIME_WAIT &&
inet_twsk(sk)->tw_transparent));
- nf_tproxy_put_sock(sk);
+ xt_socket_put_sk(sk);
if (wildcard || !transparent)
sk = NULL;
@@ -298,7 +307,7 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
(sk->sk_state == TCP_TIME_WAIT &&
inet_twsk(sk)->tw_transparent));
- nf_tproxy_put_sock(sk);
+ xt_socket_put_sk(sk);
if (wildcard || !transparent)
sk = NULL;
diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h
index 6caef8b2061..f4fc4c9ad56 100644
--- a/net/netlabel/netlabel_user.h
+++ b/net/netlabel/netlabel_user.h
@@ -49,9 +49,9 @@
static inline void netlbl_netlink_auditinfo(struct sk_buff *skb,
struct netlbl_audit *audit_info)
{
- audit_info->secid = NETLINK_CB(skb).sid;
- audit_info->loginuid = NETLINK_CB(skb).loginuid;
- audit_info->sessionid = NETLINK_CB(skb).sessionid;
+ security_task_getsecid(current, &audit_info->secid);
+ audit_info->loginuid = audit_get_loginuid(current);
+ audit_info->sessionid = audit_get_sessionid(current);
}
/* NetLabel NETLINK I/O functions */
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 478181d53c5..c8f35b5d2ee 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1362,17 +1362,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
NETLINK_CB(skb).pid = nlk->pid;
NETLINK_CB(skb).dst_group = dst_group;
- NETLINK_CB(skb).loginuid = audit_get_loginuid(current);
- NETLINK_CB(skb).sessionid = audit_get_sessionid(current);
- security_task_getsecid(current, &(NETLINK_CB(skb).sid));
memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
- /* What can I do? Netlink is asynchronous, so that
- we will have to save current capabilities to
- check them, when this message will be delivered
- to corresponding kernel module. --ANK (980802)
- */
-
err = -EFAULT;
if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
kfree_skb(skb);
@@ -1407,7 +1398,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
int noblock = flags&MSG_DONTWAIT;
size_t copied;
struct sk_buff *skb, *data_skb;
- int err;
+ int err, ret;
if (flags&MSG_OOB)
return -EOPNOTSUPP;
@@ -1470,8 +1461,13 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
skb_free_datagram(sk, skb);
- if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
- netlink_dump(sk);
+ if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
+ ret = netlink_dump(sk);
+ if (ret) {
+ sk->sk_err = ret;
+ sk->sk_error_report(sk);
+ }
+ }
scm_recv(sock, msg, siocb->scm, flags);
out:
@@ -1736,6 +1732,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
struct netlink_callback *cb;
struct sock *sk;
struct netlink_sock *nlk;
+ int ret;
cb = kzalloc(sizeof(*cb), GFP_KERNEL);
if (cb == NULL)
@@ -1764,9 +1761,13 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
nlk->cb = cb;
mutex_unlock(nlk->cb_mutex);
- netlink_dump(sk);
+ ret = netlink_dump(sk);
+
sock_put(sk);
+ if (ret)
+ return ret;
+
/* We successfully started a dump, by returning -EINTR we
* signal not to send ACK even if it was requested.
*/
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index f83cb370292..1781d99145e 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -519,7 +519,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_netlink_recv(skb, CAP_NET_ADMIN))
return -EPERM;
- if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
if (ops->dumpit == NULL)
return -EOPNOTSUPP;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 91cb1d71f01..b5362e96022 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -164,7 +164,6 @@ struct packet_mreq_max {
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
int closing, int tx_ring);
-#define PGV_FROM_VMALLOC 1
struct pgv {
char *buffer;
};
@@ -466,7 +465,7 @@ retry:
*/
err = -EMSGSIZE;
- if (len > dev->mtu + dev->hard_header_len)
+ if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN)
goto out_unlock;
if (!skb) {
@@ -497,6 +496,19 @@ retry:
goto retry;
}
+ if (len > (dev->mtu + dev->hard_header_len)) {
+ /* Earlier code assumed this would be a VLAN pkt,
+ * double-check this now that we have the actual
+ * packet in hand.
+ */
+ struct ethhdr *ehdr;
+ skb_reset_mac_header(skb);
+ ehdr = eth_hdr(skb);
+ if (ehdr->h_proto != htons(ETH_P_8021Q)) {
+ err = -EMSGSIZE;
+ goto out_unlock;
+ }
+ }
skb->protocol = proto;
skb->dev = dev;
@@ -523,11 +535,11 @@ static inline unsigned int run_filter(const struct sk_buff *skb,
{
struct sk_filter *filter;
- rcu_read_lock_bh();
- filter = rcu_dereference_bh(sk->sk_filter);
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
if (filter != NULL)
res = sk_run_filter(skb, filter->insns);
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return res;
}
@@ -954,7 +966,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
- struct socket *sock;
struct sk_buff *skb;
struct net_device *dev;
__be16 proto;
@@ -966,8 +977,6 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
int len_sum = 0;
int status = 0;
- sock = po->sk.sk_socket;
-
mutex_lock(&po->pg_vec_lock);
err = -EBUSY;
@@ -1200,7 +1209,7 @@ static int packet_snd(struct socket *sock,
}
err = -EMSGSIZE;
- if (!gso_type && (len > dev->mtu+reserve))
+ if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN))
goto out_unlock;
err = -ENOBUFS;
@@ -1225,6 +1234,20 @@ static int packet_snd(struct socket *sock,
if (err < 0)
goto out_free;
+ if (!gso_type && (len > dev->mtu + reserve)) {
+ /* Earlier code assumed this would be a VLAN pkt,
+ * double-check this now that we have the actual
+ * packet in hand.
+ */
+ struct ethhdr *ehdr;
+ skb_reset_mac_header(skb);
+ ehdr = eth_hdr(skb);
+ if (ehdr->h_proto != htons(ETH_P_8021Q)) {
+ err = -EMSGSIZE;
+ goto out_free;
+ }
+ }
+
skb->protocol = proto;
skb->dev = dev;
skb->priority = sk->sk_priority;
diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig
index 0d9b8a220a7..6ec7d55b176 100644
--- a/net/phonet/Kconfig
+++ b/net/phonet/Kconfig
@@ -14,15 +14,3 @@ config PHONET
To compile this driver as a module, choose M here: the module
will be called phonet. If unsure, say N.
-
-config PHONET_PIPECTRLR
- bool "Phonet Pipe Controller (EXPERIMENTAL)"
- depends on PHONET && EXPERIMENTAL
- default N
- help
- The Pipe Controller implementation in Phonet stack to support Pipe
- data with Nokia Slim modems like WG2.5 used on ST-Ericsson U8500
- platform.
-
- This option is incompatible with older Nokia modems.
- Say N here unless you really know what you are doing.
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 1072b2c19d3..c6fffd946d4 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -110,6 +110,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
sk->sk_protocol = protocol;
pn = pn_sk(sk);
pn->sobject = 0;
+ pn->dobject = 0;
pn->resource = 0;
sk->sk_prot->init(sk);
err = 0;
@@ -194,11 +195,7 @@ static int pn_send(struct sk_buff *skb, struct net_device *dev,
if (skb->pkt_type == PACKET_LOOPBACK) {
skb_reset_mac_header(skb);
skb_orphan(skb);
- if (irq)
- netif_rx(skb);
- else
- netif_rx_ni(skb);
- err = 0;
+ err = (irq ? netif_rx(skb) : netif_rx_ni(skb)) ? -ENOBUFS : 0;
} else {
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
NULL, NULL, skb->len);
@@ -207,6 +204,8 @@ static int pn_send(struct sk_buff *skb, struct net_device *dev,
goto drop;
}
err = dev_queue_xmit(skb);
+ if (unlikely(err > 0))
+ err = net_xmit_errno(err);
}
return err;
@@ -242,8 +241,18 @@ int pn_skb_send(struct sock *sk, struct sk_buff *skb,
struct net_device *dev;
struct pn_sock *pn = pn_sk(sk);
int err;
- u16 src;
- u8 daddr = pn_sockaddr_get_addr(target), saddr = PN_NO_ADDR;
+ u16 src, dst;
+ u8 daddr, saddr, res;
+
+ src = pn->sobject;
+ if (target != NULL) {
+ dst = pn_sockaddr_get_object(target);
+ res = pn_sockaddr_get_resource(target);
+ } else {
+ dst = pn->dobject;
+ res = pn->resource;
+ }
+ daddr = pn_addr(dst);
err = -EHOSTUNREACH;
if (sk->sk_bound_dev_if)
@@ -251,10 +260,9 @@ int pn_skb_send(struct sock *sk, struct sk_buff *skb,
else if (phonet_address_lookup(net, daddr) == 0) {
dev = phonet_device_get(net);
skb->pkt_type = PACKET_LOOPBACK;
- } else if (pn_sockaddr_get_object(target) == 0) {
+ } else if (dst == 0) {
/* Resource routing (small race until phonet_rcv()) */
- struct sock *sk = pn_find_sock_by_res(net,
- target->spn_resource);
+ struct sock *sk = pn_find_sock_by_res(net, res);
if (sk) {
sock_put(sk);
dev = phonet_device_get(net);
@@ -271,12 +279,10 @@ int pn_skb_send(struct sock *sk, struct sk_buff *skb,
if (saddr == PN_NO_ADDR)
goto drop;
- src = pn->sobject;
if (!pn_addr(src))
src = pn_object(saddr, pn_obj(src));
- err = pn_send(skb, dev, pn_sockaddr_get_object(target),
- src, pn_sockaddr_get_resource(target), 0);
+ err = pn_send(skb, dev, dst, src, res, 0);
dev_put(dev);
return err;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 3e60f2e4e6c..68e635f11de 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -42,7 +42,7 @@
* TCP_ESTABLISHED connected pipe in enabled state
*
* pep_sock locking:
- * - sk_state, ackq, hlist: sock lock needed
+ * - sk_state, hlist: sock lock needed
* - listener: read only
* - pipe_handle: read only
*/
@@ -50,11 +50,6 @@
#define CREDITS_MAX 10
#define CREDITS_THR 7
-static const struct sockaddr_pn pipe_srv = {
- .spn_family = AF_PHONET,
- .spn_resource = 0xD9, /* pipe service */
-};
-
#define pep_sb_size(s) (((s) + 5) & ~3) /* 2-bytes head, 32-bits aligned */
/* Get the next TLV sub-block. */
@@ -82,236 +77,95 @@ static unsigned char *pep_get_sb(struct sk_buff *skb, u8 *ptype, u8 *plen,
return data;
}
-static int pep_reply(struct sock *sk, struct sk_buff *oskb,
- u8 code, const void *data, int len, gfp_t priority)
+static struct sk_buff *pep_alloc_skb(struct sock *sk, const void *payload,
+ int len, gfp_t priority)
{
- const struct pnpipehdr *oph = pnp_hdr(oskb);
- struct pnpipehdr *ph;
- struct sk_buff *skb;
-
- skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority);
+ struct sk_buff *skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority);
if (!skb)
- return -ENOMEM;
+ return NULL;
skb_set_owner_w(skb, sk);
skb_reserve(skb, MAX_PNPIPE_HEADER);
__skb_put(skb, len);
- skb_copy_to_linear_data(skb, data, len);
- __skb_push(skb, sizeof(*ph));
+ skb_copy_to_linear_data(skb, payload, len);
+ __skb_push(skb, sizeof(struct pnpipehdr));
skb_reset_transport_header(skb);
- ph = pnp_hdr(skb);
- ph->utid = oph->utid;
- ph->message_id = oph->message_id + 1; /* REQ -> RESP */
- ph->pipe_handle = oph->pipe_handle;
- ph->error_code = code;
-
- return pn_skb_send(sk, skb, &pipe_srv);
-}
-
-#define PAD 0x00
-
-#ifdef CONFIG_PHONET_PIPECTRLR
-static u8 pipe_negotiate_fc(u8 *host_fc, u8 *remote_fc, int len)
-{
- int i, j;
- u8 base_fc, final_fc;
-
- for (i = 0; i < len; i++) {
- base_fc = host_fc[i];
- for (j = 0; j < len; j++) {
- if (remote_fc[j] == base_fc) {
- final_fc = base_fc;
- goto done;
- }
- }
- }
- return -EINVAL;
-
-done:
- return final_fc;
-
-}
-
-static int pipe_get_flow_info(struct sock *sk, struct sk_buff *skb,
- u8 *pref_rx_fc, u8 *req_tx_fc)
-{
- struct pnpipehdr *hdr;
- u8 n_sb;
-
- if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
- return -EINVAL;
-
- hdr = pnp_hdr(skb);
- n_sb = hdr->data[4];
-
- __skb_pull(skb, sizeof(*hdr) + 4);
- while (n_sb > 0) {
- u8 type, buf[3], len = sizeof(buf);
- u8 *data = pep_get_sb(skb, &type, &len, buf);
-
- if (data == NULL)
- return -EINVAL;
-
- switch (type) {
- case PN_PIPE_SB_REQUIRED_FC_TX:
- if (len < 3 || (data[2] | data[3] | data[4]) > 3)
- break;
- req_tx_fc[0] = data[2];
- req_tx_fc[1] = data[3];
- req_tx_fc[2] = data[4];
- break;
-
- case PN_PIPE_SB_PREFERRED_FC_RX:
- if (len < 3 || (data[2] | data[3] | data[4]) > 3)
- break;
- pref_rx_fc[0] = data[2];
- pref_rx_fc[1] = data[3];
- pref_rx_fc[2] = data[4];
- break;
-
- }
- n_sb--;
- }
- return 0;
+ return skb;
}
-static int pipe_handler_send_req(struct sock *sk, u8 utid,
- u8 msg_id, gfp_t priority)
+static int pep_reply(struct sock *sk, struct sk_buff *oskb, u8 code,
+ const void *data, int len, gfp_t priority)
{
- int len;
+ const struct pnpipehdr *oph = pnp_hdr(oskb);
struct pnpipehdr *ph;
struct sk_buff *skb;
- struct pep_sock *pn = pep_sk(sk);
-
- static const u8 data[4] = {
- PAD, PAD, PAD, PAD,
- };
+ struct sockaddr_pn peer;
- switch (msg_id) {
- case PNS_PEP_CONNECT_REQ:
- len = sizeof(data);
- break;
-
- case PNS_PEP_DISCONNECT_REQ:
- case PNS_PEP_ENABLE_REQ:
- case PNS_PEP_DISABLE_REQ:
- len = 0;
- break;
-
- default:
- return -EINVAL;
- }
-
- skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority);
+ skb = pep_alloc_skb(sk, data, len, priority);
if (!skb)
return -ENOMEM;
- skb_set_owner_w(skb, sk);
- skb_reserve(skb, MAX_PNPIPE_HEADER);
- if (len) {
- __skb_put(skb, len);
- skb_copy_to_linear_data(skb, data, len);
- }
- __skb_push(skb, sizeof(*ph));
- skb_reset_transport_header(skb);
ph = pnp_hdr(skb);
- ph->utid = utid;
- ph->message_id = msg_id;
- ph->pipe_handle = pn->pipe_handle;
- ph->error_code = PN_PIPE_NO_ERROR;
+ ph->utid = oph->utid;
+ ph->message_id = oph->message_id + 1; /* REQ -> RESP */
+ ph->pipe_handle = oph->pipe_handle;
+ ph->error_code = code;
- return pn_skb_send(sk, skb, &pn->remote_pep);
+ pn_skb_get_src_sockaddr(oskb, &peer);
+ return pn_skb_send(sk, skb, &peer);
}
-static int pipe_handler_send_created_ind(struct sock *sk,
- u8 utid, u8 msg_id)
+static int pep_indicate(struct sock *sk, u8 id, u8 code,
+ const void *data, int len, gfp_t priority)
{
- int err_code;
+ struct pep_sock *pn = pep_sk(sk);
struct pnpipehdr *ph;
struct sk_buff *skb;
- struct pep_sock *pn = pep_sk(sk);
- static u8 data[4] = {
- 0x03, 0x04,
- };
- data[2] = pn->tx_fc;
- data[3] = pn->rx_fc;
-
- /*
- * actually, below is number of sub-blocks and not error code.
- * Pipe_created_ind message format does not have any
- * error code field. However, the Phonet stack will always send
- * an error code as part of pnpipehdr. So, use that err_code to
- * specify the number of sub-blocks.
- */
- err_code = 0x01;
-
- skb = alloc_skb(MAX_PNPIPE_HEADER + sizeof(data), GFP_ATOMIC);
+ skb = pep_alloc_skb(sk, data, len, priority);
if (!skb)
return -ENOMEM;
- skb_set_owner_w(skb, sk);
- skb_reserve(skb, MAX_PNPIPE_HEADER);
- __skb_put(skb, sizeof(data));
- skb_copy_to_linear_data(skb, data, sizeof(data));
- __skb_push(skb, sizeof(*ph));
- skb_reset_transport_header(skb);
ph = pnp_hdr(skb);
- ph->utid = utid;
- ph->message_id = msg_id;
+ ph->utid = 0;
+ ph->message_id = id;
ph->pipe_handle = pn->pipe_handle;
- ph->error_code = err_code;
-
- return pn_skb_send(sk, skb, &pn->remote_pep);
+ ph->data[0] = code;
+ return pn_skb_send(sk, skb, NULL);
}
-static int pipe_handler_send_ind(struct sock *sk, u8 utid, u8 msg_id)
+#define PAD 0x00
+
+static int pipe_handler_request(struct sock *sk, u8 id, u8 code,
+ const void *data, int len)
{
- int err_code;
+ struct pep_sock *pn = pep_sk(sk);
struct pnpipehdr *ph;
struct sk_buff *skb;
- struct pep_sock *pn = pep_sk(sk);
-
- /*
- * actually, below is a filler.
- * Pipe_enabled/disabled_ind message format does not have any
- * error code field. However, the Phonet stack will always send
- * an error code as part of pnpipehdr. So, use that err_code to
- * specify the filler value.
- */
- err_code = 0x0;
- skb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC);
+ skb = pep_alloc_skb(sk, data, len, GFP_KERNEL);
if (!skb)
return -ENOMEM;
- skb_set_owner_w(skb, sk);
- skb_reserve(skb, MAX_PNPIPE_HEADER);
- __skb_push(skb, sizeof(*ph));
- skb_reset_transport_header(skb);
ph = pnp_hdr(skb);
- ph->utid = utid;
- ph->message_id = msg_id;
+ ph->utid = id; /* whatever */
+ ph->message_id = id;
ph->pipe_handle = pn->pipe_handle;
- ph->error_code = err_code;
-
- return pn_skb_send(sk, skb, &pn->remote_pep);
+ ph->data[0] = code;
+ return pn_skb_send(sk, skb, NULL);
}
-static int pipe_handler_enable_pipe(struct sock *sk, int enable)
+static int pipe_handler_send_created_ind(struct sock *sk)
{
- int utid, req;
-
- if (enable) {
- utid = PNS_PIPE_ENABLE_UTID;
- req = PNS_PEP_ENABLE_REQ;
- } else {
- utid = PNS_PIPE_DISABLE_UTID;
- req = PNS_PEP_DISABLE_REQ;
- }
- return pipe_handler_send_req(sk, utid, req, GFP_ATOMIC);
+ struct pep_sock *pn = pep_sk(sk);
+ u8 data[4] = {
+ PN_PIPE_SB_NEGOTIATED_FC, pep_sb_size(2),
+ pn->tx_fc, pn->rx_fc,
+ };
+
+ return pep_indicate(sk, PNS_PIPE_CREATED_IND, 1 /* sub-blocks */,
+ data, 4, GFP_ATOMIC);
}
-#endif
static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
{
@@ -334,11 +188,12 @@ static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
GFP_KERNEL);
}
-static int pep_reject_conn(struct sock *sk, struct sk_buff *skb, u8 code)
+static int pep_reject_conn(struct sock *sk, struct sk_buff *skb, u8 code,
+ gfp_t priority)
{
static const u8 data[4] = { PAD, PAD, PAD, 0 /* sub-blocks */ };
WARN_ON(code == PN_PIPE_NO_ERROR);
- return pep_reply(sk, skb, code, data, sizeof(data), GFP_ATOMIC);
+ return pep_reply(sk, skb, code, data, sizeof(data), priority);
}
/* Control requests are not sent by the pipe service and have a specific
@@ -350,23 +205,21 @@ static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code,
struct sk_buff *skb;
struct pnpipehdr *ph;
struct sockaddr_pn dst;
+ u8 data[4] = {
+ oph->data[0], /* PEP type */
+ code, /* error code, at an unusual offset */
+ PAD, PAD,
+ };
- skb = alloc_skb(MAX_PNPIPE_HEADER + 4, priority);
+ skb = pep_alloc_skb(sk, data, 4, priority);
if (!skb)
return -ENOMEM;
- skb_set_owner_w(skb, sk);
-
- skb_reserve(skb, MAX_PHONET_HEADER);
- ph = (struct pnpipehdr *)skb_put(skb, sizeof(*ph) + 4);
+ ph = pnp_hdr(skb);
ph->utid = oph->utid;
ph->message_id = PNS_PEP_CTRL_RESP;
ph->pipe_handle = oph->pipe_handle;
ph->data[0] = oph->data[1]; /* CTRL id */
- ph->data[1] = oph->data[0]; /* PEP type */
- ph->data[2] = code; /* error code, at an usual offset */
- ph->data[3] = PAD;
- ph->data[4] = PAD;
pn_skb_get_src_sockaddr(oskb, &dst);
return pn_skb_send(sk, skb, &dst);
@@ -374,38 +227,15 @@ static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code,
static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority)
{
- struct pep_sock *pn = pep_sk(sk);
- struct pnpipehdr *ph;
- struct sk_buff *skb;
+ u8 data[4] = { type, PAD, PAD, status };
- skb = alloc_skb(MAX_PNPIPE_HEADER + 4, priority);
- if (!skb)
- return -ENOMEM;
- skb_set_owner_w(skb, sk);
-
- skb_reserve(skb, MAX_PNPIPE_HEADER + 4);
- __skb_push(skb, sizeof(*ph) + 4);
- skb_reset_transport_header(skb);
- ph = pnp_hdr(skb);
- ph->utid = 0;
- ph->message_id = PNS_PEP_STATUS_IND;
- ph->pipe_handle = pn->pipe_handle;
- ph->pep_type = PN_PEP_TYPE_COMMON;
- ph->data[1] = type;
- ph->data[2] = PAD;
- ph->data[3] = PAD;
- ph->data[4] = status;
-
-#ifdef CONFIG_PHONET_PIPECTRLR
- return pn_skb_send(sk, skb, &pn->remote_pep);
-#else
- return pn_skb_send(sk, skb, &pipe_srv);
-#endif
+ return pep_indicate(sk, PNS_PEP_STATUS_IND, PN_PEP_TYPE_COMMON,
+ data, 4, priority);
}
/* Send our RX flow control information to the sender.
* Socket must be locked. */
-static void pipe_grant_credits(struct sock *sk)
+static void pipe_grant_credits(struct sock *sk, gfp_t priority)
{
struct pep_sock *pn = pep_sk(sk);
@@ -415,16 +245,16 @@ static void pipe_grant_credits(struct sock *sk)
case PN_LEGACY_FLOW_CONTROL: /* TODO */
break;
case PN_ONE_CREDIT_FLOW_CONTROL:
- pipe_snd_status(sk, PN_PEP_IND_FLOW_CONTROL,
- PEP_IND_READY, GFP_ATOMIC);
- pn->rx_credits = 1;
+ if (pipe_snd_status(sk, PN_PEP_IND_FLOW_CONTROL,
+ PEP_IND_READY, priority) == 0)
+ pn->rx_credits = 1;
break;
case PN_MULTI_CREDIT_FLOW_CONTROL:
if ((pn->rx_credits + CREDITS_THR) > CREDITS_MAX)
break;
if (pipe_snd_status(sk, PN_PEP_IND_ID_MCFC_GRANT_CREDITS,
CREDITS_MAX - pn->rx_credits,
- GFP_ATOMIC) == 0)
+ priority) == 0)
pn->rx_credits = CREDITS_MAX;
break;
}
@@ -522,7 +352,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
switch (hdr->message_id) {
case PNS_PEP_CONNECT_REQ:
- pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE);
+ pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_ATOMIC);
break;
case PNS_PEP_DISCONNECT_REQ:
@@ -532,35 +362,11 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
sk->sk_state_change(sk);
break;
-#ifdef CONFIG_PHONET_PIPECTRLR
- case PNS_PEP_DISCONNECT_RESP:
- pn->pipe_state = PIPE_IDLE;
- sk->sk_state = TCP_CLOSE;
- break;
-#endif
-
case PNS_PEP_ENABLE_REQ:
/* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */
pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
break;
-#ifdef CONFIG_PHONET_PIPECTRLR
- case PNS_PEP_ENABLE_RESP:
- pn->pipe_state = PIPE_ENABLED;
- pipe_handler_send_ind(sk, PNS_PIPE_ENABLED_IND_UTID,
- PNS_PIPE_ENABLED_IND);
-
- if (!pn_flow_safe(pn->tx_fc)) {
- atomic_set(&pn->tx_credits, 1);
- sk->sk_write_space(sk);
- }
- if (sk->sk_state == TCP_ESTABLISHED)
- break; /* Nothing to do */
- sk->sk_state = TCP_ESTABLISHED;
- pipe_grant_credits(sk);
- break;
-#endif
-
case PNS_PEP_RESET_REQ:
switch (hdr->state_after_reset) {
case PN_PIPE_DISABLE:
@@ -579,17 +385,6 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
break;
-#ifdef CONFIG_PHONET_PIPECTRLR
- case PNS_PEP_DISABLE_RESP:
- pn->pipe_state = PIPE_DISABLED;
- atomic_set(&pn->tx_credits, 0);
- pipe_handler_send_ind(sk, PNS_PIPE_DISABLED_IND_UTID,
- PNS_PIPE_DISABLED_IND);
- sk->sk_state = TCP_SYN_RECV;
- pn->rx_credits = 0;
- break;
-#endif
-
case PNS_PEP_CTRL_REQ:
if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) {
atomic_inc(&sk->sk_drops);
@@ -607,7 +402,8 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
if (!pn_flow_safe(pn->rx_fc)) {
err = sock_queue_rcv_skb(sk, skb);
if (!err)
- return 0;
+ return NET_RX_SUCCESS;
+ err = -ENOBUFS;
break;
}
@@ -645,7 +441,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
if (sk->sk_state == TCP_ESTABLISHED)
break; /* Nothing to do */
sk->sk_state = TCP_ESTABLISHED;
- pipe_grant_credits(sk);
+ pipe_grant_credits(sk, GFP_ATOMIC);
break;
case PNS_PIPE_DISABLED_IND:
@@ -660,7 +456,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
}
out:
kfree_skb(skb);
- return err;
+ return (err == -ENOBUFS) ? NET_RX_DROP : NET_RX_SUCCESS;
queue:
skb->dev = NULL;
@@ -669,7 +465,7 @@ queue:
skb_queue_tail(queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, err);
- return 0;
+ return NET_RX_SUCCESS;
}
/* Destroy connected sock. */
@@ -681,133 +477,126 @@ static void pipe_destruct(struct sock *sk)
skb_queue_purge(&pn->ctrlreq_queue);
}
-#ifdef CONFIG_PHONET_PIPECTRLR
-static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
+static u8 pipe_negotiate_fc(const u8 *fcs, unsigned n)
{
- struct pep_sock *pn = pep_sk(sk);
- u8 host_pref_rx_fc[3] = {3, 2, 1}, host_req_tx_fc[3] = {3, 2, 1};
- u8 remote_pref_rx_fc[3], remote_req_tx_fc[3];
- u8 negotiated_rx_fc, negotiated_tx_fc;
- int ret;
-
- pipe_get_flow_info(sk, skb, remote_pref_rx_fc,
- remote_req_tx_fc);
- negotiated_tx_fc = pipe_negotiate_fc(remote_req_tx_fc,
- host_pref_rx_fc,
- sizeof(host_pref_rx_fc));
- negotiated_rx_fc = pipe_negotiate_fc(host_req_tx_fc,
- remote_pref_rx_fc,
- sizeof(host_pref_rx_fc));
-
- pn->pipe_state = PIPE_DISABLED;
- sk->sk_state = TCP_SYN_RECV;
- sk->sk_backlog_rcv = pipe_do_rcv;
- sk->sk_destruct = pipe_destruct;
- pn->rx_credits = 0;
- pn->rx_fc = negotiated_rx_fc;
- pn->tx_fc = negotiated_tx_fc;
- sk->sk_state_change(sk);
+ unsigned i;
+ u8 final_fc = PN_NO_FLOW_CONTROL;
- ret = pipe_handler_send_created_ind(sk,
- PNS_PIPE_CREATED_IND_UTID,
- PNS_PIPE_CREATED_IND
- );
+ for (i = 0; i < n; i++) {
+ u8 fc = fcs[i];
- return ret;
+ if (fc > final_fc && fc < PN_MAX_FLOW_CONTROL)
+ final_fc = fc;
+ }
+ return final_fc;
}
-#endif
-static int pep_connreq_rcv(struct sock *sk, struct sk_buff *skb)
+static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
{
- struct sock *newsk;
- struct pep_sock *newpn, *pn = pep_sk(sk);
+ struct pep_sock *pn = pep_sk(sk);
struct pnpipehdr *hdr;
- struct sockaddr_pn dst;
- u16 peer_type;
- u8 pipe_handle, enabled, n_sb;
- u8 aligned = 0;
+ u8 n_sb;
if (!pskb_pull(skb, sizeof(*hdr) + 4))
return -EINVAL;
hdr = pnp_hdr(skb);
- pipe_handle = hdr->pipe_handle;
- switch (hdr->state_after_connect) {
- case PN_PIPE_DISABLE:
- enabled = 0;
- break;
- case PN_PIPE_ENABLE:
- enabled = 1;
- break;
- default:
- pep_reject_conn(sk, skb, PN_PIPE_ERR_INVALID_PARAM);
- return -EINVAL;
- }
- peer_type = hdr->other_pep_type << 8;
-
- if (unlikely(sk->sk_state != TCP_LISTEN) || sk_acceptq_is_full(sk)) {
- pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE);
- return -ENOBUFS;
- }
+ if (hdr->error_code != PN_PIPE_NO_ERROR)
+ return -ECONNREFUSED;
- /* Parse sub-blocks (options) */
+ /* Parse sub-blocks */
n_sb = hdr->data[4];
while (n_sb > 0) {
- u8 type, buf[1], len = sizeof(buf);
+ u8 type, buf[6], len = sizeof(buf);
const u8 *data = pep_get_sb(skb, &type, &len, buf);
if (data == NULL)
return -EINVAL;
+
switch (type) {
- case PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE:
- if (len < 1)
- return -EINVAL;
- peer_type = (peer_type & 0xff00) | data[0];
+ case PN_PIPE_SB_REQUIRED_FC_TX:
+ if (len < 2 || len < data[0])
+ break;
+ pn->tx_fc = pipe_negotiate_fc(data + 2, len - 2);
break;
- case PN_PIPE_SB_ALIGNED_DATA:
- aligned = data[0] != 0;
+
+ case PN_PIPE_SB_PREFERRED_FC_RX:
+ if (len < 2 || len < data[0])
+ break;
+ pn->rx_fc = pipe_negotiate_fc(data + 2, len - 2);
break;
+
}
n_sb--;
}
- skb = skb_clone(skb, GFP_ATOMIC);
- if (!skb)
- return -ENOMEM;
+ return pipe_handler_send_created_ind(sk);
+}
- /* Create a new to-be-accepted sock */
- newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_ATOMIC, sk->sk_prot);
- if (!newsk) {
- kfree_skb(skb);
- return -ENOMEM;
- }
- sock_init_data(NULL, newsk);
- newsk->sk_state = TCP_SYN_RECV;
- newsk->sk_backlog_rcv = pipe_do_rcv;
- newsk->sk_protocol = sk->sk_protocol;
- newsk->sk_destruct = pipe_destruct;
+/* Queue an skb to an actively connected sock.
+ * Socket lock must be held. */
+static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct pnpipehdr *hdr = pnp_hdr(skb);
+ int err = NET_RX_SUCCESS;
- newpn = pep_sk(newsk);
- pn_skb_get_dst_sockaddr(skb, &dst);
- newpn->pn_sk.sobject = pn_sockaddr_get_object(&dst);
- newpn->pn_sk.resource = pn->pn_sk.resource;
- skb_queue_head_init(&newpn->ctrlreq_queue);
- newpn->pipe_handle = pipe_handle;
- atomic_set(&newpn->tx_credits, 0);
- newpn->peer_type = peer_type;
- newpn->rx_credits = 0;
- newpn->rx_fc = newpn->tx_fc = PN_LEGACY_FLOW_CONTROL;
- newpn->init_enable = enabled;
- newpn->aligned = aligned;
+ switch (hdr->message_id) {
+ case PNS_PIPE_ALIGNED_DATA:
+ __skb_pull(skb, 1);
+ /* fall through */
+ case PNS_PIPE_DATA:
+ __skb_pull(skb, 3); /* Pipe data header */
+ if (!pn_flow_safe(pn->rx_fc)) {
+ err = sock_queue_rcv_skb(sk, skb);
+ if (!err)
+ return NET_RX_SUCCESS;
+ err = NET_RX_DROP;
+ break;
+ }
- BUG_ON(!skb_queue_empty(&newsk->sk_receive_queue));
- skb_queue_head(&newsk->sk_receive_queue, skb);
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
+ if (pn->rx_credits == 0) {
+ atomic_inc(&sk->sk_drops);
+ err = NET_RX_DROP;
+ break;
+ }
+ pn->rx_credits--;
+ skb->dev = NULL;
+ skb_set_owner_r(skb, sk);
+ err = skb->len;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk, err);
+ return NET_RX_SUCCESS;
- sk_acceptq_added(sk);
- sk_add_node(newsk, &pn->ackq);
- return 0;
+ case PNS_PEP_CONNECT_RESP:
+ if (sk->sk_state != TCP_SYN_SENT)
+ break;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ if (pep_connresp_rcv(sk, skb)) {
+ sk->sk_state = TCP_CLOSE_WAIT;
+ break;
+ }
+
+ sk->sk_state = TCP_ESTABLISHED;
+ if (!pn_flow_safe(pn->tx_fc)) {
+ atomic_set(&pn->tx_credits, 1);
+ sk->sk_write_space(sk);
+ }
+ pipe_grant_credits(sk, GFP_ATOMIC);
+ break;
+
+ case PNS_PEP_DISCONNECT_RESP:
+ /* sock should already be dead, nothing to do */
+ break;
+
+ case PNS_PEP_STATUS_IND:
+ pipe_rcv_status(sk, skb);
+ break;
+ }
+ kfree_skb(skb);
+ return err;
}
/* Listening sock must be locked */
@@ -847,7 +636,6 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
struct sock *sknode;
struct pnpipehdr *hdr;
struct sockaddr_pn dst;
- int err = NET_RX_SUCCESS;
u8 pipe_handle;
if (!pskb_may_pull(skb, sizeof(*hdr)))
@@ -865,26 +653,18 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
if (sknode)
return sk_receive_skb(sknode, skb, 1);
- /* Look for a pipe handle pending accept */
- sknode = pep_find_pipe(&pn->ackq, &dst, pipe_handle);
- if (sknode) {
- sock_put(sknode);
- if (net_ratelimit())
- printk(KERN_WARNING"Phonet unconnected PEP ignored");
- err = NET_RX_DROP;
- goto drop;
- }
-
switch (hdr->message_id) {
case PNS_PEP_CONNECT_REQ:
- err = pep_connreq_rcv(sk, skb);
- break;
-
-#ifdef CONFIG_PHONET_PIPECTRLR
- case PNS_PEP_CONNECT_RESP:
- err = pep_connresp_rcv(sk, skb);
- break;
-#endif
+ if (sk->sk_state != TCP_LISTEN || sk_acceptq_is_full(sk)) {
+ pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE,
+ GFP_ATOMIC);
+ break;
+ }
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ sk_acceptq_added(sk);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk, 0);
+ return NET_RX_SUCCESS;
case PNS_PEP_DISCONNECT_REQ:
pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
@@ -898,12 +678,17 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
case PNS_PEP_ENABLE_REQ:
case PNS_PEP_DISABLE_REQ:
/* invalid handle is not even allowed here! */
+ break;
+
default:
- err = NET_RX_DROP;
+ if ((1 << sk->sk_state)
+ & ~(TCPF_CLOSE|TCPF_LISTEN|TCPF_CLOSE_WAIT))
+ /* actively connected socket */
+ return pipe_handler_do_rcv(sk, skb);
}
drop:
kfree_skb(skb);
- return err;
+ return NET_RX_SUCCESS;
}
static int pipe_do_remove(struct sock *sk)
@@ -912,20 +697,16 @@ static int pipe_do_remove(struct sock *sk)
struct pnpipehdr *ph;
struct sk_buff *skb;
- skb = alloc_skb(MAX_PNPIPE_HEADER, GFP_KERNEL);
+ skb = pep_alloc_skb(sk, NULL, 0, GFP_KERNEL);
if (!skb)
return -ENOMEM;
- skb_reserve(skb, MAX_PNPIPE_HEADER);
- __skb_push(skb, sizeof(*ph));
- skb_reset_transport_header(skb);
ph = pnp_hdr(skb);
ph->utid = 0;
ph->message_id = PNS_PIPE_REMOVE_REQ;
ph->pipe_handle = pn->pipe_handle;
ph->data[0] = PAD;
-
- return pn_skb_send(sk, skb, &pipe_srv);
+ return pn_skb_send(sk, skb, NULL);
}
/* associated socket ceases to exist */
@@ -938,29 +719,15 @@ static void pep_sock_close(struct sock *sk, long timeout)
sk_common_release(sk);
lock_sock(sk);
- if (sk->sk_state == TCP_LISTEN) {
- /* Destroy the listen queue */
- struct sock *sknode;
- struct hlist_node *p, *n;
-
- sk_for_each_safe(sknode, p, n, &pn->ackq)
- sk_del_node_init(sknode);
- sk->sk_state = TCP_CLOSE;
- } else if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED))
- /* Forcefully remove dangling Phonet pipe */
- pipe_do_remove(sk);
-
-#ifdef CONFIG_PHONET_PIPECTRLR
- if (pn->pipe_state != PIPE_IDLE) {
- /* send pep disconnect request */
- pipe_handler_send_req(sk,
- PNS_PEP_DISCONNECT_UTID, PNS_PEP_DISCONNECT_REQ,
- GFP_KERNEL);
-
- pn->pipe_state = PIPE_IDLE;
- sk->sk_state = TCP_CLOSE;
+ if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED)) {
+ if (sk->sk_backlog_rcv == pipe_do_rcv)
+ /* Forcefully remove dangling Phonet pipe */
+ pipe_do_remove(sk);
+ else
+ pipe_handler_request(sk, PNS_PEP_DISCONNECT_REQ, PAD,
+ NULL, 0);
}
-#endif
+ sk->sk_state = TCP_CLOSE;
ifindex = pn->ifindex;
pn->ifindex = 0;
@@ -971,86 +738,141 @@ static void pep_sock_close(struct sock *sk, long timeout)
sock_put(sk);
}
-static int pep_wait_connreq(struct sock *sk, int noblock)
+static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
{
- struct task_struct *tsk = current;
- struct pep_sock *pn = pep_sk(sk);
- long timeo = sock_rcvtimeo(sk, noblock);
-
- for (;;) {
- DEFINE_WAIT(wait);
+ struct pep_sock *pn = pep_sk(sk), *newpn;
+ struct sock *newsk = NULL;
+ struct sk_buff *skb;
+ struct pnpipehdr *hdr;
+ struct sockaddr_pn dst, src;
+ int err;
+ u16 peer_type;
+ u8 pipe_handle, enabled, n_sb;
+ u8 aligned = 0;
- if (sk->sk_state != TCP_LISTEN)
- return -EINVAL;
- if (!hlist_empty(&pn->ackq))
- break;
- if (!timeo)
- return -EWOULDBLOCK;
- if (signal_pending(tsk))
- return sock_intr_errno(timeo);
+ skb = skb_recv_datagram(sk, 0, flags & O_NONBLOCK, errp);
+ if (!skb)
+ return NULL;
- prepare_to_wait_exclusive(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
- release_sock(sk);
- timeo = schedule_timeout(timeo);
- lock_sock(sk);
- finish_wait(sk_sleep(sk), &wait);
+ lock_sock(sk);
+ if (sk->sk_state != TCP_LISTEN) {
+ err = -EINVAL;
+ goto drop;
}
+ sk_acceptq_removed(sk);
- return 0;
-}
+ err = -EPROTO;
+ if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
+ goto drop;
-static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
-{
- struct pep_sock *pn = pep_sk(sk);
- struct sock *newsk = NULL;
- struct sk_buff *oskb;
- int err;
+ hdr = pnp_hdr(skb);
+ pipe_handle = hdr->pipe_handle;
+ switch (hdr->state_after_connect) {
+ case PN_PIPE_DISABLE:
+ enabled = 0;
+ break;
+ case PN_PIPE_ENABLE:
+ enabled = 1;
+ break;
+ default:
+ pep_reject_conn(sk, skb, PN_PIPE_ERR_INVALID_PARAM,
+ GFP_KERNEL);
+ goto drop;
+ }
+ peer_type = hdr->other_pep_type << 8;
- lock_sock(sk);
- err = pep_wait_connreq(sk, flags & O_NONBLOCK);
- if (err)
- goto out;
+ /* Parse sub-blocks (options) */
+ n_sb = hdr->data[4];
+ while (n_sb > 0) {
+ u8 type, buf[1], len = sizeof(buf);
+ const u8 *data = pep_get_sb(skb, &type, &len, buf);
- newsk = __sk_head(&pn->ackq);
+ if (data == NULL)
+ goto drop;
+ switch (type) {
+ case PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE:
+ if (len < 1)
+ goto drop;
+ peer_type = (peer_type & 0xff00) | data[0];
+ break;
+ case PN_PIPE_SB_ALIGNED_DATA:
+ aligned = data[0] != 0;
+ break;
+ }
+ n_sb--;
+ }
- oskb = skb_dequeue(&newsk->sk_receive_queue);
- err = pep_accept_conn(newsk, oskb);
- if (err) {
- skb_queue_head(&newsk->sk_receive_queue, oskb);
+ /* Check for duplicate pipe handle */
+ newsk = pep_find_pipe(&pn->hlist, &dst, pipe_handle);
+ if (unlikely(newsk)) {
+ __sock_put(newsk);
newsk = NULL;
- goto out;
+ pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_KERNEL);
+ goto drop;
+ }
+
+ /* Create a new to-be-accepted sock */
+ newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot);
+ if (!newsk) {
+ pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
+ err = -ENOBUFS;
+ goto drop;
}
- kfree_skb(oskb);
+ sock_init_data(NULL, newsk);
+ newsk->sk_state = TCP_SYN_RECV;
+ newsk->sk_backlog_rcv = pipe_do_rcv;
+ newsk->sk_protocol = sk->sk_protocol;
+ newsk->sk_destruct = pipe_destruct;
+
+ newpn = pep_sk(newsk);
+ pn_skb_get_dst_sockaddr(skb, &dst);
+ pn_skb_get_src_sockaddr(skb, &src);
+ newpn->pn_sk.sobject = pn_sockaddr_get_object(&dst);
+ newpn->pn_sk.dobject = pn_sockaddr_get_object(&src);
+ newpn->pn_sk.resource = pn_sockaddr_get_resource(&dst);
sock_hold(sk);
- pep_sk(newsk)->listener = sk;
+ newpn->listener = sk;
+ skb_queue_head_init(&newpn->ctrlreq_queue);
+ newpn->pipe_handle = pipe_handle;
+ atomic_set(&newpn->tx_credits, 0);
+ newpn->ifindex = 0;
+ newpn->peer_type = peer_type;
+ newpn->rx_credits = 0;
+ newpn->rx_fc = newpn->tx_fc = PN_LEGACY_FLOW_CONTROL;
+ newpn->init_enable = enabled;
+ newpn->aligned = aligned;
- sock_hold(newsk);
- sk_del_node_init(newsk);
- sk_acceptq_removed(sk);
+ err = pep_accept_conn(newsk, skb);
+ if (err) {
+ sock_put(newsk);
+ newsk = NULL;
+ goto drop;
+ }
sk_add_node(newsk, &pn->hlist);
- __sock_put(newsk);
-
-out:
+drop:
release_sock(sk);
+ kfree_skb(skb);
*errp = err;
return newsk;
}
-#ifdef CONFIG_PHONET_PIPECTRLR
static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
{
struct pep_sock *pn = pep_sk(sk);
- struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
-
- memcpy(&pn->remote_pep, spn, sizeof(struct sockaddr_pn));
+ int err;
+ u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD };
- return pipe_handler_send_req(sk,
- PNS_PEP_CONNECT_UTID, PNS_PEP_CONNECT_REQ,
- GFP_ATOMIC);
+ pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+ err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ,
+ PN_PIPE_ENABLE, data, 4);
+ if (err) {
+ pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
+ return err;
+ }
+ sk->sk_state = TCP_SYN_SENT;
+ return 0;
}
-#endif
static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
@@ -1081,10 +903,18 @@ static int pep_init(struct sock *sk)
{
struct pep_sock *pn = pep_sk(sk);
- INIT_HLIST_HEAD(&pn->ackq);
+ sk->sk_destruct = pipe_destruct;
INIT_HLIST_HEAD(&pn->hlist);
+ pn->listener = NULL;
skb_queue_head_init(&pn->ctrlreq_queue);
+ atomic_set(&pn->tx_credits, 0);
+ pn->ifindex = 0;
+ pn->peer_type = 0;
pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
+ pn->rx_credits = 0;
+ pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL;
+ pn->init_enable = 1;
+ pn->aligned = 0;
return 0;
}
@@ -1103,18 +933,6 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
lock_sock(sk);
switch (optname) {
-#ifdef CONFIG_PHONET_PIPECTRLR
- case PNPIPE_PIPE_HANDLE:
- if (val) {
- if (pn->pipe_state > PIPE_IDLE) {
- err = -EFAULT;
- break;
- }
- pn->pipe_handle = val;
- break;
- }
-#endif
-
case PNPIPE_ENCAP:
if (val && val != PNPIPE_ENCAP_IP) {
err = -EINVAL;
@@ -1141,16 +959,6 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
}
goto out_norel;
-#ifdef CONFIG_PHONET_PIPECTRLR
- case PNPIPE_ENABLE:
- if (pn->pipe_state <= PIPE_IDLE) {
- err = -ENOTCONN;
- break;
- }
- err = pipe_handler_enable_pipe(sk, val);
- break;
-#endif
-
default:
err = -ENOPROTOOPT;
}
@@ -1180,13 +988,11 @@ static int pep_getsockopt(struct sock *sk, int level, int optname,
val = pn->ifindex;
break;
-#ifdef CONFIG_PHONET_PIPECTRLR
- case PNPIPE_ENABLE:
- if (pn->pipe_state <= PIPE_IDLE)
- return -ENOTCONN;
- val = pn->pipe_state != PIPE_DISABLED;
+ case PNPIPE_HANDLE:
+ val = pn->pipe_handle;
+ if (val == PN_PIPE_INVALID_HANDLE)
+ return -EINVAL;
break;
-#endif
default:
return -ENOPROTOOPT;
@@ -1222,11 +1028,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb)
} else
ph->message_id = PNS_PIPE_DATA;
ph->pipe_handle = pn->pipe_handle;
-#ifdef CONFIG_PHONET_PIPECTRLR
- err = pn_skb_send(sk, skb, &pn->remote_pep);
-#else
- err = pn_skb_send(sk, skb, &pipe_srv);
-#endif
+ err = pn_skb_send(sk, skb, NULL);
if (err && pn_flow_safe(pn->tx_fc))
atomic_inc(&pn->tx_credits);
@@ -1355,7 +1157,7 @@ struct sk_buff *pep_read(struct sock *sk)
struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue);
if (sk->sk_state == TCP_ESTABLISHED)
- pipe_grant_credits(sk);
+ pipe_grant_credits(sk, GFP_ATOMIC);
return skb;
}
@@ -1400,7 +1202,7 @@ static int pep_recvmsg(struct kiocb *iocb, struct sock *sk,
}
if (sk->sk_state == TCP_ESTABLISHED)
- pipe_grant_credits(sk);
+ pipe_grant_credits(sk, GFP_KERNEL);
release_sock(sk);
copy:
msg->msg_flags |= MSG_EOR;
@@ -1424,9 +1226,9 @@ static void pep_sock_unhash(struct sock *sk)
lock_sock(sk);
-#ifndef CONFIG_PHONET_PIPECTRLR
- if ((1 << sk->sk_state) & ~(TCPF_CLOSE|TCPF_LISTEN)) {
+ if (pn->listener != NULL) {
skparent = pn->listener;
+ pn->listener = NULL;
release_sock(sk);
pn = pep_sk(skparent);
@@ -1434,7 +1236,7 @@ static void pep_sock_unhash(struct sock *sk)
sk_del_node_init(sk);
sk = skparent;
}
-#endif
+
/* Unhash a listening sock only when it is closed
* and all of its active connected pipes are closed. */
if (hlist_empty(&pn->hlist))
@@ -1448,9 +1250,7 @@ static void pep_sock_unhash(struct sock *sk)
static struct proto pep_proto = {
.close = pep_sock_close,
.accept = pep_sock_accept,
-#ifdef CONFIG_PHONET_PIPECTRLR
.connect = pep_sock_connect,
-#endif
.ioctl = pep_ioctl,
.init = pep_init,
.setsockopt = pep_setsockopt,
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 25f746d20c1..b1adafab377 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -225,15 +225,18 @@ static int pn_socket_autobind(struct socket *sock)
return 0; /* socket was already bound */
}
-#ifdef CONFIG_PHONET_PIPECTRLR
static int pn_socket_connect(struct socket *sock, struct sockaddr *addr,
int len, int flags)
{
struct sock *sk = sock->sk;
+ struct pn_sock *pn = pn_sk(sk);
struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
- long timeo;
+ struct task_struct *tsk = current;
+ long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
int err;
+ if (pn_socket_autobind(sock))
+ return -ENOBUFS;
if (len < sizeof(struct sockaddr_pn))
return -EINVAL;
if (spn->spn_family != AF_PHONET)
@@ -243,82 +246,61 @@ static int pn_socket_connect(struct socket *sock, struct sockaddr *addr,
switch (sock->state) {
case SS_UNCONNECTED:
- sk->sk_state = TCP_CLOSE;
- break;
- case SS_CONNECTING:
- switch (sk->sk_state) {
- case TCP_SYN_RECV:
- sock->state = SS_CONNECTED;
+ if (sk->sk_state != TCP_CLOSE) {
err = -EISCONN;
goto out;
- case TCP_CLOSE:
- err = -EALREADY;
- if (flags & O_NONBLOCK)
- goto out;
- goto wait_connect;
}
break;
- case SS_CONNECTED:
- switch (sk->sk_state) {
- case TCP_SYN_RECV:
- err = -EISCONN;
- goto out;
- case TCP_CLOSE:
- sock->state = SS_UNCONNECTED;
- break;
- }
- break;
- case SS_DISCONNECTING:
- case SS_FREE:
- break;
+ case SS_CONNECTING:
+ err = -EALREADY;
+ goto out;
+ default:
+ err = -EISCONN;
+ goto out;
}
- sk->sk_state = TCP_CLOSE;
- sk_stream_kill_queues(sk);
+ pn->dobject = pn_sockaddr_get_object(spn);
+ pn->resource = pn_sockaddr_get_resource(spn);
sock->state = SS_CONNECTING;
+
err = sk->sk_prot->connect(sk, addr, len);
- if (err < 0) {
+ if (err) {
sock->state = SS_UNCONNECTED;
- sk->sk_state = TCP_CLOSE;
+ pn->dobject = 0;
goto out;
}
- err = -EINPROGRESS;
-wait_connect:
- if (sk->sk_state != TCP_SYN_RECV && (flags & O_NONBLOCK))
- goto out;
-
- timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
- release_sock(sk);
-
- err = -ERESTARTSYS;
- timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
- sk->sk_state != TCP_CLOSE,
- timeo);
-
- lock_sock(sk);
- if (timeo < 0)
- goto out; /* -ERESTARTSYS */
+ while (sk->sk_state == TCP_SYN_SENT) {
+ DEFINE_WAIT(wait);
- err = -ETIMEDOUT;
- if (timeo == 0 && sk->sk_state != TCP_SYN_RECV)
- goto out;
+ if (!timeo) {
+ err = -EINPROGRESS;
+ goto out;
+ }
+ if (signal_pending(tsk)) {
+ err = sock_intr_errno(timeo);
+ goto out;
+ }
- if (sk->sk_state != TCP_SYN_RECV) {
- sock->state = SS_UNCONNECTED;
- err = sock_error(sk);
- if (!err)
- err = -ECONNREFUSED;
- goto out;
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ finish_wait(sk_sleep(sk), &wait);
}
- sock->state = SS_CONNECTED;
- err = 0;
+ if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED))
+ err = 0;
+ else if (sk->sk_state == TCP_CLOSE_WAIT)
+ err = -ECONNRESET;
+ else
+ err = -ECONNREFUSED;
+ sock->state = err ? SS_UNCONNECTED : SS_CONNECTED;
out:
release_sock(sk);
return err;
}
-#endif
static int pn_socket_accept(struct socket *sock, struct socket *newsock,
int flags)
@@ -327,6 +309,9 @@ static int pn_socket_accept(struct socket *sock, struct socket *newsock,
struct sock *newsk;
int err;
+ if (unlikely(sk->sk_state != TCP_LISTEN))
+ return -EINVAL;
+
newsk = sk->sk_prot->accept(sk, flags, &err);
if (!newsk)
return err;
@@ -363,13 +348,8 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock,
poll_wait(file, sk_sleep(sk), wait);
- switch (sk->sk_state) {
- case TCP_LISTEN:
- return hlist_empty(&pn->ackq) ? 0 : POLLIN;
- case TCP_CLOSE:
+ if (sk->sk_state == TCP_CLOSE)
return POLLERR;
- }
-
if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
if (!skb_queue_empty(&pn->ctrlreq_queue))
@@ -428,19 +408,19 @@ static int pn_socket_listen(struct socket *sock, int backlog)
struct sock *sk = sock->sk;
int err = 0;
- if (sock->state != SS_UNCONNECTED)
- return -EINVAL;
if (pn_socket_autobind(sock))
return -ENOBUFS;
lock_sock(sk);
- if (sk->sk_state != TCP_CLOSE) {
+ if (sock->state != SS_UNCONNECTED) {
err = -EINVAL;
goto out;
}
- sk->sk_state = TCP_LISTEN;
- sk->sk_ack_backlog = 0;
+ if (sk->sk_state != TCP_LISTEN) {
+ sk->sk_state = TCP_LISTEN;
+ sk->sk_ack_backlog = 0;
+ }
sk->sk_max_ack_backlog = backlog;
out:
release_sock(sk);
@@ -488,11 +468,7 @@ const struct proto_ops phonet_stream_ops = {
.owner = THIS_MODULE,
.release = pn_socket_release,
.bind = pn_socket_bind,
-#ifdef CONFIG_PHONET_PIPECTRLR
.connect = pn_socket_connect,
-#else
- .connect = sock_no_connect,
-#endif
.socketpair = sock_no_socketpair,
.accept = pn_socket_accept,
.getname = pn_socket_getname,
@@ -633,8 +609,8 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu "
"%d %p %d%n",
- sk->sk_protocol, pn->sobject, 0, pn->resource,
- sk->sk_state,
+ sk->sk_protocol, pn->sobject, pn->dobject,
+ pn->resource, sk->sk_state,
sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk),
sock_i_uid(sk), sock_i_ino(sk),
atomic_read(&sk->sk_refcnt), sk,
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 71f373c421b..c47a511f203 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -551,7 +551,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
if (conn->c_loopback
&& rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
- return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+ scat = &rm->data.op_sg[sg];
+ ret = sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+ ret = min_t(int, ret, scat->length - conn->c_xmit_data_off);
+ return ret;
}
/* FIXME we may overallocate here */
diff --git a/net/rds/loop.c b/net/rds/loop.c
index aeec1d483b1..bca6761a3ca 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -61,10 +61,15 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg,
unsigned int off)
{
+ struct scatterlist *sgp = &rm->data.op_sg[sg];
+ int ret = sizeof(struct rds_header) +
+ be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
/* Do not send cong updates to loopback */
if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
- return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+ ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off);
+ goto out;
}
BUG_ON(hdr_off || sg || off);
@@ -80,8 +85,8 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
NULL);
rds_inc_put(&rm->m_inc);
-
- return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
+out:
+ return ret;
}
/*
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 9542449c072..da8adac2bf0 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,7 +50,6 @@ rdsdebug(char *fmt, ...)
#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
#define RDS_CONG_MAP_BYTES (65536 / 8)
-#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index eaf76587645..7fce6dfd218 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -18,7 +18,7 @@ config RFKILL_LEDS
default y
config RFKILL_INPUT
- bool "RF switch input support" if EMBEDDED
+ bool "RF switch input support" if EXPERT
depends on RFKILL
depends on INPUT = y || RFKILL = INPUT
- default y if !EMBEDDED
+ default y if !EXPERT
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index d952e7eac18..5ee0c62046a 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -803,7 +803,6 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
rose_insert_socket(sk); /* Finish the bind */
}
-rose_try_next_neigh:
rose->dest_addr = addr->srose_addr;
rose->dest_call = addr->srose_call;
rose->rand = ((long)rose & 0xFFFF) + rose->lci;
@@ -865,12 +864,6 @@ rose_try_next_neigh:
}
if (sk->sk_state != TCP_ESTABLISHED) {
- /* Try next neighbour */
- rose->neighbour = rose_get_neigh(&addr->srose_addr, &cause, &diagnostic, 0);
- if (rose->neighbour)
- goto rose_try_next_neigh;
-
- /* No more neighbours */
sock->state = SS_UNCONNECTED;
err = sock_error(sk); /* Always set at this point */
goto out_release;
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index b4fdaac233f..88a77e90e7e 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -674,29 +674,34 @@ struct rose_route *rose_route_free_lci(unsigned int lci, struct rose_neigh *neig
* Find a neighbour or a route given a ROSE address.
*/
struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause,
- unsigned char *diagnostic, int new)
+ unsigned char *diagnostic, int route_frame)
{
struct rose_neigh *res = NULL;
struct rose_node *node;
int failed = 0;
int i;
- if (!new) spin_lock_bh(&rose_node_list_lock);
+ if (!route_frame) spin_lock_bh(&rose_node_list_lock);
for (node = rose_node_list; node != NULL; node = node->next) {
if (rosecmpm(addr, &node->address, node->mask) == 0) {
for (i = 0; i < node->count; i++) {
- if (new) {
- if (node->neighbour[i]->restarted) {
- res = node->neighbour[i];
- goto out;
- }
+ if (node->neighbour[i]->restarted) {
+ res = node->neighbour[i];
+ goto out;
}
- else {
+ }
+ }
+ }
+ if (!route_frame) { /* connect request */
+ for (node = rose_node_list; node != NULL; node = node->next) {
+ if (rosecmpm(addr, &node->address, node->mask) == 0) {
+ for (i = 0; i < node->count; i++) {
if (!rose_ftimer_running(node->neighbour[i])) {
res = node->neighbour[i];
+ failed = 0;
goto out;
- } else
- failed = 1;
+ }
+ failed = 1;
}
}
}
@@ -711,8 +716,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause,
}
out:
- if (!new) spin_unlock_bh(&rose_node_list_lock);
-
+ if (!route_frame) spin_unlock_bh(&rose_node_list_lock);
return res;
}
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
index 89315009bab..1a2b0633fec 100644
--- a/net/rxrpc/ar-input.c
+++ b/net/rxrpc/ar-input.c
@@ -423,6 +423,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
goto protocol_error;
}
+ case RXRPC_PACKET_TYPE_ACKALL:
case RXRPC_PACKET_TYPE_ACK:
/* ACK processing is done in process context */
read_lock_bh(&call->state_lock);
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
index 5ee16f0353f..d763793d39d 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/ar-key.c
@@ -89,11 +89,11 @@ static int rxrpc_instantiate_xdr_rxkad(struct key *key, const __be32 *xdr,
return ret;
plen -= sizeof(*token);
- token = kmalloc(sizeof(*token), GFP_KERNEL);
+ token = kzalloc(sizeof(*token), GFP_KERNEL);
if (!token)
return -ENOMEM;
- token->kad = kmalloc(plen, GFP_KERNEL);
+ token->kad = kzalloc(plen, GFP_KERNEL);
if (!token->kad) {
kfree(token);
return -ENOMEM;
@@ -731,10 +731,10 @@ static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen)
goto error;
ret = -ENOMEM;
- token = kmalloc(sizeof(*token), GFP_KERNEL);
+ token = kzalloc(sizeof(*token), GFP_KERNEL);
if (!token)
goto error;
- token->kad = kmalloc(plen, GFP_KERNEL);
+ token->kad = kzalloc(plen, GFP_KERNEL);
if (!token->kad)
goto error_free;
diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c
index a53fb25a64e..55b93dc60d0 100644
--- a/net/rxrpc/ar-peer.c
+++ b/net/rxrpc/ar-peer.c
@@ -36,31 +36,15 @@ static void rxrpc_destroy_peer(struct work_struct *work);
static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
{
struct rtable *rt;
- struct flowi fl;
- int ret;
peer->if_mtu = 1500;
- memset(&fl, 0, sizeof(fl));
-
- switch (peer->srx.transport.family) {
- case AF_INET:
- fl.oif = 0;
- fl.proto = IPPROTO_UDP,
- fl.fl4_dst = peer->srx.transport.sin.sin_addr.s_addr;
- fl.fl4_src = 0;
- fl.fl4_tos = 0;
- /* assume AFS.CM talking to AFS.FS */
- fl.fl_ip_sport = htons(7001);
- fl.fl_ip_dport = htons(7000);
- break;
- default:
- BUG();
- }
-
- ret = ip_route_output_key(&init_net, &rt, &fl);
- if (ret < 0) {
- _leave(" [route err %d]", ret);
+ rt = ip_route_output_ports(&init_net, NULL,
+ peer->srx.transport.sin.sin_addr.s_addr, 0,
+ htons(7000), htons(7001),
+ IPPROTO_UDP, 0, 0);
+ if (IS_ERR(rt)) {
+ _leave(" [route err %ld]", PTR_ERR(rt));
return;
}
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index f04d4a484d5..a7a5583d4f6 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -126,6 +126,17 @@ config NET_SCH_RED
To compile this code as a module, choose M here: the
module will be called sch_red.
+config NET_SCH_SFB
+ tristate "Stochastic Fair Blue (SFB)"
+ ---help---
+ Say Y here if you want to use the Stochastic Fair Blue (SFB)
+ packet scheduling algorithm.
+
+ See the top of <file:net/sched/sch_sfb.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_sfb.
+
config NET_SCH_SFQ
tristate "Stochastic Fairness Queueing (SFQ)"
---help---
@@ -205,6 +216,29 @@ config NET_SCH_DRR
If unsure, say N.
+config NET_SCH_MQPRIO
+ tristate "Multi-queue priority scheduler (MQPRIO)"
+ help
+ Say Y here if you want to use the Multi-queue Priority scheduler.
+ This scheduler allows QOS to be offloaded on NICs that have support
+ for offloading QOS schedulers.
+
+ To compile this driver as a module, choose M here: the module will
+ be called sch_mqprio.
+
+ If unsure, say N.
+
+config NET_SCH_CHOKE
+ tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+ help
+ Say Y here if you want to use the CHOKe packet scheduler (CHOose
+ and Keep for responsive flows, CHOose and Kill for unresponsive
+ flows). This is a variation of RED which tries to penalize flows
+ that monopolize the queue.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_choke.
+
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
@@ -243,7 +277,7 @@ config NET_CLS_TCINDEX
config NET_CLS_ROUTE4
tristate "Routing decision (ROUTE)"
- select NET_CLS_ROUTE
+ select IP_ROUTE_CLASSID
select NET_CLS
---help---
If you say Y here, you will be able to classify packets
@@ -252,9 +286,6 @@ config NET_CLS_ROUTE4
To compile this code as a module, choose M here: the
module will be called cls_route.
-config NET_CLS_ROUTE
- bool
-
config NET_CLS_FW
tristate "Netfilter mark (FW)"
select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 960f5dba630..2e77b8dba22 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
+obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
@@ -32,6 +33,9 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
+obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
+
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 23b25f89e7e..15873e14cb5 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -78,7 +78,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
struct tc_action *a, struct tcf_hashinfo *hinfo)
{
struct tcf_common *p;
- int err = 0, index = -1,i = 0, s_i = 0, n_i = 0;
+ int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
struct nlattr *nest;
read_lock_bh(hinfo->lock);
@@ -126,7 +126,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,
{
struct tcf_common *p, *s_p;
struct nlattr *nest;
- int i= 0, n_i = 0;
+ int i = 0, n_i = 0;
nest = nla_nest_start(skb, a->order);
if (nest == NULL)
@@ -138,7 +138,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,
while (p != NULL) {
s_p = p->tcfc_next;
if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo))
- module_put(a->ops->owner);
+ module_put(a->ops->owner);
n_i++;
p = s_p;
}
@@ -447,7 +447,8 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
- if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) {
+ err = tcf_action_dump_old(skb, a, bind, ref);
+ if (err > 0) {
nla_nest_end(skb, nest);
return err;
}
@@ -491,7 +492,7 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
struct tc_action *a;
struct tc_action_ops *a_o;
char act_name[IFNAMSIZ];
- struct nlattr *tb[TCA_ACT_MAX+1];
+ struct nlattr *tb[TCA_ACT_MAX + 1];
struct nlattr *kind;
int err;
@@ -549,9 +550,9 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
goto err_free;
/* module count goes up only when brand new policy is created
- if it exists and is only bound to in a_o->init() then
- ACT_P_CREATED is not returned (a zero is).
- */
+ * if it exists and is only bound to in a_o->init() then
+ * ACT_P_CREATED is not returned (a zero is).
+ */
if (err != ACT_P_CREATED)
module_put(a_o->owner);
a->ops = a_o;
@@ -569,7 +570,7 @@ err_out:
struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind)
{
- struct nlattr *tb[TCA_ACT_MAX_PRIO+1];
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *head = NULL, *act, *act_prev = NULL;
int err;
int i;
@@ -697,7 +698,7 @@ act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n,
static struct tc_action *
tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid)
{
- struct nlattr *tb[TCA_ACT_MAX+1];
+ struct nlattr *tb[TCA_ACT_MAX + 1];
struct tc_action *a;
int index;
int err;
@@ -770,7 +771,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
struct tcamsg *t;
struct netlink_callback dcb;
struct nlattr *nest;
- struct nlattr *tb[TCA_ACT_MAX+1];
+ struct nlattr *tb[TCA_ACT_MAX + 1];
struct nlattr *kind;
struct tc_action *a = create_a(0);
int err = -ENOMEM;
@@ -821,7 +822,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
nlh->nlmsg_flags |= NLM_F_ROOT;
module_put(a->ops->owner);
kfree(a);
- err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ err = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
if (err > 0)
return 0;
@@ -842,14 +844,14 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
u32 pid, int event)
{
int i, ret;
- struct nlattr *tb[TCA_ACT_MAX_PRIO+1];
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *head = NULL, *act, *act_prev = NULL;
ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);
if (ret < 0)
return ret;
- if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) {
+ if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
if (tb[1] != NULL)
return tca_action_flush(net, tb[1], n, pid);
else
@@ -892,7 +894,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
/* now do the delete */
tcf_action_destroy(head, 0);
ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
- n->nlmsg_flags&NLM_F_ECHO);
+ n->nlmsg_flags & NLM_F_ECHO);
if (ret > 0)
return 0;
return ret;
@@ -936,7 +938,7 @@ static int tcf_add_notify(struct net *net, struct tc_action *a,
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
NETLINK_CB(skb).dst_group = RTNLGRP_TC;
- err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags&NLM_F_ECHO);
+ err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags & NLM_F_ECHO);
if (err > 0)
err = 0;
return err;
@@ -967,7 +969,7 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
/* dump then free all the actions after update; inserted policy
* stays intact
- * */
+ */
ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags);
for (a = act; a; a = act) {
act = a->next;
@@ -993,8 +995,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
return -EINVAL;
}
- /* n->nlmsg_flags&NLM_F_CREATE
- * */
+ /* n->nlmsg_flags & NLM_F_CREATE */
switch (n->nlmsg_type) {
case RTM_NEWACTION:
/* we are going to assume all other flags
@@ -1003,7 +1004,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
* but since we want avoid ambiguity (eg when flags
* is zero) then just set this
*/
- if (n->nlmsg_flags&NLM_F_REPLACE)
+ if (n->nlmsg_flags & NLM_F_REPLACE)
ovr = 1;
replay:
ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr);
@@ -1028,7 +1029,7 @@ replay:
static struct nlattr *
find_dump_kind(const struct nlmsghdr *n)
{
- struct nlattr *tb1, *tb2[TCA_ACT_MAX+1];
+ struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct nlattr *nla[TCAA_MAX + 1];
struct nlattr *kind;
@@ -1071,9 +1072,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
}
a_o = tc_lookup_action(kind);
- if (a_o == NULL) {
+ if (a_o == NULL)
return 0;
- }
memset(&a, 0, sizeof(struct tc_action));
a.ops = a_o;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 83ddfc07e45..6cdf9abe475 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -63,7 +63,7 @@ static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,
if (nla == NULL)
return -EINVAL;
- err = nla_parse_nested(tb, TCA_CSUM_MAX, nla,csum_policy);
+ err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy);
if (err < 0)
return err;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index c2ed90a4c0b..2b4ab4b05ce 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -50,7 +50,7 @@ static int gact_determ(struct tcf_gact *gact)
}
typedef int (*g_rand)(struct tcf_gact *gact);
-static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ };
+static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ };
#endif /* CONFIG_GACT_PROB */
static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
@@ -89,7 +89,7 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(parm->index, est, a, sizeof(*gact),
bind, &gact_idx_gen, &gact_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
ret = ACT_P_CREATED;
} else {
if (!ovr) {
@@ -205,9 +205,9 @@ MODULE_LICENSE("GPL");
static int __init gact_init_module(void)
{
#ifdef CONFIG_GACT_PROB
- printk(KERN_INFO "GACT probability on\n");
+ pr_info("GACT probability on\n");
#else
- printk(KERN_INFO "GACT probability NOT on\n");
+ pr_info("GACT probability NOT on\n");
#endif
return tcf_register_action(&act_gact_ops);
}
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index c2a7c20e81c..9fc211a1b20 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -138,7 +138,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind,
&ipt_idx_gen, &ipt_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
ret = ACT_P_CREATED;
} else {
if (!ovr) {
@@ -162,7 +162,8 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
if (unlikely(!t))
goto err2;
- if ((err = ipt_init_target(t, tname, hook)) < 0)
+ err = ipt_init_target(t, tname, hook);
+ if (err < 0)
goto err3;
spin_lock_bh(&ipt->tcf_lock);
@@ -212,8 +213,9 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
bstats_update(&ipt->tcf_bstats, skb);
/* yes, we have to worry about both in and out dev
- worry later - danger - this API seems to have changed
- from earlier kernels */
+ * worry later - danger - this API seems to have changed
+ * from earlier kernels
+ */
par.in = skb->dev;
par.out = NULL;
par.hooknum = ipt->tcfi_hook;
@@ -253,9 +255,9 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
struct tc_cnt c;
/* for simple targets kernel size == user size
- ** user name = target name
- ** for foolproof you need to not assume this
- */
+ * user name = target name
+ * for foolproof you need to not assume this
+ */
t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
if (unlikely(!t))
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index d765067e99d..961386e2f2c 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -41,13 +41,13 @@ static struct tcf_hashinfo mirred_hash_info = {
.lock = &mirred_lock,
};
-static inline int tcf_mirred_release(struct tcf_mirred *m, int bind)
+static int tcf_mirred_release(struct tcf_mirred *m, int bind)
{
if (m) {
if (bind)
m->tcf_bindcnt--;
m->tcf_refcnt--;
- if(!m->tcf_bindcnt && m->tcf_refcnt <= 0) {
+ if (!m->tcf_bindcnt && m->tcf_refcnt <= 0) {
list_del(&m->tcfm_list);
if (m->tcfm_dev)
dev_put(m->tcfm_dev);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 178a4bd7b7c..762b027650a 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -69,7 +69,7 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
&nat_idx_gen, &nat_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
p = to_tcf_nat(pc);
ret = ACT_P_CREATED;
} else {
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 445bef716f7..50c7c06c019 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -70,7 +70,7 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
&pedit_idx_gen, &pedit_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
p = to_pedit(pc);
keys = kmalloc(ksize, GFP_KERNEL);
if (keys == NULL) {
@@ -127,11 +127,9 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
int i, munged = 0;
unsigned int off;
- if (skb_cloned(skb)) {
- if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
- return p->tcf_action;
- }
- }
+ if (skb_cloned(skb) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ return p->tcf_action;
off = skb_network_offset(skb);
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index e2f08b1e2e5..8a1630774fd 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -22,8 +22,8 @@
#include <net/act_api.h>
#include <net/netlink.h>
-#define L2T(p,L) qdisc_l2t((p)->tcfp_R_tab, L)
-#define L2T_P(p,L) qdisc_l2t((p)->tcfp_P_tab, L)
+#define L2T(p, L) qdisc_l2t((p)->tcfp_R_tab, L)
+#define L2T_P(p, L) qdisc_l2t((p)->tcfp_P_tab, L)
#define POL_TAB_MASK 15
static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1];
@@ -37,8 +37,7 @@ static struct tcf_hashinfo police_hash_info = {
};
/* old policer structure from before tc actions */
-struct tc_police_compat
-{
+struct tc_police_compat {
u32 index;
int action;
u32 limit;
@@ -139,7 +138,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est,
struct tc_action *a, int ovr, int bind)
{
- unsigned h;
+ unsigned int h;
int ret = 0, err;
struct nlattr *tb[TCA_POLICE_MAX + 1];
struct tc_police *parm;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 7287cff7af3..a34a22de60b 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -47,7 +47,7 @@ static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result
/* print policy string followed by _ then packet count
* Example if this was the 3rd packet and the string was "hello"
* then it would look like "hello_3" (without quotes)
- **/
+ */
pr_info("simple: %s_%d\n",
(char *)d->tcfd_defdata, d->tcf_bstats.packets);
spin_unlock(&d->tcf_lock);
@@ -125,7 +125,7 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
&simp_idx_gen, &simp_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
d = to_defact(pc);
ret = alloc_defdata(d, defdata);
@@ -149,7 +149,7 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
return ret;
}
-static inline int tcf_simp_cleanup(struct tc_action *a, int bind)
+static int tcf_simp_cleanup(struct tc_action *a, int bind)
{
struct tcf_defact *d = a->priv;
@@ -158,8 +158,8 @@ static inline int tcf_simp_cleanup(struct tc_action *a, int bind)
return 0;
}
-static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
- int bind, int ref)
+static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_defact *d = a->priv;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 836f5fee9e5..5f6f0c7c390 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -113,7 +113,7 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
&skbedit_idx_gen, &skbedit_hash_info);
if (IS_ERR(pc))
- return PTR_ERR(pc);
+ return PTR_ERR(pc);
d = to_skbedit(pc);
ret = ACT_P_CREATED;
@@ -144,7 +144,7 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
return ret;
}
-static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind)
+static int tcf_skbedit_cleanup(struct tc_action *a, int bind)
{
struct tcf_skbedit *d = a->priv;
@@ -153,8 +153,8 @@ static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind)
return 0;
}
-static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
- int bind, int ref)
+static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_skbedit *d = a->priv;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 5fd0c28ef79..bb2c523f815 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -85,7 +85,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
int rc = -ENOENT;
write_lock(&cls_mod_lock);
- for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next)
+ for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next)
if (t == ops)
break;
@@ -111,7 +111,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
u32 first = TC_H_MAKE(0xC0000000U, 0U);
if (tp)
- first = tp->prio-1;
+ first = tp->prio - 1;
return first;
}
@@ -149,7 +149,8 @@ replay:
if (prio == 0) {
/* If no priority is given, user wants we allocated it. */
- if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
+ if (n->nlmsg_type != RTM_NEWTFILTER ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
prio = TC_H_MAKE(0x80000000U, 0U);
}
@@ -176,7 +177,8 @@ replay:
}
/* Is it classful? */
- if ((cops = q->ops->cl_ops) == NULL)
+ cops = q->ops->cl_ops;
+ if (!cops)
return -EINVAL;
if (cops->tcf_chain == NULL)
@@ -196,10 +198,11 @@ replay:
goto errout;
/* Check the chain for existence of proto-tcf with this priority */
- for (back = chain; (tp=*back) != NULL; back = &tp->next) {
+ for (back = chain; (tp = *back) != NULL; back = &tp->next) {
if (tp->prio >= prio) {
if (tp->prio == prio) {
- if (!nprio || (tp->protocol != protocol && protocol))
+ if (!nprio ||
+ (tp->protocol != protocol && protocol))
goto errout;
} else
tp = NULL;
@@ -216,7 +219,8 @@ replay:
goto errout;
err = -ENOENT;
- if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
+ if (n->nlmsg_type != RTM_NEWTFILTER ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
goto errout;
@@ -420,7 +424,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
return skb->len;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return skb->len;
if (!tcm->tcm_parent)
@@ -429,7 +434,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
if (!q)
goto out;
- if ((cops = q->ops->cl_ops) == NULL)
+ cops = q->ops->cl_ops;
+ if (!cops)
goto errout;
if (cops->tcf_chain == NULL)
goto errout;
@@ -444,8 +450,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
s_t = cb->args[0];
- for (tp=*chain, t=0; tp; tp = tp->next, t++) {
- if (t < s_t) continue;
+ for (tp = *chain, t = 0; tp; tp = tp->next, t++) {
+ if (t < s_t)
+ continue;
if (TC_H_MAJ(tcm->tcm_info) &&
TC_H_MAJ(tcm->tcm_info) != tp->prio)
continue;
@@ -468,10 +475,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
arg.skb = skb;
arg.cb = cb;
arg.w.stop = 0;
- arg.w.skip = cb->args[1]-1;
+ arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
tp->ops->walk(tp, &arg.w);
- cb->args[1] = arg.w.count+1;
+ cb->args[1] = arg.w.count + 1;
if (arg.w.stop)
break;
}
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index f23d9155b1e..8be8872dd57 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -21,14 +21,12 @@
#include <net/act_api.h>
#include <net/pkt_cls.h>
-struct basic_head
-{
+struct basic_head {
u32 hgenerator;
struct list_head flist;
};
-struct basic_filter
-{
+struct basic_filter {
u32 handle;
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
@@ -92,8 +90,7 @@ static int basic_init(struct tcf_proto *tp)
return 0;
}
-static inline void basic_delete_filter(struct tcf_proto *tp,
- struct basic_filter *f)
+static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f)
{
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
@@ -135,9 +132,9 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
[TCA_BASIC_EMATCHES] = { .type = NLA_NESTED },
};
-static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
- unsigned long base, struct nlattr **tb,
- struct nlattr *est)
+static int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
+ unsigned long base, struct nlattr **tb,
+ struct nlattr *est)
{
int err = -EINVAL;
struct tcf_exts e;
@@ -203,7 +200,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
} while (--i > 0 && basic_get(tp, head->hgenerator));
if (i <= 0) {
- printk(KERN_ERR "Insufficient number of handles\n");
+ pr_err("Insufficient number of handles\n");
goto errout;
}
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index d49c40fb7e0..32a335194ca 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -56,7 +56,8 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
{
struct cgroup_cls_state *cs;
- if (!(cs = kzalloc(sizeof(*cs), GFP_KERNEL)))
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
return ERR_PTR(-ENOMEM);
if (cgrp->parent)
@@ -94,8 +95,7 @@ static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
}
-struct cls_cgroup_head
-{
+struct cls_cgroup_head {
u32 handle;
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
@@ -166,7 +166,7 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
unsigned long *arg)
{
- struct nlattr *tb[TCA_CGROUP_MAX+1];
+ struct nlattr *tb[TCA_CGROUP_MAX + 1];
struct cls_cgroup_head *head = tp->root;
struct tcf_ematch_tree t;
struct tcf_exts e;
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 5b271a18bc3..8ec01391d98 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -121,7 +121,7 @@ static u32 flow_get_proto_src(struct sk_buff *skb)
if (!pskb_network_may_pull(skb, sizeof(*iph)))
break;
iph = ip_hdr(skb);
- if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+ if (iph->frag_off & htons(IP_MF | IP_OFFSET))
break;
poff = proto_ports_offset(iph->protocol);
if (poff >= 0 &&
@@ -163,7 +163,7 @@ static u32 flow_get_proto_dst(struct sk_buff *skb)
if (!pskb_network_may_pull(skb, sizeof(*iph)))
break;
iph = ip_hdr(skb);
- if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+ if (iph->frag_off & htons(IP_MF | IP_OFFSET))
break;
poff = proto_ports_offset(iph->protocol);
if (poff >= 0 &&
@@ -276,7 +276,7 @@ fallback:
static u32 flow_get_rtclassid(const struct sk_buff *skb)
{
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (skb_dst(skb))
return skb_dst(skb)->tclassid;
#endif
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 93b0a7b6f9b..26e7bc4ffb7 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -31,14 +31,12 @@
#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
-struct fw_head
-{
+struct fw_head {
struct fw_filter *ht[HTSIZE];
u32 mask;
};
-struct fw_filter
-{
+struct fw_filter {
struct fw_filter *next;
u32 id;
struct tcf_result res;
@@ -53,7 +51,7 @@ static const struct tcf_ext_map fw_ext_map = {
.police = TCA_FW_POLICE
};
-static __inline__ int fw_hash(u32 handle)
+static inline int fw_hash(u32 handle)
{
if (HTSIZE == 4096)
return ((handle >> 24) & 0xFFF) ^
@@ -82,14 +80,14 @@ static __inline__ int fw_hash(u32 handle)
static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = (struct fw_head *)tp->root;
struct fw_filter *f;
int r;
u32 id = skb->mark;
if (head != NULL) {
id &= head->mask;
- for (f=head->ht[fw_hash(id)]; f; f=f->next) {
+ for (f = head->ht[fw_hash(id)]; f; f = f->next) {
if (f->id == id) {
*res = f->res;
#ifdef CONFIG_NET_CLS_IND
@@ -105,7 +103,8 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
}
} else {
/* old method */
- if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) {
+ if (id && (TC_H_MAJ(id) == 0 ||
+ !(TC_H_MAJ(id ^ tp->q->handle)))) {
res->classid = id;
res->class = 0;
return 0;
@@ -117,13 +116,13 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = (struct fw_head *)tp->root;
struct fw_filter *f;
if (head == NULL)
return 0;
- for (f=head->ht[fw_hash(handle)]; f; f=f->next) {
+ for (f = head->ht[fw_hash(handle)]; f; f = f->next) {
if (f->id == handle)
return (unsigned long)f;
}
@@ -139,8 +138,7 @@ static int fw_init(struct tcf_proto *tp)
return 0;
}
-static inline void
-fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
+static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
{
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
@@ -156,8 +154,8 @@ static void fw_destroy(struct tcf_proto *tp)
if (head == NULL)
return;
- for (h=0; h<HTSIZE; h++) {
- while ((f=head->ht[h]) != NULL) {
+ for (h = 0; h < HTSIZE; h++) {
+ while ((f = head->ht[h]) != NULL) {
head->ht[h] = f->next;
fw_delete_filter(tp, f);
}
@@ -167,14 +165,14 @@ static void fw_destroy(struct tcf_proto *tp)
static int fw_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct fw_head *head = (struct fw_head*)tp->root;
- struct fw_filter *f = (struct fw_filter*)arg;
+ struct fw_head *head = (struct fw_head *)tp->root;
+ struct fw_filter *f = (struct fw_filter *)arg;
struct fw_filter **fp;
if (head == NULL || f == NULL)
goto out;
- for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
+ for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
tcf_tree_lock(tp);
*fp = f->next;
@@ -240,7 +238,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base,
struct nlattr **tca,
unsigned long *arg)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = (struct fw_head *)tp->root;
struct fw_filter *f = (struct fw_filter *) *arg;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_FW_MAX + 1];
@@ -302,7 +300,7 @@ errout:
static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- struct fw_head *head = (struct fw_head*)tp->root;
+ struct fw_head *head = (struct fw_head *)tp->root;
int h;
if (head == NULL)
@@ -332,7 +330,7 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct fw_head *head = (struct fw_head *)tp->root;
- struct fw_filter *f = (struct fw_filter*)fh;
+ struct fw_filter *f = (struct fw_filter *)fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 694dcd85dec..a907905376d 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -23,34 +23,30 @@
#include <net/pkt_cls.h>
/*
- 1. For now we assume that route tags < 256.
- It allows to use direct table lookups, instead of hash tables.
- 2. For now we assume that "from TAG" and "fromdev DEV" statements
- are mutually exclusive.
- 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
+ * 1. For now we assume that route tags < 256.
+ * It allows to use direct table lookups, instead of hash tables.
+ * 2. For now we assume that "from TAG" and "fromdev DEV" statements
+ * are mutually exclusive.
+ * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
*/
-struct route4_fastmap
-{
+struct route4_fastmap {
struct route4_filter *filter;
u32 id;
int iif;
};
-struct route4_head
-{
+struct route4_head {
struct route4_fastmap fastmap[16];
- struct route4_bucket *table[256+1];
+ struct route4_bucket *table[256 + 1];
};
-struct route4_bucket
-{
+struct route4_bucket {
/* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
- struct route4_filter *ht[16+16+1];
+ struct route4_filter *ht[16 + 16 + 1];
};
-struct route4_filter
-{
+struct route4_filter {
struct route4_filter *next;
u32 id;
int iif;
@@ -61,20 +57,20 @@ struct route4_filter
struct route4_bucket *bkt;
};
-#define ROUTE4_FAILURE ((struct route4_filter*)(-1L))
+#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
static const struct tcf_ext_map route_ext_map = {
.police = TCA_ROUTE4_POLICE,
.action = TCA_ROUTE4_ACT
};
-static __inline__ int route4_fastmap_hash(u32 id, int iif)
+static inline int route4_fastmap_hash(u32 id, int iif)
{
- return id&0xF;
+ return id & 0xF;
}
-static inline
-void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
+static void
+route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
{
spinlock_t *root_lock = qdisc_root_sleeping_lock(q);
@@ -83,32 +79,33 @@ void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
spin_unlock_bh(root_lock);
}
-static inline void
+static void
route4_set_fastmap(struct route4_head *head, u32 id, int iif,
struct route4_filter *f)
{
int h = route4_fastmap_hash(id, iif);
+
head->fastmap[h].id = id;
head->fastmap[h].iif = iif;
head->fastmap[h].filter = f;
}
-static __inline__ int route4_hash_to(u32 id)
+static inline int route4_hash_to(u32 id)
{
- return id&0xFF;
+ return id & 0xFF;
}
-static __inline__ int route4_hash_from(u32 id)
+static inline int route4_hash_from(u32 id)
{
- return (id>>16)&0xF;
+ return (id >> 16) & 0xF;
}
-static __inline__ int route4_hash_iif(int iif)
+static inline int route4_hash_iif(int iif)
{
- return 16 + ((iif>>16)&0xF);
+ return 16 + ((iif >> 16) & 0xF);
}
-static __inline__ int route4_hash_wild(void)
+static inline int route4_hash_wild(void)
{
return 32;
}
@@ -131,21 +128,22 @@ static __inline__ int route4_hash_wild(void)
static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
- struct route4_head *head = (struct route4_head*)tp->root;
+ struct route4_head *head = (struct route4_head *)tp->root;
struct dst_entry *dst;
struct route4_bucket *b;
struct route4_filter *f;
u32 id, h;
int iif, dont_cache = 0;
- if ((dst = skb_dst(skb)) == NULL)
+ dst = skb_dst(skb);
+ if (!dst)
goto failure;
id = dst->tclassid;
if (head == NULL)
goto old_method;
- iif = ((struct rtable*)dst)->fl.iif;
+ iif = ((struct rtable *)dst)->rt_iif;
h = route4_fastmap_hash(id, iif);
if (id == head->fastmap[h].id &&
@@ -161,7 +159,8 @@ static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
h = route4_hash_to(id);
restart:
- if ((b = head->table[h]) != NULL) {
+ b = head->table[h];
+ if (b) {
for (f = b->ht[route4_hash_from(id)]; f; f = f->next)
if (f->id == id)
ROUTE4_APPLY_RESULT();
@@ -197,8 +196,9 @@ old_method:
static inline u32 to_hash(u32 id)
{
- u32 h = id&0xFF;
- if (id&0x8000)
+ u32 h = id & 0xFF;
+
+ if (id & 0x8000)
h += 256;
return h;
}
@@ -211,17 +211,17 @@ static inline u32 from_hash(u32 id)
if (!(id & 0x8000)) {
if (id > 255)
return 256;
- return id&0xF;
+ return id & 0xF;
}
- return 16 + (id&0xF);
+ return 16 + (id & 0xF);
}
static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
{
- struct route4_head *head = (struct route4_head*)tp->root;
+ struct route4_head *head = (struct route4_head *)tp->root;
struct route4_bucket *b;
struct route4_filter *f;
- unsigned h1, h2;
+ unsigned int h1, h2;
if (!head)
return 0;
@@ -230,11 +230,12 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
if (h1 > 256)
return 0;
- h2 = from_hash(handle>>16);
+ h2 = from_hash(handle >> 16);
if (h2 > 32)
return 0;
- if ((b = head->table[h1]) != NULL) {
+ b = head->table[h1];
+ if (b) {
for (f = b->ht[h2]; f; f = f->next)
if (f->handle == handle)
return (unsigned long)f;
@@ -251,7 +252,7 @@ static int route4_init(struct tcf_proto *tp)
return 0;
}
-static inline void
+static void
route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
{
tcf_unbind_filter(tp, &f->res);
@@ -267,11 +268,12 @@ static void route4_destroy(struct tcf_proto *tp)
if (head == NULL)
return;
- for (h1=0; h1<=256; h1++) {
+ for (h1 = 0; h1 <= 256; h1++) {
struct route4_bucket *b;
- if ((b = head->table[h1]) != NULL) {
- for (h2=0; h2<=32; h2++) {
+ b = head->table[h1];
+ if (b) {
+ for (h2 = 0; h2 <= 32; h2++) {
struct route4_filter *f;
while ((f = b->ht[h2]) != NULL) {
@@ -287,9 +289,9 @@ static void route4_destroy(struct tcf_proto *tp)
static int route4_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct route4_head *head = (struct route4_head*)tp->root;
- struct route4_filter **fp, *f = (struct route4_filter*)arg;
- unsigned h = 0;
+ struct route4_head *head = (struct route4_head *)tp->root;
+ struct route4_filter **fp, *f = (struct route4_filter *)arg;
+ unsigned int h = 0;
struct route4_bucket *b;
int i;
@@ -299,7 +301,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
h = f->handle;
b = f->bkt;
- for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) {
+ for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
tcf_tree_lock(tp);
*fp = f->next;
@@ -310,7 +312,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
/* Strip tree */
- for (i=0; i<=32; i++)
+ for (i = 0; i <= 32; i++)
if (b->ht[i])
return 0;
@@ -380,7 +382,8 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
}
h1 = to_hash(nhandle);
- if ((b = head->table[h1]) == NULL) {
+ b = head->table[h1];
+ if (!b) {
err = -ENOBUFS;
b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
if (b == NULL)
@@ -391,6 +394,7 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
tcf_tree_unlock(tp);
} else {
unsigned int h2 = from_hash(nhandle >> 16);
+
err = -EEXIST;
for (fp = b->ht[h2]; fp; fp = fp->next)
if (fp->handle == f->handle)
@@ -444,7 +448,8 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,
if (err < 0)
return err;
- if ((f = (struct route4_filter*)*arg) != NULL) {
+ f = (struct route4_filter *)*arg;
+ if (f) {
if (f->handle != handle && handle)
return -EINVAL;
@@ -481,7 +486,7 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,
reinsert:
h = from_hash(f->handle >> 16);
- for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next)
+ for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next)
if (f->handle < f1->handle)
break;
@@ -492,7 +497,8 @@ reinsert:
if (old_handle && f->handle != old_handle) {
th = to_hash(old_handle);
h = from_hash(old_handle >> 16);
- if ((b = head->table[th]) != NULL) {
+ b = head->table[th];
+ if (b) {
for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
*fp = f->next;
@@ -515,7 +521,7 @@ errout:
static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct route4_head *head = tp->root;
- unsigned h, h1;
+ unsigned int h, h1;
if (head == NULL)
arg->stop = 1;
@@ -549,7 +555,7 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
static int route4_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct route4_filter *f = (struct route4_filter*)fh;
+ struct route4_filter *f = (struct route4_filter *)fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
u32 id;
@@ -563,15 +569,15 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh,
if (nest == NULL)
goto nla_put_failure;
- if (!(f->handle&0x8000)) {
- id = f->id&0xFF;
+ if (!(f->handle & 0x8000)) {
+ id = f->id & 0xFF;
NLA_PUT_U32(skb, TCA_ROUTE4_TO, id);
}
- if (f->handle&0x80000000) {
- if ((f->handle>>16) != 0xFFFF)
+ if (f->handle & 0x80000000) {
+ if ((f->handle >> 16) != 0xFFFF)
NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif);
} else {
- id = f->id>>16;
+ id = f->id >> 16;
NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id);
}
if (f->res.classid)
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 425a1790b04..402c44b241a 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -66,28 +66,25 @@
powerful classification engine. */
-struct rsvp_head
-{
+struct rsvp_head {
u32 tmap[256/32];
u32 hgenerator;
u8 tgenerator;
struct rsvp_session *ht[256];
};
-struct rsvp_session
-{
+struct rsvp_session {
struct rsvp_session *next;
__be32 dst[RSVP_DST_LEN];
struct tc_rsvp_gpi dpi;
u8 protocol;
u8 tunnelid;
/* 16 (src,sport) hash slots, and one wildcard source slot */
- struct rsvp_filter *ht[16+1];
+ struct rsvp_filter *ht[16 + 1];
};
-struct rsvp_filter
-{
+struct rsvp_filter {
struct rsvp_filter *next;
__be32 src[RSVP_DST_LEN];
struct tc_rsvp_gpi spi;
@@ -100,17 +97,19 @@ struct rsvp_filter
struct rsvp_session *sess;
};
-static __inline__ unsigned hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
+static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
{
- unsigned h = (__force __u32)dst[RSVP_DST_LEN-1];
+ unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1];
+
h ^= h>>16;
h ^= h>>8;
return (h ^ protocol ^ tunnelid) & 0xFF;
}
-static __inline__ unsigned hash_src(__be32 *src)
+static inline unsigned int hash_src(__be32 *src)
{
- unsigned h = (__force __u32)src[RSVP_DST_LEN-1];
+ unsigned int h = (__force __u32)src[RSVP_DST_LEN-1];
+
h ^= h>>16;
h ^= h>>8;
h ^= h>>4;
@@ -134,10 +133,10 @@ static struct tcf_ext_map rsvp_ext_map = {
static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
- struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
+ struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;
struct rsvp_session *s;
struct rsvp_filter *f;
- unsigned h1, h2;
+ unsigned int h1, h2;
__be32 *dst, *src;
u8 protocol;
u8 tunnelid = 0;
@@ -162,13 +161,13 @@ restart:
src = &nhptr->saddr.s6_addr32[0];
dst = &nhptr->daddr.s6_addr32[0];
protocol = nhptr->nexthdr;
- xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr);
+ xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
#else
src = &nhptr->saddr;
dst = &nhptr->daddr;
protocol = nhptr->protocol;
- xprt = ((u8*)nhptr) + (nhptr->ihl<<2);
- if (nhptr->frag_off & htons(IP_MF|IP_OFFSET))
+ xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
+ if (nhptr->frag_off & htons(IP_MF | IP_OFFSET))
return -1;
#endif
@@ -176,10 +175,10 @@ restart:
h2 = hash_src(src);
for (s = sht[h1]; s; s = s->next) {
- if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
+ if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
protocol == s->protocol &&
!(s->dpi.mask &
- (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) &&
+ (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&
#if RSVP_DST_LEN == 4
dst[0] == s->dst[0] &&
dst[1] == s->dst[1] &&
@@ -188,8 +187,8 @@ restart:
tunnelid == s->tunnelid) {
for (f = s->ht[h2]; f; f = f->next) {
- if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] &&
- !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key))
+ if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
+ !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
#if RSVP_DST_LEN == 4
&&
src[0] == f->src[0] &&
@@ -205,7 +204,7 @@ matched:
return 0;
tunnelid = f->res.classid;
- nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr));
+ nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));
goto restart;
}
}
@@ -224,11 +223,11 @@ matched:
static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
{
- struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
+ struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;
struct rsvp_session *s;
struct rsvp_filter *f;
- unsigned h1 = handle&0xFF;
- unsigned h2 = (handle>>8)&0xFF;
+ unsigned int h1 = handle & 0xFF;
+ unsigned int h2 = (handle >> 8) & 0xFF;
if (h2 > 16)
return 0;
@@ -258,7 +257,7 @@ static int rsvp_init(struct tcf_proto *tp)
return -ENOBUFS;
}
-static inline void
+static void
rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
{
tcf_unbind_filter(tp, &f->res);
@@ -277,13 +276,13 @@ static void rsvp_destroy(struct tcf_proto *tp)
sht = data->ht;
- for (h1=0; h1<256; h1++) {
+ for (h1 = 0; h1 < 256; h1++) {
struct rsvp_session *s;
while ((s = sht[h1]) != NULL) {
sht[h1] = s->next;
- for (h2=0; h2<=16; h2++) {
+ for (h2 = 0; h2 <= 16; h2++) {
struct rsvp_filter *f;
while ((f = s->ht[h2]) != NULL) {
@@ -299,13 +298,13 @@ static void rsvp_destroy(struct tcf_proto *tp)
static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg;
- unsigned h = f->handle;
+ struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg;
+ unsigned int h = f->handle;
struct rsvp_session **sp;
struct rsvp_session *s = f->sess;
int i;
- for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) {
+ for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
tcf_tree_lock(tp);
*fp = f->next;
@@ -314,12 +313,12 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
/* Strip tree */
- for (i=0; i<=16; i++)
+ for (i = 0; i <= 16; i++)
if (s->ht[i])
return 0;
/* OK, session has no flows */
- for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF];
+ for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF];
*sp; sp = &(*sp)->next) {
if (*sp == s) {
tcf_tree_lock(tp);
@@ -337,13 +336,14 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
return 0;
}
-static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)
+static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
{
struct rsvp_head *data = tp->root;
int i = 0xFFFF;
while (i-- > 0) {
u32 h;
+
if ((data->hgenerator += 0x10000) == 0)
data->hgenerator = 0x10000;
h = data->hgenerator|salt;
@@ -355,10 +355,10 @@ static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)
static int tunnel_bts(struct rsvp_head *data)
{
- int n = data->tgenerator>>5;
- u32 b = 1<<(data->tgenerator&0x1F);
+ int n = data->tgenerator >> 5;
+ u32 b = 1 << (data->tgenerator & 0x1F);
- if (data->tmap[n]&b)
+ if (data->tmap[n] & b)
return 0;
data->tmap[n] |= b;
return 1;
@@ -372,10 +372,10 @@ static void tunnel_recycle(struct rsvp_head *data)
memset(tmap, 0, sizeof(tmap));
- for (h1=0; h1<256; h1++) {
+ for (h1 = 0; h1 < 256; h1++) {
struct rsvp_session *s;
for (s = sht[h1]; s; s = s->next) {
- for (h2=0; h2<=16; h2++) {
+ for (h2 = 0; h2 <= 16; h2++) {
struct rsvp_filter *f;
for (f = s->ht[h2]; f; f = f->next) {
@@ -395,8 +395,8 @@ static u32 gen_tunnel(struct rsvp_head *data)
{
int i, k;
- for (k=0; k<2; k++) {
- for (i=255; i>0; i--) {
+ for (k = 0; k < 2; k++) {
+ for (i = 255; i > 0; i--) {
if (++data->tgenerator == 0)
data->tgenerator = 1;
if (tunnel_bts(data))
@@ -428,7 +428,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
struct nlattr *opt = tca[TCA_OPTIONS-1];
struct nlattr *tb[TCA_RSVP_MAX + 1];
struct tcf_exts e;
- unsigned h1, h2;
+ unsigned int h1, h2;
__be32 *dst;
int err;
@@ -443,7 +443,8 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
if (err < 0)
return err;
- if ((f = (struct rsvp_filter*)*arg) != NULL) {
+ f = (struct rsvp_filter *)*arg;
+ if (f) {
/* Node exists: adjust only classid */
if (f->handle != handle && handle)
@@ -500,7 +501,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
goto errout;
}
- for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) {
+ for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) {
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
pinfo && pinfo->protocol == s->protocol &&
memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
@@ -523,7 +524,7 @@ insert:
tcf_exts_change(tp, &f->exts, &e);
for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
- if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask)
+ if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask)
break;
f->next = *fp;
wmb();
@@ -567,7 +568,7 @@ errout2:
static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct rsvp_head *head = tp->root;
- unsigned h, h1;
+ unsigned int h, h1;
if (arg->stop)
return;
@@ -598,7 +599,7 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct rsvp_filter *f = (struct rsvp_filter*)fh;
+ struct rsvp_filter *f = (struct rsvp_filter *)fh;
struct rsvp_session *s;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
@@ -624,7 +625,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
if (f->res.classid)
NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid);
- if (((f->handle>>8)&0xFF) != 16)
+ if (((f->handle >> 8) & 0xFF) != 16)
NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src);
if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0)
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 20ef330bb91..36667fa6423 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -249,7 +249,7 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
* of the hashing index is below the threshold.
*/
if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD)
- cp.hash = (cp.mask >> cp.shift)+1;
+ cp.hash = (cp.mask >> cp.shift) + 1;
else
cp.hash = DEFAULT_HASH_SIZE;
}
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index b0c2a82178a..3b93fc0c895 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -42,8 +42,7 @@
#include <net/act_api.h>
#include <net/pkt_cls.h>
-struct tc_u_knode
-{
+struct tc_u_knode {
struct tc_u_knode *next;
u32 handle;
struct tc_u_hnode *ht_up;
@@ -63,19 +62,17 @@ struct tc_u_knode
struct tc_u32_sel sel;
};
-struct tc_u_hnode
-{
+struct tc_u_hnode {
struct tc_u_hnode *next;
u32 handle;
u32 prio;
struct tc_u_common *tp_c;
int refcnt;
- unsigned divisor;
+ unsigned int divisor;
struct tc_u_knode *ht[1];
};
-struct tc_u_common
-{
+struct tc_u_common {
struct tc_u_hnode *hlist;
struct Qdisc *q;
int refcnt;
@@ -87,9 +84,11 @@ static const struct tcf_ext_map u32_ext_map = {
.police = TCA_U32_POLICE
};
-static __inline__ unsigned u32_hash_fold(__be32 key, struct tc_u32_sel *sel, u8 fshift)
+static inline unsigned int u32_hash_fold(__be32 key,
+ const struct tc_u32_sel *sel,
+ u8 fshift)
{
- unsigned h = ntohl(key & sel->hmask)>>fshift;
+ unsigned int h = ntohl(key & sel->hmask) >> fshift;
return h;
}
@@ -101,7 +100,7 @@ static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_re
unsigned int off;
} stack[TC_U32_MAXDEPTH];
- struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root;
+ struct tc_u_hnode *ht = (struct tc_u_hnode *)tp->root;
unsigned int off = skb_network_offset(skb);
struct tc_u_knode *n;
int sdepth = 0;
@@ -120,7 +119,7 @@ next_knode:
struct tc_u32_key *key = n->sel.keys;
#ifdef CONFIG_CLS_U32_PERF
- n->pf->rcnt +=1;
+ n->pf->rcnt += 1;
j = 0;
#endif
@@ -133,14 +132,14 @@ next_knode:
}
#endif
- for (i = n->sel.nkeys; i>0; i--, key++) {
+ for (i = n->sel.nkeys; i > 0; i--, key++) {
int toff = off + key->off + (off2 & key->offmask);
- __be32 *data, _data;
+ __be32 *data, hdata;
if (skb_headroom(skb) + toff > INT_MAX)
goto out;
- data = skb_header_pointer(skb, toff, 4, &_data);
+ data = skb_header_pointer(skb, toff, 4, &hdata);
if (!data)
goto out;
if ((*data ^ key->val) & key->mask) {
@@ -148,13 +147,13 @@ next_knode:
goto next_knode;
}
#ifdef CONFIG_CLS_U32_PERF
- n->pf->kcnts[j] +=1;
+ n->pf->kcnts[j] += 1;
j++;
#endif
}
if (n->ht_down == NULL) {
check_terminal:
- if (n->sel.flags&TC_U32_TERMINAL) {
+ if (n->sel.flags & TC_U32_TERMINAL) {
*res = n->res;
#ifdef CONFIG_NET_CLS_IND
@@ -164,7 +163,7 @@ check_terminal:
}
#endif
#ifdef CONFIG_CLS_U32_PERF
- n->pf->rhit +=1;
+ n->pf->rhit += 1;
#endif
r = tcf_exts_exec(skb, &n->exts, res);
if (r < 0) {
@@ -188,26 +187,26 @@ check_terminal:
ht = n->ht_down;
sel = 0;
if (ht->divisor) {
- __be32 *data, _data;
+ __be32 *data, hdata;
data = skb_header_pointer(skb, off + n->sel.hoff, 4,
- &_data);
+ &hdata);
if (!data)
goto out;
sel = ht->divisor & u32_hash_fold(*data, &n->sel,
n->fshift);
}
- if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT)))
+ if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
goto next_ht;
- if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) {
+ if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
off2 = n->sel.off + 3;
if (n->sel.flags & TC_U32_VAROFFSET) {
- __be16 *data, _data;
+ __be16 *data, hdata;
data = skb_header_pointer(skb,
off + n->sel.offoff,
- 2, &_data);
+ 2, &hdata);
if (!data)
goto out;
off2 += ntohs(n->sel.offmask & *data) >>
@@ -215,7 +214,7 @@ check_terminal:
}
off2 &= ~3;
}
- if (n->sel.flags&TC_U32_EAT) {
+ if (n->sel.flags & TC_U32_EAT) {
off += off2;
off2 = 0;
}
@@ -236,11 +235,11 @@ out:
deadloop:
if (net_ratelimit())
- printk(KERN_WARNING "cls_u32: dead loop\n");
+ pr_warning("cls_u32: dead loop\n");
return -1;
}
-static __inline__ struct tc_u_hnode *
+static struct tc_u_hnode *
u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
struct tc_u_hnode *ht;
@@ -252,10 +251,10 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
return ht;
}
-static __inline__ struct tc_u_knode *
+static struct tc_u_knode *
u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
- unsigned sel;
+ unsigned int sel;
struct tc_u_knode *n = NULL;
sel = TC_U32_HASH(handle);
@@ -300,7 +299,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c)
do {
if (++tp_c->hgenerator == 0x7FF)
tp_c->hgenerator = 1;
- } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
+ } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
}
@@ -378,9 +377,9 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
struct tc_u_knode *n;
- unsigned h;
+ unsigned int h;
- for (h=0; h<=ht->divisor; h++) {
+ for (h = 0; h <= ht->divisor; h++) {
while ((n = ht->ht[h]) != NULL) {
ht->ht[h] = n->next;
@@ -446,13 +445,13 @@ static void u32_destroy(struct tcf_proto *tp)
static int u32_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct tc_u_hnode *ht = (struct tc_u_hnode*)arg;
+ struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
if (ht == NULL)
return 0;
if (TC_U32_KEY(ht->handle))
- return u32_delete_key(tp, (struct tc_u_knode*)ht);
+ return u32_delete_key(tp, (struct tc_u_knode *)ht);
if (tp->root == ht)
return -EINVAL;
@@ -470,14 +469,14 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
{
struct tc_u_knode *n;
- unsigned i = 0x7FF;
+ unsigned int i = 0x7FF;
- for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
+ for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
if (i < TC_U32_NODE(n->handle))
i = TC_U32_NODE(n->handle);
i++;
- return handle|(i>0xFFF ? 0xFFF : i);
+ return handle | (i > 0xFFF ? 0xFFF : i);
}
static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
@@ -566,7 +565,8 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
if (err < 0)
return err;
- if ((n = (struct tc_u_knode*)*arg) != NULL) {
+ n = (struct tc_u_knode *)*arg;
+ if (n) {
if (TC_U32_KEY(n->handle) == 0)
return -EINVAL;
@@ -574,7 +574,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
}
if (tb[TCA_U32_DIVISOR]) {
- unsigned divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
+ unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
if (--divisor > 0x100)
return -EINVAL;
@@ -585,7 +585,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
if (handle == 0)
return -ENOMEM;
}
- ht = kzalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL);
+ ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
if (ht == NULL)
return -ENOBUFS;
ht->tp_c = tp_c;
@@ -683,7 +683,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
struct tc_u_knode *n;
- unsigned h;
+ unsigned int h;
if (arg->stop)
return;
@@ -717,7 +717,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
static int u32_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct tc_u_knode *n = (struct tc_u_knode*)fh;
+ struct tc_u_knode *n = (struct tc_u_knode *)fh;
struct nlattr *nest;
if (n == NULL)
@@ -730,8 +730,9 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
if (TC_U32_KEY(n->handle) == 0) {
- struct tc_u_hnode *ht = (struct tc_u_hnode*)fh;
- u32 divisor = ht->divisor+1;
+ struct tc_u_hnode *ht = (struct tc_u_hnode *)fh;
+ u32 divisor = ht->divisor + 1;
+
NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor);
} else {
NLA_PUT(skb, TCA_U32_SEL,
@@ -755,7 +756,7 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
#ifdef CONFIG_NET_CLS_IND
- if(strlen(n->indev))
+ if (strlen(n->indev))
NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev);
#endif
#ifdef CONFIG_CLS_U32_PERF
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index bc450397487..1c8360a2752 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -33,40 +33,41 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
return 0;
switch (cmp->align) {
- case TCF_EM_ALIGN_U8:
- val = *ptr;
- break;
+ case TCF_EM_ALIGN_U8:
+ val = *ptr;
+ break;
- case TCF_EM_ALIGN_U16:
- val = get_unaligned_be16(ptr);
+ case TCF_EM_ALIGN_U16:
+ val = get_unaligned_be16(ptr);
- if (cmp_needs_transformation(cmp))
- val = be16_to_cpu(val);
- break;
+ if (cmp_needs_transformation(cmp))
+ val = be16_to_cpu(val);
+ break;
- case TCF_EM_ALIGN_U32:
- /* Worth checking boundries? The branching seems
- * to get worse. Visit again. */
- val = get_unaligned_be32(ptr);
+ case TCF_EM_ALIGN_U32:
+ /* Worth checking boundries? The branching seems
+ * to get worse. Visit again.
+ */
+ val = get_unaligned_be32(ptr);
- if (cmp_needs_transformation(cmp))
- val = be32_to_cpu(val);
- break;
+ if (cmp_needs_transformation(cmp))
+ val = be32_to_cpu(val);
+ break;
- default:
- return 0;
+ default:
+ return 0;
}
if (cmp->mask)
val &= cmp->mask;
switch (cmp->opnd) {
- case TCF_EM_OPND_EQ:
- return val == cmp->val;
- case TCF_EM_OPND_LT:
- return val < cmp->val;
- case TCF_EM_OPND_GT:
- return val > cmp->val;
+ case TCF_EM_OPND_EQ:
+ return val == cmp->val;
+ case TCF_EM_OPND_LT:
+ return val < cmp->val;
+ case TCF_EM_OPND_GT:
+ return val > cmp->val;
}
return 0;
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 34da5e29ea1..a4de67eca82 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -73,21 +73,18 @@
#include <net/pkt_cls.h>
#include <net/sock.h>
-struct meta_obj
-{
+struct meta_obj {
unsigned long value;
unsigned int len;
};
-struct meta_value
-{
+struct meta_value {
struct tcf_meta_val hdr;
unsigned long val;
unsigned int len;
};
-struct meta_match
-{
+struct meta_match {
struct meta_value lvalue;
struct meta_value rvalue;
};
@@ -255,7 +252,7 @@ META_COLLECTOR(int_rtclassid)
if (unlikely(skb_dst(skb) == NULL))
*err = -1;
else
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
dst->value = skb_dst(skb)->tclassid;
#else
dst->value = 0;
@@ -267,7 +264,7 @@ META_COLLECTOR(int_rtiif)
if (unlikely(skb_rtable(skb) == NULL))
*err = -1;
else
- dst->value = skb_rtable(skb)->fl.iif;
+ dst->value = skb_rtable(skb)->rt_iif;
}
/**************************************************************************
@@ -404,7 +401,7 @@ META_COLLECTOR(int_sk_sndbuf)
META_COLLECTOR(int_sk_alloc)
{
SKIP_NONLOCAL(skb);
- dst->value = skb->sk->sk_allocation;
+ dst->value = (__force int) skb->sk->sk_allocation;
}
META_COLLECTOR(int_sk_route_caps)
@@ -483,8 +480,7 @@ META_COLLECTOR(int_sk_write_pend)
* Meta value collectors assignment table
**************************************************************************/
-struct meta_ops
-{
+struct meta_ops {
void (*get)(struct sk_buff *, struct tcf_pkt_info *,
struct meta_value *, struct meta_obj *, int *);
};
@@ -494,7 +490,7 @@ struct meta_ops
/* Meta value operations table listing all meta value collectors and
* assigns them to a type and meta id. */
-static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
+static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = {
[TCF_META_TYPE_VAR] = {
[META_ID(DEV)] = META_FUNC(var_dev),
[META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
@@ -550,7 +546,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
}
};
-static inline struct meta_ops * meta_ops(struct meta_value *val)
+static inline struct meta_ops *meta_ops(struct meta_value *val)
{
return &__meta_ops[meta_type(val)][meta_id(val)];
}
@@ -649,9 +645,8 @@ static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
{
if (v->len == sizeof(unsigned long))
NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val);
- else if (v->len == sizeof(u32)) {
+ else if (v->len == sizeof(u32))
NLA_PUT_U32(skb, tlv, v->val);
- }
return 0;
@@ -663,8 +658,7 @@ nla_put_failure:
* Type specific operations table
**************************************************************************/
-struct meta_type_ops
-{
+struct meta_type_ops {
void (*destroy)(struct meta_value *);
int (*compare)(struct meta_obj *, struct meta_obj *);
int (*change)(struct meta_value *, struct nlattr *);
@@ -672,7 +666,7 @@ struct meta_type_ops
int (*dump)(struct sk_buff *, struct meta_value *, int);
};
-static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
+static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
[TCF_META_TYPE_VAR] = {
.destroy = meta_var_destroy,
.compare = meta_var_compare,
@@ -688,7 +682,7 @@ static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
}
};
-static inline struct meta_type_ops * meta_type_ops(struct meta_value *v)
+static inline struct meta_type_ops *meta_type_ops(struct meta_value *v)
{
return &__meta_type_ops[meta_type(v)];
}
@@ -713,7 +707,7 @@ static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
return err;
if (meta_type_ops(v)->apply_extras)
- meta_type_ops(v)->apply_extras(v, dst);
+ meta_type_ops(v)->apply_extras(v, dst);
return 0;
}
@@ -732,12 +726,12 @@ static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
switch (meta->lvalue.hdr.op) {
- case TCF_EM_OPND_EQ:
- return !r;
- case TCF_EM_OPND_LT:
- return r < 0;
- case TCF_EM_OPND_GT:
- return r > 0;
+ case TCF_EM_OPND_EQ:
+ return !r;
+ case TCF_EM_OPND_LT:
+ return r < 0;
+ case TCF_EM_OPND_GT:
+ return r > 0;
}
return 0;
@@ -771,7 +765,7 @@ static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla)
static inline int meta_is_supported(struct meta_value *val)
{
- return (!meta_id(val) || meta_ops(val)->get);
+ return !meta_id(val) || meta_ops(val)->get;
}
static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = {
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index 1a4176aee6e..a3bed07a008 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -18,8 +18,7 @@
#include <linux/tc_ematch/tc_em_nbyte.h>
#include <net/pkt_cls.h>
-struct nbyte_data
-{
+struct nbyte_data {
struct tcf_em_nbyte hdr;
char pattern[0];
};
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index ea8f566e720..15d353d2e4b 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -19,8 +19,7 @@
#include <linux/tc_ematch/tc_em_text.h>
#include <net/pkt_cls.h>
-struct text_match
-{
+struct text_match {
u16 from_offset;
u16 to_offset;
u8 from_layer;
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
index 953f1479f7d..797bdb88c01 100644
--- a/net/sched/em_u32.c
+++ b/net/sched/em_u32.c
@@ -35,7 +35,7 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
return 0;
- return !(((*(__be32*) ptr) ^ key->val) & key->mask);
+ return !(((*(__be32 *) ptr) ^ key->val) & key->mask);
}
static struct tcf_ematch_ops em_u32_ops = {
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 5e37da961f8..88d93eb9250 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -93,7 +93,7 @@
static LIST_HEAD(ematch_ops);
static DEFINE_RWLOCK(ematch_mod_lock);
-static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind)
+static struct tcf_ematch_ops *tcf_em_lookup(u16 kind)
{
struct tcf_ematch_ops *e = NULL;
@@ -163,8 +163,8 @@ void tcf_em_unregister(struct tcf_ematch_ops *ops)
}
EXPORT_SYMBOL(tcf_em_unregister);
-static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree,
- int index)
+static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree,
+ int index)
{
return &tree->matches[index];
}
@@ -184,7 +184,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
if (em_hdr->kind == TCF_EM_CONTAINER) {
/* Special ematch called "container", carries an index
- * referencing an external ematch sequence. */
+ * referencing an external ematch sequence.
+ */
u32 ref;
if (data_len < sizeof(ref))
@@ -195,7 +196,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
goto errout;
/* We do not allow backward jumps to avoid loops and jumps
- * to our own position are of course illegal. */
+ * to our own position are of course illegal.
+ */
if (ref <= idx)
goto errout;
@@ -208,7 +210,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
* which automatically releases the reference again, therefore
* the module MUST not be given back under any circumstances
* here. Be aware, the destroy function assumes that the
- * module is held if the ops field is non zero. */
+ * module is held if the ops field is non zero.
+ */
em->ops = tcf_em_lookup(em_hdr->kind);
if (em->ops == NULL) {
@@ -221,7 +224,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
if (em->ops) {
/* We dropped the RTNL mutex in order to
* perform the module load. Tell the caller
- * to replay the request. */
+ * to replay the request.
+ */
module_put(em->ops->owner);
err = -EAGAIN;
}
@@ -230,7 +234,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
}
/* ematch module provides expected length of data, so we
- * can do a basic sanity check. */
+ * can do a basic sanity check.
+ */
if (em->ops->datalen && data_len < em->ops->datalen)
goto errout;
@@ -246,7 +251,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
* TCF_EM_SIMPLE may be specified stating that the
* data only consists of a u32 integer and the module
* does not expected a memory reference but rather
- * the value carried. */
+ * the value carried.
+ */
if (em_hdr->flags & TCF_EM_SIMPLE) {
if (data_len < sizeof(u32))
goto errout;
@@ -334,7 +340,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
* The array of rt attributes is parsed in the order as they are
* provided, their type must be incremental from 1 to n. Even
* if it does not serve any real purpose, a failure of sticking
- * to this policy will result in parsing failure. */
+ * to this policy will result in parsing failure.
+ */
for (idx = 0; nla_ok(rt_match, list_len); idx++) {
err = -EINVAL;
@@ -359,7 +366,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
/* Check if the number of matches provided by userspace actually
* complies with the array of matches. The number was used for
* the validation of references and a mismatch could lead to
- * undefined references during the matching process. */
+ * undefined references during the matching process.
+ */
if (idx != tree_hdr->nmatches) {
err = -EINVAL;
goto errout_abort;
@@ -449,7 +457,7 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
.flags = em->flags
};
- NLA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr);
+ NLA_PUT(skb, i + 1, sizeof(em_hdr), &em_hdr);
if (em->ops && em->ops->dump) {
if (em->ops->dump(skb, em) < 0)
@@ -478,6 +486,7 @@ static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
struct tcf_pkt_info *info)
{
int r = em->ops->match(skb, em, info);
+
return tcf_em_is_inverted(em) ? !r : r;
}
@@ -527,8 +536,8 @@ pop_stack:
stack_overflow:
if (net_ratelimit())
- printk(KERN_WARNING "tc ematch: local stack overflow,"
- " increase NET_EMATCH_STACK\n");
+ pr_warning("tc ematch: local stack overflow,"
+ " increase NET_EMATCH_STACK\n");
return -1;
}
EXPORT_SYMBOL(__tcf_em_tree_match);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b22ca2d1ceb..7490f3f2db8 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -187,7 +187,7 @@ int unregister_qdisc(struct Qdisc_ops *qops)
int err = -ENOENT;
write_lock(&qdisc_mod_lock);
- for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
+ for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
if (q == qops)
break;
if (q) {
@@ -321,7 +321,9 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab)
if (!tab || --tab->refcnt)
return;
- for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
+ for (rtabp = &qdisc_rtab_list;
+ (rtab = *rtabp) != NULL;
+ rtabp = &rtab->next) {
if (rtab == tab) {
*rtabp = rtab->next;
kfree(rtab);
@@ -396,6 +398,11 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
return stab;
}
+static void stab_kfree_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct qdisc_size_table, rcu));
+}
+
void qdisc_put_stab(struct qdisc_size_table *tab)
{
if (!tab)
@@ -405,7 +412,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
if (--tab->refcnt == 0) {
list_del(&tab->list);
- kfree(tab);
+ call_rcu_bh(&tab->rcu, stab_kfree_rcu);
}
spin_unlock(&qdisc_stab_lock);
@@ -428,7 +435,7 @@ nla_put_failure:
return -1;
}
-void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
+void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
int pkt_len, slot;
@@ -454,14 +461,13 @@ out:
pkt_len = 1;
qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
-EXPORT_SYMBOL(qdisc_calculate_pkt_len);
+EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
- printk(KERN_WARNING
- "%s: %s qdisc %X: is non-work-conserving?\n",
- txt, qdisc->ops->id, qdisc->handle >> 16);
+ pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
+ txt, qdisc->ops->id, qdisc->handle >> 16);
qdisc->flags |= TCQ_F_WARN_NONWC;
}
}
@@ -472,7 +478,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
timer);
- wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(wd->qdisc);
__netif_schedule(qdisc_root(wd->qdisc));
return HRTIMER_NORESTART;
@@ -494,7 +500,7 @@ void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
&qdisc_root_sleeping(wd->qdisc)->state))
return;
- wd->qdisc->flags |= TCQ_F_THROTTLED;
+ qdisc_throttled(wd->qdisc);
time = ktime_set(0, 0);
time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
@@ -504,7 +510,7 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule);
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
hrtimer_cancel(&wd->timer);
- wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
@@ -625,7 +631,7 @@ static u32 qdisc_alloc_handle(struct net_device *dev)
autohandle = TC_H_MAKE(0x80000000U, 0);
} while (qdisc_lookup(dev, autohandle) && --i > 0);
- return i>0 ? autohandle : 0;
+ return i > 0 ? autohandle : 0;
}
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
@@ -834,7 +840,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
err = PTR_ERR(stab);
goto err_out4;
}
- sch->stab = stab;
+ rcu_assign_pointer(sch->stab, stab);
}
if (tca[TCA_RATE]) {
spinlock_t *root_lock;
@@ -874,7 +880,7 @@ err_out4:
* Any broken qdiscs that would require a ops->reset() here?
* The qdisc was never in action so it shouldn't be necessary.
*/
- qdisc_put_stab(sch->stab);
+ qdisc_put_stab(rtnl_dereference(sch->stab));
if (ops->destroy)
ops->destroy(sch);
goto err_out3;
@@ -882,7 +888,7 @@ err_out4:
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
- struct qdisc_size_table *stab = NULL;
+ struct qdisc_size_table *ostab, *stab = NULL;
int err = 0;
if (tca[TCA_OPTIONS]) {
@@ -899,8 +905,9 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
return PTR_ERR(stab);
}
- qdisc_put_stab(sch->stab);
- sch->stab = stab;
+ ostab = rtnl_dereference(sch->stab);
+ rcu_assign_pointer(sch->stab, stab);
+ qdisc_put_stab(ostab);
if (tca[TCA_RATE]) {
/* NB: ignores errors from replace_estimator
@@ -915,9 +922,8 @@ out:
return 0;
}
-struct check_loop_arg
-{
- struct qdisc_walker w;
+struct check_loop_arg {
+ struct qdisc_walker w;
struct Qdisc *p;
int depth;
};
@@ -970,7 +976,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
struct Qdisc *p = NULL;
int err;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return -ENODEV;
err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -980,12 +987,12 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
if (clid) {
if (clid != TC_H_ROOT) {
if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
- if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
+ p = qdisc_lookup(dev, TC_H_MAJ(clid));
+ if (!p)
return -ENOENT;
q = qdisc_leaf(p, clid);
- } else { /* ingress */
- if (dev_ingress_queue(dev))
- q = dev_ingress_queue(dev)->qdisc_sleeping;
+ } else if (dev_ingress_queue(dev)) {
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
}
} else {
q = dev->qdisc;
@@ -996,7 +1003,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
return -EINVAL;
} else {
- if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ if (!q)
return -ENOENT;
}
@@ -1008,7 +1016,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
return -EINVAL;
if (q->handle == 0)
return -ENOENT;
- if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
+ err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
+ if (err != 0)
return err;
} else {
qdisc_notify(net, skb, n, clid, NULL, q);
@@ -1017,7 +1026,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
}
/*
- Create/change qdisc.
+ * Create/change qdisc.
*/
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
@@ -1036,7 +1045,8 @@ replay:
clid = tcm->tcm_parent;
q = p = NULL;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return -ENODEV;
err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -1046,12 +1056,12 @@ replay:
if (clid) {
if (clid != TC_H_ROOT) {
if (clid != TC_H_INGRESS) {
- if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
+ p = qdisc_lookup(dev, TC_H_MAJ(clid));
+ if (!p)
return -ENOENT;
q = qdisc_leaf(p, clid);
- } else { /* ingress */
- if (dev_ingress_queue_create(dev))
- q = dev_ingress_queue(dev)->qdisc_sleeping;
+ } else if (dev_ingress_queue_create(dev)) {
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
}
} else {
q = dev->qdisc;
@@ -1063,13 +1073,14 @@ replay:
if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
if (tcm->tcm_handle) {
- if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
+ if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
return -EEXIST;
if (TC_H_MIN(tcm->tcm_handle))
return -EINVAL;
- if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ if (!q)
goto create_n_graft;
- if (n->nlmsg_flags&NLM_F_EXCL)
+ if (n->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
return -EINVAL;
@@ -1079,7 +1090,7 @@ replay:
atomic_inc(&q->refcnt);
goto graft;
} else {
- if (q == NULL)
+ if (!q)
goto create_n_graft;
/* This magic test requires explanation.
@@ -1101,9 +1112,9 @@ replay:
* For now we select create/graft, if
* user gave KIND, which does not match existing.
*/
- if ((n->nlmsg_flags&NLM_F_CREATE) &&
- (n->nlmsg_flags&NLM_F_REPLACE) &&
- ((n->nlmsg_flags&NLM_F_EXCL) ||
+ if ((n->nlmsg_flags & NLM_F_CREATE) &&
+ (n->nlmsg_flags & NLM_F_REPLACE) &&
+ ((n->nlmsg_flags & NLM_F_EXCL) ||
(tca[TCA_KIND] &&
nla_strcmp(tca[TCA_KIND], q->ops->id))))
goto create_n_graft;
@@ -1118,7 +1129,7 @@ replay:
/* Change qdisc parameters */
if (q == NULL)
return -ENOENT;
- if (n->nlmsg_flags&NLM_F_EXCL)
+ if (n->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
return -EINVAL;
@@ -1128,7 +1139,7 @@ replay:
return err;
create_n_graft:
- if (!(n->nlmsg_flags&NLM_F_CREATE))
+ if (!(n->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
if (clid == TC_H_INGRESS) {
if (dev_ingress_queue(dev))
@@ -1175,6 +1186,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
struct gnet_dump d;
+ struct qdisc_size_table *stab;
nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
tcm = NLMSG_DATA(nlh);
@@ -1190,7 +1202,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
goto nla_put_failure;
q->qstats.qlen = q->q.qlen;
- if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
+ stab = rtnl_dereference(q->stab);
+ if (stab && qdisc_dump_stab(skb, stab) < 0)
goto nla_put_failure;
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
@@ -1234,16 +1247,19 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb,
return -ENOBUFS;
if (old && !tc_qdisc_dump_ignore(old)) {
- if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
+ if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
+ 0, RTM_DELQDISC) < 0)
goto err_out;
}
if (new && !tc_qdisc_dump_ignore(new)) {
- if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+ if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
+ old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
goto err_out;
}
if (skb->len)
- return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
err_out:
kfree_skb(skb);
@@ -1275,7 +1291,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
q_idx++;
continue;
}
- if (!tc_qdisc_dump_ignore(q) &&
+ if (!tc_qdisc_dump_ignore(q) &&
tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
goto done;
@@ -1356,7 +1372,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
u32 qid = TC_H_MAJ(clid);
int err;
- if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return -ENODEV;
err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -1391,9 +1408,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
qid = dev->qdisc->handle;
/* Now qid is genuine qdisc handle consistent
- both with parent and child.
-
- TC_H_MAJ(pid) still may be unspecified, complete it now.
+ * both with parent and child.
+ *
+ * TC_H_MAJ(pid) still may be unspecified, complete it now.
*/
if (pid)
pid = TC_H_MAKE(qid, pid);
@@ -1403,7 +1420,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
}
/* OK. Locate qdisc */
- if ((q = qdisc_lookup(dev, qid)) == NULL)
+ q = qdisc_lookup(dev, qid);
+ if (!q)
return -ENOENT;
/* An check that it supports classes */
@@ -1423,13 +1441,14 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
if (cl == 0) {
err = -ENOENT;
- if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
+ if (n->nlmsg_type != RTM_NEWTCLASS ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
goto out;
} else {
switch (n->nlmsg_type) {
case RTM_NEWTCLASS:
err = -EEXIST;
- if (n->nlmsg_flags&NLM_F_EXCL)
+ if (n->nlmsg_flags & NLM_F_EXCL)
goto out;
break;
case RTM_DELTCLASS:
@@ -1521,14 +1540,14 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb,
return -EINVAL;
}
- return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
}
-struct qdisc_dump_args
-{
- struct qdisc_walker w;
- struct sk_buff *skb;
- struct netlink_callback *cb;
+struct qdisc_dump_args {
+ struct qdisc_walker w;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
};
static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
@@ -1590,7 +1609,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
+ struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
struct net *net = sock_net(skb->sk);
struct netdev_queue *dev_queue;
struct net_device *dev;
@@ -1598,7 +1617,8 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
return 0;
- if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
+ dev = dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
return 0;
s_t = cb->args[0];
@@ -1621,19 +1641,22 @@ done:
}
/* Main classifier routine: scans classifier chain attached
- to this qdisc, (optionally) tests for protocol and asks
- specific classifiers.
+ * to this qdisc, (optionally) tests for protocol and asks
+ * specific classifiers.
*/
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
__be16 protocol = skb->protocol;
- int err = 0;
+ int err;
for (; tp; tp = tp->next) {
- if ((tp->protocol == protocol ||
- tp->protocol == htons(ETH_P_ALL)) &&
- (err = tp->classify(skb, tp, res)) >= 0) {
+ if (tp->protocol != protocol &&
+ tp->protocol != htons(ETH_P_ALL))
+ continue;
+ err = tp->classify(skb, tp, res);
+
+ if (err >= 0) {
#ifdef CONFIG_NET_CLS_ACT
if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
@@ -1649,12 +1672,12 @@ int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
int err = 0;
- __be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
+ __be16 protocol;
struct tcf_proto *otp = tp;
reclassify:
-#endif
protocol = skb->protocol;
+#endif
err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
@@ -1664,11 +1687,11 @@ reclassify:
if (verd++ >= MAX_REC_LOOP) {
if (net_ratelimit())
- printk(KERN_NOTICE
- "%s: packet reclassify loop"
+ pr_notice("%s: packet reclassify loop"
" rule prio %u protocol %02x\n",
- tp->q->ops->id,
- tp->prio & 0xffff, ntohs(tp->protocol));
+ tp->q->ops->id,
+ tp->prio & 0xffff,
+ ntohs(tp->protocol));
return TC_ACT_SHOT;
}
skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
@@ -1761,7 +1784,7 @@ static int __init pktsched_init(void)
err = register_pernet_subsys(&psched_net_ops);
if (err) {
- printk(KERN_ERR "pktsched_init: "
+ pr_err("pktsched_init: "
"cannot initialize per netns operations\n");
return err;
}
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 943d733409d..3f08158b868 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -319,7 +319,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
* creation), and one for the reference held when calling delete.
*/
if (flow->ref < 2) {
- printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n", flow->ref);
+ pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
return -EINVAL;
}
if (flow->ref > 2)
@@ -384,12 +384,12 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
}
flow = NULL;
- done:
- ;
+done:
+ ;
}
- if (!flow)
+ if (!flow) {
flow = &p->link;
- else {
+ } else {
if (flow->vcc)
ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
/*@@@ looks good ... but it's not supposed to work :-) */
@@ -576,8 +576,7 @@ static void atm_tc_destroy(struct Qdisc *sch)
list_for_each_entry_safe(flow, tmp, &p->flows, list) {
if (flow->ref > 1)
- printk(KERN_ERR "atm_destroy: %p->ref = %d\n", flow,
- flow->ref);
+ pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
atm_tc_put(sch, (unsigned long)flow);
}
tasklet_kill(&p->task);
@@ -616,9 +615,8 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
}
if (flow->excess)
NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid);
- else {
+ else
NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0);
- }
nla_nest_end(skb, nest);
return skb->len;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index c80d1c210c5..24d94c097b3 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -72,8 +72,7 @@
struct cbq_sched_data;
-struct cbq_class
-{
+struct cbq_class {
struct Qdisc_class_common common;
struct cbq_class *next_alive; /* next class with backlog in this priority band */
@@ -139,19 +138,18 @@ struct cbq_class
int refcnt;
int filters;
- struct cbq_class *defaults[TC_PRIO_MAX+1];
+ struct cbq_class *defaults[TC_PRIO_MAX + 1];
};
-struct cbq_sched_data
-{
+struct cbq_sched_data {
struct Qdisc_class_hash clhash; /* Hash table of all classes */
- int nclasses[TC_CBQ_MAXPRIO+1];
- unsigned quanta[TC_CBQ_MAXPRIO+1];
+ int nclasses[TC_CBQ_MAXPRIO + 1];
+ unsigned int quanta[TC_CBQ_MAXPRIO + 1];
struct cbq_class link;
- unsigned activemask;
- struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
+ unsigned int activemask;
+ struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
with backlog */
#ifdef CONFIG_NET_CLS_ACT
@@ -162,7 +160,7 @@ struct cbq_sched_data
int tx_len;
psched_time_t now; /* Cached timestamp */
psched_time_t now_rt; /* Cached real time */
- unsigned pmask;
+ unsigned int pmask;
struct hrtimer delay_timer;
struct qdisc_watchdog watchdog; /* Watchdog timer,
@@ -175,9 +173,9 @@ struct cbq_sched_data
};
-#define L2T(cl,len) qdisc_l2t((cl)->R_tab,len)
+#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
-static __inline__ struct cbq_class *
+static inline struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
struct Qdisc_class_common *clc;
@@ -193,25 +191,27 @@ cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
- struct cbq_class *cl, *new;
+ struct cbq_class *cl;
- for (cl = this->tparent; cl; cl = cl->tparent)
- if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
- return new;
+ for (cl = this->tparent; cl; cl = cl->tparent) {
+ struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
+ if (new != NULL && new != this)
+ return new;
+ }
return NULL;
}
#endif
/* Classify packet. The procedure is pretty complicated, but
- it allows us to combine link sharing and priority scheduling
- transparently.
-
- Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
- so that it resolves to split nodes. Then packets are classified
- by logical priority, or a more specific classifier may be attached
- to the split node.
+ * it allows us to combine link sharing and priority scheduling
+ * transparently.
+ *
+ * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
+ * so that it resolves to split nodes. Then packets are classified
+ * by logical priority, or a more specific classifier may be attached
+ * to the split node.
*/
static struct cbq_class *
@@ -227,7 +227,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
/*
* Step 1. If skb->priority points to one of our classes, use it.
*/
- if (TC_H_MAJ(prio^sch->handle) == 0 &&
+ if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
(cl = cbq_class_lookup(q, prio)) != NULL)
return cl;
@@ -243,10 +243,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
(result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
goto fallback;
- if ((cl = (void*)res.class) == NULL) {
+ cl = (void *)res.class;
+ if (!cl) {
if (TC_H_MAJ(res.classid))
cl = cbq_class_lookup(q, res.classid);
- else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
+ else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
cl = defmap[TC_PRIO_BESTEFFORT];
if (cl == NULL || cl->level >= head->level)
@@ -282,7 +283,7 @@ fallback:
* Step 4. No success...
*/
if (TC_H_MAJ(prio) == 0 &&
- !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
+ !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
!(cl = head->defaults[TC_PRIO_BESTEFFORT]))
return head;
@@ -290,12 +291,12 @@ fallback:
}
/*
- A packet has just been enqueued on the empty class.
- cbq_activate_class adds it to the tail of active class list
- of its priority band.
+ * A packet has just been enqueued on the empty class.
+ * cbq_activate_class adds it to the tail of active class list
+ * of its priority band.
*/
-static __inline__ void cbq_activate_class(struct cbq_class *cl)
+static inline void cbq_activate_class(struct cbq_class *cl)
{
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
int prio = cl->cpriority;
@@ -314,9 +315,9 @@ static __inline__ void cbq_activate_class(struct cbq_class *cl)
}
/*
- Unlink class from active chain.
- Note that this same procedure is done directly in cbq_dequeue*
- during round-robin procedure.
+ * Unlink class from active chain.
+ * Note that this same procedure is done directly in cbq_dequeue*
+ * during round-robin procedure.
*/
static void cbq_deactivate_class(struct cbq_class *this)
@@ -350,7 +351,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
int toplevel = q->toplevel;
- if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
+ if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
psched_time_t now;
psched_tdiff_t incr;
@@ -363,7 +364,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
q->toplevel = cl->level;
return;
}
- } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
+ } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
}
}
@@ -390,7 +391,6 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, cl->q);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
cbq_mark_toplevel(q, cl);
if (!cl->next_alive)
cbq_activate_class(cl);
@@ -418,11 +418,11 @@ static void cbq_ovl_classic(struct cbq_class *cl)
delay += cl->offtime;
/*
- Class goes to sleep, so that it will have no
- chance to work avgidle. Let's forgive it 8)
-
- BTW cbq-2.0 has a crap in this
- place, apparently they forgot to shift it by cl->ewma_log.
+ * Class goes to sleep, so that it will have no
+ * chance to work avgidle. Let's forgive it 8)
+ *
+ * BTW cbq-2.0 has a crap in this
+ * place, apparently they forgot to shift it by cl->ewma_log.
*/
if (cl->avgidle < 0)
delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
@@ -439,8 +439,8 @@ static void cbq_ovl_classic(struct cbq_class *cl)
q->wd_expires = delay;
/* Dirty work! We must schedule wakeups based on
- real available rate, rather than leaf rate,
- which may be tiny (even zero).
+ * real available rate, rather than leaf rate,
+ * which may be tiny (even zero).
*/
if (q->toplevel == TC_CBQ_MAXLEVEL) {
struct cbq_class *b;
@@ -460,7 +460,7 @@ static void cbq_ovl_classic(struct cbq_class *cl)
}
/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
- they go overlimit
+ * they go overlimit
*/
static void cbq_ovl_rclassic(struct cbq_class *cl)
@@ -595,7 +595,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
struct Qdisc *sch = q->watchdog.qdisc;
psched_time_t now;
psched_tdiff_t delay = 0;
- unsigned pmask;
+ unsigned int pmask;
now = psched_get_time();
@@ -624,7 +624,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
}
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
__netif_schedule(qdisc_root(sch));
return HRTIMER_NORESTART;
}
@@ -649,7 +649,6 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
ret = qdisc_enqueue(skb, cl->q);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
if (!cl->next_alive)
cbq_activate_class(cl);
return 0;
@@ -665,15 +664,15 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
#endif
/*
- It is mission critical procedure.
-
- We "regenerate" toplevel cutoff, if transmitting class
- has backlog and it is not regulated. It is not part of
- original CBQ description, but looks more reasonable.
- Probably, it is wrong. This question needs further investigation.
-*/
+ * It is mission critical procedure.
+ *
+ * We "regenerate" toplevel cutoff, if transmitting class
+ * has backlog and it is not regulated. It is not part of
+ * original CBQ description, but looks more reasonable.
+ * Probably, it is wrong. This question needs further investigation.
+ */
-static __inline__ void
+static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
struct cbq_class *borrowed)
{
@@ -684,7 +683,7 @@ cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
q->toplevel = borrowed->level;
return;
}
- } while ((borrowed=borrowed->borrow) != NULL);
+ } while ((borrowed = borrowed->borrow) != NULL);
}
#if 0
/* It is not necessary now. Uncommenting it
@@ -712,10 +711,10 @@ cbq_update(struct cbq_sched_data *q)
cl->bstats.bytes += len;
/*
- (now - last) is total time between packet right edges.
- (last_pktlen/rate) is "virtual" busy time, so that
-
- idle = (now - last) - last_pktlen/rate
+ * (now - last) is total time between packet right edges.
+ * (last_pktlen/rate) is "virtual" busy time, so that
+ *
+ * idle = (now - last) - last_pktlen/rate
*/
idle = q->now - cl->last;
@@ -725,9 +724,9 @@ cbq_update(struct cbq_sched_data *q)
idle -= L2T(cl, len);
/* true_avgidle := (1-W)*true_avgidle + W*idle,
- where W=2^{-ewma_log}. But cl->avgidle is scaled:
- cl->avgidle == true_avgidle/W,
- hence:
+ * where W=2^{-ewma_log}. But cl->avgidle is scaled:
+ * cl->avgidle == true_avgidle/W,
+ * hence:
*/
avgidle += idle - (avgidle>>cl->ewma_log);
}
@@ -741,22 +740,22 @@ cbq_update(struct cbq_sched_data *q)
cl->avgidle = avgidle;
/* Calculate expected time, when this class
- will be allowed to send.
- It will occur, when:
- (1-W)*true_avgidle + W*delay = 0, i.e.
- idle = (1/W - 1)*(-true_avgidle)
- or
- idle = (1 - W)*(-cl->avgidle);
+ * will be allowed to send.
+ * It will occur, when:
+ * (1-W)*true_avgidle + W*delay = 0, i.e.
+ * idle = (1/W - 1)*(-true_avgidle)
+ * or
+ * idle = (1 - W)*(-cl->avgidle);
*/
idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
/*
- That is not all.
- To maintain the rate allocated to the class,
- we add to undertime virtual clock,
- necessary to complete transmitted packet.
- (len/phys_bandwidth has been already passed
- to the moment of cbq_update)
+ * That is not all.
+ * To maintain the rate allocated to the class,
+ * we add to undertime virtual clock,
+ * necessary to complete transmitted packet.
+ * (len/phys_bandwidth has been already passed
+ * to the moment of cbq_update)
*/
idle -= L2T(&q->link, len);
@@ -778,7 +777,7 @@ cbq_update(struct cbq_sched_data *q)
cbq_update_toplevel(q, this, q->tx_borrowed);
}
-static __inline__ struct cbq_class *
+static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
@@ -794,16 +793,17 @@ cbq_under_limit(struct cbq_class *cl)
do {
/* It is very suspicious place. Now overlimit
- action is generated for not bounded classes
- only if link is completely congested.
- Though it is in agree with ancestor-only paradigm,
- it looks very stupid. Particularly,
- it means that this chunk of code will either
- never be called or result in strong amplification
- of burstiness. Dangerous, silly, and, however,
- no another solution exists.
+ * action is generated for not bounded classes
+ * only if link is completely congested.
+ * Though it is in agree with ancestor-only paradigm,
+ * it looks very stupid. Particularly,
+ * it means that this chunk of code will either
+ * never be called or result in strong amplification
+ * of burstiness. Dangerous, silly, and, however,
+ * no another solution exists.
*/
- if ((cl = cl->borrow) == NULL) {
+ cl = cl->borrow;
+ if (!cl) {
this_cl->qstats.overlimits++;
this_cl->overlimit(this_cl);
return NULL;
@@ -816,7 +816,7 @@ cbq_under_limit(struct cbq_class *cl)
return cl;
}
-static __inline__ struct sk_buff *
+static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
struct cbq_sched_data *q = qdisc_priv(sch);
@@ -840,7 +840,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
if (cl->deficit <= 0) {
/* Class exhausted its allotment per
- this round. Switch to the next one.
+ * this round. Switch to the next one.
*/
deficit = 1;
cl->deficit += cl->quantum;
@@ -850,8 +850,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
skb = cl->q->dequeue(cl->q);
/* Class did not give us any skb :-(
- It could occur even if cl->q->q.qlen != 0
- f.e. if cl->q == "tbf"
+ * It could occur even if cl->q->q.qlen != 0
+ * f.e. if cl->q == "tbf"
*/
if (skb == NULL)
goto skip_class;
@@ -880,7 +880,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
skip_class:
if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
/* Class is empty or penalized.
- Unlink it from active chain.
+ * Unlink it from active chain.
*/
cl_prev->next_alive = cl->next_alive;
cl->next_alive = NULL;
@@ -919,14 +919,14 @@ next_class:
return NULL;
}
-static __inline__ struct sk_buff *
+static inline struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
- unsigned activemask;
+ unsigned int activemask;
- activemask = q->activemask&0xFF;
+ activemask = q->activemask & 0xFF;
while (activemask) {
int prio = ffz(~activemask);
activemask &= ~(1<<prio);
@@ -951,11 +951,11 @@ cbq_dequeue(struct Qdisc *sch)
if (q->tx_class) {
psched_tdiff_t incr2;
/* Time integrator. We calculate EOS time
- by adding expected packet transmission time.
- If real time is greater, we warp artificial clock,
- so that:
-
- cbq_time = max(real_time, work);
+ * by adding expected packet transmission time.
+ * If real time is greater, we warp artificial clock,
+ * so that:
+ *
+ * cbq_time = max(real_time, work);
*/
incr2 = L2T(&q->link, q->tx_len);
q->now += incr2;
@@ -971,28 +971,29 @@ cbq_dequeue(struct Qdisc *sch)
skb = cbq_dequeue_1(sch);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
return skb;
}
/* All the classes are overlimit.
-
- It is possible, if:
-
- 1. Scheduler is empty.
- 2. Toplevel cutoff inhibited borrowing.
- 3. Root class is overlimit.
-
- Reset 2d and 3d conditions and retry.
-
- Note, that NS and cbq-2.0 are buggy, peeking
- an arbitrary class is appropriate for ancestor-only
- sharing, but not for toplevel algorithm.
-
- Our version is better, but slower, because it requires
- two passes, but it is unavoidable with top-level sharing.
- */
+ *
+ * It is possible, if:
+ *
+ * 1. Scheduler is empty.
+ * 2. Toplevel cutoff inhibited borrowing.
+ * 3. Root class is overlimit.
+ *
+ * Reset 2d and 3d conditions and retry.
+ *
+ * Note, that NS and cbq-2.0 are buggy, peeking
+ * an arbitrary class is appropriate for ancestor-only
+ * sharing, but not for toplevel algorithm.
+ *
+ * Our version is better, but slower, because it requires
+ * two passes, but it is unavoidable with top-level sharing.
+ */
if (q->toplevel == TC_CBQ_MAXLEVEL &&
q->link.undertime == PSCHED_PASTPERFECT)
@@ -1003,7 +1004,8 @@ cbq_dequeue(struct Qdisc *sch)
}
/* No packets in scheduler or nobody wants to give them to us :-(
- Sigh... start watchdog timer in the last case. */
+ * Sigh... start watchdog timer in the last case.
+ */
if (sch->q.qlen) {
sch->qstats.overlimits++;
@@ -1025,13 +1027,14 @@ static void cbq_adjust_levels(struct cbq_class *this)
int level = 0;
struct cbq_class *cl;
- if ((cl = this->children) != NULL) {
+ cl = this->children;
+ if (cl) {
do {
if (cl->level > level)
level = cl->level;
} while ((cl = cl->sibling) != this->children);
}
- this->level = level+1;
+ this->level = level + 1;
} while ((this = this->tparent) != NULL);
}
@@ -1047,14 +1050,15 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
for (h = 0; h < q->clhash.hashsize; h++) {
hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
/* BUGGGG... Beware! This expression suffer of
- arithmetic overflows!
+ * arithmetic overflows!
*/
if (cl->priority == prio) {
cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
q->quanta[prio];
}
if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
- printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
+ pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
+ cl->common.classid, cl->quantum);
cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
}
}
@@ -1065,18 +1069,18 @@ static void cbq_sync_defmap(struct cbq_class *cl)
{
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
struct cbq_class *split = cl->split;
- unsigned h;
+ unsigned int h;
int i;
if (split == NULL)
return;
- for (i=0; i<=TC_PRIO_MAX; i++) {
- if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
+ if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
split->defaults[i] = NULL;
}
- for (i=0; i<=TC_PRIO_MAX; i++) {
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
int level = split->level;
if (split->defaults[i])
@@ -1089,7 +1093,7 @@ static void cbq_sync_defmap(struct cbq_class *cl)
hlist_for_each_entry(c, n, &q->clhash.hash[h],
common.hnode) {
if (c->split == split && c->level < level &&
- c->defmap&(1<<i)) {
+ c->defmap & (1<<i)) {
split->defaults[i] = c;
level = c->level;
}
@@ -1103,7 +1107,8 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma
struct cbq_class *split = NULL;
if (splitid == 0) {
- if ((split = cl->split) == NULL)
+ split = cl->split;
+ if (!split)
return;
splitid = split->common.classid;
}
@@ -1121,9 +1126,9 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma
cl->defmap = 0;
cbq_sync_defmap(cl);
cl->split = split;
- cl->defmap = def&mask;
+ cl->defmap = def & mask;
} else
- cl->defmap = (cl->defmap&~mask)|(def&mask);
+ cl->defmap = (cl->defmap & ~mask) | (def & mask);
cbq_sync_defmap(cl);
}
@@ -1136,7 +1141,7 @@ static void cbq_unlink_class(struct cbq_class *this)
qdisc_class_hash_remove(&q->clhash, &this->common);
if (this->tparent) {
- clp=&this->sibling;
+ clp = &this->sibling;
cl = *clp;
do {
if (cl == this) {
@@ -1175,7 +1180,7 @@ static void cbq_link_class(struct cbq_class *this)
}
}
-static unsigned int cbq_drop(struct Qdisc* sch)
+static unsigned int cbq_drop(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl, *cl_head;
@@ -1183,7 +1188,8 @@ static unsigned int cbq_drop(struct Qdisc* sch)
unsigned int len;
for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
- if ((cl_head = q->active[prio]) == NULL)
+ cl_head = q->active[prio];
+ if (!cl_head)
continue;
cl = cl_head;
@@ -1200,13 +1206,13 @@ static unsigned int cbq_drop(struct Qdisc* sch)
}
static void
-cbq_reset(struct Qdisc* sch)
+cbq_reset(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl;
struct hlist_node *n;
int prio;
- unsigned h;
+ unsigned int h;
q->activemask = 0;
q->pmask = 0;
@@ -1238,21 +1244,21 @@ cbq_reset(struct Qdisc* sch)
static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
- if (lss->change&TCF_CBQ_LSS_FLAGS) {
- cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
- cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
+ if (lss->change & TCF_CBQ_LSS_FLAGS) {
+ cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
+ cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
}
- if (lss->change&TCF_CBQ_LSS_EWMA)
+ if (lss->change & TCF_CBQ_LSS_EWMA)
cl->ewma_log = lss->ewma_log;
- if (lss->change&TCF_CBQ_LSS_AVPKT)
+ if (lss->change & TCF_CBQ_LSS_AVPKT)
cl->avpkt = lss->avpkt;
- if (lss->change&TCF_CBQ_LSS_MINIDLE)
+ if (lss->change & TCF_CBQ_LSS_MINIDLE)
cl->minidle = -(long)lss->minidle;
- if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
+ if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
cl->maxidle = lss->maxidle;
cl->avgidle = lss->maxidle;
}
- if (lss->change&TCF_CBQ_LSS_OFFTIME)
+ if (lss->change & TCF_CBQ_LSS_OFFTIME)
cl->offtime = lss->offtime;
return 0;
}
@@ -1280,10 +1286,10 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
if (wrr->weight)
cl->weight = wrr->weight;
if (wrr->priority) {
- cl->priority = wrr->priority-1;
+ cl->priority = wrr->priority - 1;
cl->cpriority = cl->priority;
if (cl->priority >= cl->priority2)
- cl->priority2 = TC_CBQ_MAXPRIO-1;
+ cl->priority2 = TC_CBQ_MAXPRIO - 1;
}
cbq_addprio(q, cl);
@@ -1300,10 +1306,10 @@ static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
cl->overlimit = cbq_ovl_delay;
break;
case TC_CBQ_OVL_LOWPRIO:
- if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
- ovl->priority2-1 <= cl->priority)
+ if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
+ ovl->priority2 - 1 <= cl->priority)
return -EINVAL;
- cl->priority2 = ovl->priority2-1;
+ cl->priority2 = ovl->priority2 - 1;
cl->overlimit = cbq_ovl_lowprio;
break;
case TC_CBQ_OVL_DROP:
@@ -1382,9 +1388,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
if (!q->link.q)
q->link.q = &noop_qdisc;
- q->link.priority = TC_CBQ_MAXPRIO-1;
- q->link.priority2 = TC_CBQ_MAXPRIO-1;
- q->link.cpriority = TC_CBQ_MAXPRIO-1;
+ q->link.priority = TC_CBQ_MAXPRIO - 1;
+ q->link.priority2 = TC_CBQ_MAXPRIO - 1;
+ q->link.cpriority = TC_CBQ_MAXPRIO - 1;
q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
q->link.overlimit = cbq_ovl_classic;
q->link.allot = psched_mtu(qdisc_dev(sch));
@@ -1415,7 +1421,7 @@ put_rtab:
return err;
}
-static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
@@ -1427,7 +1433,7 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_lssopt opt;
@@ -1452,15 +1458,15 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_wrropt opt;
opt.flags = 0;
opt.allot = cl->allot;
- opt.priority = cl->priority+1;
- opt.cpriority = cl->cpriority+1;
+ opt.priority = cl->priority + 1;
+ opt.cpriority = cl->cpriority + 1;
opt.weight = cl->weight;
NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
return skb->len;
@@ -1470,13 +1476,13 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_ovl opt;
opt.strategy = cl->ovl_strategy;
- opt.priority2 = cl->priority2+1;
+ opt.priority2 = cl->priority2 + 1;
opt.pad = 0;
opt.penalty = cl->penalty;
NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
@@ -1487,7 +1493,7 @@ nla_put_failure:
return -1;
}
-static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_fopt opt;
@@ -1506,7 +1512,7 @@ nla_put_failure:
}
#ifdef CONFIG_NET_CLS_ACT
-static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
+static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
{
unsigned char *b = skb_tail_pointer(skb);
struct tc_cbq_police opt;
@@ -1570,7 +1576,7 @@ static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
struct sk_buff *skb, struct tcmsg *tcm)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
struct nlattr *nest;
if (cl->tparent)
@@ -1598,7 +1604,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
struct gnet_dump *d)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
cl->qstats.qlen = cl->q->q.qlen;
cl->xstats.avgidle = cl->avgidle;
@@ -1618,7 +1624,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
if (new == NULL) {
new = qdisc_create_dflt(sch->dev_queue,
@@ -1641,10 +1647,9 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
return 0;
}
-static struct Qdisc *
-cbq_leaf(struct Qdisc *sch, unsigned long arg)
+static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
return cl->q;
}
@@ -1683,13 +1688,12 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
kfree(cl);
}
-static void
-cbq_destroy(struct Qdisc* sch)
+static void cbq_destroy(struct Qdisc *sch)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct hlist_node *n, *next;
struct cbq_class *cl;
- unsigned h;
+ unsigned int h;
#ifdef CONFIG_NET_CLS_ACT
q->rx_class = NULL;
@@ -1713,7 +1717,7 @@ cbq_destroy(struct Qdisc* sch)
static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
if (--cl->refcnt == 0) {
#ifdef CONFIG_NET_CLS_ACT
@@ -1736,7 +1740,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
{
int err;
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class*)*arg;
+ struct cbq_class *cl = (struct cbq_class *)*arg;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_CBQ_MAX + 1];
struct cbq_class *parent;
@@ -1828,13 +1832,14 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
if (classid) {
err = -EINVAL;
- if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
+ if (TC_H_MAJ(classid ^ sch->handle) ||
+ cbq_class_lookup(q, classid))
goto failure;
} else {
int i;
- classid = TC_H_MAKE(sch->handle,0x8000);
+ classid = TC_H_MAKE(sch->handle, 0x8000);
- for (i=0; i<0x8000; i++) {
+ for (i = 0; i < 0x8000; i++) {
if (++q->hgenerator >= 0x8000)
q->hgenerator = 1;
if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
@@ -1891,11 +1896,11 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
cl->minidle = -0x7FFFFFFF;
cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
- if (cl->ewma_log==0)
+ if (cl->ewma_log == 0)
cl->ewma_log = q->link.ewma_log;
- if (cl->maxidle==0)
+ if (cl->maxidle == 0)
cl->maxidle = q->link.maxidle;
- if (cl->avpkt==0)
+ if (cl->avpkt == 0)
cl->avpkt = q->link.avpkt;
cl->overlimit = cbq_ovl_classic;
if (tb[TCA_CBQ_OVL_STRATEGY])
@@ -1921,7 +1926,7 @@ failure:
static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
unsigned int qlen;
if (cl->filters || cl->children || cl == &q->link)
@@ -1979,7 +1984,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
struct cbq_sched_data *q = qdisc_priv(sch);
- struct cbq_class *p = (struct cbq_class*)parent;
+ struct cbq_class *p = (struct cbq_class *)parent;
struct cbq_class *cl = cbq_class_lookup(q, classid);
if (cl) {
@@ -1993,7 +1998,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
- struct cbq_class *cl = (struct cbq_class*)arg;
+ struct cbq_class *cl = (struct cbq_class *)arg;
cl->filters--;
}
@@ -2003,7 +2008,7 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl;
struct hlist_node *n;
- unsigned h;
+ unsigned int h;
if (arg->stop)
return;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
new file mode 100644
index 00000000000..06afbaeb4c8
--- /dev/null
+++ b/net/sched/sch_choke.c
@@ -0,0 +1,688 @@
+/*
+ * net/sched/sch_choke.c CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <linux/vmalloc.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+
+/*
+ CHOKe stateless AQM for fair bandwidth allocation
+ =================================================
+
+ CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+ unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+ maintains no flow state. The difference from RED is an additional step
+ during the enqueuing process. If average queue size is over the
+ low threshold (qmin), a packet is chosen at random from the queue.
+ If both the new and chosen packet are from the same flow, both
+ are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+ needs to access packets in queue randomly. It has a minimal class
+ interface to allow overriding the builtin flow classifier with
+ filters.
+
+ Source:
+ R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+ Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+ IEEE INFOCOM, 2000.
+
+ A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+ Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+/* Upper bound on size of sk_buff table (packets) */
+#define CHOKE_MAX_QUEUE (128*1024 - 1)
+
+/*
+ * Private state of one CHOKe qdisc instance (obtained via qdisc_priv()).
+ * Queued packets are kept in the ring 'tab', indexed by 'head' and 'tail';
+ * slots inside the ring may be NULL ("holes") after a packet was dropped
+ * from the middle of the queue.
+ */
+struct choke_sched_data {
+/* Parameters */
+	u32		 limit;		/* enqueue admits only while qlen < limit */
+	unsigned char	 flags;		/* tested for TC_RED_ECN / TC_RED_HARDDROP */
+
+	struct red_parms parms;		/* RED parameters and running state */
+
+/* Variables */
+	struct tcf_proto *filter_list;	/* if set, flows are compared by class id */
+	struct {
+		u32	prob_drop;	/* Early probability drops */
+		u32	prob_mark;	/* Early probability marks */
+		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
+		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
+		u32	pdrop;          /* Drops due to queue limits */
+		u32	other;          /* Drops due to drop() calls */
+		u32	matched;	/* Drops to flow match */
+	} stats;
+
+	unsigned int	 head;		/* index of the slot dequeued next */
+	unsigned int	 tail;		/* index of the slot filled by the next enqueue */
+
+	unsigned int	 tab_mask; /* size - 1 */
+
+	struct sk_buff **tab;		/* ring of queued packets, may contain holes */
+};
+
+/* Pick a pseudo-random value in the range 0 .. N - 1 */
+static u32 random_N(unsigned int N)
+{
+	u32 rnd = random32();
+
+	return reciprocal_divide(rnd, N);
+}
+
+/* Distance from head to tail in ring slots; holes are counted too */
+static unsigned int choke_len(const struct choke_sched_data *q)
+{
+	unsigned int span = q->tail - q->head;
+
+	return span & q->tab_mask;
+}
+
+/* Non-zero when the TC_RED_ECN flag is set for this qdisc */
+static int use_ecn(const struct choke_sched_data *q)
+{
+	int ecn_bit = q->flags & TC_RED_ECN;
+
+	return ecn_bit;
+}
+
+/* Non-zero when packets over max are to be dropped rather than marked */
+static int use_harddrop(const struct choke_sched_data *q)
+{
+	int harddrop_bit = q->flags & TC_RED_HARDDROP;
+
+	return harddrop_bit;
+}
+
+/*
+ * Advance the head index by at least one slot, and keep advancing
+ * until it reaches a non-empty slot or catches up with the tail.
+ */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+	for (;;) {
+		q->head = (q->head + 1) & q->tab_mask;
+
+		/* Stop on an empty ring or on the first real packet */
+		if (q->head == q->tail || q->tab[q->head] != NULL)
+			return;
+	}
+}
+
+/*
+ * Step the tail index back by at least one slot, and keep stepping
+ * until it reaches a non-empty slot or meets the head, so that holes
+ * at the end of the ring get reused.
+ */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+	for (;;) {
+		q->tail = (q->tail - 1) & q->tab_mask;
+
+		/* Stop on an empty ring or on the first real packet */
+		if (q->head == q->tail || q->tab[q->tail] != NULL)
+			return;
+	}
+}
+
+/*
+ * Drop packet from queue array by creating a "hole".
+ *
+ * The slot at @idx is cleared.  If @idx equals the head (or tail) index,
+ * that index is moved over any adjacent holes.  The packet length is
+ * then taken off the backlog, the packet is handed to qdisc_drop(), and
+ * the queue length is reduced both via qdisc_tree_decrease_qlen() and on
+ * this qdisc itself.
+ */
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = q->tab[idx];
+
+	/* Leave a hole where the packet used to be */
+	q->tab[idx] = NULL;
+
+	if (idx == q->head)
+		choke_zap_head_holes(q);
+	if (idx == q->tail)
+		choke_zap_tail_holes(q);
+
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	qdisc_drop(skb, sch);
+	qdisc_tree_decrease_qlen(sch, 1);
+	--sch->q.qlen;
+}
+
+/*
+ * Compare flow of two packets
+ * Returns true only if source and destination address and port match.
+ * Returns false when the link-layer protocols differ, when the rxhash of
+ * skb1 is zero or differs from that of skb2, or when the protocol is
+ * neither IPv4 nor IPv6.
+ * When proto_ports_offset() reports no port offset for the transport
+ * protocol, matching addresses alone yield true.
+ */
+static bool choke_match_flow(struct sk_buff *skb1,
+			     struct sk_buff *skb2)
+{
+	int off1, off2, poff;
+	const u32 *ports1, *ports2;
+	u8 ip_proto;
+	__u32 hash1;
+
+	if (skb1->protocol != skb2->protocol)
+		return false;
+
+	/* Use hash value as quick check
+	 * Assumes that __skb_get_rxhash makes IP header and ports linear
+	 */
+	hash1 = skb_get_rxhash(skb1);
+	if (!hash1 || hash1 != skb_get_rxhash(skb2))
+		return false;
+
+	/* Probably match, but be sure to avoid hash collisions */
+	off1 = skb_network_offset(skb1);
+	off2 = skb_network_offset(skb2);
+
+	switch (skb1->protocol) {
+	case __constant_htons(ETH_P_IP): {
+		const struct iphdr *ip1, *ip2;
+
+		ip1 = (const struct iphdr *) (skb1->data + off1);
+		ip2 = (const struct iphdr *) (skb2->data + off2);
+
+		ip_proto = ip1->protocol;
+		if (ip_proto != ip2->protocol ||
+		    ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
+			return false;
+
+		/* Either packet is a fragment: clear the protocol so it is
+		 * not used for the port lookup below
+		 */
+		if ((ip1->frag_off | ip2->frag_off) & htons(IP_MF | IP_OFFSET))
+			ip_proto = 0;
+		off1 += ip1->ihl * 4;
+		off2 += ip2->ihl * 4;
+		break;
+	}
+
+	case __constant_htons(ETH_P_IPV6): {
+		const struct ipv6hdr *ip1, *ip2;
+
+		ip1 = (const struct ipv6hdr *) (skb1->data + off1);
+		ip2 = (const struct ipv6hdr *) (skb2->data + off2);
+
+		ip_proto = ip1->nexthdr;
+		if (ip_proto != ip2->nexthdr ||
+		    ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) ||
+		    ipv6_addr_cmp(&ip1->daddr, &ip2->daddr))
+			return false;
+		/* Skip the fixed-size IPv6 header */
+		off1 += 40;
+		off2 += 40;
+		/* Must not fall into the default case, which rejects the
+		 * match: the port comparison below has to run for IPv6 too
+		 */
+		break;
+	}
+
+	default: /* Maybe compare MAC header here? */
+		return false;
+	}
+
+	poff = proto_ports_offset(ip_proto);
+	if (poff < 0)
+		return true;
+
+	off1 += poff;
+	off2 += poff;
+
+	/* Source and destination port are compared as one 32 bit word */
+	ports1 = (__force u32 *)(skb1->data + off1);
+	ports2 = (__force u32 *)(skb2->data + off2);
+	return *ports1 == *ports2;
+}
+
+/*
+ * Per-packet private data, stored in the data area of the qdisc
+ * control block of the skb (see the choke_skb_cb() accessor).
+ */
+struct choke_skb_cb {
+	u16 classid;	/* class id recorded for the packet by the classifier */
+};
+
+/* Locate the CHOKe private area inside the qdisc control block of @skb */
+static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
+{
+	void *priv;
+
+	/* Build fails unless skb->cb can hold both control block structs */
+	BUILD_BUG_ON(sizeof(skb->cb) <
+		sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb));
+
+	priv = qdisc_skb_cb(skb)->data;
+	return (struct choke_skb_cb *)priv;
+}
+
+/* Record @classid in the CHOKe private area of @skb */
+static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
+{
+	struct choke_skb_cb *cb = choke_skb_cb(skb);
+
+	cb->classid = classid;
+}
+
+/* Read back the class id stored in the CHOKe private area of @skb */
+static u16 choke_get_classid(const struct sk_buff *skb)
+{
+	const struct choke_skb_cb *cb = choke_skb_cb(skb);
+
+	return cb->classid;
+}
+
+/*
+ * Classify the packet with the attached TC filters and record
+ * TC_H_MIN() of the resulting class id in the skb control block.
+ *
+ * Returns true when a class id was recorded.  Returns false when
+ * tc_classify() gives a negative result or, with CONFIG_NET_CLS_ACT,
+ * when the result is TC_ACT_STOLEN, TC_ACT_QUEUED or TC_ACT_SHOT; for
+ * the stolen and queued results *qerr is additionally set to
+ * NET_XMIT_SUCCESS | __NET_XMIT_STOLEN.
+ */
+static bool choke_classify(struct sk_buff *skb,
+			   struct Qdisc *sch, int *qerr)
+
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res;
+	int result;
+
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+			/* fall through: these results also reject the packet */
+		case TC_ACT_SHOT:
+			return false;
+		}
+#endif
+		choke_set_classid(skb, TC_H_MIN(res.classid));
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Pick a packet from a random position in the queue.
+ * The queue may contain holes left by earlier drops, so up to three
+ * random positions are tried; if all of them are holes the head slot is
+ * used instead.  *pidx receives the index of the returned slot.
+ * Will return NULL if queue is empty (q->head == q->tail)
+ */
+static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
+					 unsigned int *pidx)
+{
+	struct sk_buff *candidate;
+	int attempt;
+
+	for (attempt = 0; attempt < 3; attempt++) {
+		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+		candidate = q->tab[*pidx];
+		if (candidate)
+			return candidate;
+	}
+
+	/* Only holes were hit: settle for the head of the queue */
+	*pidx = q->head;
+	return q->tab[*pidx];
+}
+
+/*
+ * Pick a random queued packet and compare it with the new packet.
+ * Returns true on a match; *pidx then holds the index of the picked
+ * packet.  With filters attached the recorded class ids are compared,
+ * otherwise the packet headers are.
+ */
+static bool choke_match_random(const struct choke_sched_data *q,
+			       struct sk_buff *nskb,
+			       unsigned int *pidx)
+{
+	struct sk_buff *picked;
+	bool same;
+
+	/* Nothing queued, nothing to compare against */
+	if (q->head == q->tail)
+		return false;
+
+	picked = choke_peek_random(q, pidx);
+	if (q->filter_list)
+		same = choke_get_classid(nskb) == choke_get_classid(picked);
+	else
+		same = choke_match_flow(picked, nskb);
+
+	return same;
+}
+
+/*
+ * Enqueue one packet.
+ *
+ * Order of decisions: optional filter classification, update of the
+ * average queue length, CHOKe comparison against a random queued packet
+ * (both packets are dropped on a match), RED mark/drop decision, and
+ * finally the hard queue limit.
+ *
+ * Returns NET_XMIT_SUCCESS when the packet was queued, NET_XMIT_CN for
+ * a CHOKe/RED drop, NET_XMIT_DROP when the queue limit was reached, or
+ * the value left in 'ret' when the classifier rejected the packet.
+ */
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct red_parms *p = &q->parms;
+	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+	if (q->filter_list) {
+		/* If using external classifiers, get result and record it. */
+		if (!choke_classify(skb, sch, &ret))
+			goto other_drop;	/* Packet was eaten by filter */
+	}
+
+	/* Compute average queue usage (see RED) */
+	p->qavg = red_calc_qavg(p, sch->q.qlen);
+	if (red_is_idling(p))
+		red_end_of_idle_period(p);
+
+	/* Is queue small? */
+	if (p->qavg <= p->qth_min)
+		p->qcount = -1;
+	else {
+		unsigned int idx;
+
+		/* Draw a packet at random from queue and compare flow */
+		if (choke_match_random(q, skb, &idx)) {
+			q->stats.matched++;
+			/* Drop the queued packet here, the new one below */
+			choke_drop_by_idx(sch, idx);
+			goto congestion_drop;
+		}
+
+		/* Queue is large, always mark/drop */
+		if (p->qavg > p->qth_max) {
+			p->qcount = -1;
+
+			sch->qstats.overlimits++;
+			/* Drop unless ECN marking is enabled and succeeds */
+			if (use_harddrop(q) || !use_ecn(q) ||
+			    !INET_ECN_set_ce(skb)) {
+				q->stats.forced_drop++;
+				goto congestion_drop;
+			}
+
+			q->stats.forced_mark++;
+		} else if (++p->qcount) {
+			if (red_mark_probability(p, p->qavg)) {
+				p->qcount = 0;
+				p->qR = red_random(p);
+
+				sch->qstats.overlimits++;
+				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+					q->stats.prob_drop++;
+					goto congestion_drop;
+				}
+
+				q->stats.prob_mark++;
+			}
+		} else
+			p->qR = red_random(p);
+	}
+
+	/* Admit new packet */
+	if (sch->q.qlen < q->limit) {
+		q->tab[q->tail] = skb;
+		q->tail = (q->tail + 1) & q->tab_mask;
+		++sch->q.qlen;
+		sch->qstats.backlog += qdisc_pkt_len(skb);
+		return NET_XMIT_SUCCESS;
+	}
+
+	/* Queue limit reached */
+	q->stats.pdrop++;
+	sch->qstats.drops++;
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+
+ congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+
+ other_drop:
+	/* Count a drop only if the classifier left the BYPASS bit in ret */
+	if (ret & __NET_XMIT_BYPASS)
+		sch->qstats.drops++;
+	kfree_skb(skb);
+	return ret;
+}
+
+/*
+ * Remove and return the packet at the head of the queue.
+ * Returns NULL on an empty queue, starting the RED idle period if it
+ * is not already running.
+ */
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+	struct choke_sched_data *priv = qdisc_priv(sch);
+	struct sk_buff *pkt;
+
+	if (priv->head == priv->tail) {
+		if (!red_is_idling(&priv->parms))
+			red_start_of_idle_period(&priv->parms);
+		return NULL;
+	}
+
+	/* Take the head packet and move head to the next real packet */
+	pkt = priv->tab[priv->head];
+	priv->tab[priv->head] = NULL;
+	choke_zap_head_holes(priv);
+
+	/* Accounting */
+	sch->q.qlen--;
+	sch->qstats.backlog -= qdisc_pkt_len(pkt);
+	qdisc_bstats_update(sch, pkt);
+
+	return pkt;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{ /* ->drop hook: returns whatever qdisc_queue_drop() reports. */
+ struct choke_sched_data *q = qdisc_priv(sch);
+ unsigned int len;
+
+ len = qdisc_queue_drop(sch); /* NOTE(review): choke_enqueue() stores packets in q->tab; confirm this helper can see them */
+ if (len > 0)
+ q->stats.other++; /* exported as .other by choke_dump_stats() */
+ else {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms); /* nothing was dropped: start the RED idle period if not already idling */
+ }
+
+ return len;
+}
+
+static void choke_reset(struct Qdisc *sch)
+{ /* ->reset hook: restarts the RED state only. */
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ red_restart(&q->parms); /* NOTE(review): q->tab, q->head and q->tail are left untouched here; confirm queued packets need no purge */
+}
+
+static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { /* attribute policy handed to nla_parse_nested() in choke_change() */
+ [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) }, /* read as struct tc_red_qopt by choke_change() */
+ [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE }, /* table passed on to red_set_parms() */
+};
+
+
+static void choke_free(void *addr)
+{ /* Release a packet table from choke_change(), which may come from kcalloc() or vzalloc(). */
+ if (addr) { /* NULL is accepted and ignored */
+ if (is_vmalloc_addr(addr))
+ vfree(addr); /* vmalloc-range address: pair with vfree() */
+ else
+ kfree(addr);
+ }
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{ /* ->change hook (also used by choke_init()): parse options, resize q->tab if needed, set RED parameters; returns 0 or -errno. */
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CHOKE_MAX + 1];
+ const struct tc_red_qopt *ctl;
+ int err;
+ struct sk_buff **old = NULL; /* previous table, freed after unlocking */
+ unsigned int mask;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CHOKE_PARMS] == NULL ||
+ tb[TCA_CHOKE_STAB] == NULL) /* both attributes are mandatory */
+ return -EINVAL;
+
+ ctl = nla_data(tb[TCA_CHOKE_PARMS]);
+
+ if (ctl->limit > CHOKE_MAX_QUEUE)
+ return -EINVAL;
+
+ mask = roundup_pow_of_two(ctl->limit + 1) - 1; /* table gets mask + 1 slots: a power of two of at least limit + 1 */
+ if (mask != q->tab_mask) { /* table size changes (or no table yet): allocate a new one */
+ struct sk_buff **ntab;
+
+ ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL); /* allocated before sch_tree_lock() is taken */
+ if (!ntab)
+ ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); /* fall back to vzalloc() when kcalloc() fails */
+ if (!ntab)
+ return -ENOMEM;
+
+ sch_tree_lock(sch);
+ old = q->tab;
+ if (old) { /* migrate packets from the existing table */
+ unsigned int oqlen = sch->q.qlen, tail = 0;
+
+ while (q->head != q->tail) { /* walk the old ring from head to tail */
+ struct sk_buff *skb = q->tab[q->head];
+
+ q->head = (q->head + 1) & q->tab_mask; /* still the old mask at this point */
+ if (!skb)
+ continue; /* skip holes */
+ if (tail < mask) { /* new table takes at most mask packets, one fewer than its slot count */
+ ntab[tail++] = skb;
+ continue;
+ }
+ sch->qstats.backlog -= qdisc_pkt_len(skb); /* no room left in the new table: drop the rest */
+ --sch->q.qlen;
+ qdisc_drop(skb, sch);
+ }
+ qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen); /* pass on how many packets were dropped during the move */
+ q->head = 0; /* survivors were packed from index 0 */
+ q->tail = tail;
+ }
+
+ q->tab_mask = mask;
+ q->tab = ntab;
+ } else
+ sch_tree_lock(sch); /* same table size: only the parameters below change */
+
+ q->flags = ctl->flags;
+ q->limit = ctl->limit;
+
+ red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ nla_data(tb[TCA_CHOKE_STAB]));
+
+ if (q->head == q->tail) /* ring is empty */
+ red_end_of_idle_period(&q->parms);
+
+ sch_tree_unlock(sch);
+ choke_free(old); /* old stays NULL when no table was replaced */
+ return 0;
+}
+
+static int choke_init(struct Qdisc *sch, struct nlattr *opt)
+{ /* ->init hook: initial setup goes through the same path as a later change. */
+ return choke_change(sch, opt); /* so a NULL opt yields -EINVAL here as well */
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{ /* ->dump hook: write the current configuration into skb as a nested TCA_OPTIONS attribute. */
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+ struct tc_red_qopt opt = {
+ .limit = q->limit,
+ .flags = q->flags,
+ .qth_min = q->parms.qth_min >> q->parms.Wlog, /* stored threshold is shifted right by Wlog for reporting */
+ .qth_max = q->parms.qth_max >> q->parms.Wlog,
+ .Wlog = q->parms.Wlog,
+ .Plog = q->parms.Plog,
+ .Scell_log = q->parms.Scell_log,
+ };
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt); /* presumably jumps to nla_put_failure when skb has no room; confirm against the macro */
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts); /* opts is NULL here when nla_nest_start() failed */
+ return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{ /* ->dump_stats hook: export the private counters as struct tc_choke_xstats. */
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tc_choke_xstats st = {
+ .early = q->stats.prob_drop + q->stats.forced_drop, /* probabilistic plus forced drops */
+ .marked = q->stats.prob_mark + q->stats.forced_mark, /* probabilistic plus forced ECN marks */
+ .pdrop = q->stats.pdrop, /* drops because the queue had reached q->limit */
+ .other = q->stats.other, /* counted by choke_drop() */
+ .matched = q->stats.matched, /* counted in choke_enqueue() when choke_match_random() succeeds */
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{ /* ->destroy hook: tear down the filter chain and free the packet table. */
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ choke_free(q->tab); /* frees the table only; packets in it are not freed in this function */
+}
+
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{ /* ->leaf class op stub: both arguments are ignored. */
+ return NULL;
+}
+
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{ /* ->get class op stub: returns 0 for every classid. */
+ return 0;
+}
+
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{ /* Intentionally empty; installed as both ->put and ->unbind_tcf in choke_class_ops. */
+}
+
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{ /* ->bind_tcf class op stub: all arguments are ignored. */
+ return 0;
+}
+
+static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+{ /* ->tcf_chain class op: return the address of the qdisc's filter list. */
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ if (cl) /* any non-zero class handle has no filter chain */
+ return NULL;
+ return &q->filter_list;
+}
+
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{ /* ->dump class op: only adjusts tcm; nothing is written to skb. */
+ tcm->tcm_handle |= TC_H_MIN(cl); /* OR TC_H_MIN(cl) into the reported handle */
+ return 0;
+}
+
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{ /* ->walk class op: invoke the walker callback once, with class handle 1. */
+ if (!arg->stop) {
+ if (arg->fn(sch, 1, arg) < 0) { /* negative result from the callback: flag the walk as stopped */
+ arg->stop = 1;
+ return;
+ }
+ arg->count++; /* counted only when the callback did not fail */
+ }
+}
+
+static const struct Qdisc_class_ops choke_class_ops = { /* class operations table built from the functions above */
+ .leaf = choke_leaf,
+ .get = choke_get,
+ .put = choke_put,
+ .tcf_chain = choke_find_tcf,
+ .bind_tcf = choke_bind,
+ .unbind_tcf = choke_put, /* reuses the empty choke_put() */
+ .dump = choke_dump_class,
+ .walk = choke_walk,
+};
+
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{ /* ->peek hook: return the packet at q->head without removing it. */
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ return (q->head != q->tail) ? q->tab[q->head] : NULL; /* head == tail means the ring is empty */
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = { /* qdisc operations registered by choke_module_init() */
+ .id = "choke",
+ .priv_size = sizeof(struct choke_sched_data), /* type that qdisc_priv() results are assigned to throughout this file */
+
+ .enqueue = choke_enqueue,
+ .dequeue = choke_dequeue,
+ .peek = choke_peek_head,
+ .drop = choke_drop,
+ .init = choke_init, /* delegates to choke_change() */
+ .destroy = choke_destroy,
+ .reset = choke_reset,
+ .change = choke_change,
+ .dump = choke_dump,
+ .dump_stats = choke_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{ /* Module entry point: returns the result of register_qdisc(). */
+ return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{ /* Module exit point: undoes the registration made in choke_module_init(). */
+ unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index de55e642eaf..6b7fe4a84f1 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -376,7 +376,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
bstats_update(&cl->bstats, skb);
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return err;
@@ -403,6 +402,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
skb = qdisc_dequeue_peeked(cl->qdisc);
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 60f4bdd4408..2c790204d04 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -137,10 +137,10 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
if (tb[TCA_DSMARK_VALUE])
- p->value[*arg-1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
+ p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
if (tb[TCA_DSMARK_MASK])
- p->mask[*arg-1] = mask;
+ p->mask[*arg - 1] = mask;
err = 0;
@@ -155,8 +155,8 @@ static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
if (!dsmark_valid_index(p, arg))
return -EINVAL;
- p->mask[arg-1] = 0xff;
- p->value[arg-1] = 0;
+ p->mask[arg - 1] = 0xff;
+ p->value[arg - 1] = 0;
return 0;
}
@@ -175,7 +175,7 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
if (p->mask[i] == 0xff && !p->value[i])
goto ignore;
if (walker->count >= walker->skip) {
- if (walker->fn(sch, i+1, walker) < 0) {
+ if (walker->fn(sch, i + 1, walker) < 0) {
walker->stop = 1;
break;
}
@@ -260,7 +260,6 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return err;
}
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -283,6 +282,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
if (skb == NULL)
return NULL;
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
index = skb->tc_index & (p->indices - 1);
@@ -304,9 +304,8 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
* and don't need yet another qdisc as a bypass.
*/
if (p->mask[index] != 0xff || p->value[index])
- printk(KERN_WARNING
- "dsmark_dequeue: unsupported protocol %d\n",
- ntohs(skb->protocol));
+ pr_warning("dsmark_dequeue: unsupported protocol %d\n",
+ ntohs(skb->protocol));
break;
}
@@ -424,14 +423,14 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
if (!dsmark_valid_index(p, cl))
return -EINVAL;
- tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1);
+ tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
tcm->tcm_info = p->q->handle;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
- NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl-1]);
- NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl-1]);
+ NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]);
+ NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]);
return nla_nest_end(skb, opts);
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index aa4d6337e43..66effe2da8e 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -19,44 +19,30 @@
/* 1 band FIFO pseudo-"scheduler" */
-struct fifo_sched_data
+static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
- u32 limit;
-};
-
-static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
-{
- struct fifo_sched_data *q = qdisc_priv(sch);
-
- if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= q->limit))
+ if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))
return qdisc_enqueue_tail(skb, sch);
return qdisc_reshape_fail(skb, sch);
}
-static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
- struct fifo_sched_data *q = qdisc_priv(sch);
-
- if (likely(skb_queue_len(&sch->q) < q->limit))
+ if (likely(skb_queue_len(&sch->q) < sch->limit))
return qdisc_enqueue_tail(skb, sch);
return qdisc_reshape_fail(skb, sch);
}
-static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
- struct sk_buff *skb_head;
- struct fifo_sched_data *q = qdisc_priv(sch);
-
- if (likely(skb_queue_len(&sch->q) < q->limit))
+ if (likely(skb_queue_len(&sch->q) < sch->limit))
return qdisc_enqueue_tail(skb, sch);
/* queue full, remove one skb to fulfill the limit */
- skb_head = qdisc_dequeue_head(sch);
+ __qdisc_queue_drop_head(sch, &sch->q);
sch->qstats.drops++;
- kfree_skb(skb_head);
-
qdisc_enqueue_tail(skb, sch);
return NET_XMIT_CN;
@@ -64,31 +50,40 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch)
static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
{
- struct fifo_sched_data *q = qdisc_priv(sch);
+ bool bypass;
+ bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
if (opt == NULL) {
u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
- if (sch->ops == &bfifo_qdisc_ops)
+ if (is_bfifo)
limit *= psched_mtu(qdisc_dev(sch));
- q->limit = limit;
+ sch->limit = limit;
} else {
struct tc_fifo_qopt *ctl = nla_data(opt);
if (nla_len(opt) < sizeof(*ctl))
return -EINVAL;
- q->limit = ctl->limit;
+ sch->limit = ctl->limit;
}
+ if (is_bfifo)
+ bypass = sch->limit >= psched_mtu(qdisc_dev(sch));
+ else
+ bypass = sch->limit >= 1;
+
+ if (bypass)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
- struct fifo_sched_data *q = qdisc_priv(sch);
- struct tc_fifo_qopt opt = { .limit = q->limit };
+ struct tc_fifo_qopt opt = { .limit = sch->limit };
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
@@ -99,7 +94,7 @@ nla_put_failure:
struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
.id = "pfifo",
- .priv_size = sizeof(struct fifo_sched_data),
+ .priv_size = 0,
.enqueue = pfifo_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
@@ -114,7 +109,7 @@ EXPORT_SYMBOL(pfifo_qdisc_ops);
struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
.id = "bfifo",
- .priv_size = sizeof(struct fifo_sched_data),
+ .priv_size = 0,
.enqueue = bfifo_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
@@ -129,7 +124,7 @@ EXPORT_SYMBOL(bfifo_qdisc_ops);
struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
.id = "pfifo_head_drop",
- .priv_size = sizeof(struct fifo_sched_data),
+ .priv_size = 0,
.enqueue = pfifo_tail_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 34dc598440a..c84b65920d1 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -87,8 +87,8 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
*/
kfree_skb(skb);
if (net_ratelimit())
- printk(KERN_WARNING "Dead loop on netdevice %s, "
- "fix it urgently!\n", dev_queue->dev->name);
+ pr_warning("Dead loop on netdevice %s, fix it urgently!\n",
+ dev_queue->dev->name);
ret = qdisc_qlen(q);
} else {
/*
@@ -137,8 +137,8 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
} else {
/* Driver returned NETDEV_TX_BUSY - requeue skb */
if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
- printk(KERN_WARNING "BUG %s code %d qlen %d\n",
- dev->name, ret, q->q.qlen);
+ pr_warning("BUG %s code %d qlen %d\n",
+ dev->name, ret, q->q.qlen);
ret = dev_requeue_skb(skb, q);
}
@@ -412,8 +412,9 @@ static struct Qdisc noqueue_qdisc = {
};
-static const u8 prio2band[TC_PRIO_MAX+1] =
- { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
+static const u8 prio2band[TC_PRIO_MAX + 1] = {
+ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+};
/* 3-band FIFO queue: old style, but should be a bit faster than
generic prio+fifo combination.
@@ -445,7 +446,7 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
return priv->q + band;
}
-static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
+static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX];
@@ -460,7 +461,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
return qdisc_drop(skb, qdisc);
}
-static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
+static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
int band = bitmap2band[priv->bitmap];
@@ -479,7 +480,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
return NULL;
}
-static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc)
+static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
int band = bitmap2band[priv->bitmap];
@@ -493,7 +494,7 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc)
return NULL;
}
-static void pfifo_fast_reset(struct Qdisc* qdisc)
+static void pfifo_fast_reset(struct Qdisc *qdisc)
{
int prio;
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
@@ -510,7 +511,7 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
- memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
+ memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
@@ -526,6 +527,8 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
skb_queue_head_init(band2list(priv, prio));
+ /* Can by-pass the queue discipline */
+ qdisc->flags |= TCQ_F_CAN_BYPASS;
return 0;
}
@@ -540,27 +543,32 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
.dump = pfifo_fast_dump,
.owner = THIS_MODULE,
};
+EXPORT_SYMBOL(pfifo_fast_ops);
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
struct Qdisc_ops *ops)
{
void *p;
struct Qdisc *sch;
- unsigned int size;
+ unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
int err = -ENOBUFS;
- /* ensure that the Qdisc and the private data are 64-byte aligned */
- size = QDISC_ALIGN(sizeof(*sch));
- size += ops->priv_size + (QDISC_ALIGNTO - 1);
-
p = kzalloc_node(size, GFP_KERNEL,
netdev_queue_numa_node_read(dev_queue));
if (!p)
goto errout;
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
- sch->padded = (char *) sch - (char *) p;
-
+ /* if we got non aligned memory, ask more and do alignment ourself */
+ if (sch != p) {
+ kfree(p);
+ p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
+ netdev_queue_numa_node_read(dev_queue));
+ if (!p)
+ goto errout;
+ sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
+ sch->padded = (char *) sch - (char *) p;
+ }
INIT_LIST_HEAD(&sch->list);
skb_queue_head_init(&sch->q);
spin_lock_init(&sch->busylock);
@@ -630,7 +638,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
#ifdef CONFIG_NET_SCHED
qdisc_list_del(qdisc);
- qdisc_put_stab(qdisc->stab);
+ qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
if (ops->reset)
@@ -674,25 +682,21 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
return oqdisc;
}
+EXPORT_SYMBOL(dev_graft_qdisc);
static void attach_one_default_qdisc(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_unused)
{
- struct Qdisc *qdisc;
+ struct Qdisc *qdisc = &noqueue_qdisc;
if (dev->tx_queue_len) {
qdisc = qdisc_create_dflt(dev_queue,
&pfifo_fast_ops, TC_H_ROOT);
if (!qdisc) {
- printk(KERN_INFO "%s: activation failed\n", dev->name);
+ netdev_info(dev, "activation failed\n");
return;
}
-
- /* Can by-pass the queue discipline for default qdisc */
- qdisc->flags |= TCQ_F_CAN_BYPASS;
- } else {
- qdisc = &noqueue_qdisc;
}
dev_queue->qdisc_sleeping = qdisc;
}
@@ -761,6 +765,7 @@ void dev_activate(struct net_device *dev)
dev_watchdog_up(dev);
}
}
+EXPORT_SYMBOL(dev_activate);
static void dev_deactivate_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
@@ -839,7 +844,9 @@ void dev_deactivate(struct net_device *dev)
list_add(&dev->unreg_list, &single);
dev_deactivate_many(&single);
+ list_del(&single);
}
+EXPORT_SYMBOL(dev_deactivate);
static void dev_init_scheduler_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 51dcc2aa5c9..b9493a09a87 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -32,8 +32,7 @@
struct gred_sched_data;
struct gred_sched;
-struct gred_sched_data
-{
+struct gred_sched_data {
u32 limit; /* HARD maximal queue length */
u32 DP; /* the drop pramaters */
u32 bytesin; /* bytes seen on virtualQ so far*/
@@ -50,8 +49,7 @@ enum {
GRED_RIO_MODE,
};
-struct gred_sched
-{
+struct gred_sched {
struct gred_sched_data *tab[MAX_DPs];
unsigned long flags;
u32 red_flags;
@@ -150,17 +148,18 @@ static inline int gred_use_harddrop(struct gred_sched *t)
return t->red_flags & TC_RED_HARDDROP;
}
-static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
- struct gred_sched_data *q=NULL;
- struct gred_sched *t= qdisc_priv(sch);
+ struct gred_sched_data *q = NULL;
+ struct gred_sched *t = qdisc_priv(sch);
unsigned long qavg = 0;
u16 dp = tc_index_to_dp(skb);
- if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
dp = t->def;
- if ((q = t->tab[dp]) == NULL) {
+ q = t->tab[dp];
+ if (!q) {
/* Pass through packets not assigned to a DP
* if no default DP has been configured. This
* allows for DP flows to be left untouched.
@@ -183,7 +182,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
for (i = 0; i < t->DPs; i++) {
if (t->tab[i] && t->tab[i]->prio < q->prio &&
!red_is_idling(&t->tab[i]->parms))
- qavg +=t->tab[i]->parms.qavg;
+ qavg += t->tab[i]->parms.qavg;
}
}
@@ -203,28 +202,28 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
gred_store_wred_set(t, q);
switch (red_action(&q->parms, q->parms.qavg + qavg)) {
- case RED_DONT_MARK:
- break;
-
- case RED_PROB_MARK:
- sch->qstats.overlimits++;
- if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
- q->stats.prob_drop++;
- goto congestion_drop;
- }
-
- q->stats.prob_mark++;
- break;
-
- case RED_HARD_MARK:
- sch->qstats.overlimits++;
- if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
- !INET_ECN_set_ce(skb)) {
- q->stats.forced_drop++;
- goto congestion_drop;
- }
- q->stats.forced_mark++;
- break;
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ sch->qstats.overlimits++;
+ if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ break;
+
+ case RED_HARD_MARK:
+ sch->qstats.overlimits++;
+ if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+ q->stats.forced_mark++;
+ break;
}
if (q->backlog + qdisc_pkt_len(skb) <= q->limit) {
@@ -241,7 +240,7 @@ congestion_drop:
return NET_XMIT_CN;
}
-static struct sk_buff *gred_dequeue(struct Qdisc* sch)
+static struct sk_buff *gred_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
struct gred_sched *t = qdisc_priv(sch);
@@ -254,9 +253,9 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch)
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
if (net_ratelimit())
- printk(KERN_WARNING "GRED: Unable to relocate "
- "VQ 0x%x after dequeue, screwing up "
- "backlog.\n", tc_index_to_dp(skb));
+ pr_warning("GRED: Unable to relocate VQ 0x%x "
+ "after dequeue, screwing up "
+ "backlog.\n", tc_index_to_dp(skb));
} else {
q->backlog -= qdisc_pkt_len(skb);
@@ -273,7 +272,7 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch)
return NULL;
}
-static unsigned int gred_drop(struct Qdisc* sch)
+static unsigned int gred_drop(struct Qdisc *sch)
{
struct sk_buff *skb;
struct gred_sched *t = qdisc_priv(sch);
@@ -286,9 +285,9 @@ static unsigned int gred_drop(struct Qdisc* sch)
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
if (net_ratelimit())
- printk(KERN_WARNING "GRED: Unable to relocate "
- "VQ 0x%x while dropping, screwing up "
- "backlog.\n", tc_index_to_dp(skb));
+ pr_warning("GRED: Unable to relocate VQ 0x%x "
+ "while dropping, screwing up "
+ "backlog.\n", tc_index_to_dp(skb));
} else {
q->backlog -= len;
q->stats.other++;
@@ -308,7 +307,7 @@ static unsigned int gred_drop(struct Qdisc* sch)
}
-static void gred_reset(struct Qdisc* sch)
+static void gred_reset(struct Qdisc *sch)
{
int i;
struct gred_sched *t = qdisc_priv(sch);
@@ -369,8 +368,8 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
for (i = table->DPs; i < MAX_DPs; i++) {
if (table->tab[i]) {
- printk(KERN_WARNING "GRED: Warning: Destroying "
- "shadowed VQ 0x%x\n", i);
+ pr_warning("GRED: Warning: Destroying "
+ "shadowed VQ 0x%x\n", i);
gred_destroy_vq(table->tab[i]);
table->tab[i] = NULL;
}
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 2e45791d4f6..6488e642565 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -81,8 +81,7 @@
* that are expensive on 32-bit architectures.
*/
-struct internal_sc
-{
+struct internal_sc {
u64 sm1; /* scaled slope of the 1st segment */
u64 ism1; /* scaled inverse-slope of the 1st segment */
u64 dx; /* the x-projection of the 1st segment */
@@ -92,8 +91,7 @@ struct internal_sc
};
/* runtime service curve */
-struct runtime_sc
-{
+struct runtime_sc {
u64 x; /* current starting position on x-axis */
u64 y; /* current starting position on y-axis */
u64 sm1; /* scaled slope of the 1st segment */
@@ -104,15 +102,13 @@ struct runtime_sc
u64 ism2; /* scaled inverse-slope of the 2nd segment */
};
-enum hfsc_class_flags
-{
+enum hfsc_class_flags {
HFSC_RSC = 0x1,
HFSC_FSC = 0x2,
HFSC_USC = 0x4
};
-struct hfsc_class
-{
+struct hfsc_class {
struct Qdisc_class_common cl_common;
unsigned int refcnt; /* usage count */
@@ -140,8 +136,8 @@ struct hfsc_class
u64 cl_cumul; /* cumulative work in bytes done by
real-time criteria */
- u64 cl_d; /* deadline*/
- u64 cl_e; /* eligible time */
+ u64 cl_d; /* deadline*/
+ u64 cl_e; /* eligible time */
u64 cl_vt; /* virtual time */
u64 cl_f; /* time when this class will fit for
link-sharing, max(myf, cfmin) */
@@ -176,8 +172,7 @@ struct hfsc_class
unsigned long cl_nactive; /* number of active children */
};
-struct hfsc_sched
-{
+struct hfsc_sched {
u16 defcls; /* default class id */
struct hfsc_class root; /* root class */
struct Qdisc_class_hash clhash; /* class hash */
@@ -693,7 +688,7 @@ init_vf(struct hfsc_class *cl, unsigned int len)
if (go_active) {
n = rb_last(&cl->cl_parent->vt_tree);
if (n != NULL) {
- max_cl = rb_entry(n, struct hfsc_class,vt_node);
+ max_cl = rb_entry(n, struct hfsc_class, vt_node);
/*
* set vt to the average of the min and max
* classes. if the parent's period didn't
@@ -1177,8 +1172,10 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
return NULL;
}
#endif
- if ((cl = (struct hfsc_class *)res.class) == NULL) {
- if ((cl = hfsc_find_class(res.classid, sch)) == NULL)
+ cl = (struct hfsc_class *)res.class;
+ if (!cl) {
+ cl = hfsc_find_class(res.classid, sch);
+ if (!cl)
break; /* filter selected invalid classid */
if (cl->level >= head->level)
break; /* filter may only point downwards */
@@ -1316,7 +1313,7 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
return -1;
}
-static inline int
+static int
hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl)
{
if ((cl->cl_flags & HFSC_RSC) &&
@@ -1420,7 +1417,8 @@ hfsc_schedule_watchdog(struct Qdisc *sch)
struct hfsc_class *cl;
u64 next_time = 0;
- if ((cl = eltree_get_minel(q)) != NULL)
+ cl = eltree_get_minel(q);
+ if (cl)
next_time = cl->cl_e;
if (q->root.cl_cfmin != 0) {
if (next_time == 0 || next_time > q->root.cl_cfmin)
@@ -1600,7 +1598,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
set_active(cl, qdisc_pkt_len(skb));
bstats_update(&cl->bstats, skb);
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -1626,7 +1623,8 @@ hfsc_dequeue(struct Qdisc *sch)
* find the class with the minimum deadline among
* the eligible classes.
*/
- if ((cl = eltree_get_mindl(q, cur_time)) != NULL) {
+ cl = eltree_get_mindl(q, cur_time);
+ if (cl) {
realtime = 1;
} else {
/*
@@ -1665,7 +1663,8 @@ hfsc_dequeue(struct Qdisc *sch)
set_passive(cl);
}
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 984c1b0c683..e1429a85091 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -99,9 +99,10 @@ struct htb_class {
struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */
struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */
/* When class changes from state 1->2 and disconnects from
- parent's feed then we lost ptr value and start from the
- first child again. Here we store classid of the
- last valid ptr (used when ptr is NULL). */
+ * parent's feed then we lost ptr value and start from the
+ * first child again. Here we store classid of the
+ * last valid ptr (used when ptr is NULL).
+ */
u32 last_ptr_id[TC_HTB_NUMPRIO];
} inner;
} un;
@@ -185,7 +186,7 @@ static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
* have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull
* then finish and return direct queue.
*/
-#define HTB_DIRECT (struct htb_class*)-1
+#define HTB_DIRECT ((struct htb_class *)-1L)
static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
int *qerr)
@@ -197,11 +198,13 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
int result;
/* allow to select class by setting skb->priority to valid classid;
- note that nfmark can be used too by attaching filter fw with no
- rules in it */
+ * note that nfmark can be used too by attaching filter fw with no
+ * rules in it
+ */
if (skb->priority == sch->handle)
return HTB_DIRECT; /* X:0 (direct flow) selected */
- if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
+ cl = htb_find(skb->priority, sch);
+ if (cl && cl->level == 0)
return cl;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
@@ -216,10 +219,12 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
return NULL;
}
#endif
- if ((cl = (void *)res.class) == NULL) {
+ cl = (void *)res.class;
+ if (!cl) {
if (res.classid == sch->handle)
return HTB_DIRECT; /* X:0 (direct flow) */
- if ((cl = htb_find(res.classid, sch)) == NULL)
+ cl = htb_find(res.classid, sch);
+ if (!cl)
break; /* filter selected invalid classid */
}
if (!cl->level)
@@ -378,7 +383,8 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
if (p->un.inner.feed[prio].rb_node)
/* parent already has its feed in use so that
- reset bit in mask as parent is already ok */
+ * reset bit in mask as parent is already ok
+ */
mask &= ~(1 << prio);
htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
@@ -413,8 +419,9 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
if (p->un.inner.ptr[prio] == cl->node + prio) {
/* we are removing child which is pointed to from
- parent feed - forget the pointer but remember
- classid */
+ * parent feed - forget the pointer but remember
+ * classid
+ */
p->un.inner.last_ptr_id[prio] = cl->common.classid;
p->un.inner.ptr[prio] = NULL;
}
@@ -574,7 +581,6 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
return NET_XMIT_SUCCESS;
}
@@ -664,8 +670,9 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
unsigned long start)
{
/* don't run for longer than 2 jiffies; 2 is used instead of
- 1 to simplify things when jiffy is going to be incremented
- too soon */
+ * 1 to simplify things when jiffy is going to be incremented
+ * too soon
+ */
unsigned long stop_at = start + 2;
while (time_before(jiffies, stop_at)) {
struct htb_class *cl;
@@ -688,7 +695,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
/* too much load - let's continue after a break for scheduling */
if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
- printk(KERN_WARNING "htb: too many events!\n");
+ pr_warning("htb: too many events!\n");
q->warned |= HTB_WARN_TOOMANYEVENTS;
}
@@ -696,7 +703,8 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
}
/* Returns class->node+prio from id-tree where classe's id is >= id. NULL
- is no such one exists. */
+ * is no such one exists.
+ */
static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
u32 id)
{
@@ -740,12 +748,14 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
for (i = 0; i < 65535; i++) {
if (!*sp->pptr && *sp->pid) {
/* ptr was invalidated but id is valid - try to recover
- the original or next ptr */
+ * the original or next ptr
+ */
*sp->pptr =
htb_id_find_next_upper(prio, sp->root, *sp->pid);
}
*sp->pid = 0; /* ptr is valid now so that remove this hint as it
- can become out of date quickly */
+ * can become out of date quickly
+ */
if (!*sp->pptr) { /* we are at right end; rewind & go up */
*sp->pptr = sp->root;
while ((*sp->pptr)->rb_left)
@@ -773,7 +783,8 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
}
/* dequeues packet at given priority and level; call only if
- you are sure that there is active class at prio/level */
+ * you are sure that there is active class at prio/level
+ */
static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
int level)
{
@@ -790,9 +801,10 @@ next:
return NULL;
/* class can be empty - it is unlikely but can be true if leaf
- qdisc drops packets in enqueue routine or if someone used
- graft operation on the leaf since last dequeue;
- simply deactivate and skip such class */
+ * qdisc drops packets in enqueue routine or if someone used
+ * graft operation on the leaf since last dequeue;
+ * simply deactivate and skip such class
+ */
if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
struct htb_class *next;
htb_deactivate(q, cl);
@@ -832,7 +844,8 @@ next:
ptr[0]) + prio);
}
/* this used to be after charge_class but this constelation
- gives us slightly better performance */
+ * gives us slightly better performance
+ */
if (!cl->un.leaf.q->q.qlen)
htb_deactivate(q, cl);
htb_charge_class(q, cl, level, skb);
@@ -842,7 +855,7 @@ next:
static struct sk_buff *htb_dequeue(struct Qdisc *sch)
{
- struct sk_buff *skb = NULL;
+ struct sk_buff *skb;
struct htb_sched *q = qdisc_priv(sch);
int level;
psched_time_t next_event;
@@ -851,7 +864,9 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
/* try to dequeue direct packets as high prio (!) to minimize cpu work */
skb = __skb_dequeue(&q->direct_queue);
if (skb != NULL) {
- sch->flags &= ~TCQ_F_THROTTLED;
+ok:
+ qdisc_bstats_update(sch, skb);
+ qdisc_unthrottled(sch);
sch->q.qlen--;
return skb;
}
@@ -882,13 +897,11 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
m = ~q->row_mask[level];
while (m != (int)(-1)) {
int prio = ffz(m);
+
m |= 1 << prio;
skb = htb_dequeue_tree(q, prio, level);
- if (likely(skb != NULL)) {
- sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
- goto fin;
- }
+ if (likely(skb != NULL))
+ goto ok;
}
}
sch->qstats.overlimits++;
@@ -989,13 +1002,12 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
return err;
if (tb[TCA_HTB_INIT] == NULL) {
- printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
+ pr_err("HTB: hey probably you have bad tc tool ?\n");
return -EINVAL;
}
gopt = nla_data(tb[TCA_HTB_INIT]);
if (gopt->version != HTB_VER >> 16) {
- printk(KERN_ERR
- "HTB: need tc/htb version %d (minor is %d), you have %d\n",
+ pr_err("HTB: need tc/htb version %d (minor is %d), you have %d\n",
HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
return -EINVAL;
}
@@ -1208,9 +1220,10 @@ static void htb_destroy(struct Qdisc *sch)
cancel_work_sync(&q->work);
qdisc_watchdog_cancel(&q->watchdog);
/* This line used to be after htb_destroy_class call below
- and surprisingly it worked in 2.4. But it must precede it
- because filter need its target class alive to be able to call
- unbind_filter on it (without Oops). */
+ * and surprisingly it worked in 2.4. But it must precede it
+ * because filter need its target class alive to be able to call
+ * unbind_filter on it (without Oops).
+ */
tcf_destroy_chain(&q->filter_list);
for (i = 0; i < q->clhash.hashsize; i++) {
@@ -1344,11 +1357,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
/* check maximal depth */
if (parent && parent->parent && parent->parent->level < 2) {
- printk(KERN_ERR "htb: tree is too deep\n");
+ pr_err("htb: tree is too deep\n");
goto failure;
}
err = -ENOBUFS;
- if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
+ cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+ if (!cl)
goto failure;
err = gen_new_estimator(&cl->bstats, &cl->rate_est,
@@ -1368,8 +1382,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
RB_CLEAR_NODE(&cl->node[prio]);
/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
- so that can't be used inside of sch_tree_lock
- -- thanks to Karlis Peisenieks */
+ * so that can't be used inside of sch_tree_lock
+ * -- thanks to Karlis Peisenieks
+ */
new_q = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, classid);
sch_tree_lock(sch);
@@ -1421,17 +1436,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
}
/* it used to be a nasty bug here, we have to check that node
- is really leaf before changing cl->un.leaf ! */
+ * is really leaf before changing cl->un.leaf !
+ */
if (!cl->level) {
cl->quantum = rtab->rate.rate / q->rate2quantum;
if (!hopt->quantum && cl->quantum < 1000) {
- printk(KERN_WARNING
+ pr_warning(
"HTB: quantum of class %X is small. Consider r2q change.\n",
cl->common.classid);
cl->quantum = 1000;
}
if (!hopt->quantum && cl->quantum > 200000) {
- printk(KERN_WARNING
+ pr_warning(
"HTB: quantum of class %X is big. Consider r2q change.\n",
cl->common.classid);
cl->quantum = 200000;
@@ -1480,13 +1496,13 @@ static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
struct htb_class *cl = htb_find(classid, sch);
/*if (cl && !cl->level) return 0;
- The line above used to be there to prevent attaching filters to
- leaves. But at least tc_index filter uses this just to get class
- for other reasons so that we have to allow for it.
- ----
- 19.6.2002 As Werner explained it is ok - bind filter is just
- another way to "lock" the class - unlike "get" this lock can
- be broken by class during destroy IIUC.
+ * The line above used to be there to prevent attaching filters to
+ * leaves. But at least tc_index filter uses this just to get class
+ * for other reasons so that we have to allow for it.
+ * ----
+ * 19.6.2002 As Werner explained it is ok - bind filter is just
+ * another way to "lock" the class - unlike "get" this lock can
+ * be broken by class during destroy IIUC.
*/
if (cl)
cl->filter_cnt++;
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index ecc302f4d2a..ec5cbc84896 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -61,7 +61,6 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
TC_H_MIN(ntx + 1)));
if (qdisc == NULL)
goto err;
- qdisc->flags |= TCQ_F_CAN_BYPASS;
priv->qdiscs[ntx] = qdisc;
}
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
new file mode 100644
index 00000000000..ea17cbed29e
--- /dev/null
+++ b/net/sched/sch_mqprio.c
@@ -0,0 +1,418 @@
+/*
+ * net/sched/sch_mqprio.c
+ *
+ * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct mqprio_sched {
+ struct Qdisc **qdiscs;
+ int hw_owned;
+};
+
+static void mqprio_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ unsigned int ntx;
+
+ if (priv->qdiscs) {
+ for (ntx = 0;
+ ntx < dev->num_tx_queues && priv->qdiscs[ntx];
+ ntx++)
+ qdisc_destroy(priv->qdiscs[ntx]);
+ kfree(priv->qdiscs);
+ }
+
+ if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
+ dev->netdev_ops->ndo_setup_tc(dev, 0);
+ else
+ netdev_set_num_tc(dev, 0);
+}
+
+static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+{
+ int i, j;
+
+ /* Verify num_tc is not out of max range */
+ if (qopt->num_tc > TC_MAX_QUEUE)
+ return -EINVAL;
+
+ /* Verify priority mapping uses valid tcs */
+ for (i = 0; i < TC_BITMASK + 1; i++) {
+ if (qopt->prio_tc_map[i] >= qopt->num_tc)
+ return -EINVAL;
+ }
+
+ /* net_device does not support requested operation */
+ if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
+ return -EINVAL;
+
+ /* If hw owned, qcount and qoffset are taken from the LLD, so
+ * there is no reason to verify them here.
+ */
+ if (qopt->hw)
+ return 0;
+
+ for (i = 0; i < qopt->num_tc; i++) {
+ unsigned int last = qopt->offset[i] + qopt->count[i];
+
+ /* Verify the queue count is in tx range; being equal to the
+ * real_num_tx_queues indicates the last queue is in use.
+ */
+ if (qopt->offset[i] >= dev->real_num_tx_queues ||
+ !qopt->count[i] ||
+ last > dev->real_num_tx_queues)
+ return -EINVAL;
+
+ /* Verify that the offset and counts do not overlap */
+ for (j = i + 1; j < qopt->num_tc; j++) {
+ if (last > qopt->offset[j])
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct netdev_queue *dev_queue;
+ struct Qdisc *qdisc;
+ int i, err = -EOPNOTSUPP;
+ struct tc_mqprio_qopt *qopt = NULL;
+
+ BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
+ BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
+
+ if (sch->parent != TC_H_ROOT)
+ return -EOPNOTSUPP;
+
+ if (!netif_is_multiqueue(dev))
+ return -EOPNOTSUPP;
+
+ if (nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+
+ qopt = nla_data(opt);
+ if (mqprio_parse_opt(dev, qopt))
+ return -EINVAL;
+
+ /* pre-allocate qdisc, attachment can't fail */
+ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+ GFP_KERNEL);
+ if (priv->qdiscs == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ dev_queue = netdev_get_tx_queue(dev, i);
+ qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1)));
+ if (qdisc == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+ priv->qdiscs[i] = qdisc;
+ }
+
+ /* If the mqprio options indicate that hardware should own
+ * the queue mapping then run ndo_setup_tc otherwise use the
+ * supplied and verified mapping
+ */
+ if (qopt->hw) {
+ priv->hw_owned = 1;
+ err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
+ if (err)
+ goto err;
+ } else {
+ netdev_set_num_tc(dev, qopt->num_tc);
+ for (i = 0; i < qopt->num_tc; i++)
+ netdev_set_tc_queue(dev, i,
+ qopt->count[i], qopt->offset[i]);
+ }
+
+ /* Always use supplied priority mappings */
+ for (i = 0; i < TC_BITMASK + 1; i++)
+ netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
+
+ sch->flags |= TCQ_F_MQROOT;
+ return 0;
+
+err:
+ mqprio_destroy(sch);
+ return err;
+}
+
+static void mqprio_attach(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct Qdisc *qdisc;
+ unsigned int ntx;
+
+ /* Attach underlying qdisc */
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ qdisc = priv->qdiscs[ntx];
+ qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+ if (qdisc)
+ qdisc_destroy(qdisc);
+ }
+ kfree(priv->qdiscs);
+ priv->qdiscs = NULL;
+}
+
+static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
+
+ if (ntx >= dev->num_tx_queues)
+ return NULL;
+ return netdev_get_tx_queue(dev, ntx);
+}
+
+static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return -EINVAL;
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ *old = dev_graft_qdisc(dev_queue, new);
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+
+ return 0;
+}
+
+static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_mqprio_qopt opt = { 0 };
+ struct Qdisc *qdisc;
+ unsigned int i;
+
+ sch->q.qlen = 0;
+ memset(&sch->bstats, 0, sizeof(sch->bstats));
+ memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+ spin_lock_bh(qdisc_lock(qdisc));
+ sch->q.qlen += qdisc->q.qlen;
+ sch->bstats.bytes += qdisc->bstats.bytes;
+ sch->bstats.packets += qdisc->bstats.packets;
+ sch->qstats.qlen += qdisc->qstats.qlen;
+ sch->qstats.backlog += qdisc->qstats.backlog;
+ sch->qstats.drops += qdisc->qstats.drops;
+ sch->qstats.requeues += qdisc->qstats.requeues;
+ sch->qstats.overlimits += qdisc->qstats.overlimits;
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+
+ opt.num_tc = netdev_get_num_tc(dev);
+ memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+ opt.hw = priv->hw_owned;
+
+ for (i = 0; i < netdev_get_num_tc(dev); i++) {
+ opt.count[i] = dev->tc_to_txq[i].count;
+ opt.offset[i] = dev->tc_to_txq[i].offset;
+ }
+
+ NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+ return skb->len;
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return NULL;
+
+ return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int ntx = TC_H_MIN(classid);
+
+ if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
+ return 0;
+ return ntx;
+}
+
+static void mqprio_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct net_device *dev = qdisc_dev(sch);
+
+ if (cl <= netdev_get_num_tc(dev)) {
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_info = 0;
+ } else {
+ int i;
+ struct netdev_queue *dev_queue;
+
+ dev_queue = mqprio_queue_get(sch, cl);
+ tcm->tcm_parent = 0;
+ for (i = 0; i < netdev_get_num_tc(dev); i++) {
+ struct netdev_tc_txq tc = dev->tc_to_txq[i];
+ int q_idx = cl - netdev_get_num_tc(dev);
+
+ if (q_idx > tc.offset &&
+ q_idx <= tc.offset + tc.count) {
+ tcm->tcm_parent =
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1));
+ break;
+ }
+ }
+ tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+ }
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+ __releases(d->lock)
+ __acquires(d->lock)
+{
+ struct net_device *dev = qdisc_dev(sch);
+
+ if (cl <= netdev_get_num_tc(dev)) {
+ int i;
+ struct Qdisc *qdisc;
+ struct gnet_stats_queue qstats = {0};
+ struct gnet_stats_basic_packed bstats = {0};
+ struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
+
+ /* Drop the lock here; it will be reclaimed before touching
+ * statistics. This is required because the d->lock we
+ * hold here is the lock on dev_queue->qdisc_sleeping,
+ * which is also acquired below.
+ */
+ spin_unlock_bh(d->lock);
+
+ for (i = tc.offset; i < tc.offset + tc.count; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+ spin_lock_bh(qdisc_lock(qdisc));
+ bstats.bytes += qdisc->bstats.bytes;
+ bstats.packets += qdisc->bstats.packets;
+ qstats.qlen += qdisc->qstats.qlen;
+ qstats.backlog += qdisc->qstats.backlog;
+ qstats.drops += qdisc->qstats.drops;
+ qstats.requeues += qdisc->qstats.requeues;
+ qstats.overlimits += qdisc->qstats.overlimits;
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+ /* Reclaim root sleeping lock before completing stats */
+ spin_lock_bh(d->lock);
+ if (gnet_stats_copy_basic(d, &bstats) < 0 ||
+ gnet_stats_copy_queue(d, &qstats) < 0)
+ return -1;
+ } else {
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ sch = dev_queue->qdisc_sleeping;
+ sch->qstats.qlen = sch->q.qlen;
+ if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
+ gnet_stats_copy_queue(d, &sch->qstats) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx;
+
+ if (arg->stop)
+ return;
+
+ /* Walk hierarchy with a virtual class per tc */
+ arg->count = arg->skip;
+ for (ntx = arg->skip;
+ ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
+ ntx++) {
+ if (arg->fn(sch, ntx + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops mqprio_class_ops = {
+ .graft = mqprio_graft,
+ .leaf = mqprio_leaf,
+ .get = mqprio_get,
+ .put = mqprio_put,
+ .walk = mqprio_walk,
+ .dump = mqprio_dump_class,
+ .dump_stats = mqprio_dump_class_stats,
+};
+
+static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
+ .cl_ops = &mqprio_class_ops,
+ .id = "mqprio",
+ .priv_size = sizeof(struct mqprio_sched),
+ .init = mqprio_init,
+ .destroy = mqprio_destroy,
+ .attach = mqprio_attach,
+ .dump = mqprio_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init mqprio_module_init(void)
+{
+ return register_qdisc(&mqprio_qdisc_ops);
+}
+
+static void __exit mqprio_module_exit(void)
+{
+ unregister_qdisc(&mqprio_qdisc_ops);
+}
+
+module_init(mqprio_module_init);
+module_exit(mqprio_module_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 21f13da2476..edc1950e0e7 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -83,7 +83,6 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -112,6 +111,7 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
qdisc = q->queues[q->curband];
skb = qdisc->dequeue(qdisc);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -156,7 +156,7 @@ static unsigned int multiq_drop(struct Qdisc *sch)
unsigned int len;
struct Qdisc *qdisc;
- for (band = q->bands-1; band >= 0; band--) {
+ for (band = q->bands - 1; band >= 0; band--) {
qdisc = q->queues[band];
if (qdisc->ops->drop) {
len = qdisc->ops->drop(qdisc);
@@ -265,7 +265,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
for (i = 0; i < q->max_bands; i++)
q->queues[i] = &noop_qdisc;
- err = multiq_tune(sch,opt);
+ err = multiq_tune(sch, opt);
if (err)
kfree(q->queues);
@@ -346,7 +346,7 @@ static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
struct multiq_sched_data *q = qdisc_priv(sch);
tcm->tcm_handle |= TC_H_MIN(cl);
- tcm->tcm_info = q->queues[cl-1]->handle;
+ tcm->tcm_info = q->queues[cl - 1]->handle;
return 0;
}
@@ -378,7 +378,7 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
arg->count++;
continue;
}
- if (arg->fn(sch, band+1, arg) < 0) {
+ if (arg->fn(sch, band + 1, arg) < 0) {
arg->stop = 1;
break;
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 1c4bce86347..edbbf7ad662 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -19,12 +19,13 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
-#define VERSION "1.2"
+#define VERSION "1.3"
/* Network Emulation Queuing algorithm.
====================================
@@ -47,6 +48,20 @@
layering other disciplines. It does not need to do bandwidth
control either since that can be handled by using token
bucket or other rate control.
+
+ Correlated Loss Generator models
+
+ Added generation of correlated loss according to a 4-state
+ Markov model (GI) and to the "Gilbert-Elliot" model.
+
+ References:
+ [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
+ [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
+ and intuitive loss model for packet networks and its implementation
+ in the Netem module in the Linux kernel", available in [1]
+
+ Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
+ Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
struct netem_sched_data {
@@ -73,6 +88,26 @@ struct netem_sched_data {
u32 size;
s16 table[0];
} *delay_dist;
+
+ enum {
+ CLG_RANDOM,
+ CLG_4_STATES,
+ CLG_GILB_ELL,
+ } loss_model;
+
+ /* Correlated Loss Generation models */
+ struct clgstate {
+ /* state of the Markov chain */
+ u8 state;
+
+ /* 4-states and Gilbert-Elliot models */
+ u32 a1; /* p13 for 4-states or p for GE */
+ u32 a2; /* p31 for 4-states or r for GE */
+ u32 a3; /* p32 for 4-states or h for GE */
+ u32 a4; /* p14 for 4-states or 1-k for GE */
+ u32 a5; /* p23 used only in 4-states */
+ } clg;
+
};
/* Time stamp put into socket buffer control block */
@@ -115,6 +150,122 @@ static u32 get_crandom(struct crndstate *state)
return answer;
}
+/* loss_4state - 4-state model loss generator
+ * Generates losses according to the 4-state Markov chain adopted in
+ * the GI (General and Intuitive) loss model.
+ */
+static bool loss_4state(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+ u32 rnd = net_random();
+
+ /*
+ * Makes a comparison between rnd and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state and if the next packet has to be transmitted or lost.
+ * The four states correspond to:
+ * 1 => successfully transmitted packets within a gap period
+ * 4 => isolated losses within a gap period
+ * 3 => lost packets within a burst period
+ * 2 => successfully transmitted packets within a burst period
+ */
+ switch (clg->state) {
+ case 1:
+ if (rnd < clg->a4) {
+ clg->state = 4;
+ return true;
+ } else if (clg->a4 < rnd && rnd < clg->a1) {
+ clg->state = 3;
+ return true;
+ } else if (clg->a1 < rnd)
+ clg->state = 1;
+
+ break;
+ case 2:
+ if (rnd < clg->a5) {
+ clg->state = 3;
+ return true;
+ } else
+ clg->state = 2;
+
+ break;
+ case 3:
+ if (rnd < clg->a3)
+ clg->state = 2;
+ else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
+ clg->state = 1;
+ return true;
+ } else if (clg->a2 + clg->a3 < rnd) {
+ clg->state = 3;
+ return true;
+ }
+ break;
+ case 4:
+ clg->state = 1;
+ break;
+ }
+
+ return false;
+}
+
+/* loss_gilb_ell - Gilbert-Elliot model loss generator
+ * Generates losses according to the Gilbert-Elliot loss model or
+ * its special cases (Gilbert or Simple Gilbert)
+ *
+ * Makes a comparison between random number and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state. A second random number is extracted and the comparison
+ * with the loss probability of the current state decides if the next
+ * packet will be transmitted or lost.
+ */
+static bool loss_gilb_ell(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+
+ switch (clg->state) {
+ case 1:
+ if (net_random() < clg->a1)
+ clg->state = 2;
+ if (net_random() < clg->a4)
+ return true;
+ case 2:
+ if (net_random() < clg->a2)
+ clg->state = 1;
+ if (clg->a3 > net_random())
+ return true;
+ }
+
+ return false;
+}
+
+static bool loss_event(struct netem_sched_data *q)
+{
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* Random packet drop 0 => none, ~0 => all */
+ return q->loss && q->loss >= get_crandom(&q->loss_cor);
+
+ case CLG_4_STATES:
+ /* 4state loss model algorithm (used also for GI model)
+ * Extracts a value from the markov 4 state loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_4state(q);
+
+ case CLG_GILB_ELL:
+ /* Gilbert-Elliot loss model algorithm
+ * Extracts a value from the Gilbert-Elliot loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_gilb_ell(q);
+ }
+
+ return false; /* not reached */
+}
+
+
/* tabledist - return a pseudo-randomly distributed value with mean mu and
* std deviation sigma. Uses table lookup to approximate the desired
* distribution, and a uniformly-distributed pseudo-random source.
@@ -161,14 +312,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
int ret;
int count = 1;
- pr_debug("netem_enqueue skb=%p\n", skb);
-
/* Random duplication */
if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
++count;
- /* Random packet drop 0 => none, ~0 => all */
- if (q->loss && q->loss >= get_crandom(&q->loss_cor))
+ /* Drop packet? */
+ if (loss_event(q))
--count;
if (count == 0) {
@@ -211,8 +360,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
cb = netem_skb_cb(skb);
- if (q->gap == 0 || /* not doing reordering */
- q->counter < q->gap || /* inside last reordering gap */
+ if (q->gap == 0 || /* not doing reordering */
+ q->counter < q->gap || /* inside last reordering gap */
q->reorder < get_crandom(&q->reorder_cor)) {
psched_time_t now;
psched_tdiff_t delay;
@@ -238,18 +387,18 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = NET_XMIT_SUCCESS;
}
- if (likely(ret == NET_XMIT_SUCCESS)) {
- sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
- } else if (net_xmit_drop_count(ret)) {
- sch->qstats.drops++;
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret)) {
+ sch->qstats.drops++;
+ return ret;
+ }
}
- pr_debug("netem: enqueue ret %d\n", ret);
- return ret;
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
}
-static unsigned int netem_drop(struct Qdisc* sch)
+static unsigned int netem_drop(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
unsigned int len = 0;
@@ -266,7 +415,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
- if (sch->flags & TCQ_F_THROTTLED)
+ if (qdisc_is_throttled(sch))
return NULL;
skb = q->qdisc->ops->peek(q->qdisc);
@@ -288,8 +437,10 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
skb->tstamp.tv64 = 0;
#endif
- pr_debug("netem_dequeue: return skb=%p\n", skb);
+
sch->q.qlen--;
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
return skb;
}
@@ -308,6 +459,16 @@ static void netem_reset(struct Qdisc *sch)
qdisc_watchdog_cancel(&q->watchdog);
}
+static void dist_free(struct disttable *d)
+{
+ if (d) {
+ if (is_vmalloc_addr(d))
+ vfree(d);
+ else
+ kfree(d);
+ }
+}
+
/*
* Distribution data is a variable size payload containing
* signed 16 bit values.
@@ -315,16 +476,20 @@ static void netem_reset(struct Qdisc *sch)
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
struct netem_sched_data *q = qdisc_priv(sch);
- unsigned long n = nla_len(attr)/sizeof(__s16);
+ size_t n = nla_len(attr)/sizeof(__s16);
const __s16 *data = nla_data(attr);
spinlock_t *root_lock;
struct disttable *d;
int i;
+ size_t s;
- if (n > 65536)
+ if (n > NETEM_DIST_MAX)
return -EINVAL;
- d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
+ s = sizeof(struct disttable) + n * sizeof(s16);
+ d = kmalloc(s, GFP_KERNEL);
+ if (!d)
+ d = vmalloc(s);
if (!d)
return -ENOMEM;
@@ -335,7 +500,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
root_lock = qdisc_root_sleeping_lock(sch);
spin_lock_bh(root_lock);
- kfree(q->delay_dist);
+ dist_free(q->delay_dist);
q->delay_dist = d;
spin_unlock_bh(root_lock);
return 0;
@@ -369,10 +534,66 @@ static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
init_crandom(&q->corrupt_cor, r->correlation);
}
+static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ const struct nlattr *la;
+ int rem;
+
+ nla_for_each_nested(la, attr, rem) {
+ u16 type = nla_type(la);
+
+ switch(type) {
+ case NETEM_LOSS_GI: {
+ const struct tc_netem_gimodel *gi = nla_data(la);
+
+ if (nla_len(la) != sizeof(struct tc_netem_gimodel)) {
+ pr_info("netem: incorrect gi model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_4_STATES;
+
+ q->clg.state = 1;
+ q->clg.a1 = gi->p13;
+ q->clg.a2 = gi->p31;
+ q->clg.a3 = gi->p32;
+ q->clg.a4 = gi->p14;
+ q->clg.a5 = gi->p23;
+ break;
+ }
+
+ case NETEM_LOSS_GE: {
+ const struct tc_netem_gemodel *ge = nla_data(la);
+
+ if (nla_len(la) != sizeof(struct tc_netem_gemodel)) {
+ pr_info("netem: incorrect gi model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_GILB_ELL;
+ q->clg.state = 1;
+ q->clg.a1 = ge->p;
+ q->clg.a2 = ge->r;
+ q->clg.a3 = ge->h;
+ q->clg.a4 = ge->k1;
+ break;
+ }
+
+ default:
+ pr_info("netem: unknown loss type %u\n", type);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
[TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
[TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
[TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
+ [TCA_NETEM_LOSS] = { .type = NLA_NESTED },
};
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
@@ -380,11 +601,15 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
{
int nested_len = nla_len(nla) - NLA_ALIGN(len);
- if (nested_len < 0)
+ if (nested_len < 0) {
+ pr_info("netem: invalid attributes len %d\n", nested_len);
return -EINVAL;
+ }
+
if (nested_len >= nla_attr_size(0))
return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
nested_len, policy);
+
memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
return 0;
}
@@ -407,7 +632,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
ret = fifo_set_limit(q->qdisc, qopt->limit);
if (ret) {
- pr_debug("netem: can't set fifo limit\n");
+ pr_info("netem: can't set fifo limit\n");
return ret;
}
@@ -440,7 +665,11 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_NETEM_CORRUPT])
get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
- return 0;
+ q->loss_model = CLG_RANDOM;
+ if (tb[TCA_NETEM_LOSS])
+ ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
+
+ return ret;
}
/*
@@ -476,7 +705,6 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
__skb_queue_after(list, skb, nskb);
sch->qstats.backlog += qdisc_pkt_len(nskb);
- qdisc_bstats_update(sch, nskb);
return NET_XMIT_SUCCESS;
}
@@ -536,16 +764,17 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt)
qdisc_watchdog_init(&q->watchdog, sch);
+ q->loss_model = CLG_RANDOM;
q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
TC_H_MAKE(sch->handle, 1));
if (!q->qdisc) {
- pr_debug("netem: qdisc create failed\n");
+ pr_notice("netem: qdisc create tfifo qdisc failed\n");
return -ENOMEM;
}
ret = netem_change(sch, opt);
if (ret) {
- pr_debug("netem: change failed\n");
+ pr_info("netem: change failed\n");
qdisc_destroy(q->qdisc);
}
return ret;
@@ -557,14 +786,61 @@ static void netem_destroy(struct Qdisc *sch)
qdisc_watchdog_cancel(&q->watchdog);
qdisc_destroy(q->qdisc);
- kfree(q->delay_dist);
+ dist_free(q->delay_dist);
+}
+
+static int dump_loss_model(const struct netem_sched_data *q,
+ struct sk_buff *skb)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_NETEM_LOSS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* legacy loss model */
+ nla_nest_cancel(skb, nest);
+ return 0; /* no data */
+
+ case CLG_4_STATES: {
+ struct tc_netem_gimodel gi = {
+ .p13 = q->clg.a1,
+ .p31 = q->clg.a2,
+ .p32 = q->clg.a3,
+ .p14 = q->clg.a4,
+ .p23 = q->clg.a5,
+ };
+
+ NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
+ break;
+ }
+ case CLG_GILB_ELL: {
+ struct tc_netem_gemodel ge = {
+ .p = q->clg.a1,
+ .r = q->clg.a2,
+ .h = q->clg.a3,
+ .k1 = q->clg.a4,
+ };
+
+ NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
+ break;
+ }
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
}
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
const struct netem_sched_data *q = qdisc_priv(sch);
- unsigned char *b = skb_tail_pointer(skb);
- struct nlattr *nla = (struct nlattr *) b;
+ struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
struct tc_netem_qopt qopt;
struct tc_netem_corr cor;
struct tc_netem_reorder reorder;
@@ -591,17 +867,87 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
corrupt.correlation = q->corrupt_cor.rho;
NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
- nla->nla_len = skb_tail_pointer(skb) - b;
+ if (dump_loss_model(q, skb) != 0)
+ goto nla_put_failure;
- return skb->len;
+ return nla_nest_end(skb, nla);
nla_put_failure:
- nlmsg_trim(skb, b);
+ nlmsg_trim(skb, nla);
return -1;
}
+static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ if (cl != 1) /* only one class */
+ return -ENOENT;
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+
+ return 0;
+}
+
+static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ sch_tree_lock(sch);
+ *old = q->qdisc;
+ q->qdisc = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ return q->qdisc;
+}
+
+static unsigned long netem_get(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void netem_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops netem_class_ops = {
+ .graft = netem_graft,
+ .leaf = netem_leaf,
+ .get = netem_get,
+ .put = netem_put,
+ .walk = netem_walk,
+ .dump = netem_dump_class,
+};
+
static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
.id = "netem",
+ .cl_ops = &netem_class_ops,
.priv_size = sizeof(struct netem_sched_data),
.enqueue = netem_enqueue,
.dequeue = netem_dequeue,
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 966158d49dd..2a318f2dc3e 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -22,8 +22,7 @@
#include <net/pkt_sched.h>
-struct prio_sched_data
-{
+struct prio_sched_data {
int bands;
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
@@ -54,7 +53,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
if (!q->filter_list || err < 0) {
if (TC_H_MAJ(band))
band = 0;
- return q->queues[q->prio2band[band&TC_PRIO_MAX]];
+ return q->queues[q->prio2band[band & TC_PRIO_MAX]];
}
band = res.classid;
}
@@ -84,7 +83,6 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -107,7 +105,7 @@ static struct sk_buff *prio_peek(struct Qdisc *sch)
return NULL;
}
-static struct sk_buff *prio_dequeue(struct Qdisc* sch)
+static struct sk_buff *prio_dequeue(struct Qdisc *sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
int prio;
@@ -116,6 +114,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc* sch)
struct Qdisc *qdisc = q->queues[prio];
struct sk_buff *skb = qdisc->dequeue(qdisc);
if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -124,7 +123,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc* sch)
}
-static unsigned int prio_drop(struct Qdisc* sch)
+static unsigned int prio_drop(struct Qdisc *sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
int prio;
@@ -143,24 +142,24 @@ static unsigned int prio_drop(struct Qdisc* sch)
static void
-prio_reset(struct Qdisc* sch)
+prio_reset(struct Qdisc *sch)
{
int prio;
struct prio_sched_data *q = qdisc_priv(sch);
- for (prio=0; prio<q->bands; prio++)
+ for (prio = 0; prio < q->bands; prio++)
qdisc_reset(q->queues[prio]);
sch->q.qlen = 0;
}
static void
-prio_destroy(struct Qdisc* sch)
+prio_destroy(struct Qdisc *sch)
{
int prio;
struct prio_sched_data *q = qdisc_priv(sch);
tcf_destroy_chain(&q->filter_list);
- for (prio=0; prio<q->bands; prio++)
+ for (prio = 0; prio < q->bands; prio++)
qdisc_destroy(q->queues[prio]);
}
@@ -177,7 +176,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
return -EINVAL;
- for (i=0; i<=TC_PRIO_MAX; i++) {
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
if (qopt->priomap[i] >= qopt->bands)
return -EINVAL;
}
@@ -186,7 +185,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
- for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
+ for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {
struct Qdisc *child = q->queues[i];
q->queues[i] = &noop_qdisc;
if (child != &noop_qdisc) {
@@ -196,9 +195,10 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
}
sch_tree_unlock(sch);
- for (i=0; i<q->bands; i++) {
+ for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
+
child = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops,
TC_H_MAKE(sch->handle, i + 1));
@@ -224,7 +224,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)
struct prio_sched_data *q = qdisc_priv(sch);
int i;
- for (i=0; i<TCQ_PRIO_BANDS; i++)
+ for (i = 0; i < TCQ_PRIO_BANDS; i++)
q->queues[i] = &noop_qdisc;
if (opt == NULL) {
@@ -232,7 +232,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)
} else {
int err;
- if ((err= prio_tune(sch, opt)) != 0)
+ if ((err = prio_tune(sch, opt)) != 0)
return err;
}
return 0;
@@ -245,7 +245,7 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_prio_qopt opt;
opt.bands = q->bands;
- memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
+ memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1);
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
@@ -342,7 +342,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
arg->count++;
continue;
}
- if (arg->fn(sch, prio+1, arg) < 0) {
+ if (arg->fn(sch, prio + 1, arg) < 0) {
arg->stop = 1;
break;
}
@@ -350,7 +350,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
}
}
-static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl)
+static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl)
{
struct prio_sched_data *q = qdisc_priv(sch);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a6009c5a2c9..6649463da1b 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -36,8 +36,7 @@
if RED works correctly.
*/
-struct red_sched_data
-{
+struct red_sched_data {
u32 limit; /* HARD maximal queue length */
unsigned char flags;
struct red_parms parms;
@@ -55,7 +54,7 @@ static inline int red_use_harddrop(struct red_sched_data *q)
return q->flags & TC_RED_HARDDROP;
}
-static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
@@ -67,34 +66,33 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
red_end_of_idle_period(&q->parms);
switch (red_action(&q->parms, q->parms.qavg)) {
- case RED_DONT_MARK:
- break;
-
- case RED_PROB_MARK:
- sch->qstats.overlimits++;
- if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
- q->stats.prob_drop++;
- goto congestion_drop;
- }
-
- q->stats.prob_mark++;
- break;
-
- case RED_HARD_MARK:
- sch->qstats.overlimits++;
- if (red_use_harddrop(q) || !red_use_ecn(q) ||
- !INET_ECN_set_ce(skb)) {
- q->stats.forced_drop++;
- goto congestion_drop;
- }
-
- q->stats.forced_mark++;
- break;
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ sch->qstats.overlimits++;
+ if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ break;
+
+ case RED_HARD_MARK:
+ sch->qstats.overlimits++;
+ if (red_use_harddrop(q) || !red_use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ break;
}
ret = qdisc_enqueue(skb, child);
if (likely(ret == NET_XMIT_SUCCESS)) {
- qdisc_bstats_update(sch, skb);
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
q->stats.pdrop++;
@@ -107,22 +105,24 @@ congestion_drop:
return NET_XMIT_CN;
}
-static struct sk_buff * red_dequeue(struct Qdisc* sch)
+static struct sk_buff *red_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
skb = child->dequeue(child);
- if (skb)
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
- else if (!red_is_idling(&q->parms))
- red_start_of_idle_period(&q->parms);
-
+ } else {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ }
return skb;
}
-static struct sk_buff * red_peek(struct Qdisc* sch)
+static struct sk_buff *red_peek(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
@@ -130,7 +130,7 @@ static struct sk_buff * red_peek(struct Qdisc* sch)
return child->ops->peek(child);
}
-static unsigned int red_drop(struct Qdisc* sch)
+static unsigned int red_drop(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
@@ -149,7 +149,7 @@ static unsigned int red_drop(struct Qdisc* sch)
return 0;
}
-static void red_reset(struct Qdisc* sch)
+static void red_reset(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
@@ -216,7 +216,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
return 0;
}
-static int red_init(struct Qdisc* sch, struct nlattr *opt)
+static int red_init(struct Qdisc *sch, struct nlattr *opt)
{
struct red_sched_data *q = qdisc_priv(sch);
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
new file mode 100644
index 00000000000..0a833d0c1f6
--- /dev/null
+++ b/net/sched/sch_sfb.c
@@ -0,0 +1,709 @@
+/*
+ * net/sched/sch_sfb.c Stochastic Fair Blue
+ *
+ * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
+ * A New Class of Active Queue Management Algorithms.
+ * U. Michigan CSE-TR-387-99, April 1999.
+ *
+ * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <net/ip.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+
+/*
+ * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
+ * This implementation uses L = 8 and N = 16
+ * This permits us to split one 32bit hash (provided per packet by rxhash or
+ * external classifier) into 8 subhashes of 4 bits.
+ */
+#define SFB_BUCKET_SHIFT 4
+#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */
+#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1)
+#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */
+
+/* SFB algo uses a virtual queue, named "bin".
+ * One sfb_bucket is the state kept for one bin.
+ */
+struct sfb_bucket {
+ u16 qlen; /* length of virtual queue; increment_one_qlen() stops counting at 0xFFFF */
+ u16 p_mark; /* marking probability, adjusted through prob_plus()/prob_minus() */
+};
+
+/* We use a double buffering right before hash change
+ * (Section 4.4 of SFB reference : moving hash functions)
+ *
+ * One sfb_bins is one such buffer: a full SFB_LEVELS x SFB_NUMBUCKETS
+ * array of bins plus the perturbation value fed to jhash_1word() when
+ * packets are hashed into it. sfb_sched_data holds two of them.
+ */
+struct sfb_bins {
+ u32 perturbation; /* jhash perturbation */
+ struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
+};
+/* Private per-qdisc state of SFB. */
+struct sfb_sched_data {
+ struct Qdisc *qdisc; /* child qdisc that actually holds the packets */
+ struct tcf_proto *filter_list; /* external classifiers; when NULL the skb rxhash is used */
+ unsigned long rehash_interval; /* jiffies between slot swaps; 0 disables rehashing */
+ unsigned long warmup_time; /* double buffering warmup time in jiffies */
+ u32 max; /* packet is dropped once its smallest bin qlen reaches this */
+ u32 bin_size; /* maximum queue length per bin */
+ u32 increment; /* d1 */
+ u32 decrement; /* d2 */
+ u32 limit; /* HARD maximal queue length */
+ u32 penalty_rate; /* packets per second granted to inelastic flows */
+ u32 penalty_burst; /* upper bound for tokens_avail */
+ u32 tokens_avail; /* tokens left for sfb_rate_limit() */
+ unsigned long rehash_time; /* jiffies at the last slot swap or (re)configuration */
+ unsigned long token_time; /* jiffies at the last token refill or (re)configuration */
+
+ u8 slot; /* current active bins (0 or 1) */
+ bool double_buffering; /* when set, inelastic flows also update the inactive bins */
+ struct sfb_bins bins[2];
+
+ struct {
+ u32 earlydrop; /* dropped by the probabilistic mark/drop decision */
+ u32 penaltydrop; /* inelastic flow refused by sfb_rate_limit() */
+ u32 bucketdrop; /* smallest bin qlen reached max */
+ u32 queuedrop; /* sch->q.qlen reached limit */
+ u32 childdrop; /* drops in child qdisc */
+ u32 marked; /* ECN mark */
+ } stats;
+};
+
+/*
+ * Each queued skb might be hashed on one or two bins
+ * We store in skb_cb the two hash values.
+ * (A zero value means double buffering was not used)
+ *
+ * hashes[] is indexed by slot number; sfb_enqueue() forces a computed
+ * hash to be non-zero so that zero can serve as the "not used" marker.
+ */
+struct sfb_skb_cb {
+ u32 hashes[2];
+};
+/*
+ * Return the SFB private area inside the skb control block, i.e. the
+ * data[] part of struct qdisc_skb_cb. The BUILD_BUG_ON breaks the build
+ * if skb->cb is too small for qdisc_skb_cb plus sfb_skb_cb.
+ */
+static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(skb->cb) <
+ sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb));
+ return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+/*
+ * If using 'internal' SFB flow classifier, hash comes from skb rxhash
+ * If using external classifier, hash comes from the classid.
+ *
+ * This accessor only reads back the value stored in the skb control
+ * block for @slot (0 or 1); it does not compute anything.
+ */
+static u32 sfb_hash(const struct sk_buff *skb, u32 slot)
+{
+ return sfb_skb_cb(skb)->hashes[slot];
+}
+
+/* Probabilities are coded as Q0.16 fixed-point values,
+ * with 0xFFFF representing 65535/65536 (almost 1.0)
+ * Addition and subtraction are saturating in [0, 65535]
+ *
+ * prob_plus() returns p1 + p2 clamped to SFB_MAX_PROB (not defined in
+ * this file; presumably 0xFFFF as described above).
+ */
+static u32 prob_plus(u32 p1, u32 p2)
+{
+ u32 res = p1 + p2;
+
+ return min_t(u32, res, SFB_MAX_PROB);
+}
+/* Saturating subtraction: p1 - p2, or 0 when p2 is not smaller than p1. */
+static u32 prob_minus(u32 p1, u32 p2)
+{
+ return p1 > p2 ? p1 - p2 : 0;
+}
+/*
+ * Add one packet to the virtual queues selected by @sfbhash in the bins
+ * of @slot: one bin per level, chosen by successive SFB_BUCKET_SHIFT-bit
+ * slices of the hash starting from the low bits. A bin already at
+ * 0xFFFF is left unchanged.
+ */
+static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
+{
+ int i;
+ struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b[hash].qlen < 0xFFFF)
+ b[hash].qlen++;
+ b += SFB_NUMBUCKETS; /* next level */
+ }
+}
+/*
+ * Account @skb in the bins of every slot for which a hash was recorded
+ * in its control block; a zero hash means that slot was not used.
+ */
+static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ u32 sfbhash;
+
+ sfbhash = sfb_hash(skb, 0);
+ if (sfbhash)
+ increment_one_qlen(sfbhash, 0, q);
+
+ sfbhash = sfb_hash(skb, 1);
+ if (sfbhash)
+ increment_one_qlen(sfbhash, 1, q);
+}
+/*
+ * Remove one packet from the virtual queues selected by @sfbhash in the
+ * bins of @slot, walking the levels the same way increment_one_qlen()
+ * does. A bin already at 0 is left unchanged.
+ */
+static void decrement_one_qlen(u32 sfbhash, u32 slot,
+ struct sfb_sched_data *q)
+{
+ int i;
+ struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b[hash].qlen > 0)
+ b[hash].qlen--;
+ b += SFB_NUMBUCKETS; /* next level */
+ }
+}
+/*
+ * Undo increment_qlen() for @skb: decrement the bins of every slot for
+ * which its control block holds a non-zero hash.
+ */
+static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ u32 sfbhash;
+
+ sfbhash = sfb_hash(skb, 0);
+ if (sfbhash)
+ decrement_one_qlen(sfbhash, 0, q);
+
+ sfbhash = sfb_hash(skb, 1);
+ if (sfbhash)
+ decrement_one_qlen(sfbhash, 1, q);
+}
+/* Lower the marking probability of bin @b by q->decrement, not below 0. */
+static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+ b->p_mark = prob_minus(b->p_mark, q->decrement);
+}
+/* Raise the marking probability of bin @b by q->increment, clamped by prob_plus(). */
+static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+ b->p_mark = prob_plus(b->p_mark, q->increment);
+}
+/*
+ * Zero both sfb_bins entries. The memset covers all of q->bins, so the
+ * two perturbation values are cleared together with the bins.
+ */
+static void sfb_zero_all_buckets(struct sfb_sched_data *q)
+{
+ memset(&q->bins, 0, sizeof(q->bins));
+}
+
+/*
+ * compute max qlen, max p_mark, and avg p_mark
+ *
+ * Only the bins of the currently active slot are scanned, as one flat
+ * array of SFB_LEVELS * SFB_NUMBUCKETS entries. The largest p_mark is
+ * stored in *prob_r, the integer average of all p_mark values in
+ * *avgpm_r, and the largest qlen is the return value.
+ */
+static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q)
+{
+ int i;
+ u32 qlen = 0, prob = 0, totalpm = 0;
+ const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) {
+ if (qlen < b->qlen)
+ qlen = b->qlen;
+ totalpm += b->p_mark;
+ if (prob < b->p_mark)
+ prob = b->p_mark;
+ b++;
+ }
+ *prob_r = prob;
+ *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS);
+ return qlen;
+}
+
+/* Give the bins of @slot a fresh perturbation value taken from net_random(). */
+static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
+{
+ q->bins[slot].perturbation = net_random();
+}
+/*
+ * Switch to the other set of bins. The slot that was active up to now
+ * gets a new perturbation before q->slot is flipped, and double
+ * buffering is switched off. The bin contents are not cleared here.
+ */
+static void sfb_swap_slot(struct sfb_sched_data *q)
+{
+ sfb_init_perturbation(q->slot, q);
+ q->slot ^= 1;
+ q->double_buffering = false;
+}
+
+/* Non elastic flows are allowed to use part of the bandwidth, expressed
+ * in "penalty_rate" packets per second, with "penalty_burst" burst
+ *
+ * Returns true when the packet must be dropped, false when a token was
+ * consumed and the packet may pass. A zero penalty_rate or penalty_burst
+ * always yields true. Tokens are refilled only once they have run out:
+ * the elapsed time since token_time (capped at 10 seconds) is converted
+ * to tokens and clamped to penalty_burst.
+ * NOTE(review): @skb is not referenced in the body.
+ */
+static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ if (q->penalty_rate == 0 || q->penalty_burst == 0)
+ return true;
+
+ if (q->tokens_avail < 1) {
+ unsigned long age = min(10UL * HZ, jiffies - q->token_time);
+
+ q->tokens_avail = (age * q->penalty_rate) / HZ;
+ if (q->tokens_avail > q->penalty_burst)
+ q->tokens_avail = q->penalty_burst;
+ q->token_time = jiffies;
+ if (q->tokens_avail < 1)
+ return true;
+ }
+
+ q->tokens_avail--;
+ return false;
+}
+/*
+ * Run the external classifiers on @skb.
+ * Returns true and stores the minor part of the resulting classid in
+ * *salt when the packet is to be queued. Returns false when
+ * tc_classify() yields a negative result or, with CONFIG_NET_CLS_ACT,
+ * when the verdict is TC_ACT_STOLEN, TC_ACT_QUEUED or TC_ACT_SHOT.
+ * *qerr is written only for the STOLEN/QUEUED verdicts.
+ */
+static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q,
+ int *qerr, u32 *salt)
+{
+ struct tcf_result res;
+ int result;
+
+ result = tc_classify(skb, q->filter_list, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; /* no break: falls through to the return below */
+ case TC_ACT_SHOT:
+ return false;
+ }
+#endif
+ *salt = TC_H_MIN(res.classid);
+ return true;
+ }
+ return false;
+}
+/*
+ * Enqueue @skb, applying the SFB mark/drop decision first.
+ *
+ * Steps, in order:
+ * 1. If rehashing is enabled, swap slots once rehash_interval has
+ * elapsed, or start double buffering once inside the warmup window.
+ * 2. Obtain a per-flow salt from the external classifiers or from the
+ * skb rxhash, hash it with the active slot's perturbation and record
+ * the hash in the skb control block.
+ * 3. Walk the bins selected by the hash, adjusting each p_mark and
+ * collecting the smallest qlen and smallest p_mark.
+ * 4. Drop on a full bin or full queue; rate-limit flows whose smallest
+ * p_mark has reached SFB_MAX_PROB; otherwise mark or drop at random
+ * according to the smallest p_mark.
+ * 5. Hand the packet to the child qdisc and account it in the bins.
+ *
+ * Returns the child's enqueue result, NET_XMIT_CN for SFB drops, or the
+ * classifier-provided code when classification rejected the packet.
+ */
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ int i;
+ u32 p_min = ~0;
+ u32 minqlen = ~0;
+ u32 r, slot, salt, sfbhash;
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+ if (q->rehash_interval > 0) {
+ unsigned long limit = q->rehash_time + q->rehash_interval;
+
+ if (unlikely(time_after(jiffies, limit))) {
+ sfb_swap_slot(q);
+ q->rehash_time = jiffies;
+ } else if (unlikely(!q->double_buffering && q->warmup_time > 0 &&
+ time_after(jiffies, limit - q->warmup_time))) {
+ q->double_buffering = true;
+ }
+ }
+
+ if (q->filter_list) {
+ /* If using external classifiers, get result and record it. */
+ if (!sfb_classify(skb, q, &ret, &salt))
+ goto other_drop;
+ } else {
+ salt = skb_get_rxhash(skb);
+ }
+
+ slot = q->slot;
+
+ sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+ if (!sfbhash)
+ sfbhash = 1; /* zero is reserved for "no hash recorded" */
+ sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+ struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b->qlen == 0)
+ decrement_prob(b, q);
+ else if (b->qlen >= q->bin_size)
+ increment_prob(b, q);
+ if (minqlen > b->qlen)
+ minqlen = b->qlen;
+ if (p_min > b->p_mark)
+ p_min = b->p_mark;
+ }
+
+ slot ^= 1; /* from here on "slot" names the inactive bins */
+ sfb_skb_cb(skb)->hashes[slot] = 0;
+
+ if (unlikely(minqlen >= q->max || sch->q.qlen >= q->limit)) {
+ sch->qstats.overlimits++;
+ if (minqlen >= q->max)
+ q->stats.bucketdrop++;
+ else
+ q->stats.queuedrop++;
+ goto drop;
+ }
+
+ if (unlikely(p_min >= SFB_MAX_PROB)) {
+ /* Inelastic flow */
+ if (q->double_buffering) {
+ sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+ if (!sfbhash)
+ sfbhash = 1;
+ sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+ struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b->qlen == 0)
+ decrement_prob(b, q);
+ else if (b->qlen >= q->bin_size)
+ increment_prob(b, q);
+ }
+ }
+ if (sfb_rate_limit(skb, q)) {
+ sch->qstats.overlimits++;
+ q->stats.penaltydrop++;
+ goto drop;
+ }
+ goto enqueue;
+ }
+
+ r = net_random() & SFB_MAX_PROB;
+
+ if (unlikely(r < p_min)) {
+ if (unlikely(p_min > SFB_MAX_PROB / 2)) {
+ /* If we're marking that many packets, then either
+ * this flow is unresponsive, or we're badly congested.
+ * In either case, we want to start dropping packets.
+ */
+ if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
+ q->stats.earlydrop++;
+ goto drop;
+ }
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.marked++;
+ } else {
+ q->stats.earlydrop++;
+ goto drop;
+ }
+ }
+
+enqueue:
+ ret = qdisc_enqueue(skb, child);
+ if (likely(ret == NET_XMIT_SUCCESS)) {
+ sch->q.qlen++;
+ increment_qlen(skb, q);
+ } else if (net_xmit_drop_count(ret)) {
+ q->stats.childdrop++;
+ sch->qstats.drops++;
+ }
+ return ret;
+
+drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+other_drop:
+ if (ret & __NET_XMIT_BYPASS) /* still set unless sfb_classify() replaced ret */
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return ret;
+}
+/*
+ * Take the next packet from the child qdisc. On success the byte/packet
+ * statistics are updated, the queue length is decremented and the packet
+ * is removed from the virtual queues it was accounted in.
+ * Returns the skb, or NULL when the child has nothing to give.
+ */
+static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ struct sk_buff *skb;
+
+ skb = child->dequeue(q->qdisc); /* q->qdisc is the same object as child */
+
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ decrement_qlen(skb, q);
+ }
+
+ return skb;
+}
+/* Peek at the next packet by delegating to the child qdisc's peek operation. */
+static struct sk_buff *sfb_peek(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+
+ return child->ops->peek(child);
+}
+
+/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */
+/*
+ * Return the qdisc to an empty state: reset the child, zero the queue
+ * length, select slot 0, stop double buffering and clear all bins.
+ * NOTE(review): only slot 0 gets a new perturbation here; slot 1 keeps
+ * the zero written by sfb_zero_all_buckets(), whereas sfb_change() seeds
+ * both slots. Confirm this difference is intended.
+ */
+static void sfb_reset(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset(q->qdisc);
+ sch->q.qlen = 0;
+ q->slot = 0;
+ q->double_buffering = false;
+ sfb_zero_all_buckets(q);
+ sfb_init_perturbation(0, q);
+}
+/* Tear down the qdisc: destroy the classifier chain, then the child qdisc. */
+static void sfb_destroy(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ qdisc_destroy(q->qdisc);
+}
+/* Netlink policy used by sfb_change(): TCA_SFB_PARMS carries a struct tc_sfb_qopt. */
+static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
+ [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) },
+};
+/*
+ * Parameters used by sfb_change() when no options are supplied.
+ * The two time values are in milliseconds; sfb_change() converts them
+ * with msecs_to_jiffies(). A limit of 0 makes sfb_change() derive the
+ * limit from the device tx_queue_len.
+ */
+static const struct tc_sfb_qopt sfb_default_ops = {
+ .rehash_interval = 600 * MSEC_PER_SEC,
+ .warmup_time = 60 * MSEC_PER_SEC,
+ .limit = 0,
+ .max = 25,
+ .bin_size = 20,
+ .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */
+ .decrement = (SFB_MAX_PROB + 3000) / 6000, /* SFB_MAX_PROB / 6000, rounded to nearest */
+ .penalty_rate = 10,
+ .penalty_burst = 20,
+};
+/*
+ * Apply a configuration. With @opt == NULL the defaults in
+ * sfb_default_ops are used; otherwise @opt must nest a TCA_SFB_PARMS
+ * attribute.
+ *
+ * A new pfifo child sized to the effective limit is created first. Then,
+ * between sch_tree_lock() and sch_tree_unlock(), the old child is
+ * destroyed, the parameters are copied in, and the hashing state is
+ * restarted from scratch (slot 0, bins zeroed, both perturbations
+ * re-seeded, token bucket full).
+ *
+ * Returns 0 on success, -EINVAL when parsing fails or TCA_SFB_PARMS is
+ * missing, or the error carried by the child creation.
+ */
+static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child;
+ struct nlattr *tb[TCA_SFB_MAX + 1];
+ const struct tc_sfb_qopt *ctl = &sfb_default_ops;
+ u32 limit;
+ int err;
+
+ if (opt) {
+ err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy);
+ if (err < 0)
+ return -EINVAL;
+
+ if (tb[TCA_SFB_PARMS] == NULL)
+ return -EINVAL;
+
+ ctl = nla_data(tb[TCA_SFB_PARMS]);
+ }
+
+ limit = ctl->limit;
+ if (limit == 0)
+ limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); /* never below 1 */
+
+ child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+
+ sch_tree_lock(sch);
+
+ qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+ qdisc_destroy(q->qdisc);
+ q->qdisc = child;
+
+ q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
+ q->warmup_time = msecs_to_jiffies(ctl->warmup_time);
+ q->rehash_time = jiffies;
+ q->limit = limit;
+ q->increment = ctl->increment;
+ q->decrement = ctl->decrement;
+ q->max = ctl->max;
+ q->bin_size = ctl->bin_size;
+ q->penalty_rate = ctl->penalty_rate;
+ q->penalty_burst = ctl->penalty_burst;
+ q->tokens_avail = ctl->penalty_burst; /* start with a full bucket */
+ q->token_time = jiffies;
+
+ q->slot = 0;
+ q->double_buffering = false;
+ sfb_zero_all_buckets(q);
+ sfb_init_perturbation(0, q);
+ sfb_init_perturbation(1, q);
+
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+/*
+ * Initialise a new SFB qdisc. The child is first pointed at noop_qdisc
+ * because sfb_change() destroys whatever q->qdisc currently holds.
+ * Returns the result of sfb_change().
+ */
+static int sfb_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ q->qdisc = &noop_qdisc;
+ return sfb_change(sch, opt);
+}
+/*
+ * Dump the current configuration into @skb as a TCA_OPTIONS nest holding
+ * one TCA_SFB_PARMS attribute. Time values are converted back from
+ * jiffies to milliseconds. The backlog statistic is copied from the
+ * child qdisc as a side effect.
+ * Returns the value of nla_nest_end() on success, -EMSGSIZE when the
+ * attribute does not fit.
+ * NOTE(review): the result of nla_nest_start() is used without a NULL
+ * check; confirm the failure path tolerates a NULL nest pointer.
+ */
+static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+ struct tc_sfb_qopt opt = {
+ .rehash_interval = jiffies_to_msecs(q->rehash_interval),
+ .warmup_time = jiffies_to_msecs(q->warmup_time),
+ .limit = q->limit,
+ .max = q->max,
+ .bin_size = q->bin_size,
+ .increment = q->increment,
+ .decrement = q->decrement,
+ .penalty_rate = q->penalty_rate,
+ .penalty_burst = q->penalty_burst,
+ };
+
+ sch->qstats.backlog = q->qdisc->qstats.backlog;
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt);
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+/*
+ * Export the SFB-specific statistics: the six drop/mark counters plus
+ * the maximum qlen, maximum p_mark and average p_mark of the active
+ * bins as computed by sfb_compute_qlen().
+ * Returns the result of gnet_stats_copy_app().
+ */
+static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct tc_sfb_xstats st = {
+ .earlydrop = q->stats.earlydrop,
+ .penaltydrop = q->stats.penaltydrop,
+ .bucketdrop = q->stats.bucketdrop,
+ .queuedrop = q->stats.queuedrop,
+ .childdrop = q->stats.childdrop,
+ .marked = q->stats.marked,
+ };
+
+ st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+/* Class dump is not implemented: always returns -ENOSYS. */
+static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ return -ENOSYS;
+}
+/*
+ * Replace the child qdisc with @new (or with noop_qdisc when @new is
+ * NULL). The previous child is handed back through @old after its queue
+ * length has been passed to qdisc_tree_decrease_qlen() and it has been
+ * reset. The swap runs between sch_tree_lock() and sch_tree_unlock().
+ * @arg is not used. Always returns 0.
+ */
+static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ sch_tree_lock(sch);
+ *old = q->qdisc;
+ q->qdisc = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+ return 0;
+}
+/* Return the child qdisc; @arg is not used. */
+static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ return q->qdisc;
+}
+/* Class lookup: returns 1 regardless of @classid. */
+static unsigned long sfb_get(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+/* Counterpart of sfb_get(); intentionally empty. Also installed as .unbind_tcf in sfb_class_ops. */
+static void sfb_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+/* Changing a class is not implemented: always returns -ENOSYS. */
+static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg)
+{
+ return -ENOSYS;
+}
+/* Deleting a class is not implemented: always returns -ENOSYS. */
+static int sfb_delete(struct Qdisc *sch, unsigned long cl)
+{
+ return -ENOSYS;
+}
+/*
+ * Visit the single class (id 1) on behalf of @walker.
+ * Nothing happens if walker->stop is already set. The callback walker->fn
+ * is invoked only once walker->count has reached walker->skip; a negative
+ * return from it sets walker->stop and returns without counting the class.
+ * Otherwise walker->count is incremented.
+ */
+static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+/*
+ * Return the address of the qdisc's filter chain head. The chain is
+ * only available for @cl == 0; any other value yields NULL.
+ */
+static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+/* Filter bind hook: does nothing and returns 0 for every @classid. */
+static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+/* Class operations of SFB; note that .unbind_tcf reuses the empty sfb_put(). */
+static const struct Qdisc_class_ops sfb_class_ops = {
+ .graft = sfb_graft,
+ .leaf = sfb_leaf,
+ .get = sfb_get,
+ .put = sfb_put,
+ .change = sfb_change_class,
+ .delete = sfb_delete,
+ .walk = sfb_walk,
+ .tcf_chain = sfb_find_tcf,
+ .bind_tcf = sfb_bind,
+ .unbind_tcf = sfb_put,
+ .dump = sfb_dump_class,
+};
+/* Qdisc operations registered under the id "sfb"; no .drop handler is provided. */
+static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
+ .id = "sfb",
+ .priv_size = sizeof(struct sfb_sched_data),
+ .cl_ops = &sfb_class_ops,
+ .enqueue = sfb_enqueue,
+ .dequeue = sfb_dequeue,
+ .peek = sfb_peek,
+ .init = sfb_init,
+ .reset = sfb_reset,
+ .destroy = sfb_destroy,
+ .change = sfb_change,
+ .dump = sfb_dump,
+ .dump_stats = sfb_dump_stats,
+ .owner = THIS_MODULE,
+};
+/* Module entry point: register the "sfb" qdisc and return the registration result. */
+static int __init sfb_module_init(void)
+{
+ return register_qdisc(&sfb_qdisc_ops);
+}
+/* Module exit point: unregister the "sfb" qdisc. */
+static void __exit sfb_module_exit(void)
+{
+ unregister_qdisc(&sfb_qdisc_ops);
+}
+
+module_init(sfb_module_init)
+module_exit(sfb_module_exit)
+
+MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline");
+MODULE_AUTHOR("Juliusz Chroboczek");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 239ec53a634..c2e628dfaac 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -21,6 +21,7 @@
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
@@ -76,7 +77,8 @@
#define SFQ_DEPTH 128 /* max number of packets per flow */
#define SFQ_SLOTS 128 /* max number of flows */
#define SFQ_EMPTY_SLOT 255
-#define SFQ_HASH_DIVISOR 1024
+#define SFQ_DEFAULT_HASH_DIVISOR 1024
+
/* We use 16 bits to store allot, and want to handle packets up to 64K
* Scale allot by 8 (1<<3) so that no overflow occurs.
*/
@@ -92,8 +94,7 @@ typedef unsigned char sfq_index;
* while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1]
* are 'pointers' to dep[] array
*/
-struct sfq_head
-{
+struct sfq_head {
sfq_index next;
sfq_index prev;
};
@@ -108,13 +109,12 @@ struct sfq_slot {
short allot; /* credit for this slot */
};
-struct sfq_sched_data
-{
+struct sfq_sched_data {
/* Parameters */
int perturb_period;
- unsigned quantum; /* Allotment per round: MUST BE >= MTU */
+ unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
int limit;
-
+ unsigned int divisor; /* number of slots in hash table */
/* Variables */
struct tcf_proto *filter_list;
struct timer_list perturb_timer;
@@ -122,7 +122,7 @@ struct sfq_sched_data
sfq_index cur_depth; /* depth of longest slot */
unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
struct sfq_slot *tail; /* current slot in round */
- sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
+ sfq_index *ht; /* Hash table (divisor slots) */
struct sfq_slot slots[SFQ_SLOTS];
struct sfq_head dep[SFQ_DEPTH]; /* Linked list of slots, indexed by depth */
};
@@ -137,12 +137,12 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
return &q->dep[val - SFQ_SLOTS];
}
-static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
+static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
- return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
+ return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1);
}
-static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
+static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
u32 h, h2;
@@ -157,13 +157,13 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
iph = ip_hdr(skb);
h = (__force u32)iph->daddr;
h2 = (__force u32)iph->saddr ^ iph->protocol;
- if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+ if (iph->frag_off & htons(IP_MF | IP_OFFSET))
break;
poff = proto_ports_offset(iph->protocol);
if (poff >= 0 &&
pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
iph = ip_hdr(skb);
- h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff);
+ h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff);
}
break;
}
@@ -181,7 +181,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
if (poff >= 0 &&
pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
iph = ipv6_hdr(skb);
- h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff);
+ h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff);
}
break;
}
@@ -203,7 +203,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
if (TC_H_MAJ(skb->priority) == sch->handle &&
TC_H_MIN(skb->priority) > 0 &&
- TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
+ TC_H_MIN(skb->priority) <= q->divisor)
return TC_H_MIN(skb->priority);
if (!q->filter_list)
@@ -221,7 +221,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
return 0;
}
#endif
- if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
+ if (TC_H_MIN(res.classid) <= q->divisor)
return TC_H_MIN(res.classid);
}
return 0;
@@ -402,10 +402,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
q->tail = slot;
slot->allot = q->scaled_quantum;
}
- if (++sch->q.qlen <= q->limit) {
- qdisc_bstats_update(sch, skb);
+ if (++sch->q.qlen <= q->limit)
return NET_XMIT_SUCCESS;
- }
sfq_drop(sch);
return NET_XMIT_CN;
@@ -445,6 +443,7 @@ next_slot:
}
skb = slot_dequeue_head(slot);
sfq_dec(q, a);
+ qdisc_bstats_update(sch, skb);
sch->q.qlen--;
sch->qstats.backlog -= qdisc_pkt_len(skb);
@@ -492,13 +491,18 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
return -EINVAL;
+ if (ctl->divisor &&
+ (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
+ return -EINVAL;
+
sch_tree_lock(sch);
q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
q->perturb_period = ctl->perturb_period * HZ;
if (ctl->limit)
q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
-
+ if (ctl->divisor)
+ q->divisor = ctl->divisor;
qlen = sch->q.qlen;
while (sch->q.qlen > q->limit)
sfq_drop(sch);
@@ -516,15 +520,13 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
struct sfq_sched_data *q = qdisc_priv(sch);
+ size_t sz;
int i;
q->perturb_timer.function = sfq_perturbation;
q->perturb_timer.data = (unsigned long)sch;
init_timer_deferrable(&q->perturb_timer);
- for (i = 0; i < SFQ_HASH_DIVISOR; i++)
- q->ht[i] = SFQ_EMPTY_SLOT;
-
for (i = 0; i < SFQ_DEPTH; i++) {
q->dep[i].next = i + SFQ_SLOTS;
q->dep[i].prev = i + SFQ_SLOTS;
@@ -533,6 +535,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
q->limit = SFQ_DEPTH - 1;
q->cur_depth = 0;
q->tail = NULL;
+ q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
if (opt == NULL) {
q->quantum = psched_mtu(qdisc_dev(sch));
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
@@ -544,10 +547,23 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
return err;
}
+ sz = sizeof(q->ht[0]) * q->divisor;
+ q->ht = kmalloc(sz, GFP_KERNEL);
+ if (!q->ht && sz > PAGE_SIZE)
+ q->ht = vmalloc(sz);
+ if (!q->ht)
+ return -ENOMEM;
+ for (i = 0; i < q->divisor; i++)
+ q->ht[i] = SFQ_EMPTY_SLOT;
+
for (i = 0; i < SFQ_SLOTS; i++) {
slot_queue_init(&q->slots[i]);
sfq_link(q, i);
}
+ if (q->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
@@ -558,6 +574,10 @@ static void sfq_destroy(struct Qdisc *sch)
tcf_destroy_chain(&q->filter_list);
q->perturb_period = 0;
del_timer_sync(&q->perturb_timer);
+ if (is_vmalloc_addr(q->ht))
+ vfree(q->ht);
+ else
+ kfree(q->ht);
}
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -570,7 +590,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.perturb_period = q->perturb_period / HZ;
opt.limit = q->limit;
- opt.divisor = SFQ_HASH_DIVISOR;
+ opt.divisor = q->divisor;
opt.flows = q->limit;
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
@@ -595,6 +615,8 @@ static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
+ /* we cannot bypass queue discipline anymore */
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
@@ -648,7 +670,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
if (arg->stop)
return;
- for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
+ for (i = 0; i < q->divisor; i++) {
if (q->ht[i] == SFQ_EMPTY_SLOT ||
arg->count < arg->skip) {
arg->count++;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 77565e72181..1dcfb5223a8 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -97,8 +97,7 @@
changed the limit is not effective anymore.
*/
-struct tbf_sched_data
-{
+struct tbf_sched_data {
/* Parameters */
u32 limit; /* Maximal length of backlog: bytes */
u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
@@ -115,10 +114,10 @@ struct tbf_sched_data
struct qdisc_watchdog watchdog; /* Watchdog timer */
};
-#define L2T(q,L) qdisc_l2t((q)->R_tab,L)
-#define L2T_P(q,L) qdisc_l2t((q)->P_tab,L)
+#define L2T(q, L) qdisc_l2t((q)->R_tab, L)
+#define L2T_P(q, L) qdisc_l2t((q)->P_tab, L)
-static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
int ret;
@@ -134,11 +133,10 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
}
sch->q.qlen++;
- qdisc_bstats_update(sch, skb);
return NET_XMIT_SUCCESS;
}
-static unsigned int tbf_drop(struct Qdisc* sch)
+static unsigned int tbf_drop(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
unsigned int len = 0;
@@ -150,7 +148,7 @@ static unsigned int tbf_drop(struct Qdisc* sch)
return len;
}
-static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
+static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
@@ -186,7 +184,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
q->tokens = toks;
q->ptokens = ptoks;
sch->q.qlen--;
- sch->flags &= ~TCQ_F_THROTTLED;
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
return skb;
}
@@ -209,7 +208,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
return NULL;
}
-static void tbf_reset(struct Qdisc* sch)
+static void tbf_reset(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
@@ -227,7 +226,7 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
[TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
};
-static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
+static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
int err;
struct tbf_sched_data *q = qdisc_priv(sch);
@@ -236,7 +235,7 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
struct qdisc_rate_table *rtab = NULL;
struct qdisc_rate_table *ptab = NULL;
struct Qdisc *child = NULL;
- int max_size,n;
+ int max_size, n;
err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
if (err < 0)
@@ -259,15 +258,18 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
}
for (n = 0; n < 256; n++)
- if (rtab->data[n] > qopt->buffer) break;
- max_size = (n << qopt->rate.cell_log)-1;
+ if (rtab->data[n] > qopt->buffer)
+ break;
+ max_size = (n << qopt->rate.cell_log) - 1;
if (ptab) {
int size;
for (n = 0; n < 256; n++)
- if (ptab->data[n] > qopt->mtu) break;
- size = (n << qopt->peakrate.cell_log)-1;
- if (size < max_size) max_size = size;
+ if (ptab->data[n] > qopt->mtu)
+ break;
+ size = (n << qopt->peakrate.cell_log) - 1;
+ if (size < max_size)
+ max_size = size;
}
if (max_size < 0)
goto done;
@@ -310,7 +312,7 @@ done:
return err;
}
-static int tbf_init(struct Qdisc* sch, struct nlattr *opt)
+static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
struct tbf_sched_data *q = qdisc_priv(sch);
@@ -422,8 +424,7 @@ static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
}
}
-static const struct Qdisc_class_ops tbf_class_ops =
-{
+static const struct Qdisc_class_ops tbf_class_ops = {
.graft = tbf_graft,
.leaf = tbf_leaf,
.get = tbf_get,
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 84ce48eadff..45cd30098e3 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -53,8 +53,7 @@
which will not break load balancing, though native slave
traffic will have the highest priority. */
-struct teql_master
-{
+struct teql_master {
struct Qdisc_ops qops;
struct net_device *dev;
struct Qdisc *slaves;
@@ -65,29 +64,27 @@ struct teql_master
unsigned long tx_dropped;
};
-struct teql_sched_data
-{
+struct teql_sched_data {
struct Qdisc *next;
struct teql_master *m;
struct neighbour *ncache;
struct sk_buff_head q;
};
-#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next)
+#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)
-#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT)
+#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)
/* "teql*" qdisc routines */
static int
-teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct teql_sched_data *q = qdisc_priv(sch);
if (q->q.qlen < dev->tx_queue_len) {
__skb_queue_tail(&q->q, skb);
- qdisc_bstats_update(sch, skb);
return NET_XMIT_SUCCESS;
}
@@ -97,7 +94,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
}
static struct sk_buff *
-teql_dequeue(struct Qdisc* sch)
+teql_dequeue(struct Qdisc *sch)
{
struct teql_sched_data *dat = qdisc_priv(sch);
struct netdev_queue *dat_queue;
@@ -111,19 +108,21 @@ teql_dequeue(struct Qdisc* sch)
dat->m->slaves = sch;
netif_wake_queue(m);
}
+ } else {
+ qdisc_bstats_update(sch, skb);
}
sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen;
return skb;
}
static struct sk_buff *
-teql_peek(struct Qdisc* sch)
+teql_peek(struct Qdisc *sch)
{
/* teql is meant to be used as root qdisc */
return NULL;
}
-static __inline__ void
+static inline void
teql_neigh_release(struct neighbour *n)
{
if (n)
@@ -131,7 +130,7 @@ teql_neigh_release(struct neighbour *n)
}
static void
-teql_reset(struct Qdisc* sch)
+teql_reset(struct Qdisc *sch)
{
struct teql_sched_data *dat = qdisc_priv(sch);
@@ -141,13 +140,14 @@ teql_reset(struct Qdisc* sch)
}
static void
-teql_destroy(struct Qdisc* sch)
+teql_destroy(struct Qdisc *sch)
{
struct Qdisc *q, *prev;
struct teql_sched_data *dat = qdisc_priv(sch);
struct teql_master *master = dat->m;
- if ((prev = master->slaves) != NULL) {
+ prev = master->slaves;
+ if (prev) {
do {
q = NEXT_SLAVE(prev);
if (q == sch) {
@@ -179,7 +179,7 @@ teql_destroy(struct Qdisc* sch)
static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
{
struct net_device *dev = qdisc_dev(sch);
- struct teql_master *m = (struct teql_master*)sch->ops;
+ struct teql_master *m = (struct teql_master *)sch->ops;
struct teql_sched_data *q = qdisc_priv(sch);
if (dev->hard_header_len > m->dev->hard_header_len)
@@ -290,7 +290,8 @@ restart:
nores = 0;
busy = 0;
- if ((q = start) == NULL)
+ q = start;
+ if (!q)
goto drop;
do {
@@ -355,10 +356,10 @@ drop:
static int teql_master_open(struct net_device *dev)
{
- struct Qdisc * q;
+ struct Qdisc *q;
struct teql_master *m = netdev_priv(dev);
int mtu = 0xFFFE;
- unsigned flags = IFF_NOARP|IFF_MULTICAST;
+ unsigned int flags = IFF_NOARP | IFF_MULTICAST;
if (m->slaves == NULL)
return -EUNATCH;
@@ -426,7 +427,7 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu)
do {
if (new_mtu > qdisc_dev(q)->mtu)
return -EINVAL;
- } while ((q=NEXT_SLAVE(q)) != m->slaves);
+ } while ((q = NEXT_SLAVE(q)) != m->slaves);
}
dev->mtu = new_mtu;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5f1fb8bd862..6b04287913c 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1089,7 +1089,6 @@ static void sctp_assoc_bh_rcv(struct work_struct *work)
base.inqueue.immediate);
struct sctp_endpoint *ep;
struct sctp_chunk *chunk;
- struct sock *sk;
struct sctp_inq *inqueue;
int state;
sctp_subtype_t subtype;
@@ -1097,7 +1096,6 @@ static void sctp_assoc_bh_rcv(struct work_struct *work)
/* The association should be held so we should be safe. */
ep = asoc->ep;
- sk = asoc->base.sk;
inqueue = &asoc->base.inqueue;
sctp_association_hold(asoc);
diff --git a/net/sctp/input.c b/net/sctp/input.c
index ea2192444ce..826661be73e 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -948,14 +948,11 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct sk_buff *skb,
union sctp_addr addr;
union sctp_addr *paddr = &addr;
struct sctphdr *sh = sctp_hdr(skb);
- sctp_chunkhdr_t *ch;
union sctp_params params;
sctp_init_chunk_t *init;
struct sctp_transport *transport;
struct sctp_af *af;
- ch = (sctp_chunkhdr_t *) skb->data;
-
/*
* This code will NOT touch anything inside the chunk--it is
* strictly READ-ONLY.
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 95e0c8eda1a..865ce7ba4e1 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -201,40 +201,40 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
{
struct sock *sk = skb->sk;
struct ipv6_pinfo *np = inet6_sk(sk);
- struct flowi fl;
+ struct flowi6 fl6;
- memset(&fl, 0, sizeof(fl));
+ memset(&fl6, 0, sizeof(fl6));
- fl.proto = sk->sk_protocol;
+ fl6.flowi6_proto = sk->sk_protocol;
/* Fill in the dest address from the route entry passed with the skb
* and the source address from the transport.
*/
- ipv6_addr_copy(&fl.fl6_dst, &transport->ipaddr.v6.sin6_addr);
- ipv6_addr_copy(&fl.fl6_src, &transport->saddr.v6.sin6_addr);
+ ipv6_addr_copy(&fl6.daddr, &transport->ipaddr.v6.sin6_addr);
+ ipv6_addr_copy(&fl6.saddr, &transport->saddr.v6.sin6_addr);
- fl.fl6_flowlabel = np->flow_label;
- IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
- if (ipv6_addr_type(&fl.fl6_src) & IPV6_ADDR_LINKLOCAL)
- fl.oif = transport->saddr.v6.sin6_scope_id;
+ fl6.flowlabel = np->flow_label;
+ IP6_ECN_flow_xmit(sk, fl6.flowlabel);
+ if (ipv6_addr_type(&fl6.saddr) & IPV6_ADDR_LINKLOCAL)
+ fl6.flowi6_oif = transport->saddr.v6.sin6_scope_id;
else
- fl.oif = sk->sk_bound_dev_if;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
if (np->opt && np->opt->srcrt) {
struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ ipv6_addr_copy(&fl6.daddr, rt0->addr);
}
SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n",
__func__, skb, skb->len,
- &fl.fl6_src, &fl.fl6_dst);
+ &fl6.saddr, &fl6.daddr);
SCTP_INC_STATS(SCTP_MIB_OUTSCTPPACKS);
if (!(transport->param_flags & SPP_PMTUD_ENABLE))
skb->local_df = 1;
- return ip6_xmit(sk, skb, &fl, np->opt);
+ return ip6_xmit(sk, skb, &fl6, np->opt);
}
/* Returns the dst cache entry for the given source and destination ip
@@ -245,22 +245,22 @@ static struct dst_entry *sctp_v6_get_dst(struct sctp_association *asoc,
union sctp_addr *saddr)
{
struct dst_entry *dst;
- struct flowi fl;
+ struct flowi6 fl6;
- memset(&fl, 0, sizeof(fl));
- ipv6_addr_copy(&fl.fl6_dst, &daddr->v6.sin6_addr);
+ memset(&fl6, 0, sizeof(fl6));
+ ipv6_addr_copy(&fl6.daddr, &daddr->v6.sin6_addr);
if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
- fl.oif = daddr->v6.sin6_scope_id;
+ fl6.flowi6_oif = daddr->v6.sin6_scope_id;
- SCTP_DEBUG_PRINTK("%s: DST=%pI6 ", __func__, &fl.fl6_dst);
+ SCTP_DEBUG_PRINTK("%s: DST=%pI6 ", __func__, &fl6.daddr);
if (saddr) {
- ipv6_addr_copy(&fl.fl6_src, &saddr->v6.sin6_addr);
- SCTP_DEBUG_PRINTK("SRC=%pI6 - ", &fl.fl6_src);
+ ipv6_addr_copy(&fl6.saddr, &saddr->v6.sin6_addr);
+ SCTP_DEBUG_PRINTK("SRC=%pI6 - ", &fl6.saddr);
}
- dst = ip6_route_output(&init_net, NULL, &fl);
+ dst = ip6_route_output(&init_net, NULL, &fl6);
if (!dst->error) {
struct rt6_info *rt;
rt = (struct rt6_info *)dst;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 8c6d379b4bb..26dc005113a 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -545,13 +545,11 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
struct sctp_transport *transport = pkt->transport;
sctp_xmit_t status;
struct sctp_chunk *chunk, *chunk1;
- struct sctp_association *asoc;
int fast_rtx;
int error = 0;
int timer = 0;
int done = 0;
- asoc = q->asoc;
lqueue = &q->retransmit;
fast_rtx = q->fast_rtx;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index e58f9476f29..152976ec0b7 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -468,32 +468,32 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
union sctp_addr *saddr)
{
struct rtable *rt;
- struct flowi fl;
+ struct flowi4 fl4;
struct sctp_bind_addr *bp;
struct sctp_sockaddr_entry *laddr;
struct dst_entry *dst = NULL;
union sctp_addr dst_saddr;
- memset(&fl, 0x0, sizeof(struct flowi));
- fl.fl4_dst = daddr->v4.sin_addr.s_addr;
- fl.fl_ip_dport = daddr->v4.sin_port;
- fl.proto = IPPROTO_SCTP;
+ memset(&fl4, 0x0, sizeof(struct flowi4));
+ fl4.daddr = daddr->v4.sin_addr.s_addr;
+ fl4.fl4_dport = daddr->v4.sin_port;
+ fl4.flowi4_proto = IPPROTO_SCTP;
if (asoc) {
- fl.fl4_tos = RT_CONN_FLAGS(asoc->base.sk);
- fl.oif = asoc->base.sk->sk_bound_dev_if;
- fl.fl_ip_sport = htons(asoc->base.bind_addr.port);
+ fl4.flowi4_tos = RT_CONN_FLAGS(asoc->base.sk);
+ fl4.flowi4_oif = asoc->base.sk->sk_bound_dev_if;
+ fl4.fl4_sport = htons(asoc->base.bind_addr.port);
}
if (saddr) {
- fl.fl4_src = saddr->v4.sin_addr.s_addr;
- fl.fl_ip_sport = saddr->v4.sin_port;
+ fl4.saddr = saddr->v4.sin_addr.s_addr;
+ fl4.fl4_sport = saddr->v4.sin_port;
}
SCTP_DEBUG_PRINTK("%s: DST:%pI4, SRC:%pI4 - ",
- __func__, &fl.fl4_dst, &fl.fl4_src);
+ __func__, &fl4.daddr, &fl4.saddr);
- if (!ip_route_output_key(&init_net, &rt, &fl)) {
+ rt = ip_route_output_key(&init_net, &fl4);
+ if (!IS_ERR(rt))
dst = &rt->dst;
- }
/* If there is no association or if a source address is passed, no
* more validation is required.
@@ -533,9 +533,10 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
continue;
if ((laddr->state == SCTP_ADDR_SRC) &&
(AF_INET == laddr->a.sa.sa_family)) {
- fl.fl4_src = laddr->a.v4.sin_addr.s_addr;
- fl.fl_ip_sport = laddr->a.v4.sin_port;
- if (!ip_route_output_key(&init_net, &rt, &fl)) {
+ fl4.saddr = laddr->a.v4.sin_addr.s_addr;
+ fl4.fl4_sport = laddr->a.v4.sin_port;
+ rt = ip_route_output_key(&init_net, &fl4);
+ if (!IS_ERR(rt)) {
dst = &rt->dst;
goto out_unlock;
}
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 2cc46f0962c..de98665db52 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2029,11 +2029,11 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc,
*errp = sctp_make_op_error_fixed(asoc, chunk);
if (*errp) {
- sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
- WORD_ROUND(ntohs(param.p->length)));
- sctp_addto_chunk_fixed(*errp,
- WORD_ROUND(ntohs(param.p->length)),
- param.v);
+ if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
+ WORD_ROUND(ntohs(param.p->length))))
+ sctp_addto_chunk_fixed(*errp,
+ WORD_ROUND(ntohs(param.p->length)),
+ param.v);
} else {
/* If there is no memory for generating the ERROR
* report as specified, an ABORT will be triggered
@@ -3375,7 +3375,6 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
struct sctp_fwdtsn_skip *skiplist)
{
struct sctp_chunk *retval = NULL;
- struct sctp_fwdtsn_chunk *ftsn_chunk;
struct sctp_fwdtsn_hdr ftsn_hdr;
struct sctp_fwdtsn_skip skip;
size_t hint;
@@ -3388,8 +3387,6 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
if (!retval)
return NULL;
- ftsn_chunk = (struct sctp_fwdtsn_chunk *)retval->subh.fwdtsn_hdr;
-
ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn);
retval->subh.fwdtsn_hdr =
sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index a09b0dd25f5..3951a10605b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2928,7 +2928,6 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva
unsigned int optlen)
{
struct sctp_sock *sp;
- struct sctp_endpoint *ep;
struct sctp_association *asoc = NULL;
struct sctp_setpeerprim prim;
struct sctp_chunk *chunk;
@@ -2936,7 +2935,6 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva
int err;
sp = sctp_sk(sk);
- ep = sp->ep;
if (!sctp_addip_enable)
return -EPERM;
@@ -3428,7 +3426,7 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen);
break;
- case SCTP_DELAYED_ACK:
+ case SCTP_DELAYED_SACK:
retval = sctp_setsockopt_delayed_ack(sk, optval, optlen);
break;
case SCTP_PARTIAL_DELIVERY_POINT:
@@ -5333,7 +5331,7 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
retval = sctp_getsockopt_peer_addr_params(sk, len, optval,
optlen);
break;
- case SCTP_DELAYED_ACK:
+ case SCTP_DELAYED_SACK:
retval = sctp_getsockopt_delayed_ack(sk, len, optval,
optlen);
break;
@@ -6102,15 +6100,16 @@ static void __sctp_write_space(struct sctp_association *asoc)
wake_up_interruptible(&asoc->wait);
if (sctp_writeable(sk)) {
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible(sk_sleep(sk));
+ wait_queue_head_t *wq = sk_sleep(sk);
+
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
/* Note that we try to include the Async I/O support
* here by modeling from the current TCP/UDP code.
* We have not tested with it yet.
*/
- if (sock->wq->fasync_list &&
- !(sk->sk_shutdown & SEND_SHUTDOWN))
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN))
sock_wake_async(sock,
SOCK_WAKE_SPACE, POLL_OUT);
}
diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c
index 747d5412c46..f1e40cebc98 100644
--- a/net/sctp/tsnmap.c
+++ b/net/sctp/tsnmap.c
@@ -344,7 +344,7 @@ __u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map,
/* Refresh the gap ack information. */
if (sctp_tsnmap_has_gap(map)) {
- __u16 start, end;
+ __u16 start = 0, end = 0;
sctp_tsnmap_iter_init(map, &iter);
while (sctp_tsnmap_next_gap_ack(map, &iter,
&start,
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index c7f7e49609c..17678189d05 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -105,11 +105,8 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
gfp_t gfp)
{
struct sk_buff_head temp;
- sctp_data_chunk_t *hdr;
struct sctp_ulpevent *event;
- hdr = (sctp_data_chunk_t *) chunk->chunk_hdr;
-
/* Create an event from the incoming chunk. */
event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp);
if (!event)
@@ -743,11 +740,9 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
struct sk_buff *pos, *tmp;
struct sctp_ulpevent *cevent;
struct sctp_stream *in;
- __u16 sid, csid;
- __u16 ssn, cssn;
+ __u16 sid, csid, cssn;
sid = event->stream;
- ssn = event->ssn;
in = &ulpq->asoc->ssnmap->in;
event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
diff --git a/net/socket.c b/net/socket.c
index ac2219f90d5..937d0fcf74b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -240,17 +240,19 @@ static struct kmem_cache *sock_inode_cachep __read_mostly;
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
+ struct socket_wq *wq;
ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
- ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL);
- if (!ei->socket.wq) {
+ wq = kmalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq) {
kmem_cache_free(sock_inode_cachep, ei);
return NULL;
}
- init_waitqueue_head(&ei->socket.wq->wait);
- ei->socket.wq->fasync_list = NULL;
+ init_waitqueue_head(&wq->wait);
+ wq->fasync_list = NULL;
+ RCU_INIT_POINTER(ei->socket.wq, wq);
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
@@ -273,9 +275,11 @@ static void wq_free_rcu(struct rcu_head *head)
static void sock_destroy_inode(struct inode *inode)
{
struct socket_alloc *ei;
+ struct socket_wq *wq;
ei = container_of(inode, struct socket_alloc, vfs_inode);
- call_rcu(&ei->socket.wq->rcu, wq_free_rcu);
+ wq = rcu_dereference_protected(ei->socket.wq, 1);
+ call_rcu(&wq->rcu, wq_free_rcu);
kmem_cache_free(sock_inode_cachep, ei);
}
@@ -524,7 +528,7 @@ void sock_release(struct socket *sock)
module_put(owner);
}
- if (sock->wq->fasync_list)
+ if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
printk(KERN_ERR "sock_release: fasync list not empty!\n");
percpu_sub(sockets_in_use, 1);
@@ -1108,15 +1112,16 @@ static int sock_fasync(int fd, struct file *filp, int on)
{
struct socket *sock = filp->private_data;
struct sock *sk = sock->sk;
+ struct socket_wq *wq;
if (sk == NULL)
return -EINVAL;
lock_sock(sk);
+ wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk));
+ fasync_helper(fd, filp, on, &wq->fasync_list);
- fasync_helper(fd, filp, on, &sock->wq->fasync_list);
-
- if (!sock->wq->fasync_list)
+ if (!wq->fasync_list)
sock_reset_flag(sk, SOCK_FASYNC);
else
sock_set_flag(sk, SOCK_FASYNC);
@@ -2643,7 +2648,8 @@ static int bond_ioctl(struct net *net, unsigned int cmd,
old_fs = get_fs();
set_fs(KERNEL_DS);
- err = dev_ioctl(net, cmd, &kifr);
+ err = dev_ioctl(net, cmd,
+ (struct ifreq __user __force *) &kifr);
set_fs(old_fs);
return err;
@@ -2752,7 +2758,7 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
old_fs = get_fs();
set_fs(KERNEL_DS);
- err = dev_ioctl(net, cmd, (void __user *)&ifr);
+ err = dev_ioctl(net, cmd, (void __user __force *)&ifr);
set_fs(old_fs);
if (cmd == SIOCGIFMAP && !err) {
@@ -2857,7 +2863,8 @@ static int routing_ioctl(struct net *net, struct socket *sock,
ret |= __get_user(rtdev, &(ur4->rt_dev));
if (rtdev) {
ret |= copy_from_user(devname, compat_ptr(rtdev), 15);
- r4.rt_dev = devname; devname[15] = 0;
+ r4.rt_dev = (char __user __force *)devname;
+ devname[15] = 0;
} else
r4.rt_dev = NULL;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 7bd3bbba471..b7d435c3f19 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -420,6 +420,7 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
static void svc_udp_data_ready(struct sock *sk, int count)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
@@ -428,8 +429,8 @@ static void svc_udp_data_ready(struct sock *sk, int count)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible(sk_sleep(sk));
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
}
/*
@@ -438,6 +439,7 @@ static void svc_udp_data_ready(struct sock *sk, int count)
static void svc_write_space(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
+ wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
@@ -445,10 +447,10 @@ static void svc_write_space(struct sock *sk)
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) {
+ if (wq && waitqueue_active(wq)) {
dprintk("RPC svc_write_space: someone sleeping on %p\n",
svsk);
- wake_up_interruptible(sk_sleep(sk));
+ wake_up_interruptible(wq);
}
}
@@ -739,6 +741,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq;
dprintk("svc: socket %p TCP (listen) state change %d\n",
sk, sk->sk_state);
@@ -761,8 +764,9 @@ static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
printk("svc: socket %p: no user data\n", sk);
}
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible_all(sk_sleep(sk));
+ wq = sk_sleep(sk);
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible_all(wq);
}
/*
@@ -771,6 +775,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
static void svc_tcp_state_change(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq = sk_sleep(sk);
dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
sk, sk->sk_state, sk->sk_user_data);
@@ -781,13 +786,14 @@ static void svc_tcp_state_change(struct sock *sk)
set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible_all(sk_sleep(sk));
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible_all(wq);
}
static void svc_tcp_data_ready(struct sock *sk, int count)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq = sk_sleep(sk);
dprintk("svc: socket %p TCP data ready (svsk %p)\n",
sk, sk->sk_user_data);
@@ -795,8 +801,8 @@ static void svc_tcp_data_ready(struct sock *sk, int count)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible(sk_sleep(sk));
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
}
/*
@@ -1531,6 +1537,7 @@ static void svc_sock_detach(struct svc_xprt *xprt)
{
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
struct sock *sk = svsk->sk_sk;
+ wait_queue_head_t *wq;
dprintk("svc: svc_sock_detach(%p)\n", svsk);
@@ -1539,8 +1546,9 @@ static void svc_sock_detach(struct svc_xprt *xprt)
sk->sk_data_ready = svsk->sk_odata;
sk->sk_write_space = svsk->sk_owspace;
- if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
- wake_up_interruptible(sk_sleep(sk));
+ wq = sk_sleep(sk);
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
}
/*
@@ -1609,9 +1617,7 @@ static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv,
*/
static void svc_bc_sock_free(struct svc_xprt *xprt)
{
- if (xprt) {
- kfree(xprt->xpt_bc_sid);
+ if (xprt)
kfree(container_of(xprt, struct svc_sock, sk_xprt));
- }
}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig
index 0436927369f..2c5954b8593 100644
--- a/net/tipc/Kconfig
+++ b/net/tipc/Kconfig
@@ -29,18 +29,6 @@ config TIPC_ADVANCED
Saying Y here will open some advanced configuration for TIPC.
Most users do not need to bother; if unsure, just say N.
-config TIPC_NODES
- int "Maximum number of nodes in a cluster"
- depends on TIPC_ADVANCED
- range 8 2047
- default "255"
- help
- Specifies how many nodes can be supported in a TIPC cluster.
- Can range from 8 to 2047 nodes; default is 255.
-
- Setting this to a smaller value saves some memory;
- setting it to higher allows for more nodes.
-
config TIPC_PORTS
int "Maximum number of ports in a node"
depends on TIPC_ADVANCED
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index 88463d9a6f1..a6fdab33877 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -2,7 +2,7 @@
* net/tipc/addr.c: TIPC address utility routines
*
* Copyright (c) 2000-2006, Ericsson AB
- * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2004-2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -41,7 +41,7 @@
* tipc_addr_domain_valid - validates a network domain address
*
* Accepts <Z.C.N>, <Z.C.0>, <Z.0.0>, and <0.0.0>,
- * where Z, C, and N are non-zero and do not exceed the configured limits.
+ * where Z, C, and N are non-zero.
*
* Returns 1 if domain address is valid, otherwise 0
*/
@@ -51,10 +51,6 @@ int tipc_addr_domain_valid(u32 addr)
u32 n = tipc_node(addr);
u32 c = tipc_cluster(addr);
u32 z = tipc_zone(addr);
- u32 max_nodes = tipc_max_nodes;
-
- if (n > max_nodes)
- return 0;
if (n && (!z || !c))
return 0;
@@ -66,8 +62,7 @@ int tipc_addr_domain_valid(u32 addr)
/**
* tipc_addr_node_valid - validates a proposed network address for this node
*
- * Accepts <Z.C.N>, where Z, C, and N are non-zero and do not exceed
- * the configured limits.
+ * Accepts <Z.C.N>, where Z, C, and N are non-zero.
*
* Returns 1 if address can be used, otherwise 0
*/
@@ -81,9 +76,9 @@ int tipc_in_scope(u32 domain, u32 addr)
{
if (!domain || (domain == addr))
return 1;
- if (domain == (addr & 0xfffff000u)) /* domain <Z.C.0> */
+ if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */
return 1;
- if (domain == (addr & 0xff000000u)) /* domain <Z.0.0> */
+ if (domain == tipc_zone_mask(addr)) /* domain <Z.0.0> */
return 1;
return 0;
}
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index 2490fadd0ca..8971aba99ae 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -37,6 +37,16 @@
#ifndef _TIPC_ADDR_H
#define _TIPC_ADDR_H
+static inline u32 tipc_zone_mask(u32 addr)
+{
+ return addr & 0xff000000u;
+}
+
+static inline u32 tipc_cluster_mask(u32 addr)
+{
+ return addr & 0xfffff000u;
+}
+
static inline int in_own_cluster(u32 addr)
{
return !((addr ^ tipc_own_addr) >> 12);
@@ -49,14 +59,13 @@ static inline int in_own_cluster(u32 addr)
* after a network hop.
*/
-static inline int addr_domain(int sc)
+static inline u32 addr_domain(u32 sc)
{
if (likely(sc == TIPC_NODE_SCOPE))
return tipc_own_addr;
if (sc == TIPC_CLUSTER_SCOPE)
- return tipc_addr(tipc_zone(tipc_own_addr),
- tipc_cluster(tipc_own_addr), 0);
- return tipc_addr(tipc_zone(tipc_own_addr), 0, 0);
+ return tipc_cluster_mask(tipc_own_addr);
+ return tipc_zone_mask(tipc_own_addr);
}
int tipc_addr_domain_valid(u32);
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 70ab5ef4876..7dc1dc7151e 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -3,7 +3,7 @@
*
* Copyright (c) 2004-2006, Ericsson AB
* Copyright (c) 2004, Intel Corporation.
- * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -61,8 +61,8 @@
*/
struct bcbearer_pair {
- struct bearer *primary;
- struct bearer *secondary;
+ struct tipc_bearer *primary;
+ struct tipc_bearer *secondary;
};
/**
@@ -81,7 +81,7 @@ struct bcbearer_pair {
*/
struct bcbearer {
- struct bearer bearer;
+ struct tipc_bearer bearer;
struct media media;
struct bcbearer_pair bpairs[MAX_BEARERS];
struct bcbearer_pair bpairs_temp[TIPC_MAX_LINK_PRI + 1];
@@ -93,6 +93,7 @@ struct bcbearer {
* struct bclink - link used for broadcast messages
* @link: (non-standard) broadcast link structure
* @node: (non-standard) node structure representing b'cast link's peer node
+ * @retransmit_to: node that most recently requested a retransmit
*
* Handles sequence numbering, fragmentation, bundling, etc.
*/
@@ -100,6 +101,7 @@ struct bcbearer {
struct bclink {
struct link link;
struct tipc_node node;
+ struct tipc_node *retransmit_to;
};
@@ -184,6 +186,17 @@ static int bclink_ack_allowed(u32 n)
/**
+ * tipc_bclink_retransmit_to - get most recent node to request retransmission
+ *
+ * Called with bc_lock locked
+ */
+
+struct tipc_node *tipc_bclink_retransmit_to(void)
+{
+ return bclink->retransmit_to;
+}
+
+/**
* bclink_retransmit_pkt - retransmit broadcast packets
* @after: sequence number of last packet to *not* retransmit
* @to: sequence number of last packet to retransmit
@@ -285,6 +298,7 @@ static void bclink_send_nack(struct tipc_node *n_ptr)
msg = buf_msg(buf);
tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG,
INT_H_SIZE, n_ptr->addr);
+ msg_set_non_seq(msg, 1);
msg_set_mc_netid(msg, tipc_net_id);
msg_set_bcast_ack(msg, mod(n_ptr->bclink.last_in));
msg_set_bcgap_after(msg, n_ptr->bclink.gap_after);
@@ -405,8 +419,6 @@ int tipc_bclink_send_msg(struct sk_buff *buf)
else
bclink_set_last_sent();
- if (bcl->out_queue_size > bcl->stats.max_queue_sz)
- bcl->stats.max_queue_sz = bcl->out_queue_size;
bcl->stats.queue_sz_counts++;
bcl->stats.accu_queue_sz += bcl->out_queue_size;
@@ -444,10 +456,9 @@ void tipc_bclink_recv_pkt(struct sk_buff *buf)
tipc_node_unlock(node);
spin_lock_bh(&bc_lock);
bcl->stats.recv_nacks++;
- bcl->owner->next = node; /* remember requestor */
+ bclink->retransmit_to = node;
bclink_retransmit_pkt(msg_bcgap_after(msg),
msg_bcgap_to(msg));
- bcl->owner->next = NULL;
spin_unlock_bh(&bc_lock);
} else {
tipc_bclink_peek_nack(msg_destnode(msg),
@@ -574,8 +585,8 @@ static int tipc_bcbearer_send(struct sk_buff *buf,
bcbearer->remains = tipc_bcast_nmap;
for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) {
- struct bearer *p = bcbearer->bpairs[bp_index].primary;
- struct bearer *s = bcbearer->bpairs[bp_index].secondary;
+ struct tipc_bearer *p = bcbearer->bpairs[bp_index].primary;
+ struct tipc_bearer *s = bcbearer->bpairs[bp_index].secondary;
if (!p)
break; /* no more bearers to try */
@@ -584,11 +595,11 @@ static int tipc_bcbearer_send(struct sk_buff *buf,
if (bcbearer->remains_new.count == bcbearer->remains.count)
continue; /* bearer pair doesn't add anything */
- if (p->publ.blocked ||
- p->media->send_msg(buf, &p->publ, &p->media->bcast_addr)) {
+ if (p->blocked ||
+ p->media->send_msg(buf, p, &p->media->bcast_addr)) {
/* unable to send on primary bearer */
- if (!s || s->publ.blocked ||
- s->media->send_msg(buf, &s->publ,
+ if (!s || s->blocked ||
+ s->media->send_msg(buf, s,
&s->media->bcast_addr)) {
/* unable to send on either bearer */
continue;
@@ -633,7 +644,7 @@ void tipc_bcbearer_sort(void)
memset(bp_temp, 0, sizeof(bcbearer->bpairs_temp));
for (b_index = 0; b_index < MAX_BEARERS; b_index++) {
- struct bearer *b = &tipc_bearers[b_index];
+ struct tipc_bearer *b = &tipc_bearers[b_index];
if (!b->active || !b->nodes.count)
continue;
@@ -682,12 +693,12 @@ void tipc_bcbearer_sort(void)
void tipc_bcbearer_push(void)
{
- struct bearer *b_ptr;
+ struct tipc_bearer *b_ptr;
spin_lock_bh(&bc_lock);
b_ptr = &bcbearer->bearer;
- if (b_ptr->publ.blocked) {
- b_ptr->publ.blocked = 0;
+ if (b_ptr->blocked) {
+ b_ptr->blocked = 0;
tipc_bearer_lock_push(b_ptr);
}
spin_unlock_bh(&bc_lock);
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 51f8c5326ce..500c97f1c85 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -2,7 +2,7 @@
* net/tipc/bcast.h: Include file for TIPC broadcast code
*
* Copyright (c) 2003-2006, Ericsson AB
- * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -90,6 +90,7 @@ void tipc_port_list_free(struct port_list *pl_ptr);
int tipc_bclink_init(void);
void tipc_bclink_stop(void);
+struct tipc_node *tipc_bclink_retransmit_to(void);
void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked);
int tipc_bclink_send_msg(struct sk_buff *buf);
void tipc_bclink_recv_pkt(struct sk_buff *buf);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 837b7a46788..411719feb80 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -2,7 +2,7 @@
* net/tipc/bearer.c: TIPC bearer code
*
* Copyright (c) 1996-2006, Ericsson AB
- * Copyright (c) 2004-2006, Wind River Systems
+ * Copyright (c) 2004-2006, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -44,7 +44,7 @@
static struct media media_list[MAX_MEDIA];
static u32 media_count;
-struct bearer tipc_bearers[MAX_BEARERS];
+struct tipc_bearer tipc_bearers[MAX_BEARERS];
/**
* media_name_valid - validate media name
@@ -158,7 +158,6 @@ int tipc_register_media(u32 media_type,
m_ptr->disable_bearer = disable;
m_ptr->addr2str = addr2str;
memcpy(&m_ptr->bcast_addr, bcast_addr, sizeof(*bcast_addr));
- m_ptr->bcast = 1;
strcpy(m_ptr->name, name);
m_ptr->priority = bearer_priority;
m_ptr->tolerance = link_tolerance;
@@ -278,13 +277,13 @@ static int bearer_name_validate(const char *name,
* bearer_find - locates bearer object with matching bearer name
*/
-static struct bearer *bearer_find(const char *name)
+static struct tipc_bearer *bearer_find(const char *name)
{
- struct bearer *b_ptr;
+ struct tipc_bearer *b_ptr;
u32 i;
for (i = 0, b_ptr = tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) {
- if (b_ptr->active && (!strcmp(b_ptr->publ.name, name)))
+ if (b_ptr->active && (!strcmp(b_ptr->name, name)))
return b_ptr;
}
return NULL;
@@ -294,16 +293,16 @@ static struct bearer *bearer_find(const char *name)
* tipc_bearer_find_interface - locates bearer object with matching interface name
*/
-struct bearer *tipc_bearer_find_interface(const char *if_name)
+struct tipc_bearer *tipc_bearer_find_interface(const char *if_name)
{
- struct bearer *b_ptr;
+ struct tipc_bearer *b_ptr;
char *b_if_name;
u32 i;
for (i = 0, b_ptr = tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) {
if (!b_ptr->active)
continue;
- b_if_name = strchr(b_ptr->publ.name, ':') + 1;
+ b_if_name = strchr(b_ptr->name, ':') + 1;
if (!strcmp(b_if_name, if_name))
return b_ptr;
}
@@ -318,7 +317,7 @@ struct sk_buff *tipc_bearer_get_names(void)
{
struct sk_buff *buf;
struct media *m_ptr;
- struct bearer *b_ptr;
+ struct tipc_bearer *b_ptr;
int i, j;
buf = tipc_cfg_reply_alloc(MAX_BEARERS * TLV_SPACE(TIPC_MAX_BEARER_NAME));
@@ -331,8 +330,8 @@ struct sk_buff *tipc_bearer_get_names(void)
b_ptr = &tipc_bearers[j];
if (b_ptr->active && (b_ptr->media == m_ptr)) {
tipc_cfg_append_tlv(buf, TIPC_TLV_BEARER_NAME,
- b_ptr->publ.name,
- strlen(b_ptr->publ.name) + 1);
+ b_ptr->name,
+ strlen(b_ptr->name) + 1);
}
}
}
@@ -340,14 +339,14 @@ struct sk_buff *tipc_bearer_get_names(void)
return buf;
}
-void tipc_bearer_add_dest(struct bearer *b_ptr, u32 dest)
+void tipc_bearer_add_dest(struct tipc_bearer *b_ptr, u32 dest)
{
tipc_nmap_add(&b_ptr->nodes, dest);
tipc_disc_update_link_req(b_ptr->link_req);
tipc_bcbearer_sort();
}
-void tipc_bearer_remove_dest(struct bearer *b_ptr, u32 dest)
+void tipc_bearer_remove_dest(struct tipc_bearer *b_ptr, u32 dest)
{
tipc_nmap_remove(&b_ptr->nodes, dest);
tipc_disc_update_link_req(b_ptr->link_req);
@@ -362,12 +361,12 @@ void tipc_bearer_remove_dest(struct bearer *b_ptr, u32 dest)
* bearer.lock must be taken before calling
* Returns binary true(1) ore false(0)
*/
-static int bearer_push(struct bearer *b_ptr)
+static int bearer_push(struct tipc_bearer *b_ptr)
{
u32 res = 0;
struct link *ln, *tln;
- if (b_ptr->publ.blocked)
+ if (b_ptr->blocked)
return 0;
while (!list_empty(&b_ptr->cong_links) && (res != PUSH_FAILED)) {
@@ -382,13 +381,13 @@ static int bearer_push(struct bearer *b_ptr)
return list_empty(&b_ptr->cong_links);
}
-void tipc_bearer_lock_push(struct bearer *b_ptr)
+void tipc_bearer_lock_push(struct tipc_bearer *b_ptr)
{
int res;
- spin_lock_bh(&b_ptr->publ.lock);
+ spin_lock_bh(&b_ptr->lock);
res = bearer_push(b_ptr);
- spin_unlock_bh(&b_ptr->publ.lock);
+ spin_unlock_bh(&b_ptr->lock);
if (res)
tipc_bcbearer_push();
}
@@ -398,16 +397,14 @@ void tipc_bearer_lock_push(struct bearer *b_ptr)
* Interrupt enabling new requests after bearer congestion or blocking:
* See bearer_send().
*/
-void tipc_continue(struct tipc_bearer *tb_ptr)
+void tipc_continue(struct tipc_bearer *b_ptr)
{
- struct bearer *b_ptr = (struct bearer *)tb_ptr;
-
- spin_lock_bh(&b_ptr->publ.lock);
+ spin_lock_bh(&b_ptr->lock);
b_ptr->continue_count++;
if (!list_empty(&b_ptr->cong_links))
tipc_k_signal((Handler)tipc_bearer_lock_push, (unsigned long)b_ptr);
- b_ptr->publ.blocked = 0;
- spin_unlock_bh(&b_ptr->publ.lock);
+ b_ptr->blocked = 0;
+ spin_unlock_bh(&b_ptr->lock);
}
/*
@@ -418,7 +415,7 @@ void tipc_continue(struct tipc_bearer *tb_ptr)
* bearer.lock is busy
*/
-static void tipc_bearer_schedule_unlocked(struct bearer *b_ptr, struct link *l_ptr)
+static void tipc_bearer_schedule_unlocked(struct tipc_bearer *b_ptr, struct link *l_ptr)
{
list_move_tail(&l_ptr->link_list, &b_ptr->cong_links);
}
@@ -431,11 +428,11 @@ static void tipc_bearer_schedule_unlocked(struct bearer *b_ptr, struct link *l_p
* bearer.lock is free
*/
-void tipc_bearer_schedule(struct bearer *b_ptr, struct link *l_ptr)
+void tipc_bearer_schedule(struct tipc_bearer *b_ptr, struct link *l_ptr)
{
- spin_lock_bh(&b_ptr->publ.lock);
+ spin_lock_bh(&b_ptr->lock);
tipc_bearer_schedule_unlocked(b_ptr, l_ptr);
- spin_unlock_bh(&b_ptr->publ.lock);
+ spin_unlock_bh(&b_ptr->lock);
}
@@ -444,18 +441,18 @@ void tipc_bearer_schedule(struct bearer *b_ptr, struct link *l_ptr)
* and if there is, try to resolve it before returning.
* 'tipc_net_lock' is read_locked when this function is called
*/
-int tipc_bearer_resolve_congestion(struct bearer *b_ptr, struct link *l_ptr)
+int tipc_bearer_resolve_congestion(struct tipc_bearer *b_ptr, struct link *l_ptr)
{
int res = 1;
if (list_empty(&b_ptr->cong_links))
return 1;
- spin_lock_bh(&b_ptr->publ.lock);
+ spin_lock_bh(&b_ptr->lock);
if (!bearer_push(b_ptr)) {
tipc_bearer_schedule_unlocked(b_ptr, l_ptr);
res = 0;
}
- spin_unlock_bh(&b_ptr->publ.lock);
+ spin_unlock_bh(&b_ptr->lock);
return res;
}
@@ -463,9 +460,9 @@ int tipc_bearer_resolve_congestion(struct bearer *b_ptr, struct link *l_ptr)
* tipc_bearer_congested - determines if bearer is currently congested
*/
-int tipc_bearer_congested(struct bearer *b_ptr, struct link *l_ptr)
+int tipc_bearer_congested(struct tipc_bearer *b_ptr, struct link *l_ptr)
{
- if (unlikely(b_ptr->publ.blocked))
+ if (unlikely(b_ptr->blocked))
return 1;
if (likely(list_empty(&b_ptr->cong_links)))
return 0;
@@ -476,9 +473,9 @@ int tipc_bearer_congested(struct bearer *b_ptr, struct link *l_ptr)
* tipc_enable_bearer - enable bearer with the given name
*/
-int tipc_enable_bearer(const char *name, u32 bcast_scope, u32 priority)
+int tipc_enable_bearer(const char *name, u32 disc_domain, u32 priority)
{
- struct bearer *b_ptr;
+ struct tipc_bearer *b_ptr;
struct media *m_ptr;
struct bearer_name b_name;
char addr_string[16];
@@ -496,9 +493,9 @@ int tipc_enable_bearer(const char *name, u32 bcast_scope, u32 priority)
warn("Bearer <%s> rejected, illegal name\n", name);
return -EINVAL;
}
- if (!tipc_addr_domain_valid(bcast_scope) ||
- !tipc_in_scope(bcast_scope, tipc_own_addr)) {
- warn("Bearer <%s> rejected, illegal broadcast scope\n", name);
+ if (!tipc_addr_domain_valid(disc_domain) ||
+ !tipc_in_scope(disc_domain, tipc_own_addr)) {
+ warn("Bearer <%s> rejected, illegal discovery domain\n", name);
return -EINVAL;
}
if ((priority < TIPC_MIN_LINK_PRI ||
@@ -528,7 +525,7 @@ restart:
bearer_id = i;
continue;
}
- if (!strcmp(name, tipc_bearers[i].publ.name)) {
+ if (!strcmp(name, tipc_bearers[i].name)) {
warn("Bearer <%s> rejected, already enabled\n", name);
goto failed;
}
@@ -551,8 +548,8 @@ restart:
}
b_ptr = &tipc_bearers[bearer_id];
- strcpy(b_ptr->publ.name, name);
- res = m_ptr->enable_bearer(&b_ptr->publ);
+ strcpy(b_ptr->name, name);
+ res = m_ptr->enable_bearer(b_ptr);
if (res) {
warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res);
goto failed;
@@ -562,18 +559,15 @@ restart:
b_ptr->media = m_ptr;
b_ptr->net_plane = bearer_id + 'A';
b_ptr->active = 1;
- b_ptr->detect_scope = bcast_scope;
b_ptr->priority = priority;
INIT_LIST_HEAD(&b_ptr->cong_links);
INIT_LIST_HEAD(&b_ptr->links);
- if (m_ptr->bcast) {
- b_ptr->link_req = tipc_disc_init_link_req(b_ptr, &m_ptr->bcast_addr,
- bcast_scope, 2);
- }
- spin_lock_init(&b_ptr->publ.lock);
+ b_ptr->link_req = tipc_disc_init_link_req(b_ptr, &m_ptr->bcast_addr,
+ disc_domain);
+ spin_lock_init(&b_ptr->lock);
write_unlock_bh(&tipc_net_lock);
info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
- name, tipc_addr_string_fill(addr_string, bcast_scope), priority);
+ name, tipc_addr_string_fill(addr_string, disc_domain), priority);
return 0;
failed:
write_unlock_bh(&tipc_net_lock);
@@ -587,7 +581,7 @@ failed:
int tipc_block_bearer(const char *name)
{
- struct bearer *b_ptr = NULL;
+ struct tipc_bearer *b_ptr = NULL;
struct link *l_ptr;
struct link *temp_l_ptr;
@@ -600,8 +594,8 @@ int tipc_block_bearer(const char *name)
}
info("Blocking bearer <%s>\n", name);
- spin_lock_bh(&b_ptr->publ.lock);
- b_ptr->publ.blocked = 1;
+ spin_lock_bh(&b_ptr->lock);
+ b_ptr->blocked = 1;
list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) {
struct tipc_node *n_ptr = l_ptr->owner;
@@ -609,7 +603,7 @@ int tipc_block_bearer(const char *name)
tipc_link_reset(l_ptr);
spin_unlock_bh(&n_ptr->lock);
}
- spin_unlock_bh(&b_ptr->publ.lock);
+ spin_unlock_bh(&b_ptr->lock);
read_unlock_bh(&tipc_net_lock);
return 0;
}
@@ -620,27 +614,27 @@ int tipc_block_bearer(const char *name)
* Note: This routine assumes caller holds tipc_net_lock.
*/
-static void bearer_disable(struct bearer *b_ptr)
+static void bearer_disable(struct tipc_bearer *b_ptr)
{
struct link *l_ptr;
struct link *temp_l_ptr;
- info("Disabling bearer <%s>\n", b_ptr->publ.name);
+ info("Disabling bearer <%s>\n", b_ptr->name);
tipc_disc_stop_link_req(b_ptr->link_req);
- spin_lock_bh(&b_ptr->publ.lock);
+ spin_lock_bh(&b_ptr->lock);
b_ptr->link_req = NULL;
- b_ptr->publ.blocked = 1;
- b_ptr->media->disable_bearer(&b_ptr->publ);
+ b_ptr->blocked = 1;
+ b_ptr->media->disable_bearer(b_ptr);
list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) {
tipc_link_delete(l_ptr);
}
- spin_unlock_bh(&b_ptr->publ.lock);
- memset(b_ptr, 0, sizeof(struct bearer));
+ spin_unlock_bh(&b_ptr->lock);
+ memset(b_ptr, 0, sizeof(struct tipc_bearer));
}
int tipc_disable_bearer(const char *name)
{
- struct bearer *b_ptr;
+ struct tipc_bearer *b_ptr;
int res;
write_lock_bh(&tipc_net_lock);
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 85f451d5aac..31d6172b20f 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -2,7 +2,7 @@
* net/tipc/bearer.h: Include file for TIPC bearer code
*
* Copyright (c) 1996-2006, Ericsson AB
- * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -61,26 +61,7 @@ struct tipc_media_addr {
} dev_addr;
};
-/**
- * struct tipc_bearer - TIPC bearer info available to media code
- * @usr_handle: pointer to additional media-specific information about bearer
- * @mtu: max packet size bearer can support
- * @blocked: non-zero if bearer is blocked
- * @lock: spinlock for controlling access to bearer
- * @addr: media-specific address associated with bearer
- * @name: bearer name (format = media:interface)
- *
- * Note: TIPC initializes "name" and "lock" fields; media code is responsible
- * for initialization all other fields when a bearer is enabled.
- */
-struct tipc_bearer {
- void *usr_handle;
- u32 mtu;
- int blocked;
- spinlock_t lock;
- struct tipc_media_addr addr;
- char name[TIPC_MAX_BEARER_NAME];
-};
+struct tipc_bearer;
/**
* struct media - TIPC media information available to internal users
@@ -89,7 +70,6 @@ struct tipc_bearer {
* @disable_bearer: routine which disables a bearer
* @addr2str: routine which converts bearer's address to string form
* @bcast_addr: media address used in broadcasting
- * @bcast: non-zero if media supports broadcasting [currently mandatory]
* @priority: default link (and bearer) priority
* @tolerance: default time (in ms) before declaring link failure
* @window: default window (in packets) before declaring link congestion
@@ -106,7 +86,6 @@ struct media {
char *(*addr2str)(struct tipc_media_addr *a,
char *str_buf, int str_size);
struct tipc_media_addr bcast_addr;
- int bcast;
u32 priority;
u32 tolerance;
u32 window;
@@ -115,11 +94,15 @@ struct media {
};
/**
- * struct bearer - TIPC bearer information available to internal users
- * @publ: bearer information available to privileged users
+ * struct tipc_bearer - TIPC bearer structure
+ * @usr_handle: pointer to additional media-specific information about bearer
+ * @mtu: max packet size bearer can support
+ * @blocked: non-zero if bearer is blocked
+ * @lock: spinlock for controlling access to bearer
+ * @addr: media-specific address associated with bearer
+ * @name: bearer name (format = media:interface)
* @media: ptr to media structure associated with bearer
* @priority: default link priority for bearer
- * @detect_scope: network address mask used during automatic link creation
* @identity: array index of this bearer within TIPC bearer array
* @link_req: ptr to (optional) structure making periodic link setup requests
* @links: list of non-congested links associated with bearer
@@ -128,13 +111,20 @@ struct media {
* @active: non-zero if bearer structure is represents a bearer
* @net_plane: network plane ('A' through 'H') currently associated with bearer
* @nodes: indicates which nodes in cluster can be reached through bearer
+ *
+ * Note: media-specific code is responsible for initialization of the fields
+ * indicated below when a bearer is enabled; TIPC's generic bearer code takes
+ * care of initializing all other fields.
*/
-
-struct bearer {
- struct tipc_bearer publ;
+struct tipc_bearer {
+ void *usr_handle; /* initialized by media */
+ u32 mtu; /* initialized by media */
+ int blocked; /* initialized by media */
+ struct tipc_media_addr addr; /* initialized by media */
+ char name[TIPC_MAX_BEARER_NAME];
+ spinlock_t lock;
struct media *media;
u32 priority;
- u32 detect_scope;
u32 identity;
struct link_req *link_req;
struct list_head links;
@@ -152,7 +142,7 @@ struct bearer_name {
struct link;
-extern struct bearer tipc_bearers[];
+extern struct tipc_bearer tipc_bearers[];
/*
* TIPC routines available to supported media types
@@ -173,7 +163,7 @@ void tipc_recv_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr);
int tipc_block_bearer(const char *name);
void tipc_continue(struct tipc_bearer *tb_ptr);
-int tipc_enable_bearer(const char *bearer_name, u32 bcast_scope, u32 priority);
+int tipc_enable_bearer(const char *bearer_name, u32 disc_domain, u32 priority);
int tipc_disable_bearer(const char *name);
/*
@@ -186,14 +176,14 @@ void tipc_media_addr_printf(struct print_buf *pb, struct tipc_media_addr *a);
struct sk_buff *tipc_media_get_names(void);
struct sk_buff *tipc_bearer_get_names(void);
-void tipc_bearer_add_dest(struct bearer *b_ptr, u32 dest);
-void tipc_bearer_remove_dest(struct bearer *b_ptr, u32 dest);
-void tipc_bearer_schedule(struct bearer *b_ptr, struct link *l_ptr);
-struct bearer *tipc_bearer_find_interface(const char *if_name);
-int tipc_bearer_resolve_congestion(struct bearer *b_ptr, struct link *l_ptr);
-int tipc_bearer_congested(struct bearer *b_ptr, struct link *l_ptr);
+void tipc_bearer_add_dest(struct tipc_bearer *b_ptr, u32 dest);
+void tipc_bearer_remove_dest(struct tipc_bearer *b_ptr, u32 dest);
+void tipc_bearer_schedule(struct tipc_bearer *b_ptr, struct link *l_ptr);
+struct tipc_bearer *tipc_bearer_find_interface(const char *if_name);
+int tipc_bearer_resolve_congestion(struct tipc_bearer *b_ptr, struct link *l_ptr);
+int tipc_bearer_congested(struct tipc_bearer *b_ptr, struct link *l_ptr);
void tipc_bearer_stop(void);
-void tipc_bearer_lock_push(struct bearer *b_ptr);
+void tipc_bearer_lock_push(struct tipc_bearer *b_ptr);
/**
@@ -214,10 +204,11 @@ void tipc_bearer_lock_push(struct bearer *b_ptr);
* and let TIPC's link code deal with the undelivered message.
*/
-static inline int tipc_bearer_send(struct bearer *b_ptr, struct sk_buff *buf,
+static inline int tipc_bearer_send(struct tipc_bearer *b_ptr,
+ struct sk_buff *buf,
struct tipc_media_addr *dest)
{
- return !b_ptr->media->send_msg(buf, &b_ptr->publ, dest);
+ return !b_ptr->media->send_msg(buf, b_ptr, dest);
}
#endif /* _TIPC_BEARER_H */
diff --git a/net/tipc/config.c b/net/tipc/config.c
index e16750dcf3c..b25a396b7e1 100644
--- a/net/tipc/config.c
+++ b/net/tipc/config.c
@@ -2,7 +2,7 @@
* net/tipc/config.c: TIPC configuration management code
*
* Copyright (c) 2002-2006, Ericsson AB
- * Copyright (c) 2004-2007, Wind River Systems
+ * Copyright (c) 2004-2007, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -148,7 +148,7 @@ static struct sk_buff *cfg_enable_bearer(void)
args = (struct tipc_bearer_config *)TLV_DATA(req_tlv_area);
if (tipc_enable_bearer(args->name,
- ntohl(args->detect_scope),
+ ntohl(args->disc_domain),
ntohl(args->priority)))
return tipc_cfg_reply_error_string("unable to enable bearer");
@@ -260,25 +260,6 @@ static struct sk_buff *cfg_set_max_ports(void)
return tipc_cfg_reply_none();
}
-static struct sk_buff *cfg_set_max_nodes(void)
-{
- u32 value;
-
- if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
- return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
- value = ntohl(*(__be32 *)TLV_DATA(req_tlv_area));
- if (value == tipc_max_nodes)
- return tipc_cfg_reply_none();
- if (value != delimit(value, 8, 2047))
- return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
- " (max nodes must be 8-2047)");
- if (tipc_mode == TIPC_NET_MODE)
- return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
- " (cannot change max nodes once TIPC has joined a network)");
- tipc_max_nodes = value;
- return tipc_cfg_reply_none();
-}
-
static struct sk_buff *cfg_set_netid(void)
{
u32 value;
@@ -397,9 +378,6 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area
case TIPC_CMD_SET_MAX_SUBSCR:
rep_tlv_buf = cfg_set_max_subscriptions();
break;
- case TIPC_CMD_SET_MAX_NODES:
- rep_tlv_buf = cfg_set_max_nodes();
- break;
case TIPC_CMD_SET_NETID:
rep_tlv_buf = cfg_set_netid();
break;
@@ -415,9 +393,6 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area
case TIPC_CMD_GET_MAX_SUBSCR:
rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_max_subscriptions);
break;
- case TIPC_CMD_GET_MAX_NODES:
- rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_max_nodes);
- break;
case TIPC_CMD_GET_NETID:
rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_net_id);
break;
@@ -431,6 +406,8 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area
case TIPC_CMD_GET_MAX_SLAVES:
case TIPC_CMD_SET_MAX_CLUSTERS:
case TIPC_CMD_GET_MAX_CLUSTERS:
+ case TIPC_CMD_SET_MAX_NODES:
+ case TIPC_CMD_GET_MAX_NODES:
rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
" (obsolete command)");
break;
diff --git a/net/tipc/core.c b/net/tipc/core.c
index e071579e085..c9a73e7763f 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -2,7 +2,7 @@
* net/tipc/core.c: TIPC module code
*
* Copyright (c) 2003-2006, Ericsson AB
- * Copyright (c) 2005-2006, Wind River Systems
+ * Copyright (c) 2005-2006, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -41,10 +41,6 @@
#include "config.h"
-#ifndef CONFIG_TIPC_NODES
-#define CONFIG_TIPC_NODES 255
-#endif
-
#ifndef CONFIG_TIPC_PORTS
#define CONFIG_TIPC_PORTS 8191
#endif
@@ -57,7 +53,6 @@
int tipc_mode = TIPC_NOT_RUNNING;
int tipc_random;
-atomic_t tipc_user_count = ATOMIC_INIT(0);
const char tipc_alphabet[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.";
@@ -65,7 +60,6 @@ const char tipc_alphabet[] =
/* configurable TIPC parameters */
u32 tipc_own_addr;
-int tipc_max_nodes;
int tipc_max_ports;
int tipc_max_subscriptions;
int tipc_max_publications;
@@ -193,7 +187,6 @@ static int __init tipc_init(void)
tipc_max_publications = 10000;
tipc_max_subscriptions = 2000;
tipc_max_ports = CONFIG_TIPC_PORTS;
- tipc_max_nodes = CONFIG_TIPC_NODES;
tipc_net_id = 4711;
res = tipc_core_start();
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 997158546e2..436dda1159d 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -2,7 +2,7 @@
* net/tipc/core.h: Include file for TIPC global declarations
*
* Copyright (c) 2005-2006, Ericsson AB
- * Copyright (c) 2005-2007, Wind River Systems
+ * Copyright (c) 2005-2007, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -147,7 +147,6 @@ void tipc_msg_dbg(struct print_buf *, struct tipc_msg *, const char *);
*/
extern u32 tipc_own_addr;
-extern int tipc_max_nodes;
extern int tipc_max_ports;
extern int tipc_max_subscriptions;
extern int tipc_max_publications;
@@ -161,7 +160,6 @@ extern int tipc_remote_management;
extern int tipc_mode;
extern int tipc_random;
extern const char tipc_alphabet[];
-extern atomic_t tipc_user_count;
/*
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index fa026bd91a6..491eff56b9d 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -2,7 +2,7 @@
* net/tipc/discover.c
*
* Copyright (c) 2003-2006, Ericsson AB
- * Copyright (c) 2005-2006, Wind River Systems
+ * Copyright (c) 2005-2006, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -57,7 +57,7 @@
* @timer_intv: current interval between requests (in ms)
*/
struct link_req {
- struct bearer *bearer;
+ struct tipc_bearer *bearer;
struct tipc_media_addr dest;
struct sk_buff *buf;
struct timer_list timer;
@@ -67,27 +67,24 @@ struct link_req {
/**
* tipc_disc_init_msg - initialize a link setup message
* @type: message type (request or response)
- * @req_links: number of links associated with message
* @dest_domain: network domain of node(s) which should respond to message
* @b_ptr: ptr to bearer issuing message
*/
static struct sk_buff *tipc_disc_init_msg(u32 type,
- u32 req_links,
u32 dest_domain,
- struct bearer *b_ptr)
+ struct tipc_bearer *b_ptr)
{
- struct sk_buff *buf = tipc_buf_acquire(DSC_H_SIZE);
+ struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE);
struct tipc_msg *msg;
if (buf) {
msg = buf_msg(buf);
- tipc_msg_init(msg, LINK_CONFIG, type, DSC_H_SIZE, dest_domain);
+ tipc_msg_init(msg, LINK_CONFIG, type, INT_H_SIZE, dest_domain);
msg_set_non_seq(msg, 1);
- msg_set_req_links(msg, req_links);
msg_set_dest_domain(msg, dest_domain);
msg_set_bc_netid(msg, tipc_net_id);
- msg_set_media_addr(msg, &b_ptr->publ.addr);
+ msg_set_media_addr(msg, &b_ptr->addr);
}
return buf;
}
@@ -99,7 +96,7 @@ static struct sk_buff *tipc_disc_init_msg(u32 type,
* @media_addr: media address advertised by duplicated node
*/
-static void disc_dupl_alert(struct bearer *b_ptr, u32 node_addr,
+static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr,
struct tipc_media_addr *media_addr)
{
char node_addr_str[16];
@@ -111,7 +108,7 @@ static void disc_dupl_alert(struct bearer *b_ptr, u32 node_addr,
tipc_media_addr_printf(&pb, media_addr);
tipc_printbuf_validate(&pb);
warn("Duplicate %s using %s seen on <%s>\n",
- node_addr_str, media_addr_str, b_ptr->publ.name);
+ node_addr_str, media_addr_str, b_ptr->name);
}
/**
@@ -120,19 +117,23 @@ static void disc_dupl_alert(struct bearer *b_ptr, u32 node_addr,
* @b_ptr: bearer that message arrived on
*/
-void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr)
+void tipc_disc_recv_msg(struct sk_buff *buf, struct tipc_bearer *b_ptr)
{
+ struct tipc_node *n_ptr;
struct link *link;
- struct tipc_media_addr media_addr;
+ struct tipc_media_addr media_addr, *addr;
+ struct sk_buff *rbuf;
struct tipc_msg *msg = buf_msg(buf);
u32 dest = msg_dest_domain(msg);
u32 orig = msg_prevnode(msg);
u32 net_id = msg_bc_netid(msg);
u32 type = msg_type(msg);
+ int link_fully_up;
msg_get_media_addr(msg, &media_addr);
buf_discard(buf);
+ /* Validate discovery message from requesting node */
if (net_id != tipc_net_id)
return;
if (!tipc_addr_domain_valid(dest))
@@ -140,63 +141,76 @@ void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr)
if (!tipc_addr_node_valid(orig))
return;
if (orig == tipc_own_addr) {
- if (memcmp(&media_addr, &b_ptr->publ.addr, sizeof(media_addr)))
+ if (memcmp(&media_addr, &b_ptr->addr, sizeof(media_addr)))
disc_dupl_alert(b_ptr, tipc_own_addr, &media_addr);
return;
}
if (!tipc_in_scope(dest, tipc_own_addr))
return;
- if (in_own_cluster(orig)) {
- /* Always accept link here */
- struct sk_buff *rbuf;
- struct tipc_media_addr *addr;
- struct tipc_node *n_ptr = tipc_node_find(orig);
- int link_fully_up;
-
- if (n_ptr == NULL) {
- n_ptr = tipc_node_create(orig);
- if (!n_ptr)
- return;
- }
- spin_lock_bh(&n_ptr->lock);
-
- /* Don't talk to neighbor during cleanup after last session */
+ if (!in_own_cluster(orig))
+ return;
- if (n_ptr->cleanup_required) {
- spin_unlock_bh(&n_ptr->lock);
+ /* Locate structure corresponding to requesting node */
+ n_ptr = tipc_node_find(orig);
+ if (!n_ptr) {
+ n_ptr = tipc_node_create(orig);
+ if (!n_ptr)
return;
- }
+ }
+ tipc_node_lock(n_ptr);
+
+ /* Don't talk to neighbor during cleanup after last session */
+ if (n_ptr->cleanup_required) {
+ tipc_node_unlock(n_ptr);
+ return;
+ }
+
+ link = n_ptr->links[b_ptr->identity];
- link = n_ptr->links[b_ptr->identity];
+ /* Create a link endpoint for this bearer, if necessary */
+ if (!link) {
+ link = tipc_link_create(n_ptr, b_ptr, &media_addr);
if (!link) {
- link = tipc_link_create(b_ptr, orig, &media_addr);
- if (!link) {
- spin_unlock_bh(&n_ptr->lock);
- return;
- }
- }
- addr = &link->media_addr;
- if (memcmp(addr, &media_addr, sizeof(*addr))) {
- if (tipc_link_is_up(link) || (!link->started)) {
- disc_dupl_alert(b_ptr, orig, &media_addr);
- spin_unlock_bh(&n_ptr->lock);
- return;
- }
- warn("Resetting link <%s>, peer interface address changed\n",
- link->name);
- memcpy(addr, &media_addr, sizeof(*addr));
- tipc_link_reset(link);
+ tipc_node_unlock(n_ptr);
+ return;
}
- link_fully_up = link_working_working(link);
- spin_unlock_bh(&n_ptr->lock);
- if ((type == DSC_RESP_MSG) || link_fully_up)
+ }
+
+ /*
+ * Ensure requesting node's media address is correct
+ *
+ * If media address doesn't match and the link is working, reject the
+ * request (must be from a duplicate node).
+ *
+ * If media address doesn't match and the link is not working, accept
+ * the new media address and reset the link to ensure it starts up
+ * cleanly.
+ */
+ addr = &link->media_addr;
+ if (memcmp(addr, &media_addr, sizeof(*addr))) {
+ if (tipc_link_is_up(link) || (!link->started)) {
+ disc_dupl_alert(b_ptr, orig, &media_addr);
+ tipc_node_unlock(n_ptr);
return;
- rbuf = tipc_disc_init_msg(DSC_RESP_MSG, 1, orig, b_ptr);
- if (rbuf != NULL) {
- b_ptr->media->send_msg(rbuf, &b_ptr->publ, &media_addr);
+ }
+ warn("Resetting link <%s>, peer interface address changed\n",
+ link->name);
+ memcpy(addr, &media_addr, sizeof(*addr));
+ tipc_link_reset(link);
+ }
+
+ /* Accept discovery message & send response, if necessary */
+ link_fully_up = link_working_working(link);
+
+ if ((type == DSC_REQ_MSG) && !link_fully_up && !b_ptr->blocked) {
+ rbuf = tipc_disc_init_msg(DSC_RESP_MSG, orig, b_ptr);
+ if (rbuf) {
+ b_ptr->media->send_msg(rbuf, b_ptr, &media_addr);
buf_discard(rbuf);
}
}
+
+ tipc_node_unlock(n_ptr);
}
/**
@@ -249,9 +263,9 @@ void tipc_disc_update_link_req(struct link_req *req)
static void disc_timeout(struct link_req *req)
{
- spin_lock_bh(&req->bearer->publ.lock);
+ spin_lock_bh(&req->bearer->lock);
- req->bearer->media->send_msg(req->buf, &req->bearer->publ, &req->dest);
+ req->bearer->media->send_msg(req->buf, req->bearer, &req->dest);
if ((req->timer_intv == TIPC_LINK_REQ_SLOW) ||
(req->timer_intv == TIPC_LINK_REQ_FAST)) {
@@ -266,7 +280,7 @@ static void disc_timeout(struct link_req *req)
}
k_start_timer(&req->timer, req->timer_intv);
- spin_unlock_bh(&req->bearer->publ.lock);
+ spin_unlock_bh(&req->bearer->lock);
}
/**
@@ -274,15 +288,13 @@ static void disc_timeout(struct link_req *req)
* @b_ptr: ptr to bearer issuing requests
* @dest: destination address for request messages
* @dest_domain: network domain of node(s) which should respond to message
- * @req_links: max number of desired links
*
* Returns pointer to link request structure, or NULL if unable to create.
*/
-struct link_req *tipc_disc_init_link_req(struct bearer *b_ptr,
+struct link_req *tipc_disc_init_link_req(struct tipc_bearer *b_ptr,
const struct tipc_media_addr *dest,
- u32 dest_domain,
- u32 req_links)
+ u32 dest_domain)
{
struct link_req *req;
@@ -290,7 +302,7 @@ struct link_req *tipc_disc_init_link_req(struct bearer *b_ptr,
if (!req)
return NULL;
- req->buf = tipc_disc_init_msg(DSC_REQ_MSG, req_links, dest_domain, b_ptr);
+ req->buf = tipc_disc_init_msg(DSC_REQ_MSG, dest_domain, b_ptr);
if (!req->buf) {
kfree(req);
return NULL;
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
index d2c3cffb79f..e48a167e47b 100644
--- a/net/tipc/discover.h
+++ b/net/tipc/discover.h
@@ -2,7 +2,7 @@
* net/tipc/discover.h
*
* Copyright (c) 2003-2006, Ericsson AB
- * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -39,13 +39,12 @@
struct link_req;
-struct link_req *tipc_disc_init_link_req(struct bearer *b_ptr,
+struct link_req *tipc_disc_init_link_req(struct tipc_bearer *b_ptr,
const struct tipc_media_addr *dest,
- u32 dest_domain,
- u32 req_links);
+ u32 dest_domain);
void tipc_disc_update_link_req(struct link_req *req);
void tipc_disc_stop_link_req(struct link_req *req);
-void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr);
+void tipc_disc_recv_msg(struct sk_buff *buf, struct tipc_bearer *b_ptr);
#endif
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 18702f58d11..43639ff1cbe 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -2,7 +2,7 @@
* net/tipc/link.c: TIPC link code
*
* Copyright (c) 1996-2007, Ericsson AB
- * Copyright (c) 2004-2007, Wind River Systems
+ * Copyright (c) 2004-2007, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -90,7 +90,7 @@ static void link_handle_out_of_seq_msg(struct link *l_ptr,
static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf);
static int link_recv_changeover_msg(struct link **l_ptr, struct sk_buff **buf);
static void link_set_supervision_props(struct link *l_ptr, u32 tolerance);
-static int link_send_sections_long(struct port *sender,
+static int link_send_sections_long(struct tipc_port *sender,
struct iovec const *msg_sect,
u32 num_sect, u32 destnode);
static void link_check_defragm_bufs(struct link *l_ptr);
@@ -113,7 +113,7 @@ static void link_init_max_pkt(struct link *l_ptr)
{
u32 max_pkt;
- max_pkt = (l_ptr->b_ptr->publ.mtu & ~3);
+ max_pkt = (l_ptr->b_ptr->mtu & ~3);
if (max_pkt > MAX_MSG_SIZE)
max_pkt = MAX_MSG_SIZE;
@@ -246,9 +246,6 @@ static void link_timeout(struct link *l_ptr)
l_ptr->stats.accu_queue_sz += l_ptr->out_queue_size;
l_ptr->stats.queue_sz_counts++;
- if (l_ptr->out_queue_size > l_ptr->stats.max_queue_sz)
- l_ptr->stats.max_queue_sz = l_ptr->out_queue_size;
-
if (l_ptr->first_out) {
struct tipc_msg *msg = buf_msg(l_ptr->first_out);
u32 length = msg_size(msg);
@@ -296,19 +293,35 @@ static void link_set_timer(struct link *l_ptr, u32 time)
/**
* tipc_link_create - create a new link
+ * @n_ptr: pointer to associated node
* @b_ptr: pointer to associated bearer
- * @peer: network address of node at other end of link
* @media_addr: media address to use when sending messages over link
*
* Returns pointer to link.
*/
-struct link *tipc_link_create(struct bearer *b_ptr, const u32 peer,
+struct link *tipc_link_create(struct tipc_node *n_ptr,
+ struct tipc_bearer *b_ptr,
const struct tipc_media_addr *media_addr)
{
struct link *l_ptr;
struct tipc_msg *msg;
char *if_name;
+ char addr_string[16];
+ u32 peer = n_ptr->addr;
+
+ if (n_ptr->link_cnt >= 2) {
+ tipc_addr_string_fill(addr_string, n_ptr->addr);
+ err("Attempt to establish third link to %s\n", addr_string);
+ return NULL;
+ }
+
+ if (n_ptr->links[b_ptr->identity]) {
+ tipc_addr_string_fill(addr_string, n_ptr->addr);
+ err("Attempt to establish second link on <%s> to %s\n",
+ b_ptr->name, addr_string);
+ return NULL;
+ }
l_ptr = kzalloc(sizeof(*l_ptr), GFP_ATOMIC);
if (!l_ptr) {
@@ -317,7 +330,7 @@ struct link *tipc_link_create(struct bearer *b_ptr, const u32 peer,
}
l_ptr->addr = peer;
- if_name = strchr(b_ptr->publ.name, ':') + 1;
+ if_name = strchr(b_ptr->name, ':') + 1;
sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:",
tipc_zone(tipc_own_addr), tipc_cluster(tipc_own_addr),
tipc_node(tipc_own_addr),
@@ -325,6 +338,7 @@ struct link *tipc_link_create(struct bearer *b_ptr, const u32 peer,
tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
/* note: peer i/f is appended to link name by reset/activate */
memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr));
+ l_ptr->owner = n_ptr;
l_ptr->checkpoint = 1;
l_ptr->b_ptr = b_ptr;
link_set_supervision_props(l_ptr, b_ptr->media->tolerance);
@@ -348,11 +362,7 @@ struct link *tipc_link_create(struct bearer *b_ptr, const u32 peer,
link_reset_statistics(l_ptr);
- l_ptr->owner = tipc_node_attach_link(l_ptr);
- if (!l_ptr->owner) {
- kfree(l_ptr);
- return NULL;
- }
+ tipc_node_attach_link(n_ptr, l_ptr);
k_init_timer(&l_ptr->timer, (Handler)link_timeout, (unsigned long)l_ptr);
list_add_tail(&l_ptr->link_list, &b_ptr->links);
@@ -391,7 +401,9 @@ void tipc_link_delete(struct link *l_ptr)
static void link_start(struct link *l_ptr)
{
+ tipc_node_lock(l_ptr->owner);
link_state_event(l_ptr, STARTING_EVT);
+ tipc_node_unlock(l_ptr->owner);
}
/**
@@ -406,7 +418,7 @@ static void link_start(struct link *l_ptr)
static int link_schedule_port(struct link *l_ptr, u32 origport, u32 sz)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
spin_lock_bh(&tipc_port_list_lock);
p_ptr = tipc_port_lock(origport);
@@ -415,7 +427,7 @@ static int link_schedule_port(struct link *l_ptr, u32 origport, u32 sz)
goto exit;
if (!list_empty(&p_ptr->wait_list))
goto exit;
- p_ptr->publ.congested = 1;
+ p_ptr->congested = 1;
p_ptr->waiting_pkts = 1 + ((sz - 1) / l_ptr->max_pkt);
list_add_tail(&p_ptr->wait_list, &l_ptr->waiting_ports);
l_ptr->stats.link_congs++;
@@ -428,8 +440,8 @@ exit:
void tipc_link_wakeup_ports(struct link *l_ptr, int all)
{
- struct port *p_ptr;
- struct port *temp_p_ptr;
+ struct tipc_port *p_ptr;
+ struct tipc_port *temp_p_ptr;
int win = l_ptr->queue_limit[0] - l_ptr->out_queue_size;
if (all)
@@ -445,11 +457,11 @@ void tipc_link_wakeup_ports(struct link *l_ptr, int all)
if (win <= 0)
break;
list_del_init(&p_ptr->wait_list);
- spin_lock_bh(p_ptr->publ.lock);
- p_ptr->publ.congested = 0;
- p_ptr->wakeup(&p_ptr->publ);
+ spin_lock_bh(p_ptr->lock);
+ p_ptr->congested = 0;
+ p_ptr->wakeup(p_ptr);
win -= p_ptr->waiting_pkts;
- spin_unlock_bh(p_ptr->publ.lock);
+ spin_unlock_bh(p_ptr->lock);
}
exit:
@@ -549,7 +561,7 @@ void tipc_link_reset(struct link *l_ptr)
tipc_node_link_down(l_ptr->owner, l_ptr);
tipc_bearer_remove_dest(l_ptr->b_ptr, l_ptr->addr);
- if (was_active_link && tipc_node_has_active_links(l_ptr->owner) &&
+ if (was_active_link && tipc_node_active_links(l_ptr->owner) &&
l_ptr->owner->permit_changeover) {
l_ptr->reset_checkpoint = checkpoint;
l_ptr->exp_msg_count = START_CHANGEOVER;
@@ -824,7 +836,10 @@ static void link_add_to_outqueue(struct link *l_ptr,
l_ptr->last_out = buf;
} else
l_ptr->first_out = l_ptr->last_out = buf;
+
l_ptr->out_queue_size++;
+ if (l_ptr->out_queue_size > l_ptr->stats.max_queue_sz)
+ l_ptr->stats.max_queue_sz = l_ptr->out_queue_size;
}
/*
@@ -867,9 +882,6 @@ int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf)
/* Packet can be queued or sent: */
- if (queue_size > l_ptr->stats.max_queue_sz)
- l_ptr->stats.max_queue_sz = queue_size;
-
if (likely(!tipc_bearer_congested(l_ptr->b_ptr, l_ptr) &&
!link_congested(l_ptr))) {
link_add_to_outqueue(l_ptr, buf, msg);
@@ -1027,12 +1039,12 @@ int tipc_send_buf_fast(struct sk_buff *buf, u32 destnode)
* except for total message length.
* Returns user data length or errno.
*/
-int tipc_link_send_sections_fast(struct port *sender,
+int tipc_link_send_sections_fast(struct tipc_port *sender,
struct iovec const *msg_sect,
const u32 num_sect,
u32 destaddr)
{
- struct tipc_msg *hdr = &sender->publ.phdr;
+ struct tipc_msg *hdr = &sender->phdr;
struct link *l_ptr;
struct sk_buff *buf;
struct tipc_node *node;
@@ -1045,7 +1057,7 @@ again:
* (Must not hold any locks while building message.)
*/
- res = tipc_msg_build(hdr, msg_sect, num_sect, sender->publ.max_pkt,
+ res = tipc_msg_build(hdr, msg_sect, num_sect, sender->max_pkt,
!sender->user_port, &buf);
read_lock_bh(&tipc_net_lock);
@@ -1056,7 +1068,7 @@ again:
if (likely(l_ptr)) {
if (likely(buf)) {
res = link_send_buf_fast(l_ptr, buf,
- &sender->publ.max_pkt);
+ &sender->max_pkt);
if (unlikely(res < 0))
buf_discard(buf);
exit:
@@ -1075,7 +1087,7 @@ exit:
if (link_congested(l_ptr) ||
!list_empty(&l_ptr->b_ptr->cong_links)) {
res = link_schedule_port(l_ptr,
- sender->publ.ref, res);
+ sender->ref, res);
goto exit;
}
@@ -1084,12 +1096,12 @@ exit:
* then re-try fast path or fragment the message
*/
- sender->publ.max_pkt = l_ptr->max_pkt;
+ sender->max_pkt = l_ptr->max_pkt;
tipc_node_unlock(node);
read_unlock_bh(&tipc_net_lock);
- if ((msg_hdr_sz(hdr) + res) <= sender->publ.max_pkt)
+ if ((msg_hdr_sz(hdr) + res) <= sender->max_pkt)
goto again;
return link_send_sections_long(sender, msg_sect,
@@ -1123,14 +1135,14 @@ exit:
*
* Returns user data length or errno.
*/
-static int link_send_sections_long(struct port *sender,
+static int link_send_sections_long(struct tipc_port *sender,
struct iovec const *msg_sect,
u32 num_sect,
u32 destaddr)
{
struct link *l_ptr;
struct tipc_node *node;
- struct tipc_msg *hdr = &sender->publ.phdr;
+ struct tipc_msg *hdr = &sender->phdr;
u32 dsz = msg_data_sz(hdr);
u32 max_pkt, fragm_sz, rest;
struct tipc_msg fragm_hdr;
@@ -1142,7 +1154,7 @@ static int link_send_sections_long(struct port *sender,
again:
fragm_no = 1;
- max_pkt = sender->publ.max_pkt - INT_H_SIZE;
+ max_pkt = sender->max_pkt - INT_H_SIZE;
/* leave room for tunnel header in case of link changeover */
fragm_sz = max_pkt - INT_H_SIZE;
/* leave room for fragmentation header in each fragment */
@@ -1157,7 +1169,7 @@ again:
tipc_msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT,
INT_H_SIZE, msg_destnode(hdr));
- msg_set_link_selector(&fragm_hdr, sender->publ.ref);
+ msg_set_link_selector(&fragm_hdr, sender->ref);
msg_set_size(&fragm_hdr, max_pkt);
msg_set_fragm_no(&fragm_hdr, 1);
@@ -1238,13 +1250,13 @@ error:
node = tipc_node_find(destaddr);
if (likely(node)) {
tipc_node_lock(node);
- l_ptr = node->active_links[sender->publ.ref & 1];
+ l_ptr = node->active_links[sender->ref & 1];
if (!l_ptr) {
tipc_node_unlock(node);
goto reject;
}
if (l_ptr->max_pkt < max_pkt) {
- sender->publ.max_pkt = l_ptr->max_pkt;
+ sender->max_pkt = l_ptr->max_pkt;
tipc_node_unlock(node);
for (; buf_chain; buf_chain = buf) {
buf = buf_chain->next;
@@ -1441,7 +1453,7 @@ static void link_retransmit_failure(struct link *l_ptr, struct sk_buff *buf)
info("Outstanding acks: %lu\n",
(unsigned long) TIPC_SKB_CB(buf)->handle);
- n_ptr = l_ptr->owner->next;
+ n_ptr = tipc_bclink_retransmit_to();
tipc_node_lock(n_ptr);
tipc_addr_string_fill(addr_string, n_ptr->addr);
@@ -1595,11 +1607,10 @@ static int link_recv_buf_validate(struct sk_buff *buf)
* structure (i.e. cannot be NULL), but bearer can be inactive.
*/
-void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr)
+void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *b_ptr)
{
read_lock_bh(&tipc_net_lock);
while (head) {
- struct bearer *b_ptr = (struct bearer *)tb_ptr;
struct tipc_node *n_ptr;
struct link *l_ptr;
struct sk_buff *crs;
@@ -1735,10 +1746,6 @@ deliver:
tipc_node_unlock(n_ptr);
tipc_link_recv_bundle(buf);
continue;
- case ROUTE_DISTRIBUTOR:
- tipc_node_unlock(n_ptr);
- buf_discard(buf);
- continue;
case NAME_DISTRIBUTOR:
tipc_node_unlock(n_ptr);
tipc_named_recv(buf);
@@ -1765,6 +1772,10 @@ deliver:
goto protocol_check;
}
break;
+ default:
+ buf_discard(buf);
+ buf = NULL;
+ break;
}
}
tipc_node_unlock(n_ptr);
@@ -1900,6 +1911,7 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg,
struct sk_buff *buf = NULL;
struct tipc_msg *msg = l_ptr->pmsg;
u32 msg_size = sizeof(l_ptr->proto_msg);
+ int r_flag;
if (link_blocked(l_ptr))
return;
@@ -1950,15 +1962,14 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg,
msg_set_ack(msg, mod(l_ptr->reset_checkpoint - 1));
msg_set_seq_gap(msg, 0);
msg_set_next_sent(msg, 1);
+ msg_set_probe(msg, 0);
msg_set_link_tolerance(msg, l_ptr->tolerance);
msg_set_linkprio(msg, l_ptr->priority);
msg_set_max_pkt(msg, l_ptr->max_pkt_target);
}
- if (tipc_node_has_redundant_links(l_ptr->owner))
- msg_set_redundant_link(msg);
- else
- msg_clear_redundant_link(msg);
+ r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr));
+ msg_set_redundant_link(msg, r_flag);
msg_set_linkprio(msg, l_ptr->priority);
/* Ensure sequence number will not fit : */
@@ -1978,7 +1989,6 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg,
skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg));
return;
}
- msg_set_timestamp(msg, jiffies_to_msecs(jiffies));
/* Message can be sent */
@@ -2066,7 +2076,7 @@ static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf)
l_ptr->peer_bearer_id = msg_bearer_id(msg);
/* Synchronize broadcast sequence numbers */
- if (!tipc_node_has_redundant_links(l_ptr->owner))
+ if (!tipc_node_redundant_links(l_ptr->owner))
l_ptr->owner->bclink.last_in = mod(msg_last_bcast(msg));
break;
case STATE_MSG:
@@ -2413,9 +2423,6 @@ static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf)
else
destaddr = msg_destnode(inmsg);
- if (msg_routed(inmsg))
- msg_set_prevnode(inmsg, tipc_own_addr);
-
/* Prepare reusable fragment header: */
tipc_msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT,
@@ -2618,6 +2625,9 @@ static void link_check_defragm_bufs(struct link *l_ptr)
static void link_set_supervision_props(struct link *l_ptr, u32 tolerance)
{
+ if ((tolerance < TIPC_MIN_LINK_TOL) || (tolerance > TIPC_MAX_LINK_TOL))
+ return;
+
l_ptr->tolerance = tolerance;
l_ptr->continuity_interval =
((tolerance / 4) > 500) ? 500 : tolerance / 4;
@@ -2658,7 +2668,7 @@ void tipc_link_set_queue_limits(struct link *l_ptr, u32 window)
static struct link *link_find_link(const char *name, struct tipc_node **node)
{
struct link_name link_name_parts;
- struct bearer *b_ptr;
+ struct tipc_bearer *b_ptr;
struct link *l_ptr;
if (!link_name_validate(name, &link_name_parts))
@@ -2961,7 +2971,7 @@ static void link_print(struct link *l_ptr, const char *str)
tipc_printf(buf, str);
tipc_printf(buf, "Link %x<%s>:",
- l_ptr->addr, l_ptr->b_ptr->publ.name);
+ l_ptr->addr, l_ptr->b_ptr->name);
#ifdef CONFIG_TIPC_DEBUG
if (link_reset_reset(l_ptr) || link_reset_unknown(l_ptr))
@@ -2981,9 +2991,9 @@ static void link_print(struct link *l_ptr, const char *str)
!= (l_ptr->out_queue_size - 1)) ||
(l_ptr->last_out->next != NULL)) {
tipc_printf(buf, "\nSend queue inconsistency\n");
- tipc_printf(buf, "first_out= %x ", l_ptr->first_out);
- tipc_printf(buf, "next_out= %x ", l_ptr->next_out);
- tipc_printf(buf, "last_out= %x ", l_ptr->last_out);
+ tipc_printf(buf, "first_out= %p ", l_ptr->first_out);
+ tipc_printf(buf, "next_out= %p ", l_ptr->next_out);
+ tipc_printf(buf, "last_out= %p ", l_ptr->last_out);
}
} else
tipc_printf(buf, "[]");
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 70967e63702..e6a30dbe1aa 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -2,7 +2,7 @@
* net/tipc/link.h: Include file for TIPC link code
*
* Copyright (c) 1995-2006, Ericsson AB
- * Copyright (c) 2004-2005, Wind River Systems
+ * Copyright (c) 2004-2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -122,7 +122,7 @@ struct link {
u32 checkpoint;
u32 peer_session;
u32 peer_bearer_id;
- struct bearer *b_ptr;
+ struct tipc_bearer *b_ptr;
u32 tolerance;
u32 continuity_interval;
u32 abort_limit;
@@ -196,24 +196,19 @@ struct link {
u32 bearer_congs;
u32 deferred_recv;
u32 duplicates;
-
- /* for statistical profiling of send queue size */
-
- u32 max_queue_sz;
- u32 accu_queue_sz;
- u32 queue_sz_counts;
-
- /* for statistical profiling of message lengths */
-
- u32 msg_length_counts;
- u32 msg_lengths_total;
- u32 msg_length_profile[7];
+ u32 max_queue_sz; /* send queue size high water mark */
+ u32 accu_queue_sz; /* used for send queue size profiling */
+ u32 queue_sz_counts; /* used for send queue size profiling */
+ u32 msg_length_counts; /* used for message length profiling */
+ u32 msg_lengths_total; /* used for message length profiling */
+ u32 msg_length_profile[7]; /* used for msg. length profiling */
} stats;
};
-struct port;
+struct tipc_port;
-struct link *tipc_link_create(struct bearer *b_ptr, const u32 peer,
+struct link *tipc_link_create(struct tipc_node *n_ptr,
+ struct tipc_bearer *b_ptr,
const struct tipc_media_addr *media_addr);
void tipc_link_delete(struct link *l_ptr);
void tipc_link_changeover(struct link *l_ptr);
@@ -230,7 +225,7 @@ void tipc_link_reset(struct link *l_ptr);
int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector);
int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf);
u32 tipc_link_get_max_pkt(u32 dest, u32 selector);
-int tipc_link_send_sections_fast(struct port *sender,
+int tipc_link_send_sections_fast(struct tipc_port *sender,
struct iovec const *msg_sect,
const u32 num_sect,
u32 destnode);
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index bb6180c4fcb..6d92d17e7fb 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -2,7 +2,7 @@
* net/tipc/msg.c: TIPC message header routines
*
* Copyright (c) 2000-2006, Ericsson AB
- * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -192,8 +192,6 @@ void tipc_msg_dbg(struct print_buf *buf, struct tipc_msg *msg, const char *str)
default:
tipc_printf(buf, "UNKNOWN TYPE %u", msg_type(msg));
}
- if (msg_routed(msg) && !msg_non_seq(msg))
- tipc_printf(buf, "ROUT:");
if (msg_reroute_cnt(msg))
tipc_printf(buf, "REROUTED(%u):",
msg_reroute_cnt(msg));
@@ -210,8 +208,6 @@ void tipc_msg_dbg(struct print_buf *buf, struct tipc_msg *msg, const char *str)
default:
tipc_printf(buf, "UNKNOWN:%x", msg_type(msg));
}
- if (msg_routed(msg))
- tipc_printf(buf, "ROUT:");
if (msg_reroute_cnt(msg))
tipc_printf(buf, "REROUTED(%u):",
msg_reroute_cnt(msg));
@@ -232,13 +228,10 @@ void tipc_msg_dbg(struct print_buf *buf, struct tipc_msg *msg, const char *str)
default:
tipc_printf(buf, "UNKNOWN TYPE:%x", msg_type(msg));
}
- if (msg_routed(msg))
- tipc_printf(buf, "ROUT:");
if (msg_reroute_cnt(msg))
tipc_printf(buf, "REROUTED(%u):", msg_reroute_cnt(msg));
break;
case LINK_PROTOCOL:
- tipc_printf(buf, "PROT:TIM(%u):", msg_timestamp(msg));
switch (msg_type(msg)) {
case STATE_MSG:
tipc_printf(buf, "STATE:");
@@ -275,33 +268,6 @@ void tipc_msg_dbg(struct print_buf *buf, struct tipc_msg *msg, const char *str)
tipc_printf(buf, "UNKNOWN TYPE:%x", msg_type(msg));
}
break;
- case ROUTE_DISTRIBUTOR:
- tipc_printf(buf, "ROUTING_MNG:");
- switch (msg_type(msg)) {
- case EXT_ROUTING_TABLE:
- tipc_printf(buf, "EXT_TBL:");
- tipc_printf(buf, "TO:%x:", msg_remote_node(msg));
- break;
- case LOCAL_ROUTING_TABLE:
- tipc_printf(buf, "LOCAL_TBL:");
- tipc_printf(buf, "TO:%x:", msg_remote_node(msg));
- break;
- case SLAVE_ROUTING_TABLE:
- tipc_printf(buf, "DP_TBL:");
- tipc_printf(buf, "TO:%x:", msg_remote_node(msg));
- break;
- case ROUTE_ADDITION:
- tipc_printf(buf, "ADD:");
- tipc_printf(buf, "TO:%x:", msg_remote_node(msg));
- break;
- case ROUTE_REMOVAL:
- tipc_printf(buf, "REMOVE:");
- tipc_printf(buf, "TO:%x:", msg_remote_node(msg));
- break;
- default:
- tipc_printf(buf, "UNKNOWN TYPE:%x", msg_type(msg));
- }
- break;
case LINK_CONFIG:
tipc_printf(buf, "CFG:");
switch (msg_type(msg)) {
@@ -381,20 +347,15 @@ void tipc_msg_dbg(struct print_buf *buf, struct tipc_msg *msg, const char *str)
tipc_printf(buf, ":OPRT(%u):", msg_origport(msg));
tipc_printf(buf, ":DPRT(%u):", msg_destport(msg));
}
- if (msg_routed(msg) && !msg_non_seq(msg))
- tipc_printf(buf, ":TSEQN(%u)", msg_transp_seqno(msg));
}
if (msg_user(msg) == NAME_DISTRIBUTOR) {
tipc_printf(buf, ":ONOD(%x):", msg_orignode(msg));
tipc_printf(buf, ":DNOD(%x):", msg_destnode(msg));
- if (msg_routed(msg))
- tipc_printf(buf, ":CSEQN(%u)", msg_transp_seqno(msg));
}
if (msg_user(msg) == LINK_CONFIG) {
u32 *raw = (u32 *)msg;
struct tipc_media_addr *orig = (struct tipc_media_addr *)&raw[5];
- tipc_printf(buf, ":REQL(%u):", msg_req_links(msg));
tipc_printf(buf, ":DDOM(%x):", msg_dest_domain(msg));
tipc_printf(buf, ":NETID(%u):", msg_bc_netid(msg));
tipc_media_addr_printf(buf, orig);
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 92c4c4fd7b3..de02339fc17 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -2,7 +2,7 @@
* net/tipc/msg.h: Include file for TIPC message header routines
*
* Copyright (c) 2000-2007, Ericsson AB
- * Copyright (c) 2005-2008, Wind River Systems
+ * Copyright (c) 2005-2008, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -421,13 +421,6 @@ static inline int msg_is_dest(struct tipc_msg *m, u32 d)
return msg_short(m) || (msg_destnode(m) == d);
}
-static inline u32 msg_routed(struct tipc_msg *m)
-{
- if (likely(msg_short(m)))
- return 0;
- return (msg_destnode(m) ^ msg_orignode(m)) >> 11;
-}
-
static inline u32 msg_nametype(struct tipc_msg *m)
{
return msg_word(m, 8);
@@ -438,26 +431,6 @@ static inline void msg_set_nametype(struct tipc_msg *m, u32 n)
msg_set_word(m, 8, n);
}
-static inline u32 msg_transp_seqno(struct tipc_msg *m)
-{
- return msg_word(m, 8);
-}
-
-static inline void msg_set_timestamp(struct tipc_msg *m, u32 n)
-{
- msg_set_word(m, 8, n);
-}
-
-static inline u32 msg_timestamp(struct tipc_msg *m)
-{
- return msg_word(m, 8);
-}
-
-static inline void msg_set_transp_seqno(struct tipc_msg *m, u32 n)
-{
- msg_set_word(m, 8, n);
-}
-
static inline u32 msg_nameinst(struct tipc_msg *m)
{
return msg_word(m, 9);
@@ -545,7 +518,6 @@ static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m)
#define NAME_DISTRIBUTOR 11
#define MSG_FRAGMENTER 12
#define LINK_CONFIG 13
-#define DSC_H_SIZE 40
/*
* Connection management protocol messages
@@ -577,16 +549,6 @@ static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n)
msg_set_bits(m, 1, 16, 0x1fff, n);
}
-static inline u32 msg_req_links(struct tipc_msg *m)
-{
- return msg_bits(m, 1, 16, 0xfff);
-}
-
-static inline void msg_set_req_links(struct tipc_msg *m, u32 n)
-{
- msg_set_bits(m, 1, 16, 0xfff, n);
-}
-
/*
* Word 2
@@ -749,14 +711,9 @@ static inline u32 msg_redundant_link(struct tipc_msg *m)
return msg_bits(m, 5, 12, 0x1);
}
-static inline void msg_set_redundant_link(struct tipc_msg *m)
+static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r)
{
- msg_set_bits(m, 5, 12, 0x1, 1);
-}
-
-static inline void msg_clear_redundant_link(struct tipc_msg *m)
-{
- msg_set_bits(m, 5, 12, 0x1, 0);
+ msg_set_bits(m, 5, 12, 0x1, r);
}
@@ -805,21 +762,6 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n)
}
/*
- * Routing table message data
- */
-
-
-static inline u32 msg_remote_node(struct tipc_msg *m)
-{
- return msg_word(m, msg_hdr_sz(m)/4);
-}
-
-static inline void msg_set_remote_node(struct tipc_msg *m, u32 a)
-{
- msg_set_word(m, msg_hdr_sz(m)/4, a);
-}
-
-/*
* Segmentation message types
*/
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 483c226c958..c9fa6dfcf28 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -2,7 +2,7 @@
* net/tipc/name_distr.c: TIPC name distribution code
*
* Copyright (c) 2000-2006, Ericsson AB
- * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -109,11 +109,9 @@ static void named_cluster_distribute(struct sk_buff *buf)
{
struct sk_buff *buf_copy;
struct tipc_node *n_ptr;
- u32 n_num;
- for (n_num = 1; n_num <= tipc_net.highest_node; n_num++) {
- n_ptr = tipc_net.nodes[n_num];
- if (n_ptr && tipc_node_has_active_links(n_ptr)) {
+ list_for_each_entry(n_ptr, &tipc_node_list, list) {
+ if (tipc_node_active_links(n_ptr)) {
buf_copy = skb_copy(buf, GFP_ATOMIC);
if (!buf_copy)
break;
@@ -214,17 +212,16 @@ exit:
}
/**
- * node_is_down - remove publication associated with a failed node
+ * named_purge_publ - remove publication associated with a failed node
*
* Invoked for each publication issued by a newly failed node.
* Removes publication structure from name table & deletes it.
* In rare cases the link may have come back up again when this
* function is called, and we have two items representing the same
* publication. Nudge this item's key to distinguish it from the other.
- * (Note: Publication's node subscription is already unsubscribed.)
*/
-static void node_is_down(struct publication *publ)
+static void named_purge_publ(struct publication *publ)
{
struct publication *p;
@@ -232,6 +229,8 @@ static void node_is_down(struct publication *publ)
publ->key += 1222345;
p = tipc_nametbl_remove_publ(publ->type, publ->lower,
publ->node, publ->ref, publ->key);
+ if (p)
+ tipc_nodesub_unsubscribe(&p->subscr);
write_unlock_bh(&tipc_nametbl_lock);
if (p != publ) {
@@ -268,7 +267,8 @@ void tipc_named_recv(struct sk_buff *buf)
tipc_nodesub_subscribe(&publ->subscr,
msg_orignode(msg),
publ,
- (net_ev_handler)node_is_down);
+ (net_ev_handler)
+ named_purge_publ);
}
} else if (msg_type(msg) == WITHDRAWAL) {
publ = tipc_nametbl_remove_publ(ntohl(item->type),
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 9bacfd00b91..68b3dd63729 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -2,7 +2,7 @@
* net/tipc/net.c: TIPC network routing code
*
* Copyright (c) 1995-2006, Ericsson AB
- * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,7 @@
#include "name_distr.h"
#include "subscr.h"
#include "port.h"
+#include "node.h"
#include "config.h"
/*
@@ -108,26 +109,6 @@
*/
DEFINE_RWLOCK(tipc_net_lock);
-struct network tipc_net;
-
-static int net_start(void)
-{
- tipc_net.nodes = kcalloc(tipc_max_nodes + 1,
- sizeof(*tipc_net.nodes), GFP_ATOMIC);
- tipc_net.highest_node = 0;
-
- return tipc_net.nodes ? 0 : -ENOMEM;
-}
-
-static void net_stop(void)
-{
- u32 n_num;
-
- for (n_num = 1; n_num <= tipc_net.highest_node; n_num++)
- tipc_node_delete(tipc_net.nodes[n_num]);
- kfree(tipc_net.nodes);
- tipc_net.nodes = NULL;
-}
static void net_route_named_msg(struct sk_buff *buf)
{
@@ -217,9 +198,6 @@ int tipc_net_start(u32 addr)
tipc_named_reinit();
tipc_port_reinit();
- res = net_start();
- if (res)
- return res;
res = tipc_bclink_init();
if (res)
return res;
@@ -235,14 +213,16 @@ int tipc_net_start(u32 addr)
void tipc_net_stop(void)
{
+ struct tipc_node *node, *t_node;
+
if (tipc_mode != TIPC_NET_MODE)
return;
write_lock_bh(&tipc_net_lock);
tipc_bearer_stop();
tipc_mode = TIPC_NODE_MODE;
tipc_bclink_stop();
- net_stop();
+ list_for_each_entry_safe(node, t_node, &tipc_node_list, list)
+ tipc_node_delete(node);
write_unlock_bh(&tipc_net_lock);
info("Left network mode\n");
}
-
diff --git a/net/tipc/net.h b/net/tipc/net.h
index 4ae59ad0489..9eb4b9e220e 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -2,7 +2,7 @@
* net/tipc/net.h: Include file for TIPC network routing code
*
* Copyright (c) 1995-2006, Ericsson AB
- * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -37,23 +37,6 @@
#ifndef _TIPC_NET_H
#define _TIPC_NET_H
-struct tipc_node;
-
-/**
- * struct network - TIPC network structure
- * @nodes: array of pointers to all nodes within cluster
- * @highest_node: id of highest numbered node within cluster
- * @links: number of (unicast) links to cluster
- */
-
-struct network {
- struct tipc_node **nodes;
- u32 highest_node;
- u32 links;
-};
-
-
-extern struct network tipc_net;
extern rwlock_t tipc_net_lock;
void tipc_net_route_msg(struct sk_buff *buf);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 3af53e327f4..2d106ef4fa4 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -2,7 +2,7 @@
* net/tipc/node.c: TIPC node management routines
*
* Copyright (c) 2000-2006, Ericsson AB
- * Copyright (c) 2005-2006, Wind River Systems
+ * Copyright (c) 2005-2006, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -44,9 +44,33 @@ static void node_established_contact(struct tipc_node *n_ptr);
static DEFINE_SPINLOCK(node_create_lock);
+static struct hlist_head node_htable[NODE_HTABLE_SIZE];
+LIST_HEAD(tipc_node_list);
+static u32 tipc_num_nodes;
+
+static atomic_t tipc_num_links = ATOMIC_INIT(0);
u32 tipc_own_tag;
/**
+ * tipc_node_find - locate specified node object, if it exists
+ */
+
+struct tipc_node *tipc_node_find(u32 addr)
+{
+ struct tipc_node *node;
+ struct hlist_node *pos;
+
+ if (unlikely(!in_own_cluster(addr)))
+ return NULL;
+
+ hlist_for_each_entry(node, pos, &node_htable[tipc_hashfn(addr)], hash) {
+ if (node->addr == addr)
+ return node;
+ }
+ return NULL;
+}
+
+/**
* tipc_node_create - create neighboring node
*
* Currently, this routine is called by neighbor discovery code, which holds
@@ -58,8 +82,7 @@ u32 tipc_own_tag;
struct tipc_node *tipc_node_create(u32 addr)
{
- struct tipc_node *n_ptr;
- u32 n_num;
+ struct tipc_node *n_ptr, *temp_node;
spin_lock_bh(&node_create_lock);
@@ -78,12 +101,19 @@ struct tipc_node *tipc_node_create(u32 addr)
n_ptr->addr = addr;
spin_lock_init(&n_ptr->lock);
+ INIT_HLIST_NODE(&n_ptr->hash);
+ INIT_LIST_HEAD(&n_ptr->list);
INIT_LIST_HEAD(&n_ptr->nsub);
- n_num = tipc_node(addr);
- tipc_net.nodes[n_num] = n_ptr;
- if (n_num > tipc_net.highest_node)
- tipc_net.highest_node = n_num;
+ hlist_add_head(&n_ptr->hash, &node_htable[tipc_hashfn(addr)]);
+
+ list_for_each_entry(temp_node, &tipc_node_list, list) {
+ if (n_ptr->addr < temp_node->addr)
+ break;
+ }
+ list_add_tail(&n_ptr->list, &temp_node->list);
+
+ tipc_num_nodes++;
spin_unlock_bh(&node_create_lock);
return n_ptr;
@@ -91,18 +121,11 @@ struct tipc_node *tipc_node_create(u32 addr)
void tipc_node_delete(struct tipc_node *n_ptr)
{
- u32 n_num;
-
- if (!n_ptr)
- return;
-
- n_num = tipc_node(n_ptr->addr);
- tipc_net.nodes[n_num] = NULL;
+ list_del(&n_ptr->list);
+ hlist_del(&n_ptr->hash);
kfree(n_ptr);
- while (!tipc_net.nodes[tipc_net.highest_node])
- if (--tipc_net.highest_node == 0)
- break;
+ tipc_num_nodes--;
}
@@ -200,54 +223,32 @@ void tipc_node_link_down(struct tipc_node *n_ptr, struct link *l_ptr)
node_lost_contact(n_ptr);
}
-int tipc_node_has_active_links(struct tipc_node *n_ptr)
+int tipc_node_active_links(struct tipc_node *n_ptr)
{
return n_ptr->active_links[0] != NULL;
}
-int tipc_node_has_redundant_links(struct tipc_node *n_ptr)
+int tipc_node_redundant_links(struct tipc_node *n_ptr)
{
return n_ptr->working_links > 1;
}
int tipc_node_is_up(struct tipc_node *n_ptr)
{
- return tipc_node_has_active_links(n_ptr);
+ return tipc_node_active_links(n_ptr);
}
-struct tipc_node *tipc_node_attach_link(struct link *l_ptr)
+void tipc_node_attach_link(struct tipc_node *n_ptr, struct link *l_ptr)
{
- struct tipc_node *n_ptr = tipc_node_find(l_ptr->addr);
-
- if (!n_ptr)
- n_ptr = tipc_node_create(l_ptr->addr);
- if (n_ptr) {
- u32 bearer_id = l_ptr->b_ptr->identity;
- char addr_string[16];
-
- if (n_ptr->link_cnt >= 2) {
- err("Attempt to create third link to %s\n",
- tipc_addr_string_fill(addr_string, n_ptr->addr));
- return NULL;
- }
-
- if (!n_ptr->links[bearer_id]) {
- n_ptr->links[bearer_id] = l_ptr;
- tipc_net.links++;
- n_ptr->link_cnt++;
- return n_ptr;
- }
- err("Attempt to establish second link on <%s> to %s\n",
- l_ptr->b_ptr->publ.name,
- tipc_addr_string_fill(addr_string, l_ptr->addr));
- }
- return NULL;
+ n_ptr->links[l_ptr->b_ptr->identity] = l_ptr;
+ atomic_inc(&tipc_num_links);
+ n_ptr->link_cnt++;
}
void tipc_node_detach_link(struct tipc_node *n_ptr, struct link *l_ptr)
{
n_ptr->links[l_ptr->b_ptr->identity] = NULL;
- tipc_net.links--;
+ atomic_dec(&tipc_num_links);
n_ptr->link_cnt--;
}
@@ -327,7 +328,6 @@ static void node_cleanup_finished(unsigned long node_addr)
static void node_lost_contact(struct tipc_node *n_ptr)
{
- struct tipc_node_subscr *ns, *tns;
char addr_string[16];
u32 i;
@@ -365,12 +365,7 @@ static void node_lost_contact(struct tipc_node *n_ptr)
}
/* Notify subscribers */
- list_for_each_entry_safe(ns, tns, &n_ptr->nsub, nodesub_list) {
- ns->node = NULL;
- list_del_init(&ns->nodesub_list);
- tipc_k_signal((Handler)ns->handle_node_down,
- (unsigned long)ns->usr_handle);
- }
+ tipc_nodesub_notify(n_ptr);
/* Prevent re-contact with node until all cleanup is done */
@@ -385,7 +380,6 @@ struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space)
struct tipc_node *n_ptr;
struct tipc_node_info node_info;
u32 payload_size;
- u32 n_num;
if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR))
return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
@@ -396,15 +390,14 @@ struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space)
" (network address)");
read_lock_bh(&tipc_net_lock);
- if (!tipc_net.nodes) {
+ if (!tipc_num_nodes) {
read_unlock_bh(&tipc_net_lock);
return tipc_cfg_reply_none();
}
/* For now, get space for all other nodes */
- payload_size = TLV_SPACE(sizeof(node_info)) *
- (tipc_net.highest_node - 1);
+ payload_size = TLV_SPACE(sizeof(node_info)) * tipc_num_nodes;
if (payload_size > 32768u) {
read_unlock_bh(&tipc_net_lock);
return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
@@ -418,9 +411,8 @@ struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space)
/* Add TLVs for all nodes in scope */
- for (n_num = 1; n_num <= tipc_net.highest_node; n_num++) {
- n_ptr = tipc_net.nodes[n_num];
- if (!n_ptr || !tipc_in_scope(domain, n_ptr->addr))
+ list_for_each_entry(n_ptr, &tipc_node_list, list) {
+ if (!tipc_in_scope(domain, n_ptr->addr))
continue;
node_info.addr = htonl(n_ptr->addr);
node_info.up = htonl(tipc_node_is_up(n_ptr));
@@ -439,7 +431,6 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space)
struct tipc_node *n_ptr;
struct tipc_link_info link_info;
u32 payload_size;
- u32 n_num;
if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR))
return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
@@ -456,7 +447,8 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space)
/* Get space for all unicast links + multicast link */
- payload_size = TLV_SPACE(sizeof(link_info)) * (tipc_net.links + 1);
+ payload_size = TLV_SPACE(sizeof(link_info)) *
+ (atomic_read(&tipc_num_links) + 1);
if (payload_size > 32768u) {
read_unlock_bh(&tipc_net_lock);
return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
@@ -470,18 +462,17 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space)
/* Add TLV for broadcast link */
- link_info.dest = htonl(tipc_own_addr & 0xfffff00);
+ link_info.dest = htonl(tipc_cluster_mask(tipc_own_addr));
link_info.up = htonl(1);
strlcpy(link_info.str, tipc_bclink_name, TIPC_MAX_LINK_NAME);
tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info));
/* Add TLVs for any other links in scope */
- for (n_num = 1; n_num <= tipc_net.highest_node; n_num++) {
+ list_for_each_entry(n_ptr, &tipc_node_list, list) {
u32 i;
- n_ptr = tipc_net.nodes[n_num];
- if (!n_ptr || !tipc_in_scope(domain, n_ptr->addr))
+ if (!tipc_in_scope(domain, n_ptr->addr))
continue;
tipc_node_lock(n_ptr);
for (i = 0; i < MAX_BEARERS; i++) {
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 206a8efa410..5c61afc7a0b 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -2,7 +2,7 @@
* net/tipc/node.h: Include file for TIPC node management routines
*
* Copyright (c) 2000-2006, Ericsson AB
- * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -46,7 +46,8 @@
* struct tipc_node - TIPC node structure
* @addr: network address of node
* @lock: spinlock governing access to structure
- * @next: pointer to next node in sorted list of cluster's nodes
+ * @hash: links to adjacent nodes in unsorted hash chain
+ * @list: links to adjacent nodes in sorted list of cluster's nodes
* @nsub: list of "node down" subscriptions monitoring node
* @active_links: pointers to active links to node
* @links: pointers to all links to node
@@ -69,7 +70,8 @@
struct tipc_node {
u32 addr;
spinlock_t lock;
- struct tipc_node *next;
+ struct hlist_node hash;
+ struct list_head list;
struct list_head nsub;
struct link *active_links[2];
struct link *links[MAX_BEARERS];
@@ -90,27 +92,35 @@ struct tipc_node {
} bclink;
};
+#define NODE_HTABLE_SIZE 512
+extern struct list_head tipc_node_list;
+
+/*
+ * A trivial power-of-two bitmask technique is used for speed, since this
+ * operation is done for every incoming TIPC packet. The number of hash table
+ * entries has been chosen so that no hash chain exceeds 8 nodes and will
+ * usually be much smaller (typically only a single node).
+ */
+static inline unsigned int tipc_hashfn(u32 addr)
+{
+ return addr & (NODE_HTABLE_SIZE - 1);
+}
+
extern u32 tipc_own_tag;
+struct tipc_node *tipc_node_find(u32 addr);
struct tipc_node *tipc_node_create(u32 addr);
void tipc_node_delete(struct tipc_node *n_ptr);
-struct tipc_node *tipc_node_attach_link(struct link *l_ptr);
+void tipc_node_attach_link(struct tipc_node *n_ptr, struct link *l_ptr);
void tipc_node_detach_link(struct tipc_node *n_ptr, struct link *l_ptr);
void tipc_node_link_down(struct tipc_node *n_ptr, struct link *l_ptr);
void tipc_node_link_up(struct tipc_node *n_ptr, struct link *l_ptr);
-int tipc_node_has_active_links(struct tipc_node *n_ptr);
-int tipc_node_has_redundant_links(struct tipc_node *n_ptr);
+int tipc_node_active_links(struct tipc_node *n_ptr);
+int tipc_node_redundant_links(struct tipc_node *n_ptr);
int tipc_node_is_up(struct tipc_node *n_ptr);
struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space);
struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space);
-static inline struct tipc_node *tipc_node_find(u32 addr)
-{
- if (likely(in_own_cluster(addr)))
- return tipc_net.nodes[tipc_node(addr)];
- return NULL;
-}
-
static inline void tipc_node_lock(struct tipc_node *n_ptr)
{
spin_lock_bh(&n_ptr->lock);
diff --git a/net/tipc/node_subscr.c b/net/tipc/node_subscr.c
index 018a55332d9..c3c2815ae63 100644
--- a/net/tipc/node_subscr.c
+++ b/net/tipc/node_subscr.c
@@ -2,7 +2,7 @@
* net/tipc/node_subscr.c: TIPC "node down" subscription handling
*
* Copyright (c) 1995-2006, Ericsson AB
- * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -76,3 +76,22 @@ void tipc_nodesub_unsubscribe(struct tipc_node_subscr *node_sub)
list_del_init(&node_sub->nodesub_list);
tipc_node_unlock(node_sub->node);
}
+
+/**
+ * tipc_nodesub_notify - notify subscribers that a node is unreachable
+ *
+ * Note: node is locked by caller
+ */
+
+void tipc_nodesub_notify(struct tipc_node *node)
+{
+ struct tipc_node_subscr *ns;
+
+ list_for_each_entry(ns, &node->nsub, nodesub_list) {
+ if (ns->handle_node_down) {
+ tipc_k_signal((Handler)ns->handle_node_down,
+ (unsigned long)ns->usr_handle);
+ ns->handle_node_down = NULL;
+ }
+ }
+}
diff --git a/net/tipc/node_subscr.h b/net/tipc/node_subscr.h
index 006ed739f51..4bc2ca0867a 100644
--- a/net/tipc/node_subscr.h
+++ b/net/tipc/node_subscr.h
@@ -2,7 +2,7 @@
* net/tipc/node_subscr.h: Include file for TIPC "node down" subscription handling
*
* Copyright (c) 1995-2006, Ericsson AB
- * Copyright (c) 2005, Wind River Systems
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -59,5 +59,6 @@ struct tipc_node_subscr {
void tipc_nodesub_subscribe(struct tipc_node_subscr *node_sub, u32 addr,
void *usr_handle, net_ev_handler handle_down);
void tipc_nodesub_unsubscribe(struct tipc_node_subscr *node_sub);
+void tipc_nodesub_notify(struct tipc_node *node);
#endif
diff --git a/net/tipc/port.c b/net/tipc/port.c
index 067bab2a0b9..6ff78f9c7d6 100644
--- a/net/tipc/port.c
+++ b/net/tipc/port.c
@@ -2,7 +2,7 @@
* net/tipc/port.c: TIPC port code
*
* Copyright (c) 1992-2007, Ericsson AB
- * Copyright (c) 2004-2008, Wind River Systems
+ * Copyright (c) 2004-2008, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -54,33 +54,19 @@ static DEFINE_SPINLOCK(queue_lock);
static LIST_HEAD(ports);
static void port_handle_node_down(unsigned long ref);
-static struct sk_buff *port_build_self_abort_msg(struct port *, u32 err);
-static struct sk_buff *port_build_peer_abort_msg(struct port *, u32 err);
+static struct sk_buff *port_build_self_abort_msg(struct tipc_port *, u32 err);
+static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *, u32 err);
static void port_timeout(unsigned long ref);
-static u32 port_peernode(struct port *p_ptr)
+static u32 port_peernode(struct tipc_port *p_ptr)
{
- return msg_destnode(&p_ptr->publ.phdr);
+ return msg_destnode(&p_ptr->phdr);
}
-static u32 port_peerport(struct port *p_ptr)
+static u32 port_peerport(struct tipc_port *p_ptr)
{
- return msg_destport(&p_ptr->publ.phdr);
-}
-
-static u32 port_out_seqno(struct port *p_ptr)
-{
- return msg_transp_seqno(&p_ptr->publ.phdr);
-}
-
-static void port_incr_out_seqno(struct port *p_ptr)
-{
- struct tipc_msg *m = &p_ptr->publ.phdr;
-
- if (likely(!msg_routed(m)))
- return;
- msg_set_transp_seqno(m, (msg_transp_seqno(m) + 1));
+ return msg_destport(&p_ptr->phdr);
}
/**
@@ -94,7 +80,7 @@ int tipc_multicast(u32 ref, struct tipc_name_seq const *seq,
struct sk_buff *buf;
struct sk_buff *ibuf = NULL;
struct port_list dports = {0, NULL, };
- struct port *oport = tipc_port_deref(ref);
+ struct tipc_port *oport = tipc_port_deref(ref);
int ext_targets;
int res;
@@ -103,7 +89,7 @@ int tipc_multicast(u32 ref, struct tipc_name_seq const *seq,
/* Create multicast message */
- hdr = &oport->publ.phdr;
+ hdr = &oport->phdr;
msg_set_type(hdr, TIPC_MCAST_MSG);
msg_set_nametype(hdr, seq->type);
msg_set_namelower(hdr, seq->lower);
@@ -211,7 +197,7 @@ struct tipc_port *tipc_createport_raw(void *usr_handle,
void (*wakeup)(struct tipc_port *),
const u32 importance)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct tipc_msg *msg;
u32 ref;
@@ -220,21 +206,19 @@ struct tipc_port *tipc_createport_raw(void *usr_handle,
warn("Port creation failed, no memory\n");
return NULL;
}
- ref = tipc_ref_acquire(p_ptr, &p_ptr->publ.lock);
+ ref = tipc_ref_acquire(p_ptr, &p_ptr->lock);
if (!ref) {
warn("Port creation failed, reference table exhausted\n");
kfree(p_ptr);
return NULL;
}
- p_ptr->publ.usr_handle = usr_handle;
- p_ptr->publ.max_pkt = MAX_PKT_DEFAULT;
- p_ptr->publ.ref = ref;
- msg = &p_ptr->publ.phdr;
+ p_ptr->usr_handle = usr_handle;
+ p_ptr->max_pkt = MAX_PKT_DEFAULT;
+ p_ptr->ref = ref;
+ msg = &p_ptr->phdr;
tipc_msg_init(msg, importance, TIPC_NAMED_MSG, LONG_H_SIZE, 0);
msg_set_origport(msg, ref);
- p_ptr->last_in_seqno = 41;
- p_ptr->sent = 1;
INIT_LIST_HEAD(&p_ptr->wait_list);
INIT_LIST_HEAD(&p_ptr->subscription.nodesub_list);
p_ptr->dispatcher = dispatcher;
@@ -246,12 +230,12 @@ struct tipc_port *tipc_createport_raw(void *usr_handle,
INIT_LIST_HEAD(&p_ptr->port_list);
list_add_tail(&p_ptr->port_list, &ports);
spin_unlock_bh(&tipc_port_list_lock);
- return &(p_ptr->publ);
+ return p_ptr;
}
int tipc_deleteport(u32 ref)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct sk_buff *buf = NULL;
tipc_withdraw(ref, 0, NULL);
@@ -263,7 +247,7 @@ int tipc_deleteport(u32 ref)
tipc_port_unlock(p_ptr);
k_cancel_timer(&p_ptr->timer);
- if (p_ptr->publ.connected) {
+ if (p_ptr->connected) {
buf = port_build_peer_abort_msg(p_ptr, TIPC_ERR_NO_PORT);
tipc_nodesub_unsubscribe(&p_ptr->subscription);
}
@@ -279,14 +263,14 @@ int tipc_deleteport(u32 ref)
return 0;
}
-static int port_unreliable(struct port *p_ptr)
+static int port_unreliable(struct tipc_port *p_ptr)
{
- return msg_src_droppable(&p_ptr->publ.phdr);
+ return msg_src_droppable(&p_ptr->phdr);
}
int tipc_portunreliable(u32 ref, unsigned int *isunreliable)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
p_ptr = tipc_port_lock(ref);
if (!p_ptr)
@@ -298,24 +282,24 @@ int tipc_portunreliable(u32 ref, unsigned int *isunreliable)
int tipc_set_portunreliable(u32 ref, unsigned int isunreliable)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
p_ptr = tipc_port_lock(ref);
if (!p_ptr)
return -EINVAL;
- msg_set_src_droppable(&p_ptr->publ.phdr, (isunreliable != 0));
+ msg_set_src_droppable(&p_ptr->phdr, (isunreliable != 0));
tipc_port_unlock(p_ptr);
return 0;
}
-static int port_unreturnable(struct port *p_ptr)
+static int port_unreturnable(struct tipc_port *p_ptr)
{
- return msg_dest_droppable(&p_ptr->publ.phdr);
+ return msg_dest_droppable(&p_ptr->phdr);
}
int tipc_portunreturnable(u32 ref, unsigned int *isunrejectable)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
p_ptr = tipc_port_lock(ref);
if (!p_ptr)
@@ -327,12 +311,12 @@ int tipc_portunreturnable(u32 ref, unsigned int *isunrejectable)
int tipc_set_portunreturnable(u32 ref, unsigned int isunrejectable)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
p_ptr = tipc_port_lock(ref);
if (!p_ptr)
return -EINVAL;
- msg_set_dest_droppable(&p_ptr->publ.phdr, (isunrejectable != 0));
+ msg_set_dest_droppable(&p_ptr->phdr, (isunrejectable != 0));
tipc_port_unlock(p_ptr);
return 0;
}
@@ -345,7 +329,7 @@ int tipc_set_portunreturnable(u32 ref, unsigned int isunrejectable)
static struct sk_buff *port_build_proto_msg(u32 destport, u32 destnode,
u32 origport, u32 orignode,
u32 usr, u32 type, u32 err,
- u32 seqno, u32 ack)
+ u32 ack)
{
struct sk_buff *buf;
struct tipc_msg *msg;
@@ -358,7 +342,6 @@ static struct sk_buff *port_build_proto_msg(u32 destport, u32 destnode,
msg_set_destport(msg, destport);
msg_set_origport(msg, origport);
msg_set_orignode(msg, orignode);
- msg_set_transp_seqno(msg, seqno);
msg_set_msgcnt(msg, ack);
}
return buf;
@@ -413,10 +396,10 @@ int tipc_reject_msg(struct sk_buff *buf, u32 err)
/* send self-abort message when rejecting on a connected port */
if (msg_connected(msg)) {
struct sk_buff *abuf = NULL;
- struct port *p_ptr = tipc_port_lock(msg_destport(msg));
+ struct tipc_port *p_ptr = tipc_port_lock(msg_destport(msg));
if (p_ptr) {
- if (p_ptr->publ.connected)
+ if (p_ptr->connected)
abuf = port_build_self_abort_msg(p_ptr, err);
tipc_port_unlock(p_ptr);
}
@@ -429,7 +412,7 @@ int tipc_reject_msg(struct sk_buff *buf, u32 err)
return data_sz;
}
-int tipc_port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr,
+int tipc_port_reject_sections(struct tipc_port *p_ptr, struct tipc_msg *hdr,
struct iovec const *msg_sect, u32 num_sect,
int err)
{
@@ -446,13 +429,13 @@ int tipc_port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr,
static void port_timeout(unsigned long ref)
{
- struct port *p_ptr = tipc_port_lock(ref);
+ struct tipc_port *p_ptr = tipc_port_lock(ref);
struct sk_buff *buf = NULL;
if (!p_ptr)
return;
- if (!p_ptr->publ.connected) {
+ if (!p_ptr->connected) {
tipc_port_unlock(p_ptr);
return;
}
@@ -463,14 +446,12 @@ static void port_timeout(unsigned long ref)
} else {
buf = port_build_proto_msg(port_peerport(p_ptr),
port_peernode(p_ptr),
- p_ptr->publ.ref,
+ p_ptr->ref,
tipc_own_addr,
CONN_MANAGER,
CONN_PROBE,
TIPC_OK,
- port_out_seqno(p_ptr),
0);
- port_incr_out_seqno(p_ptr);
p_ptr->probing_state = PROBING;
k_start_timer(&p_ptr->timer, p_ptr->probing_interval);
}
@@ -481,7 +462,7 @@ static void port_timeout(unsigned long ref)
static void port_handle_node_down(unsigned long ref)
{
- struct port *p_ptr = tipc_port_lock(ref);
+ struct tipc_port *p_ptr = tipc_port_lock(ref);
struct sk_buff *buf = NULL;
if (!p_ptr)
@@ -492,73 +473,71 @@ static void port_handle_node_down(unsigned long ref)
}
-static struct sk_buff *port_build_self_abort_msg(struct port *p_ptr, u32 err)
+static struct sk_buff *port_build_self_abort_msg(struct tipc_port *p_ptr, u32 err)
{
- u32 imp = msg_importance(&p_ptr->publ.phdr);
+ u32 imp = msg_importance(&p_ptr->phdr);
- if (!p_ptr->publ.connected)
+ if (!p_ptr->connected)
return NULL;
if (imp < TIPC_CRITICAL_IMPORTANCE)
imp++;
- return port_build_proto_msg(p_ptr->publ.ref,
+ return port_build_proto_msg(p_ptr->ref,
tipc_own_addr,
port_peerport(p_ptr),
port_peernode(p_ptr),
imp,
TIPC_CONN_MSG,
err,
- p_ptr->last_in_seqno + 1,
0);
}
-static struct sk_buff *port_build_peer_abort_msg(struct port *p_ptr, u32 err)
+static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *p_ptr, u32 err)
{
- u32 imp = msg_importance(&p_ptr->publ.phdr);
+ u32 imp = msg_importance(&p_ptr->phdr);
- if (!p_ptr->publ.connected)
+ if (!p_ptr->connected)
return NULL;
if (imp < TIPC_CRITICAL_IMPORTANCE)
imp++;
return port_build_proto_msg(port_peerport(p_ptr),
port_peernode(p_ptr),
- p_ptr->publ.ref,
+ p_ptr->ref,
tipc_own_addr,
imp,
TIPC_CONN_MSG,
err,
- port_out_seqno(p_ptr),
0);
}
void tipc_port_recv_proto_msg(struct sk_buff *buf)
{
struct tipc_msg *msg = buf_msg(buf);
- struct port *p_ptr = tipc_port_lock(msg_destport(msg));
+ struct tipc_port *p_ptr = tipc_port_lock(msg_destport(msg));
u32 err = TIPC_OK;
struct sk_buff *r_buf = NULL;
struct sk_buff *abort_buf = NULL;
if (!p_ptr) {
err = TIPC_ERR_NO_PORT;
- } else if (p_ptr->publ.connected) {
+ } else if (p_ptr->connected) {
if ((port_peernode(p_ptr) != msg_orignode(msg)) ||
(port_peerport(p_ptr) != msg_origport(msg))) {
err = TIPC_ERR_NO_PORT;
} else if (msg_type(msg) == CONN_ACK) {
int wakeup = tipc_port_congested(p_ptr) &&
- p_ptr->publ.congested &&
+ p_ptr->congested &&
p_ptr->wakeup;
p_ptr->acked += msg_msgcnt(msg);
if (tipc_port_congested(p_ptr))
goto exit;
- p_ptr->publ.congested = 0;
+ p_ptr->congested = 0;
if (!wakeup)
goto exit;
- p_ptr->wakeup(&p_ptr->publ);
+ p_ptr->wakeup(p_ptr);
goto exit;
}
- } else if (p_ptr->publ.published) {
+ } else if (p_ptr->published) {
err = TIPC_ERR_NO_PORT;
}
if (err) {
@@ -569,7 +548,6 @@ void tipc_port_recv_proto_msg(struct sk_buff *buf)
TIPC_HIGH_IMPORTANCE,
TIPC_CONN_MSG,
err,
- 0,
0);
goto exit;
}
@@ -583,11 +561,9 @@ void tipc_port_recv_proto_msg(struct sk_buff *buf)
CONN_MANAGER,
CONN_PROBE_REPLY,
TIPC_OK,
- port_out_seqno(p_ptr),
0);
}
p_ptr->probing_state = CONFIRMED;
- port_incr_out_seqno(p_ptr);
exit:
if (p_ptr)
tipc_port_unlock(p_ptr);
@@ -596,29 +572,29 @@ exit:
buf_discard(buf);
}
-static void port_print(struct port *p_ptr, struct print_buf *buf, int full_id)
+static void port_print(struct tipc_port *p_ptr, struct print_buf *buf, int full_id)
{
struct publication *publ;
if (full_id)
tipc_printf(buf, "<%u.%u.%u:%u>:",
tipc_zone(tipc_own_addr), tipc_cluster(tipc_own_addr),
- tipc_node(tipc_own_addr), p_ptr->publ.ref);
+ tipc_node(tipc_own_addr), p_ptr->ref);
else
- tipc_printf(buf, "%-10u:", p_ptr->publ.ref);
+ tipc_printf(buf, "%-10u:", p_ptr->ref);
- if (p_ptr->publ.connected) {
+ if (p_ptr->connected) {
u32 dport = port_peerport(p_ptr);
u32 destnode = port_peernode(p_ptr);
tipc_printf(buf, " connected to <%u.%u.%u:%u>",
tipc_zone(destnode), tipc_cluster(destnode),
tipc_node(destnode), dport);
- if (p_ptr->publ.conn_type != 0)
+ if (p_ptr->conn_type != 0)
tipc_printf(buf, " via {%u,%u}",
- p_ptr->publ.conn_type,
- p_ptr->publ.conn_instance);
- } else if (p_ptr->publ.published) {
+ p_ptr->conn_type,
+ p_ptr->conn_instance);
+ } else if (p_ptr->published) {
tipc_printf(buf, " bound to");
list_for_each_entry(publ, &p_ptr->publications, pport_list) {
if (publ->lower == publ->upper)
@@ -639,7 +615,7 @@ struct sk_buff *tipc_port_get_ports(void)
struct sk_buff *buf;
struct tlv_desc *rep_tlv;
struct print_buf pb;
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
int str_len;
buf = tipc_cfg_reply_alloc(TLV_SPACE(MAX_PORT_QUERY));
@@ -650,9 +626,9 @@ struct sk_buff *tipc_port_get_ports(void)
tipc_printbuf_init(&pb, TLV_DATA(rep_tlv), MAX_PORT_QUERY);
spin_lock_bh(&tipc_port_list_lock);
list_for_each_entry(p_ptr, &ports, port_list) {
- spin_lock_bh(p_ptr->publ.lock);
+ spin_lock_bh(p_ptr->lock);
port_print(p_ptr, &pb, 0);
- spin_unlock_bh(p_ptr->publ.lock);
+ spin_unlock_bh(p_ptr->lock);
}
spin_unlock_bh(&tipc_port_list_lock);
str_len = tipc_printbuf_validate(&pb);
@@ -665,12 +641,12 @@ struct sk_buff *tipc_port_get_ports(void)
void tipc_port_reinit(void)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct tipc_msg *msg;
spin_lock_bh(&tipc_port_list_lock);
list_for_each_entry(p_ptr, &ports, port_list) {
- msg = &p_ptr->publ.phdr;
+ msg = &p_ptr->phdr;
if (msg_orignode(msg) == tipc_own_addr)
break;
msg_set_prevnode(msg, tipc_own_addr);
@@ -695,7 +671,7 @@ static void port_dispatcher_sigh(void *dummy)
spin_unlock_bh(&queue_lock);
while (buf) {
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct user_port *up_ptr;
struct tipc_portid orig;
struct tipc_name_seq dseq;
@@ -720,8 +696,8 @@ static void port_dispatcher_sigh(void *dummy)
orig.node = msg_orignode(msg);
up_ptr = p_ptr->user_port;
usr_handle = up_ptr->usr_handle;
- connected = p_ptr->publ.connected;
- published = p_ptr->publ.published;
+ connected = p_ptr->connected;
+ published = p_ptr->published;
if (unlikely(msg_errcode(msg)))
goto err;
@@ -732,6 +708,7 @@ static void port_dispatcher_sigh(void *dummy)
tipc_conn_msg_event cb = up_ptr->conn_msg_cb;
u32 peer_port = port_peerport(p_ptr);
u32 peer_node = port_peernode(p_ptr);
+ u32 dsz;
tipc_port_unlock(p_ptr);
if (unlikely(!cb))
@@ -742,13 +719,14 @@ static void port_dispatcher_sigh(void *dummy)
} else if ((msg_origport(msg) != peer_port) ||
(msg_orignode(msg) != peer_node))
goto reject;
- if (unlikely(++p_ptr->publ.conn_unacked >=
- TIPC_FLOW_CONTROL_WIN))
+ dsz = msg_data_sz(msg);
+ if (unlikely(dsz &&
+ (++p_ptr->conn_unacked >=
+ TIPC_FLOW_CONTROL_WIN)))
tipc_acknowledge(dref,
- p_ptr->publ.conn_unacked);
+ p_ptr->conn_unacked);
skb_pull(buf, msg_hdr_sz(msg));
- cb(usr_handle, dref, &buf, msg_data(msg),
- msg_data_sz(msg));
+ cb(usr_handle, dref, &buf, msg_data(msg), dsz);
break;
}
case TIPC_DIRECT_MSG:{
@@ -872,7 +850,7 @@ static u32 port_dispatcher(struct tipc_port *dummy, struct sk_buff *buf)
static void port_wakeup_sh(unsigned long ref)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct user_port *up_ptr;
tipc_continue_event cb = NULL;
void *uh = NULL;
@@ -898,14 +876,14 @@ static void port_wakeup(struct tipc_port *p_ptr)
void tipc_acknowledge(u32 ref, u32 ack)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct sk_buff *buf = NULL;
p_ptr = tipc_port_lock(ref);
if (!p_ptr)
return;
- if (p_ptr->publ.connected) {
- p_ptr->publ.conn_unacked -= ack;
+ if (p_ptr->connected) {
+ p_ptr->conn_unacked -= ack;
buf = port_build_proto_msg(port_peerport(p_ptr),
port_peernode(p_ptr),
ref,
@@ -913,7 +891,6 @@ void tipc_acknowledge(u32 ref, u32 ack)
CONN_MANAGER,
CONN_ACK,
TIPC_OK,
- port_out_seqno(p_ptr),
ack);
}
tipc_port_unlock(p_ptr);
@@ -936,14 +913,14 @@ int tipc_createport(void *usr_handle,
u32 *portref)
{
struct user_port *up_ptr;
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
up_ptr = kmalloc(sizeof(*up_ptr), GFP_ATOMIC);
if (!up_ptr) {
warn("Port creation failed, no memory\n");
return -ENOMEM;
}
- p_ptr = (struct port *)tipc_createport_raw(NULL, port_dispatcher,
+ p_ptr = (struct tipc_port *)tipc_createport_raw(NULL, port_dispatcher,
port_wakeup, importance);
if (!p_ptr) {
kfree(up_ptr);
@@ -952,7 +929,7 @@ int tipc_createport(void *usr_handle,
p_ptr->user_port = up_ptr;
up_ptr->usr_handle = usr_handle;
- up_ptr->ref = p_ptr->publ.ref;
+ up_ptr->ref = p_ptr->ref;
up_ptr->err_cb = error_cb;
up_ptr->named_err_cb = named_error_cb;
up_ptr->conn_err_cb = conn_error_cb;
@@ -960,26 +937,26 @@ int tipc_createport(void *usr_handle,
up_ptr->named_msg_cb = named_msg_cb;
up_ptr->conn_msg_cb = conn_msg_cb;
up_ptr->continue_event_cb = continue_event_cb;
- *portref = p_ptr->publ.ref;
+ *portref = p_ptr->ref;
tipc_port_unlock(p_ptr);
return 0;
}
int tipc_portimportance(u32 ref, unsigned int *importance)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
p_ptr = tipc_port_lock(ref);
if (!p_ptr)
return -EINVAL;
- *importance = (unsigned int)msg_importance(&p_ptr->publ.phdr);
+ *importance = (unsigned int)msg_importance(&p_ptr->phdr);
tipc_port_unlock(p_ptr);
return 0;
}
int tipc_set_portimportance(u32 ref, unsigned int imp)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
if (imp > TIPC_CRITICAL_IMPORTANCE)
return -EINVAL;
@@ -987,7 +964,7 @@ int tipc_set_portimportance(u32 ref, unsigned int imp)
p_ptr = tipc_port_lock(ref);
if (!p_ptr)
return -EINVAL;
- msg_set_importance(&p_ptr->publ.phdr, (u32)imp);
+ msg_set_importance(&p_ptr->phdr, (u32)imp);
tipc_port_unlock(p_ptr);
return 0;
}
@@ -995,7 +972,7 @@ int tipc_set_portimportance(u32 ref, unsigned int imp)
int tipc_publish(u32 ref, unsigned int scope, struct tipc_name_seq const *seq)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct publication *publ;
u32 key;
int res = -EINVAL;
@@ -1004,7 +981,7 @@ int tipc_publish(u32 ref, unsigned int scope, struct tipc_name_seq const *seq)
if (!p_ptr)
return -EINVAL;
- if (p_ptr->publ.connected)
+ if (p_ptr->connected)
goto exit;
if (seq->lower > seq->upper)
goto exit;
@@ -1016,11 +993,11 @@ int tipc_publish(u32 ref, unsigned int scope, struct tipc_name_seq const *seq)
goto exit;
}
publ = tipc_nametbl_publish(seq->type, seq->lower, seq->upper,
- scope, p_ptr->publ.ref, key);
+ scope, p_ptr->ref, key);
if (publ) {
list_add(&publ->pport_list, &p_ptr->publications);
p_ptr->pub_count++;
- p_ptr->publ.published = 1;
+ p_ptr->published = 1;
res = 0;
}
exit:
@@ -1030,7 +1007,7 @@ exit:
int tipc_withdraw(u32 ref, unsigned int scope, struct tipc_name_seq const *seq)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct publication *publ;
struct publication *tpubl;
int res = -EINVAL;
@@ -1063,37 +1040,36 @@ int tipc_withdraw(u32 ref, unsigned int scope, struct tipc_name_seq const *seq)
}
}
if (list_empty(&p_ptr->publications))
- p_ptr->publ.published = 0;
+ p_ptr->published = 0;
tipc_port_unlock(p_ptr);
return res;
}
int tipc_connect2port(u32 ref, struct tipc_portid const *peer)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct tipc_msg *msg;
int res = -EINVAL;
p_ptr = tipc_port_lock(ref);
if (!p_ptr)
return -EINVAL;
- if (p_ptr->publ.published || p_ptr->publ.connected)
+ if (p_ptr->published || p_ptr->connected)
goto exit;
if (!peer->ref)
goto exit;
- msg = &p_ptr->publ.phdr;
+ msg = &p_ptr->phdr;
msg_set_destnode(msg, peer->node);
msg_set_destport(msg, peer->ref);
msg_set_orignode(msg, tipc_own_addr);
- msg_set_origport(msg, p_ptr->publ.ref);
- msg_set_transp_seqno(msg, 42);
+ msg_set_origport(msg, p_ptr->ref);
msg_set_type(msg, TIPC_CONN_MSG);
msg_set_hdr_sz(msg, SHORT_H_SIZE);
p_ptr->probing_interval = PROBING_INTERVAL;
p_ptr->probing_state = CONFIRMED;
- p_ptr->publ.connected = 1;
+ p_ptr->connected = 1;
k_start_timer(&p_ptr->timer, p_ptr->probing_interval);
tipc_nodesub_subscribe(&p_ptr->subscription, peer->node,
@@ -1102,7 +1078,7 @@ int tipc_connect2port(u32 ref, struct tipc_portid const *peer)
res = 0;
exit:
tipc_port_unlock(p_ptr);
- p_ptr->publ.max_pkt = tipc_link_get_max_pkt(peer->node, ref);
+ p_ptr->max_pkt = tipc_link_get_max_pkt(peer->node, ref);
return res;
}
@@ -1120,7 +1096,7 @@ int tipc_disconnect_port(struct tipc_port *tp_ptr)
tp_ptr->connected = 0;
/* let timer expire on it's own to avoid deadlock! */
tipc_nodesub_unsubscribe(
- &((struct port *)tp_ptr)->subscription);
+ &((struct tipc_port *)tp_ptr)->subscription);
res = 0;
} else {
res = -ENOTCONN;
@@ -1135,7 +1111,7 @@ int tipc_disconnect_port(struct tipc_port *tp_ptr)
int tipc_disconnect(u32 ref)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
int res;
p_ptr = tipc_port_lock(ref);
@@ -1151,15 +1127,15 @@ int tipc_disconnect(u32 ref)
*/
int tipc_shutdown(u32 ref)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct sk_buff *buf = NULL;
p_ptr = tipc_port_lock(ref);
if (!p_ptr)
return -EINVAL;
- if (p_ptr->publ.connected) {
- u32 imp = msg_importance(&p_ptr->publ.phdr);
+ if (p_ptr->connected) {
+ u32 imp = msg_importance(&p_ptr->phdr);
if (imp < TIPC_CRITICAL_IMPORTANCE)
imp++;
buf = port_build_proto_msg(port_peerport(p_ptr),
@@ -1169,7 +1145,6 @@ int tipc_shutdown(u32 ref)
imp,
TIPC_CONN_MSG,
TIPC_CONN_SHUTDOWN,
- port_out_seqno(p_ptr),
0);
}
tipc_port_unlock(p_ptr);
@@ -1182,13 +1157,13 @@ int tipc_shutdown(u32 ref)
* message for this node.
*/
-static int tipc_port_recv_sections(struct port *sender, unsigned int num_sect,
+static int tipc_port_recv_sections(struct tipc_port *sender, unsigned int num_sect,
struct iovec const *msg_sect)
{
struct sk_buff *buf;
int res;
- res = tipc_msg_build(&sender->publ.phdr, msg_sect, num_sect,
+ res = tipc_msg_build(&sender->phdr, msg_sect, num_sect,
MAX_MSG_SIZE, !sender->user_port, &buf);
if (likely(buf))
tipc_port_recv_msg(buf);
@@ -1201,15 +1176,15 @@ static int tipc_port_recv_sections(struct port *sender, unsigned int num_sect,
int tipc_send(u32 ref, unsigned int num_sect, struct iovec const *msg_sect)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
u32 destnode;
int res;
p_ptr = tipc_port_deref(ref);
- if (!p_ptr || !p_ptr->publ.connected)
+ if (!p_ptr || !p_ptr->connected)
return -EINVAL;
- p_ptr->publ.congested = 1;
+ p_ptr->congested = 1;
if (!tipc_port_congested(p_ptr)) {
destnode = port_peernode(p_ptr);
if (likely(destnode != tipc_own_addr))
@@ -1219,14 +1194,14 @@ int tipc_send(u32 ref, unsigned int num_sect, struct iovec const *msg_sect)
res = tipc_port_recv_sections(p_ptr, num_sect, msg_sect);
if (likely(res != -ELINKCONG)) {
- port_incr_out_seqno(p_ptr);
- p_ptr->publ.congested = 0;
- p_ptr->sent++;
+ p_ptr->congested = 0;
+ if (res > 0)
+ p_ptr->sent++;
return res;
}
}
if (port_unreliable(p_ptr)) {
- p_ptr->publ.congested = 0;
+ p_ptr->congested = 0;
/* Just calculate msg length and return */
return tipc_msg_calc_data_size(msg_sect, num_sect);
}
@@ -1240,17 +1215,17 @@ int tipc_send(u32 ref, unsigned int num_sect, struct iovec const *msg_sect)
int tipc_send2name(u32 ref, struct tipc_name const *name, unsigned int domain,
unsigned int num_sect, struct iovec const *msg_sect)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct tipc_msg *msg;
u32 destnode = domain;
u32 destport;
int res;
p_ptr = tipc_port_deref(ref);
- if (!p_ptr || p_ptr->publ.connected)
+ if (!p_ptr || p_ptr->connected)
return -EINVAL;
- msg = &p_ptr->publ.phdr;
+ msg = &p_ptr->phdr;
msg_set_type(msg, TIPC_NAMED_MSG);
msg_set_orignode(msg, tipc_own_addr);
msg_set_origport(msg, ref);
@@ -1263,13 +1238,17 @@ int tipc_send2name(u32 ref, struct tipc_name const *name, unsigned int domain,
msg_set_destport(msg, destport);
if (likely(destport)) {
- p_ptr->sent++;
if (likely(destnode == tipc_own_addr))
- return tipc_port_recv_sections(p_ptr, num_sect, msg_sect);
- res = tipc_link_send_sections_fast(p_ptr, msg_sect, num_sect,
- destnode);
- if (likely(res != -ELINKCONG))
+ res = tipc_port_recv_sections(p_ptr, num_sect,
+ msg_sect);
+ else
+ res = tipc_link_send_sections_fast(p_ptr, msg_sect,
+ num_sect, destnode);
+ if (likely(res != -ELINKCONG)) {
+ if (res > 0)
+ p_ptr->sent++;
return res;
+ }
if (port_unreliable(p_ptr)) {
/* Just calculate msg length and return */
return tipc_msg_calc_data_size(msg_sect, num_sect);
@@ -1287,27 +1266,32 @@ int tipc_send2name(u32 ref, struct tipc_name const *name, unsigned int domain,
int tipc_send2port(u32 ref, struct tipc_portid const *dest,
unsigned int num_sect, struct iovec const *msg_sect)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct tipc_msg *msg;
int res;
p_ptr = tipc_port_deref(ref);
- if (!p_ptr || p_ptr->publ.connected)
+ if (!p_ptr || p_ptr->connected)
return -EINVAL;
- msg = &p_ptr->publ.phdr;
+ msg = &p_ptr->phdr;
msg_set_type(msg, TIPC_DIRECT_MSG);
msg_set_orignode(msg, tipc_own_addr);
msg_set_origport(msg, ref);
msg_set_destnode(msg, dest->node);
msg_set_destport(msg, dest->ref);
msg_set_hdr_sz(msg, DIR_MSG_H_SIZE);
- p_ptr->sent++;
+
if (dest->node == tipc_own_addr)
- return tipc_port_recv_sections(p_ptr, num_sect, msg_sect);
- res = tipc_link_send_sections_fast(p_ptr, msg_sect, num_sect, dest->node);
- if (likely(res != -ELINKCONG))
+ res = tipc_port_recv_sections(p_ptr, num_sect, msg_sect);
+ else
+ res = tipc_link_send_sections_fast(p_ptr, msg_sect, num_sect,
+ dest->node);
+ if (likely(res != -ELINKCONG)) {
+ if (res > 0)
+ p_ptr->sent++;
return res;
+ }
if (port_unreliable(p_ptr)) {
/* Just calculate msg length and return */
return tipc_msg_calc_data_size(msg_sect, num_sect);
@@ -1322,15 +1306,15 @@ int tipc_send2port(u32 ref, struct tipc_portid const *dest,
int tipc_send_buf2port(u32 ref, struct tipc_portid const *dest,
struct sk_buff *buf, unsigned int dsz)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct tipc_msg *msg;
int res;
- p_ptr = (struct port *)tipc_ref_deref(ref);
- if (!p_ptr || p_ptr->publ.connected)
+ p_ptr = (struct tipc_port *)tipc_ref_deref(ref);
+ if (!p_ptr || p_ptr->connected)
return -EINVAL;
- msg = &p_ptr->publ.phdr;
+ msg = &p_ptr->phdr;
msg_set_type(msg, TIPC_DIRECT_MSG);
msg_set_orignode(msg, tipc_own_addr);
msg_set_origport(msg, ref);
@@ -1343,12 +1327,16 @@ int tipc_send_buf2port(u32 ref, struct tipc_portid const *dest,
skb_push(buf, DIR_MSG_H_SIZE);
skb_copy_to_linear_data(buf, msg, DIR_MSG_H_SIZE);
- p_ptr->sent++;
+
if (dest->node == tipc_own_addr)
- return tipc_port_recv_msg(buf);
- res = tipc_send_buf_fast(buf, dest->node);
- if (likely(res != -ELINKCONG))
+ res = tipc_port_recv_msg(buf);
+ else
+ res = tipc_send_buf_fast(buf, dest->node);
+ if (likely(res != -ELINKCONG)) {
+ if (res > 0)
+ p_ptr->sent++;
return res;
+ }
if (port_unreliable(p_ptr))
return dsz;
return -ELINKCONG;
diff --git a/net/tipc/port.h b/net/tipc/port.h
index 8e84b989949..87b9424ae0e 100644
--- a/net/tipc/port.h
+++ b/net/tipc/port.h
@@ -2,7 +2,7 @@
* net/tipc/port.h: Include file for TIPC port code
*
* Copyright (c) 1994-2007, Ericsson AB
- * Copyright (c) 2004-2007, Wind River Systems
+ * Copyright (c) 2004-2007, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -95,7 +95,7 @@ struct user_port {
};
/**
- * struct tipc_port - TIPC port info available to socket API
+ * struct tipc_port - TIPC port structure
* @usr_handle: pointer to additional user-defined information about port
* @lock: pointer to spinlock for controlling access to port
* @connected: non-zero if port is currently connected to a peer port
@@ -107,43 +107,33 @@ struct user_port {
* @max_pkt: maximum packet size "hint" used when building messages sent by port
* @ref: unique reference to port in TIPC object registry
* @phdr: preformatted message header used when sending messages
- */
-struct tipc_port {
- void *usr_handle;
- spinlock_t *lock;
- int connected;
- u32 conn_type;
- u32 conn_instance;
- u32 conn_unacked;
- int published;
- u32 congested;
- u32 max_pkt;
- u32 ref;
- struct tipc_msg phdr;
-};
-
-/**
- * struct port - TIPC port structure
- * @publ: TIPC port info available to privileged users
* @port_list: adjacent ports in TIPC's global list of ports
* @dispatcher: ptr to routine which handles received messages
* @wakeup: ptr to routine to call when port is no longer congested
* @user_port: ptr to user port associated with port (if any)
* @wait_list: adjacent ports in list of ports waiting on link congestion
* @waiting_pkts:
- * @sent:
- * @acked:
+ * @sent: # of non-empty messages sent by port
+ * @acked: # of non-empty message acknowledgements from connected port's peer
* @publications: list of publications for port
* @pub_count: total # of publications port has made during its lifetime
* @probing_state:
* @probing_interval:
- * @last_in_seqno:
* @timer_ref:
* @subscription: "node down" subscription used to terminate failed connections
*/
-
-struct port {
- struct tipc_port publ;
+struct tipc_port {
+ void *usr_handle;
+ spinlock_t *lock;
+ int connected;
+ u32 conn_type;
+ u32 conn_instance;
+ u32 conn_unacked;
+ int published;
+ u32 congested;
+ u32 max_pkt;
+ u32 ref;
+ struct tipc_msg phdr;
struct list_head port_list;
u32 (*dispatcher)(struct tipc_port *, struct sk_buff *);
void (*wakeup)(struct tipc_port *);
@@ -156,7 +146,6 @@ struct port {
u32 pub_count;
u32 probing_state;
u32 probing_interval;
- u32 last_in_seqno;
struct timer_list timer;
struct tipc_node_subscr subscription;
};
@@ -230,7 +219,7 @@ int tipc_send_buf2port(u32 portref, struct tipc_portid const *dest,
int tipc_multicast(u32 portref, struct tipc_name_seq const *seq,
unsigned int section_count, struct iovec const *msg);
-int tipc_port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr,
+int tipc_port_reject_sections(struct tipc_port *p_ptr, struct tipc_msg *hdr,
struct iovec const *msg_sect, u32 num_sect,
int err);
struct sk_buff *tipc_port_get_ports(void);
@@ -242,9 +231,9 @@ void tipc_port_reinit(void);
* tipc_port_lock - lock port instance referred to and return its pointer
*/
-static inline struct port *tipc_port_lock(u32 ref)
+static inline struct tipc_port *tipc_port_lock(u32 ref)
{
- return (struct port *)tipc_ref_lock(ref);
+ return (struct tipc_port *)tipc_ref_lock(ref);
}
/**
@@ -253,27 +242,27 @@ static inline struct port *tipc_port_lock(u32 ref)
* Can use pointer instead of tipc_ref_unlock() since port is already locked.
*/
-static inline void tipc_port_unlock(struct port *p_ptr)
+static inline void tipc_port_unlock(struct tipc_port *p_ptr)
{
- spin_unlock_bh(p_ptr->publ.lock);
+ spin_unlock_bh(p_ptr->lock);
}
-static inline struct port *tipc_port_deref(u32 ref)
+static inline struct tipc_port *tipc_port_deref(u32 ref)
{
- return (struct port *)tipc_ref_deref(ref);
+ return (struct tipc_port *)tipc_ref_deref(ref);
}
-static inline u32 tipc_peer_port(struct port *p_ptr)
+static inline u32 tipc_peer_port(struct tipc_port *p_ptr)
{
- return msg_destport(&p_ptr->publ.phdr);
+ return msg_destport(&p_ptr->phdr);
}
-static inline u32 tipc_peer_node(struct port *p_ptr)
+static inline u32 tipc_peer_node(struct tipc_port *p_ptr)
{
- return msg_destnode(&p_ptr->publ.phdr);
+ return msg_destnode(&p_ptr->phdr);
}
-static inline int tipc_port_congested(struct port *p_ptr)
+static inline int tipc_port_congested(struct tipc_port *p_ptr)
{
return (p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2);
}
@@ -284,7 +273,7 @@ static inline int tipc_port_congested(struct port *p_ptr)
static inline int tipc_port_recv_msg(struct sk_buff *buf)
{
- struct port *p_ptr;
+ struct tipc_port *p_ptr;
struct tipc_msg *msg = buf_msg(buf);
u32 destport = msg_destport(msg);
u32 dsz = msg_data_sz(msg);
@@ -299,7 +288,7 @@ static inline int tipc_port_recv_msg(struct sk_buff *buf)
/* validate destination & pass to port, otherwise reject message */
p_ptr = tipc_port_lock(destport);
if (likely(p_ptr)) {
- if (likely(p_ptr->publ.connected)) {
+ if (likely(p_ptr->connected)) {
if ((unlikely(msg_origport(msg) != tipc_peer_port(p_ptr))) ||
(unlikely(msg_orignode(msg) != tipc_peer_node(p_ptr))) ||
(unlikely(!msg_connected(msg)))) {
@@ -308,7 +297,7 @@ static inline int tipc_port_recv_msg(struct sk_buff *buf)
goto reject;
}
}
- err = p_ptr->dispatcher(&p_ptr->publ, buf);
+ err = p_ptr->dispatcher(p_ptr, buf);
tipc_port_unlock(p_ptr);
if (likely(!err))
return dsz;
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 2b02a3a8031..29d94d53198 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2,7 +2,7 @@
* net/tipc/socket.c: TIPC socket API
*
* Copyright (c) 2001-2007, Ericsson AB
- * Copyright (c) 2004-2008, Wind River Systems
+ * Copyright (c) 2004-2008, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -58,6 +58,9 @@ struct tipc_sock {
#define tipc_sk(sk) ((struct tipc_sock *)(sk))
#define tipc_sk_port(sk) ((struct tipc_port *)(tipc_sk(sk)->p))
+#define tipc_rx_ready(sock) (!skb_queue_empty(&sock->sk->sk_receive_queue) || \
+ (sock->state == SS_DISCONNECTING))
+
static int backlog_rcv(struct sock *sk, struct sk_buff *skb);
static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf);
static void wakeupdispatch(struct tipc_port *tport);
@@ -241,7 +244,6 @@ static int tipc_create(struct net *net, struct socket *sock, int protocol,
tipc_set_portunreliable(tp_ptr->ref, 1);
}
- atomic_inc(&tipc_user_count);
return 0;
}
@@ -290,7 +292,7 @@ static int release(struct socket *sock)
if (buf == NULL)
break;
atomic_dec(&tipc_queue_size);
- if (TIPC_SKB_CB(buf)->handle != msg_data(buf_msg(buf)))
+ if (TIPC_SKB_CB(buf)->handle != 0)
buf_discard(buf);
else {
if ((sock->state == SS_CONNECTING) ||
@@ -321,7 +323,6 @@ static int release(struct socket *sock)
sock_put(sk);
sock->sk = NULL;
- atomic_dec(&tipc_user_count);
return res;
}
@@ -495,6 +496,8 @@ static int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m)
if (likely(dest->addr.name.name.type != TIPC_CFG_SRV))
return -EACCES;
+ if (!m->msg_iovlen || (m->msg_iov[0].iov_len < sizeof(hdr)))
+ return -EMSGSIZE;
if (copy_from_user(&hdr, m->msg_iov[0].iov_base, sizeof(hdr)))
return -EFAULT;
if ((ntohs(hdr.tcm_type) & 0xC000) && (!capable(CAP_NET_ADMIN)))
@@ -911,15 +914,13 @@ static int recv_msg(struct kiocb *iocb, struct socket *sock,
struct tipc_port *tport = tipc_sk_port(sk);
struct sk_buff *buf;
struct tipc_msg *msg;
+ long timeout;
unsigned int sz;
u32 err;
int res;
/* Catch invalid receive requests */
- if (m->msg_iovlen != 1)
- return -EOPNOTSUPP; /* Don't do multiple iovec entries yet */
-
if (unlikely(!buf_len))
return -EINVAL;
@@ -930,6 +931,7 @@ static int recv_msg(struct kiocb *iocb, struct socket *sock,
goto exit;
}
+ timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
restart:
/* Look for a message in receive queue; wait if necessary */
@@ -939,17 +941,15 @@ restart:
res = -ENOTCONN;
goto exit;
}
- if (flags & MSG_DONTWAIT) {
- res = -EWOULDBLOCK;
+ if (timeout <= 0L) {
+ res = timeout ? timeout : -EWOULDBLOCK;
goto exit;
}
release_sock(sk);
- res = wait_event_interruptible(*sk_sleep(sk),
- (!skb_queue_empty(&sk->sk_receive_queue) ||
- (sock->state == SS_DISCONNECTING)));
+ timeout = wait_event_interruptible_timeout(*sk_sleep(sk),
+ tipc_rx_ready(sock),
+ timeout);
lock_sock(sk);
- if (res)
- goto exit;
}
/* Look at first message in receive queue */
@@ -991,11 +991,10 @@ restart:
sz = buf_len;
m->msg_flags |= MSG_TRUNC;
}
- if (unlikely(copy_to_user(m->msg_iov->iov_base, msg_data(msg),
- sz))) {
- res = -EFAULT;
+ res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg),
+ m->msg_iov, sz);
+ if (res)
goto exit;
- }
res = sz;
} else {
if ((sock->state == SS_READY) ||
@@ -1038,19 +1037,15 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock,
struct tipc_port *tport = tipc_sk_port(sk);
struct sk_buff *buf;
struct tipc_msg *msg;
+ long timeout;
unsigned int sz;
int sz_to_copy, target, needed;
int sz_copied = 0;
- char __user *crs = m->msg_iov->iov_base;
- unsigned char *buf_crs;
u32 err;
int res = 0;
/* Catch invalid receive attempts */
- if (m->msg_iovlen != 1)
- return -EOPNOTSUPP; /* Don't do multiple iovec entries yet */
-
if (unlikely(!buf_len))
return -EINVAL;
@@ -1063,7 +1058,7 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock,
}
target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len);
-
+ timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
restart:
/* Look for a message in receive queue; wait if necessary */
@@ -1073,17 +1068,15 @@ restart:
res = -ENOTCONN;
goto exit;
}
- if (flags & MSG_DONTWAIT) {
- res = -EWOULDBLOCK;
+ if (timeout <= 0L) {
+ res = timeout ? timeout : -EWOULDBLOCK;
goto exit;
}
release_sock(sk);
- res = wait_event_interruptible(*sk_sleep(sk),
- (!skb_queue_empty(&sk->sk_receive_queue) ||
- (sock->state == SS_DISCONNECTING)));
+ timeout = wait_event_interruptible_timeout(*sk_sleep(sk),
+ tipc_rx_ready(sock),
+ timeout);
lock_sock(sk);
- if (res)
- goto exit;
}
/* Look at first message in receive queue */
@@ -1112,24 +1105,25 @@ restart:
/* Capture message data (if valid) & compute return value (always) */
if (!err) {
- buf_crs = (unsigned char *)(TIPC_SKB_CB(buf)->handle);
- sz = (unsigned char *)msg + msg_size(msg) - buf_crs;
+ u32 offset = (u32)(unsigned long)(TIPC_SKB_CB(buf)->handle);
+ sz -= offset;
needed = (buf_len - sz_copied);
sz_to_copy = (sz <= needed) ? sz : needed;
- if (unlikely(copy_to_user(crs, buf_crs, sz_to_copy))) {
- res = -EFAULT;
+
+ res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset,
+ m->msg_iov, sz_to_copy);
+ if (res)
goto exit;
- }
+
sz_copied += sz_to_copy;
if (sz_to_copy < sz) {
if (!(flags & MSG_PEEK))
- TIPC_SKB_CB(buf)->handle = buf_crs + sz_to_copy;
+ TIPC_SKB_CB(buf)->handle =
+ (void *)(unsigned long)(offset + sz_to_copy);
goto exit;
}
-
- crs += sz_to_copy;
} else {
if (sz_copied != 0)
goto exit; /* can't add error msg to valid data */
@@ -1256,7 +1250,7 @@ static u32 filter_rcv(struct sock *sk, struct sk_buff *buf)
/* Enqueue message (finally!) */
- TIPC_SKB_CB(buf)->handle = msg_data(msg);
+ TIPC_SKB_CB(buf)->handle = 0;
atomic_inc(&tipc_queue_size);
__skb_queue_tail(&sk->sk_receive_queue, buf);
@@ -1608,7 +1602,7 @@ restart:
buf = __skb_dequeue(&sk->sk_receive_queue);
if (buf) {
atomic_dec(&tipc_queue_size);
- if (TIPC_SKB_CB(buf)->handle != msg_data(buf_msg(buf))) {
+ if (TIPC_SKB_CB(buf)->handle != 0) {
buf_discard(buf);
goto restart;
}
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index ca04479c3d4..aae9eae1340 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -2,7 +2,7 @@
* net/tipc/subscr.c: TIPC network topology service
*
* Copyright (c) 2000-2006, Ericsson AB
- * Copyright (c) 2005-2007, Wind River Systems
+ * Copyright (c) 2005-2007, 2010-2011, Wind River Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -160,7 +160,7 @@ void tipc_subscr_report_overlap(struct subscription *sub,
static void subscr_timeout(struct subscription *sub)
{
- struct port *server_port;
+ struct tipc_port *server_port;
/* Validate server port reference (in case subscriber is terminating) */
@@ -472,8 +472,6 @@ static void subscr_named_msg_event(void *usr_handle,
struct tipc_portid const *orig,
struct tipc_name_seq const *dest)
{
- static struct iovec msg_sect = {NULL, 0};
-
struct subscriber *subscriber;
u32 server_port_ref;
@@ -508,7 +506,7 @@ static void subscr_named_msg_event(void *usr_handle,
/* Lock server port (& save lock address for future use) */
- subscriber->lock = tipc_port_lock(subscriber->port_ref)->publ.lock;
+ subscriber->lock = tipc_port_lock(subscriber->port_ref)->lock;
/* Add subscriber to topology server's subscriber list */
@@ -523,7 +521,7 @@ static void subscr_named_msg_event(void *usr_handle,
/* Send an ACK- to complete connection handshaking */
- tipc_send(server_port_ref, 1, &msg_sect);
+ tipc_send(server_port_ref, 0, NULL);
/* Handle optional subscription request */
@@ -542,7 +540,6 @@ int tipc_subscr_start(void)
spin_lock_init(&topsrv.lock);
INIT_LIST_HEAD(&topsrv.subscriber_list);
- spin_lock_bh(&topsrv.lock);
res = tipc_createport(NULL,
TIPC_CRITICAL_IMPORTANCE,
NULL,
@@ -563,12 +560,10 @@ int tipc_subscr_start(void)
goto failed;
}
- spin_unlock_bh(&topsrv.lock);
return 0;
failed:
err("Failed to create subscription service\n");
- spin_unlock_bh(&topsrv.lock);
return res;
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index dd419d28620..de870184e45 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1171,7 +1171,7 @@ restart:
newsk->sk_type = sk->sk_type;
init_peercred(newsk);
newu = unix_sk(newsk);
- newsk->sk_wq = &newu->peer_wq;
+ RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
otheru = unix_sk(other);
/* copy address information from listening to new sock*/
@@ -1475,6 +1475,12 @@ restart:
goto out_free;
}
+ if (sk_filter(other, skb) < 0) {
+ /* Toss the packet but do not return any error to the sender */
+ err = len;
+ goto out_free;
+ }
+
unix_state_lock(other);
err = -EPERM;
if (!unix_may_send(sk, other))
@@ -1561,7 +1567,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
struct sock *sk = sock->sk;
struct sock *other = NULL;
- struct sockaddr_un *sunaddr = msg->msg_name;
int err, size;
struct sk_buff *skb;
int sent = 0;
@@ -1584,7 +1589,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
goto out_err;
} else {
- sunaddr = NULL;
err = -ENOTCONN;
other = unix_peer(sk);
if (!other)
@@ -1724,7 +1728,11 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
msg->msg_namelen = 0;
- mutex_lock(&u->readlock);
+ err = mutex_lock_interruptible(&u->readlock);
+ if (err) {
+ err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
+ goto out;
+ }
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (!skb) {
@@ -1864,7 +1872,11 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
memset(&tmp_scm, 0, sizeof(tmp_scm));
}
- mutex_lock(&u->readlock);
+ err = mutex_lock_interruptible(&u->readlock);
+ if (err) {
+ err = sock_intr_errno(timeo);
+ goto out;
+ }
do {
int chunk;
@@ -1895,11 +1907,12 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
timeo = unix_stream_data_wait(sk, timeo);
- if (signal_pending(current)) {
+ if (signal_pending(current)
+ || mutex_lock_interruptible(&u->readlock)) {
err = sock_intr_errno(timeo);
goto out;
}
- mutex_lock(&u->readlock);
+
continue;
unlock:
unix_state_unlock(sk);
@@ -1978,36 +1991,38 @@ static int unix_shutdown(struct socket *sock, int mode)
mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
- if (mode) {
- unix_state_lock(sk);
- sk->sk_shutdown |= mode;
- other = unix_peer(sk);
- if (other)
- sock_hold(other);
- unix_state_unlock(sk);
- sk->sk_state_change(sk);
-
- if (other &&
- (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
-
- int peer_mode = 0;
-
- if (mode&RCV_SHUTDOWN)
- peer_mode |= SEND_SHUTDOWN;
- if (mode&SEND_SHUTDOWN)
- peer_mode |= RCV_SHUTDOWN;
- unix_state_lock(other);
- other->sk_shutdown |= peer_mode;
- unix_state_unlock(other);
- other->sk_state_change(other);
- if (peer_mode == SHUTDOWN_MASK)
- sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
- else if (peer_mode & RCV_SHUTDOWN)
- sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
- }
- if (other)
- sock_put(other);
+ if (!mode)
+ return 0;
+
+ unix_state_lock(sk);
+ sk->sk_shutdown |= mode;
+ other = unix_peer(sk);
+ if (other)
+ sock_hold(other);
+ unix_state_unlock(sk);
+ sk->sk_state_change(sk);
+
+ if (other &&
+ (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
+
+ int peer_mode = 0;
+
+ if (mode&RCV_SHUTDOWN)
+ peer_mode |= SEND_SHUTDOWN;
+ if (mode&SEND_SHUTDOWN)
+ peer_mode |= RCV_SHUTDOWN;
+ unix_state_lock(other);
+ other->sk_shutdown |= peer_mode;
+ unix_state_unlock(other);
+ other->sk_state_change(other);
+ if (peer_mode == SHUTDOWN_MASK)
+ sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
+ else if (peer_mode & RCV_SHUTDOWN)
+ sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
}
+ if (other)
+ sock_put(other);
+
return 0;
}
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index 74944a2dd43..788a12c1eb5 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -59,8 +59,6 @@
#include <asm/uaccess.h> /* copy_to/from_user */
#include <linux/init.h> /* __initfunc et al. */
-#define KMEM_SAFETYZONE 8
-
#define DEV_TO_SLAVE(dev) (*((struct net_device **)netdev_priv(dev)))
/*
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index d0ee29063e5..1f1ef70f34f 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -95,7 +95,7 @@ config CFG80211_DEBUGFS
If unsure, say N.
config CFG80211_INTERNAL_REGDB
- bool "use statically compiled regulatory rules database" if EMBEDDED
+ bool "use statically compiled regulatory rules database" if EXPERT
default n
depends on CFG80211
---help---
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 7f1f4ec4904..0bf169bb770 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -805,11 +805,11 @@ int cfg80211_wext_siwfreq(struct net_device *dev,
return freq;
if (freq == 0)
return -EINVAL;
- wdev_lock(wdev);
mutex_lock(&rdev->devlist_mtx);
+ wdev_lock(wdev);
err = cfg80211_set_freq(rdev, wdev, freq, NL80211_CHAN_NO_HT);
- mutex_unlock(&rdev->devlist_mtx);
wdev_unlock(wdev);
+ mutex_unlock(&rdev->devlist_mtx);
return err;
default:
return -EOPNOTSUPP;
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index 55187c8f642..406207515b5 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -27,9 +27,19 @@
#include <net/sock.h>
#include <net/x25.h>
-/*
- * Parse a set of facilities into the facilities structures. Unrecognised
- * facilities are written to the debug log file.
+/**
+ * x25_parse_facilities - Parse facilities from skb into the facilities structs
+ *
+ * @skb: sk_buff to parse
+ * @facilities: Regular facilities, updated as facilities are found
+ * @dte_facs: ITU DTE facilities, updated as DTE facilities are found
+ * @vc_fac_mask: mask is updated with all facilities found
+ *
+ * Return codes:
+ * -1 - Parsing error, caller should drop call and clean up
+ * 0 - Parse OK, this skb has no facilities
+ * >0 - Parse OK, returns the length of the facilities header
+ *
*/
int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
struct x25_dte_facilities *dte_facs, unsigned long *vc_fac_mask)
@@ -62,7 +72,7 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
switch (*p & X25_FAC_CLASS_MASK) {
case X25_FAC_CLASS_A:
if (len < 2)
- return 0;
+ return -1;
switch (*p) {
case X25_FAC_REVERSE:
if((p[1] & 0x81) == 0x81) {
@@ -107,7 +117,7 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
break;
case X25_FAC_CLASS_B:
if (len < 3)
- return 0;
+ return -1;
switch (*p) {
case X25_FAC_PACKET_SIZE:
facilities->pacsize_in = p[1];
@@ -130,7 +140,7 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
break;
case X25_FAC_CLASS_C:
if (len < 4)
- return 0;
+ return -1;
printk(KERN_DEBUG "X.25: unknown facility %02X, "
"values %02X, %02X, %02X\n",
p[0], p[1], p[2], p[3]);
@@ -139,18 +149,18 @@ int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
break;
case X25_FAC_CLASS_D:
if (len < p[1] + 2)
- return 0;
+ return -1;
switch (*p) {
case X25_FAC_CALLING_AE:
if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1)
- return 0;
+ return -1;
dte_facs->calling_len = p[2];
memcpy(dte_facs->calling_ae, &p[3], p[1] - 1);
*vc_fac_mask |= X25_MASK_CALLING_AE;
break;
case X25_FAC_CALLED_AE:
if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1)
- return 0;
+ return -1;
dte_facs->called_len = p[2];
memcpy(dte_facs->called_ae, &p[3], p[1] - 1);
*vc_fac_mask |= X25_MASK_CALLED_AE;
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index f729f022be6..15de65f0471 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -91,10 +91,10 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
{
struct x25_address source_addr, dest_addr;
int len;
+ struct x25_sock *x25 = x25_sk(sk);
switch (frametype) {
case X25_CALL_ACCEPTED: {
- struct x25_sock *x25 = x25_sk(sk);
x25_stop_timer(sk);
x25->condition = 0x00;
@@ -113,14 +113,16 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
&dest_addr);
if (len > 0)
skb_pull(skb, len);
+ else if (len < 0)
+ goto out_clear;
len = x25_parse_facilities(skb, &x25->facilities,
&x25->dte_facilities,
&x25->vc_facil_mask);
if (len > 0)
skb_pull(skb, len);
- else
- return -1;
+ else if (len < 0)
+ goto out_clear;
/*
* Copy any Call User Data.
*/
@@ -144,6 +146,12 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
}
return 0;
+
+out_clear:
+ x25_write_internal(sk, X25_CLEAR_REQUEST);
+ x25->state = X25_STATE_2;
+ x25_start_t23timer(sk);
+ return 0;
}
/*
diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
index 4cbc942f762..21306928d47 100644
--- a/net/x25/x25_link.c
+++ b/net/x25/x25_link.c
@@ -396,9 +396,12 @@ void __exit x25_link_free(void)
write_lock_bh(&x25_neigh_list_lock);
list_for_each_safe(entry, tmp, &x25_neigh_list) {
+ struct net_device *dev;
+
nb = list_entry(entry, struct x25_neigh, node);
+ dev = nb->dev;
__x25_remove_neigh(nb);
- dev_put(nb->dev);
+ dev_put(dev);
}
write_unlock_bh(&x25_neigh_list_lock);
}
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index c631047e1b2..aa429eefe91 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -4,7 +4,7 @@
obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
xfrm_input.o xfrm_output.o xfrm_algo.o \
- xfrm_sysctl.o
+ xfrm_sysctl.o xfrm_replay.o
obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
obj-$(CONFIG_XFRM_USER) += xfrm_user.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 8b4d6e3246e..58064d9e565 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -618,21 +618,21 @@ static int xfrm_alg_name_match(const struct xfrm_algo_desc *entry,
(entry->compat && !strcmp(name, entry->compat)));
}
-struct xfrm_algo_desc *xfrm_aalg_get_byname(char *name, int probe)
+struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe)
{
return xfrm_find_algo(&xfrm_aalg_list, xfrm_alg_name_match, name,
probe);
}
EXPORT_SYMBOL_GPL(xfrm_aalg_get_byname);
-struct xfrm_algo_desc *xfrm_ealg_get_byname(char *name, int probe)
+struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe)
{
return xfrm_find_algo(&xfrm_ealg_list, xfrm_alg_name_match, name,
probe);
}
EXPORT_SYMBOL_GPL(xfrm_ealg_get_byname);
-struct xfrm_algo_desc *xfrm_calg_get_byname(char *name, int probe)
+struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe)
{
return xfrm_find_algo(&xfrm_calg_list, xfrm_alg_name_match, name,
probe);
@@ -654,7 +654,7 @@ static int xfrm_aead_name_match(const struct xfrm_algo_desc *entry,
!strcmp(name, entry->name);
}
-struct xfrm_algo_desc *xfrm_aead_get_byname(char *name, int icv_len, int probe)
+struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len, int probe)
{
struct xfrm_aead_name data = {
.name = name,
diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h
index 8e69533d231..7199d78b2aa 100644
--- a/net/xfrm/xfrm_hash.h
+++ b/net/xfrm/xfrm_hash.h
@@ -4,29 +4,32 @@
#include <linux/xfrm.h>
#include <linux/socket.h>
-static inline unsigned int __xfrm4_addr_hash(xfrm_address_t *addr)
+static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr)
{
return ntohl(addr->a4);
}
-static inline unsigned int __xfrm6_addr_hash(xfrm_address_t *addr)
+static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr)
{
return ntohl(addr->a6[2] ^ addr->a6[3]);
}
-static inline unsigned int __xfrm4_daddr_saddr_hash(xfrm_address_t *daddr, xfrm_address_t *saddr)
+static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr)
{
u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4;
return ntohl((__force __be32)sum);
}
-static inline unsigned int __xfrm6_daddr_saddr_hash(xfrm_address_t *daddr, xfrm_address_t *saddr)
+static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr)
{
return ntohl(daddr->a6[2] ^ daddr->a6[3] ^
saddr->a6[2] ^ saddr->a6[3]);
}
-static inline unsigned int __xfrm_dst_hash(xfrm_address_t *daddr, xfrm_address_t *saddr,
+static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
u32 reqid, unsigned short family,
unsigned int hmask)
{
@@ -42,8 +45,8 @@ static inline unsigned int __xfrm_dst_hash(xfrm_address_t *daddr, xfrm_address_t
return (h ^ (h >> 16)) & hmask;
}
-static inline unsigned __xfrm_src_hash(xfrm_address_t *daddr,
- xfrm_address_t *saddr,
+static inline unsigned __xfrm_src_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
unsigned short family,
unsigned int hmask)
{
@@ -60,8 +63,8 @@ static inline unsigned __xfrm_src_hash(xfrm_address_t *daddr,
}
static inline unsigned int
-__xfrm_spi_hash(xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family,
- unsigned int hmask)
+__xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto,
+ unsigned short family, unsigned int hmask)
{
unsigned int h = (__force u32)spi ^ proto;
switch (family) {
@@ -80,10 +83,11 @@ static inline unsigned int __idx_hash(u32 index, unsigned int hmask)
return (index ^ (index >> 8)) & hmask;
}
-static inline unsigned int __sel_hash(struct xfrm_selector *sel, unsigned short family, unsigned int hmask)
+static inline unsigned int __sel_hash(const struct xfrm_selector *sel,
+ unsigned short family, unsigned int hmask)
{
- xfrm_address_t *daddr = &sel->daddr;
- xfrm_address_t *saddr = &sel->saddr;
+ const xfrm_address_t *daddr = &sel->daddr;
+ const xfrm_address_t *saddr = &sel->saddr;
unsigned int h = 0;
switch (family) {
@@ -107,7 +111,9 @@ static inline unsigned int __sel_hash(struct xfrm_selector *sel, unsigned short
return h & hmask;
}
-static inline unsigned int __addr_hash(xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, unsigned int hmask)
+static inline unsigned int __addr_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ unsigned short family, unsigned int hmask)
{
unsigned int h = 0;
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 45f1c98d4fc..872065ca7f8 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -107,6 +107,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
struct net *net = dev_net(skb->dev);
int err;
__be32 seq;
+ __be32 seq_hi;
struct xfrm_state *x;
xfrm_address_t *daddr;
struct xfrm_mode *inner_mode;
@@ -118,7 +119,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
if (encap_type < 0) {
async = 1;
x = xfrm_input_state(skb);
- seq = XFRM_SKB_CB(skb)->seq.input;
+ seq = XFRM_SKB_CB(skb)->seq.input.low;
goto resume;
}
@@ -172,7 +173,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
goto drop_unlock;
}
- if (x->props.replay_window && xfrm_replay_check(x, skb, seq)) {
+ if (x->props.replay_window && x->repl->check(x, skb, seq)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR);
goto drop_unlock;
}
@@ -184,7 +185,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
spin_unlock(&x->lock);
- XFRM_SKB_CB(skb)->seq.input = seq;
+ seq_hi = htonl(xfrm_replay_seqhi(x, seq));
+
+ XFRM_SKB_CB(skb)->seq.input.low = seq;
+ XFRM_SKB_CB(skb)->seq.input.hi = seq_hi;
nexthdr = x->type->input(x, skb);
@@ -206,8 +210,7 @@ resume:
/* only the first xfrm gets the encap type */
encap_type = 0;
- if (x->props.replay_window)
- xfrm_replay_advance(x, seq);
+ x->repl->advance(x, seq);
x->curlft.bytes += skb->len;
x->curlft.packets++;
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 64f2ae1fdc1..1aba03f449c 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -67,17 +67,10 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
goto error;
}
- if (x->type->flags & XFRM_TYPE_REPLAY_PROT) {
- XFRM_SKB_CB(skb)->seq.output = ++x->replay.oseq;
- if (unlikely(x->replay.oseq == 0)) {
- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATESEQERROR);
- x->replay.oseq--;
- xfrm_audit_state_replay_overflow(x, skb);
- err = -EOVERFLOW;
- goto error;
- }
- if (xfrm_aevent_is_on(net))
- xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);
+ err = x->repl->overflow(x, skb);
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATESEQERROR);
+ goto error;
}
x->curlft.bytes += skb->len;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 8b3ef404c79..1ba0258b49c 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -50,37 +50,40 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
static void xfrm_init_pmtu(struct dst_entry *dst);
static int stale_bundle(struct dst_entry *dst);
-static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *xdst,
- struct flowi *fl, int family, int strict);
+static int xfrm_bundle_ok(struct xfrm_dst *xdst, int family);
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
int dir);
static inline int
-__xfrm4_selector_match(struct xfrm_selector *sel, struct flowi *fl)
+__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
- return addr_match(&fl->fl4_dst, &sel->daddr, sel->prefixlen_d) &&
- addr_match(&fl->fl4_src, &sel->saddr, sel->prefixlen_s) &&
- !((xfrm_flowi_dport(fl) ^ sel->dport) & sel->dport_mask) &&
- !((xfrm_flowi_sport(fl) ^ sel->sport) & sel->sport_mask) &&
- (fl->proto == sel->proto || !sel->proto) &&
- (fl->oif == sel->ifindex || !sel->ifindex);
+ const struct flowi4 *fl4 = &fl->u.ip4;
+
+ return addr_match(&fl4->daddr, &sel->daddr, sel->prefixlen_d) &&
+ addr_match(&fl4->saddr, &sel->saddr, sel->prefixlen_s) &&
+ !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) &&
+ !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) &&
+ (fl4->flowi4_proto == sel->proto || !sel->proto) &&
+ (fl4->flowi4_oif == sel->ifindex || !sel->ifindex);
}
static inline int
-__xfrm6_selector_match(struct xfrm_selector *sel, struct flowi *fl)
+__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
- return addr_match(&fl->fl6_dst, &sel->daddr, sel->prefixlen_d) &&
- addr_match(&fl->fl6_src, &sel->saddr, sel->prefixlen_s) &&
- !((xfrm_flowi_dport(fl) ^ sel->dport) & sel->dport_mask) &&
- !((xfrm_flowi_sport(fl) ^ sel->sport) & sel->sport_mask) &&
- (fl->proto == sel->proto || !sel->proto) &&
- (fl->oif == sel->ifindex || !sel->ifindex);
+ const struct flowi6 *fl6 = &fl->u.ip6;
+
+ return addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) &&
+ addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) &&
+ !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) &&
+ !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) &&
+ (fl6->flowi6_proto == sel->proto || !sel->proto) &&
+ (fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
}
-int xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl,
- unsigned short family)
+int xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
+ unsigned short family)
{
switch (family) {
case AF_INET:
@@ -92,8 +95,8 @@ int xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl,
}
static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
- xfrm_address_t *saddr,
- xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr,
int family)
{
struct xfrm_policy_afinfo *afinfo;
@@ -311,7 +314,9 @@ static inline unsigned int idx_hash(struct net *net, u32 index)
return __idx_hash(index, net->xfrm.policy_idx_hmask);
}
-static struct hlist_head *policy_hash_bysel(struct net *net, struct xfrm_selector *sel, unsigned short family, int dir)
+static struct hlist_head *policy_hash_bysel(struct net *net,
+ const struct xfrm_selector *sel,
+ unsigned short family, int dir)
{
unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
unsigned int hash = __sel_hash(sel, family, hmask);
@@ -321,7 +326,10 @@ static struct hlist_head *policy_hash_bysel(struct net *net, struct xfrm_selecto
net->xfrm.policy_bydst[dir].table + hash);
}
-static struct hlist_head *policy_hash_direct(struct net *net, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, int dir)
+static struct hlist_head *policy_hash_direct(struct net *net,
+ const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ unsigned short family, int dir)
{
unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
unsigned int hash = __addr_hash(daddr, saddr, family, hmask);
@@ -864,32 +872,33 @@ EXPORT_SYMBOL(xfrm_policy_walk_done);
*
* Returns 0 if policy found, else an -errno.
*/
-static int xfrm_policy_match(struct xfrm_policy *pol, struct flowi *fl,
+static int xfrm_policy_match(const struct xfrm_policy *pol,
+ const struct flowi *fl,
u8 type, u16 family, int dir)
{
- struct xfrm_selector *sel = &pol->selector;
+ const struct xfrm_selector *sel = &pol->selector;
int match, ret = -ESRCH;
if (pol->family != family ||
- (fl->mark & pol->mark.m) != pol->mark.v ||
+ (fl->flowi_mark & pol->mark.m) != pol->mark.v ||
pol->type != type)
return ret;
match = xfrm_selector_match(sel, fl, family);
if (match)
- ret = security_xfrm_policy_lookup(pol->security, fl->secid,
+ ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
dir);
return ret;
}
static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
- struct flowi *fl,
+ const struct flowi *fl,
u16 family, u8 dir)
{
int err;
struct xfrm_policy *pol, *ret;
- xfrm_address_t *daddr, *saddr;
+ const xfrm_address_t *daddr, *saddr;
struct hlist_node *entry;
struct hlist_head *chain;
u32 priority = ~0U;
@@ -941,7 +950,7 @@ fail:
}
static struct xfrm_policy *
-__xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir)
+__xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
{
#ifdef CONFIG_XFRM_SUB_POLICY
struct xfrm_policy *pol;
@@ -954,7 +963,7 @@ __xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir)
}
static struct flow_cache_object *
-xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
+xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
u8 dir, struct flow_cache_object *old_obj, void *ctx)
{
struct xfrm_policy *pol;
@@ -990,7 +999,8 @@ static inline int policy_to_flow_dir(int dir)
}
}
-static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
+static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir,
+ const struct flowi *fl)
{
struct xfrm_policy *pol;
@@ -1006,7 +1016,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struc
goto out;
}
err = security_xfrm_policy_lookup(pol->security,
- fl->secid,
+ fl->flowi_secid,
policy_to_flow_dir(dir));
if (!err)
xfrm_pol_hold(pol);
@@ -1098,7 +1108,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
return 0;
}
-static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
+static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
{
struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC);
@@ -1157,9 +1167,8 @@ xfrm_get_saddr(struct net *net, xfrm_address_t *local, xfrm_address_t *remote,
/* Resolve list of templates for the flow, given policy. */
static int
-xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl,
- struct xfrm_state **xfrm,
- unsigned short family)
+xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
+ struct xfrm_state **xfrm, unsigned short family)
{
struct net *net = xp_net(policy);
int nx;
@@ -1214,9 +1223,8 @@ fail:
}
static int
-xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
- struct xfrm_state **xfrm,
- unsigned short family)
+xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
+ struct xfrm_state **xfrm, unsigned short family)
{
struct xfrm_state *tp[XFRM_MAX_DEPTH];
struct xfrm_state **tpp = (npols > 1) ? tp : xfrm;
@@ -1256,7 +1264,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
* still valid.
*/
-static inline int xfrm_get_tos(struct flowi *fl, int family)
+static inline int xfrm_get_tos(const struct flowi *fl, int family)
{
struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
int tos;
@@ -1340,10 +1348,13 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
default:
BUG();
}
- xdst = dst_alloc(dst_ops) ?: ERR_PTR(-ENOBUFS);
+ xdst = dst_alloc(dst_ops, 0);
xfrm_policy_put_afinfo(afinfo);
- xdst->flo.ops = &xfrm_bundle_fc_ops;
+ if (likely(xdst))
+ xdst->flo.ops = &xfrm_bundle_fc_ops;
+ else
+ xdst = ERR_PTR(-ENOBUFS);
return xdst;
}
@@ -1366,7 +1377,7 @@ static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
}
static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- struct flowi *fl)
+ const struct flowi *fl)
{
struct xfrm_policy_afinfo *afinfo =
xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
@@ -1389,7 +1400,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
struct xfrm_state **xfrm, int nx,
- struct flowi *fl,
+ const struct flowi *fl,
struct dst_entry *dst)
{
struct net *net = xp_net(policy);
@@ -1505,7 +1516,7 @@ free_dst:
}
static int inline
-xfrm_dst_alloc_copy(void **target, void *src, int size)
+xfrm_dst_alloc_copy(void **target, const void *src, int size)
{
if (!*target) {
*target = kmalloc(size, GFP_ATOMIC);
@@ -1517,7 +1528,7 @@ xfrm_dst_alloc_copy(void **target, void *src, int size)
}
static int inline
-xfrm_dst_update_parent(struct dst_entry *dst, struct xfrm_selector *sel)
+xfrm_dst_update_parent(struct dst_entry *dst, const struct xfrm_selector *sel)
{
#ifdef CONFIG_XFRM_SUB_POLICY
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
@@ -1529,7 +1540,7 @@ xfrm_dst_update_parent(struct dst_entry *dst, struct xfrm_selector *sel)
}
static int inline
-xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl)
+xfrm_dst_update_origin(struct dst_entry *dst, const struct flowi *fl)
{
#ifdef CONFIG_XFRM_SUB_POLICY
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
@@ -1539,7 +1550,7 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl)
#endif
}
-static int xfrm_expand_policies(struct flowi *fl, u16 family,
+static int xfrm_expand_policies(const struct flowi *fl, u16 family,
struct xfrm_policy **pols,
int *num_pols, int *num_xfrms)
{
@@ -1585,7 +1596,7 @@ static int xfrm_expand_policies(struct flowi *fl, u16 family,
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
- struct flowi *fl, u16 family,
+ const struct flowi *fl, u16 family,
struct dst_entry *dst_orig)
{
struct net *net = xp_net(pols[0]);
@@ -1628,7 +1639,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
}
static struct flow_cache_object *
-xfrm_bundle_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir,
+xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
struct flow_cache_object *oldflo, void *ctx)
{
struct dst_entry *dst_orig = (struct dst_entry *)ctx;
@@ -1727,18 +1738,36 @@ error:
return ERR_PTR(err);
}
+/*
+ * Hand dst_orig to the address family's blackhole_route() hook and return
+ * whatever that hook returns.
+ *
+ * When xfrm_policy_get_afinfo() yields NULL for @family, dst_orig is
+ * released and ERR_PTR(-EINVAL) is returned.
+ */
+static struct dst_entry *make_blackhole(struct net *net, u16 family,
+					struct dst_entry *dst_orig)
+{
+	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	struct dst_entry *ret;
+
+	if (!afinfo) {
+		/*
+		 * No afinfo was obtained, so there is nothing to put back;
+		 * this mirrors the NULL handling in __xfrm_decode_session().
+		 */
+		dst_release(dst_orig);
+		return ERR_PTR(-EINVAL);
+	}
+
+	ret = afinfo->blackhole_route(net, dst_orig);
+	xfrm_policy_put_afinfo(afinfo);
+
+	return ret;
+}
+
/* Main function: finds/creates a bundle for given flow.
*
* At the moment we eat a raw IP route. Mostly to speed up lookups
* on interfaces with disabled IPsec.
*/
-int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
- struct sock *sk, int flags)
+struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
+ const struct flowi *fl,
+ struct sock *sk, int flags)
{
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
struct flow_cache_object *flo;
struct xfrm_dst *xdst;
- struct dst_entry *dst, *dst_orig = *dst_p, *route;
+ struct dst_entry *dst, *route;
u16 family = dst_orig->ops->family;
u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
@@ -1820,9 +1849,10 @@ restart:
dst_release(dst);
xfrm_pols_put(pols, drop_pols);
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
- return -EREMOTE;
+
+ return make_blackhole(net, family, dst_orig);
}
- if (flags & XFRM_LOOKUP_WAIT) {
+ if (fl->flowi_flags & FLOWI_FLAG_CAN_SLEEP) {
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(&net->xfrm.km_waitq, &wait);
@@ -1864,47 +1894,33 @@ no_transform:
goto error;
} else if (num_xfrms > 0) {
/* Flow transformed */
- *dst_p = dst;
dst_release(dst_orig);
} else {
/* Flow passes untransformed */
dst_release(dst);
+ dst = dst_orig;
}
ok:
xfrm_pols_put(pols, drop_pols);
- return 0;
+ return dst;
nopol:
- if (!(flags & XFRM_LOOKUP_ICMP))
+ if (!(flags & XFRM_LOOKUP_ICMP)) {
+ dst = dst_orig;
goto ok;
+ }
err = -ENOENT;
error:
dst_release(dst);
dropdst:
dst_release(dst_orig);
- *dst_p = NULL;
xfrm_pols_put(pols, drop_pols);
- return err;
-}
-EXPORT_SYMBOL(__xfrm_lookup);
-
-int xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
- struct sock *sk, int flags)
-{
- int err = __xfrm_lookup(net, dst_p, fl, sk, flags);
-
- if (err == -EREMOTE) {
- dst_release(*dst_p);
- *dst_p = NULL;
- err = -EAGAIN;
- }
-
- return err;
+ return ERR_PTR(err);
}
EXPORT_SYMBOL(xfrm_lookup);
static inline int
-xfrm_secpath_reject(int idx, struct sk_buff *skb, struct flowi *fl)
+xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
{
struct xfrm_state *x;
@@ -1923,7 +1939,7 @@ xfrm_secpath_reject(int idx, struct sk_buff *skb, struct flowi *fl)
*/
static inline int
-xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
+xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
unsigned short family)
{
if (xfrm_state_kern(x))
@@ -1946,7 +1962,7 @@ xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
* Otherwise "-2 - errored_index" is returned.
*/
static inline int
-xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
+xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
unsigned short family)
{
int idx = start;
@@ -1978,13 +1994,13 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
return -EAFNOSUPPORT;
afinfo->decode_session(skb, fl, reverse);
- err = security_xfrm_decode_session(skb, &fl->secid);
+ err = security_xfrm_decode_session(skb, &fl->flowi_secid);
xfrm_policy_put_afinfo(afinfo);
return err;
}
EXPORT_SYMBOL(__xfrm_decode_session);
-static inline int secpath_has_nontransport(struct sec_path *sp, int k, int *idxp)
+static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp)
{
for (; k < sp->len; k++) {
if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) {
@@ -2159,7 +2175,7 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
struct net *net = dev_net(skb->dev);
struct flowi fl;
struct dst_entry *dst;
- int res;
+ int res = 0;
if (xfrm_decode_session(skb, &fl, family) < 0) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
@@ -2167,9 +2183,12 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
}
skb_dst_force(skb);
- dst = skb_dst(skb);
- res = xfrm_lookup(net, &dst, &fl, NULL, 0) == 0;
+ dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, 0);
+ if (IS_ERR(dst)) {
+ res = 1;
+ dst = NULL;
+ }
skb_dst_set(skb, dst);
return res;
}
@@ -2207,7 +2226,7 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
static int stale_bundle(struct dst_entry *dst)
{
- return !xfrm_bundle_ok(NULL, (struct xfrm_dst *)dst, NULL, AF_UNSPEC, 0);
+ return !xfrm_bundle_ok((struct xfrm_dst *)dst, AF_UNSPEC);
}
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
@@ -2279,8 +2298,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst)
* still valid.
*/
-static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
- struct flowi *fl, int family, int strict)
+static int xfrm_bundle_ok(struct xfrm_dst *first, int family)
{
struct dst_entry *dst = &first->u.dst;
struct xfrm_dst *last;
@@ -2289,26 +2307,12 @@ static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
(dst->dev && !netif_running(dst->dev)))
return 0;
-#ifdef CONFIG_XFRM_SUB_POLICY
- if (fl) {
- if (first->origin && !flow_cache_uli_match(first->origin, fl))
- return 0;
- if (first->partner &&
- !xfrm_selector_match(first->partner, fl, family))
- return 0;
- }
-#endif
last = NULL;
do {
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
- if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family))
- return 0;
- if (fl && pol &&
- !security_xfrm_state_pol_flow_match(dst->xfrm, pol, fl))
- return 0;
if (dst->xfrm->km.state != XFRM_STATE_VALID)
return 0;
if (xdst->xfrm_genid != dst->xfrm->genid)
@@ -2317,11 +2321,6 @@ static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
return 0;
- if (strict && fl &&
- !(dst->xfrm->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) &&
- !xfrm_state_addr_flow_check(dst->xfrm, fl, family))
- return 0;
-
mtu = dst_mtu(dst->child);
if (xdst->child_mtu_cached != mtu) {
last = xdst;
@@ -2732,8 +2731,8 @@ EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
#endif
#ifdef CONFIG_XFRM_MIGRATE
-static int xfrm_migrate_selector_match(struct xfrm_selector *sel_cmp,
- struct xfrm_selector *sel_tgt)
+static int xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp,
+ const struct xfrm_selector *sel_tgt)
{
if (sel_cmp->proto == IPSEC_ULPROTO_ANY) {
if (sel_tgt->family == sel_cmp->family &&
@@ -2753,7 +2752,7 @@ static int xfrm_migrate_selector_match(struct xfrm_selector *sel_cmp,
return 0;
}
-static struct xfrm_policy * xfrm_migrate_policy_find(struct xfrm_selector *sel,
+static struct xfrm_policy * xfrm_migrate_policy_find(const struct xfrm_selector *sel,
u8 dir, u8 type)
{
struct xfrm_policy *pol, *ret = NULL;
@@ -2789,7 +2788,7 @@ static struct xfrm_policy * xfrm_migrate_policy_find(struct xfrm_selector *sel,
return ret;
}
-static int migrate_tmpl_match(struct xfrm_migrate *m, struct xfrm_tmpl *t)
+static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
{
int match = 0;
@@ -2859,7 +2858,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
return 0;
}
-static int xfrm_migrate_check(struct xfrm_migrate *m, int num_migrate)
+static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate)
{
int i, j;
@@ -2893,7 +2892,7 @@ static int xfrm_migrate_check(struct xfrm_migrate *m, int num_migrate)
return 0;
}
-int xfrm_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
+int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
struct xfrm_migrate *m, int num_migrate,
struct xfrm_kmaddress *k)
{
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
new file mode 100644
index 00000000000..2f5be5b1574
--- /dev/null
+++ b/net/xfrm/xfrm_replay.c
@@ -0,0 +1,534 @@
+/*
+ * xfrm_replay.c - xfrm replay detection, derived from xfrm_state.c.
+ *
+ * Copyright (C) 2010 secunet Security Networks AG
+ * Copyright (C) 2010 Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <net/xfrm.h>
+
+/*
+ * Derive the upper 32 sequence bits that belong to the received lower
+ * 32 bits @net_seq (network byte order), relative to the current ESN
+ * receive window of @x.
+ *
+ * Returns 0 when the state does not have XFRM_STATE_ESN set.
+ */
+u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq)
+{
+	struct xfrm_replay_state_esn *esn = x->replay_esn;
+	u32 low, hi, win_bottom;
+
+	if (!(x->props.flags & XFRM_STATE_ESN))
+		return 0;
+
+	low = ntohl(net_seq);
+	hi = esn->seq_hi;
+	win_bottom = esn->seq - esn->replay_window + 1;
+
+	if (unlikely(esn->seq < esn->replay_window - 1)) {
+		/* B. window spans two subspaces */
+		if (unlikely(low >= win_bottom))
+			hi--;
+	} else if (unlikely(low < win_bottom)) {
+		/* A. same subspace, but the number lies below the window */
+		hi++;
+	}
+
+	return hi;
+}
+
+/*
+ * Emit an XFRM_MSG_NEWAE event for the legacy replay state when one is due.
+ *
+ * Notify messages are sent in case
+ *  1. one of the sequence numbers was updated and the seqno difference
+ *     is at least x->replay_maxdiff; in this case the timeout of the
+ *     timer function is updated as well
+ *  2. x->replay_maxage has elapsed since the last update and there
+ *     were changes
+ *
+ * The state structure must be locked!
+ */
+static void xfrm_replay_notify(struct xfrm_state *x, int event)
+{
+	struct km_event c;
+
+	if (event == XFRM_REPLAY_UPDATE) {
+		if (x->replay_maxdiff &&
+		    (x->replay.seq - x->preplay.seq < x->replay_maxdiff) &&
+		    (x->replay.oseq - x->preplay.oseq < x->replay_maxdiff)) {
+			/* Below the threshold: report only a deferred timeout. */
+			if (!(x->xflags & XFRM_TIME_DEFER))
+				return;
+			event = XFRM_REPLAY_TIMEOUT;
+		}
+	} else if (event == XFRM_REPLAY_TIMEOUT) {
+		if (!memcmp(&x->replay, &x->preplay,
+			    sizeof(struct xfrm_replay_state))) {
+			/* Unchanged since the last snapshot: defer the report. */
+			x->xflags |= XFRM_TIME_DEFER;
+			return;
+		}
+	}
+
+	/* Snapshot the state that is about to be reported. */
+	memcpy(&x->preplay, &x->replay, sizeof(struct xfrm_replay_state));
+	c.event = XFRM_MSG_NEWAE;
+	c.data.aevent = event;
+	km_state_notify(x, &c);
+
+	if (x->replay_maxage &&
+	    !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
+		x->xflags &= ~XFRM_TIME_DEFER;
+}
+
+/*
+ * Assign the next legacy output sequence number to @skb.
+ *
+ * Does nothing (and returns 0) unless the state's type has
+ * XFRM_TYPE_REPLAY_PROT set.  Returns -EOVERFLOW, after undoing the
+ * increment and auditing, when the counter wraps to 0.
+ */
+static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb)
+{
+	if (!(x->type->flags & XFRM_TYPE_REPLAY_PROT))
+		return 0;
+
+	XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq;
+	if (unlikely(x->replay.oseq == 0)) {
+		/* Wrapped: step back so the counter stays at its maximum. */
+		x->replay.oseq--;
+		xfrm_audit_state_replay_overflow(x, skb);
+		return -EOVERFLOW;
+	}
+
+	if (xfrm_aevent_is_on(xs_net(x)))
+		x->repl->notify(x, XFRM_REPLAY_UPDATE);
+
+	return 0;
+}
+
+/*
+ * Legacy replay check for the received sequence number @net_seq
+ * (network byte order).
+ *
+ * Returns 0 when the packet is acceptable.  Otherwise the event is
+ * audited and -EINVAL is returned; x->stats.replay_window or
+ * x->stats.replay is bumped for out-of-window and duplicate packets
+ * respectively.
+ */
+static int xfrm_replay_check(struct xfrm_state *x,
+			     struct sk_buff *skb, __be32 net_seq)
+{
+	u32 seq = ntohl(net_seq);
+	u32 age;
+
+	if (unlikely(!seq))
+		goto reject;
+
+	/* Anything newer than the highest number seen is fine. */
+	if (likely(seq > x->replay.seq))
+		return 0;
+
+	age = x->replay.seq - seq;
+	if (age >= min_t(unsigned int, x->props.replay_window,
+			 sizeof(x->replay.bitmap) * 8)) {
+		x->stats.replay_window++;
+		goto reject;
+	}
+
+	if (!(x->replay.bitmap & (1U << age)))
+		return 0;
+
+	x->stats.replay++;
+
+reject:
+	xfrm_audit_state_replay(x, skb, net_seq);
+	return -EINVAL;
+}
+
+/*
+ * Record the received sequence number @net_seq (network byte order) in
+ * the legacy replay bitmap and, when async events are on, run the replay
+ * notification.
+ *
+ * Does nothing when x->props.replay_window is 0.
+ */
+static void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq)
+{
+	/*
+	 * Shift distances are limited to the bitmap width as well as to the
+	 * configured window, matching the bound used by xfrm_replay_check();
+	 * shifting by the full width or more would be undefined.
+	 */
+	const u32 bits = min_t(unsigned int, x->props.replay_window,
+			       sizeof(x->replay.bitmap) * 8);
+	u32 diff;
+	u32 seq = ntohl(net_seq);
+
+	if (!x->props.replay_window)
+		return;
+
+	if (seq > x->replay.seq) {
+		diff = seq - x->replay.seq;
+		if (diff < bits)
+			x->replay.bitmap = ((x->replay.bitmap) << diff) | 1;
+		else
+			x->replay.bitmap = 1;	/* every older bit fell out */
+		x->replay.seq = seq;
+	} else {
+		diff = x->replay.seq - seq;
+		/* A number that no longer fits in the bitmap leaves no mark. */
+		if (diff < sizeof(x->replay.bitmap) * 8)
+			x->replay.bitmap |= (1U << diff);
+	}
+
+	if (xfrm_aevent_is_on(xs_net(x)))
+		xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);
+}
+
+/*
+ * Assign the next output sequence number from x->replay_esn to @skb.
+ *
+ * Does nothing (and returns 0) unless the state's type has
+ * XFRM_TYPE_REPLAY_PROT set.  Returns -EOVERFLOW, after undoing the
+ * increment and auditing, when the counter wraps to 0.
+ */
+static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct xfrm_replay_state_esn *esn = x->replay_esn;
+
+	if (!(x->type->flags & XFRM_TYPE_REPLAY_PROT))
+		return 0;
+
+	XFRM_SKB_CB(skb)->seq.output.low = ++esn->oseq;
+	if (unlikely(esn->oseq == 0)) {
+		/* Wrapped: step back so the counter stays at its maximum. */
+		esn->oseq--;
+		xfrm_audit_state_replay_overflow(x, skb);
+		return -EOVERFLOW;
+	}
+
+	if (xfrm_aevent_is_on(xs_net(x)))
+		x->repl->notify(x, XFRM_REPLAY_UPDATE);
+
+	return 0;
+}
+
+/*
+ * Replay check against the variable-size bitmap in x->replay_esn for the
+ * received sequence number @net_seq (network byte order).
+ *
+ * Returns 0 when the packet is acceptable.  Otherwise the event is
+ * audited and -EINVAL is returned; x->stats.replay_window or
+ * x->stats.replay is bumped for out-of-window and duplicate packets
+ * respectively.
+ */
+static int xfrm_replay_check_bmp(struct xfrm_state *x,
+				 struct sk_buff *skb, __be32 net_seq)
+{
+	unsigned int bitnr, nr;
+	struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+	u32 seq = ntohl(net_seq);
+	u32 diff, pos;
+
+	if (unlikely(seq == 0))
+		goto err;
+
+	if (likely(seq > replay_esn->seq))
+		return 0;
+
+	diff = replay_esn->seq - seq;
+	if (diff >= replay_esn->replay_window) {
+		x->stats.replay_window++;
+		goto err;
+	}
+
+	/*
+	 * The modulo is taken only here: the test above has established
+	 * replay_window > diff >= 0, so the divisor cannot be zero.
+	 */
+	pos = (replay_esn->seq - 1) % replay_esn->replay_window;
+
+	if (pos >= diff)
+		bitnr = (pos - diff) % replay_esn->replay_window;
+	else
+		bitnr = replay_esn->replay_window - (diff - pos);
+
+	/* bmp[] holds 32 bits per word. */
+	nr = bitnr >> 5;
+	bitnr = bitnr & 0x1F;
+	if (replay_esn->bmp[nr] & (1U << bitnr))
+		goto err_replay;
+
+	return 0;
+
+err_replay:
+	x->stats.replay++;
+err:
+	xfrm_audit_state_replay(x, skb, net_seq);
+	return -EINVAL;
+}
+
+/*
+ * Record the received sequence number @net_seq (network byte order) in
+ * the variable-size bitmap of x->replay_esn and, when async events are
+ * on, run the state's replay notification.
+ *
+ * Does nothing when the configured replay window is 0.
+ */
+static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq)
+{
+	unsigned int bitnr, nr, i;
+	u32 diff, pos;
+	struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+	u32 seq = ntohl(net_seq);
+
+	if (!replay_esn->replay_window)
+		return;
+
+	/* Taken only after the zero-window test so the divisor is non-zero. */
+	pos = (replay_esn->seq - 1) % replay_esn->replay_window;
+
+	if (seq > replay_esn->seq) {
+		diff = seq - replay_esn->seq;
+
+		if (diff < replay_esn->replay_window) {
+			/* Clear the slots of the numbers that were skipped. */
+			for (i = 1; i < diff; i++) {
+				bitnr = (pos + i) % replay_esn->replay_window;
+				nr = bitnr >> 5;
+				bitnr = bitnr & 0x1F;
+				replay_esn->bmp[nr] &= ~(1U << bitnr);
+			}
+		} else {
+			/*
+			 * The whole window is stale.  Window bits live in
+			 * words 0 .. (replay_window - 1) >> 5, so stop there
+			 * rather than one word further.
+			 */
+			nr = (replay_esn->replay_window - 1) >> 5;
+			for (i = 0; i <= nr; i++)
+				replay_esn->bmp[i] = 0;
+		}
+
+		bitnr = (pos + diff) % replay_esn->replay_window;
+		nr = bitnr >> 5;
+		bitnr = bitnr & 0x1F;
+		replay_esn->bmp[nr] |= (1U << bitnr);
+
+		replay_esn->seq = seq;
+	} else {
+		diff = replay_esn->seq - seq;
+
+		if (pos >= diff)
+			bitnr = (pos - diff) % replay_esn->replay_window;
+		else
+			bitnr = replay_esn->replay_window - (diff - pos);
+
+		nr = bitnr >> 5;
+		bitnr = bitnr & 0x1F;
+		replay_esn->bmp[nr] |= (1U << bitnr);
+	}
+
+	/*
+	 * Go through x->repl->notify, as the overflow handlers do, so that
+	 * the notification looks at x->replay_esn instead of the legacy
+	 * x->replay fields.
+	 */
+	if (xfrm_aevent_is_on(xs_net(x)))
+		x->repl->notify(x, XFRM_REPLAY_UPDATE);
+}
+
+/*
+ * Emit an XFRM_MSG_NEWAE event for the x->replay_esn state when one is due.
+ *
+ * Notify messages are sent in case
+ *  1. one of the sequence numbers was updated and the seqno difference
+ *     is at least x->replay_maxdiff; in this case the timeout of the
+ *     timer function is updated as well
+ *  2. x->replay_maxage has elapsed since the last update and there
+ *     were changes
+ *
+ * The state structure must be locked!
+ */
+static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event)
+{
+	struct km_event c;
+	struct xfrm_replay_state_esn *cur = x->replay_esn;
+	struct xfrm_replay_state_esn *prev = x->preplay_esn;
+
+	if (event == XFRM_REPLAY_UPDATE) {
+		if (x->replay_maxdiff &&
+		    (cur->seq - prev->seq < x->replay_maxdiff) &&
+		    (cur->oseq - prev->oseq < x->replay_maxdiff)) {
+			/* Below the threshold: report only a deferred timeout. */
+			if (!(x->xflags & XFRM_TIME_DEFER))
+				return;
+			event = XFRM_REPLAY_TIMEOUT;
+		}
+	} else if (event == XFRM_REPLAY_TIMEOUT) {
+		if (!memcmp(x->replay_esn, x->preplay_esn,
+			    xfrm_replay_state_esn_len(cur))) {
+			/* Unchanged since the last snapshot: defer the report. */
+			x->xflags |= XFRM_TIME_DEFER;
+			return;
+		}
+	}
+
+	/* Snapshot the state that is about to be reported. */
+	memcpy(x->preplay_esn, x->replay_esn,
+	       xfrm_replay_state_esn_len(cur));
+	c.event = XFRM_MSG_NEWAE;
+	c.data.aevent = event;
+	km_state_notify(x, &c);
+
+	if (x->replay_maxage &&
+	    !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
+		x->xflags &= ~XFRM_TIME_DEFER;
+}
+
+/*
+ * Assign the next 64-bit (hi/low) output sequence number from
+ * x->replay_esn to @skb.
+ *
+ * Does nothing (and returns 0) unless the state's type has
+ * XFRM_TYPE_REPLAY_PROT set.  When the low half wraps, the high half is
+ * incremented; when that wraps too, both increments are undone, the
+ * event is audited and -EOVERFLOW is returned.
+ */
+static int xfrm_replay_overflow_esn(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct xfrm_replay_state_esn *esn = x->replay_esn;
+
+	if (!(x->type->flags & XFRM_TYPE_REPLAY_PROT))
+		return 0;
+
+	XFRM_SKB_CB(skb)->seq.output.low = ++esn->oseq;
+	XFRM_SKB_CB(skb)->seq.output.hi = esn->oseq_hi;
+
+	if (unlikely(esn->oseq == 0)) {
+		/* Low half wrapped: carry into the high half. */
+		XFRM_SKB_CB(skb)->seq.output.hi = ++esn->oseq_hi;
+
+		if (esn->oseq_hi == 0) {
+			esn->oseq--;
+			esn->oseq_hi--;
+			xfrm_audit_state_replay_overflow(x, skb);
+			return -EOVERFLOW;
+		}
+	}
+
+	if (xfrm_aevent_is_on(xs_net(x)))
+		x->repl->notify(x, XFRM_REPLAY_UPDATE);
+
+	return 0;
+}
+
+/*
+ * Replay check for states using extended sequence numbers, for the
+ * received lower 32 bits @net_seq (network byte order).
+ *
+ * Returns 0 when the packet is acceptable.  Otherwise the event is
+ * audited and -EINVAL is returned; x->stats.replay_window or
+ * x->stats.replay is bumped for out-of-window and duplicate packets
+ * respectively.
+ */
+static int xfrm_replay_check_esn(struct xfrm_state *x,
+				 struct sk_buff *skb, __be32 net_seq)
+{
+	unsigned int bitnr, nr;
+	u32 diff, pos;
+	struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+	u32 seq = ntohl(net_seq);
+	u32 wsize = replay_esn->replay_window;
+	u32 top = replay_esn->seq;
+	u32 bottom = top - wsize + 1;
+
+	if (unlikely(seq == 0 && replay_esn->seq_hi == 0 &&
+		     (replay_esn->seq < replay_esn->replay_window - 1)))
+		goto err;
+
+	diff = top - seq;
+
+	if (likely(top >= wsize - 1)) {
+		/* A. same subspace */
+		if (likely(seq > top) || seq < bottom)
+			return 0;
+	} else {
+		/* B. window spans two subspaces */
+		if (likely(seq > top && seq < bottom))
+			return 0;
+		if (seq >= bottom)
+			diff = ~seq + top + 1;
+	}
+
+	if (diff >= replay_esn->replay_window) {
+		x->stats.replay_window++;
+		goto err;
+	}
+
+	/*
+	 * The modulo is taken only here: the test above has established
+	 * replay_window > diff >= 0, so the divisor cannot be zero.
+	 */
+	pos = (replay_esn->seq - 1) % replay_esn->replay_window;
+
+	if (pos >= diff)
+		bitnr = (pos - diff) % replay_esn->replay_window;
+	else
+		bitnr = replay_esn->replay_window - (diff - pos);
+
+	/* bmp[] holds 32 bits per word. */
+	nr = bitnr >> 5;
+	bitnr = bitnr & 0x1F;
+	if (replay_esn->bmp[nr] & (1U << bitnr))
+		goto err_replay;
+
+	return 0;
+
+err_replay:
+	x->stats.replay++;
+err:
+	xfrm_audit_state_replay(x, skb, net_seq);
+	return -EINVAL;
+}
+
+/*
+ * Record the received sequence number @net_seq (lower 32 bits, network
+ * byte order) in the bitmap of x->replay_esn, bump seq_hi when the
+ * number belongs to the next upper-half value, and, when async events
+ * are on, run the state's replay notification.
+ *
+ * Does nothing when the configured replay window is 0.
+ */
+static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq)
+{
+	unsigned int bitnr, nr, i;
+	int wrap;
+	u32 diff, pos, seq, seq_hi;
+	struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+
+	if (!replay_esn->replay_window)
+		return;
+
+	seq = ntohl(net_seq);
+	pos = (replay_esn->seq - 1) % replay_esn->replay_window;
+	seq_hi = xfrm_replay_seqhi(x, net_seq);
+	/* Difference between the derived upper half and the stored one. */
+	wrap = seq_hi - replay_esn->seq_hi;
+
+	if ((!wrap && seq > replay_esn->seq) || wrap > 0) {
+		if (likely(!wrap))
+			diff = seq - replay_esn->seq;
+		else
+			diff = ~replay_esn->seq + seq + 1;
+
+		if (diff < replay_esn->replay_window) {
+			/* Clear the slots of the numbers that were skipped. */
+			for (i = 1; i < diff; i++) {
+				bitnr = (pos + i) % replay_esn->replay_window;
+				nr = bitnr >> 5;
+				bitnr = bitnr & 0x1F;
+				replay_esn->bmp[nr] &= ~(1U << bitnr);
+			}
+		} else {
+			/*
+			 * The whole window is stale.  Window bits live in
+			 * words 0 .. (replay_window - 1) >> 5, so stop there
+			 * rather than one word further.
+			 */
+			nr = (replay_esn->replay_window - 1) >> 5;
+			for (i = 0; i <= nr; i++)
+				replay_esn->bmp[i] = 0;
+		}
+
+		bitnr = (pos + diff) % replay_esn->replay_window;
+		nr = bitnr >> 5;
+		bitnr = bitnr & 0x1F;
+		replay_esn->bmp[nr] |= (1U << bitnr);
+
+		replay_esn->seq = seq;
+
+		if (unlikely(wrap > 0))
+			replay_esn->seq_hi++;
+	} else {
+		diff = replay_esn->seq - seq;
+
+		if (pos >= diff)
+			bitnr = (pos - diff) % replay_esn->replay_window;
+		else
+			bitnr = replay_esn->replay_window - (diff - pos);
+
+		nr = bitnr >> 5;
+		bitnr = bitnr & 0x1F;
+		replay_esn->bmp[nr] |= (1U << bitnr);
+	}
+
+	/*
+	 * Go through x->repl->notify, as the overflow handlers do, so that
+	 * the notification looks at x->replay_esn instead of the legacy
+	 * x->replay fields.
+	 */
+	if (xfrm_aevent_is_on(xs_net(x)))
+		x->repl->notify(x, XFRM_REPLAY_UPDATE);
+}
+
+/* Handlers selected by xfrm_init_replay() when x->replay_esn is NULL. */
+static struct xfrm_replay xfrm_replay_legacy = {
+	.check		= xfrm_replay_check,
+	.advance	= xfrm_replay_advance,
+	.overflow	= xfrm_replay_overflow,
+	.notify		= xfrm_replay_notify,
+};
+
+/*
+ * Handlers selected by xfrm_init_replay() when x->replay_esn is set but
+ * XFRM_STATE_ESN is not.
+ */
+static struct xfrm_replay xfrm_replay_bmp = {
+	.check		= xfrm_replay_check_bmp,
+	.advance	= xfrm_replay_advance_bmp,
+	.overflow	= xfrm_replay_overflow_bmp,
+	.notify		= xfrm_replay_notify_bmp,
+};
+
+/*
+ * Handlers selected by xfrm_init_replay() when x->replay_esn is set and
+ * XFRM_STATE_ESN is on; notification is shared with xfrm_replay_bmp.
+ */
+static struct xfrm_replay xfrm_replay_esn = {
+	.check		= xfrm_replay_check_esn,
+	.advance	= xfrm_replay_advance_esn,
+	.overflow	= xfrm_replay_overflow_esn,
+	.notify		= xfrm_replay_notify_bmp,
+};
+
+/*
+ * Pick the replay handler table for @x and store it in x->repl.
+ *
+ * Without x->replay_esn the legacy handlers are used.  Otherwise the ESN
+ * handlers are used when XFRM_STATE_ESN is set and the bitmap handlers
+ * when it is not.
+ *
+ * Returns 0 on success, or -EINVAL when the requested replay window
+ * exceeds the limit derived from bmp_len.
+ */
+int xfrm_init_replay(struct xfrm_state *x)
+{
+	struct xfrm_replay_state_esn *esn = x->replay_esn;
+
+	if (!esn) {
+		x->repl = &xfrm_replay_legacy;
+		return 0;
+	}
+
+	/*
+	 * NOTE(review): bmp_len * sizeof(__u32) looks like a byte count,
+	 * while replay_window is used as a bit count elsewhere in this
+	 * file -- confirm the intended unit of this limit.
+	 */
+	if (esn->replay_window > esn->bmp_len * sizeof(__u32))
+		return -EINVAL;
+
+	x->repl = (x->props.flags & XFRM_STATE_ESN) ? &xfrm_replay_esn
+						    : &xfrm_replay_bmp;
+	return 0;
+}
+EXPORT_SYMBOL(xfrm_init_replay);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 220ebc05c7a..d575f053486 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -42,16 +42,9 @@ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
-#ifdef CONFIG_AUDITSYSCALL
-static void xfrm_audit_state_replay(struct xfrm_state *x,
- struct sk_buff *skb, __be32 net_seq);
-#else
-#define xfrm_audit_state_replay(x, s, sq) do { ; } while (0)
-#endif /* CONFIG_AUDITSYSCALL */
-
static inline unsigned int xfrm_dst_hash(struct net *net,
- xfrm_address_t *daddr,
- xfrm_address_t *saddr,
+ const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
u32 reqid,
unsigned short family)
{
@@ -59,15 +52,16 @@ static inline unsigned int xfrm_dst_hash(struct net *net,
}
static inline unsigned int xfrm_src_hash(struct net *net,
- xfrm_address_t *daddr,
- xfrm_address_t *saddr,
+ const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
unsigned short family)
{
return __xfrm_src_hash(daddr, saddr, family, net->xfrm.state_hmask);
}
static inline unsigned int
-xfrm_spi_hash(struct net *net, xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family)
+xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr,
+ __be32 spi, u8 proto, unsigned short family)
{
return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask);
}
@@ -362,6 +356,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
kfree(x->calg);
kfree(x->encap);
kfree(x->coaddr);
+ kfree(x->replay_esn);
+ kfree(x->preplay_esn);
if (x->inner_mode)
xfrm_put_mode(x->inner_mode);
if (x->inner_mode_iaf)
@@ -656,9 +652,9 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
EXPORT_SYMBOL(xfrm_sad_getinfo);
static int
-xfrm_init_tempstate(struct xfrm_state *x, struct flowi *fl,
- struct xfrm_tmpl *tmpl,
- xfrm_address_t *daddr, xfrm_address_t *saddr,
+xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
+ const struct xfrm_tmpl *tmpl,
+ const xfrm_address_t *daddr, const xfrm_address_t *saddr,
unsigned short family)
{
struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
@@ -677,7 +673,10 @@ xfrm_init_tempstate(struct xfrm_state *x, struct flowi *fl,
return 0;
}
-static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family)
+static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
+ const xfrm_address_t *daddr,
+ __be32 spi, u8 proto,
+ unsigned short family)
{
unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family);
struct xfrm_state *x;
@@ -699,7 +698,10 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, xfrm_ad
return NULL;
}
-static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto, unsigned short family)
+static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark,
+ const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ u8 proto, unsigned short family)
{
unsigned int h = xfrm_src_hash(net, daddr, saddr, family);
struct xfrm_state *x;
@@ -746,8 +748,7 @@ static void xfrm_hash_grow_check(struct net *net, int have_hash_collision)
}
static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
- struct flowi *fl, unsigned short family,
- xfrm_address_t *daddr, xfrm_address_t *saddr,
+ const struct flowi *fl, unsigned short family,
struct xfrm_state **best, int *acq_in_progress,
int *error)
{
@@ -784,8 +785,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
}
struct xfrm_state *
-xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
- struct flowi *fl, struct xfrm_tmpl *tmpl,
+xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
+ const struct flowi *fl, struct xfrm_tmpl *tmpl,
struct xfrm_policy *pol, int *err,
unsigned short family)
{
@@ -813,7 +814,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
tmpl->mode == x->props.mode &&
tmpl->id.proto == x->id.proto &&
(tmpl->id.spi == x->id.spi || !tmpl->id.spi))
- xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
+ xfrm_state_look_at(pol, x, fl, encap_family,
&best, &acquire_in_progress, &error);
}
if (best)
@@ -829,7 +830,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
tmpl->mode == x->props.mode &&
tmpl->id.proto == x->id.proto &&
(tmpl->id.spi == x->id.spi || !tmpl->id.spi))
- xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
+ xfrm_state_look_at(pol, x, fl, encap_family,
&best, &acquire_in_progress, &error);
}
@@ -853,7 +854,7 @@ found:
xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
memcpy(&x->mark, &pol->mark, sizeof(x->mark));
- error = security_xfrm_state_alloc_acquire(x, pol->security, fl->secid);
+ error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid);
if (error) {
x->km.state = XFRM_STATE_DEAD;
to_put = x;
@@ -991,7 +992,11 @@ void xfrm_state_insert(struct xfrm_state *x)
EXPORT_SYMBOL(xfrm_state_insert);
/* xfrm_state_lock is held */
-static struct xfrm_state *__find_acq_core(struct net *net, struct xfrm_mark *m, unsigned short family, u8 mode, u32 reqid, u8 proto, xfrm_address_t *daddr, xfrm_address_t *saddr, int create)
+static struct xfrm_state *__find_acq_core(struct net *net, struct xfrm_mark *m,
+ unsigned short family, u8 mode,
+ u32 reqid, u8 proto,
+ const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr, int create)
{
unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family);
struct hlist_node *entry;
@@ -1369,7 +1374,7 @@ int xfrm_state_check_expire(struct xfrm_state *x)
EXPORT_SYMBOL(xfrm_state_check_expire);
struct xfrm_state *
-xfrm_state_lookup(struct net *net, u32 mark, xfrm_address_t *daddr, __be32 spi,
+xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi,
u8 proto, unsigned short family)
{
struct xfrm_state *x;
@@ -1383,7 +1388,7 @@ EXPORT_SYMBOL(xfrm_state_lookup);
struct xfrm_state *
xfrm_state_lookup_byaddr(struct net *net, u32 mark,
- xfrm_address_t *daddr, xfrm_address_t *saddr,
+ const xfrm_address_t *daddr, const xfrm_address_t *saddr,
u8 proto, unsigned short family)
{
struct xfrm_state *x;
@@ -1397,7 +1402,7 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
struct xfrm_state *
xfrm_find_acq(struct net *net, struct xfrm_mark *mark, u8 mode, u32 reqid, u8 proto,
- xfrm_address_t *daddr, xfrm_address_t *saddr,
+ const xfrm_address_t *daddr, const xfrm_address_t *saddr,
int create, unsigned short family)
{
struct xfrm_state *x;
@@ -1609,54 +1614,6 @@ void xfrm_state_walk_done(struct xfrm_state_walk *walk)
}
EXPORT_SYMBOL(xfrm_state_walk_done);
-
-void xfrm_replay_notify(struct xfrm_state *x, int event)
-{
- struct km_event c;
- /* we send notify messages in case
- * 1. we updated on of the sequence numbers, and the seqno difference
- * is at least x->replay_maxdiff, in this case we also update the
- * timeout of our timer function
- * 2. if x->replay_maxage has elapsed since last update,
- * and there were changes
- *
- * The state structure must be locked!
- */
-
- switch (event) {
- case XFRM_REPLAY_UPDATE:
- if (x->replay_maxdiff &&
- (x->replay.seq - x->preplay.seq < x->replay_maxdiff) &&
- (x->replay.oseq - x->preplay.oseq < x->replay_maxdiff)) {
- if (x->xflags & XFRM_TIME_DEFER)
- event = XFRM_REPLAY_TIMEOUT;
- else
- return;
- }
-
- break;
-
- case XFRM_REPLAY_TIMEOUT:
- if ((x->replay.seq == x->preplay.seq) &&
- (x->replay.bitmap == x->preplay.bitmap) &&
- (x->replay.oseq == x->preplay.oseq)) {
- x->xflags |= XFRM_TIME_DEFER;
- return;
- }
-
- break;
- }
-
- memcpy(&x->preplay, &x->replay, sizeof(struct xfrm_replay_state));
- c.event = XFRM_MSG_NEWAE;
- c.data.aevent = event;
- km_state_notify(x, &c);
-
- if (x->replay_maxage &&
- !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
- x->xflags &= ~XFRM_TIME_DEFER;
-}
-
static void xfrm_replay_timer_handler(unsigned long data)
{
struct xfrm_state *x = (struct xfrm_state*)data;
@@ -1665,7 +1622,7 @@ static void xfrm_replay_timer_handler(unsigned long data)
if (x->km.state == XFRM_STATE_VALID) {
if (xfrm_aevent_is_on(xs_net(x)))
- xfrm_replay_notify(x, XFRM_REPLAY_TIMEOUT);
+ x->repl->notify(x, XFRM_REPLAY_TIMEOUT);
else
x->xflags |= XFRM_TIME_DEFER;
}
@@ -1673,61 +1630,10 @@ static void xfrm_replay_timer_handler(unsigned long data)
spin_unlock(&x->lock);
}
-int xfrm_replay_check(struct xfrm_state *x,
- struct sk_buff *skb, __be32 net_seq)
-{
- u32 diff;
- u32 seq = ntohl(net_seq);
-
- if (unlikely(seq == 0))
- goto err;
-
- if (likely(seq > x->replay.seq))
- return 0;
-
- diff = x->replay.seq - seq;
- if (diff >= min_t(unsigned int, x->props.replay_window,
- sizeof(x->replay.bitmap) * 8)) {
- x->stats.replay_window++;
- goto err;
- }
-
- if (x->replay.bitmap & (1U << diff)) {
- x->stats.replay++;
- goto err;
- }
- return 0;
-
-err:
- xfrm_audit_state_replay(x, skb, net_seq);
- return -EINVAL;
-}
-
-void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq)
-{
- u32 diff;
- u32 seq = ntohl(net_seq);
-
- if (seq > x->replay.seq) {
- diff = seq - x->replay.seq;
- if (diff < x->props.replay_window)
- x->replay.bitmap = ((x->replay.bitmap) << diff) | 1;
- else
- x->replay.bitmap = 1;
- x->replay.seq = seq;
- } else {
- diff = x->replay.seq - seq;
- x->replay.bitmap |= (1U << diff);
- }
-
- if (xfrm_aevent_is_on(xs_net(x)))
- xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);
-}
-
static LIST_HEAD(xfrm_km_list);
static DEFINE_RWLOCK(xfrm_km_lock);
-void km_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
+void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
struct xfrm_mgr *km;
@@ -1738,7 +1644,7 @@ void km_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
read_unlock(&xfrm_km_lock);
}
-void km_state_notify(struct xfrm_state *x, struct km_event *c)
+void km_state_notify(struct xfrm_state *x, const struct km_event *c)
{
struct xfrm_mgr *km;
read_lock(&xfrm_km_lock);
@@ -1819,9 +1725,9 @@ void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 pid)
EXPORT_SYMBOL(km_policy_expired);
#ifdef CONFIG_XFRM_MIGRATE
-int km_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
- struct xfrm_migrate *m, int num_migrate,
- struct xfrm_kmaddress *k)
+int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+ const struct xfrm_migrate *m, int num_migrate,
+ const struct xfrm_kmaddress *k)
{
int err = -EINVAL;
int ret;
@@ -2236,7 +2142,7 @@ void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
}
EXPORT_SYMBOL_GPL(xfrm_audit_state_replay_overflow);
-static void xfrm_audit_state_replay(struct xfrm_state *x,
+void xfrm_audit_state_replay(struct xfrm_state *x,
struct sk_buff *skb, __be32 net_seq)
{
struct audit_buffer *audit_buf;
@@ -2251,6 +2157,7 @@ static void xfrm_audit_state_replay(struct xfrm_state *x,
spi, spi, ntohl(net_seq));
audit_log_end(audit_buf);
}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_replay);
void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family)
{
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index d5e1e0b0889..706385ae3e4 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -119,6 +119,19 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs)
return 0;
}
+static inline int verify_replay(struct xfrm_usersa_info *p,
+ struct nlattr **attrs)
+{
+ struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL];
+
+ if (!rt)
+ return 0;
+
+ if (p->replay_window != 0)
+ return -EINVAL;
+
+ return 0;
+}
static int verify_newsa_info(struct xfrm_usersa_info *p,
struct nlattr **attrs)
@@ -214,6 +227,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
goto out;
if ((err = verify_sec_ctx_len(attrs)))
goto out;
+ if ((err = verify_replay(p, attrs)))
+ goto out;
err = -EINVAL;
switch (p->mode) {
@@ -234,7 +249,7 @@ out:
}
static int attach_one_algo(struct xfrm_algo **algpp, u8 *props,
- struct xfrm_algo_desc *(*get_byname)(char *, int),
+ struct xfrm_algo_desc *(*get_byname)(const char *, int),
struct nlattr *rta)
{
struct xfrm_algo *p, *ualg;
@@ -345,6 +360,33 @@ static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props,
return 0;
}
+static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn,
+ struct xfrm_replay_state_esn **preplay_esn,
+ struct nlattr *rta)
+{
+ struct xfrm_replay_state_esn *p, *pp, *up;
+
+ if (!rta)
+ return 0;
+
+ up = nla_data(rta);
+
+ p = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ pp = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL);
+ if (!pp) {
+ kfree(p);
+ return -ENOMEM;
+ }
+
+ *replay_esn = p;
+ *preplay_esn = pp;
+
+ return 0;
+}
+
static inline int xfrm_user_sec_ctx_size(struct xfrm_sec_ctx *xfrm_ctx)
{
int len = 0;
@@ -380,10 +422,20 @@ static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *
static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs)
{
struct nlattr *rp = attrs[XFRMA_REPLAY_VAL];
+ struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL];
struct nlattr *lt = attrs[XFRMA_LTIME_VAL];
struct nlattr *et = attrs[XFRMA_ETIMER_THRESH];
struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH];
+ if (re) {
+ struct xfrm_replay_state_esn *replay_esn;
+ replay_esn = nla_data(re);
+ memcpy(x->replay_esn, replay_esn,
+ xfrm_replay_state_esn_len(replay_esn));
+ memcpy(x->preplay_esn, replay_esn,
+ xfrm_replay_state_esn_len(replay_esn));
+ }
+
if (rp) {
struct xfrm_replay_state *replay;
replay = nla_data(rp);
@@ -467,16 +519,19 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX])))
goto error;
+ if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn,
+ attrs[XFRMA_REPLAY_ESN_VAL])))
+ goto error;
+
x->km.seq = p->seq;
x->replay_maxdiff = net->xfrm.sysctl_aevent_rseqth;
/* sysctl_xfrm_aevent_etime is in 100ms units */
x->replay_maxage = (net->xfrm.sysctl_aevent_etime*HZ)/XFRM_AE_ETH_M;
- x->preplay.bitmap = 0;
- x->preplay.seq = x->replay.seq+x->replay_maxdiff;
- x->preplay.oseq = x->replay.oseq +x->replay_maxdiff;
- /* override default values from above */
+ if ((err = xfrm_init_replay(x)))
+ goto error;
+ /* override default values from above */
xfrm_update_ae_params(x, attrs);
return x;
@@ -497,9 +552,9 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
struct xfrm_state *x;
int err;
struct km_event c;
- uid_t loginuid = NETLINK_CB(skb).loginuid;
- u32 sessionid = NETLINK_CB(skb).sessionid;
- u32 sid = NETLINK_CB(skb).sid;
+ uid_t loginuid = audit_get_loginuid(current);
+ u32 sessionid = audit_get_sessionid(current);
+ u32 sid;
err = verify_newsa_info(p, attrs);
if (err)
@@ -515,6 +570,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
else
err = xfrm_state_update(x);
+ security_task_getsecid(current, &sid);
xfrm_audit_state_add(x, err ? 0 : 1, loginuid, sessionid, sid);
if (err < 0) {
@@ -575,9 +631,9 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
int err = -ESRCH;
struct km_event c;
struct xfrm_usersa_id *p = nlmsg_data(nlh);
- uid_t loginuid = NETLINK_CB(skb).loginuid;
- u32 sessionid = NETLINK_CB(skb).sessionid;
- u32 sid = NETLINK_CB(skb).sid;
+ uid_t loginuid = audit_get_loginuid(current);
+ u32 sessionid = audit_get_sessionid(current);
+ u32 sid;
x = xfrm_user_state_lookup(net, p, attrs, &err);
if (x == NULL)
@@ -602,6 +658,7 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
km_state_notify(x, &c);
out:
+ security_task_getsecid(current, &sid);
xfrm_audit_state_delete(x, err ? 0 : 1, loginuid, sessionid, sid);
xfrm_state_put(x);
return err;
@@ -705,6 +762,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
if (xfrm_mark_put(skb, &x->mark))
goto nla_put_failure;
+ if (x->replay_esn)
+ NLA_PUT(skb, XFRMA_REPLAY_ESN_VAL,
+ xfrm_replay_state_esn_len(x->replay_esn), x->replay_esn);
+
if (x->security && copy_sec_ctx(x->security, skb) < 0)
goto nla_put_failure;
@@ -1265,9 +1326,9 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
struct km_event c;
int err;
int excl;
- uid_t loginuid = NETLINK_CB(skb).loginuid;
- u32 sessionid = NETLINK_CB(skb).sessionid;
- u32 sid = NETLINK_CB(skb).sid;
+ uid_t loginuid = audit_get_loginuid(current);
+ u32 sessionid = audit_get_sessionid(current);
+ u32 sid;
err = verify_newpolicy_info(p);
if (err)
@@ -1286,6 +1347,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
* a type XFRM_MSG_UPDPOLICY - JHS */
excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;
err = xfrm_policy_insert(p->dir, xp, excl);
+ security_task_getsecid(current, &sid);
xfrm_audit_policy_add(xp, err ? 0 : 1, loginuid, sessionid, sid);
if (err) {
@@ -1522,10 +1584,11 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
NETLINK_CB(skb).pid);
}
} else {
- uid_t loginuid = NETLINK_CB(skb).loginuid;
- u32 sessionid = NETLINK_CB(skb).sessionid;
- u32 sid = NETLINK_CB(skb).sid;
+ uid_t loginuid = audit_get_loginuid(current);
+ u32 sessionid = audit_get_sessionid(current);
+ u32 sid;
+ security_task_getsecid(current, &sid);
xfrm_audit_policy_delete(xp, err ? 0 : 1, loginuid, sessionid,
sid);
@@ -1553,9 +1616,9 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
struct xfrm_audit audit_info;
int err;
- audit_info.loginuid = NETLINK_CB(skb).loginuid;
- audit_info.sessionid = NETLINK_CB(skb).sessionid;
- audit_info.secid = NETLINK_CB(skb).sid;
+ audit_info.loginuid = audit_get_loginuid(current);
+ audit_info.sessionid = audit_get_sessionid(current);
+ security_task_getsecid(current, &audit_info.secid);
err = xfrm_state_flush(net, p->proto, &audit_info);
if (err) {
if (err == -ESRCH) /* empty table */
@@ -1572,17 +1635,21 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
}
-static inline size_t xfrm_aevent_msgsize(void)
+static inline size_t xfrm_aevent_msgsize(struct xfrm_state *x)
{
+ size_t replay_size = x->replay_esn ?
+ xfrm_replay_state_esn_len(x->replay_esn) :
+ sizeof(struct xfrm_replay_state);
+
return NLMSG_ALIGN(sizeof(struct xfrm_aevent_id))
- + nla_total_size(sizeof(struct xfrm_replay_state))
+ + nla_total_size(replay_size)
+ nla_total_size(sizeof(struct xfrm_lifetime_cur))
+ nla_total_size(sizeof(struct xfrm_mark))
+ nla_total_size(4) /* XFRM_AE_RTHR */
+ nla_total_size(4); /* XFRM_AE_ETHR */
}
-static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, struct km_event *c)
+static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
{
struct xfrm_aevent_id *id;
struct nlmsghdr *nlh;
@@ -1600,7 +1667,13 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, struct km_eve
id->reqid = x->props.reqid;
id->flags = c->data.aevent;
- NLA_PUT(skb, XFRMA_REPLAY_VAL, sizeof(x->replay), &x->replay);
+ if (x->replay_esn)
+ NLA_PUT(skb, XFRMA_REPLAY_ESN_VAL,
+ xfrm_replay_state_esn_len(x->replay_esn),
+ x->replay_esn);
+ else
+ NLA_PUT(skb, XFRMA_REPLAY_VAL, sizeof(x->replay), &x->replay);
+
NLA_PUT(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft);
if (id->flags & XFRM_AE_RTHR)
@@ -1633,16 +1706,16 @@ static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
struct xfrm_aevent_id *p = nlmsg_data(nlh);
struct xfrm_usersa_id *id = &p->sa_id;
- r_skb = nlmsg_new(xfrm_aevent_msgsize(), GFP_ATOMIC);
- if (r_skb == NULL)
- return -ENOMEM;
-
mark = xfrm_mark_get(attrs, &m);
x = xfrm_state_lookup(net, mark, &id->daddr, id->spi, id->proto, id->family);
- if (x == NULL) {
- kfree_skb(r_skb);
+ if (x == NULL)
return -ESRCH;
+
+ r_skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC);
+ if (r_skb == NULL) {
+ xfrm_state_put(x);
+ return -ENOMEM;
}
/*
@@ -1674,9 +1747,10 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
struct xfrm_mark m;
struct xfrm_aevent_id *p = nlmsg_data(nlh);
struct nlattr *rp = attrs[XFRMA_REPLAY_VAL];
+ struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL];
struct nlattr *lt = attrs[XFRMA_LTIME_VAL];
- if (!lt && !rp)
+ if (!lt && !rp && !re)
return err;
/* pedantic mode - thou shalt sayeth replaceth */
@@ -1720,9 +1794,9 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
- audit_info.loginuid = NETLINK_CB(skb).loginuid;
- audit_info.sessionid = NETLINK_CB(skb).sessionid;
- audit_info.secid = NETLINK_CB(skb).sid;
+ audit_info.loginuid = audit_get_loginuid(current);
+ audit_info.sessionid = audit_get_sessionid(current);
+ security_task_getsecid(current, &audit_info.secid);
err = xfrm_policy_flush(net, type, &audit_info);
if (err) {
if (err == -ESRCH) /* empty table */
@@ -1789,9 +1863,11 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
err = 0;
if (up->hard) {
- uid_t loginuid = NETLINK_CB(skb).loginuid;
- uid_t sessionid = NETLINK_CB(skb).sessionid;
- u32 sid = NETLINK_CB(skb).sid;
+ uid_t loginuid = audit_get_loginuid(current);
+ u32 sessionid = audit_get_sessionid(current);
+ u32 sid;
+
+ security_task_getsecid(current, &sid);
xfrm_policy_delete(xp, p->dir);
xfrm_audit_policy_delete(xp, 1, loginuid, sessionid, sid);
@@ -1830,9 +1906,11 @@ static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
km_state_expired(x, ue->hard, current->pid);
if (ue->hard) {
- uid_t loginuid = NETLINK_CB(skb).loginuid;
- uid_t sessionid = NETLINK_CB(skb).sessionid;
- u32 sid = NETLINK_CB(skb).sid;
+ uid_t loginuid = audit_get_loginuid(current);
+ u32 sessionid = audit_get_sessionid(current);
+ u32 sid;
+
+ security_task_getsecid(current, &sid);
__xfrm_state_delete(x);
xfrm_audit_state_delete(x, 1, loginuid, sessionid, sid);
}
@@ -1986,7 +2064,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
#endif
#ifdef CONFIG_XFRM_MIGRATE
-static int copy_to_user_migrate(struct xfrm_migrate *m, struct sk_buff *skb)
+static int copy_to_user_migrate(const struct xfrm_migrate *m, struct sk_buff *skb)
{
struct xfrm_user_migrate um;
@@ -2004,7 +2082,7 @@ static int copy_to_user_migrate(struct xfrm_migrate *m, struct sk_buff *skb)
return nla_put(skb, XFRMA_MIGRATE, sizeof(um), &um);
}
-static int copy_to_user_kmaddress(struct xfrm_kmaddress *k, struct sk_buff *skb)
+static int copy_to_user_kmaddress(const struct xfrm_kmaddress *k, struct sk_buff *skb)
{
struct xfrm_user_kmaddress uk;
@@ -2025,11 +2103,11 @@ static inline size_t xfrm_migrate_msgsize(int num_migrate, int with_kma)
+ userpolicy_type_attrsize();
}
-static int build_migrate(struct sk_buff *skb, struct xfrm_migrate *m,
- int num_migrate, struct xfrm_kmaddress *k,
- struct xfrm_selector *sel, u8 dir, u8 type)
+static int build_migrate(struct sk_buff *skb, const struct xfrm_migrate *m,
+ int num_migrate, const struct xfrm_kmaddress *k,
+ const struct xfrm_selector *sel, u8 dir, u8 type)
{
- struct xfrm_migrate *mp;
+ const struct xfrm_migrate *mp;
struct xfrm_userpolicy_id *pol_id;
struct nlmsghdr *nlh;
int i;
@@ -2061,9 +2139,9 @@ nlmsg_failure:
return -EMSGSIZE;
}
-static int xfrm_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
- struct xfrm_migrate *m, int num_migrate,
- struct xfrm_kmaddress *k)
+static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+ const struct xfrm_migrate *m, int num_migrate,
+ const struct xfrm_kmaddress *k)
{
struct net *net = &init_net;
struct sk_buff *skb;
@@ -2079,9 +2157,9 @@ static int xfrm_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_MIGRATE, GFP_ATOMIC);
}
#else
-static int xfrm_send_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
- struct xfrm_migrate *m, int num_migrate,
- struct xfrm_kmaddress *k)
+static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+ const struct xfrm_migrate *m, int num_migrate,
+ const struct xfrm_kmaddress *k)
{
return -ENOPROTOOPT;
}
@@ -2137,6 +2215,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_KMADDRESS] = { .len = sizeof(struct xfrm_user_kmaddress) },
[XFRMA_MARK] = { .len = sizeof(struct xfrm_mark) },
[XFRMA_TFCPAD] = { .type = NLA_U32 },
+ [XFRMA_REPLAY_ESN_VAL] = { .len = sizeof(struct xfrm_replay_state_esn) },
};
static struct xfrm_link {
@@ -2189,7 +2268,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) &&
- (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
+ (nlh->nlmsg_flags & NLM_F_DUMP)) {
if (link->dump == NULL)
return -EINVAL;
@@ -2220,7 +2299,7 @@ static inline size_t xfrm_expire_msgsize(void)
+ nla_total_size(sizeof(struct xfrm_mark));
}
-static int build_expire(struct sk_buff *skb, struct xfrm_state *x, struct km_event *c)
+static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
{
struct xfrm_user_expire *ue;
struct nlmsghdr *nlh;
@@ -2242,7 +2321,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c)
+static int xfrm_exp_state_notify(struct xfrm_state *x, const struct km_event *c)
{
struct net *net = xs_net(x);
struct sk_buff *skb;
@@ -2259,12 +2338,12 @@ static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c)
return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC);
}
-static int xfrm_aevent_state_notify(struct xfrm_state *x, struct km_event *c)
+static int xfrm_aevent_state_notify(struct xfrm_state *x, const struct km_event *c)
{
struct net *net = xs_net(x);
struct sk_buff *skb;
- skb = nlmsg_new(xfrm_aevent_msgsize(), GFP_ATOMIC);
+ skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;
@@ -2274,7 +2353,7 @@ static int xfrm_aevent_state_notify(struct xfrm_state *x, struct km_event *c)
return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_AEVENTS, GFP_ATOMIC);
}
-static int xfrm_notify_sa_flush(struct km_event *c)
+static int xfrm_notify_sa_flush(const struct km_event *c)
{
struct net *net = c->net;
struct xfrm_usersa_flush *p;
@@ -2318,6 +2397,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x)
l += nla_total_size(sizeof(*x->encap));
if (x->tfcpad)
l += nla_total_size(sizeof(x->tfcpad));
+ if (x->replay_esn)
+ l += nla_total_size(xfrm_replay_state_esn_len(x->replay_esn));
if (x->security)
l += nla_total_size(sizeof(struct xfrm_user_sec_ctx) +
x->security->ctx_len);
@@ -2330,7 +2411,7 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x)
return l;
}
-static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c)
+static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c)
{
struct net *net = xs_net(x);
struct xfrm_usersa_info *p;
@@ -2387,7 +2468,7 @@ nla_put_failure:
return -1;
}
-static int xfrm_send_state_notify(struct xfrm_state *x, struct km_event *c)
+static int xfrm_send_state_notify(struct xfrm_state *x, const struct km_event *c)
{
switch (c->event) {
@@ -2546,7 +2627,7 @@ static inline size_t xfrm_polexpire_msgsize(struct xfrm_policy *xp)
}
static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
- int dir, struct km_event *c)
+ int dir, const struct km_event *c)
{
struct xfrm_user_polexpire *upe;
struct nlmsghdr *nlh;
@@ -2576,7 +2657,7 @@ nlmsg_failure:
return -EMSGSIZE;
}
-static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
+static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
struct net *net = xp_net(xp);
struct sk_buff *skb;
@@ -2591,7 +2672,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC);
}
-static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
+static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
struct net *net = xp_net(xp);
struct xfrm_userpolicy_info *p;
@@ -2656,7 +2737,7 @@ nlmsg_failure:
return -1;
}
-static int xfrm_notify_policy_flush(struct km_event *c)
+static int xfrm_notify_policy_flush(const struct km_event *c)
{
struct net *net = c->net;
struct nlmsghdr *nlh;
@@ -2681,7 +2762,7 @@ nlmsg_failure:
return -1;
}
-static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
+static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
switch (c->event) {